feat: add scanner

This commit is contained in:
2026-02-11 20:28:29 -05:00
parent aca197ef51
commit b3f9f08c62
6 changed files with 136 additions and 183 deletions

View File

@@ -1,45 +1,17 @@
package lambda package lambda
import ( import "git.maximhutz.com/max/lambda/pkg/token"
"fmt"
"unicode"
"git.maximhutz.com/max/lambda/pkg/iterator" // scanner is the declarative lexer for the lambda calculus.
"git.maximhutz.com/max/lambda/pkg/token" var scanner = token.NewScanner[tokenType]().
) On(`\(`, tokenOpenParen).
On(`\)`, tokenCloseParen).
// scanToken pulls the next lambda calculus token from a rune iterator. On(`\\`, tokenSlash).
func scanToken(i *iterator.Iterator[rune]) (*lambdaToken, error) { On(`\.`, tokenDot).
index := i.Index() On(`[a-zA-Z0-9_]+`, tokenAtom).
Skip(`\s+`)
if i.Done() {
return nil, nil
}
letter, err := i.Next()
if err != nil {
return nil, fmt.Errorf("cannot produce next token: %w", err)
}
switch {
case letter == '(':
return token.New(tokenOpenParen, index), nil
case letter == ')':
return token.New(tokenCloseParen, index), nil
case letter == '\\':
return token.New(tokenSlash, index), nil
case letter == '.':
return token.New(tokenDot, index), nil
case unicode.IsSpace(letter):
return nil, nil
case token.IsVariable(letter):
return token.ScanAtom(i, letter, tokenAtom, index), nil
}
return nil, fmt.Errorf("unknown character '%v'", string(letter))
}
// scan tokenizes an input string into lambda calculus tokens. // scan tokenizes an input string into lambda calculus tokens.
func scan(input string) ([]lambdaToken, error) { func scan(input string) ([]lambdaToken, error) {
return token.Scan(input, scanToken) return scanner.Scan(input)
} }

View File

@@ -1,64 +1,23 @@
package saccharine package saccharine
import ( import "git.maximhutz.com/max/lambda/pkg/token"
"fmt"
"unicode"
"git.maximhutz.com/max/lambda/pkg/iterator" // scanner is the declarative lexer for the Saccharine language.
"git.maximhutz.com/max/lambda/pkg/token" var scanner = token.NewScanner[TokenType]().
) On(`:=`, TokenAssign).
On(`\(`, TokenOpenParen).
On(`\)`, TokenCloseParen).
On(`\{`, TokenOpenBrace).
On(`\}`, TokenCloseBrace).
On(`;`, TokenHardBreak).
On(`\n`, TokenSoftBreak).
On(`\\`, TokenSlash).
On(`\.`, TokenDot).
On(`[a-zA-Z0-9_]+`, TokenAtom).
Skip(`#[^\n]*`).
Skip(`[^\S\n]+`)
// Pulls the next token from an iterator over runes. If it cannot, it will // scan tokenizes a string into Saccharine tokens.
// return nil. If an error occurs, it will return that.
func scanToken(i *iterator.Iterator[rune]) (*Token, error) {
index := i.Index()
if i.Done() {
return nil, nil
}
letter, err := i.Next()
if err != nil {
return nil, fmt.Errorf("cannot produce next token: %w", err)
}
switch {
case letter == '(':
return token.New(TokenOpenParen, index), nil
case letter == ')':
return token.New(TokenCloseParen, index), nil
case letter == '.':
return token.New(TokenDot, index), nil
case letter == '\\':
return token.New(TokenSlash, index), nil
case letter == '\n':
return token.New(TokenSoftBreak, index), nil
case letter == '{':
return token.New(TokenOpenBrace, index), nil
case letter == '}':
return token.New(TokenCloseBrace, index), nil
case letter == ':':
if _, err := token.ScanCharacter(i, '='); err != nil {
return nil, err
} else {
return token.New(TokenAssign, index), nil
}
case letter == ';':
return token.New(TokenHardBreak, index), nil
case letter == '#':
// Skip everything until the next newline or EOF.
i.While(func(r rune) bool { return r != '\n' })
return nil, nil
case unicode.IsSpace(letter):
return nil, nil
case token.IsVariable(letter):
return token.ScanAtom(i, letter, TokenAtom, index), nil
}
return nil, fmt.Errorf("unknown character '%v'", string(letter))
}
// scan a string into tokens.
func scan(input string) ([]Token, error) { func scan(input string) ([]Token, error) {
return token.Scan(input, scanToken) return scanner.Scan(input)
} }

View File

@@ -16,7 +16,7 @@ func ParseRawToken[T Type](i *iterator.Iterator[Token[T]], expected T) (*Token[T
return nil, err return nil, err
} }
if tok.Type != expected { if tok.Type != expected {
return nil, fmt.Errorf("expected token %v, got %v'", expected.Name(), tok.Value) return nil, fmt.Errorf("expected token '%v', got '%v'", expected.Name(), tok.Value)
} }
i.Forward() i.Forward()
return &tok, nil return &tok, nil

View File

@@ -1,74 +0,0 @@
package token
import (
"errors"
"fmt"
"unicode"
"git.maximhutz.com/max/lambda/pkg/iterator"
)
// IsVariable reports whether r is allowed inside a variable name: any
// letter, any number, or the underscore.
func IsVariable(r rune) bool {
	switch {
	case r == '_':
		return true
	case unicode.IsLetter(r):
		return true
	default:
		return unicode.IsNumber(r)
	}
}
// ScanRune consumes the next rune from the iterator if it satisfies the
// predicate.
// Returns an error if the iterator is exhausted or the rune does not match.
func ScanRune(i *iterator.Iterator[rune], expected func(rune) bool) (rune, error) {
	r, err := i.Get()
	if err != nil {
		return r, err
	}
	if !expected(r) {
		// Quote the rune as text: the previous message had a stray closing
		// quote ("rune %v'") and printed the raw code point number, unlike
		// the "unknown character '%v'" messages elsewhere in the package.
		return r, fmt.Errorf("got unexpected rune '%v'", string(r))
	}
	// Only consume the rune once it is known to match.
	i.Forward()
	return r, nil
}
// ScanCharacter consumes the next rune from the iterator if it matches the
// expected rune exactly.
// Returns an error if the iterator is exhausted or the rune does not match.
func ScanCharacter(i *iterator.Iterator[rune], expected rune) (rune, error) {
	isExpected := func(r rune) bool {
		return r == expected
	}
	return ScanRune(i, isExpected)
}
// ScanAtom scans a contiguous sequence of variable characters into a single
// atom token.
// The first rune has already been consumed and is passed in.
func ScanAtom[T Type](i *iterator.Iterator[rune], first rune, typ T, column int) *Token[T] {
	runes := []rune{first}
	for {
		r, err := ScanRune(i, IsVariable)
		if err != nil {
			// A non-variable rune (or end of input) terminates the atom.
			break
		}
		runes = append(runes, r)
	}
	return NewAtom(typ, string(runes), column)
}
// Scan tokenizes an input string using a language-specific scanToken function.
// The scanToken function is called repeatedly until the input is exhausted.
// It returns nil (no token, no error) for skippable input like whitespace.
// Errors are accumulated and returned joined at the end.
func Scan[T Type](input string, scanToken func(*iterator.Iterator[rune]) (*Token[T], error)) ([]Token[T], error) {
	i := iterator.Of([]rune(input))
	tokens := []Token[T]{}
	errorList := []error{}
	for !i.Done() {
		before := i.Index()
		token, err := scanToken(i)
		if err != nil {
			errorList = append(errorList, err)
		} else if token != nil {
			tokens = append(tokens, *token)
		}
		// Termination previously relied on scanToken always advancing the
		// iterator; a callback that returns without consuming input would
		// loop (and accumulate errors) forever. Force progress instead.
		if !i.Done() && i.Index() == before {
			i.Forward()
		}
	}
	return tokens, errors.Join(errorList...)
}

108
pkg/token/scanner.go Normal file
View File

@@ -0,0 +1,108 @@
package token
import (
"errors"
"fmt"
"regexp"
"unicode/utf8"
)
// A rule describes a single lexical pattern for the scanner.
type rule[T Type] struct {
	pattern *regexp.Regexp // anchored (\A-prefixed) regex tested at the current scan position
	typ     T              // token type emitted on a match; zero value for skip rules
	skip    bool           // when true, matched text is consumed without emitting a token
}
// A Scanner is a declarative lexer configured by registering regex rules.
// At each position in the input, all rules are tested and the longest match
// wins.
// Ties are broken by registration order (first registered wins).
type Scanner[T Type] struct {
	rules []rule[T] // tried in registration order at every scan step
}
// NewScanner creates a new Scanner with no rules.
func NewScanner[T Type]() *Scanner[T] {
	// The zero value is usable; rules are added via On and Skip.
	return new(Scanner[T])
}
// On registers a rule that emits a token of the given type when the pattern
// matches.
// The token's value is the matched text.
func (s *Scanner[T]) On(pattern string, typ T) *Scanner[T] {
	r := rule[T]{pattern: compileAnchored(pattern), typ: typ}
	s.rules = append(s.rules, r)
	return s
}
// Skip registers a rule that consumes matching text without emitting a token.
// This is used for whitespace and comments.
func (s *Scanner[T]) Skip(pattern string) *Scanner[T] {
	// typ is left at its zero value; it is never emitted for skip rules.
	r := rule[T]{pattern: compileAnchored(pattern), skip: true}
	s.rules = append(s.rules, r)
	return s
}
// Scan tokenizes the input string using the registered rules.
// At each position, all rules are tested and the longest match wins.
// If no rule matches, an error is recorded and the scanner advances one rune.
func (s *Scanner[T]) Scan(input string) ([]Token[T], error) {
	tokens := []Token[T]{}
	errorList := []error{}
	for pos, column := 0, 0; pos < len(input); {
		rest := input[pos:]
		// Find the longest match; strict > keeps the earliest rule on ties.
		winner, winLen := -1, 0
		for idx := range s.rules {
			if loc := s.rules[idx].pattern.FindStringIndex(rest); loc != nil && loc[1] > winLen {
				winner, winLen = idx, loc[1]
			}
		}
		if winner < 0 || winLen == 0 {
			// Nothing matched here: record the offending rune and step past it.
			_, size := utf8.DecodeRuneInString(rest)
			errorList = append(errorList, fmt.Errorf("unknown character '%v'", rest[:size]))
			pos += size
			column++
			continue
		}
		matched := rest[:winLen]
		if winning := s.rules[winner]; !winning.skip {
			tokens = append(tokens, Token[T]{Type: winning.typ, Value: matched, Column: column})
		}
		// Columns are counted in runes, positions in bytes.
		column += utf8.RuneCountInString(matched)
		pos += winLen
	}
	return tokens, errors.Join(errorList...)
}
// compileAnchored compiles a regex pattern, prepending \A so it only matches
// at the current scan position.
// Patterns must not be pre-anchored.
func compileAnchored(pattern string) *regexp.Regexp {
	anchored := fmt.Sprintf(`\A(?:%s)`, pattern)
	return regexp.MustCompile(anchored)
}

View File

@@ -18,18 +18,6 @@ type Token[T Type] struct {
Value string // The value of the token. Value string // The value of the token.
} }
// New creates a Token of the given type at the given column.
// The token's value is derived from its type's Name method.
func New[T Type](typ T, column int) *Token[T] {
return &Token[T]{Type: typ, Column: column, Value: typ.Name()}
}
// NewAtom creates a Token of the given type with a custom value at the given
// column.
func NewAtom[T Type](typ T, name string, column int) *Token[T] {
return &Token[T]{Type: typ, Column: column, Value: name}
}
// Name returns the type of the Token, as a string. // Name returns the type of the Token, as a string.
func (t Token[T]) Name() string { func (t Token[T]) Name() string {
return t.Type.Name() return t.Type.Name()