feat: scanner added

2026-02-11 20:28:29 -05:00
parent aca197ef51
commit b3f9f08c62
6 changed files with 136 additions and 183 deletions
--- a/pkg/token/parse.go
+++ b/pkg/token/parse.go
@@ -16,7 +16,7 @@ func ParseRawToken[T Type](i *iterator.Iterator[Token[T]], expected T) (*Token[T
 		return nil, err
 	}
 	if tok.Type != expected {
-		return nil, fmt.Errorf("expected token %v, got %v'", expected.Name(), tok.Value)
+		return nil, fmt.Errorf("expected token '%v', got '%v'", expected.Name(), tok.Value)
 	}
 	i.Forward()
 	return &tok, nil
--- a/pkg/token/scan.go
+++ b/pkg/token/scan.go
@@ -1,74 +0,0 @@
-package token
-
-import (
-	"errors"
-	"fmt"
-	"unicode"
-
-	"git.maximhutz.com/max/lambda/pkg/iterator"
-)
-
-// IsVariable determines whether a rune can be a valid variable character.
-func IsVariable(r rune) bool {
-	return unicode.IsLetter(r) || unicode.IsNumber(r) || r == '_'
-}
-
-// ScanRune consumes the next rune from the iterator if it satisfies the
-// predicate.
-// Returns an error if the iterator is exhausted or the rune does not match.
-func ScanRune(i *iterator.Iterator[rune], expected func(rune) bool) (rune, error) {
-	r, err := i.Get()
-	if err != nil {
-		return r, err
-	}
-	if !expected(r) {
-		return r, fmt.Errorf("got unexpected rune %v'", r)
-	}
-	i.Forward()
-	return r, nil
-}
-
-// ScanCharacter consumes the next rune from the iterator if it matches the
-// expected rune exactly.
-// Returns an error if the iterator is exhausted or the rune does not match.
-func ScanCharacter(i *iterator.Iterator[rune], expected rune) (rune, error) {
-	return ScanRune(i, func(r rune) bool { return r == expected })
-}
-
-// ScanAtom scans a contiguous sequence of variable characters into a single
-// atom token.
-// The first rune has already been consumed and is passed in.
-func ScanAtom[T Type](i *iterator.Iterator[rune], first rune, typ T, column int) *Token[T] {
-	atom := []rune{first}
-
-	for {
-		if r, err := ScanRune(i, IsVariable); err != nil {
-			break
-		} else {
-			atom = append(atom, r)
-		}
-	}
-
-	return NewAtom(typ, string(atom), column)
-}
-
-// Scan tokenizes an input string using a language-specific scanToken function.
-// The scanToken function is called repeatedly until the input is exhausted.
-// It returns nil (no token, no error) for skippable input like whitespace.
-// Errors are accumulated and returned joined at the end.
-func Scan[T Type](input string, scanToken func(*iterator.Iterator[rune]) (*Token[T], error)) ([]Token[T], error) {
-	i := iterator.Of([]rune(input))
-	tokens := []Token[T]{}
-	errorList := []error{}
-
-	for !i.Done() {
-		token, err := scanToken(i)
-		if err != nil {
-			errorList = append(errorList, err)
-		} else if token != nil {
-			tokens = append(tokens, *token)
-		}
-	}
-
-	return tokens, errors.Join(errorList...)
-}
--- a/pkg/token/scanner.go
+++ b/pkg/token/scanner.go
@@ -0,0 +1,108 @@
+package token
+
+import (
+	"errors"
+	"fmt"
+	"regexp"
+	"unicode/utf8"
+)
+
+// A rule pairs one anchored lexical pattern with what the scanner does when it matches.
+type rule[T Type] struct {
+	pattern *regexp.Regexp // Built by compileAnchored, so it only matches at the start of the text it is given.
+	typ     T              // Token type emitted on a match; left as the zero value of T for skip rules.
+	skip    bool           // When true, the matched text is consumed without emitting a token.
+}
+
+// A Scanner is a declarative lexer: rules are registered with On and Skip, then applied by Scan.
+// At each position in the input, every rule is tried and the longest non-empty
+// match wins.
+// Ties go to the rule registered first, because Scan only replaces its best match with a strictly longer one.
+type Scanner[T Type] struct {
+	rules []rule[T] // Kept in registration order; the order only matters for breaking ties.
+}
+
+// NewScanner returns a Scanner with no rules; until On or Skip adds some, Scan reports every rune as an unknown character.
+func NewScanner[T Type]() *Scanner[T] {
+	return &Scanner[T]{}
+}
+
+// On registers a rule that emits a token of type typ whenever pattern is the
+// winning match at the current position; the token's Value is the matched text.
+// It panics if pattern is not a valid regular expression, and returns s so that calls can be chained.
+func (s *Scanner[T]) On(pattern string, typ T) *Scanner[T] {
+	s.rules = append(s.rules, rule[T]{
+		pattern: compileAnchored(pattern), // Anchored so the rule cannot match further along the input.
+		typ:     typ,
+	})
+	return s
+}
+
+// Skip registers a rule whose matches are consumed without emitting a token (e.g. whitespace and comments).
+// It panics if pattern is not a valid regular expression, and returns s so that calls can be chained.
+func (s *Scanner[T]) Skip(pattern string) *Scanner[T] {
+	var zero T // Placeholder only: Scan never reads typ when skip is set.
+	s.rules = append(s.rules, rule[T]{
+		pattern: compileAnchored(pattern),
+		typ:     zero,
+		skip:    true,
+	})
+	return s
+}
+
+// Scan tokenizes input by repeatedly taking the longest non-empty rule match at the current position.
+// Token columns are 0-based rune offsets into input (not byte offsets), and tokens from skip rules are dropped.
+// If no rule matches, an "unknown character" error is recorded and one rune is skipped; all errors are returned joined.
+func (s *Scanner[T]) Scan(input string) ([]Token[T], error) {
+	tokens := []Token[T]{} // Non-nil, so input without tokens yields an empty slice rather than nil.
+	errorList := []error{}
+	pos := 0    // Byte offset of the scan position in input.
+	column := 0 // Rune offset of the scan position; this is what tokens report.
+
+	for pos < len(input) {
+		bestLen := 0
+		bestRule := -1
+
+		for idx, r := range s.rules {
+			loc := r.pattern.FindStringIndex(input[pos:]) // Patterns are \A-anchored, so a match always starts at pos and loc[1] is its byte length.
+			if loc == nil {
+				continue
+			}
+			if matchLen := loc[1]; matchLen > bestLen { // Strictly greater: earlier rules win ties and zero-length matches never win.
+				bestLen = matchLen
+				bestRule = idx
+			}
+		}
+
+		if bestRule == -1 || bestLen == 0 { // No rule consumed any input at this position.
+			_, size := utf8.DecodeRuneInString(input[pos:]) // size is 1 for an invalid UTF-8 byte, so pos always advances.
+			errorList = append(errorList, fmt.Errorf("unknown character '%v'", input[pos:pos+size]))
+			pos += size
+			column++ // The skipped rune (or invalid byte) still occupies one column.
+			continue
+		}
+
+		matched := input[pos : pos+bestLen]
+		r := s.rules[bestRule]
+
+		if !r.skip {
+			tokens = append(tokens, Token[T]{
+				Type:   r.typ,
+				Value:  matched,
+				Column: column, // Column of the token's first rune.
+			})
+		}
+
+		column += utf8.RuneCountInString(matched) // Columns advance by runes, while pos advances by bytes.
+		pos += bestLen
+	}
+
+	return tokens, errors.Join(errorList...) // errors.Join returns nil when no errors were recorded.
+}
+
+// compileAnchored compiles pattern wrapped as \A(?:pattern), so it can only match
+// at the very start of the text it is run against (Scan passes input[pos:]).
+// Patterns must not be pre-anchored; an invalid pattern makes regexp.MustCompile panic.
+func compileAnchored(pattern string) *regexp.Regexp {
+	return regexp.MustCompile(`\A(?:` + pattern + `)`) // The non-capturing group applies \A to every alternative in pattern.
+}
--- a/pkg/token/token.go
+++ b/pkg/token/token.go
@@ -18,18 +18,6 @@ type Token[T Type] struct {
 	Value  string // The value of the token.
 }

-// New creates a Token of the given type at the given column.
-// The token's value is derived from its type's Name method.
-func New[T Type](typ T, column int) *Token[T] {
-	return &Token[T]{Type: typ, Column: column, Value: typ.Name()}
-}
-
-// NewAtom creates a Token of the given type with a custom value at the given
-// column.
-func NewAtom[T Type](typ T, name string, column int) *Token[T] {
-	return &Token[T]{Type: typ, Column: column, Value: name}
-}
-
 // Name returns the type of the Token, as a string.
 func (t Token[T]) Name() string {
 	return t.Type.Name()