package token

import (
	"errors"
	"fmt"
	"regexp"
	"unicode/utf8"
)

// rule is one lexical pattern: a compiled, anchored regex together with the
// token type it produces and whether its matches are silently discarded.
type rule[T Type] struct {
	pattern *regexp.Regexp
	typ     T
	skip    bool
}

// Scanner is a declarative lexer. Rules are registered as regular
// expressions; while scanning, every rule is tried at the current position
// and the longest match wins, with ties resolved in favor of the rule that
// was registered first.
type Scanner[T Type] struct {
	rules []rule[T]
}

// NewScanner returns an empty Scanner; add rules with On and Skip.
func NewScanner[T Type]() *Scanner[T] {
	return &Scanner[T]{}
}

// add appends a rule and returns the receiver so calls can be chained.
func (s *Scanner[T]) add(r rule[T]) *Scanner[T] {
	s.rules = append(s.rules, r)
	return s
}

// On registers a rule that emits a token of type typ when the pattern
// matches; the token's value is the matched text.
func (s *Scanner[T]) On(pattern string, typ T) *Scanner[T] {
	return s.add(rule[T]{pattern: compileAnchored(pattern), typ: typ})
}

// Skip registers a rule whose matches are consumed without emitting a token.
// Typically used for whitespace and comments.
func (s *Scanner[T]) Skip(pattern string) *Scanner[T] {
	var zero T
	return s.add(rule[T]{pattern: compileAnchored(pattern), typ: zero, skip: true})
}

// Scan tokenizes the input string using the registered rules.
// At each position, all rules are tested and the longest match wins.
// If no rule matches, an error is recorded and the scanner advances one rune.
func (s *Scanner[T]) Scan(input string) ([]Token[T], error) { tokens := []Token[T]{} errorList := []error{} pos := 0 column := 0 for pos < len(input) { bestLen := 0 bestRule := -1 for idx, r := range s.rules { loc := r.pattern.FindStringIndex(input[pos:]) if loc == nil { continue } if matchLen := loc[1]; matchLen > bestLen { bestLen = matchLen bestRule = idx } } if bestRule == -1 || bestLen == 0 { _, size := utf8.DecodeRuneInString(input[pos:]) errorList = append(errorList, fmt.Errorf("unknown character '%v'", input[pos:pos+size])) pos += size column++ continue } matched := input[pos : pos+bestLen] r := s.rules[bestRule] if !r.skip { tokens = append(tokens, Token[T]{ Type: r.typ, Value: matched, Column: column, }) } column += utf8.RuneCountInString(matched) pos += bestLen } return tokens, errors.Join(errorList...) } // compileAnchored compiles a regex pattern, prepending \A so it only matches // at the current scan position. // Patterns must not be pre-anchored. func compileAnchored(pattern string) *regexp.Regexp { return regexp.MustCompile(`\A(?:` + pattern + `)`) }