package token

import (
	"errors"
	"fmt"
	"regexp"
	"unicode/utf8"
)

// A rule describes a single lexical pattern for the scanner.
type rule[T Type] struct {
	// pattern is the compiled expression, anchored by compileAnchored so it
	// can only match at the start of the text it is given.
	pattern *regexp.Regexp
	// typ is the token type emitted on a match; it is left at its zero value
	// for skip rules.
	typ T
	// skip marks rules whose matched text is consumed without emitting a
	// token.
	skip bool
}

// A Scanner is a declarative lexer configured by registering regex rules.
// At each position in the input, all rules are tested and the longest match
// wins.
// Ties are broken by registration order (first registered wins).
type Scanner[T Type] struct {
	rules []rule[T]
}

// NewScanner creates a new Scanner with no rules.
func NewScanner[T Type]() *Scanner[T] {
	return &Scanner[T]{}
}

// On registers a rule that emits a token of the given type when the pattern
// matches.
// The token's value is the matched text.
// On returns the receiver so registrations can be chained.
// It panics if pattern is not a valid regular expression.
func (s *Scanner[T]) On(pattern string, typ T) *Scanner[T] {
	s.rules = append(s.rules, rule[T]{
		pattern: compileAnchored(pattern),
		typ:     typ,
	})
	return s
}

// Skip registers a rule that consumes matching text without emitting a token.
// This is used for whitespace and comments.
// Skip returns the receiver so registrations can be chained.
// It panics if pattern is not a valid regular expression.
func (s *Scanner[T]) Skip(pattern string) *Scanner[T] {
	s.rules = append(s.rules, rule[T]{
		pattern: compileAnchored(pattern),
		skip:    true,
	})
	return s
}

// Scan tokenizes the input string using the registered rules.
// At each position, all rules are tested and the longest match wins; ties go
// to the rule that was registered first.
//
// If no rule matches, an error naming the offending character is recorded and
// the scanner advances past that whole character, so a multi-byte UTF-8
// character is reported once rather than once per byte. A byte that is not
// valid UTF-8 is reported as U+FFFD and skipped on its own.
//
// Scan always returns the tokens it managed to produce. The returned error is
// nil when every position matched some rule; otherwise it joins all recorded
// errors in input order.
func (s *Scanner[T]) Scan(input string) ([]Token[T], error) {
	// Kept as an empty, non-nil slice so callers observe the same value as
	// before when no tokens are produced.
	tokens := []Token[T]{}
	var errorList []error

	for pos := 0; pos < len(input); {
		bestLen := 0
		bestRule := -1
		for idx, r := range s.rules {
			loc := r.pattern.FindStringIndex(input[pos:])
			// Patterns are anchored, so loc[0] is always 0 and loc[1] is the
			// match length. The strict comparison keeps the earliest rule on
			// ties and rejects empty matches, which would never advance pos.
			if loc != nil && loc[1] > bestLen {
				bestLen = loc[1]
				bestRule = idx
			}
		}

		if bestRule == -1 {
			// Decode a full rune instead of a single byte so the message
			// shows the real character and scanning never resumes in the
			// middle of a multi-byte sequence. size is at least 1 here
			// because input[pos:] is non-empty.
			r, size := utf8.DecodeRuneInString(input[pos:])
			errorList = append(errorList, fmt.Errorf("unknown character '%v'", string(r)))
			pos += size
			continue
		}

		if r := s.rules[bestRule]; !r.skip {
			tokens = append(tokens, Token[T]{
				Type:  r.typ,
				Value: input[pos : pos+bestLen],
				// Column is the 0-based byte offset of the match in input.
				Column: pos,
			})
		}
		pos += bestLen
	}

	return tokens, errors.Join(errorList...)
}

// compileAnchored compiles a regex pattern, prepending \A so it only matches
// at the current scan position.
// The pattern is wrapped in a non-capturing group so that top-level
// alternations are anchored as a whole.
// Patterns must not be pre-anchored.
// It panics if the resulting expression does not compile.
func compileAnchored(pattern string) *regexp.Regexp {
	return regexp.MustCompile(`\A(?:` + pattern + `)`)
}