package token

import (
	"cmp"
	"errors"
	"fmt"
	"regexp"
	"slices"
)

// A rule describes a single lexical pattern for the scanner.
type rule[T Type] struct {
	pattern    *regexp.Regexp // anchored regex tried at the current scan position
	typ        T              // token type emitted on a match; not read when skip is set
	precedence int            // higher values are tried first
	skip       bool           // when true, the match is consumed without emitting a token
}

// compare orders rules by descending precedence: the result is negative when
// r has the higher precedence, zero on a tie, and positive otherwise.
//
// cmp.Compare is used instead of subtracting the two precedences so that
// extreme values cannot overflow int and silently flip the ordering.
func (r rule[T]) compare(other rule[T]) int {
	return cmp.Compare(other.precedence, r.precedence)
}

// An Option configures a Scanner during construction.
// It receives the rules registered so far and returns the extended list.
type Option[T Type] func(rules []rule[T]) []rule[T]

// On returns an option that registers a token-emitting rule.
// The token's value is the matched text.
// Higher precedence rules are tried first.
//
// The pattern is compiled when the option is applied (that is, inside
// NewScanner), and an invalid pattern panics at that point.
func On[T Type](pattern string, typ T, precedence int) Option[T] {
	return func(rules []rule[T]) []rule[T] {
		emitting := rule[T]{
			pattern:    compileAnchored(pattern),
			typ:        typ,
			precedence: precedence,
		}
		return append(rules, emitting)
	}
}

// Skip returns an option that registers a non-emitting rule.
// This is used for whitespace and comments.
// Higher precedence rules are tried first.
//
// The pattern is compiled when the option is applied (that is, inside
// NewScanner), and an invalid pattern panics at that point.
func Skip[T Type](pattern string, precedence int) Option[T] {
	return func(rules []rule[T]) []rule[T] {
		skipping := rule[T]{
			pattern:    compileAnchored(pattern),
			precedence: precedence,
			skip:       true,
		}
		return append(rules, skipping)
	}
}

// A Scanner is a declarative lexer built from a set of regex rules.
// Rules are sorted by precedence (highest first), with registration order as
// tiebreaker.
// At each position, the first matching rule wins.
type Scanner[T Type] struct {
	rules []rule[T]
}

// NewScanner creates a Scanner by applying the given options in order and
// sorting the resulting rules by precedence.
// The sort is stable, so rules with equal precedence keep their registration
// order.
func NewScanner[T Type](opts ...Option[T]) *Scanner[T] {
	var rules []rule[T]
	for _, apply := range opts {
		rules = apply(rules)
	}
	slices.SortStableFunc(rules, rule[T].compare)
	return &Scanner[T]{rules: rules}
}

// scanOne tries each rule at the current position and returns the first match.
// Returns the token (or nil if skipped) and the number of bytes consumed.
// Returns 0 if no rule matched.
func (s *Scanner[T]) scanOne(input string, pos int) (*Token[T], int) { for _, r := range s.rules { loc := r.pattern.FindStringIndex(input[pos:]) if loc == nil || loc[1] == 0 { continue } if r.skip { return nil, loc[1] } return &Token[T]{ Type: r.typ, Value: input[pos : pos+loc[1]], Column: pos, }, loc[1] } return nil, 0 } // Scan tokenizes the input string using the registered rules. // At each position, rules are tried in precedence order and the first match // wins. // If no rule matches, an error is recorded and the scanner advances one byte. func (s *Scanner[T]) Scan(input string) ([]Token[T], error) { tokens := []Token[T]{} errorList := []error{} for pos := 0; pos < len(input); { tok, n := s.scanOne(input, pos) if n == 0 { errorList = append(errorList, fmt.Errorf("unknown character '%v'", string(input[pos]))) pos++ continue } if tok != nil { tokens = append(tokens, *tok) } pos += n } return tokens, errors.Join(errorList...) } // compileAnchored compiles a regex pattern, prepending \A so it only matches // at the current scan position. // Patterns must not be pre-anchored. func compileAnchored(pattern string) *regexp.Regexp { return regexp.MustCompile(`\A(?:` + pattern + `)`) }