lambda/pkg/token/scanner.go

package token

import (
	"errors"
	"fmt"
	"regexp"
	"unicode/utf8"
)

// A rule describes a single lexical pattern for the scanner.
type rule[T Type] struct {
	pattern *regexp.Regexp
	typ     T
	skip    bool
}

// A Scanner is a declarative lexer configured by registering regex rules.
// At each position in the input, all rules are tested and the longest match
// wins.
// Ties are broken by registration order (first registered wins).
type Scanner[T Type] struct {
	rules []rule[T]
}

// NewScanner creates a new Scanner with no rules.
func NewScanner[T Type]() *Scanner[T] {
	return &Scanner[T]{}
}

// On registers a rule that emits a token of the given type when the pattern
// matches.
// The token's value is the matched text.
func (s *Scanner[T]) On(pattern string, typ T) *Scanner[T] {
	s.rules = append(s.rules, rule[T]{
		pattern: compileAnchored(pattern),
		typ:     typ,
	})
	return s
}

// Skip registers a rule that consumes matching text without emitting a token.
// This is used for whitespace and comments.
func (s *Scanner[T]) Skip(pattern string) *Scanner[T] {
	var zero T
	s.rules = append(s.rules, rule[T]{
		pattern: compileAnchored(pattern),
		typ:     zero,
		skip:    true,
	})
	return s
}

// Scan tokenizes the input string using the registered rules.
// At each position, all rules are tested and the longest match wins.
// If no rule matches, an error is recorded and the scanner advances one rune.
func (s *Scanner[T]) Scan(input string) ([]Token[T], error) {
	tokens := []Token[T]{}
	errorList := []error{}
	pos := 0
	column := 0

	for pos < len(input) {
		bestLen := 0
		bestRule := -1

		for idx, r := range s.rules {
			loc := r.pattern.FindStringIndex(input[pos:])
			if loc == nil {
				continue
			}
			if matchLen := loc[1]; matchLen > bestLen {
				bestLen = matchLen
				bestRule = idx
			}
		}

		if bestRule == -1 || bestLen == 0 {
			_, size := utf8.DecodeRuneInString(input[pos:])
			errorList = append(errorList, fmt.Errorf("unknown character '%v'", input[pos:pos+size]))
			pos += size
			column++
			continue
		}

		matched := input[pos : pos+bestLen]
		r := s.rules[bestRule]

		if !r.skip {
			tokens = append(tokens, Token[T]{
				Type:   r.typ,
				Value:  matched,
				Column: column,
			})
		}

		column += utf8.RuneCountInString(matched)
		pos += bestLen
	}

	return tokens, errors.Join(errorList...)
}

// compileAnchored compiles a regex pattern, prepending \A so it only matches
// at the current scan position.
// Patterns must not be pre-anchored.
func compileAnchored(pattern string) *regexp.Regexp {
	return regexp.MustCompile(`\A(?:` + pattern + `)`)
}