diff --git a/pkg/lambda/scan.go b/pkg/lambda/scan.go index 14b5e12..b1f08ad 100644 --- a/pkg/lambda/scan.go +++ b/pkg/lambda/scan.go @@ -1,45 +1,17 @@ package lambda -import ( - "fmt" - "unicode" +import "git.maximhutz.com/max/lambda/pkg/token" - "git.maximhutz.com/max/lambda/pkg/iterator" - "git.maximhutz.com/max/lambda/pkg/token" -) - -// scanToken pulls the next lambda calculus token from a rune iterator. -func scanToken(i *iterator.Iterator[rune]) (*lambdaToken, error) { - index := i.Index() - - if i.Done() { - return nil, nil - } - - letter, err := i.Next() - if err != nil { - return nil, fmt.Errorf("cannot produce next token: %w", err) - } - - switch { - case letter == '(': - return token.New(tokenOpenParen, index), nil - case letter == ')': - return token.New(tokenCloseParen, index), nil - case letter == '\\': - return token.New(tokenSlash, index), nil - case letter == '.': - return token.New(tokenDot, index), nil - case unicode.IsSpace(letter): - return nil, nil - case token.IsVariable(letter): - return token.ScanAtom(i, letter, tokenAtom, index), nil - } - - return nil, fmt.Errorf("unknown character '%v'", string(letter)) -} +// scanner is the declarative lexer for the lambda calculus; atoms are runs of Unicode letters, numbers, and underscores. +var scanner = token.NewScanner[tokenType](). + On(`\(`, tokenOpenParen). + On(`\)`, tokenCloseParen). + On(`\\`, tokenSlash). + On(`\.`, tokenDot). + On(`[\p{L}\p{N}_]+`, tokenAtom). + Skip(`\s+`) // scan tokenizes an input string into lambda calculus tokens. 
func scan(input string) ([]lambdaToken, error) { - return token.Scan(input, scanToken) + return scanner.Scan(input) } diff --git a/pkg/saccharine/scan.go b/pkg/saccharine/scan.go index fd79d39..f784fef 100644 --- a/pkg/saccharine/scan.go +++ b/pkg/saccharine/scan.go @@ -1,64 +1,23 @@ package saccharine -import ( - "fmt" - "unicode" +import "git.maximhutz.com/max/lambda/pkg/token" - "git.maximhutz.com/max/lambda/pkg/iterator" - "git.maximhutz.com/max/lambda/pkg/token" -) +// scanner is the declarative lexer for the Saccharine language; atoms are runs of Unicode letters, numbers, and underscores. +var scanner = token.NewScanner[TokenType](). + On(`:=`, TokenAssign). + On(`\(`, TokenOpenParen). + On(`\)`, TokenCloseParen). + On(`\{`, TokenOpenBrace). + On(`\}`, TokenCloseBrace). + On(`;`, TokenHardBreak). + On(`\n`, TokenSoftBreak). + On(`\\`, TokenSlash). + On(`\.`, TokenDot). + On(`[\p{L}\p{N}_]+`, TokenAtom). + Skip(`#[^\n]*`). + Skip(`[^\S\n]+`) -// Pulls the next token from an iterator over runes. If it cannot, it will -// return nil. If an error occurs, it will return that. 
-func scanToken(i *iterator.Iterator[rune]) (*Token, error) { - index := i.Index() - - if i.Done() { - return nil, nil - } - - letter, err := i.Next() - if err != nil { - return nil, fmt.Errorf("cannot produce next token: %w", err) - } - - switch { - case letter == '(': - return token.New(TokenOpenParen, index), nil - case letter == ')': - return token.New(TokenCloseParen, index), nil - case letter == '.': - return token.New(TokenDot, index), nil - case letter == '\\': - return token.New(TokenSlash, index), nil - case letter == '\n': - return token.New(TokenSoftBreak, index), nil - case letter == '{': - return token.New(TokenOpenBrace, index), nil - case letter == '}': - return token.New(TokenCloseBrace, index), nil - case letter == ':': - if _, err := token.ScanCharacter(i, '='); err != nil { - return nil, err - } else { - return token.New(TokenAssign, index), nil - } - case letter == ';': - return token.New(TokenHardBreak, index), nil - case letter == '#': - // Skip everything until the next newline or EOF. - i.While(func(r rune) bool { return r != '\n' }) - return nil, nil - case unicode.IsSpace(letter): - return nil, nil - case token.IsVariable(letter): - return token.ScanAtom(i, letter, TokenAtom, index), nil - } - - return nil, fmt.Errorf("unknown character '%v'", string(letter)) -} - -// scan a string into tokens. +// scan tokenizes a string into Saccharine tokens. 
func scan(input string) ([]Token, error) { - return token.Scan(input, scanToken) + return scanner.Scan(input) } diff --git a/pkg/token/parse.go b/pkg/token/parse.go index 5d39207..ce88f31 100644 --- a/pkg/token/parse.go +++ b/pkg/token/parse.go @@ -16,7 +16,7 @@ func ParseRawToken[T Type](i *iterator.Iterator[Token[T]], expected T) (*Token[T return nil, err } if tok.Type != expected { - return nil, fmt.Errorf("expected token %v, got %v'", expected.Name(), tok.Value) + return nil, fmt.Errorf("expected token '%v', got '%v'", expected.Name(), tok.Value) } i.Forward() return &tok, nil diff --git a/pkg/token/scan.go b/pkg/token/scan.go deleted file mode 100644 index bf5fa63..0000000 --- a/pkg/token/scan.go +++ /dev/null @@ -1,74 +0,0 @@ -package token - -import ( - "errors" - "fmt" - "unicode" - - "git.maximhutz.com/max/lambda/pkg/iterator" -) - -// IsVariable determines whether a rune can be a valid variable character. -func IsVariable(r rune) bool { - return unicode.IsLetter(r) || unicode.IsNumber(r) || r == '_' -} - -// ScanRune consumes the next rune from the iterator if it satisfies the -// predicate. -// Returns an error if the iterator is exhausted or the rune does not match. -func ScanRune(i *iterator.Iterator[rune], expected func(rune) bool) (rune, error) { - r, err := i.Get() - if err != nil { - return r, err - } - if !expected(r) { - return r, fmt.Errorf("got unexpected rune %v'", r) - } - i.Forward() - return r, nil -} - -// ScanCharacter consumes the next rune from the iterator if it matches the -// expected rune exactly. -// Returns an error if the iterator is exhausted or the rune does not match. -func ScanCharacter(i *iterator.Iterator[rune], expected rune) (rune, error) { - return ScanRune(i, func(r rune) bool { return r == expected }) -} - -// ScanAtom scans a contiguous sequence of variable characters into a single -// atom token. -// The first rune has already been consumed and is passed in. 
-func ScanAtom[T Type](i *iterator.Iterator[rune], first rune, typ T, column int) *Token[T] { - atom := []rune{first} - - for { - if r, err := ScanRune(i, IsVariable); err != nil { - break - } else { - atom = append(atom, r) - } - } - - return NewAtom(typ, string(atom), column) -} - -// Scan tokenizes an input string using a language-specific scanToken function. -// The scanToken function is called repeatedly until the input is exhausted. -// It returns nil (no token, no error) for skippable input like whitespace. -// Errors are accumulated and returned joined at the end. -func Scan[T Type](input string, scanToken func(*iterator.Iterator[rune]) (*Token[T], error)) ([]Token[T], error) { - i := iterator.Of([]rune(input)) - tokens := []Token[T]{} - errorList := []error{} - - for !i.Done() { - token, err := scanToken(i) - if err != nil { - errorList = append(errorList, err) - } else if token != nil { - tokens = append(tokens, *token) - } - } - - return tokens, errors.Join(errorList...) -} diff --git a/pkg/token/scanner.go b/pkg/token/scanner.go new file mode 100644 index 0000000..e423fc4 --- /dev/null +++ b/pkg/token/scanner.go @@ -0,0 +1,108 @@ +package token + +import ( + "errors" + "fmt" + "regexp" + "unicode/utf8" +) + +// A rule describes a single lexical pattern for the scanner. +type rule[T Type] struct { + pattern *regexp.Regexp + typ T + skip bool +} + +// A Scanner is a declarative lexer configured by registering regex rules. +// At each position in the input, all rules are tested and the longest match +// wins. +// Ties are broken by registration order (first registered wins). +type Scanner[T Type] struct { + rules []rule[T] +} + +// NewScanner creates a new Scanner with no rules. +func NewScanner[T Type]() *Scanner[T] { + return &Scanner[T]{} +} + +// On registers a rule that emits a token of the given type when the pattern +// matches. +// The token's value is the matched text. 
+func (s *Scanner[T]) On(pattern string, typ T) *Scanner[T] { + s.rules = append(s.rules, rule[T]{ + pattern: compileAnchored(pattern), + typ: typ, + }) + return s +} + +// Skip registers a rule that consumes matching text without emitting a token. +// This is used for whitespace and comments. +func (s *Scanner[T]) Skip(pattern string) *Scanner[T] { + var zero T + s.rules = append(s.rules, rule[T]{ + pattern: compileAnchored(pattern), + typ: zero, + skip: true, + }) + return s +} + +// Scan tokenizes the input string using the registered rules. +// At each position, all rules are tested and the longest match wins. +// If no rule matches, an error is recorded and the scanner advances one rune. +func (s *Scanner[T]) Scan(input string) ([]Token[T], error) { + tokens := []Token[T]{} + errorList := []error{} + pos := 0 + column := 0 + + for pos < len(input) { + bestLen := 0 + bestRule := -1 + + for idx, r := range s.rules { + loc := r.pattern.FindStringIndex(input[pos:]) + if loc == nil { + continue + } + if matchLen := loc[1]; matchLen > bestLen { + bestLen = matchLen + bestRule = idx + } + } + + if bestRule == -1 || bestLen == 0 { + _, size := utf8.DecodeRuneInString(input[pos:]) + errorList = append(errorList, fmt.Errorf("unknown character '%v'", input[pos:pos+size])) + pos += size + column++ + continue + } + + matched := input[pos : pos+bestLen] + r := s.rules[bestRule] + + if !r.skip { + tokens = append(tokens, Token[T]{ + Type: r.typ, + Value: matched, + Column: column, + }) + } + + column += utf8.RuneCountInString(matched) + pos += bestLen + } + + return tokens, errors.Join(errorList...) +} + +// compileAnchored compiles a regex pattern, prepending \A so it only matches +// at the current scan position. +// Patterns must not be pre-anchored. 
+func compileAnchored(pattern string) *regexp.Regexp { + return regexp.MustCompile(`\A(?:` + pattern + `)`) +} diff --git a/pkg/token/token.go b/pkg/token/token.go index bb402b2..ac75d8d 100644 --- a/pkg/token/token.go +++ b/pkg/token/token.go @@ -18,18 +18,6 @@ type Token[T Type] struct { Value string // The value of the token. } -// New creates a Token of the given type at the given column. -// The token's value is derived from its type's Name method. -func New[T Type](typ T, column int) *Token[T] { - return &Token[T]{Type: typ, Column: column, Value: typ.Name()} -} - -// NewAtom creates a Token of the given type with a custom value at the given -// column. -func NewAtom[T Type](typ T, name string, column int) *Token[T] { - return &Token[T]{Type: typ, Column: column, Value: name} -} - // Name returns the type of the Token, as a string. func (t Token[T]) Name() string { return t.Type.Name()