From da3da708553211b5bfbe46a33a32c7f1a4ac41da Mon Sep 17 00:00:00 2001 From: "M.V. Hutz" Date: Thu, 12 Feb 2026 00:25:18 +0000 Subject: [PATCH] refactor: extract shared token package (#46) ## Description Both the `saccharine` and `lambda` packages need tokenizing and parsing primitives. This PR extracts shared token infrastructure into a new `pkg/token` package, then wires both languages up to use it. - Add `pkg/token` with a generic `Token[T]` type, `Scan`, `ScanAtom`, `ScanRune`, `ScanCharacter`, `IsVariable`, `ParseRawToken`, and `ParseList`. - Refactor `pkg/saccharine` to delegate to `pkg/token`, removing duplicated scanning and parsing helpers. - Implement `Codec.Decode` for `pkg/lambda` (scanner + parser) using the shared token package. - Add `iterator.While` for predicate-driven iteration. - Rename `iterator.Do` to `iterator.Try` to better describe its rollback semantics. ### Decisions - The `Type` constraint (`comparable` + `Name() string`) keeps the generic token flexible while ensuring every token type can produce readable error messages. - `iterator.Do` was renamed to `iterator.Try` since it describes a try/rollback operation, not a side-effecting "do". ## Benefits - Eliminates duplicated token, scanning, and parsing code between languages. - Enables the `lambda` package to decode (parse) lambda calculus strings, which was previously unimplemented. - Makes it straightforward to add new languages by reusing `pkg/token` primitives. ## Checklist - [x] Code follows conventional commit format. - [x] Branch follows naming convention (`<type>/<description>`). Always use underscores. - [x] Tests pass (if applicable). - [ ] Documentation updated (if applicable). Reviewed-on: https://git.maximhutz.com/mvhutz/lambda/pulls/46 Co-authored-by: M.V. Hutz Co-committed-by: M.V. 
Hutz --- pkg/iterator/iterator.go | 23 +++++++--- pkg/lambda/codec.go | 20 +++++---- pkg/lambda/parse.go | 85 +++++++++++++++++++++++++++++++++++ pkg/lambda/scan.go | 45 +++++++++++++++++++ pkg/lambda/token.go | 45 +++++++++++++++++++ pkg/saccharine/parse.go | 56 +++++++---------------- pkg/saccharine/scan.go | 95 +++++++--------------------------------- pkg/saccharine/token.go | 32 ++++---------- pkg/token/parse.go | 41 +++++++++++++++++ pkg/token/scan.go | 74 +++++++++++++++++++++++++++++++ pkg/token/token.go | 36 +++++++++++++++ 11 files changed, 392 insertions(+), 160 deletions(-) create mode 100644 pkg/lambda/parse.go create mode 100644 pkg/lambda/scan.go create mode 100644 pkg/lambda/token.go create mode 100644 pkg/token/parse.go create mode 100644 pkg/token/scan.go create mode 100644 pkg/token/token.go diff --git a/pkg/iterator/iterator.go b/pkg/iterator/iterator.go index bbcde01..6fd1241 100644 --- a/pkg/iterator/iterator.go +++ b/pkg/iterator/iterator.go @@ -43,12 +43,12 @@ func (i Iterator[T]) Get() (T, error) { // MustGet is a version of Get, that panics if the datum cannot be returned. func (i Iterator[T]) MustGet() T { - var null T - if i.Done() { - return null + t, err := i.Get() + if err != nil { + panic(fmt.Errorf("cannot get current token: %w", err)) } - return i.items[i.index] + return t } // Forward increments the iterator if the iterator is not yet at the end of the @@ -81,10 +81,21 @@ func (i Iterator[T]) Done() bool { return i.index == len(i.items) } -// Do attempts to perform an operation using the iterator. If the operation +// While increments the iterator as long as the current item satisfies the +// predicate. The first item that does not match is left unconsumed. +func (i *Iterator[T]) While(fn func(T) bool) { + for !i.Done() { + if !fn(i.MustGet()) { + return + } + i.Forward() + } +} + +// Try attempts to perform an operation using the iterator. If the operation // succeeds, the iterator is updated. 
If the operation fails, the iterator is // rolled back, and an error is returned. -func Do[T any, U any](i *Iterator[T], fn func(i *Iterator[T]) (U, error)) (U, error) { +func Try[T any, U any](i *Iterator[T], fn func(i *Iterator[T]) (U, error)) (U, error) { i2 := i.Copy() out, err := fn(i2) diff --git a/pkg/lambda/codec.go b/pkg/lambda/codec.go index 32d19df..c840dba 100644 --- a/pkg/lambda/codec.go +++ b/pkg/lambda/codec.go @@ -1,23 +1,25 @@ package lambda import ( - "fmt" - "git.maximhutz.com/max/lambda/pkg/codec" ) // A Codec is a [codec.Codec] that serializes lambda calculus expressions. -// Decode is not implemented and always returns an error. -// Encode stringifies an expression using standard lambda notation. type Codec struct{} -// Decode parses a string as lambda calculus. Returns an error if it cannot. -func (m Codec) Decode(string) (Expression, error) { - return nil, fmt.Errorf("unimplemented") +// Decode parses a string as lambda calculus. +// Returns an error if it cannot. +func (m Codec) Decode(s string) (Expression, error) { + tokens, err := scan(s) + if err != nil { + return nil, err + } + + return parse(tokens) } -// Encode turns a lambda calculus expression into a string. Returns an error if -// it cannot. +// Encode turns a lambda calculus expression into a string. +// Returns an error if it cannot. 
func (m Codec) Encode(e Expression) (string, error) { return Stringify(e), nil } diff --git a/pkg/lambda/parse.go b/pkg/lambda/parse.go new file mode 100644 index 0000000..b5859fa --- /dev/null +++ b/pkg/lambda/parse.go @@ -0,0 +1,85 @@ +package lambda + +import ( + "fmt" + + "git.maximhutz.com/max/lambda/pkg/iterator" + "git.maximhutz.com/max/lambda/pkg/token" +) + +type tokenIterator = iterator.Iterator[lambdaToken] + +func parseVariable(i *tokenIterator) (Expression, error) { + if tok, err := token.ParseRawToken(i, tokenAtom); err != nil { + return nil, fmt.Errorf("expected variable (col %d): %w", i.Index(), err) + } else { + return Variable{Name: tok.Value}, nil + } +} + +func parseAbstraction(i *tokenIterator) (Expression, error) { + return iterator.Try(i, func(i *tokenIterator) (Expression, error) { + if _, err := token.ParseRawToken(i, tokenSlash); err != nil { + return nil, fmt.Errorf("no backslash (col %d): %w", i.Index(), err) + } else if param, err := token.ParseRawToken(i, tokenAtom); err != nil { + return nil, fmt.Errorf("no param (col %d): %w", i.Index(), err) + } else if _, err := token.ParseRawToken(i, tokenDot); err != nil { + return nil, fmt.Errorf("no dot (col %d): %w", i.Index(), err) + } else if body, err := parseExpression(i); err != nil { + return nil, err + } else { + return Abstraction{Parameter: param.Value, Body: body}, nil + } + }) +} + +func parseApplication(i *tokenIterator) (Expression, error) { + return iterator.Try(i, func(i *tokenIterator) (Expression, error) { + if _, err := token.ParseRawToken(i, tokenOpenParen); err != nil { + return nil, fmt.Errorf("no opening paren (col %d): %w", i.Index(), err) + } else if abstraction, err := parseExpression(i); err != nil { + return nil, fmt.Errorf("expected function expression: %w", err) + } else if argument, err := parseExpression(i); err != nil { + return nil, fmt.Errorf("expected argument expression: %w", err) + } else if _, err := token.ParseRawToken(i, tokenCloseParen); err != nil { + 
return nil, fmt.Errorf("no closing paren (col %d): %w", i.Index(), err) + } else { + return Application{Abstraction: abstraction, Argument: argument}, nil + } + }) +} + +func parseExpression(i *tokenIterator) (Expression, error) { + return iterator.Try(i, func(i *tokenIterator) (Expression, error) { + if i.Done() { + return nil, fmt.Errorf("unexpected end of input") + } + + switch peek := i.MustGet(); peek.Type { + case tokenOpenParen: + return parseApplication(i) + case tokenSlash: + return parseAbstraction(i) + case tokenAtom: + return parseVariable(i) + default: + return nil, fmt.Errorf("expected expression, got '%v' (col %d)", peek.Value, peek.Column) + } + }) +} + +// parse converts a token slice into a lambda calculus expression. +func parse(tokens []lambdaToken) (Expression, error) { + i := iterator.Of(tokens) + + exp, err := parseExpression(i) + if err != nil { + return nil, err + } + + if !i.Done() { + return nil, fmt.Errorf("expected EOF, found more tokens (col %d)", i.MustGet().Column) + } + + return exp, nil +} diff --git a/pkg/lambda/scan.go b/pkg/lambda/scan.go new file mode 100644 index 0000000..14b5e12 --- /dev/null +++ b/pkg/lambda/scan.go @@ -0,0 +1,45 @@ +package lambda + +import ( + "fmt" + "unicode" + + "git.maximhutz.com/max/lambda/pkg/iterator" + "git.maximhutz.com/max/lambda/pkg/token" +) + +// scanToken pulls the next lambda calculus token from a rune iterator. 
+func scanToken(i *iterator.Iterator[rune]) (*lambdaToken, error) { + index := i.Index() + + if i.Done() { + return nil, nil + } + + letter, err := i.Next() + if err != nil { + return nil, fmt.Errorf("cannot produce next token: %w", err) + } + + switch { + case letter == '(': + return token.New(tokenOpenParen, index), nil + case letter == ')': + return token.New(tokenCloseParen, index), nil + case letter == '\\': + return token.New(tokenSlash, index), nil + case letter == '.': + return token.New(tokenDot, index), nil + case unicode.IsSpace(letter): + return nil, nil + case token.IsVariable(letter): + return token.ScanAtom(i, letter, tokenAtom, index), nil + } + + return nil, fmt.Errorf("unknown character '%v'", string(letter)) +} + +// scan tokenizes an input string into lambda calculus tokens. +func scan(input string) ([]lambdaToken, error) { + return token.Scan(input, scanToken) +} diff --git a/pkg/lambda/token.go b/pkg/lambda/token.go new file mode 100644 index 0000000..18f745f --- /dev/null +++ b/pkg/lambda/token.go @@ -0,0 +1,45 @@ +package lambda + +import ( + "fmt" + + "git.maximhutz.com/max/lambda/pkg/token" +) + +// A tokenType is an identifier for any token in the lambda calculus. +type tokenType int + +// All official tokens of the lambda calculus. +const ( + // tokenOpenParen denotes the '(' token. + tokenOpenParen tokenType = iota + // tokenCloseParen denotes the ')' token. + tokenCloseParen + // tokenSlash denotes the '\' token. + tokenSlash + // tokenDot denotes the '.' token. + tokenDot + // tokenAtom denotes an alpha-numeric variable. + tokenAtom +) + +// Name returns the type of the tokenType, as a string. +func (t tokenType) Name() string { + switch t { + case tokenOpenParen: + return "(" + case tokenCloseParen: + return ")" + case tokenSlash: + return "\\" + case tokenDot: + return "." 
+ case tokenAtom: + return "ATOM" + default: + panic(fmt.Errorf("unknown token type %v", t)) + } +} + +// lambdaToken is the concrete token type for the lambda calculus. +type lambdaToken = token.Token[tokenType] diff --git a/pkg/saccharine/parse.go b/pkg/saccharine/parse.go index 60a8736..8620ba9 100644 --- a/pkg/saccharine/parse.go +++ b/pkg/saccharine/parse.go @@ -5,37 +5,26 @@ import ( "fmt" "git.maximhutz.com/max/lambda/pkg/iterator" + "git.maximhutz.com/max/lambda/pkg/token" ) type tokenIterator = iterator.Iterator[Token] -func parseRawToken(i *tokenIterator, expected TokenType) (*Token, error) { - return iterator.Do(i, func(i *tokenIterator) (*Token, error) { - if tok, err := i.Next(); err != nil { - return nil, err - } else if tok.Type != expected { - return nil, fmt.Errorf("expected token %v, got %v'", expected.Name(), tok.Value) - } else { - return &tok, nil - } - }) -} - func passSoftBreaks(i *tokenIterator) { for { - if _, err := parseRawToken(i, TokenSoftBreak); err != nil { + if _, err := token.ParseRawToken(i, TokenSoftBreak); err != nil { return } } } func parseToken(i *tokenIterator, expected TokenType, ignoreSoftBreaks bool) (*Token, error) { - return iterator.Do(i, func(i *tokenIterator) (*Token, error) { + return iterator.Try(i, func(i *tokenIterator) (*Token, error) { if ignoreSoftBreaks { passSoftBreaks(i) } - return parseRawToken(i, expected) + return token.ParseRawToken(i, expected) }) } @@ -48,35 +37,20 @@ func parseString(i *tokenIterator) (string, error) { } func parseBreak(i *tokenIterator) (*Token, error) { - if tok, softErr := parseRawToken(i, TokenSoftBreak); softErr == nil { + if tok, softErr := token.ParseRawToken(i, TokenSoftBreak); softErr == nil { return tok, nil - } else if tok, hardErr := parseRawToken(i, TokenHardBreak); hardErr == nil { + } else if tok, hardErr := token.ParseRawToken(i, TokenHardBreak); hardErr == nil { return tok, nil } else { return nil, errors.Join(softErr, hardErr) } } -func parseList[U any](i 
*tokenIterator, fn func(*tokenIterator) (U, error), minimum int) ([]U, error) { - results := []U{} - - for { - if u, err := fn(i); err != nil { - if len(results) < minimum { - return nil, fmt.Errorf("expected at least '%v' items, got only '%v': %w", minimum, len(results), err) - } - return results, nil - } else { - results = append(results, u) - } - } -} - func parseAbstraction(i *tokenIterator) (*Abstraction, error) { - return iterator.Do(i, func(i *tokenIterator) (*Abstraction, error) { + return iterator.Try(i, func(i *tokenIterator) (*Abstraction, error) { if _, err := parseToken(i, TokenSlash, true); err != nil { return nil, fmt.Errorf("no function slash (col %d): %w", i.MustGet().Column, err) - } else if parameters, err := parseList(i, parseString, 0); err != nil { + } else if parameters, err := token.ParseList(i, parseString, 0); err != nil { return nil, err } else if _, err = parseToken(i, TokenDot, true); err != nil { return nil, fmt.Errorf("no function dot (col %d): %w", i.MustGet().Column, err) @@ -89,10 +63,10 @@ func parseAbstraction(i *tokenIterator) (*Abstraction, error) { } func parseApplication(i *tokenIterator) (*Application, error) { - return iterator.Do(i, func(i *tokenIterator) (*Application, error) { + return iterator.Try(i, func(i *tokenIterator) (*Application, error) { if _, err := parseToken(i, TokenOpenParen, true); err != nil { return nil, fmt.Errorf("no openning brackets (col %d): %w", i.MustGet().Column, err) - } else if expressions, err := parseList(i, parseExpression, 1); err != nil { + } else if expressions, err := token.ParseList(i, parseExpression, 1); err != nil { return nil, err } else if _, err := parseToken(i, TokenCloseParen, true); err != nil { return nil, fmt.Errorf("no closing brackets (col %d): %w", i.MustGet().Column, err) @@ -114,12 +88,12 @@ func parseStatements(i *tokenIterator) ([]Statement, error) { statements := []Statement{} //nolint:errcheck - parseList(i, parseBreak, 0) + token.ParseList(i, parseBreak, 0) for { if 
statement, err := parseStatement(i); err != nil { break - } else if _, err := parseList(i, parseBreak, 1); err != nil && !i.Done() { + } else if _, err := token.ParseList(i, parseBreak, 1); err != nil && !i.Done() { break } else { statements = append(statements, statement) @@ -159,7 +133,7 @@ func parseClause(i *tokenIterator, braces bool) (*Clause, error) { } func parseExpression(i *tokenIterator) (Expression, error) { - return iterator.Do(i, func(i *tokenIterator) (Expression, error) { + return iterator.Try(i, func(i *tokenIterator) (Expression, error) { passSoftBreaks(i) switch peek := i.MustGet(); peek.Type { @@ -178,8 +152,8 @@ func parseExpression(i *tokenIterator) (Expression, error) { } func parseLet(i *tokenIterator) (*LetStatement, error) { - return iterator.Do(i, func(i *tokenIterator) (*LetStatement, error) { - if parameters, err := parseList(i, parseString, 1); err != nil { + return iterator.Try(i, func(i *tokenIterator) (*LetStatement, error) { + if parameters, err := token.ParseList(i, parseString, 1); err != nil { return nil, err } else if _, err := parseToken(i, TokenAssign, true); err != nil { return nil, err diff --git a/pkg/saccharine/scan.go b/pkg/saccharine/scan.go index 762900b..fd79d39 100644 --- a/pkg/saccharine/scan.go +++ b/pkg/saccharine/scan.go @@ -1,44 +1,13 @@ package saccharine import ( - "errors" "fmt" "unicode" "git.maximhutz.com/max/lambda/pkg/iterator" + "git.maximhutz.com/max/lambda/pkg/token" ) -// isVariables determines whether a rune can be a valid variable. 
-func isVariable(r rune) bool { - return unicode.IsLetter(r) || unicode.IsNumber(r) -} - -func scanRune(i *iterator.Iterator[rune], expected func(rune) bool) (rune, error) { - i2 := i.Copy() - - if r, err := i2.Next(); err != nil { - return r, err - } else if !expected(r) { - return r, fmt.Errorf("got unexpected rune %v'", r) - } else { - i.Sync(i2) - return r, nil - } -} - -func scanCharacter(i *iterator.Iterator[rune], expected rune) (rune, error) { - i2 := i.Copy() - - if r, err := i2.Next(); err != nil { - return r, err - } else if r != expected { - return r, fmt.Errorf("got unexpected rune %v'", r) - } else { - i.Sync(i2) - return r, nil - } -} - // Pulls the next token from an iterator over runes. If it cannot, it will // return nil. If an error occurs, it will return that. func scanToken(i *iterator.Iterator[rune]) (*Token, error) { @@ -55,56 +24,35 @@ func scanToken(i *iterator.Iterator[rune]) (*Token, error) { switch { case letter == '(': - return NewToken(TokenOpenParen, index), nil + return token.New(TokenOpenParen, index), nil case letter == ')': - return NewToken(TokenCloseParen, index), nil + return token.New(TokenCloseParen, index), nil case letter == '.': - return NewToken(TokenDot, index), nil + return token.New(TokenDot, index), nil case letter == '\\': - return NewToken(TokenSlash, index), nil + return token.New(TokenSlash, index), nil case letter == '\n': - return NewToken(TokenSoftBreak, index), nil + return token.New(TokenSoftBreak, index), nil case letter == '{': - return NewToken(TokenOpenBrace, index), nil + return token.New(TokenOpenBrace, index), nil case letter == '}': - return NewToken(TokenCloseBrace, index), nil + return token.New(TokenCloseBrace, index), nil case letter == ':': - if _, err := scanCharacter(i, '='); err != nil { + if _, err := token.ScanCharacter(i, '='); err != nil { return nil, err } else { - return NewToken(TokenAssign, index), nil + return token.New(TokenAssign, index), nil } case letter == ';': - return 
NewToken(TokenHardBreak, index), nil + return token.New(TokenHardBreak, index), nil case letter == '#': // Skip everything until the next newline or EOF. - for !i.Done() { - r, err := i.Next() - if err != nil { - return nil, fmt.Errorf("error while parsing comment: %w", err) - } - - if r == '\n' { - // Put the newline back so it can be processed as a soft break. - i.Back() - break - } - } + i.While(func(r rune) bool { return r != '\n' }) return nil, nil case unicode.IsSpace(letter): return nil, nil - case isVariable(letter): - atom := []rune{letter} - - for { - if r, err := scanRune(i, isVariable); err != nil { - break - } else { - atom = append(atom, r) - } - } - - return NewTokenAtom(string(atom), index), nil + case token.IsVariable(letter): + return token.ScanAtom(i, letter, TokenAtom, index), nil } return nil, fmt.Errorf("unknown character '%v'", string(letter)) @@ -112,18 +60,5 @@ func scanToken(i *iterator.Iterator[rune]) (*Token, error) { // scan a string into tokens. func scan(input string) ([]Token, error) { - i := iterator.Of([]rune(input)) - tokens := []Token{} - errorList := []error{} - - for !i.Done() { - token, err := scanToken(i) - if err != nil { - errorList = append(errorList, err) - } else if token != nil { - tokens = append(tokens, *token) - } - } - - return tokens, errors.Join(errorList...) + return token.Scan(input, scanToken) } diff --git a/pkg/saccharine/token.go b/pkg/saccharine/token.go index c7a1ac6..1fcb580 100644 --- a/pkg/saccharine/token.go +++ b/pkg/saccharine/token.go @@ -1,6 +1,10 @@ package saccharine -import "fmt" +import ( + "fmt" + + "git.maximhutz.com/max/lambda/pkg/token" +) // A TokenType is an identifier for any token in the Saccharine language. type TokenType int @@ -21,7 +25,7 @@ const ( TokenAssign // TokenAtom denotes an alpha-numeric variable. TokenAtom - // TokenSlash denotes the '/' token. + // TokenSlash denotes the '\\' token. TokenSlash // TokenDot denotes the '.' token. 
TokenDot @@ -29,24 +33,6 @@ const ( TokenSoftBreak ) -// A Token in the Saccharine language. -type Token struct { - Column int // Where the token begins in the source text. - Type TokenType // What type the token is. - Value string // The value of the token. -} - -// NewToken creates a [Token] of the given type at the given column. -// The token's value is derived from its [TokenType]. -func NewToken(typ TokenType, column int) *Token { - return &Token{Type: typ, Column: column, Value: typ.Name()} -} - -// NewTokenAtom creates a [TokenAtom] with the given name at the given column. -func NewTokenAtom(name string, column int) *Token { - return &Token{Type: TokenAtom, Column: column, Value: name} -} - // Name returns the type of the TokenType, as a string. func (t TokenType) Name() string { switch t { @@ -75,7 +61,5 @@ func (t TokenType) Name() string { } } -// Name returns the type of the Token, as a string. -func (t Token) Name() string { - return t.Type.Name() -} +// Token is the concrete token type for the Saccharine language. +type Token = token.Token[TokenType] diff --git a/pkg/token/parse.go b/pkg/token/parse.go new file mode 100644 index 0000000..f5f0386 --- /dev/null +++ b/pkg/token/parse.go @@ -0,0 +1,41 @@ +package token + +import ( + "fmt" + + "git.maximhutz.com/max/lambda/pkg/iterator" +) + +// ParseRawToken consumes the next token from the iterator if its type matches +// the expected type. +// Uses [iterator.Try] for automatic backtracking on failure. +func ParseRawToken[T Type](i *iterator.Iterator[Token[T]], expected T) (*Token[T], error) { + return iterator.Try(i, func(i *iterator.Iterator[Token[T]]) (*Token[T], error) { + if tok, err := i.Next(); err != nil { + return nil, err + } else if tok.Type != expected { + return nil, fmt.Errorf("expected token %v, got %v'", expected.Name(), tok.Value) + } else { + return &tok, nil + } + }) +} + +// ParseList repeatedly applies a parse function, collecting results into a +// slice. 
+// Stops when the parse function returns an error. +// Returns an error if fewer than minimum results are collected. +func ParseList[T Type, U any](i *iterator.Iterator[Token[T]], fn func(*iterator.Iterator[Token[T]]) (U, error), minimum int) ([]U, error) { + results := []U{} + + for { + if u, err := fn(i); err != nil { + if len(results) < minimum { + return nil, fmt.Errorf("expected at least '%v' items, got only '%v': %w", minimum, len(results), err) + } + return results, nil + } else { + results = append(results, u) + } + } +} diff --git a/pkg/token/scan.go b/pkg/token/scan.go new file mode 100644 index 0000000..2457eb8 --- /dev/null +++ b/pkg/token/scan.go @@ -0,0 +1,74 @@ +package token + +import ( + "errors" + "fmt" + "unicode" + + "git.maximhutz.com/max/lambda/pkg/iterator" +) + +// IsVariable determines whether a rune can be a valid variable character. +func IsVariable(r rune) bool { + return unicode.IsLetter(r) || unicode.IsNumber(r) || r == '_' +} + +// ScanRune consumes the next rune from the iterator if it satisfies the +// predicate. +// Returns an error if the iterator is exhausted or the rune does not match. +func ScanRune(i *iterator.Iterator[rune], expected func(rune) bool) (rune, error) { + return iterator.Try(i, func(i *iterator.Iterator[rune]) (rune, error) { + if r, err := i.Next(); err != nil { + return r, err + } else if !expected(r) { + return r, fmt.Errorf("got unexpected rune %v'", r) + } else { + return r, nil + } + }) +} + +// ScanCharacter consumes the next rune from the iterator if it matches the +// expected rune exactly. +// Returns an error if the iterator is exhausted or the rune does not match. +func ScanCharacter(i *iterator.Iterator[rune], expected rune) (rune, error) { + return ScanRune(i, func(r rune) bool { return r == expected }) +} + +// ScanAtom scans a contiguous sequence of variable characters into a single +// atom token. +// The first rune has already been consumed and is passed in. 
+func ScanAtom[T Type](i *iterator.Iterator[rune], first rune, typ T, column int) *Token[T] { + atom := []rune{first} + + for { + if r, err := ScanRune(i, IsVariable); err != nil { + break + } else { + atom = append(atom, r) + } + } + + return NewAtom(typ, string(atom), column) +} + +// Scan tokenizes an input string using a language-specific scanToken function. +// The scanToken function is called repeatedly until the input is exhausted. +// It returns nil (no token, no error) for skippable input like whitespace. +// Errors are accumulated and returned joined at the end. +func Scan[T Type](input string, scanToken func(*iterator.Iterator[rune]) (*Token[T], error)) ([]Token[T], error) { + i := iterator.Of([]rune(input)) + tokens := []Token[T]{} + errorList := []error{} + + for !i.Done() { + token, err := scanToken(i) + if err != nil { + errorList = append(errorList, err) + } else if token != nil { + tokens = append(tokens, *token) + } + } + + return tokens, errors.Join(errorList...) +} diff --git a/pkg/token/token.go b/pkg/token/token.go new file mode 100644 index 0000000..bb402b2 --- /dev/null +++ b/pkg/token/token.go @@ -0,0 +1,36 @@ +// Package token provides generic token types and scanning/parsing primitives +// for building language-specific lexers and parsers. +package token + +// A Type is a constraint for language-specific token type enums. +// It must be comparable (for equality checks) and must have a Name method +// that returns a human-readable string for error messages. +type Type interface { + comparable + // Name returns a human-readable name for this token type. + Name() string +} + +// A Token is a lexical unit in a source language. +type Token[T Type] struct { + Column int // Where the token begins in the source text. + Type T // What type the token is. + Value string // The value of the token. +} + +// New creates a Token of the given type at the given column. +// The token's value is derived from its type's Name method. 
+func New[T Type](typ T, column int) *Token[T] { + return &Token[T]{Type: typ, Column: column, Value: typ.Name()} +} + +// NewAtom creates a Token of the given type with a custom value at the given +// column. +func NewAtom[T Type](typ T, name string, column int) *Token[T] { + return &Token[T]{Type: typ, Column: column, Value: name} +} + +// Name returns the type of the Token, as a string. +func (t Token[T]) Name() string { + return t.Type.Name() +}