Documentation
¶
Overview ¶
Package egg supports Expression Grammar Generators. At the moment it handles LL(1) grammars.
EBNF Grammars ¶
Expression grammars are a slightly modified version of the EBNF used by the Go language specification:
Syntax = { Production } .
Production = production_name '=' [ Expression ] '.' .
Expression = Term { '|' Term } .
Term = Factor { Factor } .
Factor = production_name | token | Group | Option | Repetition .
Group = '(' Expression ')' .
Option = '[' Expression ']' .
Repetition = '{' Expression '}' .
Productions are expressions constructed from terms and the following operators, in increasing precedence:
| alternation
() grouping
[] option (0 or 1 times)
{} repetition (0 to n times)
Lowercase production names are used to identify lexical (terminal) tokens. Non-terminals are in CamelCase.
Interpreted strings literals, like "foo", are tokens and will match literally, in this example the rune sequence "foo".
Raw string literals, like `[0-9]`, are tokens and are interpreted as regexps, in this example matching a character class '0'-'9'. Repetitions, like in `re{min,max}` are not supported.
Rune literals, like 'a', are tokens and will match literally, in this example the rune 0x61.
Comments in the expression grammar start with the '#' character and stop at the end of the line.
Lexical production names starting with '_' are reserved.
LL(1) Parsers ¶
Generated LL(1) parsers return an []int32. The slice encodes the AST as follows
- A non-terminal symbol NT has a header encoded as two int32 values (-NT, n), where 'n' is the number of following slice items that represent the AST of the non-terminal symbol.
- A terminal symbol T is represented by a positive index 'n'. The token for T can be obtained using the parser's Token method using 'n' as its argument.
This is testdata/parser/expr.ebnf
# A simple list of mathematical expressions.
# 'Start' is the entry point.
Start = Expr { ";" Expr } .
Expr = Term { operator Term } .
Term = operand | "(" Expr ")" .
# Lexical tokens (lowercased)
operator = "+" | "-" | "*" | "/" .
operand = `[0-9]+` .
white_space = `[ \t\n\r]` .
Note: The productions for white space should no more accept empty strings and should use no repetitions. Use eg. ` |\n|\r|\t` or `[\n\r\t]` instead of the earlier examples where `( |\n|\r|\t)*` or `[\n\r\t]+` was used. The RecScanner handles sequences of white space by itself.
The parser generated from the above grammar in expr_test.go defines these symbols
// Symbols
const (
expr_TOK_EOF = expr_Symbol(0) // EOF
expr_TOK_0028 = expr_Symbol(1) // '('
expr_TOK_0029 = expr_Symbol(2) // ')'
expr_TOK_003b = expr_Symbol(3) // ';'
expr_operand = expr_Symbol(4) // operand
expr_operator = expr_Symbol(5) // operator
expr_white_space = expr_Symbol(6) // white_space
expr_Start = expr_Symbol(7) // Start
expr_Expr = expr_Symbol(8) // Expr
expr_Term = expr_Symbol(9) // Term
)
Having this example walk function
func walk[S ~int32](p interface{ Token(int32) scanner.Token }, ast []int32, lvl int) {
for len(ast) != 0 {
next := int32(1)
switch n := ast[0]; {
case n < 0:
fmt.Printf("%s%v\n", strings.Repeat("· ", lvl), S(-n))
next = 2 + ast[1]
walk[S](p, ast[2:next], lvl+1)
default:
tok := p.Token(n)
fmt.Printf("%s %s [%v]\n", strings.Repeat("· ", lvl), tok, S(tok.Ch))
}
ast = ast[next:]
}
}
And this AST from parsing testdata/expr/number="42\n" is
[-7 6 -8 3 -9 1 0 1]
Calling
walk[expr_Symbol](p, ast, 0)
Prints (the part starting with # is an explanatory comment, it is not actually printed)
Start # -7, 6 Start=[-8 3 -9 1 0 1] · Expr # · -8, 3 · Expr=[-9 1 0] · · Term # · · -9, 1 · · Term=[0] · · · number:1:1: "" "42" U+0004 [operand] # · · · 0 · · · Token[0] · number:1:4: "\n" "" U+0000 [EOF] # · 1 · Token[1]
The parser's Token method signature is
func(*receiver) Token(n int32) scanner.Token
Example (Data) ¶
const ebnf = `
# A simple list of mathematical expressions.
# 'Start' is the entry point.
Start = Expr { ";" Expr } .
Expr = Term { operator Term } .
Term = operand | "(" Expr ")" .
# Lexical tokens (lowercased)
operator = "+" | "-" | "*" | "/" .
operand = ""[0-9]+"" .
white_space = ""[ \t\n\r]+"" .`
src := strings.ReplaceAll(strings.TrimSpace(ebnf), `""`, "`")
parser, err := NewParser("example.ebnf", "Start", "white_space", []byte(src))
if err != nil {
panic(err)
}
fmt.Printf("==== EBNF\n%s\n==== Data\n", src)
var rules []string
for k := range parser.Rules {
rules = append(rules, k)
}
sort.Strings(rules)
for _, k := range rules {
fmt.Print(parser.Rules[k])
}
fmt.Printf("==== Analysis\n%v", parser.Analyze())
Output: ==== EBNF # A simple list of mathematical expressions. # 'Start' is the entry point. Start = Expr { ";" Expr } . Expr = Term { operator Term } . Term = operand | "(" Expr ")" . # Lexical tokens (lowercased) operator = "+" | "-" | "*" | "/" . operand = `[0-9]+` . white_space = `[ \t\n\r]+` . ==== Data Expr = Term { operator Term } . State 0: on '(' call Term and goto state 1 on operand call Term and goto state 1 State 1: Accept Expr on operator shift and goto state 0 Start = Expr { ";" Expr } . State 0: on '(' call Expr and goto state 1 on operand call Expr and goto state 1 State 1: Accept Start on ';' shift and goto state 0 Term = operand | "(" Expr ")" . State 0: on '(' shift and goto state 1 on operand shift and goto state 3 State 1: on '(' call Expr and goto state 2 on operand call Expr and goto state 2 State 2: on ')' shift and goto state 3 State 3: Accept Term ==== Analysis Class: LL(1)
Example (LL1) ¶
p := &expr_Parser{}
for _, fn := range []string{"number", "numbers", "parens", "exprs"} {
b, err := os.ReadFile(filepath.Join("testdata", "expr", fn))
if err != nil {
panic(err)
}
ast, err := p.Parse(fn, b)
if err != nil {
panic(err)
}
fmt.Printf("input=%q ast=%v\n", b, ast)
walk[expr_Symbol](p, ast, 0)
}
Output: input="42\n" ast=[-7 6 -8 3 -9 1 0 1] Start · Expr · · Term · · · number:1:1: "" "42" U+0004 [operand] · number:1:4: "\n" "" U+0000 [EOF] input="42 * 24\n" ast=[-7 10 -8 7 -9 1 0 1 -9 1 2 3] Start · Expr · · Term · · · numbers:1:1: "" "42" U+0004 [operand] · · numbers:1:4: " " "*" U+0005 [operator] · · Term · · · numbers:1:6: " " "24" U+0004 [operand] · numbers:1:9: "\n" "" U+0000 [EOF] input="2*(3 + 4)\n" ast=[-7 20 -8 17 -9 1 0 1 -9 11 2 -8 7 -9 1 3 4 -9 1 5 6 7] Start · Expr · · Term · · · parens:1:1: "" "2" U+0004 [operand] · · parens:1:2: "" "*" U+0005 [operator] · · Term · · · parens:1:3: "" "(" U+0001 ['('] · · · Expr · · · · Term · · · · · parens:1:4: "" "3" U+0004 [operand] · · · · parens:1:6: " " "+" U+0005 [operator] · · · · Term · · · · · parens:1:8: " " "4" U+0004 [operand] · · · parens:1:9: "" ")" U+0002 [')'] · parens:1:11: "\n" "" U+0000 [EOF] input="42; 24\n" ast=[-7 12 -8 3 -9 1 0 1 -8 3 -9 1 2 3] Start · Expr · · Term · · · exprs:1:1: "" "42" U+0004 [operand] · exprs:1:3: "" ";" U+0003 [';'] · Expr · · Term · · · exprs:1:5: " " "24" U+0004 [operand] · exprs:1:8: "\n" "" U+0000 [EOF]
Index ¶
- func GLL(parser *Parser, token func(index int) scanner.Token, resolve Resolver) (ast []int32, err error)
- func GLL0(parser, test io.Writer, ...) (err error)
- func ImportEBNF(w io.Writer, name, class string, n *html.Node) (err error)
- func LL1(parser, test io.Writer, ...) (err error)
- type Action
- type AmbiguityChoice
- type AnalysisResult
- type Call
- type Conflict
- type ConflictKind
- type Descriptor
- type DescriptorKey
- type GLLOpts
- type GLLParser
- type GSSEdge
- type GSSNode
- type GrammarClass
- type LL1Opts
- type Parser
- type Resolver
- type ResultKey
- type Rule
- type SPPFNode
- type Set
- type Source
- type State
Examples ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func GLL ¶ added in v1.0.1
func GLL(parser *Parser, token func(index int) scanner.Token, resolve Resolver) (ast []int32, err error)
GLL executes a General LL parser driven by the rules based on the EBNF grammar described by 'parser'. The 'token' function allows random access using a zero-based index. The scanner.Token has a field 'Ch' of type rune, which contains the token's symbol ID. Zero ID stands for EOF.
For the purpose of error reporting, scanner.Token has also a `Position()` method which returns the "standard" `go/token.Position` instance. The 'resolve' argument, if non-nil, provides an optional Resolver to handle grammar ambiguities during AST construction.
func GLL0 ¶ added in v1.2.2
func GLL0(parser, test io.Writer, packageName, grammarFileName, start, whitespace, receiver, scan string, g []byte, opts *GLLOpts) (err error)
GLL0 writes Go source code for a parser defined by the grammar 'g' to 'parser'.
Temporary state, this API is scheduled for removal.
The Scan() and Parse() methods will be defined on receiver type named by the 'receiver' argument.
func ImportEBNF ¶ added in v1.2.0
ImportEBNF attempts to extract an EBNF grammar from 'n' and write its proper egg form into 'w'. The function looks for <pre> nodes having class 'class', for example "ebnf". The html document is assumed to come from 'name'.
Note: The positions of any input error correspond to he extracted input EBNF, they are not positions in the HTML document.
func LL1 ¶ added in v0.2.0
func LL1(parser, test io.Writer, packageName, grammarFileName, start, whitespace, receiver, scan string, g []byte, opts *LL1Opts) (err error)
LL1 writes Go source code for a lexer/parser defined by the LL(1) EBNF grammar 'g' to 'parser'. The grammar is assumed to come from 'grammarFileName', the start production is 'start'. The lexical production for white space separating token is named in 'whitespace'.
The Scan() and Parse() methods will be defined on receiver type named by the 'receiver' argument, which must be a valid Go identifier. The receiver type must be defined elsewhere, possibly in 'opts.PackageHeader'.
The scan method signature is:
func(*receiver) Scan() (r modernc.org/scanner.Token)
The 'Scan' argument defaults to "Scan". Passing any nonblank identifier in 'Scan' causes the generated parser to call a function named by that identifier instead. You are responsible to provide the alternate function implementation, probably built upon `(*receiver).Scan`.
This is intended for grammars that modify the input token sequence, like for example Go does with its semicolon injection. Note however that Go grammar is not LL(1).
The parse method signature is:
// 'src' becomes "owned" by the parser and must not be mutated afterwards. func(*receiver) Parse(name string, src []byte) (ast []int32, err error)
The Parse input is assumed to come from file 'name'.
It is okay to pass nil 'opts'.
The zero value of the receiver type is ready to use. receiver.Parse() can be called multiple times on the same instance, however the previous ASTs returned from earlier Parser invocations become invalid. Create one receiver type instance per AST you need to keep working with.
If the 'test' argument is not empty then an elementary test and benchmark code is written to 'test'. All non flag argument on the command line are considered test inputs. Example
$ go test -- ~/foo/*.bar
The test command line flags are
-yes <string>
A regular expression for input base names that must parse without errors. Defaults to "" meaning all test inputs are expected to parse without errors.
Example: `-yes '^y_'` means all files with prefix `y_` must parse without errors.
-no <string>
A regular expression for input base names that must parse with errors. Defaults to "" meaning no test inputs are expected to parse with errors.
Example: `-no '^n_'` means all files with prefix `n_` must parse with errors.
-may <string>
A regular expression for input base names that may parse with errors but that is not considered a failure. Defaults to "" meaning no test inputs are expected to parse with errors.
Example: `-may '^i_'` means all files with prefix `i_` may parse with errors without failing the test.
The benchmark considers all non flag argument on the command line as test inputs. Example
$ go test -bench . -- ~/foo/*.bar
Types ¶
type AmbiguityChoice ¶ added in v1.2.2
type AmbiguityChoice int
AmbiguityChoice tells the flattener how to proceed.
const ( // PickError tells Flatten to fail on parsing ambiguities (Strict mode). PickError AmbiguityChoice = iota // PickIndex tells Flatten to use the provided Alternative index. PickIndex )
func LongestMatchResolver ¶ added in v1.2.2
func LongestMatchResolver(p *GLLParser, node *SPPFNode) (AmbiguityChoice, int, error)
LongestMatchResolver prefers alternatives that have fewer, larger children, which typically corresponds to "greedier" parsing in GLL structures.
type AnalysisResult ¶ added in v0.2.0
type AnalysisResult struct {
Class GrammarClass
Conflicts []Conflict
LeftRecursive []string // List of NonTerminals that are directly left-recursive.
}
AnalysisResult holds the computed properties of the grammar.
func (*AnalysisResult) String ¶ added in v0.2.0
func (a *AnalysisResult) String() string
type Call ¶ added in v0.2.0
Call describes which non terminal rule to call and what state to go afterwards.
type Conflict ¶ added in v0.2.0
type Conflict struct {
Rule string // The name of the rule where the conflict occurs.
State int // The ID of the state within the rule's DFA.
Token string // The lookahead token causing the conflict.
Type ConflictKind // The type of conflict.
// Choices lists the possible paths the parser could take.
// For Predict/Predict: names of the NonTerminals.
// For Shift/Predict: "Shift <Token>" vs "Call <NonTerminal>".
Choices []string
// contains filtered or unexported fields
}
Conflict details a specific ambiguity in the grammar.
type ConflictKind ¶ added in v0.2.2
type ConflictKind int
ConflictKind describes the nature of a grammar conflict.
const ( // ConflictPredictPredict occurs when a lookahead token can start // two different non-terminal productions from the current state. // (Equivalent to Reduce/Reduce in LR parsers). ConflictPredictPredict ConflictKind = iota // ConflictShiftPredict occurs when a lookahead token can either be consumed // directly (Shift) or start a non-terminal production. // (Equivalent to Shift/Reduce in LR parsers). ConflictShiftPredict // ConflictFirstPredict occurs when a non-terminal is Nullable (can produce ε), // and the lookahead token exists in both the First set of that non-terminal // and the Predict set of the call (i.e., the First set of the remainder of the rule). // The parser cannot decide whether to enter the non-terminal or skip it. ConflictFirstPredict )
type Descriptor ¶ added in v1.2.2
type Descriptor struct {
State *State
GSSNode *GSSNode
InputIndex int
PendingResults []*SPPFNode // Tracks children for the current rule/path
}
Descriptor represents a "path" waiting to be explored.
type DescriptorKey ¶ added in v1.2.2
type DescriptorKey struct {
StateID int
GSSNode GSSNode
InputIndex int
PathHash string // A representation of the current PendingResults
}
DescriptorKey provides a comparable identity for a descriptor. It includes a simple way to track unique paths.
type GLLOpts ¶ added in v1.0.1
type GLLOpts struct {
ConstPrefix string
Debug bool
MethodPrefix string
TypePrefix string
VarPrefix string
// contains filtered or unexported fields
}
GLLOpts may define additional settings for GLL().
ConstPrefix, MethodPrefix, TypePrefix, VarPrefix ¶
Strings to prepend to names of generated constants, methods, types and variables.
Debug ¶
The parser will include tracing calls to a user supplied function `trc(msg string, args...any)`.
type GLLParser ¶ added in v1.2.2
type GLLParser struct {
Grammar *Parser
Analysis *AnalysisResult
Pending []Descriptor // Worklist
Added map[DescriptorKey]struct{} // To avoid redundant work
// GSS management
GSSNodes map[GSSNode]*[]GSSEdge
// SPPF (The "Forest" of results)
// Key: (Rule/Symbol, Start, End)
Results map[ResultKey]any
// SPPF Storage
// Key: (SymbolID, Start, End) -> The canonical node for this span.
// This ensures we share nodes (the "Shared" in SPPF).
SPPFNodes map[ResultKey]*SPPFNode
MaxInputIndex int // High-water mark for error reporting
// contains filtered or unexported fields
}
GLLParser is the central coordinator to manage the sets of descriptors and ensure we don't process the same descriptor twice (which is how GLL handles cycles and ambiguity).
func (*GLLParser) Trampoline ¶ added in v1.2.2
func (p *GLLParser) Trampoline()
Trampoline processes descriptors until the worklist is empty.
type GSSEdge ¶ added in v1.2.2
type GSSEdge struct {
Target *GSSNode
// The return StateID (int) where the parser should resume.
Result any
// The path history of the caller to be resumed upon return.
CallerPath []*SPPFNode
}
GSSEdge connects GSSNodes.
type GrammarClass ¶ added in v0.2.0
type GrammarClass int
GrammarClass represents the complexity class of the grammar.
const ( // ClassLL1 represents a grammar that can be parsed by a predictive // parser with 1 token of lookahead without backtracking. ClassLL1 GrammarClass = iota // ClassNotLL1 represents a grammar that contains conflicts (ambiguities // or left-recursion) preventing strictly predictive parsing. // It might be LL(k), LR(1), or ambiguous. ClassNotLL1 )
func (GrammarClass) String ¶ added in v0.2.0
func (g GrammarClass) String() string
type LL1Opts ¶ added in v0.2.0
type LL1Opts struct {
ConstPrefix string
Debug bool
MethodPrefix string
TypePrefix string
VarPrefix string
// contains filtered or unexported fields
}
LL1Opts may define additional settings for LL1().
ConstPrefix, MethodPrefix, TypePrefix, VarPrefix ¶
Strings to prepend to names of generated constants, methods, types and variables.
Debug ¶
The parser will include tracing calls to a user supplied function `trc(msg string, args...any)`.
type Parser ¶
type Parser struct {
ID2Symbol map[int]string
Name string // As passed to NewData.
Nullable Set[int]
Rules map[string]*Rule
Source *Source
Start string // As passed to NewData.
Symbol2ID map[string]int
Whitespace string // As passed to NewData.
// contains filtered or unexported fields
}
Parser represents the information intended for generating parser code for the grammar passed to NewData.
func NewParser ¶
NewParser returns a newly created Parser or an error, if any. 'name' names the input file, 'start' and 'whitespace' are production names.
func (*Parser) Analyze ¶ added in v0.2.2
func (d *Parser) Analyze() *AnalysisResult
Analyze inspects the pre-computed Rules and Actions in Data to determine the grammar class and identify any conflicts.
Usage concept ¶
// 1. Create the Data object (it will succeed even if recursive)
parser, err := egg.NewData("MyGrammar", "Start", "ws", src)
if err != nil {
log.Fatal(err)
}
// 2. Analyze the grammar properties
analysis := parser.Analyze()
// 3. Decide how to proceed based on the analysis
if len(analysis.LeftRecursive) > 0 {
// Option A: Fail (if you only support LL(1))
log.Fatalf("Error: Grammar is left-recursive in rules: %v", analysis.LeftRecursive)
// Option B: Warn and switch strategy
// log.Printf("Warning: Left recursion detected. Switching to GLL parser...")
// generateGLLParser(parser)
} else if analysis.Class == egg.ClassLL1 {
// Generate the fast parser
generateRecursiveDescentParser(parser)
} else {
// Handle other conflicts
log.Printf("Grammar has conflicts: %v", analysis.Conflicts)
}
type Resolver ¶ added in v1.2.2
type Resolver func(p *GLLParser, node *SPPFNode) (choice AmbiguityChoice, index int, err error)
Resolver is a callback used by Flatten to resolve SPPF ambiguities.
type Rule ¶ added in v0.2.0
type Rule struct {
DFA *fsm.NFA
First Set[int]
Production *ebnf.Production
States []*State
// contains filtered or unexported fields
}
Rule represents a parser rule.
func (*Rule) StartsAccepting ¶ added in v0.2.2
StartsAccepting reports whether 'p' can accept an empty token sequence.
type SPPFNode ¶ added in v1.2.2
type SPPFNode struct {
SymbolID int
Start int
End int
// Each entry in Alternatives is a "Packed Node" representing
// one way to derive this symbol over this span.
Alternatives [][]*SPPFNode
}
SPPFNode represents a node in the parse forest. It uniquely identifies a parsed symbol over a specific span.
type Set ¶ added in v0.2.0
type Set[T comparable] map[T]struct{}
Set represent a set of comparable values.
type Source ¶
Source describes an input of a generator.
func NewSource ¶
NewSource returns a newly created Source created from the EBNF expression grammar 'g' or an error, if any. 'name' is used to report positions, 'g' is the expression grammar text.
The 'roots' argument name the externally visible production set, for example {"Start"} or {"Source", "white_space"}.
All productions must be used, except for productions named in the 'roots'.
