Documentation
¶
Index ¶
- Constants
- Variables
- type Dfa
- type DfaBuilder
- func (b *DfaBuilder) AddAcceptAction(state int, action int)
- func (b *DfaBuilder) AddState() int
- func (b *DfaBuilder) AddTransition(fromState int, onInput int, toState int)
- func (b *DfaBuilder) Build() *Dfa
- func (b *DfaBuilder) NextState(fromState int, onInput int) int
- func (b *DfaBuilder) NumStates() int
- func (b *DfaBuilder) StartState() int
- type DfaMatchResult
- type Edit
- type InputReader
- type IterDirection
- type Nfa
- type ReadSeekerInput
- type ReaderAtPosFunc
- type Regexp
- type Token
- type TokenIter
- type TokenRole
- type TokenTree
- type Tokenizer
- type TokenizerRule
Constants ¶
const ( TokenRoleNone = TokenRole(iota) TokenRoleOperator TokenRoleKeyword TokenRoleIdentifier TokenRoleNumber TokenRoleString TokenRoleComment )
const ( TokenRoleCustom1 = TokenRole((1 << 16) + iota) TokenRoleCustom2 TokenRoleCustom3 TokenRoleCustom4 TokenRoleCustom5 TokenRoleCustom6 TokenRoleCustom7 TokenRoleCustom8 )
const ( IterDirectionForward = IterDirection(iota) IterDirectionBackward )
const DfaDeadState int = 0
DfaDeadState represents a state in which the DFA will never accept the string, regardless of the remaining input characters.
Variables ¶
var DfaMatchResultNone = DfaMatchResult{}
Functions ¶
This section is empty.
Types ¶
type Dfa ¶
type Dfa struct {
// Number of states in the DFA.
// States are numbered sequentially, starting from zero.
// State zero is the dead state (input rejected).
NumStates int
// The start state of the DFA.
StartState int
// Transition based on current state and next input byte.
// Indices are (currentStateIdx * 256 + inputChar)
Transitions []int
// Actions to perform on an accept state.
// The actions are defined by the user of the DFA.
AcceptActions [][]int
// contains filtered or unexported fields
}
Dfa is a deterministic finite automaton.
func (*Dfa) MatchLongest ¶
func (dfa *Dfa) MatchLongest(r InputReader, startPos uint64, textLen uint64) (DfaMatchResult, error)
MatchLongest returns the longest match in an input string. In some cases, the longest match could be empty (e.g. the regular language for "a*" matches the empty string at the beginning of the string "bbb"). The reader position is reset to the end of the match, if there is one, or its original position if not. startPos and textLen determine the maximum number of runes the DFA will process; they also control the behavior of start-of-text (^) and end-of-text ($) patterns.
type DfaBuilder ¶
type DfaBuilder struct {
// contains filtered or unexported fields
}
DfaBuilder constructs a DFA with the minimal number of states.
func (*DfaBuilder) AddAcceptAction ¶
func (b *DfaBuilder) AddAcceptAction(state int, action int)
AddAcceptAction adds an accept action to take when a state is reached. This marks the state as an accept state.
func (*DfaBuilder) AddState ¶
func (b *DfaBuilder) AddState() int
AddState adds a new state to the DFA, returning the state index.
func (*DfaBuilder) AddTransition ¶
func (b *DfaBuilder) AddTransition(fromState int, onInput int, toState int)
AddTransition adds a transition from one state to another based on an input.
func (*DfaBuilder) Build ¶
func (b *DfaBuilder) Build() *Dfa
Build produces a DFA with the minimal number of states.
func (*DfaBuilder) NextState ¶
func (b *DfaBuilder) NextState(fromState int, onInput int) int
NextState returns the next state after a transition based on an input.
func (*DfaBuilder) NumStates ¶
func (b *DfaBuilder) NumStates() int
func (*DfaBuilder) StartState ¶
func (b *DfaBuilder) StartState() int
type DfaMatchResult ¶ added in v0.2.0
type DfaMatchResult struct {
Accepted bool
EndPos uint64
LookaheadPos uint64
NumBytesReadAtLastAccept int
Actions []int
}
DfaMatchResult represents the result of running a DFA to find the longest match.
type Edit ¶
type Edit struct {
Pos uint64 // Position of the first character inserted/deleted.
NumInserted uint64
NumDeleted uint64
}
Edit represents a change to a document.
type InputReader ¶
type InputReader interface {
io.Reader
// SeekBackward moves the reader position backward by offset bytes.
SeekBackward(offset uint64) error
}
InputReader provides input text for the parser.
type IterDirection ¶
type IterDirection int
IterDirection determines the direction of the token iterator.
type Nfa ¶
type Nfa struct {
// contains filtered or unexported fields
}
Nfa is a non-deterministic finite automaton.
func EmptyLanguageNfa ¶
func EmptyLanguageNfa() *Nfa
EmptyLanguageNfa returns an NFA that matches no strings (the empty language).
func EmptyStringNfa ¶
func EmptyStringNfa() *Nfa
EmptyStringNfa returns an NFA that matches only the empty string.
func NfaForChars ¶
NfaForChars returns an NFA that matches any of the specified chars.
func NfaForEndOfText ¶
func NfaForEndOfText() *Nfa
NfaForEndOfText returns an NFA that matches the end of the text.
func NfaForNegatedChars ¶
NfaForNegatedChars returns an NFA that matches any char EXCEPT the specified chars.
func NfaForStartOfText ¶
func NfaForStartOfText() *Nfa
NfaForStartOfText returns an NFA that matches the start of the text.
func NfaForUnicodeCategory ¶
func NfaForUnicodeCategory(rangeTable *unicode.RangeTable) *Nfa
NfaForUnicodeCategory constructs an NFA that matches UTF-8 encoded runes in a unicode category (letter, digit, etc.).
func (*Nfa) CompileDfa ¶
CompileDfa compiles the NFA into an equivalent deterministic finite automaton. The DFA has the minimum possible number of states. Only accept states with at least one accept action are accepted by the DFA, so make sure to call SetAcceptAction at least once before compiling!
func (*Nfa) Concat ¶
Concat constructs an NFA from the concatenation of two NFAs. The new NFA has accept actions from both the left and right NFAs.
func (*Nfa) SetAcceptAction ¶
SetAcceptAction sets all accept states in the NFA to the specified action. This overwrites any actions set previously.
type ReadSeekerInput ¶
type ReadSeekerInput struct {
R io.ReadSeeker
}
ReadSeekerInput wraps an io.ReadSeeker.
func (*ReadSeekerInput) SeekBackward ¶
func (r *ReadSeekerInput) SeekBackward(offset uint64) error
type ReaderAtPosFunc ¶
type ReaderAtPosFunc func(pos uint64) InputReader
ReaderAtPosFunc returns a reader at the requested position.
type Regexp ¶
type Regexp interface {
CompileNfa() *Nfa
}
Regexp represents a regular expression.
func ParseRegexp ¶
ParseRegexp parses a regular expression string.
type Token ¶
type Token struct {
Role TokenRole
StartPos uint64
EndPos uint64
// Last position the tokenizer read while constructing the token.
// This will always be greater than or equal to EndPos.
LookaheadPos uint64
}
Token represents a distinct element in a document.
type TokenIter ¶
type TokenIter struct {
// contains filtered or unexported fields
}
TokenIter iterates over tokens. Iterator operations are NOT thread-safe because they can mutate the tree (applying lazy edits).
func (*TokenIter) Advance ¶
func (iter *TokenIter) Advance()
Advance moves the iterator to the next token. If there are no more tokens, this is a no-op.
type TokenRole ¶
type TokenRole int
TokenRole represents the role a token plays. This is mainly used for syntax highlighting.
type TokenTree ¶
type TokenTree struct {
// contains filtered or unexported fields
}
TokenTree represents a collection of tokens. It supports efficient lookups by position and "shifting" token positions to account for insertions/deletions.
func NewTokenTree ¶
NewTokenTree constructs a token tree from a set of tokens. The tokens must be sorted ascending by start position and cover the entire text. Each token's length must be greater than zero.
func (*TokenTree) IterFromPosition ¶
func (t *TokenTree) IterFromPosition(pos uint64, direction IterDirection) *TokenIter
IterFromPosition returns a token iterator from the token intersecting a position.
type Tokenizer ¶
type Tokenizer struct {
StateMachine *Dfa
SubTokenizers []*Tokenizer
Rules []TokenizerRule
// contains filtered or unexported fields
}
Tokenizer parses a text into tokens based on a set of rules.
func GenerateTokenizer ¶
func GenerateTokenizer(rules []TokenizerRule) (*Tokenizer, error)
GenerateTokenizer compiles a tokenizer from a set of rules.
func (*Tokenizer) RetokenizeAfterEdit ¶
func (t *Tokenizer) RetokenizeAfterEdit(tree *TokenTree, edit Edit, textLen uint64, readerAtPos ReaderAtPosFunc) (*TokenTree, error)
RetokenizeAfterEdit updates tokens based on an edit to the text. The algorithm is based on Wagner (1998) Practical Algorithms for Incremental Software Development Environments, Chapter 5. This method assumes that the token tree is up-to-date with the text before the most recent edit; if not, it may panic.
func (*Tokenizer) TokenizeAll ¶
func (t *Tokenizer) TokenizeAll(r InputReader, textLen uint64) (*TokenTree, error)
TokenizeAll splits the entire input text into tokens. The input text MUST be valid UTF-8.
type TokenizerRule ¶
type TokenizerRule struct {
Regexp string
TokenRole TokenRole
SubRules []TokenizerRule
}
TokenizerRule represents a rule for parsing a particular token.