Documentation
¶
Index ¶
- Constants
- Variables
- type Dfa
- type DfaBuilder
- func (b *DfaBuilder) AddAcceptAction(state int, action int)
- func (b *DfaBuilder) AddState() int
- func (b *DfaBuilder) AddTransition(fromState int, onInput int, toState int)
- func (b *DfaBuilder) Build() *Dfa
- func (b *DfaBuilder) NextState(fromState int, onInput int) int
- func (b *DfaBuilder) NumStates() int
- func (b *DfaBuilder) StartState() int
- type DfaMatchResult
- type Edit
- type InputReader
- type IterDirection
- type Nfa
- type ReadSeekerInput
- type ReaderAtPosFunc
- type Regexp
- type Token
- type TokenIter
- type TokenRole
- type TokenTree
- type Tokenizer
- type TokenizerRule
Constants ¶
const ( TokenRoleNone = TokenRole(iota) TokenRoleOperator TokenRoleKeyword TokenRoleIdentifier TokenRoleNumber TokenRoleString TokenRoleComment )
const ( TokenRoleCustom1 = TokenRole((1 << 16) + iota) TokenRoleCustom2 TokenRoleCustom3 TokenRoleCustom4 TokenRoleCustom5 TokenRoleCustom6 TokenRoleCustom7 TokenRoleCustom8 )
const ( IterDirectionForward = IterDirection(iota) IterDirectionBackward )
const DfaDeadState int = 0
DfaDeadState represents a state in which the DFA will never accept the string, regardless of the remaining input characters.
Variables ¶
var DfaMatchResultNone = DfaMatchResult{}
Functions ¶
This section is empty.
Types ¶
type Dfa ¶
type Dfa struct {
// Number of states in the DFA.
// States are numbered sequentially, starting from zero.
// State zero is the dead state (input rejected).
NumStates int
// The start state of the DFA.
StartState int
// Transition based on current state and next input byte.
// Indices are (currentStateIdx * 256 + inputChar)
Transitions []int
// Actions to perform on an accept state.
// The actions are defined by the user of the DFA.
AcceptActions [][]int
// contains filtered or unexported fields
}
Dfa is a deterministic finite automaton.
func (*Dfa) MatchLongest ¶
func (dfa *Dfa) MatchLongest(r InputReader, startPos uint64, textLen uint64) (DfaMatchResult, error)
MatchLongest returns the longest match in an input string. In some cases, the longest match could be empty (e.g. the regular language for "a*" matches the empty string at the beginning of the string "bbb"). The reader position is reset to the end of the match, if there is one, or its original position if not. startPos and textLen determine the maximum number of runes the DFA will process; they also control the behavior of start-of-text (^) and end-of-text ($) patterns.
type DfaBuilder ¶
type DfaBuilder struct {
// contains filtered or unexported fields
}
DfaBuilder constructs a DFA with the minimal number of states.
func (*DfaBuilder) AddAcceptAction ¶
func (b *DfaBuilder) AddAcceptAction(state int, action int)
AddAcceptAction adds an accept action to take when a state is reached. This marks the state as an accept state.
func (*DfaBuilder) AddState ¶
func (b *DfaBuilder) AddState() int
AddState adds a new state to the DFA, returning the state index.
func (*DfaBuilder) AddTransition ¶
func (b *DfaBuilder) AddTransition(fromState int, onInput int, toState int)
AddTransition adds a transition from one state to another based on an input.
func (*DfaBuilder) Build ¶
func (b *DfaBuilder) Build() *Dfa
Build produces a DFA with the minimal number of states.
func (*DfaBuilder) NextState ¶
func (b *DfaBuilder) NextState(fromState int, onInput int) int
NextState returns the next state after a transition based on an input.
func (*DfaBuilder) NumStates ¶
func (b *DfaBuilder) NumStates() int
func (*DfaBuilder) StartState ¶
func (b *DfaBuilder) StartState() int
type DfaMatchResult ¶ added in v0.2.0
type DfaMatchResult struct {
Accepted bool
EndPos uint64
LookaheadPos uint64
NumBytesReadAtLastAccept int
Actions []int
}
DfaMatchResult represents the result of running a DFA to find the longest match.
type Edit ¶
type Edit struct {
Pos uint64 // Position of the first character inserted/deleted.
NumInserted uint64
NumDeleted uint64
}
Edit represents a change to a document.
type InputReader ¶
type InputReader interface {
io.Reader
// SeekBackward moves the reader position backward by offset bytes.
SeekBackward(offset uint64) error
}
InputReader provides input text for the parser.
type IterDirection ¶
type IterDirection int
IterDirection determines the direction of the token iterator.
type Nfa ¶
type Nfa struct {
// contains filtered or unexported fields
}
Nfa is a non-deterministic finite automaton.
func EmptyLanguageNfa ¶
func EmptyLanguageNfa() *Nfa
EmptyLanguageNfa returns an NFA that matches no strings (the empty language).
func EmptyStringNfa ¶
func EmptyStringNfa() *Nfa
EmptyStringNfa returns an NFA that matches only the empty string.
func NfaForChars ¶
NfaForChars returns an NFA that matches any of the specified chars.
func NfaForEndOfText ¶
func NfaForEndOfText() *Nfa
NfaForEndOfText returns an NFA that matches the end of the text.
func NfaForNegatedChars ¶
NfaForNegatedChars returns an NFA that matches any char EXCEPT the specified chars.
func NfaForStartOfText ¶
func NfaForStartOfText() *Nfa
NfaForStartOfText returns an NFA that matches the start of the text.
func NfaForUnicodeCategory ¶
func NfaForUnicodeCategory(rangeTable *unicode.RangeTable) *Nfa
NfaForUnicodeCategory constructs an NFA that matches UTF-8 encoded runes in a unicode category (letter, digit, etc.).
func (*Nfa) CompileDfa ¶
CompileDfa compiles the NFA into an equivalent deterministic finite automaton. The DFA has the minimum possible number of states. Only accept states with at least one accept action are accepted by the DFA, so make sure to call SetAcceptAction at least once before compiling!
func (*Nfa) Concat ¶
Concat constructs an NFA from the concatenation of two NFAs. The new NFA has accept actions from both the left and right NFAs.
func (*Nfa) SetAcceptAction ¶
SetAcceptAction sets all accept states in the NFA to the specified action. This overwrites any actions set previously.
type ReadSeekerInput ¶
type ReadSeekerInput struct {
R io.ReadSeeker
}
ReadSeekerInput wraps an io.ReadSeeker.
func (*ReadSeekerInput) SeekBackward ¶
func (r *ReadSeekerInput) SeekBackward(offset uint64) error
type ReaderAtPosFunc ¶
type ReaderAtPosFunc func(pos uint64) InputReader
ReaderAtPosFunc returns a reader at the requested position.
type Regexp ¶
type Regexp interface {
CompileNfa() *Nfa
}
Regexp represents a regular expression.
func ParseRegexp ¶
ParseRegexp parses a regular expression string.
type Token ¶
type Token struct {
Role TokenRole
StartPos uint64
EndPos uint64
// Last position the tokenizer read while constructing the token.
// This will always be greater than or equal to EndPos.
LookaheadPos uint64
}
Token represents a distinct element in a document.
type TokenIter ¶
type TokenIter struct {
// contains filtered or unexported fields
}
TokenIter iterates over tokens. Iterator operations are NOT thread-safe because they can mutate the tree (applying lazy edits).
func (*TokenIter) Advance ¶
func (iter *TokenIter) Advance()
Advance moves the iterator to the next token. If there are no more tokens, this is a no-op.
type TokenRole ¶
type TokenRole int
TokenRole represents the role a token plays. This is mainly used for syntax highlighting.
type TokenTree ¶
type TokenTree struct {
// contains filtered or unexported fields
}
TokenTree represents a collection of tokens. It supports efficient lookups by position and "shifting" token positions to account for insertions/deletions.
func NewTokenTree ¶
NewTokenTree constructs a token tree from a set of tokens. The tokens must be sorted ascending by start position and cover the entire text. Each token's length must be greater than zero.
func (*TokenTree) IterFromPosition ¶
func (t *TokenTree) IterFromPosition(pos uint64, direction IterDirection) *TokenIter
IterFromPosition returns a token iterator from the token intersecting a position.
type Tokenizer ¶
type Tokenizer struct {
StateMachine *Dfa
SubTokenizers []*Tokenizer
Rules []TokenizerRule
// contains filtered or unexported fields
}
Tokenizer parses a text into tokens based on a set of rules.
func GenerateTokenizer ¶
func GenerateTokenizer(rules []TokenizerRule) (*Tokenizer, error)
GenerateTokenizer compiles a tokenizer from a set of rules.
func (*Tokenizer) RetokenizeAfterEdit ¶
func (t *Tokenizer) RetokenizeAfterEdit(tree *TokenTree, edit Edit, textLen uint64, readerAtPos ReaderAtPosFunc) (*TokenTree, error)
RetokenizeAfterEdit updates tokens based on an edit to the text. The algorithm is based on Wagner (1998) Practical Algorithms for Incremental Software Development Environments, Chapter 5. This method assumes that the token tree is up-to-date with the text before the most recent edit; if not, it may panic.
func (*Tokenizer) TokenizeAll ¶
func (t *Tokenizer) TokenizeAll(r InputReader, textLen uint64) (*TokenTree, error)
TokenizeAll splits the entire input text into tokens. The input text MUST be valid UTF-8.
type TokenizerRule ¶
type TokenizerRule struct {
Regexp string
TokenRole TokenRole
SubRules []TokenizerRule
}
TokenizerRule represents a rule for parsing a particular token.