lex

package
v1.1.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Aug 16, 2023 License: BSD-3-Clause Imports: 5 Imported by: 11

Documentation

Overview

Package lex is a Unicode-friendly run time library for golex[0] generated lexical analyzers[1].

Changelog

2015-04-08: Initial release.

Character classes

Golex internally handles only 8 bit "characters". Many Unicode-aware tokenizers do not actually need to recognize every Unicode rune, but only some particular partitions/subsets. Like, for example, a particular Unicode category, say upper case letters: Lu.

The idea is to convert all runes in a particular set to a single 8 bit character allocated outside the ASCII range of codes. The token value, a string of runes and their exact positions, is collected as usual (see the Token and TokenBytes methods), but the tokenizer DFA is simpler (and thus smaller and perhaps also faster) when this technique is used. In the example program (see below), recognizing (and skipping) white space, integer literals, one keyword and Go identifiers requires only an 8 state DFA[5].

To provide the conversion from runes to character classes, "install" your converting function using the RuneClass option.

References

-

[0]: http://godoc.org/modernc.org/golex
[1]: http://en.wikipedia.org/wiki/Lexical_analysis
[2]: http://golang.org/cmd/yacc/
[3]: https://modernc.org/golex/blob/master/lex/example.l
[4]: http://golang.org/pkg/io/#RuneReader
[5]: https://modernc.org/golex/blob/master/lex/dfa
Example (CompleteGeneratedProgram)
// CAUTION: Generated file - DO NOT EDIT.

// Copyright (c) 2015 The golex Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This is an example program using golex run time library. It is generated by
//
//	$ golex -o example_test.go example.l
//
// The complete input file, example.l, is at [3], the scan function excerpt is:
//
//	func (l *lexer) scan() lex.Char {
//		c := l.Enter()
//	%}
//
//	digit		[0-9]|{unicodeDigit}
//	identifier	{letter}({letter}|{digit})*
//	int		[0-9]+
//	letter		[_a-zA-Z]|{unicodeLetter}
//	unicodeDigit	\x81
//	unicodeLetter	\x80
//
//	%%
//
//		c = l.Rule0()
//
//	[ \t\r\n]+
//
//	func		return l.char(FUNC)
//	{identifier}	return l.char(IDENT)
//	{int}		return l.char(INT)
//
//
//	%%
//		if c, ok := l.Abort(); ok {
//			return l.char(c)
//		}
//
//		goto yyAction
//	}
package main // import "modernc.org/golex/lex"

import (
	"bytes"
	"fmt"
	"go/token"
	"unicode"

	"modernc.org/golex/lex"
)

// Character classes handed to the DFA for non-ASCII runes. They may be
// allocated anywhere in [0x80, 0xFF]; the values below are spelled out
// because the generated scan function compares against '\u0080' and '\u0081'
// literally.
//
// NOTE: classUnicodeLeter is spelled this way (sic) and is referenced by
// rune2Class under that name.
const (
	classUnicodeLeter = 0x80 // Unicode letters outside ASCII.
	classUnicodeDigit = 0x81 // Unicode digits outside ASCII.
	classOther        = 0x82 // Every other non-ASCII rune.
)

// Token values returned to the parser for the rules recognized by scan.
// NOTE(review): the base 0xE002 presumably keeps these apart from runes that
// are returned as single-character tokens — confirm against the parser.
const (
	FUNC  = 0xE002 // The keyword "func".
	INT   = 0xE003 // An integer literal.
	IDENT = 0xE004 // An identifier.
)

// str renders a token value for printing: the symbolic name for the parser
// tokens and for lex.RuneEOF, and the quoted rune (%q) for anything else.
func str(r rune) string {
	switch {
	case r == FUNC:
		return "FUNC"
	case r == INT:
		return "INT"
	case r == IDENT:
		return "IDENT"
	case r == lex.RuneEOF:
		return "EOF"
	default:
		return fmt.Sprintf("%q", r)
	}
}

// lexer embeds the run time *lex.Lexer so that the generated scan method and
// the char helper can be declared on a type local to this program while
// still calling the library methods (Enter, Rule0, Next, Mark, Abort, ...)
// directly on l.
type lexer struct {
	*lex.Lexer
}

// char builds the lex.Char returned by scan: the token value r, converted to
// a rune, paired with the position of l.First.
func (l *lexer) char(r int) lex.Char {
	pos := l.First.Pos()
	value := rune(r)
	return lex.NewChar(pos, value)
}

// rune2Class maps a rune to the character class consumed by the DFA. ASCII
// runes map to themselves; any other rune maps to classUnicodeLeter if it is
// a letter, to classUnicodeDigit if it is a digit, and to classOther
// otherwise. It is installed in main via the lex.RuneClass option.
func rune2Class(r rune) int {
	switch {
	case 0 <= r && r < 0x80: // Keep ASCII as it is.
		return int(r)
	case unicode.IsLetter(r):
		return classUnicodeLeter
	case unicode.IsDigit(r):
		return classUnicodeDigit
	default:
		return classOther
	}
}

// src is the input text tokenized by main. The blank lines around the
// function are part of the input, so the line:column positions printed by
// main depend on the exact layout of this literal.
const src = `

func Xφ42() int { return 314 }

`

// main tokenizes src and prints one line per token: its position, its
// symbolic value and the bytes collected for it. It stops after printing the
// EOF token and panics if the lexer cannot be created.
func main() {
	fset := token.NewFileSet()
	file := fset.AddFile("example.go", -1, len(src))

	// rune2Class replaces the default rune-to-class conversion.
	input := bytes.NewBufferString(src)
	lx, err := lex.New(file, input, lex.RuneClass(rune2Class))
	if err != nil {
		panic(err)
	}

	l := &lexer{lx}
	for done := false; !done; {
		c := l.scan()
		fmt.Printf("%v: %v %q\n", file.Position(c.Pos()), str(c.Rune), l.TokenBytes(nil))
		done = c.Rune == lex.RuneEOF
	}
}

// scan recognizes the next token and returns it as a lex.Char whose Rune is
// FUNC, IDENT or INT, or — when no rule matches — the value reported by
// l.Abort. White space (rule 1) is matched and skipped by restarting the
// automaton.
//
// The body is the golex generated DFA: every yystateN label is one state,
// yyrule holds the number of the most recently accepted rule, and l.Mark
// records the accepting position. The class codes '\u0080' and '\u0081' are
// the unicodeLetter and unicodeDigit definitions from the .l input.
func (l *lexer) scan() lex.Char {
	c := l.Enter()

	// Start (or restart, after skipped white space) recognizing a token.
yystate0:
	yyrule := -1
	_ = yyrule
	c = l.Rule0()

	goto yystart1

	goto yystate0 // silence unused label error
	goto yyAction // silence unused label error
	// Dispatch to the action of the rule last recorded in yyrule.
yyAction:
	switch yyrule {
	case 1:
		goto yyrule1
	case 2:
		goto yyrule2
	case 3:
		goto yyrule3
	case 4:
		goto yyrule4
	}
	goto yystate1 // silence unused label error
yystate1:
	c = l.Next()
	// Initial state: choose the branch from the first character class.
yystart1:
	switch {
	default:
		goto yyabort
	case c == '\t' || c == '\n' || c == '\r' || c == ' ':
		goto yystate2
	case c == 'f':
		goto yystate5
	case c >= '0' && c <= '9':
		goto yystate3
	case c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'e' || c >= 'g' && c <= 'z' || c == '\u0080':
		goto yystate4
	}

	// White space seen so far; accepts rule 1.
yystate2:
	c = l.Next()
	yyrule = 1
	l.Mark()
	switch {
	default:
		goto yyrule1
	case c == '\t' || c == '\n' || c == '\r' || c == ' ':
		goto yystate2
	}

	// ASCII digits seen so far; accepts rule 4 ({int}).
yystate3:
	c = l.Next()
	yyrule = 4
	l.Mark()
	switch {
	default:
		goto yyrule4
	case c >= '0' && c <= '9':
		goto yystate3
	}

	// Inside an identifier; accepts rule 3 ({identifier}).
yystate4:
	c = l.Next()
	yyrule = 3
	l.Mark()
	switch {
	default:
		goto yyrule3
	case c >= '0' && c <= '9' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z' || c == '\u0080' || c == '\u0081':
		goto yystate4
	}

	// "f" seen: an identifier so far, still a possible prefix of "func".
yystate5:
	c = l.Next()
	yyrule = 3
	l.Mark()
	switch {
	default:
		goto yyrule3
	case c == 'u':
		goto yystate6
	case c >= '0' && c <= '9' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 't' || c >= 'v' && c <= 'z' || c == '\u0080' || c == '\u0081':
		goto yystate4
	}

	// "fu" seen.
yystate6:
	c = l.Next()
	yyrule = 3
	l.Mark()
	switch {
	default:
		goto yyrule3
	case c == 'n':
		goto yystate7
	case c >= '0' && c <= '9' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'm' || c >= 'o' && c <= 'z' || c == '\u0080' || c == '\u0081':
		goto yystate4
	}

	// "fun" seen.
yystate7:
	c = l.Next()
	yyrule = 3
	l.Mark()
	switch {
	default:
		goto yyrule3
	case c == 'c':
		goto yystate8
	case c >= '0' && c <= '9' || c >= 'A' && c <= 'Z' || c == '_' || c == 'a' || c == 'b' || c >= 'd' && c <= 'z' || c == '\u0080' || c == '\u0081':
		goto yystate4
	}

	// "func" seen; accepts rule 2 unless further identifier characters
	// follow, in which case the token continues as an identifier.
yystate8:
	c = l.Next()
	yyrule = 2
	l.Mark()
	switch {
	default:
		goto yyrule2
	case c >= '0' && c <= '9' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z' || c == '\u0080' || c == '\u0081':
		goto yystate4
	}

yyrule1: // [ \t\r\n]+

	goto yystate0
yyrule2: // func
	{
		return l.char(FUNC)
	}
yyrule3: // {identifier}
	{
		return l.char(IDENT)
	}
yyrule4: // {int}
	{
		return l.char(INT)
	}
	panic("unreachable")

	goto yyabort // silence unused label error

yyabort: // no lexem recognized
	// The c declared here shadows the outer c. When ok is true the value
	// from Abort is returned as a token of its own; otherwise control goes
	// to yyAction to run the action selected by yyrule (see the
	// lex.Lexer.Abort documentation for the rollback to the marked state).
	if c, ok := l.Abort(); ok {
		return l.char(c)
	}

	goto yyAction
}
Output:

example.go:3:1: FUNC "func"
example.go:3:6: IDENT "Xφ42"
example.go:3:11: '(' "("
example.go:3:12: ')' ")"
example.go:3:14: IDENT "int"
example.go:3:18: '{' "{"
example.go:3:20: IDENT "return"
example.go:3:27: INT "314"
example.go:3:31: '}' "}"
example.go:4:2: EOF "\xff"

Index

Examples

Constants

View Source
// BOM (byte order mark) handling modes, selected with the BOMMode option.
// The default is BOMIgnoreFirst.
const (
	BOMError       = iota // BOM is an error anywhere.
	BOMIgnoreFirst        // Skip BOM if at beginning, report as error if anywhere else.
	BOMPassAll            // No special handling of BOM.
	BOMPassFirst          // No special handling of BOM if at beginning, report as error if anywhere else.
)

BOM handling modes which can be set by the BOMMode Option. Default is BOMIgnoreFirst.

View Source
// Special values: the character class used by DefaultRuneClass for non-ASCII
// runes, and the rune value that signals end of input.
const (
	NonASCII = 0x80 // DefaultRuneClass returns NonASCII for non ASCII runes.
	RuneEOF  = -1   // Distinct from any valid Unicode rune value.
)

Variables

This section is empty.

Functions

func DefaultRuneClass

func DefaultRuneClass(r rune) int

DefaultRuneClass returns the character class of r. If r is an ASCII code then its class equals the ASCII code. Any other rune is of class NonASCII.

DefaultRuneClass is the default implementation Lexer will use to convert runes (21 bit entities) to scanner classes (8 bit entities).

Non ASCII aware lexical analyzers will typically use their own categorization function. To assign such custom function use the RuneClass option.

Types

type Char

// Char represents a rune and its position. The position is not an exported
// field; it is read with the Pos method and set through NewChar.
type Char struct {
	Rune rune
	// contains filtered or unexported fields
}

Char represents a rune and its position.

func NewChar

func NewChar(pos token.Pos, r rune) Char

NewChar returns a new Char value.

func (Char) IsValid

func (c Char) IsValid() bool

IsValid reports whether c is not a zero Char.

func (Char) Pos

func (c Char) Pos() token.Pos

Pos returns the token.Pos associated with c.

type CharReader

// CharReader is like a RuneReader but returns a Char, which carries explicit
// position information, instead of a bare rune as its first result.
type CharReader interface {
	ReadChar() (c Char, size int, err error)
}

CharReader is a RuneReader providing additionally explicit position information by returning a Char instead of a rune as its first result.

type Lexer

// Lexer supports golex generated lexical analyzers. The exported fields
// expose the source file and the most recent lookahead chars.
type Lexer struct {
	File  *token.File // The *token.File passed to New.
	First Char        // First remembers the lookahead char when Rule0 was invoked.
	Last  Char        // Last remembers the last Char returned by Next.
	Prev  Char        // Prev remembers the Char previous to Last.
	// contains filtered or unexported fields
}

Lexer supports golex[0] generated lexical analyzers.

func New

func New(file *token.File, src io.RuneReader, opts ...Option) (*Lexer, error)

New returns a new *Lexer. The result can be amended using opts.

Non Unicode Input

To consume sources in other encodings and still have exact position information, pass an io.RuneReader which returns the next input character reencoded as a Unicode rune but returns the size (number of bytes used to encode it) of the original character, not the size of its UTF-8 representation after conversion to a Unicode rune. Size is the second returned value of the io.RuneReader.ReadRune method[4].

When src optionally implements CharReader its ReadChar method is used instead of io.RuneReader's ReadRune.

func (*Lexer) Abort

func (l *Lexer) Abort() (int, bool)

Abort handles the situation when the scanner does not successfully recognize any token or when an attempt to find the longest match "overruns" from an accepting state only to never reach an accepting state again. In the first case the scanner was never in an accepting state since the last call to Rule0 and then (previousLookahead rune, true) is returned, effectively consuming a single Char token, avoiding scanner stall. Otherwise there was at least one accepting scanner state marked using Mark. In this case Abort rolls back the lexer state to the marked state and returns (0, false). The scanner must then execute a prescribed goto statement. For example:

%yyc c
%yyn c = l.Next()
%yym l.Mark()

%{
package foo

import (...)

type lexer struct {
	*lex.Lexer
	...
}

func newLexer(...) *lexer {
	return &lexer{
		lex.NewLexer(...),
		...
	}
}

func (l *lexer) scan() int {
        c := l.Enter()
%}

... more lex definitions

%%

        c = l.Rule0()

... lex rules

%%

	if c, ok := l.Abort(); ok {
		return c
	}

	goto yyAction
}

func (*Lexer) Enter

func (l *Lexer) Enter() int

Enter ensures the lexer has a valid lookahead Char and returns its class. Typical use in an .l file

func (l *lexer) scan() lex.Char {
	c := l.Enter()
	...

func (*Lexer) Error

func (l *Lexer) Error(msg string)

Error implements yyLexer[2] by printing the msg to stderr.

func (*Lexer) Lookahead

func (l *Lexer) Lookahead() Char

Lookahead returns the current lookahead.

func (*Lexer) Mark

func (l *Lexer) Mark()

Mark records the current state of scanner as accepting. It implements the golex macro %yym. Typical usage in an .l file:

%yym l.Mark()

func (*Lexer) Next

func (l *Lexer) Next() int

Next advances the scanner by one rune and returns the respective character class of the new lookahead. Typical usage in an .l file:

%yyn c = l.Next()

func (*Lexer) Offset

func (l *Lexer) Offset() int

Offset returns the current reading offset of the lexer's source.

func (*Lexer) Rule0

func (l *Lexer) Rule0() int

Rule0 initializes the scanner state before the attempt to recognize a token starts. The token collecting buffer is cleared. Rule0 records the current lookahead in l.First and returns its class. Typical usage in an .l file:

... lex definitions

%%

	c := l.Rule0()

first-pattern-regexp

func (*Lexer) Token

func (l *Lexer) Token() []Char

Token returns the currently collected token chars. The result is R/O.

func (*Lexer) TokenBytes

func (l *Lexer) TokenBytes(builder func(*bytes.Buffer)) []byte

TokenBytes returns the UTF-8 encoding of Token. If builder is not nil then it's called instead to build the encoded token byte value into the buffer passed to it.

The Result is R/O.

func (*Lexer) Unget

func (l *Lexer) Unget(c ...Char)

Unget unreads all chars in c.

type Option

type Option func(*Lexer) error

Option is a function which can be passed as an optional argument to New.

func BOMMode

func BOMMode(mode int) Option

BOMMode option selects how the lexer handles BOMs. See the BOM* constants for details.

func ErrorFunc

func ErrorFunc(f func(token.Pos, string)) Option

ErrorFunc option sets a function called when an error, for example an I/O error, occurs. The default is to call Error with the position and message already formatted as a string.

func RuneClass

func RuneClass(f func(rune) int) Option

RuneClass option sets the function used to convert runes to character classes.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL