Documentation
¶
Overview ¶
Package tokenizer provides text tokenization for ML model inference.
The Tokenizer interface abstracts over different tokenization algorithms (whitespace, BPE, SentencePiece). Implementations include WhitespaceTokenizer for testing and BPETokenizer for production use with HuggingFace models.
Index ¶
- type BPETokenizer
- func (t *BPETokenizer) Decode(ids []int) (string, error)
- func (t *BPETokenizer) Encode(text string) ([]int, error)
- func (t *BPETokenizer) EncodeWithSpecialTokens(text string, addBOS bool, addEOS bool) ([]int, error)
- func (t *BPETokenizer) GetID(token string) (int, bool)
- func (t *BPETokenizer) GetToken(id int) (string, bool)
- func (t *BPETokenizer) SetSentencePiece(enabled bool)
- func (t *BPETokenizer) SetSpecialTokenStrings(tokens map[string]int)
- func (t *BPETokenizer) SpecialTokens() SpecialTokens
- func (t *BPETokenizer) VocabSize() int
- type MergePair
- type NormalizerFunc
- type SpecialTokens
- type Tokenizer
- type WhitespaceTokenizer
- func (t *WhitespaceTokenizer) AddToken(token string) int
- func (t *WhitespaceTokenizer) Decode(ids []int) (string, error)
- func (t *WhitespaceTokenizer) Encode(text string) ([]int, error)
- func (t *WhitespaceTokenizer) GetID(token string) (int, bool)
- func (t *WhitespaceTokenizer) GetToken(id int) (string, bool)
- func (t *WhitespaceTokenizer) SpecialTokens() SpecialTokens
- func (t *WhitespaceTokenizer) VocabSize() int
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type BPETokenizer ¶
type BPETokenizer struct {
// contains filtered or unexported fields
}
BPETokenizer implements the Tokenizer interface using byte-pair encoding. It loads vocabulary and merge rules from HuggingFace tokenizer.json format.
func LoadFromJSON ¶
func LoadFromJSON(path string) (*BPETokenizer, error)
LoadFromJSON reads a HuggingFace tokenizer.json file and returns a BPETokenizer.
func NewBPETokenizer ¶
func NewBPETokenizer(vocab map[string]int, merges []MergePair, special SpecialTokens, byteLevelBPE bool) *BPETokenizer
NewBPETokenizer creates a BPETokenizer from vocabulary, merge rules, and special tokens.
func (*BPETokenizer) Decode ¶
func (t *BPETokenizer) Decode(ids []int) (string, error)
Decode converts token IDs back to text.
func (*BPETokenizer) Encode ¶
func (t *BPETokenizer) Encode(text string) ([]int, error)
Encode tokenizes text into a sequence of token IDs using BPE.
func (*BPETokenizer) EncodeWithSpecialTokens ¶
func (t *BPETokenizer) EncodeWithSpecialTokens(text string, addBOS bool, addEOS bool) ([]int, error)
EncodeWithSpecialTokens wraps Encode and optionally prepends BOS / appends EOS.
func (*BPETokenizer) GetID ¶
func (t *BPETokenizer) GetID(token string) (int, bool)
GetID returns the token ID for a given token string and reports whether the token exists in the vocabulary.
func (*BPETokenizer) GetToken ¶
func (t *BPETokenizer) GetToken(id int) (string, bool)
GetToken returns the token string for a given ID and reports whether the ID exists in the vocabulary.
func (*BPETokenizer) SetSentencePiece ¶
func (t *BPETokenizer) SetSentencePiece(enabled bool)
SetSentencePiece enables SentencePiece-style pre-tokenization where spaces are replaced with ▁ (U+2581) and the text is split at ▁ boundaries.
func (*BPETokenizer) SetSpecialTokenStrings ¶
func (t *BPETokenizer) SetSpecialTokenStrings(tokens map[string]int)
SetSpecialTokenStrings registers token strings that should be matched as single tokens during encoding instead of being split by BPE.
func (*BPETokenizer) SpecialTokens ¶
func (t *BPETokenizer) SpecialTokens() SpecialTokens
SpecialTokens returns the special token configuration.
func (*BPETokenizer) VocabSize ¶
func (t *BPETokenizer) VocabSize() int
VocabSize returns the number of tokens in the vocabulary.
type NormalizerFunc ¶
NormalizerFunc transforms text before tokenization.
type SpecialTokens ¶
type SpecialTokens struct {
BOS int // Beginning of sequence
EOS int // End of sequence
PAD int // Padding
UNK int // Unknown token
}
SpecialTokens holds IDs for commonly used special tokens.
type Tokenizer ¶
type Tokenizer interface {
// Encode converts text into a sequence of token IDs.
Encode(text string) ([]int, error)
// Decode converts a sequence of token IDs back into text.
Decode(ids []int) (string, error)
// VocabSize returns the total number of tokens in the vocabulary.
VocabSize() int
// GetToken returns the string token for a given ID and whether it exists.
GetToken(id int) (string, bool)
// GetID returns the token ID for a given string and whether it exists.
GetID(token string) (int, bool)
// SpecialTokens returns the special token IDs for this tokenizer.
SpecialTokens() SpecialTokens
}
Tokenizer is the interface for all tokenizer implementations.
type WhitespaceTokenizer ¶
type WhitespaceTokenizer struct {
// contains filtered or unexported fields
}
WhitespaceTokenizer provides simple whitespace-based tokenization. It splits text on whitespace boundaries and maps words to integer IDs. Useful for testing and non-production scenarios.
func NewWhitespaceTokenizer ¶
func NewWhitespaceTokenizer() *WhitespaceTokenizer
NewWhitespaceTokenizer creates a WhitespaceTokenizer pre-loaded with standard special tokens: <unk> (0), <s> (1), </s> (2), <pad> (3).
func (*WhitespaceTokenizer) AddToken ¶
func (t *WhitespaceTokenizer) AddToken(token string) int
AddToken adds a token to the vocabulary if it does not already exist. It returns the token's ID in either case.
func (*WhitespaceTokenizer) Decode ¶
func (t *WhitespaceTokenizer) Decode(ids []int) (string, error)
Decode converts token IDs back to a space-separated string.
func (*WhitespaceTokenizer) Encode ¶
func (t *WhitespaceTokenizer) Encode(text string) ([]int, error)
Encode splits text on whitespace and returns token IDs. Unknown words map to the UNK token ID.
func (*WhitespaceTokenizer) GetID ¶
func (t *WhitespaceTokenizer) GetID(token string) (int, bool)
GetID returns the token ID for a given token string and reports whether the token exists in the vocabulary.
func (*WhitespaceTokenizer) GetToken ¶
func (t *WhitespaceTokenizer) GetToken(id int) (string, bool)
GetToken returns the token string for a given ID and reports whether the ID exists in the vocabulary.
func (*WhitespaceTokenizer) SpecialTokens ¶
func (t *WhitespaceTokenizer) SpecialTokens() SpecialTokens
SpecialTokens returns the special token IDs.
func (*WhitespaceTokenizer) VocabSize ¶
func (t *WhitespaceTokenizer) VocabSize() int
VocabSize returns the number of tokens in the vocabulary.