Documentation
¶
Index ¶
- Constants
- func ArrayAsRanges(runes []rune) []rangeTuple
- type BGERank
- type BGERanks
- type GPTEncoder
- func NewCLIPEncoder() GPTEncoder
- func NewEncoder(vocabId string) (*GPTEncoder, error)
- func NewGPT2Encoder() GPTEncoder
- func NewLlama2Encoder() GPTEncoder
- func NewLlama3Encoder() GPTEncoder
- func NewMistralEncoder() GPTEncoder
- func NewNerdstashV1Encoder() GPTEncoder
- func NewNerdstashV2Encoder() GPTEncoder
- func NewPileEncoder() GPTEncoder
- func (encoder *GPTEncoder) AlignAndSizeTokens(tokens *Tokens, desiredLength int) (alignedTokens Tokens, endAt int)
- func (encoder *GPTEncoder) Clone() *GPTEncoder
- func (encoder *GPTEncoder) Decode(encoded *Tokens) (text string)
- func (encoder *GPTEncoder) DecodeBuffer(encoded *[]byte, useUint32 bool) (text string)
- func (encoder *GPTEncoder) Encode(text *string) *Tokens
- func (encoder *GPTEncoder) EncodeBuffer(buffer *[]byte) (*[]byte, uint64)
- func (encoder *GPTEncoder) EncodeReader(reader io.RuneReader) *Tokens
- func (encoder *GPTEncoder) Get(text string) *Token
- func (encoder *GPTEncoder) IsByteToken(token *Token) bool
- func (encoder *GPTEncoder) IsLastTokenByte(tokens *Tokens) bool
- func (encoder *GPTEncoder) SplitWords(text *string) *[]string
- func (encoder *GPTEncoder) StreamingEncode(reader io.RuneReader) func(int) *Tokens
- func (encoder *GPTEncoder) ToBPE(text string) Tokens
- func (encoder *GPTEncoder) TokensReady(tokens *Tokens) bool
- func (encoder *GPTEncoder) TrimIncompleteSentence(tokens *Tokens) (*Tokens, error)
- func (encoder *GPTEncoder) TrimNewlines(tokens *Tokens, direction TrimDirection, limit uint) (*Tokens, error)
- func (encoder *GPTEncoder) TrimSentences(tokens *Tokens, direction TrimDirection, limit uint) (*Tokens, error)
- func (encoder *GPTEncoder) TrimTokens(tokens *Tokens) (trimmed *Tokens)
- func (encoder *GPTEncoder) UpdateSpecialsTree()
- func (encoder *GPTEncoder) WordSplitter(reader io.RuneReader) func() *string
- type GPTPair
- type NextRuneFunc
- type PreallocBGERanks
- type RangeLUT
- type RegexNode
- type RuneNode
- type RuneNodes
- type Token
- type TokenPair
- type Tokens
- type TrimDirection
- type TypedTwoTierCache
- type WordCallback
Constants ¶
const BPE_LRU_SZ = 16384
const PUNC_REGEX = "\\p{L}[.!?;]\\p{L}"
const REGEX_ERROR = "gpt_bpe: Fatal error compiling regular expression: %v"
const RUNEBUF_SZ = 16384
const SPLIT_REGEX = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L" +
"}+| ?\\p{N}+| ?[^\\s\\p{L" +
"}\\p{N}]+|\\s+(\\S){0}|\\s+"
const VOCAB_ID_CLIP = "clip-tokenizer"
const VOCAB_ID_GPT2 = "gpt2-tokenizer"
const VOCAB_ID_LLAMA = "llama-tokenizer"
const VOCAB_ID_LLAMA_3 = "llama3-tokenizer"
const VOCAB_ID_MISTRAL = "mistral-tokenizer"
const VOCAB_ID_NERDSTASH_V1 = "nerdstash_v1-tokenizer"
const VOCAB_ID_NERDSTASH_V2 = "nerdstash_v2-tokenizer"
const VOCAB_ID_PILE = "pile-tokenizer"
const WORDCHAN_SZ = 4096
Variables ¶
This section is empty.
Functions ¶
func ArrayAsRanges ¶
func ArrayAsRanges(runes []rune) []rangeTuple
The AST is given as a []rune where every two runes are the start and end of a range. We want to convert this to a list of rangeTuples for easier handling.
Types ¶
type GPTEncoder ¶
type GPTEncoder struct { Encoder map[string]Token Decoder map[Token][]byte BpeRanks map[GPTPair]float64 TokenMerges map[TokenPair]Token BytesEncoder *map[byte]Token Specials map[string]Tokens SpecialsTree *RuneNode Cache *lru.ARCCache TwoTierCache *TypedTwoTierCache PuncRunes []rune Normalizer *strings.Replacer DecodeExtra *strings.Replacer BosToken Token EosToken Token PadToken Token LruHits int LruMisses int LruEvictions int LruSize int SplitterThreads int VocabId string // contains filtered or unexported fields }
func NewCLIPEncoder ¶
func NewCLIPEncoder() GPTEncoder
func NewEncoder ¶
func NewEncoder(vocabId string) (*GPTEncoder, error)
NewEncoder returns a GPTEncoder with the tokenizer data loaded for that vocabulary id.
func NewGPT2Encoder ¶
func NewGPT2Encoder() GPTEncoder
func NewLlama2Encoder ¶
func NewLlama2Encoder() GPTEncoder
func NewLlama3Encoder ¶
func NewLlama3Encoder() GPTEncoder
func NewMistralEncoder ¶
func NewMistralEncoder() GPTEncoder
func NewNerdstashV1Encoder ¶
func NewNerdstashV1Encoder() GPTEncoder
func NewNerdstashV2Encoder ¶
func NewNerdstashV2Encoder() GPTEncoder
func NewPileEncoder ¶
func NewPileEncoder() GPTEncoder
func (*GPTEncoder) AlignAndSizeTokens ¶
func (encoder *GPTEncoder) AlignAndSizeTokens( tokens *Tokens, desiredLength int, ) ( alignedTokens Tokens, endAt int, )
func (*GPTEncoder) Clone ¶
func (encoder *GPTEncoder) Clone() *GPTEncoder
func (*GPTEncoder) Decode ¶
func (encoder *GPTEncoder) Decode(encoded *Tokens) (text string)
Decode Tokens back into a string, handling unicode.
func (*GPTEncoder) DecodeBuffer ¶
func (encoder *GPTEncoder) DecodeBuffer( encoded *[]byte, useUint32 bool, ) (text string)
DecodeBuffer decodes Tokens from a byte array into a string.
func (*GPTEncoder) Encode ¶
func (encoder *GPTEncoder) Encode(text *string) *Tokens
Encode encodes a string into a sequence of tokens.
func (*GPTEncoder) EncodeBuffer ¶
func (encoder *GPTEncoder) EncodeBuffer(buffer *[]byte) ( *[]byte, uint64, )
EncodeBuffer takes a byte array and encodes it into Tokens in another byte array.
func (*GPTEncoder) EncodeReader ¶
func (encoder *GPTEncoder) EncodeReader(reader io.RuneReader) *Tokens
func (*GPTEncoder) Get ¶
func (encoder *GPTEncoder) Get(text string) *Token
Get looks up text in the Encoder and returns the Token representation of it. If the text is not found, nil is returned.
func (*GPTEncoder) IsByteToken ¶
func (encoder *GPTEncoder) IsByteToken(token *Token) bool
IsByteToken Determine if the token is a byte token.
func (*GPTEncoder) IsLastTokenByte ¶
func (encoder *GPTEncoder) IsLastTokenByte(tokens *Tokens) bool
IsLastTokenByte Determine if the last token in the sequence is a byte token.
func (*GPTEncoder) SplitWords ¶
func (encoder *GPTEncoder) SplitWords(text *string) *[]string
func (*GPTEncoder) StreamingEncode ¶
func (encoder *GPTEncoder) StreamingEncode(reader io.RuneReader) func(int) *Tokens
func (*GPTEncoder) ToBPE ¶
func (encoder *GPTEncoder) ToBPE(text string) Tokens
ToBPE performs bigram ranking and merges on the given pre-split text, and returns Tokens. Reusable buffers for common operations are declared at package level.
func (*GPTEncoder) TokensReady ¶
func (encoder *GPTEncoder) TokensReady(tokens *Tokens) bool
TokensReady Determine if the sequence of Tokens given is ready to be serialized to string, based on if the sequence will produce valid Unicode runes.
func (*GPTEncoder) TrimIncompleteSentence ¶
func (encoder *GPTEncoder) TrimIncompleteSentence(tokens *Tokens) ( *Tokens, error, )
func (*GPTEncoder) TrimNewlines ¶
func (encoder *GPTEncoder) TrimNewlines( tokens *Tokens, direction TrimDirection, limit uint, ) (*Tokens, error)
func (*GPTEncoder) TrimSentences ¶
func (encoder *GPTEncoder) TrimSentences( tokens *Tokens, direction TrimDirection, limit uint, ) (*Tokens, error)
func (*GPTEncoder) TrimTokens ¶
func (encoder *GPTEncoder) TrimTokens(tokens *Tokens) (trimmed *Tokens)
TrimTokens trims the given Tokens to tokens that produce valid Unicode.
func (*GPTEncoder) UpdateSpecialsTree ¶
func (encoder *GPTEncoder) UpdateSpecialsTree()
func (*GPTEncoder) WordSplitter ¶
func (encoder *GPTEncoder) WordSplitter(reader io.RuneReader) func() *string
type NextRuneFunc ¶
type PreallocBGERanks ¶
type PreallocBGERanks struct {
// contains filtered or unexported fields
}
func NewPreallocBGERanks ¶
func NewPreallocBGERanks(capacity int) *PreallocBGERanks
func (*PreallocBGERanks) InsertSorted ¶
func (p *PreallocBGERanks) InsertSorted(v BGERank)
type RegexNode ¶
type RegexNode struct {
// contains filtered or unexported fields
}
Nodes of the regex tree
func CreateRegexTree ¶
func (*RegexNode) EvaluateRegexTree ¶
We want to take a string and use pre-order traversal to match the string to the tree, in a regex-like fashion. This is much faster than using the regex package. The input is a path map generated from the regex tree, plus the runes to match; the output is a list of strings that have been matched.
func (*RegexNode) GeneratePathMap ¶
We need a path map to know where we are in the tree.
type RuneNode ¶
type RuneNode struct {
// contains filtered or unexported fields
}
func CreateContractionsTree ¶
func CreateContractionsTree() *RuneNode
CreateContractionsTree creates a specialized RuneTree for handling contractions.
func CreateRuneTree ¶
Create a new rune tree from an array of strings to match against.
func NewRuneTree ¶
func NewRuneTree() *RuneNode
func (*RuneNode) InsertIntoRuneTree ¶
func (*RuneNode) InsertReplacementsIntoRuneTree ¶
type TrimDirection ¶
type TrimDirection uint
const ( TrimTop TrimDirection = iota TrimBottom TrimDirection = iota TrimNone TrimDirection = iota )
type TypedTwoTierCache ¶
type TypedTwoTierCache struct {
// contains filtered or unexported fields
}
type WordCallback ¶
type WordCallback func([]string)
Directories
¶
Path | Synopsis |
---|---|
cmd
|
|
detokenizer
command
|
|
model_downloader
command
|
|
sentencepiece_converter
command
|
|
tokens_transformer
command
|
|
dataset_tokenizer
module
|
|
tokenizer_repl
module
|
|