Documentation
¶
Index ¶
- Constants
- func CBToFreq(cB int) float64
- func DigitFreq(text string) float64
- func FreqToZipf(freq float64) float64
- func HasDigitSequence(text string) bool
- func SmashNumbers(text string) string
- func ZipfToFreq(zipf float64) float64
- type ChineseProcessor
- type DataLoader
- func (dl *DataLoader) GetFrequencyDict(lang Language, wordlist WordlistType) (map[string]float64, error)
- func (dl *DataLoader) LoadChineseMapping() (map[rune]string, error)
- func (dl *DataLoader) ReadCBPack(filename string) ([][]string, error)
- func (dl *DataLoader) ReadTextFile(filename string) (string, error)
- type JapaneseProcessor
- type Language
- type LanguageInfo
- type Tokenizer
- type TokenizerType
- type WordFreq
- type WordlistType
Constants ¶
const (
	// CacheSize for frequency lookups
	CacheSize = 100000
	// InferredSpaceFactor is applied for each inferred word boundary in Chinese
	InferredSpaceFactor = 10.0
)
Constants from the original implementation
Variables ¶
This section is empty.
Functions ¶
func FreqToZipf ¶
FreqToZipf converts a frequency proportion to the Zipf scale.
func HasDigitSequence ¶
HasDigitSequence returns true if the text has a digit sequence that will be normalized out and handled with DigitFreq.
func SmashNumbers ¶
SmashNumbers replaces sequences of multiple digits with zeroes, so we don't need to distinguish the frequencies of thousands of numbers
func ZipfToFreq ¶
ZipfToFreq converts Zipf scale to frequency proportion
Types ¶
type ChineseProcessor ¶
type ChineseProcessor struct {
// contains filtered or unexported fields
}
ChineseProcessor handles Chinese text processing and tokenization
func NewChineseProcessor ¶
func NewChineseProcessor(dataLoader *DataLoader) *ChineseProcessor
NewChineseProcessor creates a new Chinese processor
func (*ChineseProcessor) SimplifyChinese ¶
func (cp *ChineseProcessor) SimplifyChinese(text string) (string, error)
SimplifyChinese converts Chinese text character-by-character to Simplified Chinese
type DataLoader ¶
type DataLoader struct {
// contains filtered or unexported fields
}
DataLoader handles loading and caching of embedded wordfreq data files
func (*DataLoader) GetFrequencyDict ¶
func (dl *DataLoader) GetFrequencyDict(lang Language, wordlist WordlistType) (map[string]float64, error)
GetFrequencyDict converts a frequency list to a map for faster lookups.
func (*DataLoader) LoadChineseMapping ¶
func (dl *DataLoader) LoadChineseMapping() (map[rune]string, error)
LoadChineseMapping loads the Traditional->Simplified Chinese character mapping from embedded data
func (*DataLoader) ReadCBPack ¶
func (dl *DataLoader) ReadCBPack(filename string) ([][]string, error)
ReadCBPack reads a cBpack file from embedded data and returns the frequency data
func (*DataLoader) ReadTextFile ¶
func (dl *DataLoader) ReadTextFile(filename string) (string, error)
ReadTextFile reads a text file from embedded data
type JapaneseProcessor ¶
type JapaneseProcessor struct {
// contains filtered or unexported fields
}
JapaneseProcessor handles Japanese text processing and tokenization
func NewJapaneseProcessor ¶
func NewJapaneseProcessor() (*JapaneseProcessor, error)
NewJapaneseProcessor creates a new Japanese processor
type LanguageInfo ¶
type LanguageInfo struct {
Tokenizer TokenizerType
LookupTransliteration string
}
LanguageInfo contains metadata about how to handle a language
func GetLanguageInfo ¶
func GetLanguageInfo(lang Language) LanguageInfo
GetLanguageInfo returns metadata about how to handle text in a given language
type Tokenizer ¶
type Tokenizer struct {
// contains filtered or unexported fields
}
Tokenizer handles text tokenization for Chinese and Japanese
func NewTokenizer ¶
func NewTokenizer(dataLoader *DataLoader) (*Tokenizer, error)
NewTokenizer creates a new tokenizer
type TokenizerType ¶
type TokenizerType string
TokenizerType represents different tokenization methods
const (
	TokenizerGse    TokenizerType = "gse"
	TokenizerKagome TokenizerType = "kagome"
)
type WordFreq ¶
type WordFreq struct {
// contains filtered or unexported fields
}
WordFreq is the main interface for word frequency lookups
func (*WordFreq) WordFrequency ¶
func (wf *WordFreq) WordFrequency(word string, lang Language, wordlist WordlistType, minimum float64) (float64, error)
WordFrequency gets the frequency of a word in the specified language. It returns a value between 0 and 1, where 1 means the word accounts for every token.
func (*WordFreq) ZipfFrequency ¶
func (wf *WordFreq) ZipfFrequency(word string, lang Language, wordlist WordlistType, minimum float64) (float64, error)
ZipfFrequency gets the frequency of a word on the Zipf scale
type WordlistType ¶
type WordlistType string
WordlistType represents different wordlist sizes
const (
	WordlistSmall WordlistType = "small"
	WordlistLarge WordlistType = "large"
	WordlistBest  WordlistType = "best"
)