Documentation
¶
Overview ¶
Package gse Go efficient multilingual NLP and text segmentation
Index ¶
- Constants
- Variables
- func DictPaths(dictDir, filePath string) (files []string)
- func FilterEmoji(text string) (new string)
- func FilterHtml(text string) string
- func FilterLang(text, lang string) (new string)
- func FilterSymbol(text string) (new string)
- func FindAllOccs(data []byte, searches []string) map[string][]int
- func GetVersion() string
- func IsJp(segText string) bool
- func Join(text []Text) string
- func Range(text string) (new []string)
- func RangeText(text string) (new string)
- func SplitNum(text string) []string
- func SplitNums(text string) string
- func ToSlice(segs []Segment, searchMode ...bool) (output []string)
- func ToString(segs []Segment, searchMode ...bool) (output string)
- type AnalyzeToken
- type Dictionary
- func (dict *Dictionary) AddToken(token Token) error
- func (dict *Dictionary) Find(word []byte) (float64, string, bool)
- func (dict *Dictionary) LookupTokens(words []Text, tokens []*Token) (numOfTokens int)
- func (dict *Dictionary) MaxTokenLen() int
- func (dict *Dictionary) NumTokens() int
- func (dict *Dictionary) RemoveToken(token Token) error
- func (dict *Dictionary) TotalFreq() float64
- func (dict *Dictionary) Value(word []byte) (val, id int, err error)
- type Prob
- type SegPos
- type Segment
- type Segmenter
- func (seg *Segmenter) AddStop(text string)
- func (seg *Segmenter) AddStopArr(text ...string)
- func (seg *Segmenter) AddToken(text string, freq float64, pos ...string) error
- func (seg *Segmenter) AddTokenForce(text string, freq float64, pos ...string) (err error)
- func (seg *Segmenter) Analyze(text []string, t1 string, by ...bool) (az []AnalyzeToken)
- func (seg *Segmenter) CalcToken()
- func (seg *Segmenter) Cut(str string, hmm ...bool) []string
- func (seg *Segmenter) CutAll(str string) []string
- func (seg *Segmenter) CutDAG(str string, reg ...*regexp.Regexp) []string
- func (seg *Segmenter) CutDAGNoHMM(str string) []string
- func (seg *Segmenter) CutSearch(str string, hmm ...bool) []string
- func (seg *Segmenter) CutStop(str string, hmm ...bool) []string
- func (seg *Segmenter) CutStr(str []string, separator ...string) (r string)
- func (seg *Segmenter) CutTrim(str string, hmm ...bool) []string
- func (seg *Segmenter) CutTrimHtml(str string, hmm ...bool) []string
- func (seg *Segmenter) CutTrimHtmls(str string, hmm ...bool) string
- func (seg *Segmenter) CutUrl(str string, num ...bool) []string
- func (seg *Segmenter) CutUrls(str string, num ...bool) string
- func (seg *Segmenter) Dictionary() *Dictionary
- func (seg *Segmenter) Empty() error
- func (seg *Segmenter) EmptyStop() error
- func (seg *Segmenter) Find(str string) (float64, string, bool)
- func (seg *Segmenter) GetCurrentFilePath() string
- func (seg *Segmenter) GetIdfPath(files ...string) []string
- func (seg *Segmenter) HMMCut(str string, reg ...*regexp.Regexp) []string
- func (seg *Segmenter) HMMCutMod(str string, prob ...map[rune]float64) []string
- func (seg *Segmenter) Init()
- func (seg *Segmenter) IsStop(s string) bool
- func (seg *Segmenter) LoadDict(files ...string) error
- func (seg *Segmenter) LoadDictEmbed(dict ...string) (err error)
- func (seg *Segmenter) LoadDictMap(dict []map[string]string) error
- func (seg *Segmenter) LoadDictStr(dict string) error
- func (seg *Segmenter) LoadModel(prob ...map[rune]float64)
- func (seg *Segmenter) LoadStop(files ...string) error
- func (seg *Segmenter) LoadStopArr(dict []string)
- func (seg *Segmenter) LoadStopEmbed(dict ...string) (err error)
- func (seg *Segmenter) LoadStopStr(dict string) error
- func (seg *Segmenter) ModeSegment(bytes []byte, searchMode ...bool) []Segment
- func (seg *Segmenter) Pos(s string, searchMode ...bool) []SegPos
- func (seg *Segmenter) PosStr(str []SegPos, separator ...string) (r string)
- func (seg *Segmenter) PosTrim(str string, search bool, pos ...string) []SegPos
- func (seg *Segmenter) PosTrimArr(str string, search bool, pos ...string) (re []string)
- func (seg *Segmenter) PosTrimStr(str string, search bool, pos ...string) string
- func (seg *Segmenter) ReAddToken(text string, freq float64, pos ...string) error
- func (seg *Segmenter) Read(file string) error
- func (seg *Segmenter) ReadN(reader *bufio.Reader) (size int, text, freqText, pos string, fsErr error)
- func (seg *Segmenter) Reader(reader *bufio.Reader, files ...string) error
- func (seg *Segmenter) RemoveStop(text string)
- func (seg *Segmenter) RemoveToken(text string) error
- func (seg *Segmenter) Segment(bytes []byte) []Segment
- func (seg *Segmenter) Size(size int, text, freqText string) (freq float64)
- func (seg *Segmenter) Slice(s string, searchMode ...bool) []string
- func (seg *Segmenter) SplitTextToWords(text Text) []Text
- func (seg *Segmenter) Stop(s []string) (r []string)
- func (seg *Segmenter) String(s string, searchMode ...bool) string
- func (seg *Segmenter) SuggestFreq(words ...string) float64
- func (seg *Segmenter) ToToken(text string, freq float64, pos ...string) Token
- func (seg *Segmenter) Trim(s []string) (r []string)
- func (seg *Segmenter) TrimPos(s []SegPos) (r []SegPos)
- func (seg *Segmenter) TrimPosPunct(se []SegPos) (re []SegPos)
- func (seg *Segmenter) TrimPunct(s []string) (r []string)
- func (seg *Segmenter) TrimSymbol(s []string) (r []string)
- func (seg *Segmenter) TrimWithPos(se []SegPos, pos ...string) (re []SegPos)
- func (seg *Segmenter) Value(str string) (int, int, error)
- type Text
- type Token
Constants ¶
const ( // RatioWord ratio words and letters RatioWord float32 = 1.5 // RatioWordFull full ratio words and letters RatioWordFull float32 = 1 )
const (
// Version get the gse version
Version = "v0.80.2.705, Green Lake!"
)
Variables ¶
var StopWordMap = map[string]bool{ " ": true, }
StopWordMap the default stop words.
var ( // ToLower set alpha to lowercase ToLower = true )
var ( //go:embed data/dict/zh/idf.txt ZhIdf string )
Functions ¶
func FilterEmoji ¶ added in v0.60.0
FilterEmoji filter the emoji
func FilterHtml ¶ added in v0.61.0
FilterHtml filter the html tag
func FilterLang ¶ added in v0.60.0
FilterLang filter the language
func FilterSymbol ¶ added in v0.60.0
FilterSymbol filter the symbol
func FindAllOccs ¶ added in v0.69.14
FindAllOccs finds all occurrences of the search strings in data, returning the byte start positions
Types ¶
type AnalyzeToken ¶ added in v0.68.0
type AnalyzeToken struct { // the start of the segment in the text Start int End int Position int Len int Type string Text string Freq float64 Pos string }
AnalyzeToken analyze the segment info structure
type Dictionary ¶
type Dictionary struct { Tokens []Token // the all tokens in the dictionary, to traverse // contains filtered or unexported fields }
Dictionary struct implements a string double-array trie. A segment may or may not be in a leaf node
func (*Dictionary) AddToken ¶ added in v0.69.6
func (dict *Dictionary) AddToken(token Token) error
AddToken add a token to the dictionary
func (*Dictionary) Find ¶
func (dict *Dictionary) Find(word []byte) (float64, string, bool)
Find looks up the word in the dictionary, returning the word's frequency and pos, and whether the word exists
func (*Dictionary) LookupTokens ¶
func (dict *Dictionary) LookupTokens( words []Text, tokens []*Token) (numOfTokens int)
LookupTokens finds tokens and words in the dictionary, matching the given pattern and returns the number of tokens
func (*Dictionary) MaxTokenLen ¶
func (dict *Dictionary) MaxTokenLen() int
MaxTokenLen the maximum length of the dictionary
func (*Dictionary) NumTokens ¶
func (dict *Dictionary) NumTokens() int
NumTokens the number of tokens in the dictionary
func (*Dictionary) RemoveToken ¶
func (dict *Dictionary) RemoveToken(token Token) error
RemoveToken remove token in dictionary
func (*Dictionary) TotalFreq ¶
func (dict *Dictionary) TotalFreq() float64
TotalFreq the total frequency of the dictionary
type Segment ¶
type Segment struct { Position int // contains filtered or unexported fields }
Segment a segment in the text
type Segmenter ¶
type Segmenter struct { Dict *Dictionary Load bool DictSep string DictPath string // NotLoadHMM option load the default hmm model config (Chinese char) NotLoadHMM bool // AlphaNum set splitTextToWords can add token // when words in alphanum // set up alphanum dictionary word segmentation AlphaNum bool Alpha bool Num bool // LoadNoFreq load not have freq dict word LoadNoFreq bool // MinTokenFreq load min freq token MinTokenFreq float64 // TextFreq add token frequency when not specified freq TextFreq string // SkipLog set skip log print SkipLog bool MoreLog bool // SkipPos skip PosStr pos SkipPos bool NotStop bool // StopWordMap the stop word map StopWordMap map[string]bool }
Segmenter define the segmenter structure
func (*Segmenter) AddStopArr ¶ added in v0.70.0
AddStopArr add array stop token to stop dictionaries
func (*Segmenter) AddTokenForce ¶
AddTokenForce adds new text as a token and forces a recalculation, which is time-consuming
func (*Segmenter) Analyze ¶ added in v0.68.0
func (seg *Segmenter) Analyze(text []string, t1 string, by ...bool) (az []AnalyzeToken)
Analyze analyze the token segment info
func (*Segmenter) Cut ¶
Cut cuts a str into words using accurate mode. Parameter hmm controls whether to use the HMM(Hidden Markov Model) or use the user's model.
seg.Cut(text):
use the shortest path
seg.Cut(text, false):
use cut dag not hmm
seg.Cut(text, true):
use cut dag and hmm mode
func (*Segmenter) CutDAGNoHMM ¶ added in v0.69.8
CutDAGNoHMM cut string with DAG not use hmm
func (*Segmenter) CutTrimHtml ¶ added in v0.62.1
CutTrimHtml cut string trim html and symbol return []string
func (*Segmenter) CutTrimHtmls ¶ added in v0.62.1
CutTrimHtmls cut string trim html and symbol return string
func (*Segmenter) Dictionary ¶
func (seg *Segmenter) Dictionary() *Dictionary
Dictionary returns the dictionary used by the tokenizer
func (*Segmenter) GetCurrentFilePath ¶ added in v0.80.2
GetCurrentFilePath get the current file path
func (*Segmenter) GetIdfPath ¶ added in v0.80.2
GetIdfPath get the idf path
func (*Segmenter) Init ¶ added in v0.60.0
func (seg *Segmenter) Init()
Init initializes the segmenter config
func (*Segmenter) LoadDict ¶
LoadDict load the dictionary from the file
The format of the dictionary is (one for each participle):
participle text, frequency, part of speech
And you can set the dictionary separator by seg.DictSep = "," ¶
Can load multiple dictionary files, the file name separated by "," or ", " the front of the dictionary preferentially load the participle,
such as: "user_dictionary.txt,common_dictionary.txt"
When a participle appears both in the user dictionary and in the `common dictionary`, the `user dictionary` is given priority.
func (*Segmenter) LoadDictEmbed ¶ added in v0.66.6
LoadDictEmbed load the dictionary by embed file
func (*Segmenter) LoadDictMap ¶ added in v0.66.0
LoadDictMap load dictionary from []map[string]string
func (*Segmenter) LoadDictStr ¶ added in v0.66.6
LoadDictStr load the dictionary from dict path
func (*Segmenter) LoadModel ¶
LoadModel load the hmm model (default is Chinese char)
Use the user's model:
seg.LoadModel(B, E, M, S map[rune]float64)
func (*Segmenter) LoadStopArr ¶ added in v0.66.1
LoadStopArr load stop word by []string
func (*Segmenter) LoadStopEmbed ¶ added in v0.66.6
LoadStopEmbed load the stop dictionary from embed file
func (*Segmenter) LoadStopStr ¶ added in v0.69.0
LoadStopStr load the stop dictionary from dict path
func (*Segmenter) ModeSegment ¶
ModeSegment segment using search mode if searchMode is true
func (*Segmenter) PosTrimArr ¶ added in v0.60.0
PosTrimArr cut string return pos.Text []string
func (*Segmenter) PosTrimStr ¶ added in v0.60.0
PosTrimStr cut string return pos.Text string
func (*Segmenter) ReAddToken ¶ added in v0.69.7
ReAddToken remove and add token again
func (*Segmenter) ReadN ¶ added in v0.80.0
func (seg *Segmenter) ReadN(reader *bufio.Reader) (size int, text, freqText, pos string, fsErr error)
ReadN read the tokens by '\n'
func (*Segmenter) RemoveStop ¶ added in v0.63.0
RemoveStop remove a token from the StopWord dictionary.
func (*Segmenter) RemoveToken ¶
RemoveToken remove token in dictionary
func (*Segmenter) Segment ¶
Segment use the shortest path to segment the text
input parameter:
bytes UTF8 text []byte
output:
[]Segment return segments result
func (*Segmenter) Size ¶ added in v0.66.0
Size frequency is calculated based on the size of the text
func (*Segmenter) Slice ¶
Slice uses modeSegment to segment and returns []string, using search mode if searchMode is true
func (*Segmenter) SplitTextToWords ¶ added in v0.64.0
SplitTextToWords splits a string to token words
func (*Segmenter) String ¶
String uses modeSegment to segment and returns a string, using search mode if searchMode is true
func (*Segmenter) SuggestFreq ¶ added in v0.60.0
SuggestFreq suggests the word frequency, returning a suggested frequency of a word cut into short words.
func (*Segmenter) TrimPosPunct ¶ added in v0.60.0
TrimPosPunct trim SegPos not space and punct
func (*Segmenter) TrimSymbol ¶ added in v0.62.1
TrimSymbol trim []string exclude symbol, space and punct
func (*Segmenter) TrimWithPos ¶ added in v0.60.0
TrimWithPos trim some seg with pos
type Text ¶
type Text []byte
Text is a string type, used to parse text: 1. a word, such as "world" or "boundary"; in English a word is a word. 2. a participle, such as "world" a.k.a. "population". 3. a text, such as "the world has seven billion people"
func SplitWords ¶ added in v0.64.1
SplitWords splits a string to token words
type Token ¶
type Token struct {
// contains filtered or unexported fields
}
Token define a segment token structure
func (*Token) Segments ¶
Segments returns further subdivisions of the text of this participle; for example, a participle may have two subclauses.
Subclauses can also have further subclauses, forming a tree structure that can be traversed to get all the detailed subdivisions of the participle. This is mainly used by search engines to perform full-text searches on a piece of text.