Documentation ¶
Overview ¶
Package gse Go efficient text segmentation, Go 语言高性能分词
Index ¶
- Constants
- Variables
- func DictPaths(dictDir, filePath string) (files []string)
- func FilterEmoji(text string) (new string)
- func FilterHtml(text string) string
- func FilterLang(text, lang string) (new string)
- func FilterSymbol(text string) (new string)
- func GetCurrentFilePath() string
- func GetIdfPath(files ...string) []string
- func GetVersion() string
- func IsJp(segText string) bool
- func Join(text []Text) string
- func Range(text string) (new []string)
- func RangeText(text string) (new string)
- func SplitNum(text string) []string
- func SplitNums(text string) string
- func ToSlice(segs []Segment, searchMode ...bool) (output []string)
- func ToString(segs []Segment, searchMode ...bool) (output string)
- type AnalyzeToken
- type Dictionary
- func (dict *Dictionary) AddToken(token Token) error
- func (dict *Dictionary) Find(word []byte) (float64, string, bool)
- func (dict *Dictionary) LookupTokens(words []Text, tokens []*Token) (numOfTokens int)
- func (dict *Dictionary) MaxTokenLen() int
- func (dict *Dictionary) NumTokens() int
- func (dict *Dictionary) RemoveToken(token Token) error
- func (dict *Dictionary) TotalFreq() float64
- func (dict *Dictionary) Value(word []byte) (val, id int, err error)
- type Prob
- type SegPos
- type Segment
- type Segmenter
- func (seg *Segmenter) AddStop(text string)
- func (seg *Segmenter) AddToken(text string, freq float64, pos ...string) error
- func (seg *Segmenter) AddTokenForce(text string, freq float64, pos ...string) (err error)
- func (seg *Segmenter) Analyze(text []string) (az []AnalyzeToken)
- func (seg *Segmenter) CalcToken()
- func (seg *Segmenter) Cut(str string, hmm ...bool) []string
- func (seg *Segmenter) CutAll(str string) []string
- func (seg *Segmenter) CutDAG(str string, reg ...*regexp.Regexp) []string
- func (seg *Segmenter) CutDAGNoHMM(str string) []string
- func (seg *Segmenter) CutSearch(str string, hmm ...bool) []string
- func (seg *Segmenter) CutStr(str []string, separator ...string) (r string)
- func (seg *Segmenter) CutTrim(str string, hmm ...bool) []string
- func (seg *Segmenter) CutTrimHtml(str string, hmm ...bool) []string
- func (seg *Segmenter) CutTrimHtmls(str string, hmm ...bool) string
- func (seg *Segmenter) CutUrl(str string, num ...bool) []string
- func (seg *Segmenter) CutUrls(str string, num ...bool) string
- func (seg *Segmenter) Dictionary() *Dictionary
- func (seg *Segmenter) Empty() error
- func (seg *Segmenter) EmptyStop() error
- func (seg *Segmenter) Find(str string) (float64, string, bool)
- func (seg *Segmenter) HMMCut(str string, reg ...*regexp.Regexp) []string
- func (seg *Segmenter) HMMCutMod(str string, prob ...map[rune]float64) []string
- func (seg *Segmenter) Init()
- func (seg *Segmenter) IsStop(s string) bool
- func (seg *Segmenter) LoadDict(files ...string) error
- func (seg *Segmenter) LoadDictEmbed(dict ...string) (err error)
- func (seg *Segmenter) LoadDictMap(dict []map[string]string) error
- func (seg *Segmenter) LoadDictStr(dict string) error
- func (seg *Segmenter) LoadModel(prob ...map[rune]float64)
- func (seg *Segmenter) LoadStop(files ...string) error
- func (seg *Segmenter) LoadStopArr(dict []string)
- func (seg *Segmenter) LoadStopEmbed(dict ...string) (err error)
- func (seg *Segmenter) LoadStopStr(dict string) error
- func (seg *Segmenter) ModeSegment(bytes []byte, searchMode ...bool) []Segment
- func (seg *Segmenter) Pos(s string, searchMode ...bool) []SegPos
- func (seg *Segmenter) PosStr(str []SegPos, separator ...string) (r string)
- func (seg *Segmenter) PosTrim(str string, search bool, pos ...string) []SegPos
- func (seg *Segmenter) PosTrimArr(str string, search bool, pos ...string) (re []string)
- func (seg *Segmenter) PosTrimStr(str string, search bool, pos ...string) string
- func (seg *Segmenter) ReAddToken(text string, freq float64, pos ...string) error
- func (seg *Segmenter) Read(file string) error
- func (seg *Segmenter) Reader(reader io.Reader, files ...string) error
- func (seg *Segmenter) RemoveStop(text string)
- func (seg *Segmenter) RemoveToken(text string) error
- func (seg *Segmenter) Segment(bytes []byte) []Segment
- func (seg *Segmenter) Size(size int, text, freqText string) (freq float64)
- func (seg *Segmenter) Slice(s string, searchMode ...bool) []string
- func (seg *Segmenter) SplitTextToWords(text Text) []Text
- func (seg *Segmenter) String(s string, searchMode ...bool) string
- func (seg *Segmenter) SuggestFreq(words ...string) float64
- func (seg *Segmenter) ToToken(text string, freq float64, pos ...string) Token
- func (seg *Segmenter) Trim(s []string) (r []string)
- func (seg *Segmenter) TrimPos(s []SegPos) (r []SegPos)
- func (seg *Segmenter) TrimPosPunct(se []SegPos) (re []SegPos)
- func (seg *Segmenter) TrimPunct(s []string) (r []string)
- func (seg *Segmenter) TrimSymbol(s []string) (r []string)
- func (seg *Segmenter) TrimWithPos(se []SegPos, pos ...string) (re []SegPos)
- func (seg *Segmenter) Value(str string) (int, int, error)
- type Text
- type Token
Constants ¶
const ( // RatioWord ratio words and letters RatioWord float32 = 1.5 // RatioWordFull full ratio words and letters RatioWordFull float32 = 1 )
const (
// Version get the gse version
Version = "v0.69.9.593, Green Lake!"
)
Variables ¶
var StopWordMap = map[string]bool{ " ": true, }
StopWordMap the default stop words.
var ( // ToLower set alpha tolower ToLower = true )
Functions ¶
func FilterEmoji ¶ added in v0.60.0
FilterEmoji filter the emoji
func FilterHtml ¶ added in v0.61.0
FilterHtml filter the html tag
func FilterLang ¶ added in v0.60.0
FilterLang filter the language
func FilterSymbol ¶ added in v0.60.0
FilterSymbol filter the symbol
func GetCurrentFilePath ¶ added in v0.60.0
func GetCurrentFilePath() string
GetCurrentFilePath get current file path
func GetIdfPath ¶ added in v0.60.0
GetIdfPath get the idf path
func ToSlice ¶
ToSlice segments to slice 输出分词结果到一个字符串 slice
有两种输出模式,以 "山达尔星联邦共和国" 为例
普通模式(searchMode=false)输出一个分词"[山达尔星联邦共和国]" 搜索模式(searchMode=true) 输出普通模式的再细致切分: "[山达尔星 联邦 共和 国 共和国 联邦共和国 山达尔星联邦共和国]"
默认 searchMode=false 搜索模式主要用于给搜索引擎提供尽可能多的关键字,详情请见Token结构体的注释。
Types ¶
type AnalyzeToken ¶ added in v0.68.0
type AnalyzeToken struct { // 分词在文本中的起始位置 Start int End int Position int Len int Type string Text string Freq float64 Pos string }
AnalyzeToken analyze the segment info structure
type Dictionary ¶
type Dictionary struct { Tokens []Token // 词典中所有的分词,方便遍历 // contains filtered or unexported fields }
Dictionary 结构体实现了一个字串双数组树, 一个分词可能出现在叶子节点也有可能出现在非叶节点
func (*Dictionary) AddToken ¶ added in v0.69.6
func (dict *Dictionary) AddToken(token Token) error
AddToken 向词典中加入一个分词
func (*Dictionary) Find ¶
func (dict *Dictionary) Find(word []byte) (float64, string, bool)
Find looks up the word in the dictionary and returns the word's frequency, pos, and whether it exists
func (*Dictionary) LookupTokens ¶
func (dict *Dictionary) LookupTokens( words []Text, tokens []*Token) (numOfTokens int)
LookupTokens 在词典中查找和字元组 words 可以前缀匹配的所有分词 返回值为找到的分词数
func (*Dictionary) RemoveToken ¶
func (dict *Dictionary) RemoveToken(token Token) error
RemoveToken remove token in dictionary
type Segment ¶
type Segment struct { Position int // contains filtered or unexported fields }
Segment 文本中的一个分词
type Segmenter ¶
type Segmenter struct { Dict *Dictionary Load bool // AlphaNum set splitTextToWords can add token // when words in alphanum // set up alphanum dictionary word segmentation AlphaNum bool Alpha bool Num bool // LoadNoFreq load not have freq dict word LoadNoFreq bool // MinTokenFreq load min freq token MinTokenFreq float64 // TextFreq add token frenquency when not specified freq TextFreq string // SkipLog set skip log print SkipLog bool MoreLog bool // SkipPos skip PosStr pos SkipPos bool NotStop bool // StopWordMap the stop word map StopWordMap map[string]bool }
Segmenter 分词器结构体
func (*Segmenter) AddTokenForce ¶
AddTokenForce adds new text as a token, forcing the addition even if it already exists; this may be time-consuming
func (*Segmenter) Analyze ¶ added in v0.68.0
func (seg *Segmenter) Analyze(text []string) (az []AnalyzeToken)
Analyze analyze the token segment info
func (*Segmenter) Cut ¶
Cut cuts a str into words using accurate mode. Parameter hmm controls whether to use the HMM(Hidden Markov Model) or use the user's model.
seg.Cut(text), use the shortest path seg.Cut(text, false), use cut dag not hmm seg.Cut(text, true), use cut dag and hmm mode
func (*Segmenter) CutDAGNoHMM ¶ added in v0.69.8
CutDAGNoHMM cut string with DAG not use hmm
func (*Segmenter) CutTrimHtml ¶ added in v0.62.1
CutTrimHtml cut string trim html and symbol return []string
func (*Segmenter) CutTrimHtmls ¶ added in v0.62.1
CutTrimHtmls cut string trim html and symbol return string
func (*Segmenter) LoadDict ¶
LoadDict load the dictionary from the file
The format of the dictionary is (one for each participle):
participle text, frequency, part of speech
Can load multiple dictionary files, the file name separated by "," or ", " the front of the dictionary preferentially load the participle,
such as: "user_dictionary.txt,common_dictionary.txt"
When a participle appears both in the user dictionary and in the `common dictionary`, the `user dictionary` is given priority.
从文件中载入词典
可以载入多个词典文件,文件名用 "," 或 ", " 分隔,排在前面的词典优先载入分词,比如:
"用户词典.txt,通用词典.txt"
当一个分词既出现在用户词典也出现在 `通用词典` 中,则优先使用 `用户词典`。
词典的格式为(每个分词一行):
分词文本 频率 词性
func (*Segmenter) LoadDictEmbed ¶ added in v0.66.6
LoadDictEmbed load dictionary by embed file
func (*Segmenter) LoadDictMap ¶ added in v0.66.0
LoadDictMap load dictionary from []map[string]string
func (*Segmenter) LoadDictStr ¶ added in v0.66.6
LoadDictStr load dictionary from string
func (*Segmenter) LoadModel ¶
LoadModel load the hmm model
Use the user's model:
seg.LoadModel(B, E, M, S map[rune]float64)
func (*Segmenter) LoadStopArr ¶ added in v0.66.1
LoadStopArr load stop word by []string
func (*Segmenter) LoadStopEmbed ¶ added in v0.66.6
LoadStopEmbed load stop dictionary from embed file
func (*Segmenter) LoadStopStr ¶ added in v0.69.0
LoadStopStr load the stop dictionary from string
func (*Segmenter) ModeSegment ¶
ModeSegment segment using search mode if searchMode is true
func (*Segmenter) PosTrimArr ¶ added in v0.60.0
PosTrimArr cut string return pos.Text []string
func (*Segmenter) PosTrimStr ¶ added in v0.60.0
PosTrimStr cut string return pos.Text string
func (*Segmenter) ReAddToken ¶ added in v0.69.7
ReAddToken remove and add token again
func (*Segmenter) RemoveStop ¶ added in v0.63.0
RemoveStop remove a token from the StopWord dictionary.
func (*Segmenter) RemoveToken ¶
RemoveToken remove token in dictionary
func (*Segmenter) Size ¶ added in v0.66.0
Size frequency is calculated based on the size of the text
func (*Segmenter) Slice ¶
Slice use modeSegment segment return []string using search mode if searchMode is true
func (*Segmenter) SplitTextToWords ¶ added in v0.64.0
SplitTextToWords 将文本划分成字元
func (*Segmenter) String ¶
String use modeSegment segment return string using search mode if searchMode is true
func (*Segmenter) SuggestFreq ¶ added in v0.60.0
SuggestFreq suggest the words frequency return a suggested frequency of a word cut into short words.
func (*Segmenter) TrimPosPunct ¶ added in v0.60.0
TrimPosPunct trim SegPos not space and punct
func (*Segmenter) TrimSymbol ¶ added in v0.62.1
TrimSymbol trim []string exclude symbol, space and punct
func (*Segmenter) TrimWithPos ¶ added in v0.60.0
TrimWithPos trim some seg with pos
type Text ¶
type Text []byte
Text 字串类型,可以用来表达
- 一个字元,比如 "世" 又如 "界", 英文的一个字元是一个词
- 一个分词,比如 "世界" 又如 "人口"
- 一段文字,比如 "世界有七十亿人口"
Source Files ¶
Directories ¶
Path | Synopsis |
---|---|
gonn
|
|
Package hmm is the Golang HMM cut module Package hmm model data The data from https://github.com/fxsjy/jieba
|
Package hmm is the Golang HMM cut module Package hmm model data The data from https://github.com/fxsjy/jieba |
pos
Package pos model data The data from https://github.com/fxsjy/jieba
|
Package pos model data The data from https://github.com/fxsjy/jieba |