Documentation ¶
Overview ¶
Package gse Go efficient multilingual NLP and text segmentation, Go 高性能多语言 NLP 和分词
Index ¶
- Constants
- Variables
- func DictPaths(dictDir, filePath string) (files []string)
- func FilterEmoji(text string) (new string)
- func FilterHtml(text string) string
- func FilterLang(text, lang string) (new string)
- func FilterSymbol(text string) (new string)
- func FindAllOccs(data []byte, searches []string) map[string][]int
- func GetVersion() string
- func IsJp(segText string) bool
- func Join(text []Text) string
- func Range(text string) (new []string)
- func RangeText(text string) (new string)
- func SplitNum(text string) []string
- func SplitNums(text string) string
- func ToSlice(segs []Segment, searchMode ...bool) (output []string)
- func ToString(segs []Segment, searchMode ...bool) (output string)
- type AnalyzeToken
- type Dictionary
- func (dict *Dictionary) AddToken(token Token) error
- func (dict *Dictionary) Find(word []byte) (float64, string, bool)
- func (dict *Dictionary) LookupTokens(words []Text, tokens []*Token) (numOfTokens int)
- func (dict *Dictionary) MaxTokenLen() int
- func (dict *Dictionary) NumTokens() int
- func (dict *Dictionary) RemoveToken(token Token) error
- func (dict *Dictionary) TotalFreq() float64
- func (dict *Dictionary) Value(word []byte) (val, id int, err error)
- type Prob
- type SegPos
- type Segment
- type Segmenter
- func (seg *Segmenter) AddStop(text string)
- func (seg *Segmenter) AddToken(text string, freq float64, pos ...string) error
- func (seg *Segmenter) AddTokenForce(text string, freq float64, pos ...string) (err error)
- func (seg *Segmenter) Analyze(text []string, t1 string, by ...bool) (az []AnalyzeToken)
- func (seg *Segmenter) CalcToken()
- func (seg *Segmenter) Cut(str string, hmm ...bool) []string
- func (seg *Segmenter) CutAll(str string) []string
- func (seg *Segmenter) CutDAG(str string, reg ...*regexp.Regexp) []string
- func (seg *Segmenter) CutDAGNoHMM(str string) []string
- func (seg *Segmenter) CutSearch(str string, hmm ...bool) []string
- func (seg *Segmenter) CutStr(str []string, separator ...string) (r string)
- func (seg *Segmenter) CutTrim(str string, hmm ...bool) []string
- func (seg *Segmenter) CutTrimHtml(str string, hmm ...bool) []string
- func (seg *Segmenter) CutTrimHtmls(str string, hmm ...bool) string
- func (seg *Segmenter) CutUrl(str string, num ...bool) []string
- func (seg *Segmenter) CutUrls(str string, num ...bool) string
- func (seg *Segmenter) Dictionary() *Dictionary
- func (seg *Segmenter) Empty() error
- func (seg *Segmenter) EmptyStop() error
- func (seg *Segmenter) Find(str string) (float64, string, bool)
- func (seg *Segmenter) GetCurrentFilePath() string
- func (seg *Segmenter) GetIdfPath(files ...string) []string
- func (seg *Segmenter) HMMCut(str string, reg ...*regexp.Regexp) []string
- func (seg *Segmenter) HMMCutMod(str string, prob ...map[rune]float64) []string
- func (seg *Segmenter) Init()
- func (seg *Segmenter) IsStop(s string) bool
- func (seg *Segmenter) LoadDict(files ...string) error
- func (seg *Segmenter) LoadDictEmbed(dict ...string) (err error)
- func (seg *Segmenter) LoadDictMap(dict []map[string]string) error
- func (seg *Segmenter) LoadDictStr(dict string) error
- func (seg *Segmenter) LoadModel(prob ...map[rune]float64)
- func (seg *Segmenter) LoadStop(files ...string) error
- func (seg *Segmenter) LoadStopArr(dict []string)
- func (seg *Segmenter) LoadStopEmbed(dict ...string) (err error)
- func (seg *Segmenter) LoadStopStr(dict string) error
- func (seg *Segmenter) ModeSegment(bytes []byte, searchMode ...bool) []Segment
- func (seg *Segmenter) Pos(s string, searchMode ...bool) []SegPos
- func (seg *Segmenter) PosStr(str []SegPos, separator ...string) (r string)
- func (seg *Segmenter) PosTrim(str string, search bool, pos ...string) []SegPos
- func (seg *Segmenter) PosTrimArr(str string, search bool, pos ...string) (re []string)
- func (seg *Segmenter) PosTrimStr(str string, search bool, pos ...string) string
- func (seg *Segmenter) ReAddToken(text string, freq float64, pos ...string) error
- func (seg *Segmenter) Read(file string) error
- func (seg *Segmenter) Reader(reader io.Reader, files ...string) error
- func (seg *Segmenter) RemoveStop(text string)
- func (seg *Segmenter) RemoveToken(text string) error
- func (seg *Segmenter) Segment(bytes []byte) []Segment
- func (seg *Segmenter) SetDataPath(dataPath string)
- func (seg *Segmenter) Size(size int, text, freqText string) (freq float64)
- func (seg *Segmenter) Slice(s string, searchMode ...bool) []string
- func (seg *Segmenter) SplitTextToWords(text Text) []Text
- func (seg *Segmenter) String(s string, searchMode ...bool) string
- func (seg *Segmenter) SuggestFreq(words ...string) float64
- func (seg *Segmenter) ToToken(text string, freq float64, pos ...string) Token
- func (seg *Segmenter) Trim(s []string) (r []string)
- func (seg *Segmenter) TrimPos(s []SegPos) (r []SegPos)
- func (seg *Segmenter) TrimPosPunct(se []SegPos) (re []SegPos)
- func (seg *Segmenter) TrimPunct(s []string) (r []string)
- func (seg *Segmenter) TrimSymbol(s []string) (r []string)
- func (seg *Segmenter) TrimWithPos(se []SegPos, pos ...string) (re []SegPos)
- func (seg *Segmenter) Value(str string) (int, int, error)
- type Text
- type Token
Constants ¶
const ( // RatioWord ratio words and letters RatioWord float32 = 1.5 // RatioWordFull full ratio words and letters RatioWordFull float32 = 1 )
const (
// Version get the gse version
Version = "v0.69.9.593, Green Lake!"
)
Variables ¶
var StopWordMap = map[string]bool{ " ": true, }
StopWordMap the default stop words.
var ( // ToLower set alpha tolower ToLower = true )
Functions ¶
func FindAllOccs ¶
FindAllOccs finds the start offsets of all occurrences of the search strings in data
func ToSlice ¶
ToSlice segments to slice 输出分词结果到一个字符串 slice
有两种输出模式,以 "山达尔星联邦共和国" 为例
普通模式(searchMode=false)输出一个分词"[山达尔星联邦共和国]" 搜索模式(searchMode=true) 输出普通模式的再细致切分: "[山达尔星 联邦 共和 国 共和国 联邦共和国 山达尔星联邦共和国]"
默认 searchMode=false 搜索模式主要用于给搜索引擎提供尽可能多的关键字,详情请见Token结构体的注释。
Types ¶
type AnalyzeToken ¶
type AnalyzeToken struct { // 分词在文本中的起始位置 Start int End int Position int Len int Type string Text string Freq float64 Pos string }
AnalyzeToken analyze the segment info structure
type Dictionary ¶
type Dictionary struct { Tokens []Token // 词典中所有的分词,方便遍历 // contains filtered or unexported fields }
Dictionary 结构体实现了一个字串双数组树, 一个分词可能出现在叶子节点也有可能出现在非叶节点
func (*Dictionary) AddToken ¶
func (dict *Dictionary) AddToken(token Token) error
AddToken 向词典中加入一个分词
func (*Dictionary) Find ¶
func (dict *Dictionary) Find(word []byte) (float64, string, bool)
Find looks up the word in the dictionary, returning its frequency, its part of speech, and whether it exists
func (*Dictionary) LookupTokens ¶
func (dict *Dictionary) LookupTokens( words []Text, tokens []*Token) (numOfTokens int)
LookupTokens 在词典中查找和字元组 words 可以前缀匹配的所有分词 返回值为找到的分词数
func (*Dictionary) RemoveToken ¶
func (dict *Dictionary) RemoveToken(token Token) error
RemoveToken removes a token from the dictionary
type Segment ¶
type Segment struct { Position int // contains filtered or unexported fields }
Segment 文本中的一个分词
type Segmenter ¶
type Segmenter struct { Dict *Dictionary Load bool // AlphaNum set splitTextToWords can add token // when words in alphanum // set up alphanum dictionary word segmentation AlphaNum bool Alpha bool Num bool // LoadNoFreq load not have freq dict word LoadNoFreq bool // MinTokenFreq load min freq token MinTokenFreq float64 // TextFreq add token frequency when not specified freq TextFreq string // SkipLog set skip log print SkipLog bool MoreLog bool // SkipPos skip PosStr pos SkipPos bool NotStop bool // StopWordMap the stop word map StopWordMap map[string]bool DataPath string }
Segmenter 分词器结构体
func (*Segmenter) AddTokenForce ¶
AddTokenForce forcibly adds new text as a token; this operation can be time-consuming
func (*Segmenter) Analyze ¶
func (seg *Segmenter) Analyze(text []string, t1 string, by ...bool) (az []AnalyzeToken)
Analyze analyze the token segment info
func (*Segmenter) Cut ¶
Cut cuts a str into words using accurate mode. Parameter hmm controls whether to use the HMM(Hidden Markov Model) or use the user's model.
seg.Cut(text):
use the shortest path
seg.Cut(text, false):
use cut dag not hmm
seg.Cut(text, true):
use cut dag and hmm mode
func (*Segmenter) CutDAGNoHMM ¶
CutDAGNoHMM cut string with DAG not use hmm
func (*Segmenter) CutTrimHtml ¶
CutTrimHtml cut string trim html and symbol return []string
func (*Segmenter) CutTrimHtmls ¶
CutTrimHtmls cut string trim html and symbol return string
func (*Segmenter) GetCurrentFilePath ¶
GetCurrentFilePath get current file path
func (*Segmenter) GetIdfPath ¶
GetIdfPath get the idf path
func (*Segmenter) LoadDict ¶
LoadDict load the dictionary from the file
The format of the dictionary is (one for each participle):
participle text, frequency, part of speech
Can load multiple dictionary files, the file name separated by "," or ", " the front of the dictionary preferentially load the participle,
such as: "user_dictionary.txt,common_dictionary.txt"
When a participle appears both in the user dictionary and in the `common dictionary`, the `user dictionary` is given priority.
从文件中载入词典
可以载入多个词典文件,文件名用 "," 或 ", " 分隔,排在前面的词典优先载入分词,比如:
"用户词典.txt,通用词典.txt"
当一个分词既出现在用户词典也出现在 `通用词典` 中,则优先使用 `用户词典`。
词典的格式为(每个分词一行):
分词文本 频率 词性
func (*Segmenter) LoadDictEmbed ¶
LoadDictEmbed load dictionary by embed file
func (*Segmenter) LoadDictMap ¶
LoadDictMap load dictionary from []map[string]string
func (*Segmenter) LoadDictStr ¶
LoadDictStr load dictionary from string
func (*Segmenter) LoadModel ¶
LoadModel load the hmm model
Use the user's model:
seg.LoadModel(B, E, M, S map[rune]float64)
func (*Segmenter) LoadStopArr ¶
LoadStopArr load stop word by []string
func (*Segmenter) LoadStopEmbed ¶
LoadStopEmbed load stop dictionary from embed file
func (*Segmenter) LoadStopStr ¶
LoadStopStr loads the stop dictionary from a string
func (*Segmenter) ModeSegment ¶
ModeSegment segment using search mode if searchMode is true
func (*Segmenter) PosTrimArr ¶
PosTrimArr cut string return pos.Text []string
func (*Segmenter) PosTrimStr ¶
PosTrimStr cut string return pos.Text string
func (*Segmenter) ReAddToken ¶
ReAddToken remove and add token again
func (*Segmenter) RemoveStop ¶
RemoveStop remove a token from the StopWord dictionary.
func (*Segmenter) RemoveToken ¶
RemoveToken removes a token from the dictionary
func (*Segmenter) SetDataPath ¶ added in v0.69.17
func (*Segmenter) Slice ¶
Slice uses ModeSegment to segment text and returns a []string, using search mode if searchMode is true
func (*Segmenter) SplitTextToWords ¶
SplitTextToWords 将文本划分成字元
func (*Segmenter) String ¶
String uses ModeSegment to segment text and returns a string, using search mode if searchMode is true
func (*Segmenter) SuggestFreq ¶
SuggestFreq suggests a word's frequency, returning the frequency needed for a word that would otherwise be cut into shorter words.
func (*Segmenter) TrimPosPunct ¶
TrimPosPunct trim SegPos not space and punct
func (*Segmenter) TrimSymbol ¶
TrimSymbol trim []string exclude symbol, space and punct
func (*Segmenter) TrimWithPos ¶
TrimWithPos trim some seg with pos
type Text ¶
type Text []byte
Text 字串类型,可以用来表达
- 一个字元,比如 "世" 又如 "界", 英文的一个字元是一个词
- 一个分词,比如 "世界" 又如 "人口"
- 一段文字,比如 "世界有七十亿人口"
Source Files ¶
Directories ¶
Path | Synopsis |
---|---|
gonn
|
|
Package hmm is the Golang HMM cut module Package hmm model data The data from https://github.com/fxsjy/jieba
|
Package hmm is the Golang HMM cut module Package hmm model data The data from https://github.com/fxsjy/jieba |
pos
Package pos model data The data from https://github.com/fxsjy/jieba
|
Package pos model data The data from https://github.com/fxsjy/jieba |
tools
|
|