gse

package module

v0.80.2 Latest Latest Go to latest Published: Jan 16, 2023 License: Apache-2.0 Imports: 17 Imported by: 89

Details

Valid go.mod file

The Go module system was introduced in Go 1.11 and is the official dependency management solution for Go.
Redistributable license

Redistributable licenses place minimal restrictions on how software can be used, modified, and redistributed.
Tagged version

Modules with tagged versions give importers more predictable builds.
Stable version

When a project reaches major version v1 it is considered stable.
Learn more about best practices

Repository

github.com/go-ego/gse

Links

Open Source Insights

README ¶

gse

Go efficient multilingual NLP and text segmentation; supports English, Chinese, Japanese and other languages. It also integrates with Elasticsearch and Bleve.

简体中文

Gse is an implementation of jieba in Go, and aims to add NLP support and more features.

Feature:

Supports multiple word segmentation modes: common, search engine, full, precise and HMM;
Supports user and embedded dictionaries, part-of-speech (POS) tagging, segment info analysis, and stop-word removal and trimming
Supports multiple languages: English, Chinese, Japanese and others
Supports Traditional Chinese
Supports HMM text cutting using the Viterbi algorithm
Supports NLP with TensorFlow (in progress)
Named Entity Recognition (in progress)
Integrates with Elasticsearch and Bleve
Can run as a JSON-RPC service.

Algorithm:

The dictionary is implemented with a double-array trie (Double-Array Trie).
The segmenter uses a shortest-path algorithm (based on word frequency and dynamic programming), together with DAG and HMM word segmentation.

Text Segmentation speed:

single thread 9.2MB/s
goroutines concurrent 26.8MB/s.
HMM text segmentation single thread 3.2MB/s. (2core 4threads Macbook Pro).

Binding:

gse-bind: bindings for JavaScript and other languages, to support more languages.

Install / update

With Go module support (Go 1.11+), just import:

import "github.com/go-ego/gse"

Otherwise, to install the gse package, run the command:

go get -u github.com/go-ego/gse

Use

package main

import (
	_ "embed"
	"fmt"

	"github.com/go-ego/gse"
)

// testDict holds the contents of testdata/test_en2.txt, embedded at build time.
//
//go:embed testdata/test_en2.txt
var testDict string

// testEn holds the contents of testdata/test_en.txt, embedded at build time.
//
//go:embed testdata/test_en.txt
var testEn string

// Sample inputs cut by main: text is cut by seg1, test1 by seg2, and their
// concatenation by seg3.
var (
	text  = "To be or not to be, that's the question!"
	test1 = "Hiworld, Helloworld!"
)

// main demonstrates three ways to load an English dictionary and cut text:
// from a file with a custom separator (seg1), from a file with AlphaNum
// enabled (seg2), and from embedded dictionary strings (seg3).
// Load errors are reported on stdout and the example keeps running.
func main() {
	// seg1: dictionary file loaded with "," as the dictionary separator.
	var seg1 gse.Segmenter
	seg1.DictSep = ","
	err := seg1.LoadDict("./testdata/test_en.txt")
	if err != nil {
		fmt.Println("Load dictionary error: ", err)
	}

	s1 := seg1.Cut(text)
	fmt.Println("seg1 Cut: ", s1)
	// seg1 Cut:  [to be   or   not to be ,   that's the question!]

	// seg2: dictionary file loaded with AlphaNum enabled.
	var seg2 gse.Segmenter
	seg2.AlphaNum = true
	// Report a load failure instead of silently discarding it, matching the
	// handling used for seg1 and seg3.
	err = seg2.LoadDict("./testdata/test_en_dict3.txt")
	if err != nil {
		fmt.Println("Load dictionary error: ", err)
	}

	s2 := seg2.Cut(test1)
	fmt.Println("seg2 Cut: ", s2)
	// seg2 Cut:  [hi world ,   hello world !]

	// seg3: both embedded dictionaries joined by a newline and loaded from memory.
	var seg3 gse.Segmenter
	seg3.AlphaNum = true
	seg3.DictSep = ","
	err = seg3.LoadDictEmbed(testDict + "\n" + testEn)
	if err != nil {
		fmt.Println("loadDictEmbed error: ", err)
	}
	s3 := seg3.Cut(text + test1)
	fmt.Println("seg3 Cut: ", s3)
	// seg3 Cut:  [to be   or   not to be ,   that's the question! hi world ,   hello world !]

	// example2()
}

Example2:

package main

import (
	"fmt"
	"regexp"

	"github.com/go-ego/gse"
	"github.com/go-ego/gse/hmm/pos"
)

var (
	// text is the mixed English/Japanese/Chinese sample used by the example functions.
	text = "Hello world, Helloworld. Winter is coming! こんにちは世界, 你好世界."

	// new is the segmenter built by gse.New; it is used by cut.
	// NOTE(review): the name shadows the builtin new throughout this package,
	// and the error returned by gse.New is discarded here.
	new, _ = gse.New("zh,testdata/test_en_dict3.txt", "alpha")

	// seg is the segmenter whose dictionary is loaded in main.
	seg gse.Segmenter
	// posSeg is the pos.Segmenter used by cutPos.
	posSeg pos.Segmenter
)

// main loads the default dictionary into seg and then runs the cut and
// segCut examples. The commented-out calls show alternative ways to load a
// dictionary.
func main() {
	// Loading the default dictionary.
	// Report a load failure rather than discarding the returned error.
	if err := seg.LoadDict(); err != nil {
		fmt.Println("Load dictionary error: ", err)
	}
	// Loading the default dictionary with embed
	// seg.LoadDictEmbed()
	//
	// Loading the Simplified Chinese dictionary
	// seg.LoadDict("zh_s")
	// seg.LoadDictEmbed("zh_s")
	//
	// Loading the Traditional Chinese dictionary
	// seg.LoadDict("zh_t")
	//
	// Loading the Japanese dictionary
	// seg.LoadDict("jp")
	//
	// Load the dictionary
	// seg.LoadDict("your gopath"+"/src/github.com/go-ego/gse/data/dict/dictionary.txt")

	cut()

	segCut()
}

func cut() {
	hmm := new.Cut(text, true)
	fmt.Println("cut use hmm: ", hmm)

	hmm = new.CutSearch(text, true)
	fmt.Println("cut search use hmm: ", hmm)
	fmt.Println("analyze: ", new.Analyze(hmm, text))

	hmm = new.CutAll(text)
	fmt.Println("cut all: ", hmm)

	reg := regexp.MustCompile(`(\d+年|\d+月|\d+日|[\p{Latin}]+|[\p{Hangul}]+|\d+\.\d+|[a-zA-Z0-9]+)`)
	text1 := `헬로월드 헬로 서울, 2021年09月10日, 3.14`
	hmm = seg.CutDAG(text1, reg)
	fmt.Println("Cut with hmm and regexp: ", hmm, hmm[0], hmm[6])
}

// analyzeAndTrim prints the analysis of the given segments, the segments after
// trimming, and the String and Slice results for text with search mode enabled.
func analyzeAndTrim(cut []string) {
	fmt.Println("analyze the segment: ", seg.Analyze(cut, ""))

	trimmed := seg.Trim(cut)
	fmt.Println("cut all: ", trimmed)

	joined := seg.String(text, true)
	fmt.Println(joined)
	parts := seg.Slice(text, true)
	fmt.Println(parts)
}

// cutPos prints the text/POS pairs produced by seg, both raw and trimmed, and
// then repeats the exercise with posSeg, trimming the "zg" tag from its output.
func cutPos() {
	tagged := seg.Pos(text, true)
	fmt.Println("pos: ", tagged)
	fmt.Println("trim pos: ", seg.TrimPos(tagged))

	// Hand seg to the pos package before cutting with posSeg.
	pos.WithGse(seg)
	posTagged := posSeg.Cut(text, true)
	fmt.Println("pos: ", posTagged)

	withoutZg := posSeg.TrimWithPos(posTagged, "zg")
	fmt.Println("trim pos: ", withoutZg)
}

func segCut() {
	// Text Segmentation
	tb := []byte(text)
	fmt.Println(seg.String(text, true))

	segments := seg.Segment(tb)
	// Handle word segmentation results, search mode
	fmt.Println(gse.ToString(segments, true))
}

Look at a custom dictionary example

package main

import (
	"fmt"
	_ "embed"

	"github.com/go-ego/gse"
)

// testDict holds the contents of test_en_dict3.txt, embedded at build time.
//
//go:embed test_en_dict3.txt
var testDict string

// main builds a segmenter from the embedded dictionary, loads the embedded
// stop words, and prints the cut, trimmed and stop-filtered results for a
// mixed English/Japanese/Chinese sample.
func main() {
	// var seg gse.Segmenter
	// seg.LoadDict("zh, testdata/zh/test_dict.txt, testdata/zh/test_dict1.txt")
	// seg.LoadStop()
	seg, err := gse.NewEmbed("zh, word 20 n"+testDict, "en")
	// err must be used: an unused variable is a compile error in Go, and the
	// segmenter should not be used if it failed to build.
	if err != nil {
		fmt.Println("new embed error: ", err)
		return
	}
	// seg.LoadDictEmbed()
	// Report a stop-word load failure instead of discarding it; cutting still
	// works without stop words, so continue.
	err = seg.LoadStopEmbed()
	if err != nil {
		fmt.Println("load stop embed error: ", err)
	}

	text1 := "Hello world, こんにちは世界, 你好世界!"
	s1 := seg.Cut(text1, true)
	fmt.Println(s1)
	fmt.Println("trim: ", seg.Trim(s1))
	fmt.Println("stop: ", seg.Stop(s1))
	fmt.Println(seg.String(text1, true))

	segments := seg.Segment([]byte(text1))
	fmt.Println(gse.ToString(segments))
}

Look at a Chinese example

Look at a Japanese example

Elasticsearch

How to use it with elasticsearch?

go-gse-elastic

Authors

License

Gse is primarily distributed under the terms of "both the MIT license and the Apache License (Version 2.0)". See LICENSE-APACHE, LICENSE-MIT.

Thanks for sego and jieba(jiebago).

Documentation ¶

Overview ¶

Package gse provides efficient multilingual NLP and text segmentation for Go.

Index ¶

Constants
Variables
func DictPaths(dictDir, filePath string) (files []string)
func FilterEmoji(text string) (new string)
func FilterHtml(text string) string
func FilterLang(text, lang string) (new string)
func FilterSymbol(text string) (new string)
func FindAllOccs(data []byte, searches []string) map[string][]int
func GetVersion() string
func IsJp(segText string) bool
func Join(text []Text) string
func Range(text string) (new []string)
func RangeText(text string) (new string)
func SplitNum(text string) []string
func SplitNums(text string) string
func ToSlice(segs []Segment, searchMode ...bool) (output []string)
func ToString(segs []Segment, searchMode ...bool) (output string)
type AnalyzeToken
type Dictionary
- func NewDict() *Dictionary
- func (dict *Dictionary) AddToken(token Token) error
- func (dict *Dictionary) Find(word []byte) (float64, string, bool)
- func (dict *Dictionary) LookupTokens(words []Text, tokens []*Token) (numOfTokens int)
- func (dict *Dictionary) MaxTokenLen() int
- func (dict *Dictionary) NumTokens() int
- func (dict *Dictionary) RemoveToken(token Token) error
- func (dict *Dictionary) TotalFreq() float64
- func (dict *Dictionary) Value(word []byte) (val, id int, err error)
type Prob
type SegPos
- func ToPos(segs []Segment, searchMode ...bool) (output []SegPos)
type Segment
- func (s *Segment) End() int
- func (s *Segment) Start() int
- func (s *Segment) Token() *Token
type Segmenter
- func New(files ...string) (seg Segmenter, err error)
- func NewEmbed(dict ...string) (seg Segmenter, err error)
- func (seg *Segmenter) AddStop(text string)
- func (seg *Segmenter) AddStopArr(text ...string)
- func (seg *Segmenter) AddToken(text string, freq float64, pos ...string) error
- func (seg *Segmenter) AddTokenForce(text string, freq float64, pos ...string) (err error)
- func (seg *Segmenter) Analyze(text []string, t1 string, by ...bool) (az []AnalyzeToken)
- func (seg *Segmenter) CalcToken()
- func (seg *Segmenter) Cut(str string, hmm ...bool) []string
- func (seg *Segmenter) CutAll(str string) []string
- func (seg *Segmenter) CutDAG(str string, reg ...*regexp.Regexp) []string
- func (seg *Segmenter) CutDAGNoHMM(str string) []string
- func (seg *Segmenter) CutSearch(str string, hmm ...bool) []string
- func (seg *Segmenter) CutStop(str string, hmm ...bool) []string
- func (seg *Segmenter) CutStr(str []string, separator ...string) (r string)
- func (seg *Segmenter) CutTrim(str string, hmm ...bool) []string
- func (seg *Segmenter) CutTrimHtml(str string, hmm ...bool) []string
- func (seg *Segmenter) CutTrimHtmls(str string, hmm ...bool) string
- func (seg *Segmenter) CutUrl(str string, num ...bool) []string
- func (seg *Segmenter) CutUrls(str string, num ...bool) string
- func (seg *Segmenter) Dictionary() *Dictionary
- func (seg *Segmenter) Empty() error
- func (seg *Segmenter) EmptyStop() error
- func (seg *Segmenter) Find(str string) (float64, string, bool)
- func (seg *Segmenter) GetCurrentFilePath() string
- func (seg *Segmenter) GetIdfPath(files ...string) []string
- func (seg *Segmenter) HMMCut(str string, reg ...*regexp.Regexp) []string
- func (seg *Segmenter) HMMCutMod(str string, prob ...map[rune]float64) []string
- func (seg *Segmenter) Init()
- func (seg *Segmenter) IsStop(s string) bool
- func (seg *Segmenter) LoadDict(files ...string) error
- func (seg *Segmenter) LoadDictEmbed(dict ...string) (err error)
- func (seg *Segmenter) LoadDictMap(dict []map[string]string) error
- func (seg *Segmenter) LoadDictStr(dict string) error
- func (seg *Segmenter) LoadModel(prob ...map[rune]float64)
- func (seg *Segmenter) LoadStop(files ...string) error
- func (seg *Segmenter) LoadStopArr(dict []string)
- func (seg *Segmenter) LoadStopEmbed(dict ...string) (err error)
- func (seg *Segmenter) LoadStopStr(dict string) error
- func (seg *Segmenter) ModeSegment(bytes []byte, searchMode ...bool) []Segment
- func (seg *Segmenter) Pos(s string, searchMode ...bool) []SegPos
- func (seg *Segmenter) PosStr(str []SegPos, separator ...string) (r string)
- func (seg *Segmenter) PosTrim(str string, search bool, pos ...string) []SegPos
- func (seg *Segmenter) PosTrimArr(str string, search bool, pos ...string) (re []string)
- func (seg *Segmenter) PosTrimStr(str string, search bool, pos ...string) string
- func (seg *Segmenter) ReAddToken(text string, freq float64, pos ...string) error
- func (seg *Segmenter) Read(file string) error
- func (seg *Segmenter) ReadN(reader *bufio.Reader) (size int, text, freqText, pos string, fsErr error)
- func (seg *Segmenter) Reader(reader *bufio.Reader, files ...string) error
- func (seg *Segmenter) RemoveStop(text string)
- func (seg *Segmenter) RemoveToken(text string) error
- func (seg *Segmenter) Segment(bytes []byte) []Segment
- func (seg *Segmenter) Size(size int, text, freqText string) (freq float64)
- func (seg *Segmenter) Slice(s string, searchMode ...bool) []string
- func (seg *Segmenter) SplitTextToWords(text Text) []Text
- func (seg *Segmenter) Stop(s []string) (r []string)
- func (seg *Segmenter) String(s string, searchMode ...bool) string
- func (seg *Segmenter) SuggestFreq(words ...string) float64
- func (seg *Segmenter) ToToken(text string, freq float64, pos ...string) Token
- func (seg *Segmenter) Trim(s []string) (r []string)
- func (seg *Segmenter) TrimPos(s []SegPos) (r []SegPos)
- func (seg *Segmenter) TrimPosPunct(se []SegPos) (re []SegPos)
- func (seg *Segmenter) TrimPunct(s []string) (r []string)
- func (seg *Segmenter) TrimSymbol(s []string) (r []string)
- func (seg *Segmenter) TrimWithPos(se []SegPos, pos ...string) (re []SegPos)
- func (seg *Segmenter) Value(str string) (int, int, error)
type Text
- func SplitWords(text Text) []Text
type Token
- func (token *Token) Equals(str string) bool
- func (token *Token) Freq() float64
- func (token *Token) Pos() string
- func (token *Token) Segments() []*Segment
- func (token *Token) Text() string

Constants ¶

View Source

const (
	// RatioWord ratio words and letters
	RatioWord float32 = 1.5
	// RatioWordFull full ratio words and letters
	RatioWordFull float32 = 1
)

View Source

const (
	// Version is the gse version string.
	Version = "v0.71.0.695, Green Lake!"
)

Variables ¶

View Source

var StopWordMap = map[string]bool{
	" ": true,
}

StopWordMap the default stop words.

View Source

var (
	// ToLower is enabled by default.
	// NOTE(review): presumably controls whether alphabetic text is lowercased — confirm against the implementation.
	ToLower = true
)

Functions ¶

func FilterEmoji ¶ added in v0.60.0

func FilterEmoji(text string) (new string)

FilterEmoji filter the emoji

func FilterHtml ¶ added in v0.61.0

func FilterHtml(text string) string

FilterHtml filter the html tag

func FilterLang ¶ added in v0.60.0

func FilterLang(text, lang string) (new string)

FilterLang filter the language

func FilterSymbol ¶ added in v0.60.0

func FilterSymbol(text string) (new string)

FilterSymbol filter the symbol

func FindAllOccs ¶ added in v0.69.14

func FindAllOccs(data []byte, searches []string) map[string][]int

FindAllOccs find the all search byte start in data

func Range ¶ added in v0.63.3

func Range(text string) (new []string)

Range range text to []string

func RangeText ¶ added in v0.63.3

func RangeText(text string) (new string)

RangeText range text to string

func SplitNum ¶ added in v0.65.0

func SplitNum(text string) []string

SplitNum cut string by num to []string

func SplitNums ¶ added in v0.65.0

func SplitNums(text string) string

SplitNums cut string by num to string

func ToSlice ¶

func ToSlice(segs []Segment, searchMode ...bool) (output []string)

ToSlice converts a segments slice to a string slice and returns it

func ToString ¶

func ToString(segs []Segment, searchMode ...bool) (output string)

ToString converts a segments slice to a string and returns it.

There are two output modes:

	normal mode (searchMode=false)
	search mode (searchMode=true)

The default is searchMode=false. Search mode is intended for search engines and outputs more results.

Types ¶

type AnalyzeToken ¶ added in v0.68.0

type AnalyzeToken struct {
	// the start of the segment in the text
	Start int
	End   int

	Position int
	Len      int

	Type string

	Text string
	Freq float64
	Pos  string
}

AnalyzeToken analyze the segment info structure

type Dictionary ¶

type Dictionary struct {
	Tokens []Token // the all tokens in the dictionary, to traverse
	// contains filtered or unexported fields
}

Dictionary struct implements a string double array trie. one segment maybe in leaf node or not

func (*Dictionary) AddToken ¶ added in v0.69.6

func (dict *Dictionary) AddToken(token Token) error

AddToken add a token to the dictionary

func (*Dictionary) Find ¶

func (dict *Dictionary) Find(word []byte) (float64, string, bool)

Find looks up the word in the dictionary and returns its frequency, its part of speech, and whether it exists

func (*Dictionary) LookupTokens ¶

func (dict *Dictionary) LookupTokens(
	words []Text, tokens []*Token) (numOfTokens int)

LookupTokens finds tokens and words in the dictionary, matching the given pattern and returns the number of tokens

func (*Dictionary) MaxTokenLen ¶

func (dict *Dictionary) MaxTokenLen() int

MaxTokenLen the maximum length of the dictionary

func (*Dictionary) NumTokens ¶

func (dict *Dictionary) NumTokens() int

NumTokens the number of tokens in the dictionary

func (*Dictionary) RemoveToken ¶

func (dict *Dictionary) RemoveToken(token Token) error

RemoveToken remove token in dictionary

func (*Dictionary) TotalFreq ¶

func (dict *Dictionary) TotalFreq() float64

TotalFreq the total frequency of the dictionary

func (*Dictionary) Value ¶ added in v0.60.0

func (dict *Dictionary) Value(word []byte) (val, id int, err error)

Value finds the word in the dictionary and returns the word's value and id

type SegPos ¶ added in v0.60.0

type SegPos struct {
	Text, Pos string
}

SegPos type a POS struct

func ToPos ¶ added in v0.60.0

func ToPos(segs []Segment, searchMode ...bool) (output []SegPos)

ToPos converts a segments slice to []SegPos

func (*Segment) End ¶

func (s *Segment) End() int

End returns the end byte position of the segment (not including this)

func (*Segment) Start ¶

func (s *Segment) Start() int

Start returns the start byte position of the segment

func (*Segment) Token ¶

func (s *Segment) Token() *Token

Token returns the segment token information

func New ¶

func New(files ...string) (seg Segmenter, err error)

New return a new gse segmenter

func NewEmbed ¶ added in v0.66.7

func NewEmbed(dict ...string) (seg Segmenter, err error)

NewEmbed return new gse segmenter by embed dictionary

func (*Segmenter) AddStop ¶ added in v0.60.0

func (seg *Segmenter) AddStop(text string)

AddStop add a token to the StopWord dictionary.

func (*Segmenter) AddStopArr ¶ added in v0.70.0

func (seg *Segmenter) AddStopArr(text ...string)

AddStopArr add array stop token to stop dictionaries

func (*Segmenter) AddToken ¶

func (seg *Segmenter) AddToken(text string, freq float64, pos ...string) error

AddToken add a new text to the token

func (*Segmenter) AddTokenForce ¶

func (seg *Segmenter) AddTokenForce(text string, freq float64, pos ...string) (err error)

AddTokenForce add new text to token and force time-consuming

func (*Segmenter) Analyze ¶ added in v0.68.0

func (seg *Segmenter) Analyze(text []string, t1 string, by ...bool) (az []AnalyzeToken)

Analyze analyze the token segment info

func (*Segmenter) CalcToken ¶

func (seg *Segmenter) CalcToken()

CalcToken calc the segmenter token

func (*Segmenter) Cut ¶

func (seg *Segmenter) Cut(str string, hmm ...bool) []string

Cut cuts a str into words using accurate mode. Parameter hmm controls whether to use the HMM(Hidden Markov Model) or use the user's model.

seg.Cut(text):

use the shortest path

seg.Cut(text, false):

use cut dag not hmm

seg.Cut(text, true):

use cut dag and hmm mode

func (*Segmenter) CutAll ¶

func (seg *Segmenter) CutAll(str string) []string

CutAll cuts a str into words using full mode.

func (*Segmenter) CutDAG ¶ added in v0.69.8

func (seg *Segmenter) CutDAG(str string, reg ...*regexp.Regexp) []string

CutDAG cut string with DAG use hmm and regexp

func (*Segmenter) CutDAGNoHMM ¶ added in v0.69.8

func (seg *Segmenter) CutDAGNoHMM(str string) []string

CutDAGNoHMM cut string with DAG not use hmm

func (*Segmenter) CutSearch ¶

func (seg *Segmenter) CutSearch(str string, hmm ...bool) []string

CutSearch cuts str into words using search engine mode.

func (*Segmenter) CutStop ¶ added in v0.70.0

func (seg *Segmenter) CutStop(str string, hmm ...bool) []string

CutStop cuts the string and trims stop words

func (*Segmenter) CutStr ¶ added in v0.60.0

func (seg *Segmenter) CutStr(str []string, separator ...string) (r string)

CutStr cut []string with Cut return string

func (*Segmenter) CutTrim ¶ added in v0.60.0

func (seg *Segmenter) CutTrim(str string, hmm ...bool) []string

CutTrim cuts the string and trims the result

func (*Segmenter) CutTrimHtml ¶ added in v0.62.1

func (seg *Segmenter) CutTrimHtml(str string, hmm ...bool) []string

CutTrimHtml cut string trim html and symbol return []string

func (*Segmenter) CutTrimHtmls ¶ added in v0.62.1

func (seg *Segmenter) CutTrimHtmls(str string, hmm ...bool) string

CutTrimHtmls cut string trim html and symbol return string

func (*Segmenter) CutUrl ¶ added in v0.65.0

func (seg *Segmenter) CutUrl(str string, num ...bool) []string

CutUrl cut url string trim symbol return []string

func (*Segmenter) CutUrls ¶ added in v0.65.0

func (seg *Segmenter) CutUrls(str string, num ...bool) string

CutUrls cut url string trim symbol return string

func (*Segmenter) Dictionary ¶

func (seg *Segmenter) Dictionary() *Dictionary

Dictionary returns the dictionary used by the tokenizer

func (*Segmenter) Empty ¶ added in v0.69.4

func (seg *Segmenter) Empty() error

Empty empty the seg dictionary

func (*Segmenter) EmptyStop ¶ added in v0.69.5

func (seg *Segmenter) EmptyStop() error

EmptyStop empty the stop dictionary

func (*Segmenter) Find ¶

func (seg *Segmenter) Find(str string) (float64, string, bool)

Find find word in dictionary return word's freq, pos and existence

func (*Segmenter) GetCurrentFilePath ¶ added in v0.80.2

func (seg *Segmenter) GetCurrentFilePath() string

GetCurrentFilePath get the current file path

func (*Segmenter) GetIdfPath ¶ added in v0.80.2

func (seg *Segmenter) GetIdfPath(files ...string) []string

GetIdfPath get the idf path

func (*Segmenter) HMMCut ¶

func (seg *Segmenter) HMMCut(str string, reg ...*regexp.Regexp) []string

HMMCut cut sentence string use HMM with Viterbi

func (*Segmenter) HMMCutMod ¶

func (seg *Segmenter) HMMCutMod(str string, prob ...map[rune]float64) []string

HMMCutMod cut sentence string use HMM with Viterbi

func (*Segmenter) Init ¶ added in v0.60.0

func (seg *Segmenter) Init()

Init initializes the segmenter config

func (*Segmenter) IsStop ¶ added in v0.60.0

func (seg *Segmenter) IsStop(s string) bool

IsStop check the word is a stop word.

func (*Segmenter) LoadDict ¶

func (seg *Segmenter) LoadDict(files ...string) error

LoadDict load the dictionary from the file

The format of the dictionary is (one for each participle):

participle text, frequency, part of speech

You can also set the dictionary separator with seg.DictSep = ","

Multiple dictionary files can be loaded, with the file names separated by "," or ", "; participles from dictionaries listed earlier are loaded preferentially,

such as: "user_dictionary.txt,common_dictionary.txt"

When a participle appears both in the user dictionary and in the `common dictionary`, the `user dictionary` is given priority.

func (*Segmenter) LoadDictEmbed ¶ added in v0.66.6

func (seg *Segmenter) LoadDictEmbed(dict ...string) (err error)

LoadDictEmbed load the dictionary by embed file

func (*Segmenter) LoadDictMap ¶ added in v0.66.0

func (seg *Segmenter) LoadDictMap(dict []map[string]string) error

LoadDictMap load dictionary from []map[string]string

func (*Segmenter) LoadDictStr ¶ added in v0.66.6

func (seg *Segmenter) LoadDictStr(dict string) error

LoadDictStr load the dictionary from string

func (*Segmenter) LoadModel ¶

func (seg *Segmenter) LoadModel(prob ...map[rune]float64)

LoadModel load the hmm model (default is Chinese char)

Use the user's model:

seg.LoadModel(B, E, M, S map[rune]float64)

func (*Segmenter) LoadStop ¶ added in v0.60.0

func (seg *Segmenter) LoadStop(files ...string) error

LoadStop load stop word files add token to map

func (*Segmenter) LoadStopArr ¶ added in v0.66.1

func (seg *Segmenter) LoadStopArr(dict []string)

LoadStopArr load stop word by []string

func (*Segmenter) LoadStopEmbed ¶ added in v0.66.6

func (seg *Segmenter) LoadStopEmbed(dict ...string) (err error)

LoadStopEmbed load the stop dictionary from embed file

func (*Segmenter) LoadStopStr ¶ added in v0.69.0

func (seg *Segmenter) LoadStopStr(dict string) error

LoadStopStr loads the stop dictionary from a string

func (*Segmenter) ModeSegment ¶

func (seg *Segmenter) ModeSegment(bytes []byte, searchMode ...bool) []Segment

ModeSegment segment using search mode if searchMode is true

func (*Segmenter) Pos ¶ added in v0.60.0

func (seg *Segmenter) Pos(s string, searchMode ...bool) []SegPos

Pos return text and pos array

func (*Segmenter) PosStr ¶ added in v0.60.0

func (seg *Segmenter) PosStr(str []SegPos, separator ...string) (r string)

PosStr cut []SegPos with Pos return string

func (*Segmenter) PosTrim ¶ added in v0.60.0

func (seg *Segmenter) PosTrim(str string, search bool, pos ...string) []SegPos

PosTrim cut string pos and trim

func (*Segmenter) PosTrimArr ¶ added in v0.60.0

func (seg *Segmenter) PosTrimArr(str string, search bool, pos ...string) (re []string)

PosTrimArr cut string return pos.Text []string

func (*Segmenter) PosTrimStr ¶ added in v0.60.0

func (seg *Segmenter) PosTrimStr(str string, search bool, pos ...string) string

PosTrimStr cut string return pos.Text string

func (*Segmenter) ReAddToken ¶ added in v0.69.7

func (seg *Segmenter) ReAddToken(text string, freq float64, pos ...string) error

ReAddToken remove and add token again

func (*Segmenter) Read ¶

func (seg *Segmenter) Read(file string) error

Read reads the dict file

func (*Segmenter) ReadN ¶ added in v0.80.0

func (seg *Segmenter) ReadN(reader *bufio.Reader) (size int,
	text, freqText, pos string, fsErr error)

ReadN read the tokens by '\n'

func (*Segmenter) Reader ¶ added in v0.66.0

func (seg *Segmenter) Reader(reader *bufio.Reader, files ...string) error

Reader load dictionary from io.Reader

func (*Segmenter) RemoveStop ¶ added in v0.63.0

func (seg *Segmenter) RemoveStop(text string)

RemoveStop remove a token from the StopWord dictionary.

func (*Segmenter) RemoveToken ¶

func (seg *Segmenter) RemoveToken(text string) error

RemoveToken remove token in dictionary

func (*Segmenter) Segment ¶

func (seg *Segmenter) Segment(bytes []byte) []Segment

Segment use shortest path to segment the text

input parameter：

bytes	UTF8 text []byte

output：

[]Segment	returns the segments result

func (*Segmenter) Size ¶ added in v0.66.0

func (seg *Segmenter) Size(size int, text, freqText string) (freq float64)

Size frequency is calculated based on the size of the text

func (*Segmenter) Slice ¶

func (seg *Segmenter) Slice(s string, searchMode ...bool) []string

Slice segments the text with modeSegment and returns a []string, using search mode if searchMode is true

func (*Segmenter) SplitTextToWords ¶ added in v0.64.0

func (seg *Segmenter) SplitTextToWords(text Text) []Text

SplitTextToWords splits a string to token words

func (*Segmenter) Stop ¶ added in v0.70.0

func (seg *Segmenter) Stop(s []string) (r []string)

Stop trim []string stop word

func (*Segmenter) String ¶

func (seg *Segmenter) String(s string, searchMode ...bool) string

String segments the text with modeSegment and returns a string, using search mode if searchMode is true

func (*Segmenter) SuggestFreq ¶ added in v0.60.0

func (seg *Segmenter) SuggestFreq(words ...string) float64

SuggestFreq suggests a word frequency: it returns a suggested frequency for a word that is cut into shorter words.

func (*Segmenter) ToToken ¶ added in v0.69.7

func (seg *Segmenter) ToToken(text string, freq float64, pos ...string) Token

ToToken make the text, freq and pos to token structure

func (*Segmenter) Trim ¶

func (seg *Segmenter) Trim(s []string) (r []string)

Trim trim []string exclude symbol, space and punct

func (*Segmenter) TrimPos ¶ added in v0.60.0

func (seg *Segmenter) TrimPos(s []SegPos) (r []SegPos)

TrimPos trim SegPos not symbol, space and punct

func (*Segmenter) TrimPosPunct ¶ added in v0.60.0

func (seg *Segmenter) TrimPosPunct(se []SegPos) (re []SegPos)

TrimPosPunct trim SegPos not space and punct

func (*Segmenter) TrimPunct ¶ added in v0.60.0

func (seg *Segmenter) TrimPunct(s []string) (r []string)

TrimPunct trim []string exclude space and punct

func (*Segmenter) TrimSymbol ¶ added in v0.62.1

func (seg *Segmenter) TrimSymbol(s []string) (r []string)

TrimSymbol trim []string exclude symbol, space and punct

func (*Segmenter) TrimWithPos ¶ added in v0.60.0

func (seg *Segmenter) TrimWithPos(se []SegPos, pos ...string) (re []SegPos)

TrimWithPos trim some seg with pos

func (*Segmenter) Value ¶ added in v0.60.0

func (seg *Segmenter) Value(str string) (int, int, error)

Value find word in dictionary return word's value

type Text ¶

type Text []byte

Text is a string type used to parse text. It may hold: 1. a word, such as "world" or "boundary" (in English, a word is a word); 2. a participle, such as "world" a.k.a. "population"; 3. a text, such as "the world has seven billion people".

func SplitWords ¶ added in v0.64.1

func SplitWords(text Text) []Text

SplitWords splits a string to token words

type Token ¶

type Token struct {
	// contains filtered or unexported fields
}

Token define a segment token structure

func (*Token) Equals ¶

func (token *Token) Equals(str string) bool

Equals compare str split tokens

func (*Token) Freq ¶ added in v0.69.7

func (token *Token) Freq() float64

Freq returns the frequency in the dictionary token

func (*Token) Pos ¶

func (token *Token) Pos() string

Pos returns the part of speech in the dictionary token

func (*Token) Segments ¶

func (token *Token) Segments() []*Segment

Segments will segment further subdivisions of the text of this participle, the participle has two subclauses.

Subclauses can also have further subclauses forming a tree structure, which can be traversed to get all the detailed subdivisions of the participle, which is mainly Used by search engines to perform full-text searches on a piece of text.

func (*Token) Text ¶

func (token *Token) Text() string

Text return the text of the segment

Source Files ¶

View all Source files

Directories ¶

Path	Synopsis
crf
data
examples
dict
dict/embed
en
hmm
jp
gonn
cnn
rnn
hmm	Package hmm is the Golang HMM cut module
bm25
idf
pos	Package pos model data The data from https://github.com/fxsjy/jieba
util
tf
nlp
tools
benchmark
benchmark/goroutines
server

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL

README ¶

gse

Feature:

Algorithm:

Text Segmentation speed:

Binding:

Install / update

Use

Elasticsearch

Authors

License

Documentation ¶

Overview ¶

Index ¶

Constants ¶

Variables ¶

Functions ¶

func DictPaths ¶

func FilterEmoji ¶ added in v0.60.0

func FilterHtml ¶ added in v0.61.0

func FilterLang ¶ added in v0.60.0

func FilterSymbol ¶ added in v0.60.0

func FindAllOccs ¶ added in v0.69.14

func GetVersion ¶

func IsJp ¶

func Join ¶

func Range ¶ added in v0.63.3

func RangeText ¶ added in v0.63.3

func SplitNum ¶ added in v0.65.0

func SplitNums ¶ added in v0.65.0

func ToSlice ¶

func ToString ¶

Types ¶

type AnalyzeToken ¶ added in v0.68.0

type Dictionary ¶

func NewDict ¶

func (*Dictionary) AddToken ¶ added in v0.69.6

func (*Dictionary) Find ¶

func (*Dictionary) LookupTokens ¶

func (*Dictionary) MaxTokenLen ¶

func (*Dictionary) NumTokens ¶

func (*Dictionary) RemoveToken ¶

func (*Dictionary) TotalFreq ¶

func (*Dictionary) Value ¶ added in v0.60.0

type Prob ¶

type SegPos ¶ added in v0.60.0

func ToPos ¶ added in v0.60.0

type Segment ¶

func (*Segment) End ¶

func (*Segment) Start ¶

func (*Segment) Token ¶

type Segmenter ¶

func New ¶

func NewEmbed ¶ added in v0.66.7

func (*Segmenter) AddStop ¶ added in v0.60.0

func (*Segmenter) AddStopArr ¶ added in v0.70.0

func (*Segmenter) AddToken ¶

func (*Segmenter) AddTokenForce ¶

func (*Segmenter) Analyze ¶ added in v0.68.0

func (*Segmenter) CalcToken ¶

func (*Segmenter) Cut ¶

func (*Segmenter) CutAll ¶

func (*Segmenter) CutDAG ¶ added in v0.69.8

func (*Segmenter) CutDAGNoHMM ¶ added in v0.69.8

func (*Segmenter) CutSearch ¶

func (*Segmenter) CutStop ¶ added in v0.70.0

func (*Segmenter) CutStr ¶ added in v0.60.0

func (*Segmenter) CutTrim ¶ added in v0.60.0

func (*Segmenter) CutTrimHtml ¶ added in v0.62.1

func (*Segmenter) CutTrimHtmls ¶ added in v0.62.1

func (*Segmenter) CutUrl ¶ added in v0.65.0

func (*Segmenter) CutUrls ¶ added in v0.65.0

func (*Segmenter) Dictionary ¶

func (*Segmenter) Empty ¶ added in v0.69.4

func (*Segmenter) EmptyStop ¶ added in v0.69.5

func (*Segmenter) Find ¶

func (*Segmenter) GetCurrentFilePath ¶ added in v0.80.2

func (*Segmenter) GetIdfPath ¶ added in v0.80.2

func (*Segmenter) HMMCut ¶

func (*Segmenter) HMMCutMod ¶