package module
v0.80.2 Latest Latest

This package is not in the latest version of its module.

Go to latest
Published: Jan 16, 2023 License: Apache-2.0 Imports: 17 Imported by: 93



Go efficient multilingual NLP and text segmentation; support English, Chinese, Japanese and others. And supports with elasticsearch and bleve.

Build Status CircleCI Status codecov Build Status Go Report Card GoDoc GitHub release


Gse implements jieba in Go, and tries to add NLP support and more features


  • Support common, search engine, full mode, precise mode and HMM mode multiple word segmentation modes;
  • Support user and embed dictionary, Part-of-speech/POS tagging, analyze segment info, stop and trim words
  • Support multilingual: English, Chinese, Japanese and others
  • Support Traditional Chinese
  • Support HMM cut text use Viterbi algorithm
  • Support NLP by TensorFlow (in work)
  • Named Entity Recognition (in work)
  • Supports with elasticsearch and bleve
  • run JSON RPC service.


  • Dictionary with double array trie (Double-Array Trie) to achieve
  • Segmenter algorithm is the shortest path (based on word frequency and dynamic programming), and DAG and HMM algorithm word segmentation.

Text Segmentation speed:


gse-bind, binding JavaScript and other, support more language.

Install / update

With Go module support (Go 1.11+), just import:

import ""

Otherwise, to install the gse package, run the command:

go get -u


package main

import (
	_ "embed"


//go:embed testdata/test_en2.txt
var testDict string

//go:embed testdata/test_en.txt
var testEn string

var (
	text  = "To be or not to be, that's the question!"
	test1 = "Hiworld, Helloworld!"

func main() {
	var seg1 gse.Segmenter
	seg1.DictSep = ","
	err := seg1.LoadDict("./testdata/test_en.txt")
	if err != nil {
		fmt.Println("Load dictionary error: ", err)

	s1 := seg1.Cut(text)
	fmt.Println("seg1 Cut: ", s1)
	// seg1 Cut:  [to be   or   not to be ,   that's the question!]

	var seg2 gse.Segmenter
	seg2.AlphaNum = true

	s2 := seg2.Cut(test1)
	fmt.Println("seg2 Cut: ", s2)
	// seg2 Cut:  [hi world ,   hello world !]

	var seg3 gse.Segmenter
	seg3.AlphaNum = true
	seg3.DictSep = ","
	err = seg3.LoadDictEmbed(testDict + "\n" + testEn)
	if err != nil {
		fmt.Println("loadDictEmbed error: ", err)
	s3 := seg3.Cut(text + test1)
	fmt.Println("seg3 Cut: ", s3)
	// seg3 Cut:  [to be   or   not to be ,   that's the question! hi world ,   hello world !]

	// example2()


package main

import (


var (
	text = "Hello world, Helloworld. Winter is coming! こんにちは世界, 你好世界."

	new, _ = gse.New("zh,testdata/test_en_dict3.txt", "alpha")

	seg gse.Segmenter
	posSeg pos.Segmenter

func main() {
	// Loading the default dictionary
	// Loading the default dictionary with embed
	// seg.LoadDictEmbed()
	// Loading the Simplified Chinese dictionary
	// seg.LoadDict("zh_s")
	// seg.LoadDictEmbed("zh_s")
	// Loading the Traditional Chinese dictionary
	// seg.LoadDict("zh_t")
	// Loading the Japanese dictionary
	// seg.LoadDict("jp")
	// Load the dictionary
	// seg.LoadDict("your gopath"+"/src/")



func cut() {
	hmm := new.Cut(text, true)
	fmt.Println("cut use hmm: ", hmm)

	hmm = new.CutSearch(text, true)
	fmt.Println("cut search use hmm: ", hmm)
	fmt.Println("analyze: ", new.Analyze(hmm, text))

	hmm = new.CutAll(text)
	fmt.Println("cut all: ", hmm)

	reg := regexp.MustCompile(`(\d+年|\d+月|\d+日|[\p{Latin}]+|[\p{Hangul}]+|\d+\.\d+|[a-zA-Z0-9]+)`)
	text1 := `헬로월드 헬로 서울, 2021年09月10日, 3.14`
	hmm = seg.CutDAG(text1, reg)
	fmt.Println("Cut with hmm and regexp: ", hmm, hmm[0], hmm[6])

func analyzeAndTrim(cut []string) {
	a := seg.Analyze(cut, "")
	fmt.Println("analyze the segment: ", a)

	cut = seg.Trim(cut)
	fmt.Println("cut all: ", cut)

	fmt.Println(seg.String(text, true))
	fmt.Println(seg.Slice(text, true))

func cutPos() {
	po := seg.Pos(text, true)
	fmt.Println("pos: ", po)
	po = seg.TrimPos(po)
	fmt.Println("trim pos: ", po)

	po = posSeg.Cut(text, true)
	fmt.Println("pos: ", po)

	po = posSeg.TrimWithPos(po, "zg")
	fmt.Println("trim pos: ", po)

func segCut() {
	// Text Segmentation
	tb := []byte(text)
	fmt.Println(seg.String(text, true))

	segments := seg.Segment(tb)
	// Handle word segmentation results, search mode
	fmt.Println(gse.ToString(segments, true))

Look at a custom dictionary example

package main

import (
	_ "embed"


//go:embed test_en_dict3.txt
var testDict string

func main() {
	// var seg gse.Segmenter
	// seg.LoadDict("zh, testdata/zh/test_dict.txt, testdata/zh/test_dict1.txt")
	// seg.LoadStop()
	seg, err := gse.NewEmbed("zh, word 20 n"+testDict, "en")
	// seg.LoadDictEmbed()

	text1 := "Hello world, こんにちは世界, 你好世界!"
	s1 := seg.Cut(text1, true)
	fmt.Println("trim: ", seg.Trim(s1))
	fmt.Println("stop: ", seg.Stop(s1))
	fmt.Println(seg.String(text1, true))

	segments := seg.Segment([]byte(text1))

Look at a Chinese example

Look at a Japanese example


How to use it with elasticsearch?




Gse is primarily distributed under the terms of "both the MIT license and the Apache License (Version 2.0)". See LICENSE-APACHE, LICENSE-MIT.

Thanks for sego and jieba(jiebago).



Package gse Go efficient multilingual NLP and text segmentation,



View Source
const (
	// RatioWord ratio words and letters
	RatioWord float32 = 1.5
	// RatioWordFull full ratio words and letters
	RatioWordFull float32 = 1
View Source
const (
	// Version get the gse version
	Version = "v0.71.0.695, Green Lake!"


View Source
var StopWordMap = map[string]bool{
	" ": true,

StopWordMap the default stop words.

View Source
var (
	// ToLower set alpha tolower
	ToLower = true


func DictPaths

func DictPaths(dictDir, filePath string) (files []string)

DictPaths get the dict's paths

func FilterEmoji added in v0.60.0

func FilterEmoji(text string) (new string)

FilterEmoji filter the emoji

func FilterHtml added in v0.61.0

func FilterHtml(text string) string

FilterHtml filter the html tag

func FilterLang added in v0.60.0

func FilterLang(text, lang string) (new string)

FilterLang filter the language

func FilterSymbol added in v0.60.0

func FilterSymbol(text string) (new string)

FilterSymbol filter the symbol

func FindAllOccs added in v0.69.14

func FindAllOccs(data []byte, searches []string) map[string][]int

FindAllOccs find the all search byte start in data

func GetVersion

func GetVersion() string

GetVersion get the version of gse

func IsJp

func IsJp(segText string) bool

IsJp returns true if the text is a Japanese character

func Join

func Join(text []Text) string

Join is better string splicing

func Range added in v0.63.3

func Range(text string) (new []string)

Range range text to []string

func RangeText added in v0.63.3

func RangeText(text string) (new string)

RangeText range text to string

func SplitNum added in v0.65.0

func SplitNum(text string) []string

SplitNum cut string by num to []string

func SplitNums added in v0.65.0

func SplitNums(text string) string

SplitNums cut string by num to string

func ToSlice

func ToSlice(segs []Segment, searchMode ...bool) (output []string)

ToSlice converts a segments slice to a string slice and returns it

func ToString

func ToString(segs []Segment, searchMode ...bool) (output string)

ToString converts a segments slice to a string and returns it

 two output modes:

	normal mode (searchMode=false)
	search mode(searchMode=true)

default is searchMode=false; search mode is used by search engines and will output more results


type AnalyzeToken added in v0.68.0

type AnalyzeToken struct {
	// the start of the segment in the text
	Start int
	End   int

	Position int
	Len      int

	Type string

	Text string
	Freq float64
	Pos  string

AnalyzeToken analyze the segment info structure

type Dictionary

type Dictionary struct {
	Tokens []Token // the all tokens in the dictionary, to traverse
	// contains filtered or unexported fields

Dictionary struct implements a string double array trie. A segment may or may not be in a leaf node

func NewDict

func NewDict() *Dictionary

NewDict a new dictionary trie

func (*Dictionary) AddToken added in v0.69.6

func (dict *Dictionary) AddToken(token Token) error

AddToken add a token to the dictionary

func (*Dictionary) Find

func (dict *Dictionary) Find(word []byte) (float64, string, bool)

Find finds the word in the dictionary, returning the word's frequency, pos and whether it exists

func (*Dictionary) LookupTokens

func (dict *Dictionary) LookupTokens(
	words []Text, tokens []*Token) (numOfTokens int)

LookupTokens finds tokens and words in the dictionary, matching the given pattern and returns the number of tokens

func (*Dictionary) MaxTokenLen

func (dict *Dictionary) MaxTokenLen() int

MaxTokenLen the maximum length of the dictionary

func (*Dictionary) NumTokens

func (dict *Dictionary) NumTokens() int

NumTokens the number of tokens in the dictionary

func (*Dictionary) RemoveToken

func (dict *Dictionary) RemoveToken(token Token) error

RemoveToken remove token in dictionary

func (*Dictionary) TotalFreq

func (dict *Dictionary) TotalFreq() float64

TotalFreq the total frequency of the dictionary

func (*Dictionary) Value added in v0.60.0

func (dict *Dictionary) Value(word []byte) (val, id int, err error)

Value finds the word in the dictionary and returns the word's value and id

type Prob

type Prob struct {
	B, E, M, S map[rune]float64

Prob define the hmm model struct

type SegPos added in v0.60.0

type SegPos struct {
	Text, Pos string

SegPos type a POS struct

func ToPos added in v0.60.0

func ToPos(segs []Segment, searchMode ...bool) (output []SegPos)

ToPos converts a segments slice to []SegPos

type Segment

type Segment struct {
	Position int
	// contains filtered or unexported fields

Segment a segment in the text

func (*Segment) End

func (s *Segment) End() int

End returns the end byte position of the segment (exclusive)

func (*Segment) Start

func (s *Segment) Start() int

Start returns the start byte position of the segment

func (*Segment) Token

func (s *Segment) Token() *Token

Token returns the segment token information

type Segmenter

type Segmenter struct {
	Dict     *Dictionary
	Load     bool
	DictSep  string
	DictPath string

	// NotLoadHMM option load the default hmm model config (Chinese char)
	NotLoadHMM bool

	// AlphaNum set splitTextToWords can add token
	// when words in alphanum
	// set up alphanum dictionary word segmentation
	AlphaNum bool
	Alpha    bool
	Num      bool

	// LoadNoFreq load not have freq dict word
	LoadNoFreq bool
	// MinTokenFreq load min freq token
	MinTokenFreq float64
	// TextFreq add token frequency when freq is not specified
	TextFreq string

	// SkipLog set skip log print
	SkipLog bool
	MoreLog bool

	// SkipPos skip PosStr pos
	SkipPos bool

	NotStop bool
	// StopWordMap the stop word map
	StopWordMap map[string]bool

Segmenter define the segmenter structure

func New

func New(files ...string) (seg Segmenter, err error)

New return a new gse segmenter

func NewEmbed added in v0.66.7

func NewEmbed(dict ...string) (seg Segmenter, err error)

NewEmbed return new gse segmenter by embed dictionary

func (*Segmenter) AddStop added in v0.60.0

func (seg *Segmenter) AddStop(text string)

AddStop add a token to the StopWord dictionary.

func (*Segmenter) AddStopArr added in v0.70.0

func (seg *Segmenter) AddStopArr(text ...string)

AddStopArr add array stop token to stop dictionaries

func (*Segmenter) AddToken

func (seg *Segmenter) AddToken(text string, freq float64, pos ...string) error

AddToken add a new text to the token

func (*Segmenter) AddTokenForce

func (seg *Segmenter) AddTokenForce(text string, freq float64, pos ...string) (err error)

AddTokenForce add new text to token and force time-consuming

func (*Segmenter) Analyze added in v0.68.0

func (seg *Segmenter) Analyze(text []string, t1 string, by ...bool) (az []AnalyzeToken)

Analyze analyze the token segment info

func (*Segmenter) CalcToken

func (seg *Segmenter) CalcToken()

CalcToken calc the segmenter token

func (*Segmenter) Cut

func (seg *Segmenter) Cut(str string, hmm ...bool) []string

Cut cuts a str into words using accurate mode. Parameter hmm controls whether to use the HMM(Hidden Markov Model) or use the user's model.


use the shortest path

seg.Cut(text, false):

use cut dag not hmm

seg.Cut(text, true):

use cut dag and hmm mode

func (*Segmenter) CutAll

func (seg *Segmenter) CutAll(str string) []string

CutAll cuts a str into words using full mode.

func (*Segmenter) CutDAG added in v0.69.8

func (seg *Segmenter) CutDAG(str string, reg ...*regexp.Regexp) []string

CutDAG cut string with DAG use hmm and regexp

func (*Segmenter) CutDAGNoHMM added in v0.69.8

func (seg *Segmenter) CutDAGNoHMM(str string) []string

CutDAGNoHMM cut string with DAG not use hmm

func (*Segmenter) CutSearch

func (seg *Segmenter) CutSearch(str string, hmm ...bool) []string

CutSearch cuts str into words using search engine mode.

func (*Segmenter) CutStop added in v0.70.0

func (seg *Segmenter) CutStop(str string, hmm ...bool) []string

CutStop cuts the string and trims stop words

func (*Segmenter) CutStr added in v0.60.0

func (seg *Segmenter) CutStr(str []string, separator ...string) (r string)

CutStr cut []string with Cut return string

func (*Segmenter) CutTrim added in v0.60.0

func (seg *Segmenter) CutTrim(str string, hmm ...bool) []string

CutTrim cuts the string and trims it

func (*Segmenter) CutTrimHtml added in v0.62.1

func (seg *Segmenter) CutTrimHtml(str string, hmm ...bool) []string

CutTrimHtml cut string trim html and symbol return []string

func (*Segmenter) CutTrimHtmls added in v0.62.1

func (seg *Segmenter) CutTrimHtmls(str string, hmm ...bool) string

CutTrimHtmls cut string trim html and symbol return string

func (*Segmenter) CutUrl added in v0.65.0

func (seg *Segmenter) CutUrl(str string, num ...bool) []string

CutUrl cut url string trim symbol return []string

func (*Segmenter) CutUrls added in v0.65.0

func (seg *Segmenter) CutUrls(str string, num ...bool) string

CutUrls cut url string trim symbol return string

func (*Segmenter) Dictionary

func (seg *Segmenter) Dictionary() *Dictionary

Dictionary returns the dictionary used by the tokenizer

func (*Segmenter) Empty added in v0.69.4

func (seg *Segmenter) Empty() error

Empty empty the seg dictionary

func (*Segmenter) EmptyStop added in v0.69.5

func (seg *Segmenter) EmptyStop() error

EmptyStop empty the stop dictionary

func (*Segmenter) Find

func (seg *Segmenter) Find(str string) (float64, string, bool)

Find find word in dictionary return word's freq, pos and existence

func (*Segmenter) GetCurrentFilePath added in v0.80.2

func (seg *Segmenter) GetCurrentFilePath() string

GetCurrentFilePath get the current file path

func (*Segmenter) GetIdfPath added in v0.80.2

func (seg *Segmenter) GetIdfPath(files ...string) []string

GetIdfPath get the idf path

func (*Segmenter) HMMCut

func (seg *Segmenter) HMMCut(str string, reg ...*regexp.Regexp) []string

HMMCut cut sentence string use HMM with Viterbi

func (*Segmenter) HMMCutMod

func (seg *Segmenter) HMMCutMod(str string, prob ...map[rune]float64) []string

HMMCutMod cut sentence string use HMM with Viterbi

func (*Segmenter) Init added in v0.60.0

func (seg *Segmenter) Init()

Init initializes the segmenter config

func (*Segmenter) IsStop added in v0.60.0

func (seg *Segmenter) IsStop(s string) bool

IsStop check the word is a stop word.

func (*Segmenter) LoadDict

func (seg *Segmenter) LoadDict(files ...string) error

LoadDict load the dictionary from the file

The format of the dictionary is (one for each participle):

participle text, frequency, part of speech

And you can option the dictionary separator by seg.DictSep = ","

Can load multiple dictionary files, the file name separated by "," or ", " the front of the dictionary preferentially load the participle,

such as: "user_dictionary.txt,common_dictionary.txt"

When a participle appears both in the user dictionary and in the `common dictionary`, the `user dictionary` is given priority.

func (*Segmenter) LoadDictEmbed added in v0.66.6

func (seg *Segmenter) LoadDictEmbed(dict ...string) (err error)

LoadDictEmbed load the dictionary by embed file

func (*Segmenter) LoadDictMap added in v0.66.0

func (seg *Segmenter) LoadDictMap(dict []map[string]string) error

LoadDictMap load dictionary from []map[string]string

func (*Segmenter) LoadDictStr added in v0.66.6

func (seg *Segmenter) LoadDictStr(dict string) error

LoadDictStr load the dictionary from string

func (*Segmenter) LoadModel

func (seg *Segmenter) LoadModel(prob ...map[rune]float64)

LoadModel load the hmm model (default is Chinese char)

Use the user's model:

seg.LoadModel(B, E, M, S map[rune]float64)

func (*Segmenter) LoadStop added in v0.60.0

func (seg *Segmenter) LoadStop(files ...string) error

LoadStop load stop word files add token to map

func (*Segmenter) LoadStopArr added in v0.66.1

func (seg *Segmenter) LoadStopArr(dict []string)

LoadStopArr load stop word by []string

func (*Segmenter) LoadStopEmbed added in v0.66.6

func (seg *Segmenter) LoadStopEmbed(dict ...string) (err error)

LoadStopEmbed load the stop dictionary from embed file

func (*Segmenter) LoadStopStr added in v0.69.0

func (seg *Segmenter) LoadStopStr(dict string) error

LoadStopStr loads the stop dictionary from a string

func (*Segmenter) ModeSegment

func (seg *Segmenter) ModeSegment(bytes []byte, searchMode ...bool) []Segment

ModeSegment segment using search mode if searchMode is true

func (*Segmenter) Pos added in v0.60.0

func (seg *Segmenter) Pos(s string, searchMode ...bool) []SegPos

Pos return text and pos array

func (*Segmenter) PosStr added in v0.60.0

func (seg *Segmenter) PosStr(str []SegPos, separator ...string) (r string)

PosStr cut []SegPos with Pos return string

func (*Segmenter) PosTrim added in v0.60.0

func (seg *Segmenter) PosTrim(str string, search bool, pos ...string) []SegPos

PosTrim cut string pos and trim

func (*Segmenter) PosTrimArr added in v0.60.0

func (seg *Segmenter) PosTrimArr(str string, search bool, pos ...string) (re []string)

PosTrimArr cut string return pos.Text []string

func (*Segmenter) PosTrimStr added in v0.60.0

func (seg *Segmenter) PosTrimStr(str string, search bool, pos ...string) string

PosTrimStr cut string return pos.Text string

func (*Segmenter) ReAddToken added in v0.69.7

func (seg *Segmenter) ReAddToken(text string, freq float64, pos ...string) error

ReAddToken remove and add token again

func (*Segmenter) Read

func (seg *Segmenter) Read(file string) error

Read reads the dict file

func (*Segmenter) ReadN added in v0.80.0

func (seg *Segmenter) ReadN(reader *bufio.Reader) (size int,
	text, freqText, pos string, fsErr error)

ReadN read the tokens by '\n'

func (*Segmenter) Reader added in v0.66.0

func (seg *Segmenter) Reader(reader *bufio.Reader, files ...string) error

Reader load dictionary from io.Reader

func (*Segmenter) RemoveStop added in v0.63.0

func (seg *Segmenter) RemoveStop(text string)

RemoveStop remove a token from the StopWord dictionary.

func (*Segmenter) RemoveToken

func (seg *Segmenter) RemoveToken(text string) error

RemoveToken remove token in dictionary

func (*Segmenter) Segment

func (seg *Segmenter) Segment(bytes []byte) []Segment

Segment use shortest path to segment the text

input parameter:

bytes	UTF8 text []byte


[]Segment	returns the segments result

func (*Segmenter) Size added in v0.66.0

func (seg *Segmenter) Size(size int, text, freqText string) (freq float64)

Size frequency is calculated based on the size of the text

func (*Segmenter) Slice

func (seg *Segmenter) Slice(s string, searchMode ...bool) []string

Slice uses modeSegment to segment and returns []string, using search mode if searchMode is true

func (*Segmenter) SplitTextToWords added in v0.64.0

func (seg *Segmenter) SplitTextToWords(text Text) []Text

SplitTextToWords splits a string to token words

func (*Segmenter) Stop added in v0.70.0

func (seg *Segmenter) Stop(s []string) (r []string)

Stop trim []string stop word

func (*Segmenter) String

func (seg *Segmenter) String(s string, searchMode ...bool) string

String uses modeSegment to segment and returns a string, using search mode if searchMode is true

func (*Segmenter) SuggestFreq added in v0.60.0

func (seg *Segmenter) SuggestFreq(words ...string) float64

SuggestFreq suggests the word frequency, returning a suggested frequency of a word cut into short words.

func (*Segmenter) ToToken added in v0.69.7

func (seg *Segmenter) ToToken(text string, freq float64, pos ...string) Token

ToToken make the text, freq and pos to token structure

func (*Segmenter) Trim

func (seg *Segmenter) Trim(s []string) (r []string)

Trim trim []string exclude symbol, space and punct

func (*Segmenter) TrimPos added in v0.60.0

func (seg *Segmenter) TrimPos(s []SegPos) (r []SegPos)

TrimPos trim SegPos not symbol, space and punct

func (*Segmenter) TrimPosPunct added in v0.60.0

func (seg *Segmenter) TrimPosPunct(se []SegPos) (re []SegPos)

TrimPosPunct trim SegPos not space and punct

func (*Segmenter) TrimPunct added in v0.60.0

func (seg *Segmenter) TrimPunct(s []string) (r []string)

TrimPunct trim []string exclude space and punct

func (*Segmenter) TrimSymbol added in v0.62.1

func (seg *Segmenter) TrimSymbol(s []string) (r []string)

TrimSymbol trim []string exclude symbol, space and punct

func (*Segmenter) TrimWithPos added in v0.60.0

func (seg *Segmenter) TrimWithPos(se []SegPos, pos ...string) (re []SegPos)

TrimWithPos trim some seg with pos

func (*Segmenter) Value added in v0.60.0

func (seg *Segmenter) Value(str string) (int, int, error)

Value find word in dictionary return word's value

type Text

type Text []byte

Text a string type, used to parse text: 1. a word, such as "world" or "boundary", in English a word is a word 2. a participle, such as "world" a.k.a. "population" 3. a text, such as "the world has seven billion people"

func SplitWords added in v0.64.1

func SplitWords(text Text) []Text

SplitWords splits a string to token words

type Token

type Token struct {
	// contains filtered or unexported fields

Token define a segment token structure

func (*Token) Equals

func (token *Token) Equals(str string) bool

Equals compare str split tokens

func (*Token) Freq added in v0.69.7

func (token *Token) Freq() float64

Freq returns the frequency in the dictionary token

func (*Token) Pos

func (token *Token) Pos() string

Pos returns the part of speech in the dictionary token

func (*Token) Segments

func (token *Token) Segments() []*Segment

Segments will segment further subdivisions of the text of this participle, the participle has two subclauses.

Subclauses can also have further subclauses forming a tree structure, which can be traversed to get all the detailed subdivisions of the participle, which is mainly Used by search engines to perform full-text searches on a piece of text.

func (*Token) Text

func (token *Token) Text() string

Text return the text of the segment


Path Synopsis
Package hmm is the Golang HMM cut module
Package hmm is the Golang HMM cut module
Package pos model data The data from
Package pos model data The data from

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL