Go efficient text segmentation; supports English, Chinese, Japanese, and other languages.

Build Status CircleCI Status codecov Build Status Go Report Card GoDoc GitHub release


The dictionary is implemented with a double-array trie (Double-Array Trie). The segmenter algorithm is a shortest-path search based on word frequency plus dynamic programming; DAG- and HMM-based word segmentation algorithms are also provided.

Supports multiple word segmentation modes (common, search engine, full mode, precise mode, and HMM mode), as well as user dictionaries, POS tagging, and running as a JSON RPC service.

Supports HMM text segmentation using the Viterbi algorithm.

Text segmentation speed: 9.2MB/s single-threaded, 26.8MB/s with concurrent goroutines. HMM text segmentation: 3.2MB/s single-threaded. (Measured on a 2-core, 4-thread MacBook Pro.)


gse-bind provides bindings for JavaScript and other languages, so gse can be used from more languages.

Install / update

go get -u


go get -u
re gse

To create a new gse application

$ re gse my-gse
re run

To run the application we just created, you can navigate to the application folder and execute:

$ cd my-gse && re run


package main

import (


var (
	text = "Hello world, Helloworld. Winter is coming! 你好世界."

	new = gse.New("zh,testdata/test_dict3.txt", "alpha")

	seg gse.Segmenter
	posSeg pos.Segmenter

func cut() {
	hmm := new.Cut(text, true)
	fmt.Println("cut use hmm: ", hmm)

	hmm = new.CutSearch(text, true)
	fmt.Println("cut search use hmm: ", hmm)

	hmm = new.CutAll(text)
	fmt.Println("cut all: ", hmm)

func main() {


func posAndTrim(cut []string) {
	cut = seg.Trim(cut)
	fmt.Println("cut all: ", cut)

	po := posSeg.Cut(text, true)
	fmt.Println("pos: ", po)

	po = posSeg.TrimWithPos(po, "zg")
	fmt.Println("trim pos: ", po)

func cutPos() {
	fmt.Println(seg.String(text, true))
	fmt.Println(seg.Slice(text, true))

	po := seg.Pos(text, true)
	fmt.Println("pos: ", po)
	po = seg.TrimPos(po)
	fmt.Println("trim pos: ", po)

func segCut() {
	// Loading the default dictionary
	// Load the dictionary
	// seg.LoadDict("your gopath"+"/src/")

	// Text Segmentation
	tb := []byte(text)
	fmt.Println(seg.String(text, true))

	segments := seg.Segment(tb)

	// Handle word segmentation results
	// Two segmentation modes are supported: normal mode and search mode;
	// see the comments on the ToString function in the code.
	// The search mode is mainly used to provide search engines
	// with as many keywords as possible
	fmt.Println(gse.ToString(segments, true))

Look at a custom dictionary example

package main

import (


func main() {
	var seg gse.Segmenter
	// seg.LoadDictEmbed()
	// seg.LoadStopEmbed()

	text1 := "你好世界, Hello world"
	fmt.Println(seg.String(text1, true))

	segments := seg.Segment([]byte(text1))

Look at a Chinese example

Look at a Japanese example



Gse is primarily distributed under the terms of both the MIT license and the Apache License (Version 2.0). Thanks to sego and jieba (jiebago).



    Package gse Go efficient text segmentation, Go 语言高性能分词



    View Source
    const (
    	// RatioWord ratio words and letters
    	RatioWord float32 = 1.5
    	// RatioWordFull full ratio words and letters
    	RatioWordFull float32 = 1
    View Source
    const (
    	// Version get the gse version
    	Version = "v0.62.1.510, Green Lake!"


    View Source
    var StopWordMap = map[string]bool{
    	" ": true,

      StopWordMap default contains some stop words.

      View Source
      var (
      	// ToLower set alpha tolower
      	ToLower = true


      func DictPaths

      func DictPaths(dictDir, filePath string) (files []string)

        DictPaths get the dict's paths

        func FilterEmoji

        func FilterEmoji(text string) (new string)

          FilterEmoji filter the emoji

          func FilterHtml

          func FilterHtml(text string) string

            FilterHtml filter the html tag

            func FilterLang

            func FilterLang(text, lang string) (new string)

              FilterLang filter the language

              func FilterSymbol

              func FilterSymbol(text string) (new string)

                FilterSymbol filter the symbol

                func GetCurrentFilePath

                func GetCurrentFilePath() string

                  GetCurrentFilePath get current file path

                  func GetIdfPath

                  func GetIdfPath(files ...string) []string

                    GetIdfPath get the idf path

                    func GetVersion

                    func GetVersion() string

                      GetVersion get the gse version

                      func IsJp

                      func IsJp(segText string) bool

                        IsJp is jp char return true

                        func Join

                        func Join(text []Text) string

                          Join is better string splicing

                          func Range

                          func Range(text string) (new []string)

                            Range range text to []string

                            func RangeText

                            func RangeText(text string) (new string)

                              RangeText range text to string

                              func SplitNum

                              func SplitNum(text string) []string

                                SplitNum cut string by num to []string

                                func SplitNums

                                func SplitNums(text string) string

                                  SplitNums cut string by num to string

                                  func ToSlice

                                  func ToSlice(segs []Segment, searchMode ...bool) (output []string)

                                    ToSlice segments to slice 输出分词结果到一个字符串 slice

                                    有两种输出模式,以 "山达尔星联邦共和国" 为例

                                    搜索模式(searchMode=true) 输出普通模式的再细致切分:
                                        "[山达尔星 联邦 共和 国 共和国 联邦共和国 山达尔星联邦共和国]"

                                    默认 searchMode=false 搜索模式主要用于给搜索引擎提供尽可能多的关键字,详情请见Token结构体的注释。

                                    func ToString

                                    func ToString(segs []Segment, searchMode ...bool) (output string)

                                      ToString segments to string 输出分词结果为字符串

                                      有两种输出模式,以 "山达尔星联邦共和国" 为例

                                      普通模式(searchMode=false)输出一个分词 "山达尔星联邦共和国/ns "
                                      搜索模式(searchMode=true) 输出普通模式的再细致切分:
                                          "山达尔星/nz 联邦/n 共和/nz 国/n 共和国/ns 联邦共和国/nt 山达尔星联邦共和国/ns "

                                      默认 searchMode=false 搜索模式主要用于给搜索引擎提供尽可能多的关键字,详情请见 Token 结构体的注释。


                                      type Dictionary

                                      type Dictionary struct {
                                      	Tokens []Token // 词典中所有的分词,方便遍历
                                      	// contains filtered or unexported fields

                                        Dictionary 结构体实现了一个字串双数组树, 一个分词可能出现在叶子节点也有可能出现在非叶节点

                                        func NewDict

                                        func NewDict() *Dictionary

                                          NewDict new dictionary

                                          func (*Dictionary) Find

                                          func (dict *Dictionary) Find(word []byte) (float64, bool)

                                            Find looks up a word in the dictionary and returns the word's frequency and whether it exists

                                            func (*Dictionary) LookupTokens

                                            func (dict *Dictionary) LookupTokens(
                                            	words []Text, tokens []*Token) (numOfTokens int)

                                              LookupTokens 在词典中查找和字元组 words 可以前缀匹配的所有分词 返回值为找到的分词数

                                              func (*Dictionary) MaxTokenLen

                                              func (dict *Dictionary) MaxTokenLen() int

                                                MaxTokenLen 词典中最长的分词

                                                func (*Dictionary) NumTokens

                                                func (dict *Dictionary) NumTokens() int

                                                  NumTokens 词典中分词数目

                                                  func (*Dictionary) RemoveToken

                                                  func (dict *Dictionary) RemoveToken(token Token) error

                                                    RemoveToken remove token in dictionary

                                                    func (*Dictionary) TotalFreq

                                                    func (dict *Dictionary) TotalFreq() float64

                                                      TotalFreq 词典中所有分词的频率之和

                                                      func (*Dictionary) Value

                                                      func (dict *Dictionary) Value(word []byte) (val, id int, err error)

                                                        Value finds a word in the dictionary and returns the word's value and id

                                                        type Prob

                                                        type Prob struct {
                                                        	B, E, M, S map[rune]float64

                                                          Prob type hmm model struct

                                                          type SegPos

                                                          type SegPos struct {
                                                          	Text, Pos string

                                                            SegPos represents a word with its POS

                                                            func ToPos

                                                            func ToPos(segs []Segment, searchMode ...bool) (output []SegPos)

                                                              ToPos segments to SegPos

                                                              type Segment

                                                              type Segment struct {
                                                              	Position int
                                                              	// contains filtered or unexported fields

                                                                Segment 文本中的一个分词

                                                                func (*Segment) End

                                                                func (s *Segment) End() int

                                                                  End 返回分词在文本中的结束字节位置(不包括该位置)

                                                                  func (*Segment) Start

                                                                  func (s *Segment) Start() int

                                                                    Start 返回分词在文本中的起始字节位置

                                                                    func (*Segment) Token

                                                                    func (s *Segment) Token() *Token

                                                                      Token 返回分词信息

                                                                      type Segmenter

                                                                      type Segmenter struct {
                                                                      	Dict *Dictionary
                                                                      	Load bool
                                                                      	// AlphaNum set splitTextToWords can add token
                                                                      	// when words in alphanum
                                                                      	// set up alphanum dictionary word segmentation
                                                                      	AlphaNum bool
                                                                      	Alpha    bool
                                                                      	Num      bool
                                                                      	// LoadNoFreq load not have freq dict word
                                                                      	LoadNoFreq bool
                                                                      	// MinTokenFreq load min freq token
                                                                      	MinTokenFreq float64
                                                                      	// TextFreq add token frequency when not specified freq
                                                                      	TextFreq string
                                                                      	// SkipLog set skip log print
                                                                      	SkipLog bool
                                                                      	MoreLog bool
                                                                      	// SkipPos skip PosStr pos
                                                                      	SkipPos bool
                                                                      	NotStop bool
                                                                      	// StopWordMap stop word map
                                                                      	StopWordMap map[string]bool

                                                                        Segmenter 分词器结构体

                                                                        func New

                                                                        func New(files ...string) Segmenter

                                                                          New return new gse segmenter

                                                                          func NewEmbed

                                                                          func NewEmbed(alpha ...string) (seg Segmenter)

                                                                            NewEmbed return new gse segmenter by embed dictionary

                                                                            func (*Segmenter) AddStop

                                                                            func (seg *Segmenter) AddStop(text string)

                                                                              AddStop add a token into StopWord dictionary.

                                                                              func (*Segmenter) AddToken

                                                                              func (seg *Segmenter) AddToken(text string, frequency float64, pos ...string) error

                                                                                AddToken add new text to token

                                                                                func (*Segmenter) AddTokenForce

                                                                                func (seg *Segmenter) AddTokenForce(text string, frequency float64, pos ...string)

                                                                                  AddTokenForce add new text to token and force time-consuming

                                                                                  func (*Segmenter) CalcToken

                                                                                  func (seg *Segmenter) CalcToken()

                                                                                    CalcToken calc the segmenter token

                                                                                    func (*Segmenter) Cut

                                                                                    func (seg *Segmenter) Cut(str string, hmm ...bool) []string

                                                                                      Cut cuts a str into words using accurate mode. Parameter hmm controls whether to use the HMM(Hidden Markov Model) or use the user's model.

                                                                                      func (*Segmenter) CutAll

                                                                                      func (seg *Segmenter) CutAll(str string) []string

                                                                                        CutAll cuts a str into words using full mode.

                                                                                        func (*Segmenter) CutSearch

                                                                                        func (seg *Segmenter) CutSearch(str string, hmm ...bool) []string

                                                                                          CutSearch cuts str into words using search engine mode.

                                                                                          func (*Segmenter) CutStr

                                                                                          func (seg *Segmenter) CutStr(str []string, separator ...string) (r string)

                                                                                            CutStr cut []string with Cut return string

                                                                                            func (*Segmenter) CutTrim

                                                                                            func (seg *Segmenter) CutTrim(str string, hmm ...bool) []string

                                                                                              CutTrim cut string and trim

                                                                                              func (*Segmenter) CutTrimHtml

                                                                                              func (seg *Segmenter) CutTrimHtml(str string, hmm ...bool) []string

                                                                                                CutTrimHtml cut string trim html and symbol return []string

                                                                                                func (*Segmenter) CutTrimHtmls

                                                                                                func (seg *Segmenter) CutTrimHtmls(str string, hmm ...bool) string

                                                                                                  CutTrimHtmls cut string trim html and symbol return string

                                                                                                  func (*Segmenter) CutUrl

                                                                                                  func (seg *Segmenter) CutUrl(str string, num ...bool) []string

                                                                                                    CutUrl cut url string trim symbol return []string

                                                                                                    func (*Segmenter) CutUrls

                                                                                                    func (seg *Segmenter) CutUrls(str string, num ...bool) string

                                                                                                      CutUrls cut url string trim symbol return string

                                                                                                      func (*Segmenter) Dictionary

                                                                                                      func (seg *Segmenter) Dictionary() *Dictionary

                                                                                                        Dictionary 返回分词器使用的词典

                                                                                                        func (*Segmenter) Find

                                                                                                        func (seg *Segmenter) Find(str string) (float64, bool)

                                                                                                          Find find word in dictionary return word's frequency and existence

                                                                                                          func (*Segmenter) HMMCut

                                                                                                          func (seg *Segmenter) HMMCut(str string) []string

                                                                                                            HMMCut cut sentence string use HMM with Viterbi

                                                                                                            func (*Segmenter) HMMCutMod

                                                                                                            func (seg *Segmenter) HMMCutMod(str string, prob ...map[rune]float64) []string

                                                                                                              HMMCutMod cut sentence string use HMM with Viterbi

                                                                                                              func (*Segmenter) Init

                                                                                                              func (seg *Segmenter) Init()

                                                                                                                Init init seg config

                                                                                                                func (*Segmenter) IsStop

                                                                                                                func (seg *Segmenter) IsStop(s string) bool

                                                                                                                  IsStop checks if a given word is stop word.

                                                                                                                  func (*Segmenter) LoadDict

                                                                                                                  func (seg *Segmenter) LoadDict(files ...string) error

                                                                                                                    LoadDict load the dictionary from the file

                                                                                                                    The dictionary format is (one line per token):

                                                                                                                    token text, frequency, part of speech

                                                                                                                    Multiple dictionary files can be loaded, with file names separated by "," or ", "; dictionaries listed first take priority when loading tokens,

                                                                                                                    such as: "user_dictionary.txt,common_dictionary.txt"

                                                                                                                    When a token appears in both the user dictionary and the `common dictionary`, the `user dictionary` is given priority.


                                                                                                                    可以载入多个词典文件,文件名用 "," 或 ", " 分隔,排在前面的词典优先载入分词,比如:


                                                                                                                    当一个分词既出现在用户词典也出现在 `通用词典` 中,则优先使用 `用户词典`。


                                                                                                                    分词文本 频率 词性

                                                                                                                    func (*Segmenter) LoadDictEmbed

                                                                                                                    func (seg *Segmenter) LoadDictEmbed() error

                                                                                                                      LoadDictEmbed load dictionary by embed file

                                                                                                                      func (*Segmenter) LoadDictMap

                                                                                                                      func (seg *Segmenter) LoadDictMap(dict []map[string]string) error

                                                                                                                        LoadDictMap load dictionary from []map[string]string

                                                                                                                        func (*Segmenter) LoadDictStr

                                                                                                                        func (seg *Segmenter) LoadDictStr(dict string) error

                                                                                                                          LoadDictStr load dictionary from string

                                                                                                                          func (*Segmenter) LoadModel

                                                                                                                          func (seg *Segmenter) LoadModel(prob ...map[rune]float64)

                                                                                                                            LoadModel load the hmm model

                                                                                                                            Use the user's model:

                                                                                                                            seg.LoadModel(B, E, M, S map[rune]float64)

                                                                                                                            func (*Segmenter) LoadStop

                                                                                                                            func (seg *Segmenter) LoadStop(files ...string) error

                                                                                                                              LoadStop load stop word files add token to map

                                                                                                                              func (*Segmenter) LoadStopArr

                                                                                                                              func (seg *Segmenter) LoadStopArr(dict []string)

                                                                                                                                LoadStopArr load stop word by []string

                                                                                                                                func (*Segmenter) LoadStopEmbed

                                                                                                                                func (seg *Segmenter) LoadStopEmbed() error

                                                                                                                                  LoadStopEmbed load stop dictionary from embed file

                                                                                                                                  func (*Segmenter) ModeSegment

                                                                                                                                  func (seg *Segmenter) ModeSegment(bytes []byte, searchMode ...bool) []Segment

                                                                                                                                    ModeSegment segment using search mode if searchMode is true

                                                                                                                                    func (*Segmenter) Pos

                                                                                                                                    func (seg *Segmenter) Pos(s string, searchMode ...bool) []SegPos

                                                                                                                                      Pos return text and pos array

                                                                                                                                      func (*Segmenter) PosStr

                                                                                                                                      func (seg *Segmenter) PosStr(str []SegPos, separator ...string) (r string)

                                                                                                                                        PosStr cut []SegPos with Pos return string

                                                                                                                                        func (*Segmenter) PosTrim

                                                                                                                                        func (seg *Segmenter) PosTrim(str string, search bool, pos ...string) []SegPos

                                                                                                                                          PosTrim cut string pos and trim

                                                                                                                                          func (*Segmenter) PosTrimArr

                                                                                                                                          func (seg *Segmenter) PosTrimArr(str string, search bool, pos ...string) (re []string)

                                                                                                                                            PosTrimArr cut string return pos.Text []string

                                                                                                                                            func (*Segmenter) PosTrimStr

                                                                                                                                            func (seg *Segmenter) PosTrimStr(str string, search bool, pos ...string) string

                                                                                                                                              PosTrimStr cut string return pos.Text string

                                                                                                                                              func (*Segmenter) Read

                                                                                                                                              func (seg *Segmenter) Read(file string) error

                                                                                                                                                Read read the dict file

                                                                                                                                                func (*Segmenter) Reader

                                                                                                                                                func (seg *Segmenter) Reader(reader io.Reader, files ...string) error

                                                                                                                                                  Reader load dictionary from io.Reader

                                                                                                                                                  func (*Segmenter) RemoveStop

                                                                                                                                                  func (seg *Segmenter) RemoveStop(text string)

                                                                                                                                                    RemoveStop removes a token from the StopWord dictionary.

                                                                                                                                                    func (*Segmenter) RemoveToken

                                                                                                                                                    func (seg *Segmenter) RemoveToken(text string) error

                                                                                                                                                      RemoveToken removes a token from the dictionary.

                                                                                                                                                      func (*Segmenter) Segment

                                                                                                                                                      func (seg *Segmenter) Segment(bytes []byte) []Segment

                                                                                                                                                        Segment 对文本分词


                                                                                                                                                        bytes	UTF8 文本的字节数组


                                                                                                                                                        []Segment	划分的分词

                                                                                                                                                        func (*Segmenter) Size

                                                                                                                                                        func (seg *Segmenter) Size(size int, text, freqText string) (frequency float64)

                                                                                                                                                          Size frequency is calculated based on the size of the text

                                                                                                                                                          func (*Segmenter) Slice

                                                                                                                                                          func (seg *Segmenter) Slice(s string, searchMode ...bool) []string

                                                                                                                                                            Slice uses modeSegment to segment the text and returns a []string, using search mode if searchMode is true.

                                                                                                                                                            func (*Segmenter) SplitTextToWords

                                                                                                                                                            func (seg *Segmenter) SplitTextToWords(text Text) []Text

                                                                                                                                                              SplitTextToWords 将文本划分成字元

                                                                                                                                                              func (*Segmenter) String

                                                                                                                                                              func (seg *Segmenter) String(s string, searchMode ...bool) string

                                                                                                                                                                String uses modeSegment to segment the text and returns a string, using search mode if searchMode is true.

                                                                                                                                                                func (*Segmenter) SuggestFreq

                                                                                                                                                                func (seg *Segmenter) SuggestFreq(words ...string) float64

                                                                                                                                                                  SuggestFreq suggests the words' frequency; it returns a suggested frequency for a word that is cut into several short words.

                                                                                                                                                                  func (*Segmenter) Trim

                                                                                                                                                                  func (seg *Segmenter) Trim(s []string) (r []string)

                                                                                                                                                                    Trim trim []string exclude symbol, space and punct

                                                                                                                                                                    func (*Segmenter) TrimPos

                                                                                                                                                                    func (seg *Segmenter) TrimPos(s []SegPos) (r []SegPos)

                                                                                                                                                                      TrimPos trim SegPos not symbol, space and punct

                                                                                                                                                                      func (*Segmenter) TrimPosPunct

                                                                                                                                                                      func (seg *Segmenter) TrimPosPunct(se []SegPos) (re []SegPos)

                                                                                                                                                                        TrimPosPunct trim SegPos not space and punct

                                                                                                                                                                        func (*Segmenter) TrimPunct

                                                                                                                                                                        func (seg *Segmenter) TrimPunct(s []string) (r []string)

                                                                                                                                                                          TrimPunct trim []string exclude space and punct

                                                                                                                                                                          func (*Segmenter) TrimSymbol

                                                                                                                                                                          func (seg *Segmenter) TrimSymbol(s []string) (r []string)

                                                                                                                                                                            TrimSymbol trim []string exclude symbol, space and punct

                                                                                                                                                                            func (*Segmenter) TrimWithPos

                                                                                                                                                                            func (seg *Segmenter) TrimWithPos(se []SegPos, pos ...string) (re []SegPos)

                                                                                                                                                                              TrimWithPos trim some seg with pos

                                                                                                                                                                              func (*Segmenter) Value

                                                                                                                                                                              func (seg *Segmenter) Value(str string) (int, int, error)

                                                                                                                                                                                Value find word in dictionary return word's value

                                                                                                                                                                                type Text

                                                                                                                                                                                type Text []byte

                                                                                                                                                                                  Text 字串类型,可以用来表达

                                                                                                                                                                                  1. 一个字元,比如 "世" 又如 "界", 英文的一个字元是一个词
                                                                                                                                                                                  2. 一个分词,比如 "世界" 又如 "人口"
                                                                                                                                                                                  3. 一段文字,比如 "世界有七十亿人口"

                                                                                                                                                                                  func SplitWords

                                                                                                                                                                                  func SplitWords(text Text) []Text

                                                                                                                                                                                    SplitWords 将文本划分成字元

                                                                                                                                                                                    type Token

                                                                                                                                                                                    type Token struct {
                                                                                                                                                                                    	// contains filtered or unexported fields

                                                                                                                                                                                      Token 一个分词

                                                                                                                                                                                      func (*Token) Equals

                                                                                                                                                                                      func (token *Token) Equals(str string) bool

                                                                                                                                                                                        Equals compare str split tokens

                                                                                                                                                                                        func (*Token) Frequency

                                                                                                                                                                                        func (token *Token) Frequency() float64

                                                                                                                                                                                          Frequency 返回分词在语料库中的词频

                                                                                                                                                                                          func (*Token) Pos

                                                                                                                                                                                          func (token *Token) Pos() string

                                                                                                                                                                                            Pos 返回分词词性标注

                                                                                                                                                                                            func (*Token) Segments

                                                                                                                                                                                            func (token *Token) Segments() []*Segment

                                                                                                                                                                                              Segments 该分词文本的进一步分词划分,比如 "山达尔星联邦共和国联邦政府" 这个分词 有两个子分词 "山达尔星联邦共和国 " 和 "联邦政府"。子分词也可以进一步有子分词 形成一个树结构,遍历这个树就可以得到该分词的所有细致分词划分,这主要 用于搜索引擎对一段文本进行全文搜索。

                                                                                                                                                                                              func (*Token) Text

                                                                                                                                                                                              func (token *Token) Text() string

                                                                                                                                                                                                Text 返回分词文本


                                                                                                                                                                                                Path Synopsis
                                                                                                                                                                                                Package hmm is the Golang HMM cut module Package hmm model data The data from
                                                                                                                                                                                                Package hmm is the Golang HMM cut module Package hmm model data The data from
                                                                                                                                                                                                Package pos model data The data from
                                                                                                                                                                                                Package pos model data The data from