gojieba

package module

Version: v1.3.0 (latest) · Published: Feb 5, 2023 · License: MIT · Imports: 8 · Imported by: 119

Details

Valid go.mod file

The Go module system was introduced in Go 1.11 and is the official dependency management solution for Go.
Redistributable license

Redistributable licenses place minimal restrictions on how software can be used, modified, and redistributed.
Tagged version

Modules with tagged versions give importers more predictable builds.
Stable version

When a project reaches major version v1 it is considered stable.
Learn more about best practices

Repository

github.com/yanyiwu/gojieba

Links

Open Source Insights

README ¶

GoJieba English

GoJieba是"结巴"中文分词的Golang语言版本。

简介

支持多种分词方式，包括: 最大概率模式, HMM新词发现模式, 搜索引擎模式, 全模式
核心算法底层由C++实现，性能高效。
字典路径可配置，NewJieba(...string), NewExtractor(...string) 可变形参，当参数为空时使用默认词典(推荐方式)

用法

go get github.com/yanyiwu/gojieba

分词示例

package main

import (
	"fmt"
	"strings"

	"github.com/yanyiwu/gojieba"
)

// main walks through the segmentation, tagging, tokenizing and keyword
// extraction calls of a single Jieba instance and prints each result.
func main() {
	var s string
	var words []string
	use_hmm := true
	// With no arguments NewJieba uses the default dictionaries (see the
	// introduction above); Free is deferred so it runs when main returns.
	x := gojieba.NewJieba()
	defer x.Free()

	// Full mode ("全模式"): CutAll.
	s = "我来到北京清华大学"
	words = x.CutAll(s)
	fmt.Println(s)
	fmt.Println("全模式:", strings.Join(words, "/"))

	// Precise mode ("精确模式"): Cut with the hmm argument set to true.
	words = x.Cut(s, use_hmm)
	fmt.Println(s)
	fmt.Println("精确模式:", strings.Join(words, "/"))
	// Cut the same word before and after AddWord to show the effect of adding
	// it to the dictionary.
	s = "比特币"
	words = x.Cut(s, use_hmm)
	fmt.Println(s)
	fmt.Println("精确模式:", strings.Join(words, "/"))

	x.AddWord("比特币")
	// `AddWordEx` lets you specify the word's weight; it complements `AddWord`
	// for cases where adding a word fails because its weight is too low.
	// The `tag` argument may be an empty string or a part-of-speech tag.
	// x.AddWordEx("比特币", 100000, "")
	s = "比特币"
	words = x.Cut(s, use_hmm)
	fmt.Println(s)
	fmt.Println("添加词典后,精确模式:", strings.Join(words, "/"))

	// New-word recognition ("新词识别"): Cut with hmm enabled.
	s = "他来到了网易杭研大厦"
	words = x.Cut(s, use_hmm)
	fmt.Println(s)
	fmt.Println("新词识别:", strings.Join(words, "/"))

	// Search-engine mode ("搜索引擎模式"): CutForSearch.
	s = "小明硕士毕业于中国科学院计算所，后在日本京都大学深造"
	words = x.CutForSearch(s, use_hmm)
	fmt.Println(s)
	fmt.Println("搜索引擎模式:", strings.Join(words, "/"))

	// Part-of-speech tagging ("词性标注"): Tag returns "word/tag" strings.
	s = "长春市长春药店"
	words = x.Tag(s)
	fmt.Println(s)
	fmt.Println("词性标注:", strings.Join(words, ","))

	s = "区块链"
	words = x.Tag(s)
	fmt.Println(s)
	fmt.Println("词性标注:", strings.Join(words, ","))

	// From here on the hmm argument is passed as false (!use_hmm).
	s = "长江大桥"
	words = x.CutForSearch(s, !use_hmm)
	fmt.Println(s)
	fmt.Println("搜索引擎模式:", strings.Join(words, "/"))

	// Tokenize returns Word values (text plus start/end offsets); run it once
	// per TokenizeMode on the same input.
	wordinfos := x.Tokenize(s, gojieba.SearchMode, !use_hmm)
	fmt.Println(s)
	fmt.Println("Tokenize:(搜索引擎模式)", wordinfos)

	wordinfos = x.Tokenize(s, gojieba.DefaultMode, !use_hmm)
	fmt.Println(s)
	fmt.Println("Tokenize:(默认模式)", wordinfos)

	// Keyword extraction, asking for at most 5 (topk) keywords with weights.
	keywords := x.ExtractWithWeight(s, 5)
	fmt.Println("Extract:", keywords)
}

我来到北京清华大学
全模式: 我/来到/北京/清华/清华大学/华大/大学
我来到北京清华大学
精确模式: 我/来到/北京/清华大学
比特币
精确模式: 比特/币
比特币
添加词典后,精确模式: 比特币
他来到了网易杭研大厦
新词识别: 他/来到/了/网易/杭研/大厦
小明硕士毕业于中国科学院计算所，后在日本京都大学深造
搜索引擎模式: 小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/，/后/在/日本/京都/大学/日本京都大学/深造
长春市长春药店
词性标注: 长春市/ns,长春/ns,药店/n
区块链
词性标注: 区块链/nz
长江大桥
搜索引擎模式: 长江/大桥/长江大桥
长江大桥
Tokenize:(搜索引擎模式) [{长江 0 6} {大桥 6 12} {长江大桥 0 12}]

See example in jieba_test, extractor_test

Benchmark

Jieba中文分词系列性能评测

Unittest

go test ./...

Benchmark

go test -bench "Jieba" -test.benchtime 10s
go test -bench "Extractor" -test.benchtime 10s

Contributors

Code Contributors

This project exists thanks to all the people who contribute.

Contact

Email: i@yanyiwu.com

Documentation ¶

Examples ¶

Jieba

Constants ¶

View Source

// TOTAL_DICT_PATH_NUMBER is the constant 5.
// NOTE(review): presumably the number of dictionary paths NewJieba works with
// (DICT_PATH, HMM_PATH, USER_DICT_PATH, IDF_PATH, STOP_WORDS_PATH) — confirm
// against the package source.
const TOTAL_DICT_PATH_NUMBER = 5

Variables ¶

View Source

// Package-level dictionary locations. The Jieba example states that calling
// NewJieba() with no arguments equals
// NewJieba(DICT_PATH, HMM_PATH, USER_DICT_PATH).
// NOTE(review): how and where these variables are initialised is not visible
// here; presumably the individual paths are derived from DICT_DIR — confirm.
var (
	DICT_DIR        string
	DICT_PATH       string
	HMM_PATH        string
	USER_DICT_PATH  string
	IDF_PATH        string
	STOP_WORDS_PATH string
)

Functions ¶

This section is empty.

Types ¶

type Jieba ¶

// Jieba is the handle returned by NewJieba. Its methods provide word
// segmentation (Cut, CutAll, CutForSearch), part-of-speech tagging (Tag),
// tokenization with offsets (Tokenize), keyword extraction (Extract,
// ExtractWithWeight) and dictionary edits (AddWord, AddWordEx, RemoveWord).
// The examples call Free, via defer, right after constructing an instance.
type Jieba struct {
	// contains filtered or unexported fields
}

Example ¶

// Example body: exercises every Jieba method on fixed inputs; the expected
// text is listed under "Output:" below.
var s string
var words []string
use_hmm := true
// Equivalent to x := NewJieba(DICT_PATH, HMM_PATH, USER_DICT_PATH).
// NOTE(review): TOTAL_DICT_PATH_NUMBER is 5, so IDF_PATH and STOP_WORDS_PATH
// are presumably defaulted as well — confirm.
x := NewJieba()
defer x.Free()

// Full mode ("全模式").
s = "我来到北京清华大学"
words = x.CutAll(s)
fmt.Println(s)
fmt.Println("全模式:", strings.Join(words, "/"))

// Precise mode ("精确模式") with the hmm argument set to true.
words = x.Cut(s, use_hmm)
fmt.Println(s)
fmt.Println("精确模式:", strings.Join(words, "/"))

// Before AddWord the expected output splits this input into two words.
s = "比特币"
words = x.Cut(s, use_hmm)
fmt.Println(s)
fmt.Println("精确模式:", strings.Join(words, "/"))

// After AddWord the expected output keeps it as a single word.
x.AddWord("比特币")
s = "比特币"
words = x.Cut(s, use_hmm)
fmt.Println(s)
fmt.Println("添加词典后,精确模式:", strings.Join(words, "/"))

// With the whole phrase added as a word, Extract is expected to return it
// as one keyword.
x.AddWord("这是一个很长的关键字")
s = "这是一个很长的关键字"
words = x.Extract(s, 3)
fmt.Println(s)
fmt.Println("添加词典后,Extract:", strings.Join(words, "/"))

// After RemoveWord, Extract is expected to return the shorter keywords again.
x.RemoveWord("这是一个很长的关键字")
s = "这是一个很长的关键字"
words = x.Extract(s, 3)
fmt.Println(s)
fmt.Println("从词典删除后,Extract:", strings.Join(words, "/"))

// New-word recognition ("新词识别"): Cut with hmm enabled.
s = "他来到了网易杭研大厦"
words = x.Cut(s, use_hmm)
fmt.Println(s)
fmt.Println("新词识别:", strings.Join(words, "/"))

// Search-engine mode ("搜索引擎模式").
s = "小明硕士毕业于中国科学院计算所，后在日本京都大学深造"
words = x.CutForSearch(s, use_hmm)
fmt.Println(s)
fmt.Println("搜索引擎模式:", strings.Join(words, "/"))

// Part-of-speech tagging ("词性标注"): each result is "word/tag".
s = "长春市长春药店"
words = x.Tag(s)
fmt.Println(s)
fmt.Println("词性标注:", strings.Join(words, ","))

s = "区块链"
words = x.Tag(s)
fmt.Println(s)
fmt.Println("词性标注:", strings.Join(words, ","))

// From here on the hmm argument is passed as false (!use_hmm).
s = "长江大桥"
words = x.CutForSearch(s, !use_hmm)
fmt.Println(s)
fmt.Println("搜索引擎模式:", strings.Join(words, "/"))

// Tokenize in SearchMode returns each word together with its start and end
// offsets.
wordinfos := x.Tokenize(s, SearchMode, !use_hmm)
fmt.Println(s)
fmt.Println("Tokenize:", wordinfos)

Output:

我来到北京清华大学
全模式: 我/来到/北京/清华/清华大学/华大/大学
我来到北京清华大学
精确模式: 我/来到/北京/清华大学
比特币
精确模式: 比特/币
比特币
添加词典后,精确模式: 比特币
这是一个很长的关键字
添加词典后,Extract: 这是一个很长的关键字
这是一个很长的关键字
从词典删除后,Extract: 关键字/很长/这是
他来到了网易杭研大厦
新词识别: 他/来到/了/网易/杭研/大厦
小明硕士毕业于中国科学院计算所，后在日本京都大学深造
搜索引擎模式: 小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/，/后/在/日本/京都/大学/日本京都大学/深造
长春市长春药店
词性标注: 长春市/ns,长春/ns,药店/n
区块链
词性标注: 区块链/nz
长江大桥
搜索引擎模式: 长江/大桥/长江大桥
长江大桥
Tokenize: [{长江 0 6} {大桥 6 12} {长江大桥 0 12}]

func NewJieba ¶

func NewJieba(paths ...string) *Jieba

func (*Jieba) AddWord ¶

func (x *Jieba) AddWord(s string)

func (*Jieba) AddWordEx ¶ added in v1.3.0

func (x *Jieba) AddWordEx(s string, freq int, tag string)

func (*Jieba) Cut ¶

func (x *Jieba) Cut(s string, hmm bool) []string

func (*Jieba) CutAll ¶

func (x *Jieba) CutAll(s string) []string

func (*Jieba) CutForSearch ¶

func (x *Jieba) CutForSearch(s string, hmm bool) []string

func (*Jieba) Extract ¶

func (x *Jieba) Extract(s string, topk int) []string

func (*Jieba) ExtractWithWeight ¶

func (x *Jieba) ExtractWithWeight(s string, topk int) []WordWeight

func (*Jieba) Free ¶

func (x *Jieba) Free()

func (*Jieba) RemoveWord ¶ added in v1.2.0

func (x *Jieba) RemoveWord(s string)

func (*Jieba) Tag ¶

func (x *Jieba) Tag(s string) []string

func (*Jieba) Tokenize ¶

func (x *Jieba) Tokenize(s string, mode TokenizeMode, hmm bool) []Word

type TokenizeMode ¶

// TokenizeMode is the mode argument accepted by (*Jieba).Tokenize.
type TokenizeMode int

// Tokenize modes. The README example labels the DefaultMode result
// "默认模式" (default mode) and the SearchMode result "搜索引擎模式"
// (search-engine mode).
const (
	DefaultMode TokenizeMode = iota
	SearchMode
)

type Word ¶

// Word is one token returned by (*Jieba).Tokenize: the token text and its
// position in the input.
// NOTE(review): the sample output prints {长江 0 6} {大桥 6 12} for two
// two-character words, so Start and End look like UTF-8 byte offsets with End
// exclusive — confirm against the package source.
type Word struct {
	Str   string
	Start int
	End   int
}

type WordWeight ¶

// WordWeight is one entry returned by (*Jieba).ExtractWithWeight: a keyword
// paired with a floating-point weight.
// NOTE(review): how Weight is computed and how results are ordered is not
// visible here — confirm before relying on it.
type WordWeight struct {
	Word   string
	Weight float64
}

Source Files ¶

View all Source files

Directories ¶

Path	Synopsis
deps
cppjieba
limonp
dict

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL