types

package

Version: v0.0.0-...-6df956e (latest) | Published: Aug 9, 2013 | License: Apache-2.0 | Imports: 3 | Imported by: 0

Details

Valid go.mod file

The Go module system was introduced in Go 1.11 and is the official dependency management solution for Go.
Redistributable license

Redistributable licenses place minimal restrictions on how software can be used, modified, and redistributed.
Tagged version

Modules with tagged versions give importers more predictable builds.
Stable version

When a project reaches major version v1 it is considered stable.
Learn more about best practices

Repository

github.com/neeke/wukong

Links

Open Source Insights

Documentation ¶

Constants ¶

View Source

// These constants define the kind of data stored in the inverted index table.
const (
	// DocIdsIndex stores only the docId of each document.
	DocIdsIndex = iota

	// FrequenciesIndex stores keyword frequencies, which are used to
	// compute BM25.
	FrequenciesIndex

	// LocationsIndex stores the specific byte positions (possibly several)
	// at which a keyword occurs in a document. Keyword proximity data is
	// only available when this index type is used.
	LocationsIndex
)

这些常数定义了反向索引表存储的数据类型

Variables ¶

This section is empty.

Functions ¶

This section is empty.

Types ¶

type BM25Parameters ¶

// BM25Parameters holds the K1 and B parameters used for BM25; see
// http://en.wikipedia.org/wiki/Okapi_BM25. The default values are defined in
// engine_init_options.go.
type BM25Parameters struct {
	K1, B float32
}

见 http://en.wikipedia.org/wiki/Okapi_BM25 。默认值见 engine_init_options.go 。

type DocumentIndex ¶

// DocumentIndex holds the DocId, token length and keyword index entries of
// one document.
type DocumentIndex struct {
	// DocId of the text.
	DocId uint64

	// Token (keyword) length of the text.
	TokenLength float32

	// Index keys added for this document.
	Keywords []KeywordIndex
}

type DocumentIndexData ¶

// DocumentIndexData carries the content, labels and scoring fields of a
// document.
type DocumentIndexData struct {
	// Full text of the document, used to generate the keywords to be indexed.
	Content string

	// Document labels, such as the document's category attributes. These
	// labels do not appear in the document text.
	Labels []string

	// Scoring fields of the document; a struct of any type is accepted.
	Fields interface{}
}

type EngineInitOptions ¶

// EngineInitOptions collects the engine's initialization options. Its Init
// method replaces any option the user left unset with a default value.
type EngineInitOptions struct {
	// Dictionary files separated by half-width (ASCII) commas. For detailed
	// usage see the comment on sego.Segmenter.LoadDictionary.
	SegmenterDictionaries string

	// Stop token file.
	StopTokenFile string

	// Number of segmenter threads.
	NumSegmenterThreads int

	// Number of shards for the indexer and the ranker.
	// Documents being retrieved/ranked are distributed evenly across the shards.
	NumShards int

	// Channel buffer length of the indexer.
	IndexerBufferLength int

	// Number of threads allocated to each indexer shard.
	NumIndexerThreadsPerShard int

	// Channel buffer length of the ranker.
	RankerBufferLength int

	// Number of threads allocated to each ranker shard.
	NumRankerThreadsPerShard int

	// Indexer initialization options.
	IndexerInitOptions *IndexerInitOptions

	// Default search options.
	DefaultRankOptions *RankOptions
}

func (*EngineInitOptions) Init ¶

func (options *EngineInitOptions) Init()

初始化EngineInitOptions，当用户未设定某个选项的值时用默认值取代

type IndexedDocument ¶

// IndexedDocument is the result returned by the indexer.
type IndexedDocument struct {
	DocId uint64

	// BM25 value. Only valid when the index type is FrequenciesIndex or
	// LocationsIndex.
	BM25 float32

	// Proximity distance of the keywords within the document; see the comment
	// on computeTokenProximity for what the distance means.
	// Only valid when the index type is LocationsIndex.
	TokenProximity int32

	// Keyword positions obtained from the proximity computation. It has the
	// same length as the tokens passed to Lookup and corresponds to them
	// one to one.
	// Only valid when the index type is LocationsIndex.
	TokenSnippetLocations []int

	// Specific positions of the keywords in the text.
	// Only valid when the index type is LocationsIndex.
	TokenLocations [][]int
}

索引器返回结果

type IndexerInitOptions ¶

// IndexerInitOptions holds the options used to initialize the indexer.
type IndexerInitOptions struct {
	// Type of the index table; see the index type constants (DocIdsIndex,
	// FrequenciesIndex, LocationsIndex).
	IndexType int

	// BM25 parameters.
	BM25Parameters *BM25Parameters
}

初始化索引器选项

type KeywordIndex ¶

// KeywordIndex is an inverted index entry; it effectively labels one
// (search key, document) pair.
type KeywordIndex struct {
	// Text of the search key.
	Text string

	// Frequency of the search key.
	Frequency float32

	// Starting byte positions of the search key in the document, in
	// ascending order.
	Starts []int
}

反向索引项，这实际上标注了一个（搜索键，文档）对。

type RankByBM25 ¶

// RankByBM25 is a simple scoring rule: the score of a document is its BM25.
type RankByBM25 struct{}

一个简单的评分规则，文档分数为BM25

func (RankByBM25) Score ¶

func (rule RankByBM25) Score(doc IndexedDocument, fields interface{}) []float32

type RankOptions ¶

// RankOptions holds the options that control how search results are scored,
// ordered and truncated.
type RankOptions struct {
	// Scoring rule for documents. When nil, the rule set at Engine
	// initialization is used.
	ScoringCriteria ScoringCriteria

	// By default (ReverseOrder=false) results are sorted by score from
	// highest to lowest; otherwise from lowest to highest.
	ReverseOrder bool

	// Position of the first result to output.
	OutputOffset int

	// Maximum number of search results to output; 0 means unlimited.
	MaxOutputs int
}

type ScoredDocument ¶

// ScoredDocument is a document together with its scores and keyword positions.
type ScoredDocument struct {
	DocId uint64

	// Scores of the document.
	// Search results are sorted by Scores: first by the first value, then by
	// the second value when the first values are equal, and so on.
	Scores []float32

	// Byte positions in the text of the keywords used to generate the snippet.
	// This slice has the same length as SearchResponse.Tokens.
	// Non-empty only when IndexType == LocationsIndex.
	TokenSnippetPositions []int

	// Positions at which the keywords occur.
	// Non-empty only when IndexType == LocationsIndex.
	TokenPositions [][]int
}

type ScoredDocuments ¶

// ScoredDocuments is a slice of ScoredDocument. Len, Less and Swap methods are
// declared on it (presumably so it can be sorted via sort.Interface — confirm
// against the implementation).
type ScoredDocuments []ScoredDocument

func (ScoredDocuments) Len ¶

func (docs ScoredDocuments) Len() int

func (ScoredDocuments) Less ¶

func (docs ScoredDocuments) Less(i, j int) bool

func (ScoredDocuments) Swap ¶

func (docs ScoredDocuments) Swap(i, j int)

type ScoringCriteria ¶

// ScoringCriteria is the generic interface for scoring rules.
type ScoringCriteria interface {
	// Score scores a document. When documents are ranked, the first score
	// is compared first; if it is equal the comparison moves on to the
	// second score, and so on.
	// Returning an empty slice indicates that the document should be removed
	// from the final ranked results.
	Score(doc IndexedDocument, fields interface{}) []float32
}

评分规则通用接口

type SearchRequest ¶

// SearchRequest describes a search: the query, optional filters, rank options
// and timeout.
type SearchRequest struct {
	// Search phrase; it will be segmented into tokens.
	// When it is the empty string, the keywords are read from Tokens below.
	Text string

	// Keywords. When Text is not empty, Text takes precedence.
	// Normally you do not need to specify keywords yourself, unless you run
	// your own segmenter.
	Tokens []string

	// Document labels. Labels do not appear in the document text but are
	// also a kind of search key.
	Labels []string

	// When not empty, only these documents are searched.
	DocIds []uint64

	// Rank options.
	RankOptions *RankOptions

	// Timeout in milliseconds (thousandths of a second). No timeout is set
	// when this value is less than or equal to zero.
	// Partial ranked results may still be returned when the search times out.
	Timeout int
}

type SearchResponse ¶

// SearchResponse holds the outcome of a search.
type SearchResponse struct {
	// Keywords used in the search.
	Tokens []string

	// Documents found, already sorted.
	Docs []ScoredDocument

	// Whether the search timed out. Partial results may still be returned
	// on timeout.
	Timeout bool
}

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL