types

package

v0.0.0-...-f4c30ac Latest Latest Go to latest Published: Oct 13, 2020 License: Apache-2.0 Imports: 3 Imported by: 178

Details

Valid go.mod file

The Go module system was introduced in Go 1.11 and is the official dependency management solution for Go.
Redistributable license

Redistributable licenses place minimal restrictions on how software can be used, modified, and redistributed.
Tagged version

Modules with tagged versions give importers more predictable builds.
Stable version

When a project reaches major version v1 it is considered stable.
Learn more about best practices

Repository

github.com/go-ego/riot

Links

Open Source Insights

Documentation ¶

Overview ¶

Package types defines the data types used by riot.

Index ¶

Constants
type Attri
type BM25Parameters
type BaseResp
type Content
type DocData
type DocIndex
type DocIndexData
type DocInfo
type DocInfosShard
type DocsId
- func (docs DocsId) Len() int
- func (docs DocsId) Less(i, j int) bool
- func (docs DocsId) Swap(i, j int)
type DocsIndex
- func (docs DocsIndex) Len() int
- func (docs DocsIndex) Less(i, j int) bool
- func (docs DocsIndex) Swap(i, j int)
type EngineOpts
- func (options *EngineOpts) Init()
type Expr
type IndexedDoc
type IndexerOpts
- func (options *IndexerOpts) Init()
type InvertedIndexShard
type KeywordIndex
type KeywordIndices
type Logic
type RankByBM25
- func (rule RankByBM25) Score(doc IndexedDoc, fields interface{}) []float32
type RankOpts
type ScoredDoc
type ScoredDocs
- func (docs ScoredDocs) Len() int
- func (docs ScoredDocs) Less(i, j int) bool
- func (docs ScoredDocs) Swap(i, j int)
type ScoredID
type ScoredIDs
- func (docs ScoredIDs) Len() int
- func (docs ScoredIDs) Less(i, j int) bool
- func (docs ScoredIDs) Swap(i, j int)
type ScoringCriteria
type SearchDoc
type SearchID
type SearchReq
type SearchResp
type TokenData

Constants ¶

View Source

// These constants define the kinds of data stored in the inverted index table.
const (
	// DocIdsIndex stores only the docId of each document.
	DocIdsIndex = 0

	// FrequenciesIndex stores keyword term frequencies, used to compute BM25.
	FrequenciesIndex = 1

	// LocsIndex stores the exact byte positions (possibly several) at which
	// a keyword occurs in a document.
	// If you want keyword proximity data, you must use a LocsIndex-type index.
	LocsIndex = 2
)

这些常数定义了反向索引表存储的数据类型

Variables ¶

This section is empty.

Functions ¶

This section is empty.

Types ¶

type Attri ¶

// Attri holds the attributes of a document.
// Each field is serialized to JSON under the key named in its struct tag.
type Attri struct {
	Title  string `json:"title"`
	Author string `json:"author"`
	Time   string `json:"time"`
	Ts     int64  `json:"ts"` // NOTE(review): presumably a numeric timestamp — confirm unit with callers
}

Attri doc attribute

type BM25Parameters ¶

// BM25Parameters holds the BM25 parameters; see
// http://en.wikipedia.org/wiki/Okapi_BM25.
// The default values are set in engine_init_options.go.
type BM25Parameters struct {
	K1 float32
	B  float32
}

BM25Parameters 见 http://en.wikipedia.org/wiki/Okapi_BM25 ，默认值见 engine_init_options.go

type BaseResp ¶

// BaseResp holds the fields common to search responses.
type BaseResp struct {
	// Keywords used in the search.
	Tokens []string

	// Whether the search timed out. Partial results may still be returned
	// when a timeout occurs.
	Timeout bool

	// Number of documents found. Note that this counts every document that
	// satisfies the query, so it may be larger than the number of documents
	// actually returned.
	NumDocs int
}

BaseResp search response options

type Content ¶

// Content is the search content.
type Content struct {
	// new: Content
	Content string

	// new: attributes (Attri)
	Attri interface{}

	// new: returned scoring fields
	Fields interface{}
}

Content search content

type DocData ¶

// DocData is the document index data struct.
type DocData struct {
	// Full text of the document (must be UTF-8), used to generate the
	// keywords to be indexed.
	Content string

	// new: category
	// Class string
	// new: attributes
	Attri interface{}

	// Keywords of the document.
	// When Content is not empty, keywords are first obtained by segmenting
	// Content, and Tokens are then added on top of them.
	// Tokens exists so that riot's built-in segmenter can be bypassed and
	// segmentation and preprocessing can be done outside the engine.
	// Tokens []*TokenData
	Tokens []TokenData

	// Document labels (must be UTF-8), such as the document's category
	// attributes; these labels do not appear in the document text.
	Labels []string

	// Scoring fields of the document; a struct of any type is accepted.
	Fields interface{}
}

DocData is the document index data struct.

type DocIndex ¶

// DocIndex is a document's index.
type DocIndex struct {
	// DocId is the DocId of the text.
	DocId string

	// TokenLen is the keyword length of the text.
	TokenLen float32

	// Keywords are the index keys that were added.
	Keywords []KeywordIndex
}

DocIndex document's index

type DocIndexData ¶

// DocIndexData is the document index data struct.
// It is a type alias, so DocIndexData and DocData are the same type.
type DocIndexData = DocData

DocIndexData is the document index data struct; it is an alias of DocData (cf. the non-alias form `type DocIndexData DocData`).

type DocInfo ¶

// DocInfo holds document info.
type DocInfo struct {
	Fields    interface{}
	TokenLens float32
}

DocInfo document info

type DocInfosShard ¶

// DocInfosShard holds document info for one shard, as a map of [id]info.
type DocInfosShard struct {
	DocInfos map[string]*DocInfo
	NumDocs  uint64 // This is actually an approximation of the total number of documents
	// Embedded read/write lock; presumably guards the fields above — TODO confirm.
	sync.RWMutex
}

DocInfosShard 文档信息[id]info

type DocsId ¶

// DocsId makes it convenient to delete document indexes in batch.
type DocsId []string

DocsId 方便批量删除文档索引 (cf. `type DocsId []uint64`)

func (DocsId) Len ¶

func (docs DocsId) Len() int

func (DocsId) Less ¶

func (docs DocsId) Less(i, j int) bool

func (DocsId) Swap ¶

func (docs DocsId) Swap(i, j int)

type DocsIndex ¶

// DocsIndex makes it convenient to add document indexes in batch.
type DocsIndex []*DocIndex

DocsIndex 方便批量加入文档索引

func (DocsIndex) Len ¶

func (docs DocsIndex) Len() int

func (DocsIndex) Less ¶

func (docs DocsIndex) Less(i, j int) bool

func (DocsIndex) Swap ¶

func (docs DocsIndex) Swap(i, j int)

type EngineOpts ¶

// EngineOpts holds the engine initialization options.
type EngineOpts struct {
	// Whether to use the segmenter.
	// It is used by default; otherwise the GseDict and StopTokenFile settings
	// are skipped during startup.
	// If you do not need segmentation inside the engine, set this option to true.
	// Note that if you do not use the segmenter, Content in DocIndexData is
	// ignored when IndexDoc is called.
	// Not use the gse segment
	NotUseGse bool `toml:"not_use_gse"`

	// new: segmentation rule
	Using int `toml:"using"`

	// Dictionary files separated by half-width commas ","; for details see
	// the comment on the gse.Segmenter.LoadDict function.
	GseDict   string `toml:"gse_dict"`
	PinYin    bool   `toml:"pin_yin"`
	UsePhrase bool   `toml:"use_phrase"`

	// Stop-word file
	StopTokenFile string `toml:"stop_file"`
	// Gse search mode
	GseMode bool   `toml:"gse_mode"`
	Hmm     bool   `toml:"hmm"`
	Model   string `toml:"model"`

	// Number of segmenter threads
	// NumSegmenterThreads int
	NumGseThreads int

	// Number of shards for the indexer and the ranker.
	// Documents being retrieved/ranked are distributed evenly across the shards.
	NumShards int

	// Channel buffer length of the indexer
	IndexerBufLen int

	// Number of threads allocated to each indexer shard
	NumIndexerThreads int

	// Channel buffer length of the ranker
	RankerBufLen int

	// Number of threads allocated to each ranker shard
	NumRankerThreads int

	// Indexer initialization options
	IndexerOpts *IndexerOpts

	// Default search options
	DefRankOpts *RankOpts

	// Whether to use a persistent database, plus the directory where the
	// database files are stored and the number of splits
	StoreOnly bool `toml:"store_only"`
	UseStore  bool `toml:"use_store"`

	StoreFolder string `toml:"store_folder"`
	StoreShards int    `toml:"store_shards"`
	StoreEngine string `toml:"store_engine"`

	IDOnly bool `toml:"id_only"`
}

EngineOpts init engine options

func (*EngineOpts) Init ¶

func (options *EngineOpts) Init()

Init initializes the engine options (EngineOpts): any option whose value the user has not set is replaced with its default value.

type Expr ¶

// Expr holds the logic expression options.
type Expr struct {

	// AND query: all of these must be present
	Must []string

	// OR query: it is enough for one of these to be present
	Should []string

	// NOT query: must not contain these
	NotIn []string
}

Expr logic expression options

type IndexedDoc ¶

// IndexedDoc is the result returned by the indexer.
type IndexedDoc struct {
	// DocId document id
	DocId string

	// BM25; a valid value is returned only when the index type is
	// FrequenciesIndex or LocsIndex.
	BM25 float32

	// TokenProximity is the proximity distance of the keywords in the document;
	// for the meaning of proximity distance see the comment on computeTokenProximity.
	// A valid value is returned only when the index type is LocsIndex.
	TokenProximity int32

	// TokenSnippetLocs are the keyword positions obtained from the proximity
	// distance computation; it has the same length as the tokens passed to
	// the Lookup function, in one-to-one correspondence.
	// A valid value is returned only when the index type is LocsIndex.
	TokenSnippetLocs []int

	// TokenLocs are the exact positions of the keywords in the text.
	// A valid value is returned only when the index type is LocsIndex.
	TokenLocs [][]int
}

IndexedDoc 索引器返回结果

type IndexerOpts ¶

// IndexerOpts holds the options used to initialize the indexer.
type IndexerOpts struct {
	// Type of the index table; see the DocIdsIndex, FrequenciesIndex and
	// LocsIndex constants.
	IndexType int

	// Cache size for documents waiting to be inserted into the index table
	DocCacheSize int

	// BM25 parameters
	BM25Parameters *BM25Parameters
}

IndexerOpts 初始化索引器选项

func (*IndexerOpts) Init ¶

func (options *IndexerOpts) Init()

Init init IndexerOpts

type InvertedIndexShard ¶

// InvertedIndexShard is an inverted index table, as a map of
// [keyword]inverted index row.
type InvertedIndexShard struct {
	InvertedIndex map[string]*KeywordIndices
	TotalTokenLen float32 // total number of keywords
	// Embedded read/write lock; presumably guards the fields above — TODO confirm.
	sync.RWMutex
}

InvertedIndexShard 反向索引表([关键词]反向索引表)

type KeywordIndex ¶

// KeywordIndex is an inverted index entry; it effectively marks one
// (search key, document) pair.
type KeywordIndex struct {
	// Text is the UTF-8 text of the search key.
	Text string

	// Frequency is the term frequency of the search key.
	Frequency float32

	// Starts are the starting byte positions of the search key in the
	// document, in ascending order.
	Starts []int
}

KeywordIndex 反向索引项，这实际上标注了一个（搜索键，文档）对。

type KeywordIndices ¶

// KeywordIndices is one row of the inverted index table; it collects all the
// documents in which a search key occurs, sorted by DocId in ascending order.
type KeywordIndices struct {
	// Whether the slices below are empty depends on the IndexType value at initialization
	DocIds      []uint64  // present for all index types
	Frequencies []float32 // IndexType == FrequenciesIndex
	Locations   [][]int   // IndexType == LocsIndex
}

KeywordIndices 反向索引表的一行，收集了一个搜索键出现的所有文档，按照 DocId 从小到大排序。

type Logic ¶

// Logic holds the logic options.
type Logic struct {

	// AND query: all must be present
	Must bool

	// OR query: it is enough for one to be present
	Should bool

	// NOT query: must not contain
	NotIn bool

	// Embedded Expr. Its Must/Should/NotIn string slices are shadowed by the
	// bool fields above, so they are reached as Logic.Expr.Must and so on.
	Expr
}

Logic logic options

type RankByBM25 ¶

// RankByBM25 is a simple scoring rule in which the document score is BM25.
type RankByBM25 struct {
}

RankByBM25 一个简单的评分规则，文档分数为BM25

func (RankByBM25) Score ¶

func (rule RankByBM25) Score(doc IndexedDoc, fields interface{}) []float32

Score score

type RankOpts ¶

// RankOpts holds the rank options.
type RankOpts struct {
	// Scoring rule for documents; when nil, the rule set at Engine
	// initialization is used.
	ScoringCriteria ScoringCriteria

	// By default (ReverseOrder = false) results are sorted by score from
	// high to low; otherwise from low to high.
	ReverseOrder bool

	// Index of the result from which output starts
	OutputOffset int

	// Maximum number of search results to output; 0 means no limit
	MaxOutputs int
}

RankOpts rank options

type ScoredDoc ¶

// ScoredDoc is a scored document: the embedded ScoredID plus the document's
// content, attributes and scoring fields.
type ScoredDoc struct {
	ScoredID

	// new: the returned document Content
	Content string
	// new: the returned document attributes (Attri)
	Attri interface{}
	// new: the returned scoring fields
	Fields interface{}
}

ScoredDoc scored the document

type ScoredDocs ¶

// ScoredDocs is a slice of ScoredDoc, defined for convenient sorting.
type ScoredDocs []ScoredDoc

ScoredDocs 为了方便排序

func (ScoredDocs) Len ¶

func (docs ScoredDocs) Len() int

func (ScoredDocs) Less ¶

func (docs ScoredDocs) Less(i, j int) bool

func (ScoredDocs) Swap ¶

func (docs ScoredDocs) Swap(i, j int)

type ScoredID ¶

// ScoredID is a scored document that carries only the document id.
type ScoredID struct {
	DocId string

	// Scores of the document.
	// Search results are sorted by the values in Scores: first by the first
	// number, then by the second number if those are equal, and so on.
	Scores []float32

	// Byte positions in the text of the keywords used to generate the snippet;
	// this slice has the same length as SearchResp.Tokens.
	// Non-empty only when IndexType == LocsIndex.
	TokenSnippetLocs []int

	// Positions at which the keywords occur.
	// Non-empty only when IndexType == LocsIndex.
	TokenLocs [][]int
}

ScoredID scored doc only id

type ScoredIDs ¶

// ScoredIDs is a slice of ScoredID, defined for convenient sorting.
type ScoredIDs []ScoredID

ScoredIDs 为了方便排序

func (ScoredIDs) Len ¶

func (docs ScoredIDs) Len() int

func (ScoredIDs) Less ¶

func (docs ScoredIDs) Less(i, j int) bool

func (ScoredIDs) Swap ¶

func (docs ScoredIDs) Swap(i, j int)

type ScoringCriteria ¶

// ScoringCriteria is the general interface for scoring rules.
type ScoringCriteria interface {
	// Score scores a document. When documents are sorted, the first score is
	// compared first; if those are equal, comparison moves on to the second
	// score, and so on.
	// Returning an empty slice indicates that the document should be removed
	// from the final ranked results.
	Score(doc IndexedDoc, fields interface{}) []float32
}

ScoringCriteria 评分规则通用接口

type SearchDoc ¶

// SearchDoc is a search response whose results are ScoredDoc values.
type SearchDoc struct {
	BaseResp
	// Documents found by the search, already sorted
	Docs []ScoredDoc
}

SearchDoc search response options

type SearchID ¶

// SearchID is a search response whose results are ScoredID values.
type SearchID struct {
	BaseResp
	// Documents found by the search, already sorted
	Docs []ScoredID
}

SearchID search response options

type SearchReq ¶

// SearchReq holds the search request options.
type SearchReq struct {
	// Search phrase (must be UTF-8); it will be segmented.
	// When it is the empty string, keywords are read from Tokens below.
	Text string

	// Keywords (must be UTF-8); when Text is not empty, Text takes priority.
	// Usually you do not need to specify keywords yourself, unless you run
	// your own segmentation program.
	Tokens []string

	// Document labels (must be UTF-8); labels do not exist in the document
	// text, but they are also a kind of search key.
	Labels []string

	// Logic is the logical retrieval expression.
	Logic Logic

	// When not nil, search only among the keys contained in DocIds
	// (the values are ignored).
	DocIds map[string]bool

	// Ranking options
	RankOpts *RankOpts

	// Timeout in milliseconds (thousandths of a second). No timeout is set
	// when this value is less than or equal to zero.
	// Partial ranked results may still be returned when a search times out.
	Timeout int

	// When true, only count the documents found, without returning the
	// documents themselves.
	CountDocsOnly bool

	// Do not sort; suitable when sorting can be done outside the engine
	// (for example on the client).
	// Enabling this option can save significant time when many documents
	// are returned.
	Orderless bool
}

SearchReq search request options

type SearchResp ¶

// SearchResp is a search response whose Docs field is untyped.
type SearchResp struct {
	BaseResp
	// Documents found by the search, already sorted
	Docs interface{}
}

SearchResp search response options

type TokenData ¶

// TokenData is one keyword of a document.
type TokenData struct {
	// The keyword string
	Text string

	// Positions in the document at which the first byte of the keyword occurs
	Locations []int
}

TokenData 文档的一个关键词

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL