index

package
v0.0.0-...-6f23c6b Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jul 25, 2022 License: Apache-2.0 Imports: 17 Imported by: 0

Documentation

Index

Constants

View Source
// VirtualQueryDocId is a fixed negative int32 document ID (-10000).
// NOTE(review): presumably reserved for a synthetic "query" document so it
// cannot collide with real document IDs — confirm against the callers.
const VirtualQueryDocId int32 = -10000

Variables

View Source
// DefaultConfig is a package-level btree.Config literal.
// NOTE(review): presumably the configuration used when NewBTreeIndex creates
// its tree — confirm. The meaning and units of every field are defined by the
// btree package, not by this file.
var DefaultConfig = btree.Config{
	IndexConfig: btree.IndexConfig{
		Sectorsize: 512,
		Flistsize:  1000 * btree.OFFSET_SIZE, // scaled by btree.OFFSET_SIZE; presumably room for 1000 offsets — confirm
		Blocksize:  512,
	},
	Maxlevel:      4,
	RebalanceThrs: 30,
	AppendRatio:   0.7,
	DrainRate:     100,
	MaxLeafCache:  0,
	Sync:          false,
	Nocache:       false,
}

Functions

func CalDocScore

func CalDocScore(frequency int32, pagerank int) float64

CalDocScore returns a document's static score. TODO: calculate it from PageRank + frequency.

func CalIDF

func CalIDF(docNum int, df int) float64

func Drain

func Drain(idx Index, file string)

Drain writes the data of idx to file, sorted by key.

func IfElseInt

func IfElseInt(condition bool, o1 int, o2 int) int

func Load

func Load(file string) (chan *KVPair, error)

Load reads file and returns a channel of *KVPair.

func LoadDocumentStream

func LoadDocumentStream(path string) (chan *Document, error)

Types

type BTreeIndex

// BTreeIndex holds an exported *btree.BTree handle and the IndexFile string,
// plus unexported fields that are not shown here. The methods declared for it
// on this page include every method of the Index interface (Property, Keys,
// Clear, Add, Get, Retrieval).
type BTreeIndex struct {
	// Background on the skip-list vs. B-tree trade-off:
	// https://stackoverflow.com/questions/256511/skip-list-vs-binary-search-tree/28270537#28270537
	BT        *btree.BTree
	IndexFile string // NOTE(review): presumably the file path given to NewBTreeIndex — confirm
	// contains filtered or unexported fields
}

func NewBTreeIndex

func NewBTreeIndex(file string) *BTreeIndex

func (*BTreeIndex) Add

func (bt *BTreeIndex) Add(docs []Document)

Add 该方法比较低效:批量插入文档会在 posting list 后不断追加新文档,但 posting list 并未预留空间,因此需要移动到新的空间,导致文件数据拷贝。

func (*BTreeIndex) Clear

func (bt *BTreeIndex) Clear()

func (*BTreeIndex) Close

func (bt *BTreeIndex) Close()

func (*BTreeIndex) Get

func (bt *BTreeIndex) Get(term string) []Doc

func (*BTreeIndex) Insert

func (bt *BTreeIndex) Insert(key string, pl PostingList)

func (*BTreeIndex) Keys

func (bt *BTreeIndex) Keys() []string

func (*BTreeIndex) Load

func (bt *BTreeIndex) Load()

func (*BTreeIndex) Lookup

func (bt *BTreeIndex) Lookup(token string, dirty bool) PostingList

func (*BTreeIndex) Property

func (bt *BTreeIndex) Property() *Property

func (*BTreeIndex) Retrieval

func (bt *BTreeIndex) Retrieval(must []string, should []string, not []string, k int, r int, m SearchModel) []Doc

func (*BTreeIndex) Save

func (bt *BTreeIndex) Save()

func (*BTreeIndex) SetProperty

func (bt *BTreeIndex) SetProperty(p Property)

type DataRange

// DataRange is a pair of ints that is read and written through
// Property.DataRange and Property.SetDataRange.
// NOTE(review): what the range indexes, and whether End is inclusive, is not
// visible here — confirm.
type DataRange struct {
	Start int
	End   int
}

type Doc

// Doc is one element of a PostingList (PostingList is declared as []Doc).
type Doc struct {
	ID     int32 // doc id
	DocLen int32 // doc length

	TF           int32   // term frequency; e.g. in the posting list term->[doc1,doc2,doc3] it is only the frequency of term within docX
	QualityScore float64 // static score / quality score

	Score float64 // bm25/Cosine score used by sort
}

func CalBM25

func CalBM25(hits []Doc, tfidf *TFIDF, docLen int, docNum int) []Doc

CalBM25 计算 BM25 得分并排序。docLen 为索引文档总长度(词的数量),docNum 为索引文档总数。

func CalCosine

func CalCosine(hits []Doc, tfidf *TFIDF) []Doc

CalCosine 余弦距离相似度 https://blog.csdn.net/weixin_42398658/article/details/85063004

func DoRetrieval

func DoRetrieval(idx Index, must []string, should []string, not []string, k int, r int, model SearchModel) []Doc

DoRetrieval returns the top k docs sorted by the boolean model. TODO: compress the posting list and optimize intersection/union rt; see https://blog.csdn.net/weixin_39890629/article/details/111268898

func (Doc) Bytes

func (doc Doc) Bytes() []byte

func (*Doc) FromBytes

func (doc *Doc) FromBytes(b []byte)

type Document

// Document is one record of the dump. Title, URL and Text carry explicit xml
// tags; Timestamp and ID carry none.
type Document struct {
	Title     string `xml:"title"`    // tagged with the XML name "title"
	URL       string `xml:"url"`      // tagged with the XML name "url"
	Text      string `xml:"abstract"` // tagged with the XML name "abstract", i.e. the abstract text
	Timestamp int    // no xml tag; NOTE(review): unit and origin are not visible here — confirm
	ID        int    // no xml tag; NOTE(review): presumably assigned by the loader rather than read from the dump — confirm
}

Document represents a Wikipedia abstract dump document.

func LoadDocuments

func LoadDocuments(path string) ([]Document, error)

LoadDocuments loads a Wikipedia abstract dump and returns a slice of documents. Dump example from https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-abstract1.xml.gz

type HashMapIndex

type HashMapIndex struct {
	// contains filtered or unexported fields
}

HashMapIndex is an inverted index. It maps tokens to document IDs.

func NewHashMapIndex

func NewHashMapIndex() *HashMapIndex

func (*HashMapIndex) Add

func (idx *HashMapIndex) Add(docs []Document)

Add adds documents to the index. todo: Support indexing multiple document fields.

func (*HashMapIndex) Clear

func (idx *HashMapIndex) Clear()

Clear is an unsafe function.

func (*HashMapIndex) Get

func (idx *HashMapIndex) Get(term string) []Doc

func (*HashMapIndex) Keys

func (idx *HashMapIndex) Keys() []string

func (*HashMapIndex) Map

func (idx *HashMapIndex) Map() map[string]PostingList

func (*HashMapIndex) Property

func (idx *HashMapIndex) Property() *Property

func (*HashMapIndex) Retrieval

func (idx *HashMapIndex) Retrieval(must []string, should []string, not []string, k int, r int, m SearchModel) []Doc

type Index

// Index is the method set shared by the index types on this page: both
// BTreeIndex and HashMapIndex declare every method listed here.
type Index interface {
	// Property returns the index's *Property, which exposes DocNum,
	// TokenCount and DataRange accessors.
	Property() *Property
	// Keys returns a slice of strings.
	// NOTE(review): presumably every indexed term — confirm.
	Keys() []string
	// Clear takes no arguments and returns nothing; HashMapIndex documents its
	// implementation as an "unsafe function".
	Clear()

	// Add takes a batch of Documents; HashMapIndex documents its
	// implementation as adding documents to the index.
	Add(docs []Document)
	// Get returns a []Doc for term.
	// NOTE(review): presumably the posting list entries of term — confirm.
	Get(term string) []Doc

	// Retrieval takes must/should/not term lists, two ints k and r, and a
	// SearchModel, and returns a []Doc.
	// NOTE(review): k is presumably the top-k cutoff described on DoRetrieval;
	// the meaning of r is not visible here — confirm.
	Retrieval(must []string, should []string, not []string, k int, r int, m SearchModel) []Doc
}

type KVPair

// KVPair couples a string key with a PostingList value. Load returns a
// channel of *KVPair.
type KVPair struct {
	Key   string      // NOTE(review): presumably the indexed term — confirm
	Value PostingList // posting list stored under Key
}

type PostingList

type PostingList []Doc

func (*PostingList) Append

func (pl *PostingList) Append(docs ...Doc)

func (PostingList) Bytes

func (pl PostingList) Bytes() []byte

func (*PostingList) Filter

func (pl *PostingList) Filter(docs []Doc)

func (PostingList) Find

func (pl PostingList) Find(id int) *Doc

func (*PostingList) FromBytes

func (pl *PostingList) FromBytes(buf []byte)

func (PostingList) IDs

func (pl PostingList) IDs() []int

func (*PostingList) Inter

func (pl *PostingList) Inter(docs []Doc)

func (PostingList) Len

func (pl PostingList) Len() int

func (PostingList) Less

func (pl PostingList) Less(i, j int) bool

func (PostingList) Swap

func (pl PostingList) Swap(i, j int)

func (*PostingList) Union

func (pl *PostingList) Union(docs []Doc)

type Property

type Property struct {
	// contains filtered or unexported fields
}

func (*Property) DataRange

func (idx *Property) DataRange() DataRange

func (*Property) DocNum

func (idx *Property) DocNum() int

func (*Property) SetDataRange

func (idx *Property) SetDataRange(d DataRange)

func (*Property) SetDocNum

func (idx *Property) SetDocNum(num int)

func (*Property) SetTokenCount

func (idx *Property) SetTokenCount(cnt int)

func (*Property) TokenCount

func (idx *Property) TokenCount() int

type SearchModel

// SearchModel is the int-backed selector passed as the last argument of
// Retrieval and DoRetrieval.
type SearchModel int
// The iota block assigns Boolean = 0, VectorSpace = 1 and BM25 = 2.
const (
	Boolean SearchModel = iota
	VectorSpace // NOTE(review): presumably scored via CalCosine — confirm
	BM25        // NOTE(review): presumably scored via CalBM25 — confirm
)

type TF

type TF map[string]int32

type TFIDF

// TFIDF bundles two maps: IDF, keyed by string, and DOC2TF, keyed by int32
// with TF (map[string]int32) values.
// NOTE(review): presumably IDF maps term -> inverse document frequency and
// DOC2TF maps doc ID -> per-term frequencies — confirm.
type TFIDF struct {
	IDF    map[string]float64
	DOC2TF map[int32]TF
}

func NewTFIDF

func NewTFIDF() *TFIDF

type Term

// Term describes one index key: its string form, a numeric id and its
// document frequency.
type Term struct {
	K  string // key
	Id int32  // key id
	DF int32  // document frequency
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL