Documentation
¶
Index ¶
- Constants
- func CloseAndRemove(file *os.File)
- func CompareTokens(a, b Token) (cmp int)
- type Document
- type DocumentId
- type DocumentLengthEntry
- type Field
- type FieldDefinition
- type FieldHeader
- type Header
- type Merger
- type PostingList
- type PostingListHeader
- type RawValue
- type Storage
- func (s *Storage) BuildFrom(docs ...*Document)
- func (s *Storage) Close() (err error)
- func (s *Storage) ColdInitialize()
- func (s *Storage) Load(name string) (err error)
- func (s *Storage) Reset() (err error)
- func (s *Storage) SaveTo(name string) (err error)
- func (s *Storage) SortAndBuildFrom(docs ...*Document)
- type Token
- type TokenDefinition
- type TokenFrequencyEntry
- type Tokens
Constants ¶
View Source
// DocumentIdSize is the size in bytes of a DocumentId value, as computed by unsafe.Sizeof.
const DocumentIdSize = unsafe.Sizeof(DocumentId{})
View Source
// DocumentLengthEntrySize is the size in bytes of a DocumentLengthEntry value, as computed by unsafe.Sizeof.
const DocumentLengthEntrySize = unsafe.Sizeof(DocumentLengthEntry{})
View Source
// FieldHeaderSize is the size in bytes of a FieldHeader value, as computed by unsafe.Sizeof.
const FieldHeaderSize = unsafe.Sizeof(FieldHeader{})
View Source
// HeaderSize is the size in bytes of a Header value, as computed by unsafe.Sizeof.
const HeaderSize = unsafe.Sizeof(Header{})
View Source
// MagicNumber is a fixed 64-bit constant.
// NOTE(review): presumably stored in the file Header to identify the format — confirm against Load/SaveTo.
const MagicNumber uint64 = 0x7E7127E9
View Source
// MaxRawValueSize is the capacity in bytes of the RawValue.Data array.
const MaxRawValueSize = 128
View Source
// PostingListHeaderSize is the size in bytes of a PostingListHeader value, as computed by unsafe.Sizeof.
const PostingListHeaderSize = unsafe.Sizeof(PostingListHeader{})
View Source
// TokenFrequencyEntrySize is the size in bytes of a TokenFrequencyEntry value, as computed by unsafe.Sizeof.
const TokenFrequencyEntrySize = unsafe.Sizeof(TokenFrequencyEntry{})
View Source
// TokenSize is the size in bytes of a Token value, as computed by unsafe.Sizeof.
const TokenSize = unsafe.Sizeof(Token{})
View Source
// Version identifiers. VersionV1 is the first entry in the group, so iota
// gives it the value 0.
const (
VersionV1 uint16 = iota
)
Variables ¶
This section is empty.
Functions ¶
func CloseAndRemove ¶
func CompareTokens ¶
Types ¶
type Document ¶
// Document is one document handed to the index builder (see Storage.BuildFrom
// and Storage.SortAndBuildFrom): an external ID plus the fields present in it.
type Document struct {
// External document identifier, e.g. "CO1.PCCNTR.123456".
// Must be unique across the index.
// Will be inserted, in sorted order, into DocumentsIds.
Id DocumentId
// Fields present in this document.
// Fields absent from this slice are treated as empty for this document.
Fields []*FieldDefinition
}
type DocumentId ¶
// DocumentId wraps the RawValue that holds a document's external identifier.
type DocumentId struct {
// Raw bytes of the identifier, stored in a fixed-capacity RawValue.
Value RawValue
}
type DocumentLengthEntry ¶
// DocumentLengthEntry pairs a document index with that document's length
// in tokens.
type DocumentLengthEntry struct {
// Index of the document this entry refers to.
Index uint64
// Actual length of the document, counted in tokens.
Length uint64
}
This is per field, meaning the length only reflects what the field actually stores for that particular document. The writer must ensure that entries are sorted by index.
type Field ¶
// Field is the in-memory view of one indexed field: its average document
// length, its tokens, and its per-document length entries.
type Field struct {
// Average document length for this field; used for the BM25 calculation.
AvgDocumentLength float64
// Tokens present in the file.
// This field is stored in memory, but most of what it references
// are mmap-backed, zero-copy arrays.
Tokens Tokens
// Document length entries.
// Each entry is keyed by the index of the document it refers to.
DocumentLengths []DocumentLengthEntry
}
type FieldDefinition ¶
// FieldDefinition describes the contents of one field within a single
// Document (see Document.Fields).
type FieldDefinition struct {
// xxh3 hash of the field name.
Hash uint64
// Total number of tokens in this field for this document.
// Used to update avgdl and stored as a DocumentLengthEntry.
Length uint64
// Tokens found in this field for this document.
// The caller must deduplicate: one entry per unique token,
// with Frequency carrying the number of occurrences.
Tokens []*TokenDefinition
}
type FieldHeader ¶
// FieldHeader carries per-field metadata: the field-name hash, the
// precomputed average document length, and the token and document-length
// counts. FieldHeaderSize is derived from this struct with unsafe.Sizeof,
// so changing its fields changes that constant.
type FieldHeader struct {
// xxh3 hash of the field name string.
Hash uint64
// Average document length (avgdl) used in the BM25 formula.
// Precomputed so the reader can go directly to queries.
AvgDocumentLength float64
// Total number of tokens the field has.
TokenCount uint64
// Number of document length entries included.
DocumentLengthCount uint64
}
type PostingList ¶
// PostingList holds the raw bytes of one posting list.
// NOTE(review): presumably the serialized form that the Bitmap method decodes
// into a roaring64.Bitmap — confirm against the method body.
type PostingList struct {
// Raw posting list bytes.
Data []byte
}
func (*PostingList) Bitmap ¶
func (l *PostingList) Bitmap(dst *roaring64.Bitmap)
type PostingListHeader ¶
// PostingListHeader is the header record for a posting list.
// PostingListHeaderSize is derived from this struct with unsafe.Sizeof.
type PostingListHeader struct {
// NOTE(review): presumably the byte length of the posting list data — TODO confirm.
Size uint64
}
type RawValue ¶
// RawValue is a fixed-capacity byte buffer paired with a Size field.
type RawValue struct {
// NOTE(review): presumably the number of meaningful bytes in Data — confirm against RawValueFrom.
Size uint64
// Backing array; its capacity is MaxRawValueSize bytes.
Data [MaxRawValueSize]byte
}
func RawValueFrom ¶
func (*RawValue) UnsafeString ¶
type Storage ¶
// Storage is the handle for an index file: it keeps the file reference, the
// file's internal buffer, and the structures mapped from that buffer.
type Storage struct {
// Intended to be read-only for callers.
Version uint16
// Intended to be read-only for callers.
Size uint64
// Reference to the internal buffer of the file,
// exposed only in case the caller needs to work around the API.
Buffer []byte
// File reference.
File *os.File
// Fast reference to mapped fields for O(1) lookups.
// NOTE(review): keys are presumably the xxh3 hash of the field name (cf. FieldHeader.Hash) — confirm.
Fields map[uint64]*Field
// Documents mapped only once to the sub-slices of Buffer, for quick conversion between
// index form and human-readable form.
DocumentsIds []DocumentId
// Posting lists, used once the caller knows which field/token pairs to query.
PostingLists []PostingList
// Token frequencies.
TokenFrequencies []TokenFrequencyEntry
// Used to determine whether the storage was already initialized.
Initialized bool
}
func (*Storage) ColdInitialize ¶
func (s *Storage) ColdInitialize()
func (*Storage) SortAndBuildFrom ¶
This function allocates a new batch and sorts the documents in it by their ID. If the batch is already guaranteed to be in order, call BuildFrom directly.
type TokenDefinition ¶
type TokenFrequencyEntry ¶
Click to show internal directories.
Click to hide internal directories.