package inverted

v1.17.1 (not in the latest version of its module)
Published: Jan 17, 2023 License: BSD-3-Clause Imports: 38 Imported by: 0

Documentation


Constants

This section is empty.

Variables

This section is empty.

Functions

func ConfigFromModel added in v1.12.0

func CopyIntoMap added in v1.17.0

func CopyIntoMap(a, b map[string]interface{}) map[string]interface{}

func HasFrequency added in v1.5.0

func HasFrequency(dt schema.DataType) bool

func LexicographicallySortableFloat64

func LexicographicallySortableFloat64(in float64) ([]byte, error)

LexicographicallySortableFloat64 performs a conversion to a lexicographically sortable byte slice. In general, lexicographical sorting requires big-endian notation. In addition, the sign bit needs to be flipped in every case, and if the number is negative, every remaining byte needs to be flipped as well
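A minimal sketch of this transform, assuming nothing beyond the standard library (not the package's exact implementation):

import (
	"encoding/binary"
	"math"
)

func sortableFloat64(in float64) []byte {
	bits := math.Float64bits(in)
	if in >= 0 {
		bits ^= 1 << 63 // non-negative: flip only the sign bit
	} else {
		bits = ^bits // negative: flip the sign bit and all remaining bits
	}
	out := make([]byte, 8)
	binary.BigEndian.PutUint64(out, bits) // big-endian for lexicographical order
	return out
}

With this, bytes.Compare on two encoded values matches the numeric order of the inputs.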

func LexicographicallySortableInt64

func LexicographicallySortableInt64(in int64) ([]byte, error)

LexicographicallySortableInt64 performs a conversion to a lexicographically sortable byte slice. For this, big endian notation is required and the sign must be flipped
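The signed-integer case reduces to a single XOR; a sketch under the same assumptions (for uint64, the big-endian conversion alone is sufficient):

import "encoding/binary"

func sortableInt64(in int64) []byte {
	out := make([]byte, 8)
	// flipping the sign bit maps int64 order onto unsigned byte order:
	// math.MinInt64 encodes as 0x00... and math.MaxInt64 as 0xff...
	binary.BigEndian.PutUint64(out, uint64(in)^(1<<63))
	return out
}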

func LexicographicallySortableUint64 added in v1.0.0

func LexicographicallySortableUint64(in uint64) ([]byte, error)

LexicographicallySortableUint64 performs a conversion to a lexicographically sortable byte slice. For this, big endian notation is required.

func ParseLexicographicallySortableFloat64 added in v0.22.19

func ParseLexicographicallySortableFloat64(in []byte) (float64, error)

ParseLexicographicallySortableFloat64 reverses the changes in LexicographicallySortableFloat64
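Decoding can branch on the leading bit to tell which transform was applied; a sketch reversing the sortableFloat64 sketch from above:

import (
	"encoding/binary"
	"math"
)

func parseSortableFloat64(in []byte) float64 {
	bits := binary.BigEndian.Uint64(in)
	if bits&(1<<63) != 0 {
		bits ^= 1 << 63 // leading 1: value was non-negative, undo the sign flip
	} else {
		bits = ^bits // leading 0: value was negative, undo the full flip
	}
	return math.Float64frombits(bits)
}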

func ParseLexicographicallySortableInt64 added in v0.22.19

func ParseLexicographicallySortableInt64(in []byte) (int64, error)

ParseLexicographicallySortableInt64 reverses the changes in LexicographicallySortableInt64

func ParseLexicographicallySortableUint64 added in v1.0.0

func ParseLexicographicallySortableUint64(in []byte) (uint64, error)

ParseLexicographicallySortableUint64 reverses the changes in LexicographicallySortableUint64

func ValidateConfig added in v1.12.0

func ValidateConfig(conf *models.InvertedIndexConfig) error

func ValidateUserConfigUpdate added in v1.12.0

func ValidateUserConfigUpdate(initial, updated *models.InvertedIndexConfig) error

Types

type Analyzer

type Analyzer struct {
	// contains filtered or unexported fields
}

func NewAnalyzer

func NewAnalyzer(stopwords stopwords.StopwordDetector) *Analyzer

func (*Analyzer) Bool

func (a *Analyzer) Bool(in bool) ([]Countable, error)

Bool requires no analysis, so it's actually just a simple conversion to a little-endian ordered byte slice

func (*Analyzer) BoolArray added in v1.7.2

func (a *Analyzer) BoolArray(in []bool) ([]Countable, error)

BoolArray requires no analysis, so it's actually just a simple conversion to a little-endian ordered byte slice

func (*Analyzer) Float

func (a *Analyzer) Float(in float64) ([]Countable, error)

Float requires no analysis, so it's actually just a simple conversion to a lexicographically sortable byte slice.

func (*Analyzer) FloatArray added in v1.7.0

func (a *Analyzer) FloatArray(in []float64) ([]Countable, error)

FloatArray requires no analysis, so it's actually just a simple conversion to a lexicographically sortable byte slice.

func (*Analyzer) Int

func (a *Analyzer) Int(in int64) ([]Countable, error)

Int requires no analysis, so it's actually just a simple conversion to a string-formatted byte slice of the int

func (*Analyzer) IntArray added in v1.7.0

func (a *Analyzer) IntArray(in []int64) ([]Countable, error)

IntArray requires no analysis, so it's actually just a simple conversion to a string-formatted byte slice of each int

func (*Analyzer) Object

func (a *Analyzer) Object(input map[string]interface{}, props []*models.Property,
	uuid strfmt.UUID,
) ([]Property, error)

func (*Analyzer) Ref added in v0.22.20

func (a *Analyzer) Ref(in models.MultipleRef) ([]Countable, error)

Ref indexes references as beacon-strings

func (*Analyzer) RefCount

func (a *Analyzer) RefCount(in models.MultipleRef) ([]Countable, error)

RefCount does not index the content of the refs, but only their count, with 0 being an explicitly allowed value as well.

func (*Analyzer) String

func (a *Analyzer) String(tokenization, in string) []Countable

String splits only on spaces and does not lowercase, then aggregates duplicates

func (*Analyzer) StringArray added in v1.12.0

func (a *Analyzer) StringArray(tokenization string, in []string) []Countable

StringArray splits only on spaces and does not lowercase, then aggregates duplicates

func (*Analyzer) Text

func (a *Analyzer) Text(tokenization, in string) []Countable

Text removes non alpha-numeric and splits into lowercased words, then aggregates duplicates
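A rough sketch of this tokenization in contrast to String above (a hypothetical helper, ASCII-only for brevity, not the package's implementation):

import "strings"

// textTerms keeps only alphanumeric runs, lowercases them, and
// aggregates duplicates into term frequencies. The String path would
// instead split only on spaces and keep the original casing.
func textTerms(in string) map[string]float32 {
	words := strings.FieldsFunc(strings.ToLower(in), func(r rune) bool {
		return (r < 'a' || r > 'z') && (r < '0' || r > '9')
	})
	freq := map[string]float32{}
	for _, w := range words {
		freq[w]++ // aggregate duplicates
	}
	return freq
}

Each resulting entry would map onto one Countable, with the term as Data and the count as TermFrequency.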

func (*Analyzer) TextArray added in v1.12.0

func (a *Analyzer) TextArray(tokenization string, in []string) []Countable

TextArray removes non alpha-numeric and splits into lowercased words, then aggregates duplicates

type BM25Searcher added in v1.11.0

type BM25Searcher struct {
	// contains filtered or unexported fields
}

func NewBM25Searcher added in v1.11.0

func NewBM25Searcher(config schema.BM25Config, store *lsmkv.Store, schema schema.Schema,
	rowCache cacher, propIndices propertyspecific.Indices,
	classSearcher ClassSearcher, deletedDocIDs DeletedDocIDChecker,
	propLengths propLengthRetriever, logger logrus.FieldLogger,
	shardVersion uint16,
) *BM25Searcher

func (*BM25Searcher) BM25F added in v1.17.0

func (b *BM25Searcher) BM25F(ctx context.Context, className schema.ClassName, limit int,
	keywordRanking *searchparams.KeywordRanking,
	filter *filters.LocalFilter, sort []filters.Sort, additional additional.Properties,
	objectByIndexID func(index uint64) *storobj.Object,
) ([]*storobj.Object, []float32, error)

func (*BM25Searcher) Objects added in v1.17.1

func (b *BM25Searcher) Objects(ctx context.Context, limit int,
	keywordRanking *searchparams.KeywordRanking,
	filter *filters.LocalFilter, sort []filters.Sort, additional additional.Properties,
	className schema.ClassName,
) ([]*storobj.Object, []float32, error)

Objects returns a list of full objects

type CacheEntry added in v1.8.0

type CacheEntry struct {
	Type      CacheEntryType
	Hash      []byte
	Partial   *docPointers
	AllowList helpers.AllowList
}

func (*CacheEntry) Size added in v1.8.0

func (ce *CacheEntry) Size() uint64

Size cannot be determined accurately, since a golang map does not have a fixed size per element. However, through experimentation we have found that a map[uint64]struct{} rarely exceeds 25 bytes per entry, so we use this as an estimate. In addition, we know that the partial content uses an array where we can assume full efficiency, i.e. 8 bytes per entry.
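The estimate therefore reduces to simple arithmetic; a sketch with hypothetical length parameters:

// estimateEntrySize mirrors the heuristic described above: ~25 bytes
// per allow-list map entry, 8 bytes per partial-array entry, plus the
// length of the hash itself.
func estimateEntrySize(allowListLen, partialLen, hashLen int) uint64 {
	return uint64(allowListLen)*25 + uint64(partialLen)*8 + uint64(hashLen)
}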

type CacheEntryType added in v1.8.0

type CacheEntryType uint8
const (
	CacheTypePartial CacheEntryType = iota
	CacheTypeAllowList
)

func (CacheEntryType) String added in v1.8.0

func (t CacheEntryType) String() string

type ClassSearcher added in v0.22.20

type ClassSearcher interface {
	ClassSearch(ctx context.Context,
		params traverser.GetParams) ([]search.Result, error)
	GetQueryMaximumResults() int
}

ClassSearcher is anything that allows a root-level ClassSearch

type Cleaner added in v1.0.0

type Cleaner struct {
	// contains filtered or unexported fields
}

func NewCleaner added in v1.0.0

func NewCleaner(db *bolt.DB, class *models.Class, deletedDocIDs []uint64, deleteFn deleteFn) *Cleaner

func (*Cleaner) Cleanup added in v1.0.0

func (c *Cleaner) Cleanup() ([]uint64, error)

Cleanup cleans up properties for the given documents

type Countable

type Countable struct {
	Data          []byte
	TermFrequency float32
}

type DeletedDocIDChecker added in v0.22.20

type DeletedDocIDChecker interface {
	Contains(id uint64) bool
}

type DeltaMergeResult added in v1.1.0

type DeltaMergeResult struct {
	Additions []MergeProperty
	Deletions []MergeProperty
}

type DeltaMerger added in v1.1.0

type DeltaMerger struct {
	// contains filtered or unexported fields
}

DeltaMerger can be used to condense many single writes into one big one. Additionally, it removes overlaps between additions and deletions. It is meant for batch situations, where 5 ref objects in a row might each increase the doc count by one: instead of writing 5 additions and 4 deletions, this can be condensed into a single addition (see the usage sketch after Merge below)

func NewDeltaMerger added in v1.1.0

func NewDeltaMerger() *DeltaMerger

func (*DeltaMerger) AddAdditions added in v1.1.0

func (dm *DeltaMerger) AddAdditions(props []Property, docID uint64)

func (*DeltaMerger) AddDeletions added in v1.1.0

func (dm *DeltaMerger) AddDeletions(props []Property, docID uint64)

func (*DeltaMerger) Merge added in v1.1.0

func (dm *DeltaMerger) Merge() DeltaMergeResult
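A usage sketch of the flow described on the type above, assuming this package is imported as "inverted" (the property contents are illustrative only):

func condense(previous, updated []inverted.Property, docID uint64) inverted.DeltaMergeResult {
	dm := inverted.NewDeltaMerger()
	dm.AddDeletions(previous, docID) // retract the previous state
	dm.AddAdditions(updated, docID)  // write the updated state
	// overlaps between the additions and deletions are removed here,
	// leaving one condensed write instead of many small ones
	return dm.Merge()
}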

type DeltaResults

type DeltaResults struct {
	ToDelete []Property
	ToAdd    []Property
}

func Delta

func Delta(previous, next []Property) DeltaResults

type MergeDocIDWithFrequency added in v1.1.0

type MergeDocIDWithFrequency struct {
	DocID     uint64
	Frequency float32
}

type MergeItem added in v1.1.0

type MergeItem struct {
	Data   []byte
	DocIDs []MergeDocIDWithFrequency
}

func (MergeItem) Countable added in v1.1.0

func (mi MergeItem) Countable() Countable

Countable converts the merge item to a regular (non-merge) Countable. Note that this loses the IDs and Frequency information, so IDs have to be passed separately using .IDs()

func (MergeItem) IDs added in v1.1.0

func (mi MergeItem) IDs() []uint64

IDs is meant for cases such as deletion, where the frequency is irrelevant, but the expected format is a []docID

type MergeProperty added in v1.1.0

type MergeProperty struct {
	Name         string
	HasFrequency bool
	MergeItems   []MergeItem
}

type Property

type Property struct {
	Name         string
	Items        []Countable
	HasFrequency bool
	Length       int
}

type PropertyLengthTracker added in v1.11.0

type PropertyLengthTracker struct {
	sync.Mutex
	// contains filtered or unexported fields
}

Page Design

| Bytes     | Description                                      |
| --------- | ------------------------------------------------ |
| start     | page is now 0                                    |
| 0-1       | uint16 pointer to last index byte                |
| 2-3       | uint16 pointer for property name length          |
| 4-n       | property name                                    |
| ...       | repeat length+pointer pattern                    |
| 3584-3840 | second property buckets (64 buckets of float32)  |
| 3840-4096 | first property buckets                           |
| repeat    | page is now 1, repeat all of above               |

Fixed Assumptions:

  • The first two bytes are always used to indicate the end of the index; the minimal value is 02, as the first possible position with index length=0 is right after the two pointer bytes themselves.
  • 64 buckets of float32 per property (=256B per prop), excluding the index
  • One index row is always 4+len(propName) bytes, consisting of a uint16 prop name length, the name itself, and a uint16 offset pointer pointing to the start (first byte) of the buckets
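A sketch of walking one page's index under these assumptions (the little-endian encoding of the pointers is an assumption of this sketch, not stated above):

import "encoding/binary"

// walkPageIndex returns each property name on a page together with the
// offset pointer to the first byte of its buckets.
func walkPageIndex(page []byte) map[string]uint16 {
	props := map[string]uint16{}
	end := binary.LittleEndian.Uint16(page[0:2]) // end of index, minimal value 02
	pos := uint16(2)
	for pos < end {
		nameLen := binary.LittleEndian.Uint16(page[pos : pos+2])
		name := string(page[pos+2 : pos+2+nameLen])
		offset := binary.LittleEndian.Uint16(page[pos+2+nameLen : pos+4+nameLen])
		props[name] = offset
		pos += 4 + nameLen // one index row is always 4+len(propName)
	}
	return props
}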

func NewPropertyLengthTracker added in v1.11.0

func NewPropertyLengthTracker(path string) (*PropertyLengthTracker, error)

func (*PropertyLengthTracker) Close added in v1.11.0

func (t *PropertyLengthTracker) Close() error

func (*PropertyLengthTracker) Drop added in v1.11.0

func (t *PropertyLengthTracker) Drop() error

func (*PropertyLengthTracker) FileName added in v1.15.0

func (t *PropertyLengthTracker) FileName() string

func (*PropertyLengthTracker) Flush added in v1.11.0

func (t *PropertyLengthTracker) Flush() error

func (*PropertyLengthTracker) PropertyMean added in v1.11.0

func (t *PropertyLengthTracker) PropertyMean(propName string) (float32, error)

func (*PropertyLengthTracker) TrackProperty added in v1.11.0

func (t *PropertyLengthTracker) TrackProperty(propName string,
	value float32,
)

type ReadFn added in v0.22.19

type ReadFn func(k []byte, values [][]byte) (bool, error)

ReadFn will be called 1..n times per match. It will also be called on a non-match; in that case values == nil. It is up to the caller to decide whether that is an error case or not.

Note that because what we are parsing is an inverted index row, it can sometimes be confusing what the key and the value actually represent. The variables k and values hold the literal row key and value. This means that the data value (as in "less than 17", where the 17 would intuitively be the "value") is stored in the key variable k. The value contains the docCount, the hash, and the list of pointers (with optional frequencies) to the docIDs.

The boolean return argument is a way to stop iteration (e.g. when a limit is reached) without producing an error. In normal operation, always return true; once false is returned, the loop is broken.
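For example, a caller might cap the number of results without treating the cap as an error; a sketch with a hypothetical limit:

// limitedReadFn builds a ReadFn that stops iterating once it has seen
// the given number of values, returning no error.
func limitedReadFn(limit int) ReadFn {
	seen := 0
	return func(k []byte, values [][]byte) (bool, error) {
		if values == nil {
			// non-match: this caller chooses not to treat it as an error
			return true, nil
		}
		seen += len(values)
		// returning false stops the iteration without producing an error
		return seen < limit, nil
	}
}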

type ReadFnFrequency added in v1.5.0

type ReadFnFrequency func(k []byte, values []lsmkv.MapPair) (bool, error)

ReadFnFrequency will be called 1..n times per match. It will also be called on a non-match; in that case values == nil. It is up to the caller to decide whether that is an error case or not.

Note that because what we are parsing is an inverted index row, it can sometimes be confusing what the key and the value actually represent. The variables k and values hold the literal row key and value. This means that the data value (as in "less than 17", where the 17 would intuitively be the "value") is stored in the key variable k. The value contains the docCount, the hash, and the list of pointers (with optional frequencies) to the docIDs.

The boolean return argument is a way to stop iteration (e.g. when a limit is reached) without producing an error. In normal operation, always return true; once false is returned, the loop is broken.

type RowCacher

type RowCacher struct {
	// contains filtered or unexported fields
}

func NewRowCacher

func NewRowCacher(maxSize uint64) *RowCacher

func (*RowCacher) Load

func (rc *RowCacher) Load(id []byte) (*CacheEntry, bool)

func (*RowCacher) Size added in v1.14.0

func (rc *RowCacher) Size() uint64

func (*RowCacher) Store

func (rc *RowCacher) Store(id []byte, row *CacheEntry)

type RowReader added in v0.22.19

type RowReader struct {
	// contains filtered or unexported fields
}

RowReader reads one or many row(s) depending on the specified operator

func NewRowReader added in v0.22.19

func NewRowReader(bucket *lsmkv.Bucket, value []byte,
	operator filters.Operator, keyOnly bool,
) *RowReader

If keyOnly is set, the RowReader will request key-only cursors wherever cursors are used; the value arguments passed to the ReadFn will always be nil

func (*RowReader) Read added in v0.22.19

func (rr *RowReader) Read(ctx context.Context, readFn ReadFn) error

Read reads a row using the specified ReadFn. If the RowReader was created with keyOnly==true, the values argument in the readFn will always be nil on all requests involving cursors

type RowReaderFrequency added in v1.5.0

type RowReaderFrequency struct {
	// contains filtered or unexported fields
}

RowReaderFrequency reads one or many row(s) depending on the specified operator

func NewRowReaderFrequency added in v1.5.0

func NewRowReaderFrequency(bucket *lsmkv.Bucket, value []byte,
	operator filters.Operator, keyOnly bool, shardVersion uint16,
) *RowReaderFrequency

func (*RowReaderFrequency) Read added in v1.5.0

func (rr *RowReaderFrequency) Read(ctx context.Context, readFn ReadFnFrequency) error

type Searcher

type Searcher struct {
	// contains filtered or unexported fields
}

func NewSearcher

func NewSearcher(store *lsmkv.Store, schema schema.Schema,
	rowCache cacher, propIndices propertyspecific.Indices,
	classSearcher ClassSearcher, deletedDocIDs DeletedDocIDChecker,
	stopwords stopwords.StopwordDetector, shardVersion uint16,
) *Searcher

func (*Searcher) DocIDs

func (s *Searcher) DocIDs(ctx context.Context, filter *filters.LocalFilter,
	additional additional.Properties, className schema.ClassName,
) (helpers.AllowList, error)

DocIDs is similar to Objects, but does not actually resolve the docIDs to full objects. Instead it returns the pure object id pointers. They can then be used in a secondary index (e.g. vector index)

DocID queries do not accept a limit by design, as we cannot know whether the limit would remove the very item that is most important for the follow-up query. Imagine the user sets the limit to 1 and the follow-up is a vector search: if we had already limited the allowList to 1, the vector search would be pointless, as only the first element would be allowed, regardless of which had the shortest distance

func (*Searcher) DocIDsPreventCaching added in v1.14.0

func (s *Searcher) DocIDsPreventCaching(ctx context.Context, filter *filters.LocalFilter,
	additional additional.Properties, className schema.ClassName,
) (helpers.AllowList, error)

DocIDsPreventCaching is the same as DocIDs, but makes sure that no filter cache entries are written. This can be used when we can guarantee that the filter is part of an operation that will lead to a state change, such as batch delete. The state change would make the cached filter unusable anyway, so we don't need to unnecessarily populate the cache with an entry.

func (*Searcher) Objects added in v1.17.1

func (s *Searcher) Objects(ctx context.Context, limit int,
	filter *filters.LocalFilter, sort []filters.Sort, additional additional.Properties,
	className schema.ClassName,
) ([]*storobj.Object, error)

Objects returns a list of full objects
