stats

package
v0.0.0-...-b7c488f Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Dec 2, 2021 License: MIT Imports: 35 Imported by: 3

Documentation

Overview

Package stats provides implementations of statistic sources.

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func ElasticsearchAnalysedField

func ElasticsearchAnalysedField(field string) func(*ElasticsearchStatisticsSource)

ElasticsearchAnalysedField sets the field to analyse for the statistic source.

func ElasticsearchAnalyser

func ElasticsearchAnalyser(analyser string) func(*ElasticsearchStatisticsSource)

ElasticsearchAnalyser sets the analyser for the statistic source.

func ElasticsearchDocumentType

func ElasticsearchDocumentType(documentType string) func(*ElasticsearchStatisticsSource)

ElasticsearchDocumentType sets the document type for the Elasticsearch client.

func ElasticsearchHosts

func ElasticsearchHosts(hosts ...string) func(*ElasticsearchStatisticsSource)

ElasticsearchHosts sets the hosts for the Elasticsearch client.

func ElasticsearchIndex

func ElasticsearchIndex(index string) func(*ElasticsearchStatisticsSource)

ElasticsearchIndex sets the index for the Elasticsearch client.

func ElasticsearchParameters

func ElasticsearchParameters(params map[string]float64) func(*ElasticsearchStatisticsSource)

ElasticsearchParameters sets the parameters for the statistic source.

func ElasticsearchScroll

func ElasticsearchScroll(scroll bool) func(*ElasticsearchStatisticsSource)

ElasticsearchScroll sets the scroll for the statistic source.

func ElasticsearchSearchOptions

func ElasticsearchSearchOptions(options SearchOptions) func(*ElasticsearchStatisticsSource)

ElasticsearchSearchOptions sets the execute options for the statistic source.

func EntrezAPIKey

func EntrezAPIKey(key string) func(source *EntrezStatisticsSource)

EntrezAPIKey sets the API key for entrez.

func EntrezDb

func EntrezDb(db string) func(source *EntrezStatisticsSource)

EntrezDb sets the database to search.

func EntrezEmail

func EntrezEmail(email string) func(source *EntrezStatisticsSource)

EntrezEmail sets the email for entrez.

func EntrezLimiter

func EntrezLimiter(limit time.Duration) func(source *EntrezStatisticsSource)

EntrezLimiter sets the request rate limit (given as a time.Duration) for the entrez statistics source.

func EntrezOptions

func EntrezOptions(options SearchOptions) func(source *EntrezStatisticsSource)

EntrezOptions sets any additional options for the entrez statistics source.

func EntrezRank

func EntrezRank(rank bool) func(source *EntrezStatisticsSource)

func EntrezTool

func EntrezTool(tool string) func(source *EntrezStatisticsSource)

EntrezTool sets the tool name for entrez.

func GetDocumentIDs

func GetDocumentIDs(query pipeline.Query, ss StatisticsSource) ([]uint32, error)

GetDocumentIDs retrieves the document IDs for a query as fast as possible. Using Elasticsearch this will create a very fast concurrent scroll service. This method does not guarantee order.

func LanguageModelWeights

func LanguageModelWeights(weights []float64) func(*LanguageModel)

LanguageModelWeights configures a language model to use the specified weights.

Types

type ElasticsearchStatisticsSource

type ElasticsearchStatisticsSource struct {
	Scroll       bool
	Analyser     string
	AnalyseField string
	// contains filtered or unexported fields
}

ElasticsearchStatisticsSource is a way of gathering statistics for a collection using Elasticsearch.

func NewElasticsearchStatisticsSource

func NewElasticsearchStatisticsSource(options ...func(*ElasticsearchStatisticsSource)) (*ElasticsearchStatisticsSource, error)

NewElasticsearchStatisticsSource creates a new ElasticsearchStatisticsSource using functional options.

func (*ElasticsearchStatisticsSource) Analyse

func (es *ElasticsearchStatisticsSource) Analyse(text, analyser string) (tokens []string, err error)

Analyse is a specific Elasticsearch method used in the analyse transformation.

func (*ElasticsearchStatisticsSource) CollectionSize

func (es *ElasticsearchStatisticsSource) CollectionSize() (float64, error)

func (*ElasticsearchStatisticsSource) DocumentFrequency

func (es *ElasticsearchStatisticsSource) DocumentFrequency(term string, field string) (float64, error)

DocumentFrequency is the document frequency (the number of documents containing the current term).

func (*ElasticsearchStatisticsSource) Execute

Execute runs the query on Elasticsearch and returns results in trec format.

func (*ElasticsearchStatisticsSource) ExecuteFast

func (es *ElasticsearchStatisticsSource) ExecuteFast(query gpipeline.Query, options SearchOptions) ([]uint32, error)

ExecuteFast executes an Elasticsearch query and retrieves only the document ids in the fastest possible way. Do not use this for ranked results as the concurrency of this method does not guarantee order.

func (*ElasticsearchStatisticsSource) InverseDocumentFrequency

func (es *ElasticsearchStatisticsSource) InverseDocumentFrequency(term, field string) (float64, error)

InverseDocumentFrequency is the ratio of documents in the collection to the number of documents the term appears in, logarithmically smoothed.

func (*ElasticsearchStatisticsSource) Parameters

func (es *ElasticsearchStatisticsSource) Parameters() map[string]float64

Parameters gets the immutable parameters for the statistics source.

func (*ElasticsearchStatisticsSource) RetrievalSize

RetrievalSize is the minimum number of documents that contain at least one of the query terms.

func (*ElasticsearchStatisticsSource) SearchOptions

func (es *ElasticsearchStatisticsSource) SearchOptions() SearchOptions

SearchOptions gets the immutable execute options for the statistics source.

func (*ElasticsearchStatisticsSource) TermFrequency

func (es *ElasticsearchStatisticsSource) TermFrequency(term, field, document string) (float64, error)

TermFrequency is the term frequency in the field.

func (*ElasticsearchStatisticsSource) TermVector

func (es *ElasticsearchStatisticsSource) TermVector(document string) (TermVector, error)

TermVector retrieves the term vector for a document.

func (*ElasticsearchStatisticsSource) TotalTermFrequency

func (es *ElasticsearchStatisticsSource) TotalTermFrequency(term, field string) (float64, error)

TotalTermFrequency is a sum of total term frequencies (the sum of total term frequencies of each term in this field).

func (*ElasticsearchStatisticsSource) VocabularySize

func (es *ElasticsearchStatisticsSource) VocabularySize(field string) (float64, error)

VocabularySize is the total number of terms in the vocabulary.

type EntrezStatisticsSource

type EntrezStatisticsSource struct {
	Limit int

	// The size of PubMed.
	N float64
	// contains filtered or unexported fields
}

func NewEntrezStatisticsSource

func NewEntrezStatisticsSource(options ...func(source *EntrezStatisticsSource)) (EntrezStatisticsSource, error)

NewEntrezStatisticsSource creates a new entrez statistics source for searching pubmed. When an API key is specified, the entrez request Limit is raised to 10 per second instead of the default 3.

func (EntrezStatisticsSource) CollectionSize

func (e EntrezStatisticsSource) CollectionSize() (float64, error)

func (EntrezStatisticsSource) Count

func (e EntrezStatisticsSource) Count(term, field string) float64

func (EntrezStatisticsSource) DocumentFrequency

func (e EntrezStatisticsSource) DocumentFrequency(term, field string) (float64, error)

func (EntrezStatisticsSource) Execute

func (EntrezStatisticsSource) Fetch

func (e EntrezStatisticsSource) Fetch(pmids []int, options ...func(p *entrez.Parameters)) ([]guru.MedlineDocument, error)

Fetch uses the entrez eutils to fetch the pubmed Article given a set of pubmed identifiers.

func (EntrezStatisticsSource) InverseDocumentFrequency

func (e EntrezStatisticsSource) InverseDocumentFrequency(term, field string) (float64, error)

func (EntrezStatisticsSource) Link

func (e EntrezStatisticsSource) Link(pmids []int, linkname string) ([]int, error)

func (EntrezStatisticsSource) MarshalEasyJSON

func (e EntrezStatisticsSource) MarshalEasyJSON(w *jwriter.Writer)

MarshalEasyJSON supports easyjson.Marshaler interface

func (EntrezStatisticsSource) MarshalJSON

func (e EntrezStatisticsSource) MarshalJSON() ([]byte, error)

MarshalJSON supports json.Marshaler interface

func (EntrezStatisticsSource) Parameters

func (e EntrezStatisticsSource) Parameters() map[string]float64

func (EntrezStatisticsSource) RetrievalSize

func (EntrezStatisticsSource) Search

func (e EntrezStatisticsSource) Search(query string, options ...func(p *entrez.Parameters)) ([]int, error)

Search uses the entrez eutils to get the pmids for a given query.

func (EntrezStatisticsSource) SearchOptions

func (e EntrezStatisticsSource) SearchOptions() SearchOptions

func (EntrezStatisticsSource) SearchSize

func (e EntrezStatisticsSource) SearchSize(n int) func(p *entrez.Parameters)

func (EntrezStatisticsSource) SearchStart

func (e EntrezStatisticsSource) SearchStart(n int) func(p *entrez.Parameters)

func (EntrezStatisticsSource) SetDB

func (EntrezStatisticsSource) Summary

func (e EntrezStatisticsSource) Summary(ids []string, value interface{}, options ...func(p *entrez.Parameters)) error

Summary uses the entrez eutils to obtain summary documents for the ids.

func (EntrezStatisticsSource) TermFrequency

func (e EntrezStatisticsSource) TermFrequency(term, field, document string) (float64, error)

func (EntrezStatisticsSource) TermVector

func (e EntrezStatisticsSource) TermVector(document string) (TermVector, error)

func (EntrezStatisticsSource) TotalTermFrequency

func (e EntrezStatisticsSource) TotalTermFrequency(term, _ string) (float64, error)

func (EntrezStatisticsSource) Translation

func (e EntrezStatisticsSource) Translation(term string) ([]string, error)

func (*EntrezStatisticsSource) UnmarshalEasyJSON

func (e *EntrezStatisticsSource) UnmarshalEasyJSON(l *jlexer.Lexer)

UnmarshalEasyJSON supports easyjson.Unmarshaler interface

func (*EntrezStatisticsSource) UnmarshalJSON

func (e *EntrezStatisticsSource) UnmarshalJSON(data []byte) error

UnmarshalJSON supports json.Unmarshaler interface

func (EntrezStatisticsSource) VocabularySize

func (e EntrezStatisticsSource) VocabularySize(field string) (float64, error)

type LanguageModel

type LanguageModel struct {
	DocIds             []string
	Scores             []float64
	Weights            []float64
	TermCount          map[string]float64
	DocLen             float64
	StatisticsSource   StatisticsSource
	VocabularySize     float64
	TotalTermFrequency map[string]float64
}

LanguageModel is used for query likelihood statistics.

func NewLanguageModel

func NewLanguageModel(source StatisticsSource, docIds []string, scores []float64, field string, options ...func(model *LanguageModel)) (*LanguageModel, error)

NewLanguageModel creates a new language model from a statistics source using the specified documents and scores for those documents. Optionally, the language model can use weights that can be configured through the functional arguments.

func (*LanguageModel) CollectionTermProbability

func (lm *LanguageModel) CollectionTermProbability(term string) float64

CollectionTermProbability is the term probability for the background language model.

func (*LanguageModel) DocumentTermProbability

func (lm *LanguageModel) DocumentTermProbability(term string) float64

DocumentTermProbability is the term probability for the document language model.

func (*LanguageModel) KLDivergence

func (lm *LanguageModel) KLDivergence(lambda float64, probability TermProbability) (float64, error)

KLDivergence computes the KL divergence between the background collection and the document language model.

type Search

type Search struct {
	Count int `xml:"Count"`
}

func (Search) MarshalEasyJSON

func (v Search) MarshalEasyJSON(w *jwriter.Writer)

MarshalEasyJSON supports easyjson.Marshaler interface

func (Search) MarshalJSON

func (v Search) MarshalJSON() ([]byte, error)

MarshalJSON supports json.Marshaler interface

func (*Search) UnmarshalEasyJSON

func (v *Search) UnmarshalEasyJSON(l *jlexer.Lexer)

UnmarshalEasyJSON supports easyjson.Unmarshaler interface

func (*Search) UnmarshalJSON

func (v *Search) UnmarshalJSON(data []byte) error

UnmarshalJSON supports json.Unmarshaler interface

type SearchOptions

type SearchOptions struct {
	Size    int
	RunName string
}

SearchOptions are options that the statistics source will use for retrieval.

type StatisticsSource

type StatisticsSource interface {
	SearchOptions() SearchOptions
	Parameters() map[string]float64

	TermFrequency(term, field, document string) (float64, error)
	TermVector(document string) (TermVector, error)

	DocumentFrequency(term, field string) (float64, error)
	TotalTermFrequency(term, field string) (float64, error)
	InverseDocumentFrequency(term, field string) (float64, error)
	RetrievalSize(query cqr.CommonQueryRepresentation) (float64, error)
	VocabularySize(field string) (float64, error)
	Execute(query pipeline.Query, options SearchOptions) (trecresults.ResultList, error)
	CollectionSize() (float64, error)
}

StatisticsSource represents the way statistics are calculated for a collection.

type TermProbability

type TermProbability func(model LanguageModel, term string) float64

TermProbability returns a term probability for a term in a language model.

func DirichletTermProbability

func DirichletTermProbability(mu float64) TermProbability

DirichletTermProbability computes the Dirichlet distribution for a term in a language model.

func JelinekMercerTermProbability

func JelinekMercerTermProbability(lambda float64) TermProbability

JelinekMercerTermProbability computes the Jelinek-Mercer probability for a term in a language model.

type TermVector

type TermVector []TermVectorTerm

TermVector is a standard format for returning term vectors from statistic sources.

func (TermVector) ToPipelineQuery

func (tv TermVector) ToPipelineQuery(topic, name string) pipeline.Query

ToPipelineQuery creates a pipeline query from a term vector. This can be used to perform analysis on documents (since the term vector is a representation of a document).

type TermVectorTerm

type TermVectorTerm struct {
	DocumentFrequency  float64
	TotalTermFrequency float64
	TermFrequency      float64
	Field              string
	Term               string
}

TermVectorTerm is a term inside a term vector.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL