Documentation
¶
Overview ¶
Package vectorizer provides text vectorization utilities whose behavior matches that of scikit-learn (sklearn).
Index ¶
- func EnglishStopWords() map[string]bool
- type CountVectorizer
- func (cv *CountVectorizer) Fit(corpus []string)
- func (cv *CountVectorizer) FitTransform(corpus []string) []SparseVector
- func (cv *CountVectorizer) MarshalJSON() ([]byte, error)
- func (cv *CountVectorizer) Transform(text string) SparseVector
- func (cv *CountVectorizer) UnmarshalJSON(data []byte) error
- func (cv *CountVectorizer) VocabSize() int
- type DictVectorizer
- type SparseVector
- type TfidfVectorizer
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func EnglishStopWords ¶
func EnglishStopWords() map[string]bool
EnglishStopWords returns sklearn's default English stop-word set.
Types ¶
type CountVectorizer ¶
type CountVectorizer struct {
Vocabulary map[string]int `json:"vocabulary"`
NgramRange [2]int `json:"ngram_range"`
Binary bool `json:"binary"`
Analyzer string `json:"analyzer"` // "word" or "char_wb"
MinDF int `json:"min_df"`
}
CountVectorizer converts text to token count vectors.
func NewCountVectorizer ¶
func NewCountVectorizer(ngramRange [2]int, binary bool, analyzer string, minDF int) *CountVectorizer
NewCountVectorizer creates a CountVectorizer configured with the given n-gram range, binary flag, analyzer, and minimum document frequency.
func (*CountVectorizer) Fit ¶
func (cv *CountVectorizer) Fit(corpus []string)
Fit builds the vocabulary from a corpus.
func (*CountVectorizer) FitTransform ¶
func (cv *CountVectorizer) FitTransform(corpus []string) []SparseVector
FitTransform fits the vocabulary and transforms the corpus.
func (*CountVectorizer) MarshalJSON ¶
func (cv *CountVectorizer) MarshalJSON() ([]byte, error)
MarshalJSON implements json.Marshaler.
func (*CountVectorizer) Transform ¶
func (cv *CountVectorizer) Transform(text string) SparseVector
Transform converts a single document to a sparse vector.
func (*CountVectorizer) UnmarshalJSON ¶
func (cv *CountVectorizer) UnmarshalJSON(data []byte) error
UnmarshalJSON implements json.Unmarshaler.
func (*CountVectorizer) VocabSize ¶
func (cv *CountVectorizer) VocabSize() int
VocabSize returns the vocabulary size.
type DictVectorizer ¶
type DictVectorizer struct {
FeatureNames []string `json:"feature_names"`
FeatureIndex map[string]int `json:"feature_index"`
}
DictVectorizer converts feature dicts to sparse vectors.
func NewDictVectorizer ¶
func NewDictVectorizer() *DictVectorizer
NewDictVectorizer creates an empty DictVectorizer.
func (*DictVectorizer) Fit ¶
func (dv *DictVectorizer) Fit(data []map[string]any)
Fit builds the feature mapping from a list of feature dicts.
func (*DictVectorizer) FitTransform ¶
func (dv *DictVectorizer) FitTransform(data []map[string]any) []SparseVector
FitTransform fits and transforms the data.
func (*DictVectorizer) Transform ¶
func (dv *DictVectorizer) Transform(d map[string]any) SparseVector
Transform converts a feature dict to a sparse vector.
func (*DictVectorizer) VocabSize ¶
func (dv *DictVectorizer) VocabSize() int
VocabSize returns the number of features.
type SparseVector ¶
SparseVector represents a sparse float64 vector.
func ConcatSparse ¶
func ConcatSparse(vectors []SparseVector) SparseVector
ConcatSparse concatenates multiple sparse vectors into a single vector, applying an index offset to each input vector.
func NewSparseVector ¶
func NewSparseVector(dim int) SparseVector
NewSparseVector creates a sparse vector with given dimension.
func (SparseVector) Dot ¶
func (sv SparseVector) Dot(dense []float64) float64
Dot computes the dot product with a dense vector.
func (SparseVector) L2Norm ¶
func (sv SparseVector) L2Norm() float64
L2Norm returns the L2 norm of the sparse vector.
func (SparseVector) Nnz ¶
func (sv SparseVector) Nnz() int
Nnz returns the number of non-zero entries.
func (*SparseVector) Set ¶
func (sv *SparseVector) Set(idx int, val float64)
Set adds or updates a value at the given index.
func (SparseVector) ToDense ¶
func (sv SparseVector) ToDense() []float64
ToDense converts to a dense float64 slice.
type TfidfVectorizer ¶
type TfidfVectorizer struct {
CountVec *CountVectorizer `json:"count_vec"`
IDF []float64 `json:"idf"`
StopWords map[string]bool `json:"stop_words,omitempty"`
}
TfidfVectorizer converts text to TF-IDF weighted vectors. It uses binary mode (binary=true, matching Formasaurus): a term's value is IDF[term] if the term is present in the document and 0 otherwise.
func NewTfidfVectorizer ¶
func NewTfidfVectorizer(ngramRange [2]int, minDF int, binary bool, analyzer string, stopWords map[string]bool) *TfidfVectorizer
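NewTfidfVectorizer creates a TfidfVectorizer configured with the given n-gram range, minimum document frequency, binary flag, analyzer, and stop-word set.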
func (*TfidfVectorizer) Fit ¶
func (tv *TfidfVectorizer) Fit(corpus []string)
Fit computes IDF values from a corpus.
func (*TfidfVectorizer) FitTransform ¶
func (tv *TfidfVectorizer) FitTransform(corpus []string) []SparseVector
FitTransform fits and transforms the corpus.
func (*TfidfVectorizer) Transform ¶
func (tv *TfidfVectorizer) Transform(text string) SparseVector
Transform converts a single document to a TF-IDF sparse vector.
func (*TfidfVectorizer) VocabSize ¶
func (tv *TfidfVectorizer) VocabSize() int
VocabSize returns the vocabulary size.