Documentation
¶
Index ¶
- func NewEmbeddingVectorizer(embedType types.EmbeddingType) (defaultConfig Config, vectorizer Vectorizer)
- type BM25Config
- type BM25FConfig
- type BM25FVectorizer
- func (v *BM25FVectorizer) BatchTransform(docs []map[string]string) [][]float32
- func (v *BM25FVectorizer) BatchTransformStrings(documents []string) [][]float32
- func (v *BM25FVectorizer) CreateBatchEmbeddingFunc() BatchEmbeddingFunc
- func (v *BM25FVectorizer) CreateEmbeddingFunc() EmbeddingFunc
- func (v *BM25FVectorizer) DisableCache()
- func (v *BM25FVectorizer) EnableCache()
- func (v *BM25FVectorizer) Fit(documents []map[string]string)
- func (v *BM25FVectorizer) GetAvgDocLength() float64
- func (v *BM25FVectorizer) GetCacheStats() (hits, misses, size int64)
- func (v *BM25FVectorizer) GetDimension() int
- func (v *BM25FVectorizer) GetVocabularySize() int
- func (v *BM25FVectorizer) Transform(doc map[string]string) []float32
- type BM25FVectorizerAdapter
- func (a *BM25FVectorizerAdapter) BatchTransform(documents []string) [][]float32
- func (a *BM25FVectorizerAdapter) CreateBatchEmbeddingFunc() BatchEmbeddingFunc
- func (a *BM25FVectorizerAdapter) CreateEmbeddingFunc() EmbeddingFunc
- func (a *BM25FVectorizerAdapter) Fit(documents []string)
- func (a *BM25FVectorizerAdapter) GetDimension() int
- func (a *BM25FVectorizerAdapter) Transform(document string) []float32
- type BM25LVectorizer
- func (v *BM25LVectorizer) BatchTransform(documents []string) [][]float32
- func (v *BM25LVectorizer) CreateBatchEmbeddingFunc() BatchEmbeddingFunc
- func (v *BM25LVectorizer) CreateEmbeddingFunc() EmbeddingFunc
- func (v *BM25LVectorizer) DisableCache()
- func (v *BM25LVectorizer) EnableCache()
- func (v *BM25LVectorizer) Fit(documents []string)
- func (v *BM25LVectorizer) GetAvgDocLength() float64
- func (v *BM25LVectorizer) GetCacheStats() (hits, misses, size int64)
- func (v *BM25LVectorizer) GetDimension() int
- func (v *BM25LVectorizer) GetVocabularySize() int
- func (v *BM25LVectorizer) Transform(document string) []float32
- type BM25PlusVectorizer
- func (v *BM25PlusVectorizer) BatchTransform(documents []string) [][]float32
- func (v *BM25PlusVectorizer) CreateBatchEmbeddingFunc() BatchEmbeddingFunc
- func (v *BM25PlusVectorizer) CreateEmbeddingFunc() EmbeddingFunc
- func (v *BM25PlusVectorizer) DisableCache()
- func (v *BM25PlusVectorizer) EnableCache()
- func (v *BM25PlusVectorizer) Fit(documents []string)
- func (v *BM25PlusVectorizer) GetAvgDocLength() float64
- func (v *BM25PlusVectorizer) GetCacheStats() (hits, misses, size int64)
- func (v *BM25PlusVectorizer) GetDimension() int
- func (v *BM25PlusVectorizer) GetVocabularySize() int
- func (v *BM25PlusVectorizer) SetDelta(delta float64)
- func (v *BM25PlusVectorizer) Transform(document string) []float32
- type BM25Vectorizer
- func (v *BM25Vectorizer) BatchTransform(documents []string) [][]float32
- func (v *BM25Vectorizer) CreateBatchEmbeddingFunc() BatchEmbeddingFunc
- func (v *BM25Vectorizer) CreateEmbeddingFunc() EmbeddingFunc
- func (v *BM25Vectorizer) DisableCache()
- func (v *BM25Vectorizer) EnableCache()
- func (v *BM25Vectorizer) Fit(documents []string)
- func (v *BM25Vectorizer) GetAvgDocLength() float64
- func (v *BM25Vectorizer) GetCacheStats() (hits, misses, size int64)
- func (v *BM25Vectorizer) GetDimension() int
- func (v *BM25Vectorizer) GetVocabularySize() int
- func (v *BM25Vectorizer) Transform(document string) []float32
- type BatchEmbeddingFunc
- type Config
- type EmbeddingFunc
- type EmbeddingResponse
- type IncrementalBM25Vectorizer
- func (v *IncrementalBM25Vectorizer) AddDocuments(newDocuments []string) error
- func (v *IncrementalBM25Vectorizer) ClearCache()
- func (v *IncrementalBM25Vectorizer) DisableCache()
- func (v *IncrementalBM25Vectorizer) EnableCache()
- func (v *IncrementalBM25Vectorizer) Fit(documents []string)
- func (v *IncrementalBM25Vectorizer) FitTransform(documents []string) [][]float32
- func (v *IncrementalBM25Vectorizer) GetCacheSize() int
- func (v *IncrementalBM25Vectorizer) GetDimension() int
- func (v *IncrementalBM25Vectorizer) GetDocCount() int
- func (v *IncrementalBM25Vectorizer) GetVersion() int64
- func (v *IncrementalBM25Vectorizer) GetVocabularySize() int
- func (v *IncrementalBM25Vectorizer) IsCacheEnabled() bool
- func (v *IncrementalBM25Vectorizer) RemoveDocuments(docIndices []int) error
- func (v *IncrementalBM25Vectorizer) Transform(document string) []float32
- type MinHeap
- type OpenAIClient
- type PrecomputedVectors
- type QueryOptimizer
- func (o *QueryOptimizer) BatchTopKSearch(queries []string, documents []string, k int) [][]SearchResult
- func (o *QueryOptimizer) ClearCache()
- func (o *QueryOptimizer) ClearPrecomputedVectors()
- func (o *QueryOptimizer) DisableCache()
- func (o *QueryOptimizer) EnableCache()
- func (o *QueryOptimizer) GetStats() QueryOptimizerStats
- func (o *QueryOptimizer) PrecomputeDocVectors(documents []string)
- func (o *QueryOptimizer) PrecomputeDocVectorsWithPool(documents []string, pool *sync.Pool)
- func (o *QueryOptimizer) PrunedSearch(query string, documents []string, k int, pruneThreshold float32) []SearchResult
- func (o *QueryOptimizer) SetMaxResults(max int)
- func (o *QueryOptimizer) TopKSearch(query string, documents []string, k int) []SearchResult
- type QueryOptimizerStats
- type SearchResult
- type SparseBM25Vectorizer
- func (v *SparseBM25Vectorizer) ClearCache()
- func (v *SparseBM25Vectorizer) DisableCache()
- func (v *SparseBM25Vectorizer) EnableCache()
- func (v *SparseBM25Vectorizer) Fit(documents []string)
- func (v *SparseBM25Vectorizer) FitTransform(documents []string) []*SparseVector
- func (v *SparseBM25Vectorizer) GetCacheSize() int
- func (v *SparseBM25Vectorizer) GetDimension() int
- func (v *SparseBM25Vectorizer) GetVocabularySize() int
- func (v *SparseBM25Vectorizer) IsCacheEnabled() bool
- func (v *SparseBM25Vectorizer) SetThreshold(threshold float32)
- func (v *SparseBM25Vectorizer) SparseToDense(sparse *SparseVector) []float32
- func (v *SparseBM25Vectorizer) TransformToSparse(document string) *SparseVector
- type SparseVector
- type TFIDFConfig
- type TFIDFVectorizer
- func (v *TFIDFVectorizer) BatchTransform(documents []string) [][]float32
- func (v *TFIDFVectorizer) CreateBatchEmbeddingFunc() BatchEmbeddingFunc
- func (v *TFIDFVectorizer) CreateEmbeddingFunc() EmbeddingFunc
- func (v *TFIDFVectorizer) Fit(documents []string)
- func (v *TFIDFVectorizer) FitTransform(documents []string) [][]float32
- func (v *TFIDFVectorizer) GetDimension() int
- func (v *TFIDFVectorizer) GetVocabularySize() int
- func (v *TFIDFVectorizer) SetTokenizer(tokenizer func(string) []string)
- func (v *TFIDFVectorizer) Transform(document string) []float32
- type Vectorizer
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func NewEmbeddingVectorizer ¶
func NewEmbeddingVectorizer(embedType types.EmbeddingType) (defaultConfig Config, vectorizer Vectorizer)
NewEmbeddingVectorizer 创建向量化器
Types ¶
type BM25Config ¶
type BM25Config struct {
MaxVocabSize int // 最大词汇表大小(默认10000)
MinDocFreq int // 最小文档频率(默认1)
MaxDocFreq float64 // 最大文档频率比例(默认1.0,即不限制)
K1 float64 // 词频饱和参数(默认1.5,通常1.2-2.0)
B float64 // 长度归一化参数(默认0.75,通常0.5-1.0)
Variant types.EmbeddingType // BM25变体: "bm25", "bm25l", "bm25+", "bm25f"(默认"bm25")
Delta float64 // BM25+的delta参数(默认1.0)
}
BM25Config BM25配置
type BM25FConfig ¶
type BM25FConfig struct {
BM25Config
FieldWeights map[string]float64 // 字段权重,如 {"title": 2.0, "content": 1.0}
}
BM25FConfig BM25F 多字段配置
type BM25FVectorizer ¶
type BM25FVectorizer struct {
// contains filtered or unexported fields
}
BM25FVectorizer BM25F 多字段向量化器(优化版) BM25F 支持多字段文档,每个字段可以有不同的权重 优化特性: - 使用切片代替 map 存储 IDF,提升访问性能 - 支持查询结果缓存
func NewBM25FVectorizer ¶
func NewBM25FVectorizer(config BM25FConfig) *BM25FVectorizer
NewBM25FVectorizer 创建 BM25F 向量化器
func (*BM25FVectorizer) BatchTransform ¶
func (v *BM25FVectorizer) BatchTransform(docs []map[string]string) [][]float32
BatchTransform 批量向量化(动态worker优化版)
func (*BM25FVectorizer) BatchTransformStrings ¶
func (v *BM25FVectorizer) BatchTransformStrings(documents []string) [][]float32
BatchTransformStrings 批量向量化(单字符串版本,实现 Vectorizer 接口)
func (*BM25FVectorizer) CreateBatchEmbeddingFunc ¶
func (v *BM25FVectorizer) CreateBatchEmbeddingFunc() BatchEmbeddingFunc
CreateBatchEmbeddingFunc 创建批量Embedding函数
func (*BM25FVectorizer) CreateEmbeddingFunc ¶
func (v *BM25FVectorizer) CreateEmbeddingFunc() EmbeddingFunc
CreateEmbeddingFunc 创建Embedding函数
func (*BM25FVectorizer) Fit ¶
func (v *BM25FVectorizer) Fit(documents []map[string]string)
Fit 优化的训练方法(并行化版)
func (*BM25FVectorizer) GetAvgDocLength ¶
func (v *BM25FVectorizer) GetAvgDocLength() float64
GetAvgDocLength 获取平均文档长度
func (*BM25FVectorizer) GetCacheStats ¶
func (v *BM25FVectorizer) GetCacheStats() (hits, misses, size int64)
GetCacheStats 获取缓存统计
func (*BM25FVectorizer) GetDimension ¶
func (v *BM25FVectorizer) GetDimension() int
GetDimension 获取向量维度
func (*BM25FVectorizer) GetVocabularySize ¶
func (v *BM25FVectorizer) GetVocabularySize() int
GetVocabularySize 获取词汇表大小
type BM25FVectorizerAdapter ¶
type BM25FVectorizerAdapter struct {
// contains filtered or unexported fields
}
BM25FVectorizerAdapter BM25F 向量化器适配器,将多字段向量器适配为单字段接口
func NewBM25FVectorizerAdapter ¶
func NewBM25FVectorizerAdapter(vectorizer *BM25FVectorizer) *BM25FVectorizerAdapter
func (*BM25FVectorizerAdapter) BatchTransform ¶
func (a *BM25FVectorizerAdapter) BatchTransform(documents []string) [][]float32
func (*BM25FVectorizerAdapter) CreateBatchEmbeddingFunc ¶
func (a *BM25FVectorizerAdapter) CreateBatchEmbeddingFunc() BatchEmbeddingFunc
func (*BM25FVectorizerAdapter) CreateEmbeddingFunc ¶
func (a *BM25FVectorizerAdapter) CreateEmbeddingFunc() EmbeddingFunc
func (*BM25FVectorizerAdapter) Fit ¶
func (a *BM25FVectorizerAdapter) Fit(documents []string)
func (*BM25FVectorizerAdapter) GetDimension ¶
func (a *BM25FVectorizerAdapter) GetDimension() int
func (*BM25FVectorizerAdapter) Transform ¶
func (a *BM25FVectorizerAdapter) Transform(document string) []float32
type BM25LVectorizer ¶
type BM25LVectorizer struct {
// contains filtered or unexported fields
}
BM25LVectorizer BM25L 向量化器(优化版) BM25L 是改进长度归一化的 BM25 变体 优化特性: - 使用切片代替 map 存储 IDF,提升访问性能 - 支持查询结果缓存
func NewBM25LVectorizer ¶
func NewBM25LVectorizer(config BM25Config) *BM25LVectorizer
NewBM25LVectorizer 创建 BM25L 向量化器
func (*BM25LVectorizer) BatchTransform ¶
func (v *BM25LVectorizer) BatchTransform(documents []string) [][]float32
BatchTransform 批量处理(动态worker优化版)
func (*BM25LVectorizer) CreateBatchEmbeddingFunc ¶
func (v *BM25LVectorizer) CreateBatchEmbeddingFunc() BatchEmbeddingFunc
CreateBatchEmbeddingFunc 创建批量Embedding函数
func (*BM25LVectorizer) CreateEmbeddingFunc ¶
func (v *BM25LVectorizer) CreateEmbeddingFunc() EmbeddingFunc
CreateEmbeddingFunc 创建Embedding函数
func (*BM25LVectorizer) GetAvgDocLength ¶
func (v *BM25LVectorizer) GetAvgDocLength() float64
GetAvgDocLength 获取平均文档长度
func (*BM25LVectorizer) GetCacheStats ¶
func (v *BM25LVectorizer) GetCacheStats() (hits, misses, size int64)
GetCacheStats 获取统计
func (*BM25LVectorizer) GetDimension ¶
func (v *BM25LVectorizer) GetDimension() int
GetDimension 获取向量维度
func (*BM25LVectorizer) GetVocabularySize ¶
func (v *BM25LVectorizer) GetVocabularySize() int
GetVocabularySize 获取词汇表大小
func (*BM25LVectorizer) Transform ¶
func (v *BM25LVectorizer) Transform(document string) []float32
Transform 向量化
type BM25PlusVectorizer ¶
type BM25PlusVectorizer struct {
// contains filtered or unexported fields
}
BM25PlusVectorizer BM25+ 向量化器(优化版) BM25+ 是处理负 IDF 值的 BM25 变体 优化特性: - 使用切片代替 map 存储 IDF,提升访问性能 - 支持查询结果缓存
func NewBM25PlusVectorizer ¶
func NewBM25PlusVectorizer(config BM25Config) *BM25PlusVectorizer
NewBM25PlusVectorizer 创建 BM25+ 向量化器
func (*BM25PlusVectorizer) BatchTransform ¶
func (v *BM25PlusVectorizer) BatchTransform(documents []string) [][]float32
BatchTransform 批量处理(动态worker优化版)
func (*BM25PlusVectorizer) CreateBatchEmbeddingFunc ¶
func (v *BM25PlusVectorizer) CreateBatchEmbeddingFunc() BatchEmbeddingFunc
CreateBatchEmbeddingFunc 创建批量Embedding函数
func (*BM25PlusVectorizer) CreateEmbeddingFunc ¶
func (v *BM25PlusVectorizer) CreateEmbeddingFunc() EmbeddingFunc
CreateEmbeddingFunc 创建Embedding函数
func (*BM25PlusVectorizer) DisableCache ¶
func (v *BM25PlusVectorizer) DisableCache()
DisableCache 禁用缓存
func (*BM25PlusVectorizer) EnableCache ¶
func (v *BM25PlusVectorizer) EnableCache()
EnableCache 启用缓存
func (*BM25PlusVectorizer) GetAvgDocLength ¶
func (v *BM25PlusVectorizer) GetAvgDocLength() float64
GetAvgDocLength 获取平均文档长度
func (*BM25PlusVectorizer) GetCacheStats ¶
func (v *BM25PlusVectorizer) GetCacheStats() (hits, misses, size int64)
GetCacheStats 获取统计
func (*BM25PlusVectorizer) GetDimension ¶
func (v *BM25PlusVectorizer) GetDimension() int
GetDimension 获取向量维度
func (*BM25PlusVectorizer) GetVocabularySize ¶
func (v *BM25PlusVectorizer) GetVocabularySize() int
GetVocabularySize 获取词汇表大小
func (*BM25PlusVectorizer) SetDelta ¶
func (v *BM25PlusVectorizer) SetDelta(delta float64)
SetDelta 设置 delta 参数
func (*BM25PlusVectorizer) Transform ¶
func (v *BM25PlusVectorizer) Transform(document string) []float32
Transform 向量化
type BM25Vectorizer ¶
type BM25Vectorizer struct {
// contains filtered or unexported fields
}
BM25Vectorizer BM25向量化器(优化版) BM25 是一种改进的 TF-IDF 算法,广泛用于信息检索 优化特性: - 使用切片代替 map 存储 IDF,提升访问性能 - 支持查询结果缓存 - 批量查询并发优化
func NewBM25Vectorizer ¶
func NewBM25Vectorizer(config BM25Config) *BM25Vectorizer
NewBM25Vectorizer 创建 BM25 向量化器
func (*BM25Vectorizer) BatchTransform ¶
func (v *BM25Vectorizer) BatchTransform(documents []string) [][]float32
BatchTransform 批量向量化(动态worker优化版)
func (*BM25Vectorizer) CreateBatchEmbeddingFunc ¶
func (v *BM25Vectorizer) CreateBatchEmbeddingFunc() BatchEmbeddingFunc
CreateBatchEmbeddingFunc 创建批量Embedding函数
func (*BM25Vectorizer) CreateEmbeddingFunc ¶
func (v *BM25Vectorizer) CreateEmbeddingFunc() EmbeddingFunc
CreateEmbeddingFunc 创建Embedding函数
func (*BM25Vectorizer) GetAvgDocLength ¶
func (v *BM25Vectorizer) GetAvgDocLength() float64
GetAvgDocLength 获取平均文档长度
func (*BM25Vectorizer) GetCacheStats ¶
func (v *BM25Vectorizer) GetCacheStats() (hits, misses, size int64)
GetCacheStats 获取缓存统计
func (*BM25Vectorizer) GetDimension ¶
func (v *BM25Vectorizer) GetDimension() int
GetDimension 获取向量维度(词汇表大小)
func (*BM25Vectorizer) GetVocabularySize ¶
func (v *BM25Vectorizer) GetVocabularySize() int
GetVocabularySize 获取词汇表大小
func (*BM25Vectorizer) Transform ¶
func (v *BM25Vectorizer) Transform(document string) []float32
Transform 优化的向量化方法
type BatchEmbeddingFunc ¶
BatchEmbeddingFunc 定义批量文本到向量的转换函数
func MockBatchEmbeddingFunc ¶
func MockBatchEmbeddingFunc(dim int) BatchEmbeddingFunc
MockBatchEmbeddingFunc 创建一个批量Mock Embedding函数
func RandomBatchEmbeddingFunc ¶
func RandomBatchEmbeddingFunc(dim int) BatchEmbeddingFunc
RandomBatchEmbeddingFunc 创建一个批量随机Embedding函数
type EmbeddingFunc ¶
EmbeddingFunc 定义文本到向量的转换函数
func MockEmbeddingFunc ¶
func MockEmbeddingFunc(dim int) EmbeddingFunc
MockEmbeddingFunc 创建一个基于文本哈希的Mock Embedding函数 这个函数为相同的文本生成相同的向量,便于测试
func RandomEmbeddingFunc ¶
func RandomEmbeddingFunc(dim int) EmbeddingFunc
RandomEmbeddingFunc 创建一个生成随机向量的Embedding函数 每次调用都会生成不同的向量
type EmbeddingResponse ¶
type EmbeddingResponse struct {
Object string `json:"object"`
Data []struct {
Object string `json:"object"`
Embedding []float32 `json:"embedding"`
Index int `json:"index"`
} `json:"data"`
Model string `json:"model"`
Usage struct {
PromptTokens int `json:"prompt_tokens"`
TotalTokens int `json:"total_tokens"`
} `json:"usage"`
}
EmbeddingResponse OpenAI API响应结构
type IncrementalBM25Vectorizer ¶
type IncrementalBM25Vectorizer struct {
// contains filtered or unexported fields
}
IncrementalBM25Vectorizer 支持增量更新的 BM25 向量化器
func NewIncrementalBM25Vectorizer ¶
func NewIncrementalBM25Vectorizer(config BM25Config) *IncrementalBM25Vectorizer
NewIncrementalBM25Vectorizer 创建支持增量更新的 BM25 向量化器
func (*IncrementalBM25Vectorizer) AddDocuments ¶
func (v *IncrementalBM25Vectorizer) AddDocuments(newDocuments []string) error
AddDocuments 增量添加新文档
func (*IncrementalBM25Vectorizer) ClearCache ¶
func (v *IncrementalBM25Vectorizer) ClearCache()
ClearCache 清空缓存
func (*IncrementalBM25Vectorizer) DisableCache ¶
func (v *IncrementalBM25Vectorizer) DisableCache()
DisableCache 禁用缓存
func (*IncrementalBM25Vectorizer) EnableCache ¶
func (v *IncrementalBM25Vectorizer) EnableCache()
EnableCache 启用缓存
func (*IncrementalBM25Vectorizer) Fit ¶
func (v *IncrementalBM25Vectorizer) Fit(documents []string)
Fit 初始训练 BM25 模型
func (*IncrementalBM25Vectorizer) FitTransform ¶
func (v *IncrementalBM25Vectorizer) FitTransform(documents []string) [][]float32
FitTransform 训练并转换文档
func (*IncrementalBM25Vectorizer) GetCacheSize ¶
func (v *IncrementalBM25Vectorizer) GetCacheSize() int
GetCacheSize 获取缓存大小
func (*IncrementalBM25Vectorizer) GetDimension ¶
func (v *IncrementalBM25Vectorizer) GetDimension() int
GetDimension 获取向量维度
func (*IncrementalBM25Vectorizer) GetDocCount ¶
func (v *IncrementalBM25Vectorizer) GetDocCount() int
GetDocCount 获取文档数量
func (*IncrementalBM25Vectorizer) GetVersion ¶
func (v *IncrementalBM25Vectorizer) GetVersion() int64
GetVersion 获取当前版本号
func (*IncrementalBM25Vectorizer) GetVocabularySize ¶
func (v *IncrementalBM25Vectorizer) GetVocabularySize() int
GetVocabularySize 获取词汇表大小
func (*IncrementalBM25Vectorizer) IsCacheEnabled ¶
func (v *IncrementalBM25Vectorizer) IsCacheEnabled() bool
IsCacheEnabled 检查缓存是否启用
func (*IncrementalBM25Vectorizer) RemoveDocuments ¶
func (v *IncrementalBM25Vectorizer) RemoveDocuments(docIndices []int) error
RemoveDocuments 移除指定索引的文档 (通过标记实现,不实际删除以保持一致性)
func (*IncrementalBM25Vectorizer) Transform ¶
func (v *IncrementalBM25Vectorizer) Transform(document string) []float32
Transform 转换文档
type OpenAIClient ¶
type OpenAIClient struct {
// contains filtered or unexported fields
}
OpenAIClient OpenAI Embedding客户端
func NewOpenAIClient ¶
func NewOpenAIClient(baseURL, apiKey, model string) *OpenAIClient
NewOpenAIClient 创建OpenAI客户端 baseURL: API基础URL,默认为 https://api.openai.com/v1 apiKey: OpenAI API密钥 model: 使用的模型,默认为 text-embedding-3-small
func NewOpenAIClientFromConfig ¶
func NewOpenAIClientFromConfig(config Config) *OpenAIClient
NewOpenAIClientFromConfig 从配置创建OpenAI客户端
func (*OpenAIClient) CreateBatchEmbeddingFunc ¶
func (c *OpenAIClient) CreateBatchEmbeddingFunc() BatchEmbeddingFunc
CreateBatchEmbeddingFunc 返回一个BatchEmbeddingFunc,使用OpenAI客户端
func (*OpenAIClient) CreateBatchEmbeddings ¶
func (c *OpenAIClient) CreateBatchEmbeddings(texts []string) ([][]float32, error)
CreateBatchEmbeddings 为多个文本批量创建向量嵌入
func (*OpenAIClient) CreateEmbedding ¶
func (c *OpenAIClient) CreateEmbedding(text string) ([]float32, error)
CreateEmbedding 为单个文本创建向量嵌入
func (*OpenAIClient) CreateEmbeddingFunc ¶
func (c *OpenAIClient) CreateEmbeddingFunc() EmbeddingFunc
CreateEmbeddingFunc 返回一个EmbeddingFunc,使用OpenAI客户端
type PrecomputedVectors ¶ added in v1.0.3
type PrecomputedVectors struct {
// contains filtered or unexported fields
}
PrecomputedVectors 预计算的向量集合
func NewPrecomputedVectors ¶ added in v1.0.3
func NewPrecomputedVectors(capacity int) *PrecomputedVectors
NewPrecomputedVectors 创建预计算向量容器
func (*PrecomputedVectors) Get ¶ added in v1.0.3
func (p *PrecomputedVectors) Get(idx int) []float32
Get 获取指定索引的向量
func (*PrecomputedVectors) SetVectors ¶ added in v1.0.3
func (p *PrecomputedVectors) SetVectors(vectors [][]float32)
SetVectors 设置预计算的向量
type QueryOptimizer ¶
type QueryOptimizer struct {
// contains filtered or unexported fields
}
QueryOptimizer 优化后的查询优化器
func NewQueryOptimizer ¶
func NewQueryOptimizer(vectorizer *BM25Vectorizer) *QueryOptimizer
NewQueryOptimizer 创建优化的查询优化器
func (*QueryOptimizer) BatchTopKSearch ¶
func (o *QueryOptimizer) BatchTopKSearch(queries []string, documents []string, k int) [][]SearchResult
BatchTopKSearch 批量 Top-K 搜索 - 一次性处理多个查询
func (*QueryOptimizer) ClearPrecomputedVectors ¶ added in v1.0.3
func (o *QueryOptimizer) ClearPrecomputedVectors()
ClearPrecomputedVectors 清除预计算的向量
func (*QueryOptimizer) GetStats ¶
func (o *QueryOptimizer) GetStats() QueryOptimizerStats
GetStats 获取统计信息
func (*QueryOptimizer) PrecomputeDocVectors ¶ added in v1.0.3
func (o *QueryOptimizer) PrecomputeDocVectors(documents []string)
PrecomputeDocVectors 预计算所有文档的向量(核心优化) 这避免了搜索时重复对每个文档进行向量化
func (*QueryOptimizer) PrecomputeDocVectorsWithPool ¶ added in v1.0.3
func (o *QueryOptimizer) PrecomputeDocVectorsWithPool(documents []string, pool *sync.Pool)
PrecomputeDocVectorsWithPool 使用对象池预计算向量
func (*QueryOptimizer) PrunedSearch ¶
func (o *QueryOptimizer) PrunedSearch(query string, documents []string, k int, pruneThreshold float32) []SearchResult
PrunedSearch 剪枝搜索 - 提前终止
func (*QueryOptimizer) SetMaxResults ¶
func (o *QueryOptimizer) SetMaxResults(max int)
SetMaxResults 设置最大结果数
func (*QueryOptimizer) TopKSearch ¶
func (o *QueryOptimizer) TopKSearch(query string, documents []string, k int) []SearchResult
TopKSearch 优化的 Top-K 搜索 优化:优先使用预计算的向量,避免重复向量化
type QueryOptimizerStats ¶
type QueryOptimizerStats struct {
HitCount int64
MissCount int64
TotalTime time.Duration
AvgTime time.Duration
}
QueryOptimizerStats 优化器统计信息
type SearchResult ¶
SearchResult 搜索结果
type SparseBM25Vectorizer ¶
type SparseBM25Vectorizer struct {
// contains filtered or unexported fields
}
SparseBM25Vectorizer 支持稀疏向量优化的 BM25 向量化器
func NewSparseBM25Vectorizer ¶
func NewSparseBM25Vectorizer(config BM25Config) *SparseBM25Vectorizer
NewSparseBM25Vectorizer 创建稀疏向量 BM25 向量化器
func (*SparseBM25Vectorizer) ClearCache ¶
func (v *SparseBM25Vectorizer) ClearCache()
ClearCache 清空缓存
func (*SparseBM25Vectorizer) DisableCache ¶
func (v *SparseBM25Vectorizer) DisableCache()
DisableCache 禁用缓存
func (*SparseBM25Vectorizer) EnableCache ¶
func (v *SparseBM25Vectorizer) EnableCache()
EnableCache 启用缓存
func (*SparseBM25Vectorizer) Fit ¶
func (v *SparseBM25Vectorizer) Fit(documents []string)
Fit 训练 BM25 模型
func (*SparseBM25Vectorizer) FitTransform ¶
func (v *SparseBM25Vectorizer) FitTransform(documents []string) []*SparseVector
FitTransform 训练并转换文档
func (*SparseBM25Vectorizer) GetCacheSize ¶
func (v *SparseBM25Vectorizer) GetCacheSize() int
GetCacheSize 获取缓存大小
func (*SparseBM25Vectorizer) GetDimension ¶
func (v *SparseBM25Vectorizer) GetDimension() int
GetDimension 获取向量维度
func (*SparseBM25Vectorizer) GetVocabularySize ¶
func (v *SparseBM25Vectorizer) GetVocabularySize() int
GetVocabularySize 获取词汇表大小
func (*SparseBM25Vectorizer) IsCacheEnabled ¶
func (v *SparseBM25Vectorizer) IsCacheEnabled() bool
IsCacheEnabled 检查缓存是否启用
func (*SparseBM25Vectorizer) SetThreshold ¶
func (v *SparseBM25Vectorizer) SetThreshold(threshold float32)
SetThreshold 设置稀疏化阈值
func (*SparseBM25Vectorizer) SparseToDense ¶
func (v *SparseBM25Vectorizer) SparseToDense(sparse *SparseVector) []float32
SparseToDense 将稀疏向量转换为稠密向量
func (*SparseBM25Vectorizer) TransformToSparse ¶
func (v *SparseBM25Vectorizer) TransformToSparse(document string) *SparseVector
TransformToSparse 将文档转换为稀疏向量
type SparseVector ¶
SparseVector 稀疏向量表示
func (*SparseVector) DotProduct ¶
func (v1 *SparseVector) DotProduct(v2 *SparseVector) float32
DotProduct 计算两个稀疏向量的点积
type TFIDFConfig ¶
type TFIDFConfig struct {
MaxVocabSize int // 最大词汇表大小(默认10000)
MinDocFreq int // 最小文档频率(默认1)
MaxDocFreq float64 // 最大文档频率比例(默认1.0,即不限制)
Normalize bool // 是否归一化向量(默认true)
}
TFIDFConfig TF-IDF配置
type TFIDFVectorizer ¶
type TFIDFVectorizer struct {
// contains filtered or unexported fields
}
TFIDFVectorizer TF-IDF向量化器
func NewTFIDFVectorizer ¶
func NewTFIDFVectorizer(config TFIDFConfig) *TFIDFVectorizer
NewTFIDFVectorizer 创建TF-IDF向量化器
func (*TFIDFVectorizer) BatchTransform ¶
func (v *TFIDFVectorizer) BatchTransform(documents []string) [][]float32
BatchTransform 批量转换文档
func (*TFIDFVectorizer) CreateBatchEmbeddingFunc ¶
func (v *TFIDFVectorizer) CreateBatchEmbeddingFunc() BatchEmbeddingFunc
CreateBatchEmbeddingFunc 创建批量Embedding函数
func (*TFIDFVectorizer) CreateEmbeddingFunc ¶
func (v *TFIDFVectorizer) CreateEmbeddingFunc() EmbeddingFunc
CreateEmbeddingFunc 创建Embedding函数
func (*TFIDFVectorizer) Fit ¶
func (v *TFIDFVectorizer) Fit(documents []string)
Fit 训练TF-IDF模型(构建词汇表和IDF)
func (*TFIDFVectorizer) FitTransform ¶
func (v *TFIDFVectorizer) FitTransform(documents []string) [][]float32
FitTransform 训练并转换文档
func (*TFIDFVectorizer) GetDimension ¶
func (v *TFIDFVectorizer) GetDimension() int
GetDimension 获取向量维度
func (*TFIDFVectorizer) GetVocabularySize ¶
func (v *TFIDFVectorizer) GetVocabularySize() int
GetVocabularySize 获取词汇表大小
func (*TFIDFVectorizer) SetTokenizer ¶
func (v *TFIDFVectorizer) SetTokenizer(tokenizer func(string) []string)
SetTokenizer 设置自定义分词函数
func (*TFIDFVectorizer) Transform ¶
func (v *TFIDFVectorizer) Transform(document string) []float32
Transform 将文档转换为TF-IDF向量
type Vectorizer ¶
type Vectorizer interface {
// Fit 训练向量化器
Fit(documents []string)
// Transform 将单个文档转换为向量
Transform(document string) []float32
// BatchTransform 批量将文档转换为向量
BatchTransform(documents []string) [][]float32
// GetDimension 获取向量维度
GetDimension() int
// CreateEmbeddingFunc 创建单个文本的 Embedding 函数
CreateEmbeddingFunc() EmbeddingFunc
// CreateBatchEmbeddingFunc 创建批量文本的 Embedding 函数
CreateBatchEmbeddingFunc() BatchEmbeddingFunc
}
Vectorizer 向量化器接口 统一的向量化器接口,支持 TF-IDF、BM25 等不同实现