Documentation
¶
Overview ¶
Package rag provides distributed RAG index functionality
Index ¶
- func AutoTuneChunkingOptions(content string) (ChunkingOptions, DocumentAnalysis)
- func GenerateChunkID(documentID string, index int) string
- func GenerateDocumentID(content []byte) string
- func IsSupportedExtension(ext string) bool
- func SupportedExtensions() []string
- type Chunk
- type ChunkResult
- type Chunker
- type ChunkingOptions
- type DistributedNode
- type DistributedRAG
- func (dr *DistributedRAG) AddNode(node DistributedNode)
- func (dr *DistributedRAG) GetStats() map[string]interface{}
- func (dr *DistributedRAG) ListNodes() []DistributedNode
- func (dr *DistributedRAG) RebalanceShards(ctx context.Context) error
- func (dr *DistributedRAG) RemoveNode(nodeID string) error
- func (dr *DistributedRAG) Search(ctx context.Context, query string, topK int, ...) (*DistributedSearchResult, error)
- func (dr *DistributedRAG) Stop()
- func (dr *DistributedRAG) SyncIndex(ctx context.Context) error
- type DistributedRAGConfig
- type DistributedSearchResult
- type Document
- type DocumentAnalysis
- type DocumentParser
- func (p *DocumentParser) Parse(content []byte, filename string, ext string) (*ParseResult, error)
- func (p *DocumentParser) ParseCSV(content []byte) (*ParseResult, error)
- func (p *DocumentParser) ParseCode(content []byte, ext string) (*ParseResult, error)
- func (p *DocumentParser) ParseDOCX(content []byte) (*ParseResult, error)
- func (p *DocumentParser) ParseHTML(content []byte) (*ParseResult, error)
- func (p *DocumentParser) ParseJSON(content []byte) (*ParseResult, error)
- func (p *DocumentParser) ParsePDF(content []byte) (*ParseResult, error)
- func (p *DocumentParser) ParsePPTX(content []byte) (*ParseResult, error)
- func (p *DocumentParser) ParsePlainText(content []byte) (*ParseResult, error)
- func (p *DocumentParser) ParseRTF(content []byte) (*ParseResult, error)
- func (p *DocumentParser) ParseXLSX(content []byte) (*ParseResult, error)
- func (p *DocumentParser) ParseXML(content []byte) (*ParseResult, error)
- type Engine
- func (e *Engine) AnalyzeDocument(content string) DocumentAnalysis
- func (e *Engine) AutoEnableWithModel(ctx context.Context, availableModels []string) error
- func (e *Engine) AutoRestore(ctx context.Context) error
- func (e *Engine) DeleteDocument(id string) bool
- func (e *Engine) Disable()
- func (e *Engine) Enable(ctx context.Context, embeddingModel string) error
- func (e *Engine) EnhancePrompt(ctx context.Context, userMessage string) (string, *RAGContext, error)
- func (e *Engine) GetDocument(id string) *Document
- func (e *Engine) GetPersistedModel() string
- func (e *Engine) IngestFile(ctx context.Context, filePath string, metadata map[string]string) (*Document, error)
- func (e *Engine) IngestReader(ctx context.Context, name string, reader io.Reader, metadata map[string]string) (*Document, error)
- func (e *Engine) IngestText(ctx context.Context, name, content string, metadata map[string]string) (*Document, error)
- func (e *Engine) IsAutoTuningEnabled() bool
- func (e *Engine) IsEnabled() bool
- func (e *Engine) ListDocuments() []*Document
- func (e *Engine) Search(ctx context.Context, query string, opts SearchOptions) (*RAGContext, error)
- func (e *Engine) SetAutoTuning(enabled bool)
- func (e *Engine) Stats() map[string]interface{}
- type NodeSearchResult
- type ParseResult
- type RAGContext
- type SQLiteStore
- func (s *SQLiteStore) AddChunk(chunk *Chunk, embedding []float32) error
- func (s *SQLiteStore) AddDocument(doc *Document) error
- func (s *SQLiteStore) Close() error
- func (s *SQLiteStore) DeleteDocument(id string) error
- func (s *SQLiteStore) GetDocument(id string) (*Document, error)
- func (s *SQLiteStore) HybridSearch(queryEmbedding []float32, query string, limit int, minScore float32, ...) ([]SearchResult, error)
- func (s *SQLiteStore) ListDocuments() ([]*Document, error)
- func (s *SQLiteStore) Search(queryEmbedding []float32, limit int, minScore float32) ([]SearchResult, error)
- func (s *SQLiteStore) Stats() map[string]interface{}
- type SearchOptions
- type SearchResult
- type Store
- type VectorStore
- func (vs *VectorStore) AddChunk(chunk *Chunk, embedding []float32) error
- func (vs *VectorStore) AddDocument(doc *Document) error
- func (vs *VectorStore) Close() error
- func (vs *VectorStore) DeleteDocument(docID string) error
- func (vs *VectorStore) GetAllEmbeddings() map[string][]float32
- func (vs *VectorStore) GetChunk(id string) (*Chunk, error)
- func (vs *VectorStore) GetDocument(id string) (*Document, error)
- func (vs *VectorStore) HybridSearch(queryEmbedding []float32, query string, limit int, minScore float32, ...) ([]SearchResult, error)
- func (vs *VectorStore) ListChunks() []*Chunk
- func (vs *VectorStore) ListDocuments() ([]*Document, error)
- func (vs *VectorStore) Search(queryEmbedding []float32, limit int, minScore float32) ([]SearchResult, error)
- func (vs *VectorStore) Stats() map[string]interface{}
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func AutoTuneChunkingOptions ¶ added in v0.2.11
func AutoTuneChunkingOptions(content string) (ChunkingOptions, DocumentAnalysis)
AutoTuneChunkingOptions analyzes a document and returns optimized chunking options
func GenerateChunkID ¶
func GenerateChunkID(documentID string, index int) string
GenerateChunkID creates a unique ID for a chunk
func GenerateDocumentID ¶
func GenerateDocumentID(content []byte) string
GenerateDocumentID creates a unique ID for a document based on content hash
func IsSupportedExtension ¶ added in v0.2.3
func IsSupportedExtension(ext string) bool
IsSupportedExtension checks if a file extension is supported
func SupportedExtensions ¶ added in v0.2.3
func SupportedExtensions() []string
SupportedExtensions returns all supported file extensions. It includes formats that can be parsed with built-in or optional external tools.
Types ¶
type Chunk ¶
type Chunk struct {
ID string `json:"id"`
DocumentID string `json:"document_id"`
Content string `json:"content"`
Index int `json:"index"` // Position in document
StartChar int `json:"start_char"` // Character offset in original document
EndChar int `json:"end_char"`
Embedding []float32 `json:"-"` // Stored separately for efficiency
CreatedAt time.Time `json:"created_at"`
}
Chunk represents a chunk of text from a document
type ChunkResult ¶ added in v0.2.11
type ChunkResult struct {
DocumentID string `json:"document_id"`
ChunkID string `json:"chunk_id"`
Content string `json:"content"`
Score float64 `json:"score"`
Metadata map[string]string `json:"metadata,omitempty"`
SourceNode string `json:"source_node,omitempty"`
}
ChunkResult represents a single chunk search result
type Chunker ¶
type Chunker struct {
// contains filtered or unexported fields
}
Chunker handles splitting documents into chunks
func NewChunker ¶
func NewChunker(opts ChunkingOptions) *Chunker
NewChunker creates a new chunker with the given options
type ChunkingOptions ¶
type ChunkingOptions struct {
ChunkSize int `json:"chunk_size"` // Target size in characters
ChunkOverlap int `json:"chunk_overlap"` // Overlap between chunks
Separator string `json:"separator"` // Primary separator (default: paragraph)
}
ChunkingOptions configures how documents are chunked
func DefaultChunkingOptions ¶
func DefaultChunkingOptions() ChunkingOptions
DefaultChunkingOptions returns sensible defaults for general documents
func LargeDocumentChunkingOptions ¶
func LargeDocumentChunkingOptions() ChunkingOptions
LargeDocumentChunkingOptions returns options for longer documents
type DistributedNode ¶ added in v0.2.11
type DistributedNode struct {
ID string `json:"id"`
URL string `json:"url"`
Name string `json:"name"`
Healthy bool `json:"healthy"`
LastCheck time.Time `json:"last_check"`
DocCount int `json:"doc_count"`
ChunkCount int `json:"chunk_count"`
IndexSizeMB float64 `json:"index_size_mb"`
}
DistributedNode represents a node in the distributed RAG cluster
type DistributedRAG ¶ added in v0.2.11
type DistributedRAG struct {
// contains filtered or unexported fields
}
DistributedRAG manages distributed RAG search across multiple nodes
func NewDistributedRAG ¶ added in v0.2.11
func NewDistributedRAG(config DistributedRAGConfig) *DistributedRAG
NewDistributedRAG creates a new distributed RAG manager
func (*DistributedRAG) AddNode ¶ added in v0.2.11
func (dr *DistributedRAG) AddNode(node DistributedNode)
AddNode adds a node to the distributed cluster
func (*DistributedRAG) GetStats ¶ added in v0.2.11
func (dr *DistributedRAG) GetStats() map[string]interface{}
GetStats returns distributed RAG statistics
func (*DistributedRAG) ListNodes ¶ added in v0.2.11
func (dr *DistributedRAG) ListNodes() []DistributedNode
ListNodes returns all configured nodes
func (*DistributedRAG) RebalanceShards ¶ added in v0.2.11
func (dr *DistributedRAG) RebalanceShards(ctx context.Context) error
RebalanceShards redistributes documents across nodes. This is a placeholder for future implementation of sharding.
func (*DistributedRAG) RemoveNode ¶ added in v0.2.11
func (dr *DistributedRAG) RemoveNode(nodeID string) error
RemoveNode removes a node from the cluster
func (*DistributedRAG) Search ¶ added in v0.2.11
func (dr *DistributedRAG) Search(ctx context.Context, query string, topK int, localSearch func(string, int) ([]ChunkResult, error)) (*DistributedSearchResult, error)
Search performs a distributed search across all healthy nodes
func (*DistributedRAG) Stop ¶ added in v0.2.11
func (dr *DistributedRAG) Stop()
Stop stops the distributed RAG manager
type DistributedRAGConfig ¶ added in v0.2.11
type DistributedRAGConfig struct {
LocalNodeID string `json:"local_node_id"`
SearchPath string `json:"search_path"`
HealthPath string `json:"health_path"`
TimeoutSeconds int `json:"timeout_seconds"`
HealthCheckSeconds int `json:"health_check_seconds"`
}
DistributedRAGConfig contains configuration for distributed RAG
type DistributedSearchResult ¶ added in v0.2.11
type DistributedSearchResult struct {
Chunks []ChunkResult `json:"chunks"`
TotalChunks int `json:"total_chunks"`
SearchTimeMS int64 `json:"search_time_ms"`
NodesQueried int `json:"nodes_queried"`
NodeResults []NodeSearchResult `json:"node_results"`
}
DistributedSearchResult represents a search result from a distributed search
type Document ¶
type Document struct {
ID string `json:"id"`
Name string `json:"name"`
ContentType string `json:"content_type"` // "text/plain", "application/pdf", etc.
Size int64 `json:"size"`
ChunkCount int `json:"chunk_count"`
Metadata map[string]string `json:"metadata,omitempty"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
}
Document represents an uploaded document
type DocumentAnalysis ¶ added in v0.2.11
type DocumentAnalysis struct {
TotalChars int `json:"total_chars"`
TotalWords int `json:"total_words"`
TotalParagraphs int `json:"total_paragraphs"`
TotalSentences int `json:"total_sentences"`
AvgWordsPerPara float64 `json:"avg_words_per_para"`
AvgWordsPerSent float64 `json:"avg_words_per_sent"`
DocumentType string `json:"document_type"` // prose, technical, code, list, mixed
RecommendedOpts ChunkingOptions `json:"recommended_options"`
Reasoning string `json:"reasoning"`
}
DocumentAnalysis contains analysis results for automatic chunking tuning
type DocumentParser ¶ added in v0.2.3
type DocumentParser struct {
// contains filtered or unexported fields
}
DocumentParser handles parsing of various document formats
func NewDocumentParser ¶ added in v0.2.3
func NewDocumentParser() *DocumentParser
NewDocumentParser creates a new document parser
func (*DocumentParser) Parse ¶ added in v0.2.3
func (p *DocumentParser) Parse(content []byte, filename string, ext string) (*ParseResult, error)
Parse attempts to parse a document based on its extension
func (*DocumentParser) ParseCSV ¶ added in v0.2.3
func (p *DocumentParser) ParseCSV(content []byte) (*ParseResult, error)
ParseCSV handles CSV files
func (*DocumentParser) ParseCode ¶ added in v0.2.3
func (p *DocumentParser) ParseCode(content []byte, ext string) (*ParseResult, error)
ParseCode handles source code files with syntax awareness
func (*DocumentParser) ParseDOCX ¶ added in v0.2.3
func (p *DocumentParser) ParseDOCX(content []byte) (*ParseResult, error)
ParseDOCX extracts text from a DOCX file
func (*DocumentParser) ParseHTML ¶ added in v0.2.3
func (p *DocumentParser) ParseHTML(content []byte) (*ParseResult, error)
ParseHTML extracts text from HTML
func (*DocumentParser) ParseJSON ¶ added in v0.2.3
func (p *DocumentParser) ParseJSON(content []byte) (*ParseResult, error)
ParseJSON formats JSON for readability
func (*DocumentParser) ParsePDF ¶ added in v0.2.3
func (p *DocumentParser) ParsePDF(content []byte) (*ParseResult, error)
ParsePDF extracts text from a PDF file with layout awareness. It first tries pdftotext (poppler-utils) for best results and falls back to basic extraction. It also detects images and provides metadata about document structure.
func (*DocumentParser) ParsePPTX ¶ added in v0.2.3
func (p *DocumentParser) ParsePPTX(content []byte) (*ParseResult, error)
ParsePPTX extracts text from a PPTX file
func (*DocumentParser) ParsePlainText ¶ added in v0.2.3
func (p *DocumentParser) ParsePlainText(content []byte) (*ParseResult, error)
ParsePlainText handles plain text files
func (*DocumentParser) ParseRTF ¶ added in v0.2.3
func (p *DocumentParser) ParseRTF(content []byte) (*ParseResult, error)
ParseRTF extracts text from RTF
func (*DocumentParser) ParseXLSX ¶ added in v0.2.3
func (p *DocumentParser) ParseXLSX(content []byte) (*ParseResult, error)
ParseXLSX extracts text from an XLSX file
func (*DocumentParser) ParseXML ¶ added in v0.2.3
func (p *DocumentParser) ParseXML(content []byte) (*ParseResult, error)
ParseXML extracts text from XML
type Engine ¶
type Engine struct {
// contains filtered or unexported fields
}
Engine is the main RAG engine that coordinates document ingestion and search
func NewEngine ¶
func NewEngine(embeddingEngine *inference.EmbeddingEngine, dataDir string) *Engine
NewEngine creates a new RAG engine
func (*Engine) AnalyzeDocument ¶ added in v0.2.11
func (e *Engine) AnalyzeDocument(content string) DocumentAnalysis
AnalyzeDocument analyzes a document and returns chunking recommendations. This is useful for previewing what settings would be used before ingestion.
func (*Engine) AutoEnableWithModel ¶ added in v0.2.9
func (e *Engine) AutoEnableWithModel(ctx context.Context, availableModels []string) error
AutoEnableWithModel auto-enables RAG if an embedding model is available. It looks for models with names containing "embed", "bge", "minilm", or "nomic".
func (*Engine) AutoRestore ¶
func (e *Engine) AutoRestore(ctx context.Context) error
AutoRestore attempts to restore RAG state from disk if data exists
func (*Engine) DeleteDocument ¶
func (e *Engine) DeleteDocument(id string) bool
DeleteDocument removes a document and its chunks
func (*Engine) EnhancePrompt ¶
func (e *Engine) EnhancePrompt(ctx context.Context, userMessage string) (string, *RAGContext, error)
EnhancePrompt enhances a user prompt with relevant context from documents
func (*Engine) GetDocument ¶
func (e *Engine) GetDocument(id string) *Document
GetDocument returns a document by ID
func (*Engine) GetPersistedModel ¶
func (e *Engine) GetPersistedModel() string
GetPersistedModel returns the embedding model from persisted data (if any). This is used to auto-restore RAG on server startup.
func (*Engine) IngestFile ¶
func (e *Engine) IngestFile(ctx context.Context, filePath string, metadata map[string]string) (*Document, error)
IngestFile ingests a file from the filesystem. It now supports PDF, DOCX, XLSX, PPTX, and many more formats.
func (*Engine) IngestReader ¶
func (e *Engine) IngestReader(ctx context.Context, name string, reader io.Reader, metadata map[string]string) (*Document, error)
IngestReader ingests content from an io.Reader
func (*Engine) IngestText ¶
func (e *Engine) IngestText(ctx context.Context, name, content string, metadata map[string]string) (*Document, error)
IngestText ingests plain text content
func (*Engine) IsAutoTuningEnabled ¶ added in v0.2.11
func (e *Engine) IsAutoTuningEnabled() bool
IsAutoTuningEnabled returns whether auto-tuning is enabled
func (*Engine) ListDocuments ¶
func (e *Engine) ListDocuments() []*Document
ListDocuments returns all documents
func (*Engine) Search ¶
func (e *Engine) Search(ctx context.Context, query string, opts SearchOptions) (*RAGContext, error)
Search searches for relevant chunks using hybrid search (semantic + keyword)
func (*Engine) SetAutoTuning ¶ added in v0.2.11
func (e *Engine) SetAutoTuning(enabled bool)
SetAutoTuning enables or disables automatic chunking tuning
type NodeSearchResult ¶ added in v0.2.11
type NodeSearchResult struct {
NodeID string `json:"node_id"`
Chunks []ChunkResult `json:"chunks"`
SearchTimeMS int64 `json:"search_time_ms"`
Error string `json:"error,omitempty"`
}
NodeSearchResult represents search results from a single node
type ParseResult ¶ added in v0.2.3
type ParseResult struct {
Content string
ContentType string
Metadata map[string]string
PageCount int
WordCount int
}
ParseResult contains the extracted content and metadata
type RAGContext ¶
type RAGContext struct {
Query string `json:"query"`
Results []SearchResult `json:"results"`
Context string `json:"context"` // Formatted context string for injection
}
RAGContext represents context to inject into LLM prompts
func (*RAGContext) FormatContext ¶
func (rc *RAGContext) FormatContext() string
FormatContext formats search results into a context string for LLM injection. It groups chunks by their source document to avoid confusion.
func (*RAGContext) TruncateContext ¶
func (rc *RAGContext) TruncateContext(maxLen int)
TruncateContext truncates the context to fit within maxLen characters while keeping complete chunks
func (*RAGContext) UniqueDocumentCount ¶ added in v0.2.9
func (rc *RAGContext) UniqueDocumentCount() int
UniqueDocumentCount returns the number of unique documents in the results
type SQLiteStore ¶ added in v0.2.6
type SQLiteStore struct {
// contains filtered or unexported fields
}
SQLiteStore implements a persistent vector store using SQLite
func NewSQLiteStore ¶ added in v0.2.6
func NewSQLiteStore(dataDir string) (*SQLiteStore, error)
NewSQLiteStore creates a new SQLite-based vector store
func (*SQLiteStore) AddChunk ¶ added in v0.2.6
func (s *SQLiteStore) AddChunk(chunk *Chunk, embedding []float32) error
AddChunk adds a chunk with its embedding to the store
func (*SQLiteStore) AddDocument ¶ added in v0.2.6
func (s *SQLiteStore) AddDocument(doc *Document) error
AddDocument adds a document to the store
func (*SQLiteStore) Close ¶ added in v0.2.6
func (s *SQLiteStore) Close() error
Close closes the database connection
func (*SQLiteStore) DeleteDocument ¶ added in v0.2.6
func (s *SQLiteStore) DeleteDocument(id string) error
DeleteDocument deletes a document and its chunks
func (*SQLiteStore) GetDocument ¶ added in v0.2.6
func (s *SQLiteStore) GetDocument(id string) (*Document, error)
GetDocument retrieves a document by ID
func (*SQLiteStore) HybridSearch ¶ added in v0.2.11
func (s *SQLiteStore) HybridSearch(queryEmbedding []float32, query string, limit int, minScore float32, alpha float32) ([]SearchResult, error)
HybridSearch performs a hybrid search combining semantic similarity with FTS5 keyword matching
func (*SQLiteStore) ListDocuments ¶ added in v0.2.6
func (s *SQLiteStore) ListDocuments() ([]*Document, error)
ListDocuments returns all documents
func (*SQLiteStore) Search ¶ added in v0.2.6
func (s *SQLiteStore) Search(queryEmbedding []float32, limit int, minScore float32) ([]SearchResult, error)
Search performs a semantic search using cosine similarity. It uses a min-heap for efficient top-k selection (O(n log k) vs O(n log n) for a full sort).
func (*SQLiteStore) Stats ¶ added in v0.2.6
func (s *SQLiteStore) Stats() map[string]interface{}
Stats returns statistics about the store
type SearchOptions ¶
type SearchOptions struct {
TopK int `json:"top_k"` // Number of results to return
MinScore float32 `json:"min_score"` // Minimum similarity score (0-1)
DocumentFilter []string `json:"document_filter"` // Only search these document IDs
IncludeContent bool `json:"include_content"` // Include chunk content in results
}
SearchOptions configures search behavior
func DefaultSearchOptions ¶
func DefaultSearchOptions() SearchOptions
DefaultSearchOptions returns sensible defaults
type SearchResult ¶
type SearchResult struct {
Chunk *Chunk `json:"chunk"`
Score float32 `json:"score"` // Cosine similarity score (0-1)
DocumentID string `json:"document_id"`
DocName string `json:"document_name"`
Metadata map[string]string `json:"metadata,omitempty"` // Source URL, author, etc.
}
SearchResult represents a search result with relevance score
type Store ¶ added in v0.2.6
type Store interface {
AddDocument(doc *Document) error
AddChunk(chunk *Chunk, embedding []float32) error
GetDocument(id string) (*Document, error)
ListDocuments() ([]*Document, error)
DeleteDocument(id string) error
Search(queryEmbedding []float32, limit int, minScore float32) ([]SearchResult, error)
HybridSearch(queryEmbedding []float32, query string, limit int, minScore float32, alpha float32) ([]SearchResult, error)
Stats() map[string]interface{}
Close() error
}
Store defines the interface for vector storage
type VectorStore ¶
type VectorStore struct {
// contains filtered or unexported fields
}
VectorStore is an in-memory vector database for semantic search
func NewVectorStore ¶
func NewVectorStore() *VectorStore
NewVectorStore creates a new in-memory vector store
func (*VectorStore) AddChunk ¶
func (vs *VectorStore) AddChunk(chunk *Chunk, embedding []float32) error
AddChunk adds a chunk with its embedding to the store
func (*VectorStore) AddDocument ¶
func (vs *VectorStore) AddDocument(doc *Document) error
AddDocument adds a document to the store
func (*VectorStore) Close ¶ added in v0.2.6
func (vs *VectorStore) Close() error
Close closes the store (no-op for in-memory)
func (*VectorStore) DeleteDocument ¶
func (vs *VectorStore) DeleteDocument(docID string) error
DeleteDocument removes a document and all its chunks
func (*VectorStore) GetAllEmbeddings ¶
func (vs *VectorStore) GetAllEmbeddings() map[string][]float32
GetAllEmbeddings returns all embeddings
func (*VectorStore) GetChunk ¶
func (vs *VectorStore) GetChunk(id string) (*Chunk, error)
GetChunk retrieves a chunk by ID
func (*VectorStore) GetDocument ¶
func (vs *VectorStore) GetDocument(id string) (*Document, error)
GetDocument retrieves a document by ID
func (*VectorStore) HybridSearch ¶
func (vs *VectorStore) HybridSearch(queryEmbedding []float32, query string, limit int, minScore float32, alpha float32) ([]SearchResult, error)
HybridSearch implements the Store interface for hybrid search
func (*VectorStore) ListChunks ¶
func (vs *VectorStore) ListChunks() []*Chunk
ListChunks returns all chunks
func (*VectorStore) ListDocuments ¶
func (vs *VectorStore) ListDocuments() ([]*Document, error)
ListDocuments returns all documents
func (*VectorStore) Search ¶
func (vs *VectorStore) Search(queryEmbedding []float32, limit int, minScore float32) ([]SearchResult, error)
Search finds the top-k most similar chunks to the query embedding
func (*VectorStore) Stats ¶
func (vs *VectorStore) Stats() map[string]interface{}
Stats returns statistics about the store