rag

package

v0.0.1 Latest Latest Go to latest Published: Mar 13, 2026 License: MIT Imports: 16 Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/hungpdn/llmgo

Links

Documentation ¶

Index ¶

Constants
type Chunk
type DirectoryLoader
- func NewDirectoryLoader(dirPath string, opts ...DirectoryLoaderOption) *DirectoryLoader
- func (d *DirectoryLoader) Load(ctx context.Context, docChan chan<- Document) error
type DirectoryLoaderOption
- func WithAllowedExtensions(exts ...string) DirectoryLoaderOption
- func WithMaxConcurrency(n int) DirectoryLoaderOption
type Document
type DocumentLoader
type Embedder
type IngestionPipeline
- func NewIngestionPipeline(loader DocumentLoader, splitter TextSplitter, embedder Embedder, ...) (*IngestionPipeline, error)
- func (p *IngestionPipeline) Run(ctx context.Context) error
type NanovecStore
- func NewNanovecStore(path string, dimension int) (*NanovecStore, error)
- func (n *NanovecStore) Close() error
- func (n *NanovecStore) SimilaritySearch(ctx context.Context, queryVector []float32, topK int) ([]Chunk, error)
- func (n *NanovecStore) Upsert(ctx context.Context, chunks []Chunk) error
type PipelineOption
- func WithBatchSize(size int) PipelineOption
- func WithNumWorkers(workers int) PipelineOption
type RecursiveCharacterTextSplitter
- func NewRecursiveCharacterTextSplitter(chunkSize, chunkOverlap int) *RecursiveCharacterTextSplitter
- func (s *RecursiveCharacterTextSplitter) Split(ctx context.Context, inChan <-chan Document, outChan chan<- Chunk) error
type Retriever
type StandardRetriever
- func NewStandardRetriever(embedder Embedder, store VectorStore) *StandardRetriever
- func (r *StandardRetriever) Search(ctx context.Context, query string, topK int) ([]Chunk, error)
type TextLoader
- func NewTextLoader(filePath string, opts ...TextLoaderOption) *TextLoader
- func (l *TextLoader) Load(ctx context.Context, docChan chan<- Document) error
type TextLoaderOption
- func WithFileInfo(info fs.FileInfo) TextLoaderOption
- func WithMaxDocSize(bytes core.ByteSize) TextLoaderOption
type TextSplitter
type VectorStore

Constants ¶

View Source

const (
	// DefaultMaxDocSize caps the RAM used per file at 5 MB (5 * core.MB).
	// A file larger than this is split into multiple Documents.
	// NOTE(review): presumably the default that WithMaxDocSize overrides — confirm.
	DefaultMaxDocSize = 5 * core.MB
)

Variables ¶

This section is empty.

Functions ¶

This section is empty.

Types ¶

type Chunk ¶

// Chunk is the unit stored in the vector database and returned to the LLM.
type Chunk struct {
	// Identification and text of the chunk.
	// NOTE(review): ParentID presumably holds the ID of the Document this chunk
	// was split from, and ChunkIndex its position within that document —
	// confirm against the splitter implementation.
	ID         string `json:"id"`
	ParentID   string `json:"parent_id"`
	ChunkIndex int    `json:"chunk_index"`
	Content    string `json:"content"`

	// Embedding is the vector representation of the chunk's content, generated
	// by the Embedder. It is omitted from JSON when empty.
	Embedding []float32 `json:"embedding,omitempty"`

	// Score is the chunk's similarity score. It is omitted from JSON when zero.
	// NOTE(review): presumably populated only on search results — confirm.
	Score float32 `json:"score,omitempty"`

	// Metadata is inherited from the original document plus Chunk's own metadata (such as chunk_index).
	Metadata map[string]any `json:"metadata"`
}

Chunk represents a semantic shard. This is the actual object stored in the Vector DB and returned to the LLM.

type DirectoryLoader ¶

// DirectoryLoader loads the files found under a directory. It is created with
// NewDirectoryLoader and configured through DirectoryLoaderOption values; all
// of its state is unexported.
type DirectoryLoader struct {
	// contains filtered or unexported fields
}

DirectoryLoader recursively scans a directory and uses TextLoader to load all matching files.

func NewDirectoryLoader ¶

func NewDirectoryLoader(dirPath string, opts ...DirectoryLoaderOption) *DirectoryLoader

NewDirectoryLoader creates a loader that scans an entire folder.

func (*DirectoryLoader) Load ¶

func (d *DirectoryLoader) Load(ctx context.Context, docChan chan<- Document) error

Load recursively traverses the directory and loads file contents.

type DirectoryLoaderOption ¶

// DirectoryLoaderOption is a function that adjusts a DirectoryLoader. Options
// are passed to NewDirectoryLoader; WithAllowedExtensions and
// WithMaxConcurrency return values of this type.
type DirectoryLoaderOption func(*DirectoryLoader)

DirectoryLoaderOption follows the Functional Option pattern for flexible configuration.

func WithAllowedExtensions ¶

func WithAllowedExtensions(exts ...string) DirectoryLoaderOption

WithAllowedExtensions specifies which file extensions to load.

func WithMaxConcurrency ¶

func WithMaxConcurrency(n int) DirectoryLoaderOption

WithMaxConcurrency allows tuning the maximum number of files opened simultaneously. Default is 50. Increase for NVMe SSDs, decrease for HDDs or strict OS ulimits.

type Document ¶

// Document is a unit of raw source content produced by a DocumentLoader and
// consumed by a TextSplitter.
type Document struct {
	// ID identifies the document; Content holds its text.
	ID       string         `json:"id"`
	Content  string         `json:"content"`
	Metadata map[string]any `json:"metadata"` // free-form attributes, e.g. FilePath, Author, Title
}

Document represents a raw source document (e.g., a PDF or a web page).

type DocumentLoader ¶

// DocumentLoader is implemented by sources that can stream Documents.
// DirectoryLoader and TextLoader both expose a Load method with this signature.
type DocumentLoader interface {
	// Load streams Documents into the provided channel.
	// The implementation MUST NOT close the channel (the pipeline manages it).
	Load(ctx context.Context, docChan chan<- Document) error
}

DocumentLoader defines the contract for streaming data from various sources.

type Embedder ¶

// Embedder turns text into float32 vectors.
type Embedder interface {
	// Embed returns the vector for a single text.
	Embed(ctx context.Context, text string) ([]float32, error)
	// EmbedBatch returns vectors for several texts in one call.
	// NOTE(review): presumably one vector per input, in input order — confirm
	// against the implementations.
	EmbedBatch(ctx context.Context, texts []string) ([][]float32, error)
}

Embedder converts text into high-dimensional vector representations.

type IngestionPipeline ¶

// IngestionPipeline holds the stages and tuning knobs of an ingestion run.
// It is created with NewIngestionPipeline and executed with Run.
type IngestionPipeline struct {
	// Pipeline stages. Judging by their interfaces, Documents come from
	// Loader, are split into Chunks by Splitter, embedded by Embedder and
	// persisted in Store. BatchSize and NumWorkers are adjustable via
	// WithBatchSize and WithNumWorkers.
	Loader     DocumentLoader
	Splitter   TextSplitter
	Embedder   Embedder
	Store      VectorStore
	BatchSize  int // Number of chunks to embed in one API call
	NumWorkers int // Concurrent workers for embedding/upserting
}

IngestionPipeline orchestrates the flow of data from raw files to VectorDB.

func NewIngestionPipeline ¶

func NewIngestionPipeline(
	loader DocumentLoader,
	splitter TextSplitter,
	embedder Embedder,
	store VectorStore,
	opts ...PipelineOption,
) (*IngestionPipeline, error)

NewIngestionPipeline creates a safely initialized pipeline with default fallbacks.

func (*IngestionPipeline) Run ¶

func (p *IngestionPipeline) Run(ctx context.Context) error

Run executes the high-performance streaming pipeline.

type NanovecStore ¶

// NanovecStore is a vector store backed by the nanovec library. It is created
// with NewNanovecStore and exposes Upsert, SimilaritySearch and Close; all of
// its state is unexported.
type NanovecStore struct {
	// contains filtered or unexported fields
}

NanovecStore implements the VectorStore interface using the embedded nanovec library.

func NewNanovecStore ¶

func NewNanovecStore(path string, dimension int) (*NanovecStore, error)

NewNanovecStore creates a highly optimized, in-memory vector database using nanovec.

func (*NanovecStore) Close ¶

func (n *NanovecStore) Close() error

Close gracefully shuts down the nanovec database.

func (*NanovecStore) SimilaritySearch ¶

func (n *NanovecStore) SimilaritySearch(ctx context.Context, queryVector []float32, topK int) ([]Chunk, error)

SimilaritySearch performs a blazing fast nearest-neighbor search using nanovec's HNSW index.

func (*NanovecStore) Upsert ¶

func (n *NanovecStore) Upsert(ctx context.Context, chunks []Chunk) error

Upsert saves chunks and their corresponding embeddings into the nanovec index.

type PipelineOption ¶

// PipelineOption is a function that adjusts an IngestionPipeline. Options are
// passed to NewIngestionPipeline; WithBatchSize and WithNumWorkers return
// values of this type.
type PipelineOption func(*IngestionPipeline)

PipelineOption follows the Functional Option pattern for flexible configuration.

func WithBatchSize ¶

func WithBatchSize(size int) PipelineOption

WithBatchSize configures how many chunks are sent to the Embedder at once.

func WithNumWorkers ¶

func WithNumWorkers(workers int) PipelineOption

WithNumWorkers configures the number of concurrent embedding workers.

type RecursiveCharacterTextSplitter ¶

// RecursiveCharacterTextSplitter holds the settings of a separator-based text
// splitter. NewRecursiveCharacterTextSplitter takes chunkSize and chunkOverlap
// as arguments.
type RecursiveCharacterTextSplitter struct {
	// ChunkSize and ChunkOverlap control chunk length and the overlap between
	// neighbouring chunks. Separators lists the strings to split on.
	// NOTE(review): the unit (bytes vs. runes) and the priority order of
	// Separators are not visible here — confirm in the implementation.
	ChunkSize    int
	ChunkOverlap int
	Separators   []string
}

RecursiveCharacterTextSplitter implements the TextSplitter interface. It splits text recursively based on a hierarchy of separators to keep semantically related pieces together.

func NewRecursiveCharacterTextSplitter ¶

func NewRecursiveCharacterTextSplitter(chunkSize, chunkOverlap int) *RecursiveCharacterTextSplitter

NewRecursiveCharacterTextSplitter creates a new splitter with default LangChain-like separators.

func (*RecursiveCharacterTextSplitter) Split ¶

func (s *RecursiveCharacterTextSplitter) Split(ctx context.Context, inChan <-chan Document, outChan chan<- Chunk) error

Split reads Documents from inChan, splits each one into smaller Chunks, and streams those Chunks to outChan.

type Retriever ¶

// Retriever is implemented by search back ends that answer a text query with
// Chunks.
type Retriever interface {
	// Search looks up the query and returns up to topK relevant Chunks.
	Search(ctx context.Context, query string, topK int) ([]Chunk, error)
}

Retriever defines the contract for any knowledge base search engine.

type StandardRetriever ¶

// StandardRetriever is a Retriever built from an Embedder and a VectorStore
// (see NewStandardRetriever); all of its state is unexported.
type StandardRetriever struct {
	// contains filtered or unexported fields
}

StandardRetriever implements the Retriever interface for the Agent's Tool.

func NewStandardRetriever ¶

func NewStandardRetriever(embedder Embedder, store VectorStore) *StandardRetriever

NewStandardRetriever creates a new Retriever that uses the provided Embedder and VectorStore.

func (*StandardRetriever) Search ¶

func (r *StandardRetriever) Search(ctx context.Context, query string, topK int) ([]Chunk, error)

Search converts the query into a vector and performs a similarity search in the VectorStore.

type TextLoader ¶

// TextLoader loads a single file. It is created with NewTextLoader and
// configured through TextLoaderOption values; all of its state is unexported.
type TextLoader struct {
	// contains filtered or unexported fields
}

TextLoader reads a single local text file (.txt, .md, .csv, .go, .py, ...) into one or more Documents; a file larger than the maximum document size is split into multiple Documents.

func NewTextLoader ¶

func NewTextLoader(filePath string, opts ...TextLoaderOption) *TextLoader

NewTextLoader creates a new loader for a specific file path.

func (*TextLoader) Load ¶

func (l *TextLoader) Load(ctx context.Context, docChan chan<- Document) error

Load reads the file content and attaches file-specific metadata. How it works: 1. Open the file with `os.Open` (this only creates a file descriptor; the contents are not loaded into RAM). 2. Use a `bufio.Reader` to read the file line by line or in small chunks. 3. Accumulate the data in a `strings.Builder`; when the buffer reaches a safe threshold (e.g., 5MB), the accumulated part is pushed to `docChan` as a Document. 4. Repeat until the end of the file (`io.EOF`) is reached.

type TextLoaderOption ¶

// TextLoaderOption is a function that adjusts a TextLoader. Options are passed
// to NewTextLoader; WithFileInfo and WithMaxDocSize return values of this type.
type TextLoaderOption func(*TextLoader)

TextLoaderOption follows the Functional Option pattern.

func WithFileInfo ¶

func WithFileInfo(info fs.FileInfo) TextLoaderOption

WithFileInfo injects pre-fetched file stats to avoid redundant os.Stat syscalls.

func WithMaxDocSize ¶

func WithMaxDocSize(bytes core.ByteSize) TextLoaderOption

WithMaxDocSize allows custom configuration for memory limits per file.

type TextSplitter ¶

// TextSplitter is implemented by components that turn a stream of Documents
// into a stream of Chunks.
type TextSplitter interface {
	// Split reads from inChan, splits the documents, and streams chunks to outChan.
	// NOTE(review): whether Split may close outChan is not stated here — confirm
	// with the pipeline implementation.
	Split(ctx context.Context, inChan <-chan Document, outChan chan<- Chunk) error
}

TextSplitter breaks down large documents into semantic chunks on the fly.

type VectorStore ¶

// VectorStore is implemented by back ends that persist Chunks with their
// embeddings and answer similarity queries.
type VectorStore interface {
	// Upsert saves chunks and their embeddings to the database.
	Upsert(ctx context.Context, chunks []Chunk) error

	// SimilaritySearch returns up to limit Chunks most similar to queryVector.
	SimilaritySearch(ctx context.Context, queryVector []float32, limit int) ([]Chunk, error)

	// Close gracefully shuts down the database connection and releases resources.
	Close() error
}

VectorStore handles the storage and similarity search of vectors.

Source Files ¶

View all Source files

Directories ¶

Path	Synopsis
ragtest

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL