Documentation
¶
Index ¶
- Constants
- func Clamp(value, min, max int) int
- func CosineSimilarity(a, b []float32) float32
- func CountLines(text string) int
- func CreateChunker(strategy core.ChunkStrategy, opts ...Option) (core.Chunker, error)
- func DefaultSeparators() []string
- func ExtractImageChunks(structured *core.StructuredDocument) []*core.Chunk
- func GenerateChunkID(docID string, index int, content string) string
- func GetSupportedStrategies() []core.ChunkStrategy
- func NormalizeWhitespace(text string) string
- func RegisterChunker(strategy core.ChunkStrategy, creator ChunkerCreator)
- type ChunkValidator
- type ChunkerCreator
- type ChunkingFactory
- func (f *ChunkingFactory) CreateChunker(strategy core.ChunkStrategy, opts ...Option) (core.Chunker, error)
- func (f *ChunkingFactory) GetSupportedStrategies() []core.ChunkStrategy
- func (f *ChunkingFactory) IsStrategySupported(strategy core.ChunkStrategy) bool
- func (f *ChunkingFactory) MustCreateChunker(strategy core.ChunkStrategy, opts ...Option) core.Chunker
- func (f *ChunkingFactory) RegisterChunker(strategy core.ChunkStrategy, creator ChunkerCreator)
- func (f *ChunkingFactory) UnregisterChunker(strategy core.ChunkStrategy)
- type CodeChunker
- type FixedSizeChunker
- type Option
- func WithChildSize(size int) Option
- func WithChunkSize(size int) Option
- func WithMaxChunkSize(maxSize int) Option
- func WithMaxParagraphs(maxParagraphs int) Option
- func WithMaxSentences(maxSentences int) Option
- func WithMinChunkSize(minSize int) Option
- func WithOverlap(overlap int) Option
- func WithParentSize(size int) Option
- func WithSeparators(separators []string) Option
- func WithSimilarityThreshold(threshold float32) Option
- type Options
- type ParagraphChunker
- type ParentDocChunker
- type RecursiveChunker
- type SemanticChunker
- type SentenceChunker
- type SizeStats
- type ValidationError
- type ValidationReport
- type ValidationWarning
Constants ¶
const (
	// StrategyFixedSize uses fixed-size chunking
	StrategyFixedSize core.ChunkStrategy = "fixed_size"
	// StrategySentence splits by sentences
	StrategySentence core.ChunkStrategy = "sentence"
	// StrategyParagraph splits by paragraphs
	StrategyParagraph core.ChunkStrategy = "paragraph"
	// StrategyRecursive uses recursive intelligent splitting
	StrategyRecursive core.ChunkStrategy = "recursive"
	// StrategySemantic uses semantic similarity for splitting
	StrategySemantic core.ChunkStrategy = "semantic"
	// StrategyCode splits code by structure
	StrategyCode core.ChunkStrategy = "code"
	// StrategyParentDoc uses two-level chunking with parent-child relationships
	StrategyParentDoc core.ChunkStrategy = "parent_doc"
)
Chunk strategy constants
const (
	DefaultChunkSize           = 1500 // default chunk size in characters (~500-600 tokens for Chinese text)
	DefaultOverlap             = 225  // default overlap in characters (~15% of chunk size)
	MinChunkSize               = 50   // minimum chunk size
	MaxChunkSize               = 2000 // maximum chunk size
	DefaultMaxSentences        = 5    // default max sentences per chunk
	DefaultMaxParagraphs       = 15   // default max paragraphs per chunk (hard upper limit)
	DefaultSimilarityThreshold = 0.7  // default similarity threshold for semantic chunking
	DefaultParentSize          = 1500 // default parent chunk size
	DefaultChildSize           = 400  // default child chunk size
)
Default configuration constants (based on NVIDIA 2025 RAG research: 512-1024 tokens sweet spot, ~15% overlap optimal)
Variables ¶
This section is empty.
Functions ¶
func CosineSimilarity ¶
CosineSimilarity computes the cosine similarity between two vectors
func CreateChunker ¶
CreateChunker creates a chunker using the global factory
func DefaultSeparators ¶
func DefaultSeparators() []string
DefaultSeparators returns the default separator list (priority from high to low)
func ExtractImageChunks ¶
func ExtractImageChunks(structured *core.StructuredDocument) []*core.Chunk
ExtractImageChunks extracts image sub-chunks from the structured document. Images are attached to the main document as sub-chunks, enabling document-level recall via ParentID.
func GenerateChunkID ¶
GenerateChunkID generates a unique chunk ID. Format: chunk_{docID}_{index}_{hash8}
func GetSupportedStrategies ¶
func GetSupportedStrategies() []core.ChunkStrategy
GetSupportedStrategies returns supported strategies from global factory
func NormalizeWhitespace ¶
NormalizeWhitespace normalizes whitespace characters. Multiple spaces/newlines are merged into single spaces while preserving line breaks.
func RegisterChunker ¶
func RegisterChunker(strategy core.ChunkStrategy, creator ChunkerCreator)
RegisterChunker registers a chunker with the global factory
Types ¶
type ChunkValidator ¶
type ChunkValidator struct {
// contains filtered or unexported fields
}
ChunkValidator validates chunk quality
func NewChunkValidator ¶
func NewChunkValidator(opts ...Option) *ChunkValidator
NewChunkValidator creates a validator.
func (*ChunkValidator) Validate ¶
func (v *ChunkValidator) Validate(chunks []*core.Chunk) *ValidationReport
Validate validates chunk quality.
type ChunkerCreator ¶
ChunkerCreator is a function that creates a chunker instance
type ChunkingFactory ¶
type ChunkingFactory struct {
// contains filtered or unexported fields
}
ChunkingFactory creates chunker instances based on strategy type
func NewChunkingFactory ¶
func NewChunkingFactory() *ChunkingFactory
NewChunkingFactory creates a new ChunkingFactory with default chunkers registered
func (*ChunkingFactory) CreateChunker ¶
func (f *ChunkingFactory) CreateChunker(strategy core.ChunkStrategy, opts ...Option) (core.Chunker, error)
CreateChunker creates a chunker based on strategy
func (*ChunkingFactory) GetSupportedStrategies ¶
func (f *ChunkingFactory) GetSupportedStrategies() []core.ChunkStrategy
GetSupportedStrategies returns all registered strategies
func (*ChunkingFactory) IsStrategySupported ¶
func (f *ChunkingFactory) IsStrategySupported(strategy core.ChunkStrategy) bool
IsStrategySupported checks if a strategy is registered
func (*ChunkingFactory) MustCreateChunker ¶
func (f *ChunkingFactory) MustCreateChunker(strategy core.ChunkStrategy, opts ...Option) core.Chunker
MustCreateChunker creates a chunker, panics on error
func (*ChunkingFactory) RegisterChunker ¶
func (f *ChunkingFactory) RegisterChunker(strategy core.ChunkStrategy, creator ChunkerCreator)
RegisterChunker registers a chunker creator for a strategy
func (*ChunkingFactory) UnregisterChunker ¶
func (f *ChunkingFactory) UnregisterChunker(strategy core.ChunkStrategy)
UnregisterChunker removes a chunker registration
type CodeChunker ¶
type CodeChunker struct {
// contains filtered or unexported fields
}
CodeChunker is a chunker for source code. It chunks based on StructureNode (the AST structure generated by CodeStructurizer).
func (*CodeChunker) Chunk ¶
func (c *CodeChunker) Chunk( structured *core.StructuredDocument, ) ([]*core.Chunk, error)
Chunk implements the chunking interface.
func (*CodeChunker) GetStrategy ¶
func (c *CodeChunker) GetStrategy() core.ChunkStrategy
GetStrategy returns the strategy type.
type FixedSizeChunker ¶
type FixedSizeChunker struct {
// contains filtered or unexported fields
}
FixedSizeChunker splits text into fixed-size chunks. Simple and fast; splits by character count.
func NewFixedSizeChunker ¶
func NewFixedSizeChunker(opts ...Option) *FixedSizeChunker
NewFixedSizeChunker creates a new FixedSizeChunker
func (*FixedSizeChunker) Chunk ¶
func (c *FixedSizeChunker) Chunk( structured *core.StructuredDocument, ) ([]*core.Chunk, error)
Chunk implements the Chunker interface
func (*FixedSizeChunker) GetStrategy ¶
func (c *FixedSizeChunker) GetStrategy() core.ChunkStrategy
GetStrategy returns the chunk strategy type
type Option ¶
type Option func(*Options)
Option is a functional option for configuring chunkers
func WithMaxChunkSize ¶
WithMaxChunkSize sets the maximum chunk size
func WithMaxParagraphs ¶
WithMaxParagraphs sets the maximum number of paragraphs per chunk
func WithMaxSentences ¶
WithMaxSentences sets the maximum number of sentences per chunk
func WithMinChunkSize ¶
WithMinChunkSize sets the minimum chunk size
func WithParentSize ¶
WithParentSize sets the parent chunk size
func WithSeparators ¶
WithSeparators sets the separator list
func WithSimilarityThreshold ¶
WithSimilarityThreshold sets the similarity threshold for semantic chunking
type Options ¶
type Options struct {
ChunkSize int // chunk size in characters
Overlap int // overlap size in characters
MinChunkSize int // minimum chunk size
MaxChunkSize int // maximum chunk size
MaxSentences int // max sentences per chunk for sentence chunker
MaxParagraphs int // max paragraphs per chunk for paragraph chunker
SimilarityThreshold float32 // similarity threshold for semantic chunker
ParentSize int // parent chunk size for ParentDoc
ChildSize int // child chunk size for ParentDoc
Separators []string // separator list for recursive chunker
}
Options contains common configuration for all chunkers
func DefaultOptions ¶
func DefaultOptions() Options
DefaultOptions returns the default configuration
type ParagraphChunker ¶
type ParagraphChunker struct {
// contains filtered or unexported fields
}
ParagraphChunker splits text by paragraph boundaries. It merges consecutive paragraphs until reaching chunkSize (target), using maxParagraphs as a hard cap. Overlap is applied between consecutive chunks by re-including trailing paragraphs from the previous chunk.
func NewParagraphChunker ¶
func NewParagraphChunker(opts ...Option) *ParagraphChunker
NewParagraphChunker creates a new ParagraphChunker
func (*ParagraphChunker) Chunk ¶
func (c *ParagraphChunker) Chunk( structured *core.StructuredDocument, ) ([]*core.Chunk, error)
Chunk implements the Chunker interface
func (*ParagraphChunker) GetStrategy ¶
func (c *ParagraphChunker) GetStrategy() core.ChunkStrategy
GetStrategy returns the chunk strategy type
type ParentDocChunker ¶
type ParentDocChunker struct {
// contains filtered or unexported fields
}
ParentDocChunker implements two-level chunking. Finds parent chunks (large) that contain child chunks (small). Provides both precision and contextual richness.
func NewParentDocChunker ¶
func NewParentDocChunker(opts ...Option) *ParentDocChunker
NewParentDocChunker creates a new ParentDocChunker
func (*ParentDocChunker) Chunk ¶
func (c *ParentDocChunker) Chunk( structured *core.StructuredDocument, ) ([]*core.Chunk, error)
Chunk implements the Chunker interface. Returns all chunks (parent + child), with child chunks linked to parents via ParentID.
func (*ParentDocChunker) GetStrategy ¶
func (c *ParentDocChunker) GetStrategy() core.ChunkStrategy
GetStrategy returns the chunk strategy type
type RecursiveChunker ¶
type RecursiveChunker struct {
// contains filtered or unexported fields
}
RecursiveChunker intelligently splits text by trying different separator levels in priority order and selecting optimal split points
func NewRecursiveChunker ¶
func NewRecursiveChunker(opts ...Option) *RecursiveChunker
NewRecursiveChunker creates a new RecursiveChunker
func (*RecursiveChunker) Chunk ¶
func (c *RecursiveChunker) Chunk( structured *core.StructuredDocument, ) ([]*core.Chunk, error)
Chunk implements the Chunker interface
func (*RecursiveChunker) GetStrategy ¶
func (c *RecursiveChunker) GetStrategy() core.ChunkStrategy
GetStrategy returns the chunk strategy type
type SemanticChunker ¶
type SemanticChunker struct {
// contains filtered or unexported fields
}
SemanticChunker splits text based on semantic similarity. Detects topic changes where similarity drops below the threshold.
func NewSemanticChunker ¶
func NewSemanticChunker(embedder core.Embedder, opts ...Option) *SemanticChunker
NewSemanticChunker creates a new SemanticChunker
func (*SemanticChunker) Chunk ¶
func (c *SemanticChunker) Chunk( structured *core.StructuredDocument, ) ([]*core.Chunk, error)
Chunk implements the Chunker interface
func (*SemanticChunker) GetStrategy ¶
func (c *SemanticChunker) GetStrategy() core.ChunkStrategy
GetStrategy returns the chunk strategy type
type SentenceChunker ¶
type SentenceChunker struct {
// contains filtered or unexported fields
}
SentenceChunker splits text by sentence boundaries. Ensures each chunk contains complete sentences.
func NewSentenceChunker ¶
func NewSentenceChunker(opts ...Option) *SentenceChunker
NewSentenceChunker creates a new SentenceChunker
func (*SentenceChunker) Chunk ¶
func (c *SentenceChunker) Chunk( structured *core.StructuredDocument, ) ([]*core.Chunk, error)
Chunk implements the Chunker interface
func (*SentenceChunker) GetStrategy ¶
func (c *SentenceChunker) GetStrategy() core.ChunkStrategy
GetStrategy returns the chunk strategy type
type SizeStats ¶
type SizeStats struct {
Mean float64 // mean size
Median float64 // median size
StdDev float64 // standard deviation
Min int // minimum size
Max int // maximum size
}
SizeStats contains chunk size statistics
type ValidationError ¶
type ValidationError struct {
ChunkIndex int // chunk index
ErrorType string // error type
Message string // error message
}
ValidationError represents a validation error
type ValidationReport ¶
type ValidationReport struct {
TotalChunks int // total number of chunks
ValidChunks int // number of valid chunks
InvalidChunks int // number of invalid chunks
Errors []ValidationError // list of errors
Warnings []ValidationWarning // list of warnings
CohesionScore float64 // intra-chunk cohesion (0-1)
DiversityScore float64 // inter-chunk diversity (0-1)
SizeStats SizeStats // chunk size statistics
}
ValidationReport contains the results of chunk validation
func (*ValidationReport) IsValid ¶
func (r *ValidationReport) IsValid() bool
IsValid checks if the validation report has passed
type ValidationWarning ¶
type ValidationWarning struct {
ChunkIndex int // chunk index
WarningType string // warning type
Message string // warning message
}
ValidationWarning represents a validation warning