chunker

package
v1.1.10 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: May 11, 2026 License: MIT Imports: 12 Imported by: 0

Documentation

Index

Constants

View Source
const (
	// StrategyFixedSize uses fixed-size chunking
	StrategyFixedSize core.ChunkStrategy = "fixed_size"
	// StrategySentence splits by sentences
	StrategySentence core.ChunkStrategy = "sentence"
	// StrategyParagraph splits by paragraphs
	StrategyParagraph core.ChunkStrategy = "paragraph"
	// StrategyRecursive uses recursive intelligent splitting
	StrategyRecursive core.ChunkStrategy = "recursive"
	// StrategySemantic uses semantic similarity for splitting
	StrategySemantic core.ChunkStrategy = "semantic"
	// StrategyCode splits code by structure
	StrategyCode core.ChunkStrategy = "code"
	// StrategyParentDoc uses two-level chunking with parent-child relationships
	StrategyParentDoc core.ChunkStrategy = "parent_doc"
)

Chunk strategy constants

View Source
const (
	DefaultChunkSize           = 1500 // default chunk size in characters (~500-600 tokens for Chinese text)
	DefaultOverlap             = 225  // default overlap in characters (~15% of chunk size)
	MinChunkSize               = 50   // minimum chunk size
	MaxChunkSize               = 2000 // maximum chunk size
	DefaultMaxSentences        = 5    // default max sentences per chunk
	DefaultMaxParagraphs       = 15   // default max paragraphs per chunk (hard upper limit)
	DefaultSimilarityThreshold = 0.7  // default similarity threshold for semantic chunking
	DefaultParentSize          = 1500 // default parent chunk size
	DefaultChildSize           = 400  // default child chunk size
)

Default configuration constants (based on NVIDIA 2025 RAG research: 512-1024 tokens sweet spot, ~15% overlap optimal)

Variables

This section is empty.

Functions

func Clamp

func Clamp(value, min, max int) int

Clamp limits a value to be within [min, max]

func CosineSimilarity

func CosineSimilarity(a, b []float32) float32

CosineSimilarity computes the cosine similarity between two vectors

func CountLines

func CountLines(text string) int

CountLines counts the number of lines in text

func CreateChunker

func CreateChunker(strategy core.ChunkStrategy, opts ...Option) (core.Chunker, error)

CreateChunker creates a chunker using the global factory

func DefaultSeparators

func DefaultSeparators() []string

DefaultSeparators returns the default separator list (priority from high to low)

func ExtractImageChunks

func ExtractImageChunks(structured *core.StructuredDocument) []*core.Chunk

ExtractImageChunks extracts image sub-chunks from the structured document. Images are attached to the main document as sub-chunks, enabling document-level recall via ParentID.

func GenerateChunkID

func GenerateChunkID(docID string, index int, content string) string

GenerateChunkID generates a unique chunk ID. Format: chunk_{docID}_{index}_{hash8}

func GetSupportedStrategies

func GetSupportedStrategies() []core.ChunkStrategy

GetSupportedStrategies returns supported strategies from global factory

func NormalizeWhitespace

func NormalizeWhitespace(text string) string

NormalizeWhitespace normalizes whitespace characters. Multiple spaces/newlines are merged into single spaces while preserving line breaks.

func RegisterChunker

func RegisterChunker(strategy core.ChunkStrategy, creator ChunkerCreator)

RegisterChunker registers a chunker with the global factory

Types

type ChunkValidator

type ChunkValidator struct {
	// contains filtered or unexported fields
}

ChunkValidator validates chunk quality

func NewChunkValidator

func NewChunkValidator(opts ...Option) *ChunkValidator

NewChunkValidator creates a new ChunkValidator

func (*ChunkValidator) Validate

func (v *ChunkValidator) Validate(chunks []*core.Chunk) *ValidationReport

Validate validates chunk quality

type ChunkerCreator

type ChunkerCreator func(opts ...Option) core.Chunker

ChunkerCreator is a function that creates a chunker instance

type ChunkingFactory

type ChunkingFactory struct {
	// contains filtered or unexported fields
}

ChunkingFactory creates chunker instances based on strategy type

func NewChunkingFactory

func NewChunkingFactory() *ChunkingFactory

NewChunkingFactory creates a new ChunkingFactory with default chunkers registered

func (*ChunkingFactory) CreateChunker

func (f *ChunkingFactory) CreateChunker(strategy core.ChunkStrategy, opts ...Option) (core.Chunker, error)

CreateChunker creates a chunker based on strategy

func (*ChunkingFactory) GetSupportedStrategies

func (f *ChunkingFactory) GetSupportedStrategies() []core.ChunkStrategy

GetSupportedStrategies returns all registered strategies

func (*ChunkingFactory) IsStrategySupported

func (f *ChunkingFactory) IsStrategySupported(strategy core.ChunkStrategy) bool

IsStrategySupported checks if a strategy is registered

func (*ChunkingFactory) MustCreateChunker

func (f *ChunkingFactory) MustCreateChunker(strategy core.ChunkStrategy, opts ...Option) core.Chunker

MustCreateChunker creates a chunker, panics on error

func (*ChunkingFactory) RegisterChunker

func (f *ChunkingFactory) RegisterChunker(strategy core.ChunkStrategy, creator ChunkerCreator)

RegisterChunker registers a chunker creator for a strategy

func (*ChunkingFactory) UnregisterChunker

func (f *ChunkingFactory) UnregisterChunker(strategy core.ChunkStrategy)

UnregisterChunker removes a chunker registration

type CodeChunker

type CodeChunker struct {
	// contains filtered or unexported fields
}

CodeChunker is a code chunker. It splits code based on StructureNode (the AST structure generated by CodeStructurizer).

func NewCodeChunker

func NewCodeChunker(opts ...Option) *CodeChunker

NewCodeChunker creates a new CodeChunker

func (*CodeChunker) Chunk

func (c *CodeChunker) Chunk(
	structured *core.StructuredDocument,
) ([]*core.Chunk, error)

Chunk implements the Chunker interface

func (*CodeChunker) GetStrategy

func (c *CodeChunker) GetStrategy() core.ChunkStrategy

GetStrategy returns the chunk strategy type

type FixedSizeChunker

type FixedSizeChunker struct {
	// contains filtered or unexported fields
}

FixedSizeChunker splits text into fixed-size chunks. It is simple and fast, and splits by character count.

func NewFixedSizeChunker

func NewFixedSizeChunker(opts ...Option) *FixedSizeChunker

NewFixedSizeChunker creates a new FixedSizeChunker

func (*FixedSizeChunker) Chunk

func (c *FixedSizeChunker) Chunk(
	structured *core.StructuredDocument,
) ([]*core.Chunk, error)

Chunk implements the Chunker interface

func (*FixedSizeChunker) GetStrategy

func (c *FixedSizeChunker) GetStrategy() core.ChunkStrategy

GetStrategy returns the chunk strategy type

type Option

type Option func(*Options)

Option is a functional option for configuring chunkers

func WithChildSize

func WithChildSize(size int) Option

WithChildSize sets the child chunk size

func WithChunkSize

func WithChunkSize(size int) Option

WithChunkSize sets the chunk size

func WithMaxChunkSize

func WithMaxChunkSize(maxSize int) Option

WithMaxChunkSize sets the maximum chunk size

func WithMaxParagraphs

func WithMaxParagraphs(maxParagraphs int) Option

WithMaxParagraphs sets the maximum number of paragraphs per chunk

func WithMaxSentences

func WithMaxSentences(maxSentences int) Option

WithMaxSentences sets the maximum number of sentences per chunk

func WithMinChunkSize

func WithMinChunkSize(minSize int) Option

WithMinChunkSize sets the minimum chunk size

func WithOverlap

func WithOverlap(overlap int) Option

WithOverlap sets the overlap size

func WithParentSize

func WithParentSize(size int) Option

WithParentSize sets the parent chunk size

func WithSeparators

func WithSeparators(separators []string) Option

WithSeparators sets the separator list

func WithSimilarityThreshold

func WithSimilarityThreshold(threshold float32) Option

WithSimilarityThreshold sets the similarity threshold for semantic chunking

type Options

type Options struct {
	ChunkSize           int      // chunk size in characters
	Overlap             int      // overlap size in characters
	MinChunkSize        int      // minimum chunk size
	MaxChunkSize        int      // maximum chunk size
	MaxSentences        int      // max sentences per chunk for sentence chunker
	MaxParagraphs       int      // max paragraphs per chunk for paragraph chunker
	SimilarityThreshold float32  // similarity threshold for semantic chunker
	ParentSize          int      // parent chunk size for ParentDoc
	ChildSize           int      // child chunk size for ParentDoc
	Separators          []string // separator list for recursive chunker
}

Options contains common configuration for all chunkers

func DefaultOptions

func DefaultOptions() Options

DefaultOptions returns the default configuration

type ParagraphChunker

type ParagraphChunker struct {
	// contains filtered or unexported fields
}

ParagraphChunker splits text by paragraph boundaries. It merges consecutive paragraphs until reaching chunkSize (the target), using maxParagraphs as a hard cap. Overlap is applied between consecutive chunks by re-including trailing paragraphs from the previous chunk.

func NewParagraphChunker

func NewParagraphChunker(opts ...Option) *ParagraphChunker

NewParagraphChunker creates a new ParagraphChunker

func (*ParagraphChunker) Chunk

func (c *ParagraphChunker) Chunk(
	structured *core.StructuredDocument,
) ([]*core.Chunk, error)

Chunk implements the Chunker interface

func (*ParagraphChunker) GetStrategy

func (c *ParagraphChunker) GetStrategy() core.ChunkStrategy

GetStrategy returns the chunk strategy type

type ParentDocChunker

type ParentDocChunker struct {
	// contains filtered or unexported fields
}

ParentDocChunker implements two-level chunking. Finds parent chunks (large) that contain child chunks (small). Provides both precision and contextual richness.

func NewParentDocChunker

func NewParentDocChunker(opts ...Option) *ParentDocChunker

NewParentDocChunker creates a new ParentDocChunker

func (*ParentDocChunker) Chunk

func (c *ParentDocChunker) Chunk(
	structured *core.StructuredDocument,
) ([]*core.Chunk, error)

Chunk implements the Chunker interface. Returns all chunks (parent + child), with child chunks linked to parents via ParentID.

func (*ParentDocChunker) GetStrategy

func (c *ParentDocChunker) GetStrategy() core.ChunkStrategy

GetStrategy returns the chunk strategy type

type RecursiveChunker

type RecursiveChunker struct {
	// contains filtered or unexported fields
}

RecursiveChunker intelligently splits text by trying different separator levels in priority order and selecting optimal split points

func NewRecursiveChunker

func NewRecursiveChunker(opts ...Option) *RecursiveChunker

NewRecursiveChunker creates a new RecursiveChunker

func (*RecursiveChunker) Chunk

func (c *RecursiveChunker) Chunk(
	structured *core.StructuredDocument,
) ([]*core.Chunk, error)

Chunk implements the Chunker interface

func (*RecursiveChunker) GetStrategy

func (c *RecursiveChunker) GetStrategy() core.ChunkStrategy

GetStrategy returns the chunk strategy type

type SemanticChunker

type SemanticChunker struct {
	// contains filtered or unexported fields
}

SemanticChunker splits text based on semantic similarity. Detects topic changes where similarity drops below threshold.

func NewSemanticChunker

func NewSemanticChunker(embedder core.Embedder, opts ...Option) *SemanticChunker

NewSemanticChunker creates a new SemanticChunker

func (*SemanticChunker) Chunk

func (c *SemanticChunker) Chunk(
	structured *core.StructuredDocument,
) ([]*core.Chunk, error)

Chunk implements the Chunker interface

func (*SemanticChunker) GetStrategy

func (c *SemanticChunker) GetStrategy() core.ChunkStrategy

GetStrategy returns the chunk strategy type

type SentenceChunker

type SentenceChunker struct {
	// contains filtered or unexported fields
}

SentenceChunker splits text by sentence boundaries. Ensures each chunk contains complete sentences.

func NewSentenceChunker

func NewSentenceChunker(opts ...Option) *SentenceChunker

NewSentenceChunker creates a new SentenceChunker

func (*SentenceChunker) Chunk

func (c *SentenceChunker) Chunk(
	structured *core.StructuredDocument,
) ([]*core.Chunk, error)

Chunk implements the Chunker interface

func (*SentenceChunker) GetStrategy

func (c *SentenceChunker) GetStrategy() core.ChunkStrategy

GetStrategy returns the chunk strategy type

type SizeStats

type SizeStats struct {
	Mean   float64 // mean size
	Median float64 // median size
	StdDev float64 // standard deviation
	Min    int     // minimum size
	Max    int     // maximum size
}

SizeStats contains chunk size statistics

type ValidationError

type ValidationError struct {
	ChunkIndex int    // chunk index
	ErrorType  string // error type
	Message    string // error message
}

ValidationError represents a validation error

type ValidationReport

type ValidationReport struct {
	TotalChunks    int                 // total number of chunks
	ValidChunks    int                 // number of valid chunks
	InvalidChunks  int                 // number of invalid chunks
	Errors         []ValidationError   // list of errors
	Warnings       []ValidationWarning // list of warnings
	CohesionScore  float64             // intra-chunk cohesion (0-1)
	DiversityScore float64             // inter-chunk diversity (0-1)
	SizeStats      SizeStats           // chunk size statistics
}

ValidationReport contains the results of chunk validation

func (*ValidationReport) IsValid

func (r *ValidationReport) IsValid() bool

IsValid checks if the validation report has passed

type ValidationWarning

type ValidationWarning struct {
	ChunkIndex  int    // chunk index
	WarningType string // warning type
	Message     string // warning message
}

ValidationWarning represents a validation warning

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL