Documentation
¶
Overview ¶
Package parser - Multi-format parser implementations for TXT, CSV, and JSON.
Package parser - PDF parsing implementation using unipdf.
Package formats - Text chunking implementation
Index ¶
- func DetectFileFormat(filePath string) string
- func RemoveStopWords(words []string) []string
- func SplitIntoWords(text string) []string
- type FormatParser
- func (fp *FormatParser) DetectContentType(content string) schema.ChunkType
- func (fp *FormatParser) ParseCSV(ctx context.Context, filePath string) ([]*schema.Chunk, error)
- func (fp *FormatParser) ParseFile(ctx context.Context, filePath string) ([]*schema.Chunk, error)
- func (fp *FormatParser) ParseJSON(ctx context.Context, filePath string) ([]*schema.Chunk, error)
- func (fp *FormatParser) ParseMarkdown(ctx context.Context, content string) ([]*schema.Chunk, error)
- func (fp *FormatParser) ParsePDF(ctx context.Context, filePath string) ([]*schema.Chunk, error)
- func (fp *FormatParser) ParseTXT(ctx context.Context, filePath string) ([]*schema.Chunk, error)
- func (fp *FormatParser) ParseText(ctx context.Context, content string) ([]*schema.Chunk, error)
- type PDFMetadata
- type PDFPageInfo
- type PDFParser
- func (pp *PDFParser) DetectContentType(content string) schema.ChunkType
- func (pp *PDFParser) ParseFile(ctx context.Context, filePath string) ([]*schema.Chunk, error)
- func (pp *PDFParser) ParseMarkdown(ctx context.Context, content string) ([]*schema.Chunk, error)
- func (pp *PDFParser) ParsePDF(ctx context.Context, filePath string) ([]*schema.Chunk, error)
- func (pp *PDFParser) ParseText(ctx context.Context, content string) ([]*schema.Chunk, error)
- type TextParser
- func (tp *TextParser) DetectContentType(content string) schema.ChunkType
- func (tp *TextParser) ParseFile(ctx context.Context, filePath string) ([]*schema.Chunk, error)
- func (tp *TextParser) ParseMarkdown(ctx context.Context, content string) ([]*schema.Chunk, error)
- func (tp *TextParser) ParsePDF(ctx context.Context, filePath string) ([]*schema.Chunk, error)
- func (tp *TextParser) ParseText(ctx context.Context, content string) ([]*schema.Chunk, error)
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func DetectFileFormat ¶
func RemoveStopWords ¶
RemoveStopWords removes common stop words from a word list.
func SplitIntoWords ¶
SplitIntoWords splits text into words for keyword extraction.
Types ¶
type FormatParser ¶
type FormatParser struct {
// contains filtered or unexported fields
}
FormatParser implements parsing for common file formats (TXT, CSV, JSON)
func NewFormatParser ¶
func NewFormatParser(config *schema.ChunkingConfig) *FormatParser
NewFormatParser creates a new format parser
func (*FormatParser) DetectContentType ¶
func (fp *FormatParser) DetectContentType(content string) schema.ChunkType
DetectContentType detects the type of content
func (*FormatParser) ParseMarkdown ¶
ParseMarkdown parses markdown content with structure preservation
type PDFMetadata ¶
type PDFMetadata struct {
Title string `json:"title"`
Author string `json:"author"`
Subject string `json:"subject"`
Creator string `json:"creator"`
Producer string `json:"producer"`
CreationDate time.Time `json:"creation_date"`
ModDate time.Time `json:"modification_date"`
PageCount int `json:"page_count"`
FileSize int64 `json:"file_size"`
Keywords string `json:"keywords"`
Language string `json:"language"`
IsEncrypted bool `json:"is_encrypted"`
}
PDFMetadata contains extracted PDF document metadata
type PDFPageInfo ¶
type PDFPageInfo struct {
PageNumber int `json:"page_number"`
Text string `json:"text"`
WordCount int `json:"word_count"`
CharCount int `json:"char_count"`
IsEmpty bool `json:"is_empty"`
}
PDFPageInfo contains information about a specific page
type PDFParser ¶
type PDFParser struct {
// contains filtered or unexported fields
}
PDFParser handles PDF document parsing
func NewPDFParser ¶
func NewPDFParser(config *schema.ChunkingConfig) *PDFParser
NewPDFParser creates a new PDF parser with the given configuration
func (*PDFParser) DetectContentType ¶
DetectContentType always returns the PDF chunk type for PDFParser.
func (*PDFParser) ParseMarkdown ¶
ParseMarkdown is not applicable to PDFParser.
type TextParser ¶
type TextParser struct {
// contains filtered or unexported fields
}
TextParser implements the Parser interface for text content
func NewTextParser ¶
func NewTextParser(config *schema.ChunkingConfig) *TextParser
NewTextParser creates a new text parser with the given configuration
func (*TextParser) DetectContentType ¶
func (tp *TextParser) DetectContentType(content string) schema.ChunkType
DetectContentType detects the type of content
func (*TextParser) ParseMarkdown ¶
ParseMarkdown parses markdown content with structure preservation