formats

package
v0.0.0-...-dbba89f Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 14, 2026 License: MIT Imports: 13 Imported by: 0

Documentation

Overview

Package formats - Multi-format parser implementations for TXT, CSV, and JSON

Package formats - PDF parsing implementation using unipdf

Package formats - Text chunking implementation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func DetectFileFormat

func DetectFileFormat(filePath string) string

func RemoveStopWords

func RemoveStopWords(words []string) []string

RemoveStopWords removes common stop words from a word list

func SplitIntoWords

func SplitIntoWords(text string) []string

SplitIntoWords splits text into words for keyword extraction

Types

type FormatParser

type FormatParser struct {
	// contains filtered or unexported fields
}

FormatParser implements parsing for common file formats (TXT, CSV, JSON)

func NewFormatParser

func NewFormatParser(config *schema.ChunkingConfig) *FormatParser

NewFormatParser creates a new format parser

func (*FormatParser) DetectContentType

func (fp *FormatParser) DetectContentType(content string) schema.ChunkType

DetectContentType detects the chunk type (schema.ChunkType) of the given content

func (*FormatParser) ParseCSV

func (fp *FormatParser) ParseCSV(ctx context.Context, filePath string) ([]*schema.Chunk, error)

ParseCSV parses CSV files and converts each row to a chunk

func (*FormatParser) ParseFile

func (fp *FormatParser) ParseFile(ctx context.Context, filePath string) ([]*schema.Chunk, error)

ParseFile parses a file based on its extension

func (*FormatParser) ParseJSON

func (fp *FormatParser) ParseJSON(ctx context.Context, filePath string) ([]*schema.Chunk, error)

ParseJSON parses JSON files and converts them to chunks

func (*FormatParser) ParseMarkdown

func (fp *FormatParser) ParseMarkdown(ctx context.Context, content string) ([]*schema.Chunk, error)

ParseMarkdown parses markdown content with structure preservation

func (*FormatParser) ParsePDF

func (fp *FormatParser) ParsePDF(ctx context.Context, filePath string) ([]*schema.Chunk, error)

ParsePDF is not supported by FormatParser, which does not parse PDF files

func (*FormatParser) ParseTXT

func (fp *FormatParser) ParseTXT(ctx context.Context, filePath string) ([]*schema.Chunk, error)

ParseTXT parses plain text files

func (*FormatParser) ParseText

func (fp *FormatParser) ParseText(ctx context.Context, content string) ([]*schema.Chunk, error)

ParseText parses raw text content into chunks

type PDFMetadata

type PDFMetadata struct {
	Title        string    `json:"title"`
	Author       string    `json:"author"`
	Subject      string    `json:"subject"`
	Creator      string    `json:"creator"`
	Producer     string    `json:"producer"`
	CreationDate time.Time `json:"creation_date"`
	ModDate      time.Time `json:"modification_date"`
	PageCount    int       `json:"page_count"`
	FileSize     int64     `json:"file_size"`
	Keywords     string    `json:"keywords"`
	Language     string    `json:"language"`
	IsEncrypted  bool      `json:"is_encrypted"`
}

PDFMetadata contains extracted PDF document metadata

type PDFPageInfo

type PDFPageInfo struct {
	PageNumber int    `json:"page_number"`
	Text       string `json:"text"`
	WordCount  int    `json:"word_count"`
	CharCount  int    `json:"char_count"`
	IsEmpty    bool   `json:"is_empty"`
}

PDFPageInfo contains information about a specific page

type PDFParser

type PDFParser struct {
	// contains filtered or unexported fields
}

PDFParser handles PDF document parsing

func NewPDFParser

func NewPDFParser(config *schema.ChunkingConfig) *PDFParser

NewPDFParser creates a new PDF parser with the given configuration

func (*PDFParser) DetectContentType

func (pp *PDFParser) DetectContentType(content string) schema.ChunkType

DetectContentType always returns PDF type for PDFParser

func (*PDFParser) ParseFile

func (pp *PDFParser) ParseFile(ctx context.Context, filePath string) ([]*schema.Chunk, error)

ParseFile implements the Parser interface for PDF files

func (*PDFParser) ParseMarkdown

func (pp *PDFParser) ParseMarkdown(ctx context.Context, content string) ([]*schema.Chunk, error)

ParseMarkdown is not applicable to PDFParser

func (*PDFParser) ParsePDF

func (pp *PDFParser) ParsePDF(ctx context.Context, filePath string) ([]*schema.Chunk, error)

ParsePDF extracts text content from a PDF file and converts it to chunks

func (*PDFParser) ParseText

func (pp *PDFParser) ParseText(ctx context.Context, content string) ([]*schema.Chunk, error)

ParseText is not applicable to PDFParser, which works with files rather than raw text content

type TextParser

type TextParser struct {
	// contains filtered or unexported fields
}

TextParser implements the Parser interface for text content

func NewTextParser

func NewTextParser(config *schema.ChunkingConfig) *TextParser

NewTextParser creates a new text parser with the given configuration

func (*TextParser) DetectContentType

func (tp *TextParser) DetectContentType(content string) schema.ChunkType

DetectContentType detects the type of content

func (*TextParser) ParseFile

func (tp *TextParser) ParseFile(ctx context.Context, filePath string) ([]*schema.Chunk, error)

ParseFile parses a file based on its extension

func (*TextParser) ParseMarkdown

func (tp *TextParser) ParseMarkdown(ctx context.Context, content string) ([]*schema.Chunk, error)

ParseMarkdown parses markdown content with structure preservation

func (*TextParser) ParsePDF

func (tp *TextParser) ParsePDF(ctx context.Context, filePath string) ([]*schema.Chunk, error)

ParsePDF parses PDF files using the PDFParser

func (*TextParser) ParseText

func (tp *TextParser) ParseText(ctx context.Context, content string) ([]*schema.Chunk, error)

ParseText implements text chunking based on the configured strategy

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL