formats

package

Version: v0.0.0-...-dbba89f (Latest) | Published: Apr 14, 2026 | License: MIT | Imports: 13 | Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/NortonBen/ai-memory-go

Links

Open Source Insights

Documentation ¶

Overview ¶

Package formats - Multi-format parser implementations for TXT, CSV, and JSON

Package formats - PDF parsing implementation using unipdf

Package formats - Text chunking implementation

Index ¶

func DetectFileFormat(filePath string) string
func RemoveStopWords(words []string) []string
func SplitIntoWords(text string) []string
type FormatParser
- func NewFormatParser(config *schema.ChunkingConfig) *FormatParser
type PDFMetadata
type PDFPageInfo
type PDFParser
- func NewPDFParser(config *schema.ChunkingConfig) *PDFParser
type TextParser
- func NewTextParser(config *schema.ChunkingConfig) *TextParser

Constants ¶

This section is empty.

Variables ¶

This section is empty.

Functions ¶

func DetectFileFormat ¶

func DetectFileFormat(filePath string) string

func RemoveStopWords ¶

func RemoveStopWords(words []string) []string

RemoveStopWords removes common stop words from a word list

func SplitIntoWords ¶

func SplitIntoWords(text string) []string

SplitIntoWords splits text into words for keyword extraction

Types ¶

type FormatParser ¶

type FormatParser struct {
	// contains filtered or unexported fields
}

FormatParser implements parsing for common file formats (TXT, CSV, JSON)

func NewFormatParser ¶

func NewFormatParser(config *schema.ChunkingConfig) *FormatParser

NewFormatParser creates a new format parser

func (*FormatParser) DetectContentType ¶

func (fp *FormatParser) DetectContentType(content string) schema.ChunkType

DetectContentType detects the type of content

func (*FormatParser) ParseCSV ¶

func (fp *FormatParser) ParseCSV(ctx context.Context, filePath string) ([]*schema.Chunk, error)

ParseCSV parses CSV files and converts each row to a chunk

func (*FormatParser) ParseFile ¶

func (fp *FormatParser) ParseFile(ctx context.Context, filePath string) ([]*schema.Chunk, error)

ParseFile parses a file based on its extension

func (*FormatParser) ParseJSON ¶

func (fp *FormatParser) ParseJSON(ctx context.Context, filePath string) ([]*schema.Chunk, error)

ParseJSON parses JSON files and converts them to chunks

func (*FormatParser) ParseMarkdown ¶

func (fp *FormatParser) ParseMarkdown(ctx context.Context, content string) ([]*schema.Chunk, error)

ParseMarkdown parses markdown content with structure preservation

func (*FormatParser) ParsePDF ¶

func (fp *FormatParser) ParsePDF(ctx context.Context, filePath string) ([]*schema.Chunk, error)

ParsePDF is not supported by FormatParser; use PDFParser.ParsePDF to parse PDF files

func (*FormatParser) ParseTXT ¶

func (fp *FormatParser) ParseTXT(ctx context.Context, filePath string) ([]*schema.Chunk, error)

ParseTXT parses plain text files

func (*FormatParser) ParseText ¶

func (fp *FormatParser) ParseText(ctx context.Context, content string) ([]*schema.Chunk, error)

ParseText parses raw text content into chunks

type PDFMetadata ¶

type PDFMetadata struct {
	Title        string    `json:"title"`
	Author       string    `json:"author"`
	Subject      string    `json:"subject"`
	Creator      string    `json:"creator"`
	Producer     string    `json:"producer"`
	CreationDate time.Time `json:"creation_date"`
	ModDate      time.Time `json:"modification_date"`
	PageCount    int       `json:"page_count"`
	FileSize     int64     `json:"file_size"`
	Keywords     string    `json:"keywords"`
	Language     string    `json:"language"`
	IsEncrypted  bool      `json:"is_encrypted"`
}

PDFMetadata contains extracted PDF document metadata

type PDFPageInfo ¶

type PDFPageInfo struct {
	PageNumber int    `json:"page_number"`
	Text       string `json:"text"`
	WordCount  int    `json:"word_count"`
	CharCount  int    `json:"char_count"`
	IsEmpty    bool   `json:"is_empty"`
}

PDFPageInfo contains information about a specific page

type PDFParser ¶

type PDFParser struct {
	// contains filtered or unexported fields
}

PDFParser handles PDF document parsing

func NewPDFParser ¶

func NewPDFParser(config *schema.ChunkingConfig) *PDFParser

NewPDFParser creates a new PDF parser with the given configuration

func (*PDFParser) DetectContentType ¶

func (pp *PDFParser) DetectContentType(content string) schema.ChunkType

DetectContentType always returns PDF type for PDFParser

func (*PDFParser) ParseFile ¶

func (pp *PDFParser) ParseFile(ctx context.Context, filePath string) ([]*schema.Chunk, error)

ParseFile implements the Parser interface for PDF files

func (*PDFParser) ParseMarkdown ¶

func (pp *PDFParser) ParseMarkdown(ctx context.Context, content string) ([]*schema.Chunk, error)

ParseMarkdown is not applicable for PDFParser

func (*PDFParser) ParsePDF ¶

func (pp *PDFParser) ParsePDF(ctx context.Context, filePath string) ([]*schema.Chunk, error)

ParsePDF extracts text content from a PDF file and converts it to chunks

func (*PDFParser) ParseText ¶

func (pp *PDFParser) ParseText(ctx context.Context, content string) ([]*schema.Chunk, error)

ParseText is not applicable for PDFParser as it works with files

type TextParser ¶

type TextParser struct {
	// contains filtered or unexported fields
}

TextParser implements the Parser interface for text content

func NewTextParser ¶

func NewTextParser(config *schema.ChunkingConfig) *TextParser

NewTextParser creates a new text parser with the given configuration

func (*TextParser) DetectContentType ¶

func (tp *TextParser) DetectContentType(content string) schema.ChunkType

DetectContentType detects the type of content

func (*TextParser) ParseFile ¶

func (tp *TextParser) ParseFile(ctx context.Context, filePath string) ([]*schema.Chunk, error)

ParseFile parses a file based on its extension

func (*TextParser) ParseMarkdown ¶

func (tp *TextParser) ParseMarkdown(ctx context.Context, content string) ([]*schema.Chunk, error)

ParseMarkdown parses markdown content with structure preservation

func (*TextParser) ParsePDF ¶

func (tp *TextParser) ParsePDF(ctx context.Context, filePath string) ([]*schema.Chunk, error)

ParsePDF parses PDF files using the PDFParser

func (*TextParser) ParseText ¶

func (tp *TextParser) ParseText(ctx context.Context, content string) ([]*schema.Chunk, error)

ParseText implements text chunking based on the configured strategy

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL