Documentation
¶
Overview ¶
Package docx provides DOCX (Office Open XML) document parsing.
Index ¶
- type ExtractOptions
- type ListParser
- type ListType
- type NumberingResolver
- type ParsedList
- type ParsedListItem
- type ParsedTable
- type ParsedTableCell
- type ParsedTableRow
- type Reader
- func (r *Reader) Close() error
- func (r *Reader) Document() (*model.Document, error)
- func (r *Reader) FooterTexts() []string
- func (r *Reader) HasFooters() bool
- func (r *Reader) HasHeaders() bool
- func (r *Reader) HeaderTexts() []string
- func (r *Reader) Lists() []ParsedList
- func (r *Reader) Markdown() (string, error)
- func (r *Reader) MarkdownWithOptions(opts ExtractOptions) (string, error)
- func (r *Reader) MarkdownWithRAGOptions(extractOpts ExtractOptions, mdOpts rag.MarkdownOptions) (string, error)
- func (r *Reader) Metadata() model.Metadata
- func (r *Reader) ModelLists() []*model.List
- func (r *Reader) ModelTables() []*model.Table
- func (r *Reader) PageCount() (int, error)
- func (r *Reader) Tables() []ParsedTable
- func (r *Reader) Text() (string, error)
- func (r *Reader) TextWithOptions(opts ExtractOptions) (string, error)
- type ResolvedRun
- type ResolvedStyle
- type StyleResolver
- type TableParser
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type ExtractOptions ¶ added in v1.2.1
type ExtractOptions struct {
ExcludeHeaders bool
}
ExtractOptions holds options for text extraction.
type ListParser ¶
type ListParser struct {
// contains filtered or unexported fields
}
ListParser handles parsing of lists from document paragraphs.
func NewListParser ¶
func NewListParser(resolver *NumberingResolver) *ListParser
NewListParser creates a new list parser.
func (*ListParser) ExtractLists ¶
func (lp *ListParser) ExtractLists(paragraphs []parsedParagraph) []ParsedList
ExtractLists extracts lists from a sequence of parsed paragraphs. It groups consecutive list items into lists.
type NumberingResolver ¶
type NumberingResolver struct {
// contains filtered or unexported fields
}
NumberingResolver resolves numbering definitions from numbering.xml.
func NewNumberingResolver ¶
func NewNumberingResolver(numbering *numberingXML) *NumberingResolver
NewNumberingResolver creates a resolver from parsed numbering.xml.
func (*NumberingResolver) IsListParagraph ¶
func (nr *NumberingResolver) IsListParagraph(numID, ilvl string) bool
IsListParagraph returns true if the paragraph has numbering properties.
func (*NumberingResolver) ResolveLevel ¶
func (nr *NumberingResolver) ResolveLevel(numID string, level int) (listType ListType, bullet string, startAt int)
ResolveLevel returns the format info for a given numId and level.
type ParsedList ¶
type ParsedList struct {
Items []ParsedListItem
Type ListType
NumID string // Numbering ID from document
StartAt int // Starting number for ordered lists
}
ParsedList represents a parsed list with its items.
func (*ParsedList) ToModelList ¶
func (pl *ParsedList) ToModelList() *model.List
ToModelList converts a ParsedList to a model.List.
func (*ParsedList) ToText ¶
func (pl *ParsedList) ToText() string
ToText returns a plain text representation of the list.
type ParsedListItem ¶
type ParsedListItem struct {
Text string
Level int // Indentation level (0-based)
Bullet string // The bullet character or number prefix
NumID string
}
ParsedListItem represents a single list item.
type ParsedTable ¶
type ParsedTable struct {
Rows []ParsedTableRow
ColWidths []float64 // Column widths in points
HasBorders bool
StyleID string
}
ParsedTable represents a parsed table with resolved structure.
func (*ParsedTable) ColCount ¶
func (pt *ParsedTable) ColCount() int
ColCount returns the number of columns in the table.
func (*ParsedTable) ToMarkdown ¶
func (pt *ParsedTable) ToMarkdown() string
ToMarkdown returns a markdown table representation.
func (*ParsedTable) ToModelTable ¶
func (pt *ParsedTable) ToModelTable() *model.Table
ToModelTable converts a ParsedTable to a model.Table.
func (*ParsedTable) ToText ¶
func (pt *ParsedTable) ToText() string
ToText returns a plain text representation of the table.
type ParsedTableCell ¶
type ParsedTableCell struct {
// Content
Paragraphs []parsedParagraph
Text string // Combined text from all paragraphs
// Structure
ColSpan int // Number of columns spanned (gridSpan)
RowSpan int // Number of rows spanned (vMerge)
IsMergedContinuation bool // True if this is a continuation of a vertical merge
// Dimensions
Width float64 // Cell width in points
// Styling
VerticalAlign string // top, center, bottom
Shading string // Background color (hex)
HasBorders bool
// Nested tables
NestedTables []ParsedTable
}
ParsedTableCell represents a parsed table cell.
type ParsedTableRow ¶
type ParsedTableRow struct {
Cells []ParsedTableCell
Height float64 // Row height in points (0 = auto)
IsHeader bool
}
ParsedTableRow represents a parsed table row.
type Reader ¶
type Reader struct {
// contains filtered or unexported fields
}
Reader provides access to DOCX document content.
func (*Reader) FooterTexts ¶ added in v1.2.1
FooterTexts returns all footer text content from the document.
func (*Reader) HasFooters ¶ added in v1.2.1
HasFooters returns true if the document has footer content.
func (*Reader) HasHeaders ¶ added in v1.2.1
HasHeaders returns true if the document has header content.
func (*Reader) HeaderTexts ¶ added in v1.2.1
HeaderTexts returns all header text content from the document.
func (*Reader) Lists ¶
func (r *Reader) Lists() []ParsedList
Lists returns all parsed lists from the document.
func (*Reader) Markdown ¶
Markdown returns the document content as a Markdown-formatted string. It converts:
- Headings to # notation (level 1-6)
- Lists to - (bullet) or 1. (numbered) notation
- Tables to Markdown table format
- Paragraphs to plain text
func (*Reader) MarkdownWithOptions ¶ added in v1.2.1
func (r *Reader) MarkdownWithOptions(opts ExtractOptions) (string, error)
MarkdownWithOptions returns document content as Markdown with the specified options. When ExcludeHeaders or ExcludeFooters is true, content matching header/footer text will be filtered out.
func (*Reader) MarkdownWithRAGOptions ¶ added in v1.2.1
func (r *Reader) MarkdownWithRAGOptions(extractOpts ExtractOptions, mdOpts rag.MarkdownOptions) (string, error)
MarkdownWithRAGOptions returns document content as Markdown with both extraction and RAG-style markdown options. This supports options like IncludeMetadata, IncludeTableOfContents, HeadingLevelOffset, and MaxHeadingLevel.
func (*Reader) ModelLists ¶
ModelLists returns lists converted to model.List format.
func (*Reader) ModelTables ¶
ModelTables returns tables converted to model.Table format.
func (*Reader) PageCount ¶
PageCount returns the number of pages in the document. Because DOCX has no fixed pagination, it returns 1, treating the entire document as a single page.
func (*Reader) Tables ¶
func (r *Reader) Tables() []ParsedTable
Tables returns all parsed tables from the document.
func (*Reader) Text ¶
Text extracts and returns all text content from the document. This includes text from paragraphs, lists, and tables in document order.
func (*Reader) TextWithOptions ¶ added in v1.2.1
func (r *Reader) TextWithOptions(opts ExtractOptions) (string, error)
TextWithOptions extracts text content with the specified options. When ExcludeHeaders or ExcludeFooters is true, content matching header/footer text will be filtered out.
type ResolvedRun ¶
type ResolvedRun struct {
Text string
FontName string
FontSize float64
Bold bool
Italic bool
Underline bool
Strike bool
Color string
Highlight string
}
ResolvedRun contains resolved properties for a text run.
type ResolvedStyle ¶
type ResolvedStyle struct {
// Identity
ID string
Name string
Type string // paragraph, character, table
// Heading info
IsHeading bool
HeadingLevel int // 1-9, 0 if not a heading
// Paragraph properties
Alignment string // left, center, right, both (justify)
SpaceBefore float64 // points
SpaceAfter float64 // points
LineSpacing float64 // points (0 = auto)
IndentLeft float64 // points
IndentRight float64 // points
IndentFirst float64 // points (first line indent, can be negative for hanging)
// Run/character properties
FontName string
FontSize float64 // points
Bold bool
Italic bool
Underline bool
Strike bool
SmallCaps bool
AllCaps bool
Color string // hex color like "FF0000"
Highlight string // highlight color name
}
ResolvedStyle contains the fully resolved properties for a style.
type StyleResolver ¶
type StyleResolver struct {
// contains filtered or unexported fields
}
StyleResolver resolves styles with inheritance support.
func NewStyleResolver ¶
func NewStyleResolver(styles *stylesXML) *StyleResolver
NewStyleResolver creates a new style resolver from parsed styles.
func (*StyleResolver) Resolve ¶
func (sr *StyleResolver) Resolve(styleID string) *ResolvedStyle
Resolve returns the fully resolved style for the given style ID. If the style doesn't exist, it returns a default style.
func (*StyleResolver) ResolveRun ¶
func (sr *StyleResolver) ResolveRun(paragraphStyle string, runProps runPropsXML) *ResolvedRun
ResolveRun resolves run properties, combining paragraph style with direct formatting.
type TableParser ¶
type TableParser struct {
// contains filtered or unexported fields
}
TableParser handles parsing of DOCX tables.
func NewTableParser ¶
func NewTableParser(resolver *StyleResolver) *TableParser
NewTableParser creates a new table parser.
func (*TableParser) ParseTable ¶
func (tp *TableParser) ParseTable(tbl tableXML) ParsedTable
ParseTable parses a table XML element into a ParsedTable.