docx

package

Version: v1.6.3 (latest) · Published: Feb 3, 2026 · License: MIT · Imports: 9 · Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/tsawler/tabula

Links

Open Source Insights

Documentation ¶

Overview ¶

Package docx provides DOCX (Office Open XML) document parsing.

Index ¶

type ExtractOptions
type ListParser
- func NewListParser(resolver *NumberingResolver) *ListParser
- func (lp *ListParser) ExtractLists(paragraphs []parsedParagraph) []ParsedList
type ListType
type NumberingResolver
- func NewNumberingResolver(numbering *numberingXML) *NumberingResolver
- func (nr *NumberingResolver) IsListParagraph(numID, ilvl string) bool
- func (nr *NumberingResolver) ResolveLevel(numID string, level int) (listType ListType, bullet string, startAt int)
type ParsedList
- func (pl *ParsedList) ToModelList() *model.List
- func (pl *ParsedList) ToText() string
type ParsedListItem
type ParsedTable
- func (pt *ParsedTable) ColCount() int
- func (pt *ParsedTable) ToMarkdown() string
- func (pt *ParsedTable) ToModelTable() *model.Table
- func (pt *ParsedTable) ToText() string
type ParsedTableCell
type ParsedTableRow
type Reader
- func Open(filename string) (*Reader, error)
- func (r *Reader) Close() error
- func (r *Reader) Document() (*model.Document, error)
- func (r *Reader) FooterTexts() []string
- func (r *Reader) HasFooters() bool
- func (r *Reader) HasHeaders() bool
- func (r *Reader) HeaderTexts() []string
- func (r *Reader) Lists() []ParsedList
- func (r *Reader) Markdown() (string, error)
- func (r *Reader) MarkdownWithOptions(opts ExtractOptions) (string, error)
- func (r *Reader) MarkdownWithRAGOptions(extractOpts ExtractOptions, mdOpts rag.MarkdownOptions) (string, error)
- func (r *Reader) Metadata() model.Metadata
- func (r *Reader) ModelLists() []*model.List
- func (r *Reader) ModelTables() []*model.Table
- func (r *Reader) PageCount() (int, error)
- func (r *Reader) Tables() []ParsedTable
- func (r *Reader) Text() (string, error)
- func (r *Reader) TextWithOptions(opts ExtractOptions) (string, error)
type ResolvedRun
type ResolvedStyle
type StyleResolver
- func NewStyleResolver(styles *stylesXML) *StyleResolver
- func (sr *StyleResolver) Resolve(styleID string) *ResolvedStyle
- func (sr *StyleResolver) ResolveRun(paragraphStyle string, runProps runPropsXML) *ResolvedRun
type TableParser
- func NewTableParser(resolver *StyleResolver) *TableParser
- func (tp *TableParser) ParseTable(tbl tableXML) ParsedTable

Constants ¶

This section is empty.

Variables ¶

This section is empty.

Functions ¶

This section is empty.

Types ¶

type ExtractOptions ¶ added in v1.2.1

type ExtractOptions struct {
	ExcludeHeaders bool
	ExcludeFooters bool
}

ExtractOptions holds options for text and Markdown extraction (see TextWithOptions, MarkdownWithOptions, and MarkdownWithRAGOptions).

type ListParser ¶

type ListParser struct {
	// contains filtered or unexported fields
}

ListParser handles parsing of lists from document paragraphs.

func NewListParser ¶

func NewListParser(resolver *NumberingResolver) *ListParser

func (*ListParser) ExtractLists ¶

func (lp *ListParser) ExtractLists(paragraphs []parsedParagraph) []ParsedList

ExtractLists extracts lists from a sequence of parsed paragraphs. It groups consecutive list items into lists.

type ListType ¶

type ListType int

ListType represents the type of list.

const (
	ListTypeUnordered ListType = iota // Bullet list
	ListTypeOrdered                   // Numbered list
)

type NumberingResolver ¶

type NumberingResolver struct {
	// contains filtered or unexported fields
}

NumberingResolver resolves numbering definitions from numbering.xml.

func NewNumberingResolver ¶

func NewNumberingResolver(numbering *numberingXML) *NumberingResolver

NewNumberingResolver creates a resolver from parsed numbering.xml.

func (*NumberingResolver) IsListParagraph ¶

func (nr *NumberingResolver) IsListParagraph(numID, ilvl string) bool

IsListParagraph returns true if the paragraph has numbering properties.

func (*NumberingResolver) ResolveLevel ¶

func (nr *NumberingResolver) ResolveLevel(numID string, level int) (listType ListType, bullet string, startAt int)

ResolveLevel returns the format info for a given numId and level.

type ParsedList ¶

type ParsedList struct {
	Items   []ParsedListItem
	Type    ListType
	NumID   string // Numbering ID from document
	StartAt int    // Starting number for ordered lists
}

ParsedList represents a parsed list with its items.

func (*ParsedList) ToModelList ¶

func (pl *ParsedList) ToModelList() *model.List

ToModelList converts a ParsedList to a model.List.

func (*ParsedList) ToText ¶

func (pl *ParsedList) ToText() string

ToText returns a plain text representation of the list.

type ParsedListItem ¶

type ParsedListItem struct {
	Text   string
	Level  int    // Indentation level (0-based)
	Bullet string // The bullet character or number prefix
	NumID  string
}

ParsedListItem represents a single list item.

type ParsedTable ¶

type ParsedTable struct {
	Rows       []ParsedTableRow
	ColWidths  []float64 // Column widths in points
	HasBorders bool
	StyleID    string
}

ParsedTable represents a parsed table with resolved structure.

func (*ParsedTable) ColCount ¶

func (pt *ParsedTable) ColCount() int

ColCount returns the number of columns in the table.

func (*ParsedTable) ToMarkdown ¶

func (pt *ParsedTable) ToMarkdown() string

ToMarkdown returns a markdown table representation.

func (*ParsedTable) ToModelTable ¶

func (pt *ParsedTable) ToModelTable() *model.Table

ToModelTable converts a ParsedTable to a model.Table.

func (*ParsedTable) ToText ¶

func (pt *ParsedTable) ToText() string

ToText returns a plain text representation of the table.

type ParsedTableCell ¶

type ParsedTableCell struct {
	// Content
	Paragraphs []parsedParagraph
	Text       string // Combined text from all paragraphs

	// Structure
	ColSpan              int  // Number of columns spanned (gridSpan)
	RowSpan              int  // Number of rows spanned (vMerge)
	IsMergedContinuation bool // True if this is a continuation of a vertical merge

	// Dimensions
	Width float64 // Cell width in points

	// Styling
	VerticalAlign string // top, center, bottom
	Shading       string // Background color (hex)
	HasBorders    bool

	// Nested tables
	NestedTables []ParsedTable
}

ParsedTableCell represents a parsed table cell.

type ParsedTableRow ¶

type ParsedTableRow struct {
	Cells    []ParsedTableCell
	Height   float64 // Row height in points (0 = auto)
	IsHeader bool
}

ParsedTableRow represents a parsed table row.

type Reader ¶

type Reader struct {
	// contains filtered or unexported fields
}

Reader provides access to DOCX document content.

func Open ¶

func Open(filename string) (*Reader, error)

func (*Reader) Close ¶

func (r *Reader) Close() error

Close releases resources associated with the Reader.

func (*Reader) Document ¶

func (r *Reader) Document() (*model.Document, error)

Document returns a model.Document representation of the DOCX content.

func (*Reader) FooterTexts ¶ added in v1.2.1

func (r *Reader) FooterTexts() []string

FooterTexts returns all footer text content from the document.

func (*Reader) HasFooters ¶ added in v1.2.1

func (r *Reader) HasFooters() bool

HasFooters returns true if the document has footer content.

func (*Reader) HasHeaders ¶ added in v1.2.1

func (r *Reader) HasHeaders() bool

HasHeaders returns true if the document has header content.

func (*Reader) HeaderTexts ¶ added in v1.2.1

func (r *Reader) HeaderTexts() []string

HeaderTexts returns all header text content from the document.

func (*Reader) Lists ¶

func (r *Reader) Lists() []ParsedList

Lists returns all parsed lists from the document.

func (*Reader) Markdown ¶

func (r *Reader) Markdown() (string, error)

Markdown returns the document content as a Markdown-formatted string. It converts:

- Headings to # notation (levels 1-6)
- Lists to - (bullet) or 1. (numbered) notation
- Tables to Markdown table format
- Paragraphs to plain text

func (*Reader) MarkdownWithOptions ¶ added in v1.2.1

func (r *Reader) MarkdownWithOptions(opts ExtractOptions) (string, error)

MarkdownWithOptions returns document content as Markdown with the specified options. When ExcludeHeaders or ExcludeFooters is true, content matching header/footer text will be filtered out.

func (*Reader) MarkdownWithRAGOptions ¶ added in v1.2.1

func (r *Reader) MarkdownWithRAGOptions(extractOpts ExtractOptions, mdOpts rag.MarkdownOptions) (string, error)

MarkdownWithRAGOptions returns document content as Markdown with both extraction and RAG-style markdown options. This supports options like IncludeMetadata, IncludeTableOfContents, HeadingLevelOffset, and MaxHeadingLevel.

func (*Reader) Metadata ¶

func (r *Reader) Metadata() model.Metadata

Metadata returns document metadata.

func (*Reader) ModelLists ¶

func (r *Reader) ModelLists() []*model.List

ModelLists returns lists converted to model.List format.

func (*Reader) ModelTables ¶

func (r *Reader) ModelTables() []*model.Table

ModelTables returns tables converted to model.Table format.

func (*Reader) PageCount ¶

func (r *Reader) PageCount() (int, error)

PageCount returns the number of "pages" in the document. Because DOCX does not have fixed pages, it returns 1 (the entire document is treated as a single page).

func (*Reader) Tables ¶

func (r *Reader) Tables() []ParsedTable

Tables returns all parsed tables from the document.

func (*Reader) Text ¶

func (r *Reader) Text() (string, error)

Text extracts and returns all text content from the document. This includes text from paragraphs, lists, and tables in document order.

func (*Reader) TextWithOptions ¶ added in v1.2.1

func (r *Reader) TextWithOptions(opts ExtractOptions) (string, error)

TextWithOptions extracts text content with the specified options. When ExcludeHeaders or ExcludeFooters is true, content matching header/footer text will be filtered out.

type ResolvedRun ¶

type ResolvedRun struct {
	Text      string
	FontName  string
	FontSize  float64
	Bold      bool
	Italic    bool
	Underline bool
	Strike    bool
	Color     string
	Highlight string
}

ResolvedRun contains resolved properties for a text run.

type ResolvedStyle ¶

type ResolvedStyle struct {
	// Identity
	ID   string
	Name string
	Type string // paragraph, character, table

	// Heading info
	IsHeading    bool
	HeadingLevel int // 1-9, 0 if not a heading

	// Paragraph properties
	Alignment   string  // left, center, right, both (justify)
	SpaceBefore float64 // points
	SpaceAfter  float64 // points
	LineSpacing float64 // points (0 = auto)
	IndentLeft  float64 // points
	IndentRight float64 // points
	IndentFirst float64 // points (first line indent, can be negative for hanging)

	// Run/character properties
	FontName  string
	FontSize  float64 // points
	Bold      bool
	Italic    bool
	Underline bool
	Strike    bool
	SmallCaps bool
	AllCaps   bool
	Color     string // hex color like "FF0000"
	Highlight string // highlight color name
}

ResolvedStyle contains the fully resolved properties for a style.

type StyleResolver ¶

type StyleResolver struct {
	// contains filtered or unexported fields
}

StyleResolver resolves styles with inheritance support.

func NewStyleResolver ¶

func NewStyleResolver(styles *stylesXML) *StyleResolver

NewStyleResolver creates a new style resolver from parsed styles.

func (*StyleResolver) Resolve ¶

func (sr *StyleResolver) Resolve(styleID string) *ResolvedStyle

Resolve returns the fully resolved style for the given style ID. If the style doesn't exist, returns a default style.

func (*StyleResolver) ResolveRun ¶

func (sr *StyleResolver) ResolveRun(paragraphStyle string, runProps runPropsXML) *ResolvedRun

ResolveRun resolves run properties, combining paragraph style with direct formatting.

type TableParser ¶

type TableParser struct {
	// contains filtered or unexported fields
}

TableParser handles parsing of DOCX tables.

func NewTableParser ¶

func NewTableParser(resolver *StyleResolver) *TableParser

NewTableParser creates a new table parser.

func (*TableParser) ParseTable ¶

func (tp *TableParser) ParseTable(tbl tableXML) ParsedTable

ParseTable parses a table XML element into a ParsedTable.

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL

Documentation ¶

Overview ¶

Index ¶

Constants ¶

Variables ¶

Functions ¶

Types ¶

type ExtractOptions ¶ added in v1.2.1

type ListParser ¶

func NewListParser ¶

func (*ListParser) ExtractLists ¶

type ListType ¶

type NumberingResolver ¶

func NewNumberingResolver ¶

func (*NumberingResolver) IsListParagraph ¶

func (*NumberingResolver) ResolveLevel ¶

type ParsedList ¶

func (*ParsedList) ToModelList ¶

func (*ParsedList) ToText ¶

type ParsedListItem ¶

type ParsedTable ¶

func (*ParsedTable) ColCount ¶

func (*ParsedTable) ToMarkdown ¶

func (*ParsedTable) ToModelTable ¶

func (*ParsedTable) ToText ¶

type ParsedTableCell ¶

type ParsedTableRow ¶

type Reader ¶

func Open ¶

func (*Reader) Close ¶

func (*Reader) Document ¶

func (*Reader) FooterTexts ¶ added in v1.2.1

func (*Reader) HasFooters ¶ added in v1.2.1

func (*Reader) HasHeaders ¶ added in v1.2.1

func (*Reader) HeaderTexts ¶ added in v1.2.1

func (*Reader) Lists ¶

func (*Reader) Markdown ¶

func (*Reader) MarkdownWithOptions ¶ added in v1.2.1

func (*Reader) MarkdownWithRAGOptions ¶ added in v1.2.1

func (*Reader) Metadata ¶

func (*Reader) ModelLists ¶

func (*Reader) ModelTables ¶

func (*Reader) PageCount ¶

func (*Reader) Tables ¶

func (*Reader) Text ¶

func (*Reader) TextWithOptions ¶ added in v1.2.1

type ResolvedRun ¶

type ResolvedStyle ¶

type StyleResolver ¶

func NewStyleResolver ¶

func (*StyleResolver) Resolve ¶

func (*StyleResolver) ResolveRun ¶

type TableParser ¶

func NewTableParser ¶

func (*TableParser) ParseTable ¶

Source Files ¶