odt

package
v1.6.6 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 4, 2026 | License: MIT | Imports: 8 | Imported by: 0

Documentation

Overview

Package odt provides ODT (OpenDocument Text) document parsing.

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type ExtractOptions added in v1.2.1

type ExtractOptions struct {
	ExcludeHeaders bool
	ExcludeFooters bool
}

ExtractOptions holds options for text extraction.

type ListParser

type ListParser struct {
	// contains filtered or unexported fields
}

ListParser handles parsing of ODT lists.

func NewListParser

func NewListParser(resolver *StyleResolver) *ListParser

NewListParser creates a new list parser.

func (*ListParser) ParseList

func (lp *ListParser) ParseList(list listXML, level int) ParsedList

ParseList parses a list XML element into a ParsedList.

type ListType

type ListType int

ListType represents the type of list.

const (
	ListTypeUnordered ListType = iota // Bullet list
	ListTypeOrdered                   // Numbered list
)

type ParsedList

type ParsedList struct {
	Items     []ParsedListItem
	Type      ListType
	StyleName string
	StartAt   int // Starting number for ordered lists
}

ParsedList represents a parsed list with its items.

func (*ParsedList) ToModelList

func (pl *ParsedList) ToModelList() *model.List

ToModelList converts a ParsedList to a model.List.

func (*ParsedList) ToText

func (pl *ParsedList) ToText() string

ToText returns a plain text representation of the list.

type ParsedListItem

type ParsedListItem struct {
	Text   string
	Level  int    // Indentation level (0-based)
	Bullet string // The bullet character or number prefix
}

ParsedListItem represents a single list item.

type ParsedTable

type ParsedTable struct {
	Rows       []ParsedTableRow
	ColWidths  []float64 // Column widths in points
	HasBorders bool
	StyleName  string
}

ParsedTable represents a parsed table with resolved structure.

func (*ParsedTable) ColCount

func (pt *ParsedTable) ColCount() int

ColCount returns the number of columns in the table.

func (*ParsedTable) ToMarkdown

func (pt *ParsedTable) ToMarkdown() string

ToMarkdown returns a markdown table representation.

func (*ParsedTable) ToModelTable

func (pt *ParsedTable) ToModelTable() *model.Table

ToModelTable converts a ParsedTable to a model.Table.

func (*ParsedTable) ToText

func (pt *ParsedTable) ToText() string

ToText returns a plain text representation of the table.

type ParsedTableCell

type ParsedTableCell struct {
	// Content
	Paragraphs []parsedParagraph
	Text       string // Combined text from all paragraphs

	// Structure
	ColSpan   int  // Number of columns spanned
	RowSpan   int  // Number of rows spanned
	IsCovered bool // True if this is a covered cell (part of a merge)

	// Dimensions
	Width float64 // Cell width in points

	// Styling
	VerticalAlign string // top, middle, bottom
	Background    string // Background color (hex)
	HasBorders    bool
	StyleName     string
}

ParsedTableCell represents a parsed table cell.

type ParsedTableRow

type ParsedTableRow struct {
	Cells     []ParsedTableCell
	Height    float64 // Row height in points (0 = auto)
	StyleName string
}

ParsedTableRow represents a parsed table row.

type Reader

type Reader struct {
	// contains filtered or unexported fields
}

Reader provides access to ODT document content.

func Open

func Open(filename string) (*Reader, error)

Open opens an ODT file for reading.

func (*Reader) Close

func (r *Reader) Close() error

Close releases resources associated with the Reader.

func (*Reader) Document

func (r *Reader) Document() (*model.Document, error)

Document returns a model.Document representation of the ODT content.

func (*Reader) FooterTexts added in v1.2.1

func (r *Reader) FooterTexts() []string

FooterTexts returns all footer text content from the document.

func (*Reader) HasFooters added in v1.2.1

func (r *Reader) HasFooters() bool

HasFooters returns true if the document has footer content.

func (*Reader) HasHeaders added in v1.2.1

func (r *Reader) HasHeaders() bool

HasHeaders returns true if the document has header content.

func (*Reader) HeaderTexts added in v1.2.1

func (r *Reader) HeaderTexts() []string

HeaderTexts returns all header text content from the document.

func (*Reader) Lists

func (r *Reader) Lists() []ParsedList

Lists returns all parsed lists from the document.

func (*Reader) Markdown

func (r *Reader) Markdown() (string, error)

Markdown returns the document content as a Markdown-formatted string.

func (*Reader) MarkdownWithOptions added in v1.2.1

func (r *Reader) MarkdownWithOptions(opts ExtractOptions) (string, error)

MarkdownWithOptions returns document content as Markdown with the specified options. When ExcludeHeaders or ExcludeFooters is true, content matching header/footer text will be filtered out.

func (*Reader) MarkdownWithRAGOptions added in v1.2.1

func (r *Reader) MarkdownWithRAGOptions(extractOpts ExtractOptions, mdOpts rag.MarkdownOptions) (string, error)

MarkdownWithRAGOptions returns document content as Markdown with both extraction and RAG-style markdown options. This supports options like IncludeMetadata, IncludeTableOfContents, HeadingLevelOffset, and MaxHeadingLevel.

func (*Reader) Metadata

func (r *Reader) Metadata() model.Metadata

Metadata returns document metadata.

func (*Reader) ModelTables

func (r *Reader) ModelTables() []*model.Table

ModelTables returns tables converted to model.Table format.

func (*Reader) PageCount

func (r *Reader) PageCount() (int, error)

PageCount returns the number of "pages" in the document. Because ODT documents have no fixed pagination, it returns 1, treating the entire document as a single page.

func (*Reader) Tables

func (r *Reader) Tables() []ParsedTable

Tables returns all parsed tables from the document.

func (*Reader) Text

func (r *Reader) Text() (string, error)

Text extracts and returns all text content from the document.

func (*Reader) TextWithOptions added in v1.2.1

func (r *Reader) TextWithOptions(opts ExtractOptions) (string, error)

TextWithOptions extracts text content with the specified options. When ExcludeHeaders or ExcludeFooters is true, content matching header/footer text will be filtered out.

type ResolvedListLevel

type ResolvedListLevel struct {
	Level      int
	IsBullet   bool
	BulletChar string
	NumFormat  string // "1", "a", "A", "i", "I"
	NumPrefix  string
	NumSuffix  string
	StartValue int
}

ResolvedListLevel contains resolved list level properties.

type ResolvedStyle

type ResolvedStyle struct {
	// Identity
	Name   string
	Family string // paragraph, text, table, table-cell, etc.

	// Heading info
	IsHeading    bool
	HeadingLevel int // 1-9, 0 if not a heading

	// Paragraph properties
	Alignment   string  // left, center, right, justify
	SpaceBefore float64 // points
	SpaceAfter  float64 // points
	LineSpacing float64 // points (0 = auto)
	IndentLeft  float64 // points
	IndentRight float64 // points
	IndentFirst float64 // points (first line indent, can be negative for hanging)

	// Run/character properties
	FontName  string
	FontSize  float64 // points
	Bold      bool
	Italic    bool
	Underline bool
	Strike    bool
	Color     string // hex color like "#FF0000"
}

ResolvedStyle contains the fully resolved properties for a style.

type StyleResolver

type StyleResolver struct {
	// contains filtered or unexported fields
}

StyleResolver resolves styles with inheritance support.

func NewStyleResolver

func NewStyleResolver(contentStyles *contentStylesXML, docStyles *stylesXML) *StyleResolver

NewStyleResolver creates a new style resolver from parsed styles.

func (*StyleResolver) Resolve

func (sr *StyleResolver) Resolve(styleName string) *ResolvedStyle

Resolve returns the fully resolved style for the given style name. If the style doesn't exist, returns a default style.

func (*StyleResolver) ResolveListLevel

func (sr *StyleResolver) ResolveListLevel(listStyleName string, level int) *ResolvedListLevel

ResolveListLevel returns the resolved list level for a given list style and level.

type TableParser

type TableParser struct {
	// contains filtered or unexported fields
}

TableParser handles parsing of ODT tables.

func NewTableParser

func NewTableParser(resolver *StyleResolver) *TableParser

NewTableParser creates a new table parser.

func (*TableParser) ParseTable

func (tp *TableParser) ParseTable(tbl tableXML) ParsedTable

ParseTable parses a table XML element into a ParsedTable.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL