Documentation
¶
Index ¶
- type Alignment
- type BBox
- func (b BBox) Area() float64
- func (b BBox) Bottom() float64
- func (b BBox) Center() Point
- func (b BBox) Contains(p Point) bool
- func (b BBox) Expand(margin float64) BBox
- func (b BBox) Intersection(other BBox) BBox
- func (b BBox) Intersects(other BBox) bool
- func (b BBox) IsEmpty() bool
- func (b BBox) IsValid() bool
- func (b BBox) Left() float64
- func (b BBox) OverlapRatio(other BBox) float64
- func (b BBox) Right() float64
- func (b BBox) Top() float64
- func (b BBox) Union(other BBox) BBox
- type BlockInfo
- type Cell
- type CellStyle
- type Color
- type ColumnInfo
- type Document
- func (d *Document) AddPage(page *Page)
- func (d *Document) AllHeadings() []HeadingInfo
- func (d *Document) AllLists() []ListInfo
- func (d *Document) AllParagraphs() []ParagraphInfo
- func (d *Document) ExtractTables() []*Table
- func (d *Document) ExtractText() string
- func (d *Document) GetPage(number int) *Page
- func (d *Document) HasLayout() bool
- func (d *Document) LayoutStats() LayoutStats
- func (d *Document) PageCount() int
- func (d *Document) TableOfContents() []TOCEntry
- type Element
- type ElementType
- type Heading
- type HeadingInfo
- type Image
- type ImageFormat
- type LayoutStats
- type Line
- type LineInfo
- type List
- type ListInfo
- type ListItem
- type ListType
- type Matrix
- type Metadata
- type Page
- func (p *Page) AddElement(elem Element)
- func (p *Page) ColumnCount() int
- func (p *Page) ContentBBox() BBox
- func (p *Page) ElementsInReadingOrder() []Element
- func (p *Page) ExtractTables() []*Table
- func (p *Page) ExtractText() string
- func (p *Page) GetBlocks() []BlockInfo
- func (p *Page) GetElementsInRegion(bbox BBox) []Element
- func (p *Page) GetHeadings() []HeadingInfo
- func (p *Page) GetLists() []ListInfo
- func (p *Page) GetParagraphs() []ParagraphInfo
- func (p *Page) HasLayout() bool
- func (p *Page) IsMultiColumn() bool
- type PageLayout
- type Paragraph
- type ParagraphInfo
- type Point
- type TOCEntry
- type Table
- func (t *Table) BoundingBox() BBox
- func (t *Table) ColCount() int
- func (t *Table) GetCell(row, col int) *Cell
- func (t *Table) GetText() string
- func (t *Table) RowCount() int
- func (t *Table) SetCell(row, col int, cell Cell) error
- func (t *Table) ToCSV() string
- func (t *Table) ToMarkdown() string
- func (t *Table) Type() ElementType
- func (t *Table) ZIndex() int
- type TableGrid
- type TextAlignment
- type TextElement
- type TextFragment
- type TextStyle
- type VerticalAlignment
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type BBox ¶
type BBox struct {
X float64 // Left
Y float64 // Bottom (PDF coordinate system)
Width float64
Height float64
}
BBox represents a bounding box (rectangle)
func NewBBoxFromPoints ¶
NewBBoxFromPoints creates a bounding box from two points
func (BBox) Intersection ¶
Intersection returns the intersection of two bounding boxes
func (BBox) Intersects ¶
Intersects checks if two bounding boxes intersect
func (BBox) OverlapRatio ¶
OverlapRatio calculates the overlap ratio with another box. Returns a value between 0 and 1.
type BlockInfo ¶
type BlockInfo struct {
Index int // Block index
BBox BBox // Bounding box
LineCount int // Number of lines in block
Text string // Combined text content
Column int // Column index this block belongs to (-1 if unknown)
FontSize float64 // Average font size
Alignment Alignment // Text alignment
}
BlockInfo contains information about a detected text block
type Cell ¶
type Cell struct {
Text string
BBox BBox
RowSpan int
ColSpan int
IsHeader bool
// Cell styling
Style CellStyle
}
Cell represents a table cell
type CellStyle ¶
type CellStyle struct {
BackgroundColor Color
BorderColor Color
BorderWidth float64
TextStyle TextStyle
Alignment TextAlignment
VerticalAlign VerticalAlignment
}
CellStyle represents cell styling
type ColumnInfo ¶
type ColumnInfo struct {
Index int // Column index (0-based, left to right)
Left float64 // Left edge X coordinate
Right float64 // Right edge X coordinate
Width float64 // Column width
BBox BBox // Bounding box of column content
}
ColumnInfo contains information about a detected column
type Document ¶
Document represents a complete PDF document with extracted structure
func (*Document) AllHeadings ¶
func (d *Document) AllHeadings() []HeadingInfo
AllHeadings returns all detected headings across all pages
func (*Document) AllParagraphs ¶
func (d *Document) AllParagraphs() []ParagraphInfo
AllParagraphs returns all detected paragraphs across all pages
func (*Document) ExtractTables ¶
ExtractTables returns all tables from all pages
func (*Document) ExtractText ¶
ExtractText returns all text content concatenated
func (*Document) HasLayout ¶
HasLayout returns true if layout analysis has been performed on any page
func (*Document) LayoutStats ¶
func (d *Document) LayoutStats() LayoutStats
LayoutStats returns combined layout statistics for the entire document
func (*Document) TableOfContents ¶
TableOfContents returns headings organized as a document outline
type Element ¶
type Element interface {
Type() ElementType
BoundingBox() BBox
ZIndex() int
}
Element is the interface for all page elements
type ElementType ¶
type ElementType int
ElementType represents the type of page element
const (
ElementTypeUnknown ElementType = iota
ElementTypeParagraph
ElementTypeHeading
ElementTypeList
ElementTypeTable
ElementTypeImage
ElementTypeFigure
ElementTypeCaption
)
func (ElementType) String ¶
func (et ElementType) String() string
type Heading ¶
type Heading struct {
Text string
Level int // 1-6
BBox BBox
FontSize float64
FontName string
Style TextStyle
ZOrder int
}
Heading represents a heading
func (*Heading) BoundingBox ¶
func (*Heading) Type ¶
func (h *Heading) Type() ElementType
type HeadingInfo ¶
type HeadingInfo struct {
Level int // Heading level (1-6)
Text string // Heading text
BBox BBox // Bounding box
FontSize float64 // Font size
FontName string // Font name
Confidence float64 // Detection confidence (0-1)
}
HeadingInfo contains information about a detected heading
type Image ¶
type Image struct {
Data []byte
Format ImageFormat
BBox BBox
DPI float64
ZOrder int
// Alt text if available
AltText string
}
Image represents an embedded image
func (*Image) BoundingBox ¶
func (*Image) Type ¶
func (i *Image) Type() ElementType
type ImageFormat ¶
type ImageFormat int
ImageFormat represents image format
const (
ImageFormatUnknown ImageFormat = iota
ImageFormatJPEG
ImageFormatPNG
ImageFormatTIFF
ImageFormatJPEG2000
ImageFormatJBIG2
)
type LayoutStats ¶
type LayoutStats struct {
FragmentCount int // Number of text fragments processed
LineCount int // Number of text lines detected
BlockCount int // Number of text blocks detected
ParagraphCount int // Number of paragraphs detected
HeadingCount int // Number of headings detected
ListCount int // Number of lists detected
}
LayoutStats contains statistics about the layout analysis
type LineInfo ¶
type LineInfo struct {
Index int // Line index
BBox BBox // Bounding box
Text string // Text content
FontSize float64 // Average font size
Alignment Alignment // Detected alignment
IsIndent bool // Whether line appears indented
}
LineInfo contains information about a detected text line
type List ¶
List represents a list (ordered or unordered)
func (*List) BoundingBox ¶
func (*List) Type ¶
func (l *List) Type() ElementType
type ListInfo ¶
type ListInfo struct {
Type ListType // Type of list
Items []ListItem // List items
BBox BBox // Bounding box
Nested bool // Whether list contains nested items
StartValue int // Starting value for numbered lists
}
ListInfo contains information about a detected list
type Matrix ¶
type Matrix [6]float64
Matrix represents a 2D affine transformation matrix
func (Matrix) IsIdentity ¶
IsIdentity returns true if the matrix is an identity matrix
type Metadata ¶
type Metadata struct {
Title string
Author string
Subject string
Keywords []string
Creator string
Producer string
CreationDate time.Time
ModDate time.Time
// Custom metadata
Custom map[string]string
}
Metadata contains document-level information
type Page ¶
type Page struct {
Number int // 1-indexed page number
Width float64 // Page width in points
Height float64 // Page height in points
Rotation int // Rotation angle (0, 90, 180, 270)
Elements []Element // Ordered list of page elements
// Raw data for debugging/advanced use
RawText []TextFragment // All text fragments with positions
RawLines []Line // All detected lines/rectangles
// Layout analysis results (populated by AnalyzeLayout)
Layout *PageLayout // Layout analysis results, nil if not analyzed
}
Page represents a single page in a PDF document
func (*Page) AddElement ¶
AddElement adds an element to the page
func (*Page) ColumnCount ¶
ColumnCount returns the number of columns detected on this page
func (*Page) ContentBBox ¶
ContentBBox returns the bounding box of all content on the page, excluding headers and footers if detected
func (*Page) ElementsInReadingOrder ¶
ElementsInReadingOrder returns elements sorted by reading order. If layout analysis hasn't been performed, it returns elements in their original order.
func (*Page) ExtractTables ¶
ExtractTables returns all table elements on the page
func (*Page) ExtractText ¶
ExtractText concatenates all text elements
func (*Page) GetElementsInRegion ¶
GetElementsInRegion returns elements within a bounding box
func (*Page) GetHeadings ¶
func (p *Page) GetHeadings() []HeadingInfo
GetHeadings returns all headings on this page (requires layout analysis)
func (*Page) GetParagraphs ¶
func (p *Page) GetParagraphs() []ParagraphInfo
GetParagraphs returns all paragraphs on this page (requires layout analysis)
func (*Page) IsMultiColumn ¶
IsMultiColumn returns true if the page has multiple columns
type PageLayout ¶
type PageLayout struct {
// Column structure
Columns []ColumnInfo // Detected columns
ColumnCount int // Number of columns detected
// Text structure
TextBlocks []BlockInfo // Detected text blocks
Paragraphs []ParagraphInfo // Detected paragraphs
Lines []LineInfo // Detected text lines
// Semantic elements
Headings []HeadingInfo // Detected headings (H1-H6)
Lists []ListInfo // Detected lists
// Reading order
ReadingOrder []int // Indices into Elements in reading order
// Header/footer detection
HasHeader bool // Whether this page has a detected header
HeaderHeight float64 // Height of header region
// Statistics
Stats LayoutStats
}
PageLayout contains the results of layout analysis for a page
type Paragraph ¶
type Paragraph struct {
Text string
BBox BBox
FontSize float64
FontName string
Style TextStyle
Alignment TextAlignment
ZOrder int
}
Paragraph represents a paragraph of text
func (*Paragraph) BoundingBox ¶
func (*Paragraph) Type ¶
func (p *Paragraph) Type() ElementType
type ParagraphInfo ¶
type ParagraphInfo struct {
Index int // Paragraph index
BBox BBox // Bounding box
Text string // Text content
FontSize float64 // Average font size
FontName string // Primary font name
LineCount int // Number of lines
Alignment Alignment // Text alignment
FirstLine float64 // First line indent (positive = indented)
LineHeight float64 // Average line height
}
ParagraphInfo contains information about a detected paragraph
type TOCEntry ¶
type TOCEntry struct {
Level int // Heading level (1-6)
Text string // Heading text
Page int // Page number (1-indexed)
BBox BBox // Position on page
FontSize float64 // Font size of heading
}
TOCEntry represents an entry in the table of contents
type Table ¶
type Table struct {
Rows [][]Cell
BBox BBox
HasGrid bool // Whether table has visible gridlines
Confidence float64 // Detection confidence (0-1)
ZOrder int
}
Table represents a table with cells organized in rows and columns
func (*Table) BoundingBox ¶
func (*Table) ToMarkdown ¶
ToMarkdown converts the table to markdown format
func (*Table) Type ¶
func (t *Table) Type() ElementType
type TableGrid ¶
type TableGrid struct {
Rows []float64 // Y-coordinates of row boundaries
Cols []float64 // X-coordinates of column boundaries
HasHLines []bool // Horizontal line presence
HasVLines []bool // Vertical line presence
}
TableGrid represents the detected grid structure
func (*TableGrid) GetCellBBox ¶
GetCellBBox returns the bounding box for a cell
type TextAlignment ¶
type TextAlignment int
TextAlignment represents text alignment
const (
AlignLeft TextAlignment = iota
AlignCenter
AlignRight
AlignJustify
)
type TextElement ¶
TextElement is an interface for elements containing text
type TextFragment ¶
type TextFragment struct {
Text string
BBox BBox
FontSize float64
FontName string
Style TextStyle
Matrix [6]float64 // Text transformation matrix
}
TextFragment represents a positioned piece of text
type VerticalAlignment ¶
type VerticalAlignment int
VerticalAlignment represents vertical alignment
const (
VAlignTop VerticalAlignment = iota
VAlignMiddle
VAlignBottom
)