reader

package

v0.4.1 (latest) · Published: Mar 22, 2026 · License: Apache-2.0 · Imports: 16 · Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/carlos7ags/folio

Links

Open Source Insights

Documentation ¶

Overview ¶

Package reader provides a PDF parser that can open, read, and extract content from existing PDF files.

Index ¶

Variables
func ExtractText(data []byte) string
func ExtractTextWithFonts(data []byte, fonts FontCache) string
func ExtractWithStrategy(data []byte, fonts FontCache, strategy ExtractionStrategy) string
func GlyphToRune(name string) rune
type Box
- func (b Box) Height() float64
- func (b Box) IsZero() bool
- func (b Box) Width() float64
type CMap
- func ParseCMap(data []byte) *CMap
- func (cm *CMap) CodeBytes() int
- func (cm *CMap) Decode(raw []byte) string
type ContentOp
- func ParseContentStream(data []byte) []ContentOp
type ContentProcessor
- func NewContentProcessor(fonts FontCache) *ContentProcessor
- func (p *ContentProcessor) Glyphs() []GlyphSpan
- func (p *ContentProcessor) Images() []ImageRef
- func (p *ContentProcessor) Paths() []PathOp
- func (p *ContentProcessor) Process(ops []ContentOp) []TextSpan
- func (p *ContentProcessor) SetExtractGlyphs(enabled bool)
- func (p *ContentProcessor) SetFormResolver(fn func(name string) []ContentOp)
- func (p *ContentProcessor) Spans() []TextSpan
type Copier
- func NewCopier(reader *PdfReader, addObject func(core.PdfObject) *core.PdfIndirectReference) *Copier
- func (c *Copier) CopyObject(obj core.PdfObject) (core.PdfObject, error)
- func (c *Copier) CopyPage(pageIndex int) (*core.PdfIndirectReference, error)
type Encoding
- func (e *Encoding) Decode(raw []byte) string
type ExtractionStrategy
type FontCache
- func BuildFontCache(resources *core.PdfDictionary, res *resolver) FontCache
- func BuildFontCacheWithShared(resources *core.PdfDictionary, res *resolver, shared map[int]*FontEntry) FontCache
type FontEntry
- func (fe *FontEntry) CharWidth(charCode int) int
- func (fe *FontEntry) Decode(raw []byte) string
- func (fe *FontEntry) SpaceWidth() int
- func (fe *FontEntry) TextWidth(raw []byte) int
type GlyphSpan
type ImageRef
type LocationStrategy
- func (l *LocationStrategy) ProcessSpan(span TextSpan)
- func (l *LocationStrategy) Result() string
type MemoryLimits
type Modifier
- func Merge(readers ...*PdfReader) (*Modifier, error)
- func MergeFiles(paths ...string) (*Modifier, error)
- func (m *Modifier) AddBlankPage(width, height float64)
- func (m *Modifier) AddPageWithText(width, height float64, text string, f *font.Standard, fontSize, x, y float64)
- func (m *Modifier) SaveTo(path string) error
- func (m *Modifier) SetInfo(title, author string)
- func (m *Modifier) WriteTo(w io.Writer) (int64, error)
type PageInfo
- func (p *PageInfo) ContentOps() ([]ContentOp, error)
- func (p *PageInfo) ContentStream() ([]byte, error)
- func (p *PageInfo) Dict() *core.PdfDictionary
- func (p *PageInfo) ExtractTaggedText() (string, error)
- func (p *PageInfo) ExtractText() (string, error)
- func (p *PageInfo) ExtractTextWithStrategy(strategy ExtractionStrategy) (string, error)
- func (p *PageInfo) Resources() (*core.PdfDictionary, error)
- func (p *PageInfo) TextSpans() ([]TextSpan, error)
- func (p *PageInfo) VisibleBox() Box
type PaintOp
type Parser
- func NewParser(tok *Tokenizer) *Parser
- func (p *Parser) ParseIndirectObject() (objNum, genNum int, obj core.PdfObject, err error)
- func (p *Parser) ParseObject() (core.PdfObject, error)
type PathOp
type PathType
type PdfReader
- func Open(path string) (*PdfReader, error)
- func Parse(data []byte) (*PdfReader, error)
- func ParseWithOptions(data []byte, opts ReadOptions) (*PdfReader, error)
- func (r *PdfReader) Catalog() *core.PdfDictionary
- func (r *PdfReader) Info() (title, author, subject, creator, producer string)
- func (r *PdfReader) MaxObjectNumber() int
- func (r *PdfReader) Page(index int) (*PageInfo, error)
- func (r *PdfReader) PageCount() int
- func (r *PdfReader) RawBytes() []byte
- func (r *PdfReader) ResolveObject(obj core.PdfObject) (core.PdfObject, error)
- func (r *PdfReader) Trailer() *core.PdfDictionary
- func (r *PdfReader) Version() string
type ReadOptions
type RegionStrategy
- func NewRegionStrategy(x, y, w, h float64, inner ExtractionStrategy) *RegionStrategy
- func (r *RegionStrategy) ProcessSpan(span TextSpan)
- func (r *RegionStrategy) Result() string
type SimpleStrategy
- func (s *SimpleStrategy) ProcessSpan(span TextSpan)
- func (s *SimpleStrategy) Result() string
type Strictness
type StructNode
type StructureTree
- func ParseStructureTree(catalog *core.PdfDictionary, res *resolver) *StructureTree
type TaggedStrategy
- func NewTaggedStrategy(tree *StructureTree, pageNum int) *TaggedStrategy
- func (s *TaggedStrategy) ProcessSpan(span TextSpan)
- func (s *TaggedStrategy) Result() string
type TextSpan
type Token
type TokenType
type Tokenizer
- func NewTokenizer(data []byte) *Tokenizer
- func NewTokenizerFromReader(r io.Reader) (*Tokenizer, error)
- func (t *Tokenizer) AtEnd() bool
- func (t *Tokenizer) Data() []byte
- func (t *Tokenizer) MatchKeyword(kw string) bool
- func (t *Tokenizer) Next() Token
- func (t *Tokenizer) Peek() Token
- func (t *Tokenizer) Pos() int
- func (t *Tokenizer) ReadBytes(n int) ([]byte, error)
- func (t *Tokenizer) ReadLine() string
- func (t *Tokenizer) ReadStreamData(length int) []byte
- func (t *Tokenizer) SetPos(pos int)
- func (t *Tokenizer) Skip(n int)
- func (t *Tokenizer) SkipByte(b byte) bool
- func (t *Tokenizer) SkipWhitespace()

Constants ¶

This section is empty.

Variables ¶

View Source

var ErrMemoryLimitExceeded = errors.New("reader: decompressed stream exceeds memory limit")

ErrMemoryLimitExceeded is returned when decompressed data exceeds the configured limit.

View Source

var MacRomanEncoding = makeMacRomanEncoding()

MacRomanEncoding is the Mac OS Roman encoding.

View Source

var StandardEncoding = makeStandardEncoding()

StandardEncoding is Adobe's standard encoding for Type1 fonts.

View Source

var WinAnsiEncoding = makeWinAnsiEncoding()

WinAnsiEncoding is the Windows-1252 encoding used by most PDF simple fonts.

Functions ¶

func ExtractText ¶

func ExtractText(data []byte) string

ExtractText extracts plain text from a content stream. Returns concatenated text from Tj and TJ operators. This is a simple extraction — it doesn't handle font encoding, character mapping, or text positioning.

func ExtractTextWithFonts ¶

func ExtractTextWithFonts(data []byte, fonts FontCache) string

ExtractTextWithFonts extracts text from a content stream using font encoding information and text positioning to produce properly spaced Unicode text.

func ExtractWithStrategy ¶

func ExtractWithStrategy(data []byte, fonts FontCache, strategy ExtractionStrategy) string

ExtractWithStrategy runs the ContentProcessor and feeds spans to a strategy.

func GlyphToRune ¶

func GlyphToRune(name string) rune

GlyphToRune converts an Adobe glyph name to a Unicode rune. Returns 0 if unknown.

Types ¶

type Box ¶

type Box struct {
	X1, Y1, X2, Y2 float64
}

Box represents a PDF rectangle: [x1, y1, x2, y2] in points. x1,y1 is the lower-left corner; x2,y2 is the upper-right corner.

func (Box) Height ¶

func (b Box) Height() float64

Height returns the box height.

func (Box) IsZero ¶

func (b Box) IsZero() bool

IsZero reports whether the box is unset (all zeros).

func (Box) Width ¶

func (b Box) Width() float64

Width returns the box width.

type CMap ¶

type CMap struct {
	// contains filtered or unexported fields
}

CMap is a parsed ToUnicode CMap that maps character codes to Unicode strings.

func ParseCMap ¶

func ParseCMap(data []byte) *CMap

ParseCMap parses a ToUnicode CMap stream into a CMap.

func (*CMap) CodeBytes ¶

func (cm *CMap) CodeBytes() int

CodeBytes returns the number of bytes per character code. Returns 1 for single-byte CMaps, 2 for two-byte, 0 if unknown.

func (*CMap) Decode ¶

func (cm *CMap) Decode(raw []byte) string

Decode maps raw character code bytes to a Unicode string using the CMap.

type ContentOp ¶

type ContentOp struct {
	Operator string  // e.g. "BT", "Tf", "Tj", "cm", "re", "f"
	Operands []Token // operand tokens preceding the operator
}

ContentOp is a single PDF content stream operator with its operands.

func ParseContentStream ¶

func ParseContentStream(data []byte) []ContentOp

ParseContentStream parses a decompressed content stream into a sequence of operators. Each operator is returned with its preceding operands.

Content stream syntax:

operand1 operand2 ... operator
e.g.: /F1 12 Tf     (set font F1 at 12pt)
      100 700 Td     (move to x=100, y=700)
      (Hello) Tj     (show text "Hello")

type ContentProcessor ¶

type ContentProcessor struct {
	// contains filtered or unexported fields
}

ContentProcessor walks a sequence of ContentOps, maintains full graphics state (CTM, color, font, clipping), and produces typed results: TextSpans, PathOps, ImageRefs, and optionally GlyphSpans.

func NewContentProcessor ¶

func NewContentProcessor(fonts FontCache) *ContentProcessor

NewContentProcessor creates a processor with the given font cache. Pass nil for fonts if font decoding is not needed.

func (*ContentProcessor) Glyphs ¶

func (p *ContentProcessor) Glyphs() []GlyphSpan

Glyphs returns per-glyph spans (only if SetExtractGlyphs(true) was called).

func (*ContentProcessor) Images ¶

func (p *ContentProcessor) Images() []ImageRef

Images returns image references extracted during Process().

func (*ContentProcessor) Paths ¶

func (p *ContentProcessor) Paths() []PathOp

Paths returns path operations extracted during Process().

func (*ContentProcessor) Process ¶

func (p *ContentProcessor) Process(ops []ContentOp) []TextSpan

Process walks the content ops and extracts TextSpans with full positioning.

func (*ContentProcessor) SetExtractGlyphs ¶

func (p *ContentProcessor) SetExtractGlyphs(enabled bool)

SetExtractGlyphs enables per-glyph span extraction. When true, Process() also populates Glyphs().

func (*ContentProcessor) SetFormResolver ¶

func (p *ContentProcessor) SetFormResolver(fn func(name string) []ContentOp)

SetFormResolver sets a callback that resolves Form XObject names to their parsed content ops. When set, the processor recursively processes Form XObjects encountered via the Do operator.

Example:

proc.SetFormResolver(func(name string) []ContentOp {
    // Look up XObject in page resources, check /Subtype /Form,
    // decompress stream, parse content ops.
    return parseFormXObject(resources, name, resolver)
})

func (*ContentProcessor) Spans ¶

func (p *ContentProcessor) Spans() []TextSpan

Spans returns the collected text spans from the last Process call.

type Copier ¶

type Copier struct {
	// contains filtered or unexported fields
}

Copier copies objects from a PdfReader into a document writer, remapping indirect references so object numbers don't collide.

func NewCopier ¶

func NewCopier(reader *PdfReader, addObject func(core.PdfObject) *core.PdfIndirectReference) *Copier

NewCopier creates a copier that bridges a reader and a writer's AddObject function.

func (*Copier) CopyObject ¶

func (c *Copier) CopyObject(obj core.PdfObject) (core.PdfObject, error)

CopyObject deep-copies a PDF object, resolving and remapping all indirect references. Returns the new object suitable for the target writer.

func (*Copier) CopyPage ¶

func (c *Copier) CopyPage(pageIndex int) (*core.PdfIndirectReference, error)

CopyPage copies a page and all its resources from the source reader into the target writer. Returns the new page dictionary reference.

type Encoding ¶

type Encoding struct {
	// contains filtered or unexported fields
}

Encoding maps byte values (0-255) to Unicode runes for simple (non-CID) fonts.

func (*Encoding) Decode ¶

func (e *Encoding) Decode(raw []byte) string

Decode converts raw bytes through the encoding table to a Unicode string.

type ExtractionStrategy ¶

type ExtractionStrategy interface {
	// ProcessSpan receives a single TextSpan. Called in content stream order.
	ProcessSpan(span TextSpan)

	// Result returns the final assembled text.
	Result() string
}

ExtractionStrategy assembles text from a sequence of TextSpans. Different strategies produce different output: simple concatenation, spatial layout preservation, or region-filtered extraction.

type FontCache ¶

type FontCache map[string]*FontEntry

FontCache maps font resource names (e.g. "F1") to their FontEntry.

func BuildFontCache ¶

func BuildFontCache(resources *core.PdfDictionary, res *resolver) FontCache

BuildFontCache constructs a FontCache from a page's Resources dictionary. The resolver is used to dereference indirect objects (font dicts, streams).

func BuildFontCacheWithShared ¶

func BuildFontCacheWithShared(resources *core.PdfDictionary, res *resolver, shared map[int]*FontEntry) FontCache

BuildFontCacheWithShared constructs a FontCache like BuildFontCache, but reuses parsed FontEntry values from a shared cross-page cache keyed by indirect reference object number. This avoids re-parsing the same font dictionary on every page of a multi-page document.

type FontEntry ¶

type FontEntry struct {
	// contains filtered or unexported fields
}

FontEntry holds the decoded character mapping and glyph widths for a single PDF font used during content stream parsing.

func (*FontEntry) CharWidth ¶

func (fe *FontEntry) CharWidth(charCode int) int

CharWidth returns the width of a character code in 1/1000 of text space. Returns 0 if width data is not available (caller should use estimation).

func (*FontEntry) Decode ¶

func (fe *FontEntry) Decode(raw []byte) string

Decode converts raw character code bytes to Unicode text.

func (*FontEntry) SpaceWidth ¶

func (fe *FontEntry) SpaceWidth() int

SpaceWidth returns the width of the space character in 1/1000 of text space. It returns 0 if the space glyph width is not available.

func (*FontEntry) TextWidth ¶

func (fe *FontEntry) TextWidth(raw []byte) int

TextWidth computes the width of raw character code bytes in 1/1000 of text space. For simple fonts, each byte is one character code; for CIDFonts, each pair of bytes forms one character code.

type GlyphSpan ¶

type GlyphSpan struct {
	Char  rune
	X, Y  float64 // baseline position in user space
	Width float64 // glyph width in user space
	Font  string
	Color [3]float64
}

GlyphSpan is a single glyph with its individual position and width. Produced when glyph-level extraction is enabled.

type ImageRef ¶

type ImageRef struct {
	Name   string     // XObject resource name (e.g. "Im1")
	X, Y   float64    // position in user space (bottom-left of image)
	Width  float64    // display width in user space
	Height float64    // display height in user space
	Matrix [6]float64 // full CTM at time of rendering
	Inline bool       // true if inline image (BI/ID/EI)
}

ImageRef represents an image reference found in the content stream.

type LocationStrategy ¶

type LocationStrategy struct {
	// contains filtered or unexported fields
}

LocationStrategy sorts text by position (top-to-bottom, left-to-right) to reconstruct the visual layout of the page. This handles PDFs where text is drawn in non-reading order.

func (*LocationStrategy) ProcessSpan ¶

func (l *LocationStrategy) ProcessSpan(span TextSpan)

ProcessSpan collects visible spans for later spatial sorting.

func (*LocationStrategy) Result ¶

func (l *LocationStrategy) Result() string

Result sorts spans top-to-bottom, left-to-right and returns the assembled text.

type MemoryLimits ¶

type MemoryLimits struct {
	// MaxStreamSize is the maximum decompressed size of a single stream in bytes.
	// Prevents zip-bomb attacks where a small compressed payload expands to gigabytes.
	// Default: 256 MB. Set to -1 to disable.
	MaxStreamSize int64

	// MaxTotalAlloc is the maximum total decompressed bytes across all streams
	// in a single document. Default: 1 GB. Set to -1 to disable.
	MaxTotalAlloc int64

	// MaxXrefSize is the maximum decompressed size of an xref stream.
	// Xref streams are parsed before the resolver is available, so they have
	// a separate (smaller) limit. Default: 32 MB. Set to -1 to disable.
	MaxXrefSize int64

	// MaxObjectCount is the maximum number of objects allowed in the xref table.
	// Prevents excessive memory from a malicious xref claiming millions of objects.
	// Default: 1,000,000. Set to -1 to disable.
	MaxObjectCount int
}

MemoryLimits configures memory safety bounds for the PDF reader. Zero values use sensible defaults. Negative values disable the limit.

type Modifier ¶

type Modifier struct {
	// contains filtered or unexported fields
}

Modifier builds a new PDF from copied pages and new content. It bridges the reader (source) and writer (output).

func Merge ¶

func Merge(readers ...*PdfReader) (*Modifier, error)

Merge concatenates multiple PDFs into a single PDF. Pages are appended in order: all pages from the first PDF, then all pages from the second, etc.

Example ¶

package main

import (
	"bytes"
	"fmt"

	"github.com/carlos7ags/folio/document"
	"github.com/carlos7ags/folio/reader"
)

func main() {
	// Create two PDFs.
	makePDF := func(title string) []byte {
		doc := document.NewDocument(document.PageSizeLetter)
		doc.Info.Title = title
		doc.AddPage()
		var buf bytes.Buffer
		_, _ = doc.WriteTo(&buf)
		return buf.Bytes()
	}

	r1, _ := reader.Parse(makePDF("Doc A"))
	r2, _ := reader.Parse(makePDF("Doc B"))

	m, _ := reader.Merge(r1, r2)
	m.SetInfo("Combined", "Folio")

	var out bytes.Buffer
	_, _ = m.WriteTo(&out)

	result, _ := reader.Parse(out.Bytes())
	fmt.Println("Merged pages:", result.PageCount())

}

Output:

Merged pages: 2

func MergeFiles ¶

func MergeFiles(paths ...string) (*Modifier, error)

MergeFiles is a convenience that opens, parses, and merges PDF files.

func (*Modifier) AddBlankPage ¶

func (m *Modifier) AddBlankPage(width, height float64)

AddBlankPage adds a blank page with the given dimensions.

func (*Modifier) AddPageWithText ¶

func (m *Modifier) AddPageWithText(width, height float64, text string, f *font.Standard, fontSize, x, y float64)

AddPageWithText adds a page with simple text content.

func (*Modifier) SaveTo ¶

func (m *Modifier) SaveTo(path string) error

SaveTo writes the merged/modified PDF to a file.

func (*Modifier) SetInfo ¶

func (m *Modifier) SetInfo(title, author string)

SetInfo sets document metadata on the output PDF.

func (*Modifier) WriteTo ¶

func (m *Modifier) WriteTo(w io.Writer) (int64, error)

WriteTo writes the merged/modified PDF to the given writer.

type PageInfo ¶

type PageInfo struct {
	Number int     // 1-based page number
	Width  float64 // page width in points (from effective visible box)
	Height float64 // page height in points (from effective visible box)
	Rotate int     // rotation in degrees (0, 90, 180, 270)

	// The 5 PDF page geometry boxes (ISO 32000 §14.11.2).
	// MediaBox is required; the others are optional. CropBox defaults to MediaBox,
	// and BleedBox, TrimBox, and ArtBox default to CropBox.
	MediaBox Box // page boundaries — the full physical medium
	CropBox  Box // visible region (default = MediaBox)
	BleedBox Box // region for production bleed (default = CropBox)
	TrimBox  Box // intended finished page dimensions (default = CropBox)
	ArtBox   Box // meaningful content area (default = CropBox)
	// contains filtered or unexported fields
}

PageInfo holds parsed information about a single page.

func (*PageInfo) ContentOps ¶

func (p *PageInfo) ContentOps() ([]ContentOp, error)

ContentOps parses the page's content stream into a sequence of operators.

func (*PageInfo) ContentStream ¶

func (p *PageInfo) ContentStream() ([]byte, error)

ContentStream returns the decompressed content stream bytes for this page. If the page has multiple content streams, they are concatenated.

func (*PageInfo) Dict ¶

func (p *PageInfo) Dict() *core.PdfDictionary

Dict returns the raw page dictionary.

func (*PageInfo) ExtractTaggedText ¶

func (p *PageInfo) ExtractTaggedText() (string, error)

ExtractTaggedText extracts text using the structure tree for logical reading order. If the document is not tagged, it falls back to LocationStrategy.

func (*PageInfo) ExtractText ¶

func (p *PageInfo) ExtractText() (string, error)

ExtractText returns text extracted from the page content stream. It uses the page's font resources to decode character codes to Unicode via ToUnicode CMaps and standard encodings (WinAnsi, MacRoman).

func (*PageInfo) ExtractTextWithStrategy ¶

func (p *PageInfo) ExtractTextWithStrategy(strategy ExtractionStrategy) (string, error)

ExtractTextWithStrategy extracts text using a pluggable strategy.

func (*PageInfo) Resources ¶

func (p *PageInfo) Resources() (*core.PdfDictionary, error)

Resources returns the page's resource dictionary, resolving indirect references. If the page has no /Resources entry, falls back to resources inherited from ancestor Pages nodes (per ISO 32000 §7.7.3.4).

func (*PageInfo) TextSpans ¶

func (p *PageInfo) TextSpans() ([]TextSpan, error)

TextSpans extracts all text spans from the page with full positioning, font, and color information. This is the richest extraction method.

func (*PageInfo) VisibleBox ¶

func (p *PageInfo) VisibleBox() Box

VisibleBox returns the effective visible area of the page. This is the CropBox if set, otherwise the MediaBox.

type PaintOp ¶

type PaintOp int

PaintOp describes how a path was painted.

const (
	PaintNone       PaintOp = iota
	PaintStroke             // S
	PaintFill               // f
	PaintFillStroke         // B
	PaintClip               // W
)

type Parser ¶

type Parser struct {
	// contains filtered or unexported fields
}

Parser builds PDF objects from a token stream.

func NewParser ¶

func NewParser(tok *Tokenizer) *Parser

NewParser creates a parser wrapping a tokenizer.

func (*Parser) ParseIndirectObject ¶

func (p *Parser) ParseIndirectObject() (objNum, genNum int, obj core.PdfObject, err error)

ParseIndirectObject reads "objNum genNum obj ... endobj" and returns the object number, generation number, and the contained object.

func (*Parser) ParseObject ¶

func (p *Parser) ParseObject() (core.PdfObject, error)

ParseObject reads the next PDF object from the token stream. Returns one of the core.PdfObject types, or an error.

type PathOp ¶

type PathOp struct {
	Type        PathType     // move, line, curve, rect, close
	Points      [][2]float64 // control/end points in user space
	StrokeColor [3]float64
	FillColor   [3]float64
	LineWidth   float64
	Painted     PaintOp // how the path was painted (stroke, fill, both)
}

PathOp represents a graphics path operation extracted from a content stream.

type PathType ¶

type PathType int

PathType identifies the kind of path segment.

const (
	PathMove  PathType = iota // moveto
	PathLine                  // lineto
	PathCurve                 // cubic bezier
	PathRect                  // rectangle
	PathClose                 // close subpath
)

type PdfReader ¶

type PdfReader struct {
	// contains filtered or unexported fields
}

PdfReader holds the parsed state of an existing PDF file, including the cross-reference table, object resolver, document catalog, and pages.

func Open ¶

func Open(path string) (*PdfReader, error)

Open reads and parses a PDF file from disk.

func Parse ¶

func Parse(data []byte) (*PdfReader, error)

Parse reads and parses a PDF from a byte slice.

Example ¶

package main

import (
	"bytes"
	"fmt"

	"github.com/carlos7ags/folio/document"
	"github.com/carlos7ags/folio/font"
	"github.com/carlos7ags/folio/reader"
)

func main() {
	// Generate a PDF in memory.
	doc := document.NewDocument(document.PageSizeLetter)
	doc.Info.Title = "Example"
	p := doc.AddPage()
	p.AddText("Hello World", font.Helvetica, 12, 72, 700)
	var buf bytes.Buffer
	_, _ = doc.WriteTo(&buf)

	// Parse it back.
	r, err := reader.Parse(buf.Bytes())
	if err != nil {
		fmt.Println("error:", err)
		return
	}

	fmt.Println("Version:", r.Version())
	fmt.Println("Pages:", r.PageCount())
	title, _, _, _, _ := r.Info()
	fmt.Println("Title:", title)

}

Output:

Version: 1.7
Pages: 1
Title: Example

func ParseWithOptions ¶

func ParseWithOptions(data []byte, opts ReadOptions) (*PdfReader, error)

ParseWithOptions reads and parses a PDF with custom options.

func (*PdfReader) Catalog ¶

func (r *PdfReader) Catalog() *core.PdfDictionary

Catalog returns the document catalog dictionary.

func (*PdfReader) Info ¶

func (r *PdfReader) Info() (title, author, subject, creator, producer string)

Info returns the document info dictionary values.

func (*PdfReader) MaxObjectNumber ¶

func (r *PdfReader) MaxObjectNumber() int

MaxObjectNumber returns the highest object number in the xref table.

func (*PdfReader) Page ¶

func (r *PdfReader) Page(index int) (*PageInfo, error)

Page returns information about the page at the given 0-based index.

func (*PdfReader) PageCount ¶

func (r *PdfReader) PageCount() int

PageCount returns the number of pages in the document.

func (*PdfReader) RawBytes ¶

func (r *PdfReader) RawBytes() []byte

RawBytes returns the raw PDF data that was parsed. This is needed for incremental save operations (e.g., digital signing).

func (*PdfReader) ResolveObject ¶

func (r *PdfReader) ResolveObject(obj core.PdfObject) (core.PdfObject, error)

ResolveObject resolves an indirect reference to its target object.

func (*PdfReader) Trailer ¶

func (r *PdfReader) Trailer() *core.PdfDictionary

Trailer returns the trailer dictionary from the most recent xref section.

func (*PdfReader) Version ¶

func (r *PdfReader) Version() string

Version returns the PDF version from the header (e.g. "1.7").

type ReadOptions ¶

type ReadOptions struct {
	Strictness   Strictness
	MaxCache     int          // max cached objects (0 = default 10000)
	MemoryLimits MemoryLimits // memory safety limits for decompression
}

ReadOptions configures the PDF reader.

type RegionStrategy ¶

type RegionStrategy struct {
	// contains filtered or unexported fields
}

RegionStrategy extracts text only from spans that fall within a specified rectangle. Useful for extracting text from a specific area of a page (e.g., a header, footer, or form field).

func NewRegionStrategy ¶

func NewRegionStrategy(x, y, w, h float64, inner ExtractionStrategy) *RegionStrategy

NewRegionStrategy creates a strategy that filters to a rectangle. (x, y) is the bottom-left corner; w and h are dimensions. The inner strategy assembles the filtered text.

func (*RegionStrategy) ProcessSpan ¶

func (r *RegionStrategy) ProcessSpan(span TextSpan)

ProcessSpan forwards the span to the inner strategy if it overlaps the region.

func (*RegionStrategy) Result ¶

func (r *RegionStrategy) Result() string

Result returns the inner strategy's assembled text.

type SimpleStrategy ¶

type SimpleStrategy struct {
	// contains filtered or unexported fields
}

SimpleStrategy concatenates text in content stream order, inserting spaces for gaps and newlines for line changes. This matches our original ExtractText behavior.

func (*SimpleStrategy) ProcessSpan ¶

func (s *SimpleStrategy) ProcessSpan(span TextSpan)

ProcessSpan appends the span's text, inserting spaces for gaps and newlines for line changes.

func (*SimpleStrategy) Result ¶

func (s *SimpleStrategy) Result() string

Result returns the assembled text.

type Strictness ¶

type Strictness int

Strictness controls how the reader handles malformed PDFs.

The strictness level affects behavior throughout the reader pipeline:

- XRef parsing: tolerant mode falls back to xref repair on parse errors; strict mode fails immediately.
- Object resolution: tolerant mode may return null for unparseable objects; strict mode returns an error.
- Stream decompression: tolerant mode ignores unknown filters and returns raw data; strict mode rejects unknown filters.

const (
	// StrictnessTolerant attempts to recover from common PDF errors.
	// This is the default and handles most real-world PDFs.
	StrictnessTolerant Strictness = iota

	// StrictnessStrict fails immediately on any spec violation.
	StrictnessStrict
)

type StructNode ¶

type StructNode struct {
	Tag        string        // structure type (e.g. "P", "H1", "Table", "Span")
	MCID       int           // marked content identifier (-1 if not a leaf)
	PageObjNum int           // page object number for this MCID
	Children   []*StructNode // child nodes
}

StructNode represents a node in the PDF structure tree.

type StructureTree ¶

type StructureTree struct {
	Root *StructNode
}

StructureTree represents the parsed PDF structure tree.

func ParseStructureTree ¶

func ParseStructureTree(catalog *core.PdfDictionary, res *resolver) *StructureTree

ParseStructureTree extracts the structure tree from a PDF catalog. Returns nil if the document is not tagged (no /MarkInfo or /StructTreeRoot).

type TaggedStrategy ¶

type TaggedStrategy struct {
	// contains filtered or unexported fields
}

TaggedStrategy extracts text in logical reading order using the PDF structure tree. Falls back to position-based ordering for untagged content.

func NewTaggedStrategy ¶

func NewTaggedStrategy(tree *StructureTree, pageNum int) *TaggedStrategy

NewTaggedStrategy creates a strategy that uses the structure tree to determine reading order. pageNum is the 0-based page index (unlike PageInfo.Number, which is 1-based).

func (*TaggedStrategy) ProcessSpan ¶

func (s *TaggedStrategy) ProcessSpan(span TextSpan)

ProcessSpan collects visible spans for later structure-tree-ordered assembly.

func (*TaggedStrategy) Result ¶

func (s *TaggedStrategy) Result() string

Result walks the structure tree and returns text assembled in logical reading order, with block-level tags producing line breaks.

type TextSpan ¶

type TextSpan struct {
	Text       string     // decoded Unicode text
	X, Y       float64    // baseline position in user space (after CTM)
	Width      float64    // text width in user space (from glyph metrics or estimate)
	Height     float64    // font size in user space
	Font       string     // font resource name (e.g. "F1")
	Color      [3]float64 // fill color (RGB, 0-1)
	Matrix     [6]float64 // full CTM at time of rendering [a b c d e f]
	Tag        string     // innermost marked content tag (e.g. "P", "H1", "Span"), empty if untagged
	Visible    bool       // false if text rendering mode is invisible (Tr=3)
	SpaceWidth float64    // width of space character in user space (0 if unavailable)
	MCID       int        // marked content identifier (-1 if not inside marked content)
}

TextSpan is a positioned piece of text extracted from a content stream. It carries full rendering context: position, size, font, color, and the current transformation matrix at the time of rendering.

type Token ¶

type Token struct {
	Type  TokenType
	Value string  // raw text value
	Int   int64   // parsed integer (for TokenNumber)
	Real  float64 // parsed float (for TokenNumber)
	IsInt bool    // true if the number is an integer
	Pos   int64   // byte offset in the input
}

Token is a single PDF lexical token.

type TokenType ¶

type TokenType int

TokenType identifies the kind of PDF token.

const (
	TokenNumber     TokenType = iota // integer or real number
	TokenString                      // literal string (...)
	TokenHexString                   // hexadecimal string <...>
	TokenName                        // /Name
	TokenBool                        // true or false
	TokenNull                        // null
	TokenKeyword                     // obj, endobj, stream, endstream, xref, trailer, startxref, R, etc.
	TokenArrayOpen                   // [
	TokenArrayClose                  // ]
	TokenDictOpen                    // <<
	TokenDictClose                   // >>
	TokenEOF                         // end of input
)

type Tokenizer ¶

type Tokenizer struct {
	// contains filtered or unexported fields
}

Tokenizer reads PDF tokens from a byte stream.

func NewTokenizer ¶

func NewTokenizer(data []byte) *Tokenizer

NewTokenizer creates a tokenizer over the given byte slice.

func NewTokenizerFromReader ¶

func NewTokenizerFromReader(r io.Reader) (*Tokenizer, error)

NewTokenizerFromReader reads all bytes and creates a tokenizer.

func (*Tokenizer) AtEnd ¶

func (t *Tokenizer) AtEnd() bool

AtEnd reports whether all input has been consumed.

func (*Tokenizer) Data ¶

func (t *Tokenizer) Data() []byte

Data returns the underlying byte slice.

func (*Tokenizer) MatchKeyword ¶

func (t *Tokenizer) MatchKeyword(kw string) bool

MatchKeyword checks if the bytes at the current position match the keyword. Does not advance the position.

func (*Tokenizer) Next ¶

func (t *Tokenizer) Next() Token

Next returns the next token, skipping whitespace and comments.

func (*Tokenizer) Peek ¶

func (t *Tokenizer) Peek() Token

Peek returns the next token without advancing the position.

func (*Tokenizer) Pos ¶

func (t *Tokenizer) Pos() int

Pos returns the current byte position.

func (*Tokenizer) ReadBytes ¶

func (t *Tokenizer) ReadBytes(n int) ([]byte, error)

ReadBytes reads n bytes from the current position.

func (*Tokenizer) ReadLine ¶

func (t *Tokenizer) ReadLine() string

ReadLine reads bytes until the next line ending, used for xref parsing.

func (*Tokenizer) ReadStreamData ¶

func (t *Tokenizer) ReadStreamData(length int) []byte

ReadStreamData reads stream data: it skips the "stream" keyword and EOL, reads `length` bytes of data, then skips the EOL and "endstream". If length <= 0, it scans for the "endstream" marker instead.

func (*Tokenizer) SetPos ¶

func (t *Tokenizer) SetPos(pos int)

SetPos seeks to a byte position.

func (*Tokenizer) Skip ¶

func (t *Tokenizer) Skip(n int)

Skip advances the position by n bytes.

func (*Tokenizer) SkipByte ¶

func (t *Tokenizer) SkipByte(b byte) bool

SkipByte advances past a specific byte if it matches.

func (*Tokenizer) SkipWhitespace ¶

func (t *Tokenizer) SkipWhitespace()

SkipWhitespace advances past whitespace and comments.

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL