font

package

v1.0.0 Latest Latest Go to latest Published: Nov 27, 2025 License: MIT Imports: 9 Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/tsawler/tabula

Links

Open Source Insights

Documentation ¶

Index ¶

Variables
func DecodeUTF16BE(data []byte) string
func DecodeUTF16LE(data []byte) string
func DecodeWithEncoding(data []byte, encodingName string) string
func IsEmojiSequence(s string) bool
func IsValidUTF8(s string) bool
func IsVerticalEncoding(encoding string) bool
func NormalizeEmojiSequence(s string) string
func NormalizeUnicode(s string) string
type CIDFont
- func NewCIDFont(fontDict core.Dict, resolver func(core.IndirectRef) (core.Object, error)) (*CIDFont, error)
- func (cid *CIDFont) GetCharacterCollection() string
- func (cid *CIDFont) GetWidthForCID(cidValue int) float64
- func (cid *CIDFont) IsCJK() bool
- func (cid *CIDFont) IsChinese() bool
- func (cid *CIDFont) IsJapanese() bool
- func (cid *CIDFont) IsKorean() bool
type CIDSystemInfo
type CMap
- func NewCMap() *CMap
- func ParseToUnicodeCMap(stream *core.Stream) (*CMap, error)
- func (cm *CMap) Lookup(charCode uint32) string
- func (cm *CMap) LookupString(data []byte) string
type CMapRange
type CMapTable
type CustomEncoding
- func NewCustomEncoding(base Encoding, differences map[byte]rune) *CustomEncoding
- func NewCustomEncodingFromGlyphs(base Encoding, differences map[byte]string) *CustomEncoding
- func (e *CustomEncoding) Decode(b byte) rune
- func (e *CustomEncoding) DecodeString(data []byte) string
- func (e *CustomEncoding) Name() string
type Encoding
- func GetEncoding(name string) Encoding
- func InferEncodingFromFontName(fontName string) Encoding
type Font
- func NewFont(name, baseFont, subtype string) *Font
- func (f *Font) DecodeString(data []byte) string
- func (f *Font) GetStringWidth(s string) float64
- func (f *Font) GetWidth(r rune) float64
- func (f *Font) IsStandardFont() bool
- func (f *Font) IsVertical() bool
type FontDescriptor
type Metric
type TrueTypeFont
- func NewTrueTypeFont(fontDict core.Dict, resolver func(core.IndirectRef) (core.Object, error)) (*TrueTypeFont, error)
- func (tt *TrueTypeFont) GetGlyphID(r rune) uint16
- func (tt *TrueTypeFont) GetWidthFromGlyph(glyphID uint16) float64
type Type0Font
- func NewType0Font(fontDict core.Dict, resolver func(core.IndirectRef) (core.Object, error)) (*Type0Font, error)
- func (t0 *Type0Font) GetWidth(r rune) float64
type Type1Font
- func NewType1Font(fontDict core.Dict, resolver func(core.IndirectRef) (core.Object, error)) (*Type1Font, error)
type VerticalMetrics
type WidthRange

Constants ¶

This section is empty.

Variables ¶

View Source

// MacRomanEncoding is the classic Mac OS encoding for Western European
// languages.
var MacRomanEncoding = &standardEncoding{
	name:  "MacRomanEncoding",
	table: macRomanTable,
}

MacRomanEncoding - Classic Mac OS encoding for Western European languages

View Source

// PDFDocEncoding is PDF's default encoding for text strings.
var PDFDocEncoding = &standardEncoding{
	name:  "PDFDocEncoding",
	table: pdfDocTable,
}

PDFDocEncoding - PDF's default encoding for text strings

View Source

// StandardEncodingTable is the Adobe StandardEncoding for Type1 fonts.
// Its reported encoding name is "StandardEncoding".
var StandardEncodingTable = &standardEncoding{
	name:  "StandardEncoding",
	table: standardEncodingTableData,
}

StandardEncodingTable - Adobe StandardEncoding for Type1 fonts

View Source

// SymbolEncoding is the Adobe Symbol font encoding. It maps character codes
// to Greek letters, mathematical symbols, etc.
var SymbolEncoding = &standardEncoding{
	name:  "SymbolEncoding",
	table: symbolEncodingTable,
}

SymbolEncoding - Adobe Symbol font encoding. Maps character codes to Greek letters, mathematical symbols, etc.

View Source

// WinAnsiEncoding (Windows Code Page 1252) is a Western European encoding.
// It is the most common encoding in PDFs created on Windows.
var WinAnsiEncoding = &standardEncoding{
	name:  "WinAnsiEncoding",
	table: winAnsiTable,
}

WinAnsiEncoding (Windows Code Page 1252) - Western European encoding. This is the most common encoding in PDFs created on Windows.

View Source

// ZapfDingbatsEncoding is the Adobe ZapfDingbats font encoding. It maps
// character codes to decorative symbols, arrows, etc.
var ZapfDingbatsEncoding = &standardEncoding{
	name:  "ZapfDingbatsEncoding",
	table: zapfDingbatsEncodingTable,
}

ZapfDingbatsEncoding - Adobe ZapfDingbats font encoding. Maps character codes to decorative symbols, arrows, etc.

Functions ¶

func DecodeUTF16BE ¶

func DecodeUTF16BE(data []byte) string

DecodeUTF16BE decodes UTF-16 Big Endian encoded bytes to a string. Note: the input should NOT include the BOM (FEFF); it should be stripped before calling.

func DecodeUTF16LE ¶

func DecodeUTF16LE(data []byte) string

DecodeUTF16LE decodes UTF-16 Little Endian encoded bytes to a string. Note: the input should NOT include the BOM (FFFE); it should be stripped before calling.

func DecodeWithEncoding ¶

func DecodeWithEncoding(data []byte, encodingName string) string

DecodeWithEncoding decodes data using the specified encoding and applies Unicode normalization

func IsEmojiSequence ¶

func IsEmojiSequence(s string) bool

IsEmojiSequence checks if a string contains emoji sequences. Emoji can be multi-codepoint: base + modifiers (skin tone) + ZWJ sequences.

func IsValidUTF8 ¶

func IsValidUTF8(s string) bool

IsValidUTF8 checks if a string is valid UTF-8. This is useful for detecting UTF-16BE strings (which will fail UTF-8 validation).

func IsVerticalEncoding ¶

func IsVerticalEncoding(encoding string) bool

IsVerticalEncoding checks if an encoding name indicates vertical writing mode. Identity-V is used for vertical text in CJK fonts; Identity-H (or any other encoding) is horizontal.

func NormalizeEmojiSequence ¶

func NormalizeEmojiSequence(s string) string

NormalizeEmojiSequence normalizes emoji sequences for consistent storage. This handles skin tone modifiers and ZWJ sequences.

func NormalizeUnicode ¶

func NormalizeUnicode(s string) string

NormalizeUnicode normalizes a string to NFC (Canonical Decomposition followed by Canonical Composition). This ensures that characters like é are always represented as U+00E9 (precomposed) rather than U+0065 U+0301 (e + combining acute accent). This is critical for RAG applications to ensure consistent embeddings.

Types ¶

type CIDFont ¶

// CIDFont represents a CIDFont (Character ID keyed font). It is used as the
// descendant font in Type0 fonts.
//
// DW and W describe horizontal widths; DW2 and W2 are their counterparts for
// vertical writing.
type CIDFont struct {
	BaseFont       string
	Subtype        string // CIDFontType0 or CIDFontType2
	CIDSystemInfo  *CIDSystemInfo
	FontDescriptor *FontDescriptor
	DW             float64           // Default width
	W              []WidthRange      // Width specifications
	DW2            [2]float64        // Default width for vertical writing [w1y w1]
	W2             []VerticalMetrics // Vertical metrics
	CIDToGIDMap    *core.Stream      // CID to GID mapping (for CIDFontType2)
}

CIDFont represents a CIDFont (Character ID keyed font). It is used as the descendant font in Type0 fonts.

func NewCIDFont ¶

func NewCIDFont(fontDict core.Dict, resolver func(core.IndirectRef) (core.Object, error)) (*CIDFont, error)

NewCIDFont creates a CIDFont from a PDF font dictionary

func (*CIDFont) GetCharacterCollection ¶

func (cid *CIDFont) GetCharacterCollection() string

GetCharacterCollection returns a string identifying the character collection

func (*CIDFont) GetWidthForCID ¶

func (cid *CIDFont) GetWidthForCID(cidValue int) float64

GetWidthForCID returns the width for a specific CID

func (*CIDFont) IsCJK ¶

func (cid *CIDFont) IsCJK() bool

IsCJK returns true if this is a CJK (Chinese, Japanese, Korean) font

func (*CIDFont) IsChinese ¶

func (cid *CIDFont) IsChinese() bool

IsChinese returns true if this is a Chinese font

func (*CIDFont) IsJapanese ¶

func (cid *CIDFont) IsJapanese() bool

IsJapanese returns true if this is a Japanese font

func (*CIDFont) IsKorean ¶

func (cid *CIDFont) IsKorean() bool

IsKorean returns true if this is a Korean font

type CIDSystemInfo ¶

// CIDSystemInfo identifies a character collection by registry, ordering and
// supplement number.
type CIDSystemInfo struct {
	Registry   string // e.g., "Adobe"
	Ordering   string // e.g., "Japan1", "GB1", "CNS1", "Korea1"
	Supplement int    // Version of the character collection
}

CIDSystemInfo identifies a character collection

type CMap ¶

// CMap represents a character map that maps character codes to Unicode.
// Its state is unexported; create one with NewCMap or ParseToUnicodeCMap.
type CMap struct {
	// contains filtered or unexported fields
}

CMap represents a character map that maps character codes to Unicode

func NewCMap ¶

func NewCMap() *CMap

NewCMap creates a new empty CMap

func ParseToUnicodeCMap ¶

func ParseToUnicodeCMap(stream *core.Stream) (*CMap, error)

ParseToUnicodeCMap parses a ToUnicode CMap stream

func (*CMap) Lookup ¶

func (cm *CMap) Lookup(charCode uint32) string

Lookup looks up a character code and returns the Unicode string. It returns an empty string if no mapping is found (the caller should handle fallback).

func (*CMap) LookupString ¶

func (cm *CMap) LookupString(data []byte) string

LookupString decodes a string of character codes to Unicode

type CMapRange ¶

// CMapRange represents a range of character code to Unicode mappings.
//
// NOTE(review): presumably codes StartCode..EndCode (inclusive) map to
// consecutive code points beginning at StartUnicode — confirm against the
// CMap parser, which is not shown here.
type CMapRange struct {
	StartCode    uint32
	EndCode      uint32
	StartUnicode uint32
}

CMapRange represents a range of character code to Unicode mappings

type CMapTable ¶

// CMapTable represents a TrueType cmap table. All of its fields are
// unexported.
type CMapTable struct {
	// contains filtered or unexported fields
}

CMapTable represents a TrueType cmap table

type CustomEncoding ¶

// CustomEncoding represents an encoding with custom differences applied to a
// base encoding. It implements the PDF Differences array mechanism, where
// specific character codes are overridden to map to different glyphs.
// Construct one with NewCustomEncoding or NewCustomEncodingFromGlyphs.
type CustomEncoding struct {
	// contains filtered or unexported fields
}

CustomEncoding represents an encoding with custom differences applied to a base encoding. This implements the PDF Differences array mechanism, where specific character codes are overridden to map to different glyphs.

func NewCustomEncoding ¶

func NewCustomEncoding(base Encoding, differences map[byte]rune) *CustomEncoding

NewCustomEncoding creates a custom encoding by applying differences to a base encoding. The differences map specifies byte values that should map to different runes than the base encoding.

func NewCustomEncodingFromGlyphs ¶

func NewCustomEncodingFromGlyphs(base Encoding, differences map[byte]string) *CustomEncoding

NewCustomEncodingFromGlyphs creates a custom encoding using glyph names instead of runes. This matches PDF's Differences array syntax, which uses glyph names.

func (*CustomEncoding) Decode ¶

func (e *CustomEncoding) Decode(b byte) rune

Decode converts a byte to a rune, using the difference if present, otherwise the base encoding

func (*CustomEncoding) DecodeString ¶

func (e *CustomEncoding) DecodeString(data []byte) string

DecodeString converts a byte sequence to a Unicode string using custom mappings

func (*CustomEncoding) Name ¶

func (e *CustomEncoding) Name() string

Name returns the encoding name

type Encoding ¶

// Encoding represents a character encoding that maps byte values to Unicode
// code points.
type Encoding interface {
	// Decode converts a byte value to a Unicode rune
	Decode(b byte) rune

	// DecodeString converts a byte sequence to a Unicode string
	DecodeString(data []byte) string

	// Name returns the encoding name
	Name() string
}

Encoding represents a character encoding that maps byte values to Unicode code points

func GetEncoding ¶

func GetEncoding(name string) Encoding

GetEncoding returns the encoding by name

func InferEncodingFromFontName ¶

func InferEncodingFromFontName(fontName string) Encoding

InferEncodingFromFontName attempts to infer the appropriate encoding from a font name. This is a fallback strategy for when the PDF doesn't specify an encoding or ToUnicode CMap.

type Font ¶

// Font represents a PDF font. It is embedded by the Type0Font, Type1Font and
// TrueTypeFont types.
type Font struct {
	Name     string
	BaseFont string
	Subtype  string
	Encoding string

	// ToUnicode CMap for character code to Unicode mapping
	ToUnicodeCMap *CMap
	// contains filtered or unexported fields
}

Font represents a PDF font

func NewFont ¶

func NewFont(name, baseFont, subtype string) *Font

NewFont creates a new font

func (*Font) DecodeString ¶

func (f *Font) DecodeString(data []byte) string

DecodeString decodes a string of character codes to Unicode. Priority order: (1) use the ToUnicode CMap if present (most accurate); (2) check for a UTF-16 Byte Order Mark (BOM) - FEFF or FFFE; (3) use the font's Encoding property (standard encodings); (4) fall back to raw bytes as string. All decoded strings are normalized to NFC for consistent embeddings.

func (*Font) GetStringWidth ¶

func (f *Font) GetStringWidth(s string) float64

GetStringWidth calculates the total width of a string

func (*Font) GetWidth ¶

func (f *Font) GetWidth(r rune) float64

GetWidth returns the width of a character (in 1000ths of em)

func (*Font) IsStandardFont ¶

func (f *Font) IsStandardFont() bool

IsStandardFont returns true if this is one of the Standard 14 fonts

func (*Font) IsVertical ¶

func (f *Font) IsVertical() bool

IsVertical returns true if this font uses vertical writing mode. Vertical writing is indicated by the Identity-V encoding, commonly used for East Asian languages (Chinese, Japanese, Korean) where text flows top-to-bottom.

type FontDescriptor ¶

// FontDescriptor contains font metrics and properties. It also carries
// optional streams for embedded font programs (FontFile, FontFile2,
// FontFile3).
type FontDescriptor struct {
	FontName     string
	Flags        int
	FontBBox     [4]float64 // [llx lly urx ury]
	ItalicAngle  float64
	Ascent       float64
	Descent      float64
	CapHeight    float64
	StemV        float64
	StemH        float64
	AvgWidth     float64
	MaxWidth     float64
	MissingWidth float64
	FontFile     *core.Stream // Type1 font program
	FontFile2    *core.Stream // TrueType font program
	FontFile3    *core.Stream // Type1C or CIDFont program
}

FontDescriptor contains font metrics and properties

type Metric ¶

// Metric represents a single vertical metric. It is the element type of
// VerticalMetrics.Metrics.
//
// NOTE(review): W1Y and W1 presumably carry the same meaning as the
// like-named fields on VerticalMetrics (position vector y component and
// vertical width) — confirm.
type Metric struct {
	W1Y float64
	W1  float64
}

Metric represents a single vertical metric

type TrueTypeFont ¶

// TrueTypeFont represents a TrueType font in a PDF. TrueType fonts contain
// glyph outlines as quadratic Bézier curves.
type TrueTypeFont struct {
	*Font // Embed basic font

	// TrueType-specific fields
	// NOTE(review): presumably Widths is indexed by (code - FirstChar), per
	// the PDF simple-font convention — confirm against NewTrueTypeFont.
	FirstChar      int
	LastChar       int
	Widths         []float64
	FontDescriptor *FontDescriptor
	ToUnicode      *core.Stream // CMap for character code to Unicode mapping

	// TrueType font program data
	FontProgram []byte            // Raw font program from FontFile2
	Tables      map[string][]byte // Parsed TrueType tables
	// contains filtered or unexported fields
}

TrueTypeFont represents a TrueType font in a PDF. TrueType fonts contain glyph outlines as quadratic Bézier curves.

func NewTrueTypeFont ¶

func NewTrueTypeFont(fontDict core.Dict, resolver func(core.IndirectRef) (core.Object, error)) (*TrueTypeFont, error)

NewTrueTypeFont creates a TrueType font from a PDF font dictionary

func (*TrueTypeFont) GetGlyphID ¶

func (tt *TrueTypeFont) GetGlyphID(r rune) uint16

GetGlyphID returns the glyph ID for a character

func (*TrueTypeFont) GetWidthFromGlyph ¶

func (tt *TrueTypeFont) GetWidthFromGlyph(glyphID uint16) float64

GetWidthFromGlyph gets the width for a glyph ID

type Type0Font ¶

// Type0Font represents a Type0 (composite) font in a PDF. Type0 fonts are
// used for fonts with large character sets, especially CJK fonts.
//
// NOTE(review): the Encoding and IsVertical fields declared here shadow the
// Encoding field and the IsVertical method promoted from the embedded *Font.
// On a Type0Font value, t0.Encoding and t0.IsVertical refer to these fields;
// the embedded members remain reachable as t0.Font.Encoding and
// t0.Font.IsVertical().
type Type0Font struct {
	*Font // Embed basic font

	// Type0-specific fields
	Encoding       string
	DescendantFont *CIDFont     // The actual CIDFont
	ToUnicode      *core.Stream // CMap for CID to Unicode mapping
	IsVertical     bool         // true for Identity-V, false for Identity-H
}

Type0Font represents a Type0 (composite) font in a PDF. Type0 fonts are used for fonts with large character sets, especially CJK fonts.

func NewType0Font ¶

func NewType0Font(fontDict core.Dict, resolver func(core.IndirectRef) (core.Object, error)) (*Type0Font, error)

NewType0Font creates a Type0 font from a PDF font dictionary

func (*Type0Font) GetWidth ¶

func (t0 *Type0Font) GetWidth(r rune) float64

GetWidth returns the width for a character ID (CID)

type Type1Font ¶

// Type1Font represents a Type1 font in a PDF. Type1 fonts are the original
// PostScript fonts and one of the most common font types in PDFs.
type Type1Font struct {
	*Font // Embed basic font

	// Type1-specific fields
	// NOTE(review): presumably Widths is indexed by (code - FirstChar), per
	// the PDF simple-font convention — confirm against NewType1Font.
	FirstChar      int
	LastChar       int
	Widths         []float64
	FontDescriptor *FontDescriptor
	ToUnicode      *core.Stream // CMap for character code to Unicode mapping
}

Type1Font represents a Type1 font in a PDF. Type1 fonts are the original PostScript fonts and one of the most common font types in PDFs.

func NewType1Font ¶

func NewType1Font(fontDict core.Dict, resolver func(core.IndirectRef) (core.Object, error)) (*Type1Font, error)

NewType1Font creates a Type1 font from a PDF font dictionary

type VerticalMetrics ¶

// VerticalMetrics represents vertical writing metrics in the W2 array.
// An entry either carries one W1Y/W1 pair for the CID range or, when both
// are zero, a list of individual Metrics.
type VerticalMetrics struct {
	StartCID int
	EndCID   int
	W1Y      float64  // Position vector y component
	W1       float64  // Vertical width
	Metrics  []Metric // Individual metrics (if W1Y == 0 && W1 == 0)
}

VerticalMetrics represents vertical writing metrics in the W2 array

type WidthRange ¶

// WidthRange represents a width specification in the W array.
// An entry either carries a single Width for the CID range or, when Width is
// zero, a list of individual Widths.
type WidthRange struct {
	StartCID int
	EndCID   int
	Width    float64   // Single width for range
	Widths   []float64 // Individual widths (if Width == 0)
}

WidthRange represents a width specification in the W array

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL