Documentation
¶
Index ¶
- Variables
- func DecodeUTF16BE(data []byte) string
- func DecodeUTF16LE(data []byte) string
- func DecodeWithEncoding(data []byte, encodingName string) string
- func IsEmojiSequence(s string) bool
- func IsValidUTF8(s string) bool
- func IsVerticalEncoding(encoding string) bool
- func NormalizeEmojiSequence(s string) string
- func NormalizeUnicode(s string) string
- type CIDFont
- type CIDSystemInfo
- type CMap
- type CMapRange
- type CMapTable
- type CustomEncoding
- type Encoding
- type Font
- type FontDescriptor
- type Metric
- type TrueTypeFont
- type Type0Font
- type Type1Font
- type VerticalMetrics
- type WidthRange
Constants ¶
This section is empty.
Variables ¶
var MacRomanEncoding = &standardEncoding{
name: "MacRomanEncoding",
table: macRomanTable,
}
MacRomanEncoding - Classic Mac OS encoding for Western European languages
var PDFDocEncoding = &standardEncoding{
name: "PDFDocEncoding",
table: pdfDocTable,
}
PDFDocEncoding - PDF's default encoding for text strings
var StandardEncodingTable = &standardEncoding{
name: "StandardEncoding",
table: standardEncodingTableData,
}
StandardEncodingTable - Adobe StandardEncoding for Type1 fonts
var SymbolEncoding = &standardEncoding{
name: "SymbolEncoding",
table: symbolEncodingTable,
}
SymbolEncoding - Adobe Symbol font encoding. Maps character codes to Greek letters, mathematical symbols, etc.
var WinAnsiEncoding = &standardEncoding{
name: "WinAnsiEncoding",
table: winAnsiTable,
}
WinAnsiEncoding (Windows Code Page 1252) - Western European encoding. This is the most common encoding in PDFs created on Windows.
var ZapfDingbatsEncoding = &standardEncoding{
name: "ZapfDingbatsEncoding",
table: zapfDingbatsEncodingTable,
}
ZapfDingbatsEncoding - Adobe ZapfDingbats font encoding. Maps character codes to decorative symbols, arrows, etc.
Functions ¶
func DecodeUTF16BE ¶
DecodeUTF16BE decodes UTF-16 Big Endian encoded bytes to a string. Note: the input should NOT include the BOM (FEFF); it should be stripped before calling.
func DecodeUTF16LE ¶
DecodeUTF16LE decodes UTF-16 Little Endian encoded bytes to a string. Note: the input should NOT include the BOM (FFFE); it should be stripped before calling.
func DecodeWithEncoding ¶
DecodeWithEncoding decodes data using the specified encoding and applies Unicode normalization
func IsEmojiSequence ¶
IsEmojiSequence checks if a string contains emoji sequences. Emoji can be multi-codepoint: base + modifiers (skin tone) + ZWJ sequences.
func IsValidUTF8 ¶
IsValidUTF8 checks if a string is valid UTF-8. This is useful for detecting UTF-16BE strings (which will fail UTF-8 validation).
func IsVerticalEncoding ¶
IsVerticalEncoding checks if an encoding name indicates vertical writing mode. Identity-V is used for vertical text in CJK fonts; Identity-H (or any other encoding) is horizontal.
func NormalizeEmojiSequence ¶
NormalizeEmojiSequence normalizes emoji sequences for consistent storage. This handles skin tone modifiers and ZWJ sequences.
func NormalizeUnicode ¶
NormalizeUnicode normalizes a string to NFC (Canonical Decomposition followed by Canonical Composition). This ensures that characters like é are always represented as U+00E9 (precomposed) rather than U+0065 U+0301 (e + combining acute accent). This is critical for RAG applications to ensure consistent embeddings.
Types ¶
type CIDFont ¶
type CIDFont struct {
BaseFont string
Subtype string // CIDFontType0 or CIDFontType2
CIDSystemInfo *CIDSystemInfo
FontDescriptor *FontDescriptor
DW float64 // Default width
W []WidthRange // Width specifications
DW2 [2]float64 // Default metrics for vertical writing [vy w1y]: vertical component of position vector v, then of displacement vector w1
W2 []VerticalMetrics // Vertical metrics
CIDToGIDMap *core.Stream // CID to GID mapping (for CIDFontType2)
}
CIDFont represents a CIDFont (Character ID keyed font). It is used as the descendant font in Type0 fonts.
func NewCIDFont ¶
func NewCIDFont(fontDict core.Dict, resolver func(core.IndirectRef) (core.Object, error)) (*CIDFont, error)
NewCIDFont creates a CIDFont from a PDF font dictionary
func (*CIDFont) GetCharacterCollection ¶
GetCharacterCollection returns a string identifying the character collection
func (*CIDFont) GetWidthForCID ¶
GetWidthForCID returns the width for a specific CID
func (*CIDFont) IsJapanese ¶
IsJapanese returns true if this is a Japanese font
type CIDSystemInfo ¶
type CIDSystemInfo struct {
Registry string // e.g., "Adobe"
Ordering string // e.g., "Japan1", "GB1", "CNS1", "Korea1"
Supplement int // Version of the character collection
}
CIDSystemInfo identifies a character collection
type CMap ¶
type CMap struct {
// contains filtered or unexported fields
}
CMap represents a character map that maps character codes to Unicode
func ParseToUnicodeCMap ¶
ParseToUnicodeCMap parses a ToUnicode CMap stream
func (*CMap) Lookup ¶
Lookup looks up a character code and returns the Unicode string. Returns an empty string if no mapping is found (the caller should handle fallback).
func (*CMap) LookupString ¶
LookupString decodes a string of character codes to Unicode
type CMapTable ¶
type CMapTable struct {
// contains filtered or unexported fields
}
CMapTable represents a TrueType cmap table
type CustomEncoding ¶
type CustomEncoding struct {
// contains filtered or unexported fields
}
CustomEncoding represents an encoding with custom differences applied to a base encoding. This implements the PDF Differences array mechanism, where specific character codes are overridden to map to different glyphs.
func NewCustomEncoding ¶
func NewCustomEncoding(base Encoding, differences map[byte]rune) *CustomEncoding
NewCustomEncoding creates a custom encoding by applying differences to a base encoding. The differences map specifies byte values that should map to different runes than the base encoding.
func NewCustomEncodingFromGlyphs ¶
func NewCustomEncodingFromGlyphs(base Encoding, differences map[byte]string) *CustomEncoding
NewCustomEncodingFromGlyphs creates a custom encoding using glyph names instead of runes. This matches PDF's Differences array syntax, which uses glyph names.
func (*CustomEncoding) Decode ¶
func (e *CustomEncoding) Decode(b byte) rune
Decode converts a byte to a rune, using the difference if present, otherwise the base encoding
func (*CustomEncoding) DecodeString ¶
func (e *CustomEncoding) DecodeString(data []byte) string
DecodeString converts a byte sequence to a Unicode string using custom mappings
type Encoding ¶
type Encoding interface {
// Decode converts a byte value to a Unicode rune
Decode(b byte) rune
// DecodeString converts a byte sequence to a Unicode string
DecodeString(data []byte) string
// Name returns the encoding name
Name() string
}
Encoding represents a character encoding that maps byte values to Unicode code points
func InferEncodingFromFontName ¶
InferEncodingFromFontName attempts to infer the appropriate encoding from a font name. This is a fallback strategy for when the PDF doesn't specify an encoding or a ToUnicode CMap.
type Font ¶
type Font struct {
Name string
BaseFont string
Subtype string
Encoding string
// ToUnicode CMap for character code to Unicode mapping
ToUnicodeCMap *CMap
// contains filtered or unexported fields
}
Font represents a PDF font
func (*Font) DecodeString ¶
DecodeString decodes a string of character codes to Unicode. Priority order: (1) use the ToUnicode CMap if present (most accurate); (2) check for a UTF-16 Byte Order Mark (BOM) - FEFF or FFFE; (3) use the font's Encoding property (standard encodings); (4) fall back to raw bytes as a string. All decoded strings are normalized to NFC for consistent embeddings.
func (*Font) GetStringWidth ¶
GetStringWidth calculates the total width of a string
func (*Font) IsStandardFont ¶
IsStandardFont returns true if this is one of the Standard 14 fonts
func (*Font) IsVertical ¶
IsVertical returns true if this font uses vertical writing mode. Vertical writing is indicated by the Identity-V encoding, commonly used for East Asian languages (Chinese, Japanese, Korean) where text flows top-to-bottom.
type FontDescriptor ¶
type FontDescriptor struct {
FontName string
Flags int
FontBBox [4]float64 // [llx lly urx ury]
ItalicAngle float64
Ascent float64
Descent float64
CapHeight float64
StemV float64
StemH float64
AvgWidth float64
MaxWidth float64
MissingWidth float64
FontFile *core.Stream // Type1 font program
FontFile2 *core.Stream // TrueType font program
FontFile3 *core.Stream // Type1C or CIDFont program
}
FontDescriptor contains font metrics and properties
type TrueTypeFont ¶
type TrueTypeFont struct {
*Font // Embed basic font
// TrueType-specific fields
FirstChar int
LastChar int
Widths []float64
FontDescriptor *FontDescriptor
ToUnicode *core.Stream // CMap for character code to Unicode mapping
// TrueType font program data
FontProgram []byte // Raw font program from FontFile2
Tables map[string][]byte // Parsed TrueType tables
// contains filtered or unexported fields
}
TrueTypeFont represents a TrueType font in a PDF. TrueType fonts contain glyph outlines as quadratic Bézier curves.
func NewTrueTypeFont ¶
func NewTrueTypeFont(fontDict core.Dict, resolver func(core.IndirectRef) (core.Object, error)) (*TrueTypeFont, error)
NewTrueTypeFont creates a TrueType font from a PDF font dictionary
func (*TrueTypeFont) GetGlyphID ¶
func (tt *TrueTypeFont) GetGlyphID(r rune) uint16
GetGlyphID returns the glyph ID for a character
func (*TrueTypeFont) GetWidthFromGlyph ¶
func (tt *TrueTypeFont) GetWidthFromGlyph(glyphID uint16) float64
GetWidthFromGlyph gets the width for a glyph ID
type Type0Font ¶
type Type0Font struct {
*Font // Embed basic font
// Type0-specific fields
Encoding string
DescendantFont *CIDFont // The actual CIDFont
ToUnicode *core.Stream // CMap for CID to Unicode mapping
IsVertical bool // true for Identity-V, false for Identity-H
}
Type0Font represents a Type0 (composite) font in a PDF. Type0 fonts are used for fonts with large character sets, especially CJK fonts.
type Type1Font ¶
type Type1Font struct {
*Font // Embed basic font
// Type1-specific fields
FirstChar int
LastChar int
Widths []float64
FontDescriptor *FontDescriptor
ToUnicode *core.Stream // CMap for character code to Unicode mapping
}
Type1Font represents a Type1 font in a PDF. Type1 fonts are the original PostScript fonts and one of the most common font types in PDFs.