textractor

package module
v0.0.9 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 6, 2024 License: MIT Imports: 14 Imported by: 1

README

📄 go-textractor

Build Status Go Reference goreportcard codecov

Amazon Textract response parser written in Go.

Installation

Use Go modules to include go-textractor in your project:

go get github.com/hupe1980/go-textractor

Usage

package main

import (
	"context"
	"fmt"
	"io"
	"log"
	"os"

	"github.com/aws/aws-sdk-go-v2/config"
	"github.com/aws/aws-sdk-go-v2/service/textract"
	"github.com/aws/aws-sdk-go-v2/service/textract/types"
	"github.com/hupe1980/go-textractor"
)

// main runs the end-to-end example: it reads a local PDF, sends it to the
// Textract AnalyzeDocument API with table and form analysis enabled, parses
// the response with go-textractor, and prints the lines, words, table cells,
// and key-value pairs found on each page. Any failure terminates the program
// via log.Fatal.
func main() {
	file, err := os.Open("examples/analyze_document/testfile.pdf")
	if err != nil {
		log.Fatal(err)
	}

	defer file.Close()

	b, err := io.ReadAll(file)
	if err != nil {
		log.Fatal(err)
	}

	// Fail fast on a broken AWS configuration (missing region, credentials,
	// ...) instead of discarding the error and failing later inside the client.
	cfg, err := config.LoadDefaultConfig(context.Background())
	if err != nil {
		log.Fatal(err)
	}

	client := textract.NewFromConfig(cfg)

	output, err := client.AnalyzeDocument(context.Background(), &textract.AnalyzeDocumentInput{
		Document: &types.Document{
			Bytes: b,
		},
		FeatureTypes: []types.FeatureType{
			types.FeatureTypeTables, types.FeatureTypeForms,
		},
	})
	if err != nil {
		log.Fatal(err)
	}

	// Wrap the raw API response so the parser can build the document model.
	doc, err := textractor.ParseDocumentAPIOutput(&textractor.DocumentAPIOutput{
		DocumentMetadata: output.DocumentMetadata,
		Blocks:           output.Blocks,
	})
	if err != nil {
		log.Fatal(err)
	}

	// Iterate over elements in the document
	for _, p := range doc.Pages() {
		// Print lines and words
		for _, l := range p.Lines() {
			fmt.Printf("Line: %s (%f)\n", l.Text(), l.Confidence())
			for _, w := range l.Words() {
				fmt.Printf("Word: %s (%f)\n", w.Text(), w.Confidence())
			}
		}

		// Print tables
		for _, t := range p.Tables() {
			for r, row := range t.Rows() {
				for c, cell := range row.Cells() {
					fmt.Printf("Table[%d][%d] = %s (%f)\n", r, c, cell.Text(), cell.Confidence())
				}
			}
		}

		// Print key values
		for _, kv := range p.KeyValues() {
			fmt.Printf("Key: %s, Value: %s\n", kv.Key(), kv.Value())
		}
	}
}

For more example usage, see examples.

Table extraction

// Write the first table of the parsed document to table.csv.
f, err := os.Create("table.csv")
if err != nil {
	log.Fatal(err)
}

defer f.Close()

// Tables is a method on Document and must be called before indexing.
// Index 0 assumes the document contains at least one table.
if err := doc.Tables()[0].ToCSV(f); err != nil {
	log.Fatal(err)
}

Contributing

Contributions are welcome! Feel free to open an issue or submit a pull request for any improvements or new features you would like to see.

References

License

This project is licensed under the MIT License. See the LICENSE file for details.

Documentation

Index

Constants

This section is empty.

Variables

View Source
// DefaultLinerizationOptions is the package-level TextLinearizationOptions
// value holding the default separators, prefixes/suffixes, selection and
// signature tokens, heuristic thresholds, and identity callbacks listed below.
// NOTE(review): presumably this is the baseline that the Text(optFns ...)
// methods start from before applying option functions — confirm against the
// linearization code.
var DefaultLinerizationOptions = TextLinearizationOptions{
	MaxNumberOfConsecutiveNewLines: 2,
	HideHeaderLayout:               false,
	HideFooterLayout:               false,
	HideFigureLayout:               false,
	HidePageNumberLayout:           false,
	PageNumberPrefix:               "",
	PageNumberSuffix:               "",
	OnLinerizedPageNumber:          func(pn string) string { return pn },
	SameParagraphSeparator:         " ",
	LayoutElementSeparator:         "\n\n",
	ListElementSeparator:           "\n",
	ListLayoutPrefix:               "",
	ListLayoutSuffix:               "",
	ListElementPrefix:              "",
	ListElementSuffix:              "",
	RemoveNewLinesInListElements:   true,
	TitlePrefix:                    "",
	TitleSuffix:                    "",
	OnLinerizedTitle:               func(t string) string { return t },
	TableLayoutPrefix:              "\n\n",
	TableLayoutSuffix:              "\n",
	TableLinearizationFormat:       "plaintext",
	TableMinTableWords:             0,
	TableColumnSeparator:           "\t",
	TablePrefix:                    "",
	TableSuffix:                    "",
	TableRowSeparator:              "\n",
	TableRowPrefix:                 "",
	TableRowSuffix:                 "",
	TableCellPrefix:                "",
	TableCellSuffix:                "",
	SectionHeaderPrefix:            "",
	SectionHeaderSuffix:            "",
	OnLinerizedSectionHeader:       func(sh string) string { return sh },
	KeyValueLayoutPrefix:           "\n\n",
	KeyValueLayoutSuffix:           "",
	KeyValuePrefix:                 "",
	KeyValueSuffix:                 "",
	KeyPrefix:                      "",
	KeySuffix:                      "",
	ValuePrefix:                    "",
	ValueSuffix:                    "",
	SelectionElementSelected:       "[X]",
	SelectionElementNotSelected:    "[ ]",
	HeuristicHTolerance:            0.3,
	HeuristicOverlapRatio:          0.5,
	SignatureToken:                 "[SIGNATURE]",
}

Functions

This section is empty.

Types

type AnalyzeExpenseOutput added in v0.0.3

type AnalyzeExpenseOutput struct {
	DocumentMetadata *types.DocumentMetadata `json:"DocumentMetadata"`
	ExpenseDocuments []types.ExpenseDocument `json:"ExpenseDocuments"`
}

AnalyzeExpenseOutput represents the output of the Textract Analyze Expense API.

type AnalyzeIDOutput added in v0.0.4

type AnalyzeIDOutput struct {
	DocumentMetadata  *types.DocumentMetadata  `json:"DocumentMetadata"`
	IdentityDocuments []types.IdentityDocument `json:"IdentityDocuments"`
}

AnalyzeIDOutput represents the output of the Textract Analyze ID API.

type BoundingBox

type BoundingBox struct {
	// contains filtered or unexported fields
}

func NewEnclosingBoundingBox added in v0.0.4

func NewEnclosingBoundingBox[T BoundingBoxAccessor](accessors ...T) *BoundingBox

NewEnclosingBoundingBox returns a new bounding box that represents the union of multiple bounding boxes.

func (*BoundingBox) Area added in v0.0.4

func (bb *BoundingBox) Area() float64

Area calculates and returns the area of the bounding box. If either the width or height of the bounding box is less than zero, the area is considered zero to prevent negative area values.

func (*BoundingBox) Bottom

func (bb *BoundingBox) Bottom() float64

Bottom returns the bottom coordinate of the bounding box.

func (*BoundingBox) Height

func (bb *BoundingBox) Height() float64

func (*BoundingBox) HorizontalCenter

func (bb *BoundingBox) HorizontalCenter() float64

HorizontalCenter returns the horizontal center coordinate of the bounding box.

func (*BoundingBox) Intersection

func (bb *BoundingBox) Intersection(other *BoundingBox) *BoundingBox

Intersection returns a new bounding box that represents the intersection of two bounding boxes.

func (*BoundingBox) Left

func (bb *BoundingBox) Left() float64

func (*BoundingBox) Right

func (bb *BoundingBox) Right() float64

Right returns the right coordinate of the bounding box.

func (*BoundingBox) String added in v0.0.2

func (bb *BoundingBox) String() string

String returns a string representation of the bounding box.

func (*BoundingBox) Top

func (bb *BoundingBox) Top() float64

func (*BoundingBox) VerticalCenter

func (bb *BoundingBox) VerticalCenter() float64

VerticalCenter returns the vertical center coordinate of the bounding box.

func (*BoundingBox) Width

func (bb *BoundingBox) Width() float64

type BoundingBoxAccessor added in v0.0.4

type BoundingBoxAccessor interface {
	BoundingBox() *BoundingBox
}

type Cell

type Cell interface {
	Words() []*Word
	Text(optFns ...func(*TextLinearizationOptions)) string
	Confidence() float64
	OCRConfidence() *OCRConfidence
	IsColumnHeader() bool
	IsTableTitle() bool
	IsTableFooter() bool
	IsTableSummary() bool
	IsTableSectionTitle() bool
	IsMerged() bool
}

Cell defines the interface for a table cell in Textract.

type CellAtOptions added in v0.0.7

type CellAtOptions struct {
	IgnoreMergedCells bool
}

type Document

type Document struct {
	// contains filtered or unexported fields
}

Document represents a document consisting of multiple pages.

func ParseDocumentAPIOutput added in v0.0.4

func ParseDocumentAPIOutput(output *DocumentAPIOutput) (*Document, error)

ParseDocumentAPIOutput parses the Textract Document API output into a Document.

func (*Document) KeyValues added in v0.0.4

func (d *Document) KeyValues() []*KeyValue

KeyValues returns a slice containing all the key-value pairs in the document.

func (*Document) Lines added in v0.0.4

func (d *Document) Lines() []*Line

Lines returns a slice containing all the lines in the document.

func (*Document) Pages

func (d *Document) Pages() []*Page

Pages returns the slice of Page objects in the document.

func (*Document) Signatures added in v0.0.4

func (d *Document) Signatures() []*Signature

Signatures returns a slice containing all the signatures in the document.

func (*Document) Tables added in v0.0.4

func (d *Document) Tables() []*Table

Tables returns a slice containing all the tables in the document.

func (*Document) Text added in v0.0.4

func (d *Document) Text(optFns ...func(*TextLinearizationOptions)) string

Text linearizes the document into a single text string, optionally applying specified options.

func (*Document) Words added in v0.0.4

func (d *Document) Words() []*Word

Words returns a slice containing all the words in the document.

type DocumentAPIOutput added in v0.0.4

type DocumentAPIOutput struct {
	DocumentMetadata *types.DocumentMetadata `json:"DocumentMetadata"`
	Blocks           []types.Block           `json:"Blocks"`
}

DocumentAPIOutput represents the output of the Textract Document API.

type ExpenseDocument added in v0.0.6

type ExpenseDocument struct {
	// contains filtered or unexported fields
}

func ParseAnalyzeExpenseOutput added in v0.0.6

func ParseAnalyzeExpenseOutput(output *AnalyzeExpenseOutput) ([]*ExpenseDocument, error)

ParseAnalyzeExpenseOutput parses the Textract Analyze Expense API output into a slice of ExpenseDocument.

func (*ExpenseDocument) SummaryFields added in v0.0.7

func (ed *ExpenseDocument) SummaryFields() []*ExpenseField

type ExpenseField added in v0.0.7

type ExpenseField struct{}

type IdentityDocument added in v0.0.2

type IdentityDocument struct {
	// contains filtered or unexported fields
}

func ParseAnalyzeIDOutput added in v0.0.4

func ParseAnalyzeIDOutput(output *AnalyzeIDOutput) ([]*IdentityDocument, error)

ParseAnalyzeIDOutput parses the Textract Analyze ID API output into a slice of IdentityDocument.

func (*IdentityDocument) Document added in v0.0.4

func (id *IdentityDocument) Document() *Document

func (*IdentityDocument) FieldByType added in v0.0.3

func (*IdentityDocument) Fields added in v0.0.3

func (id *IdentityDocument) Fields() []*IdentityDocumentField

func (*IdentityDocument) IdentityDocumentType added in v0.0.4

func (id *IdentityDocument) IdentityDocumentType() IdentityDocumentType

type IdentityDocumentField added in v0.0.3

type IdentityDocumentField struct {
	// contains filtered or unexported fields
}

IdentityDocumentField represents a field extracted from an identity document by Textract.

func (*IdentityDocumentField) Confidence added in v0.0.3

func (idf *IdentityDocumentField) Confidence() float64

Confidence returns the confidence score associated with the field extraction.

func (*IdentityDocumentField) FieldType added in v0.0.4

FieldType returns the type of the identity document field.

func (*IdentityDocumentField) IsNormalized added in v0.0.3

func (idf *IdentityDocumentField) IsNormalized() bool

IsNormalized checks if the field value is normalized.

func (*IdentityDocumentField) NormalizedValue added in v0.0.3

NormalizedValue returns the normalized value of the identity document field.

func (*IdentityDocumentField) Value added in v0.0.3

func (idf *IdentityDocumentField) Value() string

Value returns the value of the identity document field.

type IdentityDocumentFieldType added in v0.0.3

type IdentityDocumentFieldType string

IdentityDocumentFieldType represents the type of fields in an identity document.

// Known IdentityDocumentFieldType values.
//
// NOTE(review): IdentityDocumentFieldTypeSuffix ("Suffix") and
// IdentityDocumentFieldTypeOther ("Other") are the only values that are not
// upper snake case. If Textract reports these field types as "SUFFIX" and
// "OTHER", comparisons against these constants will never match — verify
// against the Analyze ID response format.
const (
	IdentityDocumentFieldTypeFirstName        IdentityDocumentFieldType = "FIRST_NAME"
	IdentityDocumentFieldTypeLastName         IdentityDocumentFieldType = "LAST_NAME"
	IdentityDocumentFieldTypeMiddleName       IdentityDocumentFieldType = "MIDDLE_NAME"
	IdentityDocumentFieldTypeSuffix           IdentityDocumentFieldType = "Suffix"
	IdentityDocumentFieldTypeCityInAddress    IdentityDocumentFieldType = "CITY_IN_ADDRESS"
	IdentityDocumentFieldTypeZipCodeInAddress IdentityDocumentFieldType = "ZIP_CODE_IN_ADDRESS"
	IdentityDocumentFieldTypeStateInAddress   IdentityDocumentFieldType = "STATE_IN_ADDRESS"
	IdentityDocumentFieldTypeStateName        IdentityDocumentFieldType = "STATE_NAME"
	IdentityDocumentFieldTypeDocumentNumber   IdentityDocumentFieldType = "DOCUMENT_NUMBER"
	IdentityDocumentFieldTypeExpirationDate   IdentityDocumentFieldType = "EXPIRATION_DATE"
	IdentityDocumentFieldTypeDateOfBirth      IdentityDocumentFieldType = "DATE_OF_BIRTH"
	IdentityDocumentFieldTypeDateOfIssue      IdentityDocumentFieldType = "DATE_OF_ISSUE"
	IdentityDocumentFieldTypeIDType           IdentityDocumentFieldType = "ID_TYPE"
	IdentityDocumentFieldTypeEndorsements     IdentityDocumentFieldType = "ENDORSEMENTS"
	IdentityDocumentFieldTypeVeteran          IdentityDocumentFieldType = "VETERAN"
	IdentityDocumentFieldTypeRestrictions     IdentityDocumentFieldType = "RESTRICTIONS"
	IdentityDocumentFieldTypeClass            IdentityDocumentFieldType = "CLASS"
	IdentityDocumentFieldTypeAddress          IdentityDocumentFieldType = "ADDRESS"
	IdentityDocumentFieldTypeCounty           IdentityDocumentFieldType = "COUNTY"
	IdentityDocumentFieldTypePlaceOfBirth     IdentityDocumentFieldType = "PLACE_OF_BIRTH"
	IdentityDocumentFieldTypeMRZCode          IdentityDocumentFieldType = "MRZ_CODE"
	IdentityDocumentFieldTypeOther            IdentityDocumentFieldType = "Other"
)

type IdentityDocumentType added in v0.0.3

type IdentityDocumentType string

IdentityDocumentType represents the type of an identity document.

// Known IdentityDocumentType values. Note that the driver license value is
// space-separated ("DRIVER LICENSE FRONT"), not snake case.
const (
	IdentityDocumentTypeDriverLicenseFront IdentityDocumentType = "DRIVER LICENSE FRONT"
	IdentityDocumentTypePassport           IdentityDocumentType = "PASSPORT"
	IdentityDocumentTypeOther              IdentityDocumentType = "OTHER"
)

type Key added in v0.0.4

type Key struct {
	// contains filtered or unexported fields
}

Key represents the key part of a key-value pair.

func (*Key) BlockType added in v0.0.4

func (b *Key) BlockType() types.BlockType

BlockType returns the type of the block.

func (*Key) BoundingBox added in v0.0.4

func (b *Key) BoundingBox() *BoundingBox

BoundingBox returns the bounding box information of the block.

func (*Key) Confidence added in v0.0.4

func (b *Key) Confidence() float64

Confidence returns the confidence of the block.

func (*Key) ID added in v0.0.4

func (b *Key) ID() string

ID returns the identifier of the block.

func (*Key) OCRConfidence added in v0.0.8

func (k *Key) OCRConfidence() *OCRConfidence

OCRConfidence returns the OCR confidence for the key.

func (*Key) PageNumber added in v0.0.4

func (b *Key) PageNumber() int

PageNumber returns the page number associated with the block.

func (*Key) Polygon added in v0.0.4

func (b *Key) Polygon() Polygon

Polygon returns the polygon information of the block.

func (*Key) Raw added in v0.0.4

func (b *Key) Raw() types.Block

Raw returns the raw block data.

func (*Key) String added in v0.0.4

func (k *Key) String() string

String returns the string representation of the key.

func (*Key) Text added in v0.0.4

func (k *Key) Text() string

Text returns the text content of the key.

func (*Key) Words added in v0.0.4

func (k *Key) Words() []*Word

Words returns the words in the key.

type KeyValue added in v0.0.4

type KeyValue struct {
	// contains filtered or unexported fields
}

KeyValue represents a key-value pair in a document.

func (*KeyValue) BlockType added in v0.0.4

func (b *KeyValue) BlockType() types.BlockType

BlockType returns the type of the block.

func (*KeyValue) BoundingBox added in v0.0.4

func (kv *KeyValue) BoundingBox() *BoundingBox

BoundingBox returns the bounding box that encloses the key-value pair.

func (*KeyValue) Confidence added in v0.0.4

func (kv *KeyValue) Confidence() float64

Confidence calculates the confidence score for the key-value pair.

func (*KeyValue) ID added in v0.0.4

func (b *KeyValue) ID() string

ID returns the identifier of the block.

func (*KeyValue) Key added in v0.0.4

func (kv *KeyValue) Key() *Key

Key returns the key of the key-value pair.

func (*KeyValue) OCRConfidence added in v0.0.8

func (kv *KeyValue) OCRConfidence() *OCRConfidence

OCRConfidence returns the OCR confidence for the key-value pair.

func (*KeyValue) PageNumber added in v0.0.4

func (b *KeyValue) PageNumber() int

PageNumber returns the page number associated with the block.

func (*KeyValue) Polygon added in v0.0.4

func (kv *KeyValue) Polygon() Polygon

Polygon returns the polygon representing the key-value pair.

func (*KeyValue) Raw added in v0.0.4

func (b *KeyValue) Raw() types.Block

Raw returns the raw block data.

func (*KeyValue) String added in v0.0.7

func (kv *KeyValue) String() string

String returns the string representation of the key-value pair.

func (*KeyValue) Text added in v0.0.5

func (kv *KeyValue) Text(optFns ...func(*TextLinearizationOptions)) string

Text returns the text content of the key-value pair.

func (*KeyValue) Value added in v0.0.4

func (kv *KeyValue) Value() *Value

Value returns the value of the key-value pair.

func (*KeyValue) Words added in v0.0.4

func (kv *KeyValue) Words() []*Word

Words returns the words in the key-value pair.

type Layout added in v0.0.4

type Layout struct {
	// contains filtered or unexported fields
}

func (*Layout) AddChildren added in v0.0.4

func (l *Layout) AddChildren(children ...LayoutChild)

func (*Layout) BlockType added in v0.0.4

func (b *Layout) BlockType() types.BlockType

BlockType returns the type of the block.

func (*Layout) BoundingBox added in v0.0.4

func (b *Layout) BoundingBox() *BoundingBox

BoundingBox returns the bounding box information of the block.

func (*Layout) Confidence added in v0.0.4

func (b *Layout) Confidence() float64

Confidence returns the confidence of the block.

func (*Layout) ID added in v0.0.4

func (b *Layout) ID() string

ID returns the identifier of the block.

func (*Layout) PageNumber added in v0.0.4

func (b *Layout) PageNumber() int

PageNumber returns the page number associated with the block.

func (*Layout) Polygon added in v0.0.4

func (b *Layout) Polygon() Polygon

Polygon returns the polygon information of the block.

func (*Layout) Raw added in v0.0.4

func (b *Layout) Raw() types.Block

Raw returns the raw block data.

func (*Layout) Text added in v0.0.4

func (l *Layout) Text(optFns ...func(*TextLinearizationOptions)) string

type LayoutChild added in v0.0.4

type LayoutChild interface {
	ID() string
	Text(optFns ...func(*TextLinearizationOptions)) string
	BoundingBox() *BoundingBox
}

type Line

type Line struct {
	// contains filtered or unexported fields
}

func (*Line) BlockType added in v0.0.4

func (b *Line) BlockType() types.BlockType

BlockType returns the type of the block.

func (*Line) BoundingBox added in v0.0.4

func (b *Line) BoundingBox() *BoundingBox

BoundingBox returns the bounding box information of the block.

func (*Line) Confidence

func (b *Line) Confidence() float64

Confidence returns the confidence of the block.

func (*Line) ID

func (b *Line) ID() string

ID returns the identifier of the block.

func (*Line) PageNumber added in v0.0.4

func (b *Line) PageNumber() int

PageNumber returns the page number associated with the block.

func (*Line) Polygon added in v0.0.4

func (b *Line) Polygon() Polygon

Polygon returns the polygon information of the block.

func (*Line) Raw added in v0.0.4

func (b *Line) Raw() types.Block

Raw returns the raw block data.

func (*Line) String added in v0.0.7

func (l *Line) String() string

func (*Line) Text

func (l *Line) Text(_ ...func(*TextLinearizationOptions)) string

func (*Line) Words

func (l *Line) Words() []*Word

type NormalizedIdentityDocumentFieldValue added in v0.0.4

type NormalizedIdentityDocumentFieldValue struct {
	// contains filtered or unexported fields
}

NormalizedIdentityDocumentFieldValue represents a normalized value of an identity document field.

func (NormalizedIdentityDocumentFieldValue) DateValue added in v0.0.4

func (nidfv NormalizedIdentityDocumentFieldValue) DateValue() (time.Time, error)

DateValue returns the time representation of the normalized date value.

func (NormalizedIdentityDocumentFieldValue) Value added in v0.0.4

Value returns the string representation of the normalized value.

func (NormalizedIdentityDocumentFieldValue) ValueType added in v0.0.4

ValueType returns the type of the normalized value.

type OCRConfidence added in v0.0.2

type OCRConfidence struct {
	// contains filtered or unexported fields
}

OCRConfidence represents the confidence scores (mean, max, min) from OCR processing.

func (*OCRConfidence) Max added in v0.0.2

func (ocr *OCRConfidence) Max() float64

Max returns the maximum confidence score.

func (*OCRConfidence) Mean added in v0.0.2

func (ocr *OCRConfidence) Mean() float64

Mean returns the mean (average) confidence score.

func (*OCRConfidence) Min added in v0.0.2

func (ocr *OCRConfidence) Min() float64

Min returns the minimum confidence score.

type OnLinerizedPageNumber added in v0.0.9

type OnLinerizedPageNumber func(pn string) string

OnLinerizedPageNumber is a callback function to customize the processing of page numbers during linearization.

type OnLinerizedSectionHeader added in v0.0.9

type OnLinerizedSectionHeader func(sh string) string

OnLinerizedSectionHeader is a callback function to customize the processing of section headers during linearization.

type OnLinerizedTitle added in v0.0.9

type OnLinerizedTitle func(t string) string

OnLinerizedTitle is a callback function to customize the processing of titles during linearization.

type Orientation

type Orientation struct {
	// contains filtered or unexported fields
}

Orientation represents the orientation of a geometric element.

func (*Orientation) Degrees

func (o *Orientation) Degrees() float64

Degrees returns the orientation in degrees.

func (*Orientation) Radians

func (o *Orientation) Radians() float64

Radians returns the orientation in radians.

type Page

type Page struct {
	// contains filtered or unexported fields
}

func (*Page) AddLayouts added in v0.0.4

func (p *Page) AddLayouts(layouts ...*Layout)

func (*Page) Height added in v0.0.4

func (p *Page) Height() float64

func (*Page) ID

func (p *Page) ID() string

func (*Page) KeyValues added in v0.0.4

func (p *Page) KeyValues() []*KeyValue

func (*Page) Layouts added in v0.0.4

func (p *Page) Layouts() []*Layout

func (*Page) Lines

func (p *Page) Lines() []*Line

func (*Page) Number added in v0.0.4

func (p *Page) Number() int

func (*Page) Queries added in v0.0.2

func (p *Page) Queries() []*Query

func (*Page) SearchValueByKey added in v0.0.4

func (p *Page) SearchValueByKey(key string) []*KeyValue

func (*Page) Signatures added in v0.0.2

func (p *Page) Signatures() []*Signature

func (*Page) Tables

func (p *Page) Tables() []*Table

func (*Page) Text

func (p *Page) Text(optFns ...func(*TextLinearizationOptions)) string

func (*Page) Width added in v0.0.4

func (p *Page) Width() float64

func (*Page) Words added in v0.0.4

func (p *Page) Words() []*Word

type Point

type Point struct {
	// contains filtered or unexported fields
}

Point represents a 2D point.

func (*Point) String added in v0.0.2

func (p *Point) String() string

String returns a string representation of the Point, including its X and Y coordinates.

func (*Point) X

func (p *Point) X() float64

X returns the X coordinate of the point.

func (*Point) Y

func (p *Point) Y() float64

Y returns the Y coordinate of the point.

type Polygon added in v0.0.7

type Polygon []*Point

func (Polygon) String added in v0.0.7

func (p Polygon) String() string

type Query added in v0.0.2

type Query struct {
	// contains filtered or unexported fields
}

Query represents a query with associated information, including an identifier, text, alias, query pages, results, a page, and raw block data.

func (*Query) Alias added in v0.0.2

func (q *Query) Alias() string

Alias returns the alias for the query.

func (*Query) HasResult added in v0.0.4

func (q *Query) HasResult() bool

func (*Query) ResultsByConfidence added in v0.0.2

func (q *Query) ResultsByConfidence() []*QueryResult

ResultsByConfidence lists this query instance's results, sorted from most to least confident.

func (*Query) Text added in v0.0.2

func (q *Query) Text() string

Text returns the text associated with the query.

func (*Query) TopResult added in v0.0.2

func (q *Query) TopResult() *QueryResult

TopResult retrieves the top result by confidence score, if any are available.

type QueryResult added in v0.0.2

type QueryResult struct {
	// contains filtered or unexported fields
}

QueryResult represents the result of a parsed query.

func (*QueryResult) BlockType added in v0.0.4

func (b *QueryResult) BlockType() types.BlockType

BlockType returns the type of the block.

func (*QueryResult) BoundingBox added in v0.0.4

func (b *QueryResult) BoundingBox() *BoundingBox

BoundingBox returns the bounding box information of the block.

func (*QueryResult) Confidence added in v0.0.2

func (b *QueryResult) Confidence() float64

Confidence returns the confidence of the block.

func (*QueryResult) ID added in v0.0.2

func (b *QueryResult) ID() string

ID returns the identifier of the block.

func (*QueryResult) PageNumber added in v0.0.4

func (b *QueryResult) PageNumber() int

PageNumber returns the page number associated with the block.

func (*QueryResult) Polygon added in v0.0.4

func (b *QueryResult) Polygon() Polygon

Polygon returns the polygon information of the block.

func (*QueryResult) Raw added in v0.0.4

func (b *QueryResult) Raw() types.Block

Raw returns the raw block data.

func (*QueryResult) Text added in v0.0.2

func (qr *QueryResult) Text() string

Text returns the extracted text from the query result.

type RowCellsAtOptions added in v0.0.7

type RowCellsAtOptions struct {
	IgnoreMergedCells bool
}

type RowsOptions added in v0.0.7

type RowsOptions struct {
	IgnoreMergedCells bool
}

type SelectionElement

type SelectionElement struct {
	// contains filtered or unexported fields
}

SelectionElement represents an element with selection status.

func (*SelectionElement) BlockType added in v0.0.4

func (b *SelectionElement) BlockType() types.BlockType

BlockType returns the type of the block.

func (*SelectionElement) BoundingBox added in v0.0.4

func (b *SelectionElement) BoundingBox() *BoundingBox

BoundingBox returns the bounding box information of the block.

func (*SelectionElement) Confidence

func (b *SelectionElement) Confidence() float64

Confidence returns the confidence of the block.

func (*SelectionElement) ID

func (b *SelectionElement) ID() string

ID returns the identifier of the block.

func (*SelectionElement) IsSelected

func (se *SelectionElement) IsSelected() bool

IsSelected checks if the element is selected.

func (*SelectionElement) PageNumber added in v0.0.4

func (b *SelectionElement) PageNumber() int

PageNumber returns the page number associated with the block.

func (*SelectionElement) Polygon added in v0.0.4

func (b *SelectionElement) Polygon() Polygon

Polygon returns the polygon information of the block.

func (*SelectionElement) Raw added in v0.0.4

func (b *SelectionElement) Raw() types.Block

Raw returns the raw block data.

func (*SelectionElement) Status

func (se *SelectionElement) Status() types.SelectionStatus

Status returns the selection status of the element.

func (*SelectionElement) String added in v0.0.7

func (se *SelectionElement) String() string

func (*SelectionElement) Text added in v0.0.5

func (se *SelectionElement) Text(optFns ...func(*TextLinearizationOptions)) string

Text returns the text representation of the selection element. It considers the selection status and applies linearization options.

type Signature added in v0.0.2

type Signature struct {
	// contains filtered or unexported fields
}

func (*Signature) BlockType added in v0.0.4

func (b *Signature) BlockType() types.BlockType

BlockType returns the type of the block.

func (*Signature) BoundingBox added in v0.0.4

func (b *Signature) BoundingBox() *BoundingBox

BoundingBox returns the bounding box information of the block.

func (*Signature) Confidence added in v0.0.2

func (b *Signature) Confidence() float64

Confidence returns the confidence of the block.

func (*Signature) ID added in v0.0.2

func (b *Signature) ID() string

ID returns the identifier of the block.

func (*Signature) PageNumber added in v0.0.4

func (b *Signature) PageNumber() int

PageNumber returns the page number associated with the block.

func (*Signature) Polygon added in v0.0.4

func (b *Signature) Polygon() Polygon

Polygon returns the polygon information of the block.

func (*Signature) Raw added in v0.0.4

func (b *Signature) Raw() types.Block

Raw returns the raw block data.

func (*Signature) Text added in v0.0.4

func (s *Signature) Text(optFns ...func(*TextLinearizationOptions)) string

func (*Signature) Words added in v0.0.4

func (s *Signature) Words() []*Word

type Table

type Table struct {
	// contains filtered or unexported fields
}

func (*Table) BlockType added in v0.0.4

func (b *Table) BlockType() types.BlockType

BlockType returns the type of the block.

func (*Table) BoundingBox added in v0.0.4

func (b *Table) BoundingBox() *BoundingBox

BoundingBox returns the bounding box information of the block.

func (*Table) CellAt

func (t *Table) CellAt(rowIndex, columnIndex int, optFns ...func(*CellAtOptions)) Cell

func (*Table) Confidence

func (b *Table) Confidence() float64

Confidence returns the confidence of the block.

func (*Table) ID

func (b *Table) ID() string

ID returns the identifier of the block.

func (*Table) PageNumber added in v0.0.4

func (b *Table) PageNumber() int

PageNumber returns the page number associated with the block.

func (*Table) Polygon added in v0.0.4

func (b *Table) Polygon() Polygon

Polygon returns the polygon information of the block.

func (*Table) Raw added in v0.0.4

func (b *Table) Raw() types.Block

Raw returns the raw block data.

func (*Table) RowCellsAt added in v0.0.6

func (t *Table) RowCellsAt(rowIndex int, optFns ...func(*RowCellsAtOptions)) []Cell

func (*Table) RowCount

func (t *Table) RowCount() int

func (*Table) Rows

func (t *Table) Rows(optFns ...func(*RowsOptions)) []*TableRow

func (*Table) Text added in v0.0.5

func (t *Table) Text(optFns ...func(*TextLinearizationOptions)) string

func (*Table) ToCSV added in v0.0.8

func (t *Table) ToCSV(w io.Writer) error

func (*Table) Words added in v0.0.5

func (t *Table) Words() []*Word

type TableCell added in v0.0.4

type TableCell struct {
	// contains filtered or unexported fields
}

TableCell represents a cell in a table.

func (*TableCell) IsColumnHeader added in v0.0.4

func (c *TableCell) IsColumnHeader() bool

IsColumnHeader checks if the cell is a column header.

func (*TableCell) IsMerged added in v0.0.7

func (c *TableCell) IsMerged() bool

IsMerged checks if the cell is part of a merged group.

func (*TableCell) IsTableFooter added in v0.0.4

func (c *TableCell) IsTableFooter() bool

IsTableFooter checks if the cell is a table footer.

func (*TableCell) IsTableSectionTitle added in v0.0.4

func (c *TableCell) IsTableSectionTitle() bool

IsTableSectionTitle checks if the cell is a table section title.

func (*TableCell) IsTableSummary added in v0.0.4

func (c *TableCell) IsTableSummary() bool

IsTableSummary checks if the cell is a table summary.

func (*TableCell) IsTableTitle added in v0.0.4

func (c *TableCell) IsTableTitle() bool

IsTableTitle checks if the cell is a table title.

func (*TableCell) OCRConfidence added in v0.0.9

func (tc *TableCell) OCRConfidence() *OCRConfidence

OCRConfidence returns the OCR confidence for the table cell.

func (*TableCell) SelectionElement added in v0.0.7

func (tc *TableCell) SelectionElement() *SelectionElement

SelectionElement returns the selection element associated with the table cell.

func (*TableCell) Text added in v0.0.4

func (tc *TableCell) Text(optFns ...func(*TextLinearizationOptions)) string

Text returns the text content of the table cell.

func (*TableCell) Words added in v0.0.5

func (tc *TableCell) Words() []*Word

Words returns the words in the table cell.

type TableFooter added in v0.0.4

type TableFooter struct {
	// contains filtered or unexported fields
}

TableFooter represents the footer of a table block.

func (*TableFooter) BlockType added in v0.0.4

func (b *TableFooter) BlockType() types.BlockType

BlockType returns the type of the block.

func (*TableFooter) BoundingBox added in v0.0.4

func (b *TableFooter) BoundingBox() *BoundingBox

BoundingBox returns the bounding box information of the block.

func (*TableFooter) Confidence added in v0.0.4

func (b *TableFooter) Confidence() float64

Confidence returns the confidence of the block.

func (*TableFooter) ID added in v0.0.4

func (b *TableFooter) ID() string

ID returns the identifier of the block.

func (*TableFooter) PageNumber added in v0.0.4

func (b *TableFooter) PageNumber() int

PageNumber returns the page number associated with the block.

func (*TableFooter) Polygon added in v0.0.4

func (b *TableFooter) Polygon() Polygon

Polygon returns the polygon information of the block.

func (*TableFooter) Raw added in v0.0.4

func (b *TableFooter) Raw() types.Block

Raw returns the raw block data.

func (*TableFooter) Text added in v0.0.4

func (tf *TableFooter) Text(_ ...func(*TextLinearizationOptions)) string

Text returns the concatenated text of all words in the table footer.

func (*TableFooter) Words added in v0.0.6

func (tf *TableFooter) Words() []*Word

Words returns the words within the table footer.

type TableMergedCell added in v0.0.6

type TableMergedCell struct {
	// contains filtered or unexported fields
}

TableMergedCell represents a merged cell in a table.

func (*TableMergedCell) IsColumnHeader added in v0.0.6

func (c *TableMergedCell) IsColumnHeader() bool

IsColumnHeader checks if the cell is a column header.

func (*TableMergedCell) IsMerged added in v0.0.7

func (c *TableMergedCell) IsMerged() bool

IsMerged checks if the cell is part of a merged group.

func (*TableMergedCell) IsTableFooter added in v0.0.6

func (c *TableMergedCell) IsTableFooter() bool

IsTableFooter checks if the cell is a table footer.

func (*TableMergedCell) IsTableSectionTitle added in v0.0.6

func (c *TableMergedCell) IsTableSectionTitle() bool

IsTableSectionTitle checks if the cell is a table section title.

func (*TableMergedCell) IsTableSummary added in v0.0.6

func (c *TableMergedCell) IsTableSummary() bool

IsTableSummary checks if the cell is a table summary.

func (*TableMergedCell) IsTableTitle added in v0.0.6

func (c *TableMergedCell) IsTableTitle() bool

IsTableTitle checks if the cell is a table title.

func (*TableMergedCell) OCRConfidence added in v0.0.9

func (tmc *TableMergedCell) OCRConfidence() *OCRConfidence

OCRConfidence returns the OCR confidence for the merged cell.

func (*TableMergedCell) Text added in v0.0.6

func (tmc *TableMergedCell) Text(_ ...func(*TextLinearizationOptions)) string

Text returns the text content of the merged cell.

func (*TableMergedCell) Words added in v0.0.6

func (tmc *TableMergedCell) Words() []*Word

Words returns the words in the merged cell.

type TableRow added in v0.0.4

type TableRow struct {
	// contains filtered or unexported fields
}

func (*TableRow) Cells added in v0.0.4

func (tr *TableRow) Cells() []Cell

func (*TableRow) OCRConfidence added in v0.0.9

func (tr *TableRow) OCRConfidence() *OCRConfidence

OCRConfidence returns the OCR confidence for the table row.

type TableTitle added in v0.0.4

type TableTitle struct {
	// contains filtered or unexported fields
}

TableTitle represents the title of a table, containing a collection of words.

func (*TableTitle) BlockType added in v0.0.4

func (b *TableTitle) BlockType() types.BlockType

BlockType returns the type of the block.

func (*TableTitle) BoundingBox added in v0.0.4

func (b *TableTitle) BoundingBox() *BoundingBox

BoundingBox returns the bounding box information of the block.

func (*TableTitle) Confidence added in v0.0.4

func (b *TableTitle) Confidence() float64

Confidence returns the confidence of the block.

func (*TableTitle) ID added in v0.0.4

func (b *TableTitle) ID() string

ID returns the identifier of the block.

func (*TableTitle) PageNumber added in v0.0.4

func (b *TableTitle) PageNumber() int

PageNumber returns the page number associated with the block.

func (*TableTitle) Polygon added in v0.0.4

func (b *TableTitle) Polygon() Polygon

Polygon returns the polygon information of the block.

func (*TableTitle) Raw added in v0.0.4

func (b *TableTitle) Raw() types.Block

Raw returns the raw block data.

func (*TableTitle) Text added in v0.0.4

func (tt *TableTitle) Text(_ ...func(*TextLinearizationOptions)) string

Text returns the concatenated text of the words in the table title. The variadic linearization options are accepted for interface compatibility but are ignored.

func (*TableTitle) Words added in v0.0.6

func (tt *TableTitle) Words() []*Word

Words returns the words constituting the table title.

type TextLinearizationOptions added in v0.0.4

type TextLinearizationOptions struct {
	// MaxNumberOfConsecutiveNewLines sets the maximum number of consecutive new lines to keep, removing extra whitespace.
	MaxNumberOfConsecutiveNewLines int

	// HideHeaderLayout hides headers in the linearized output.
	HideHeaderLayout bool

	// HideFooterLayout hides footers in the linearized output.
	HideFooterLayout bool

	// HideFigureLayout hides figures in the linearized output.
	HideFigureLayout bool

	// HidePageNumberLayout hides page numbers in the linearized output.
	HidePageNumberLayout bool

	// PageNumberPrefix is the prefix for page number layout elements.
	PageNumberPrefix string

	// PageNumberSuffix is the suffix for page number layout elements.
	PageNumberSuffix string

	// OnLinerizedPageNumber is a callback function for customizing page number processing.
	OnLinerizedPageNumber OnLinerizedPageNumber

	// SameParagraphSeparator is the separator to use when combining elements within a text block.
	SameParagraphSeparator string

	// LayoutElementSeparator is the separator to use when combining linearized layout elements.
	LayoutElementSeparator string

	// ListElementSeparator is the separator for elements in a list layout.
	ListElementSeparator string

	// ListLayoutPrefix is the prefix for list layout elements (parent).
	ListLayoutPrefix string

	// ListLayoutSuffix is the suffix for list layout elements (parent).
	ListLayoutSuffix string

	// ListElementPrefix is the prefix for elements in a list layout (children).
	ListElementPrefix string

	// ListElementSuffix is the suffix for elements in a list layout (children).
	ListElementSuffix string

	// RemoveNewLinesInListElements removes new lines in list elements.
	RemoveNewLinesInListElements bool

	// TitlePrefix is the prefix for title layout elements.
	TitlePrefix string

	// TitleSuffix is the suffix for title layout elements.
	TitleSuffix string

	// OnLinerizedTitle is a callback function for customizing title processing.
	OnLinerizedTitle OnLinerizedTitle

	// TableLayoutPrefix is the prefix for table elements.
	TableLayoutPrefix string

	// TableLayoutSuffix is the suffix for table elements.
	TableLayoutSuffix string

	// TableLinearizationFormat sets how to represent tables in the linearized output. Choices are plaintext or markdown.
	TableLinearizationFormat string

	// TableMinTableWords is the threshold below which tables will be rendered as words instead of using table layout.
	TableMinTableWords int

	// TableColumnSeparator is the column separator used when linearizing layout tables. It is not used if AnalyzeDocument was called with the TABLES feature.
	TableColumnSeparator string

	// TablePrefix is the prefix for table layout.
	TablePrefix string

	// TableSuffix is the suffix for table layout.
	TableSuffix string

	// TableRowSeparator is the table row separator.
	TableRowSeparator string

	// TableRowPrefix is the prefix for table row.
	TableRowPrefix string

	// TableRowSuffix is the suffix for table row.
	TableRowSuffix string

	// TableCellPrefix is the prefix for table cell.
	TableCellPrefix string

	// TableCellSuffix is the suffix for table cell.
	TableCellSuffix string

	// SectionHeaderPrefix is the prefix for section header layout elements.
	SectionHeaderPrefix string

	// SectionHeaderSuffix is the suffix for section header layout elements.
	SectionHeaderSuffix string

	// OnLinerizedSectionHeader is a callback function for customizing section header processing.
	OnLinerizedSectionHeader OnLinerizedSectionHeader

	// KeyValueLayoutPrefix is the prefix for key_value layout elements (not for individual key-value elements).
	KeyValueLayoutPrefix string

	// KeyValueLayoutSuffix is the suffix for key_value layout elements (not for individual key-value elements).
	KeyValueLayoutSuffix string

	// KeyValuePrefix is the prefix for key-value elements.
	KeyValuePrefix string

	// KeyValueSuffix is the suffix for key-value elements.
	KeyValueSuffix string

	// KeyPrefix is the prefix for key elements.
	KeyPrefix string

	// KeySuffix is the suffix for key elements.
	KeySuffix string

	// ValuePrefix is the prefix for value elements.
	ValuePrefix string

	// ValueSuffix is the suffix for value elements.
	ValueSuffix string

	// SelectionElementSelected is the representation for selection elements when selected.
	SelectionElementSelected string

	// SelectionElementNotSelected is the representation for selection elements when not selected.
	SelectionElementNotSelected string

	// HeuristicHTolerance sets how much the line below and above the current line should differ in width to be separated.
	HeuristicHTolerance float64

	// HeuristicOverlapRatio sets how much vertical overlap is tolerated between two subsequent lines before merging them into a single line.
	HeuristicOverlapRatio float64

	// SignatureToken is the signature representation in the linearized text.
	SignatureToken string
}

TextLinearizationOptions defines how a document is linearized into a text string.

type Value added in v0.0.4

type Value struct {
	// contains filtered or unexported fields
}

Value represents the value part of a key-value pair.

func (*Value) BlockType added in v0.0.4

func (b *Value) BlockType() types.BlockType

BlockType returns the type of the block.

func (*Value) BoundingBox added in v0.0.4

func (b *Value) BoundingBox() *BoundingBox

BoundingBox returns the bounding box information of the block.

func (*Value) Confidence added in v0.0.4

func (b *Value) Confidence() float64

Confidence returns the confidence of the block.

func (*Value) ID added in v0.0.4

func (b *Value) ID() string

ID returns the identifier of the block.

func (*Value) OCRConfidence added in v0.0.8

func (v *Value) OCRConfidence() *OCRConfidence

OCRConfidence returns the OCR confidence for the value.

func (*Value) PageNumber added in v0.0.4

func (b *Value) PageNumber() int

PageNumber returns the page number associated with the block.

func (*Value) Polygon added in v0.0.4

func (b *Value) Polygon() Polygon

Polygon returns the polygon information of the block.

func (*Value) Raw added in v0.0.4

func (b *Value) Raw() types.Block

Raw returns the raw block data.

func (*Value) SelectionElement added in v0.0.7

func (v *Value) SelectionElement() *SelectionElement

SelectionElement returns the selection element associated with the value.

func (*Value) String added in v0.0.4

func (v *Value) String() string

String returns the string representation of the value.

func (*Value) Text added in v0.0.4

func (v *Value) Text(optFns ...func(*TextLinearizationOptions)) string

Text returns the text content of the value.

func (*Value) Words added in v0.0.4

func (v *Value) Words() []*Word

Words returns the words in the value.

type Word

type Word struct {
	// contains filtered or unexported fields
}

Word represents a word extracted by Textract.

func (*Word) BlockType added in v0.0.4

func (b *Word) BlockType() types.BlockType

BlockType returns the type of the block.

func (*Word) BoundingBox added in v0.0.4

func (b *Word) BoundingBox() *BoundingBox

BoundingBox returns the bounding box information of the block.

func (*Word) Confidence

func (b *Word) Confidence() float64

Confidence returns the confidence of the block.

func (*Word) ID

func (b *Word) ID() string

ID returns the identifier of the block.

func (*Word) IsHandwriting

func (w *Word) IsHandwriting() bool

IsHandwriting checks if the word is handwriting.

func (*Word) IsPrinted

func (w *Word) IsPrinted() bool

IsPrinted checks if the word is printed text.

func (*Word) PageNumber added in v0.0.4

func (b *Word) PageNumber() int

PageNumber returns the page number associated with the block.

func (*Word) Polygon added in v0.0.4

func (b *Word) Polygon() Polygon

Polygon returns the polygon information of the block.

func (*Word) Raw added in v0.0.4

func (b *Word) Raw() types.Block

Raw returns the raw block data.

func (*Word) Text

func (w *Word) Text() string

Text returns the text content of the word.

func (*Word) TextType

func (w *Word) TextType() types.TextType

TextType returns the text type of the word.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL