tensorlake

package module
v0.1.1 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Dec 22, 2025 License: Apache-2.0 Imports: 16 Imported by: 0

README

Tensorlake Go SDK

Go Reference

A comprehensive Go SDK for the Tensorlake API, enabling intelligent document processing with parsing, structured data extraction, and page classification capabilities.

Features

  • Document Parsing: Convert PDFs, DOCX, images, and more to structured markdown
  • Data Extraction: Extract structured data using JSON schemas
  • Page Classification: Classify pages by content type
  • File Management: Upload and manage documents
  • Datasets: Reusable parsing configurations for consistent processing
  • SSE Support: Real-time progress updates via Server-Sent Events
  • Iterator Pattern: Easy pagination through results

Installation

go get github.com/sixt/tensorlake-go

Requirements: Go 1.25 or later

Quick Start

1. Initialize the Client
import "github.com/sixt/tensorlake-go"

c := tensorlake.NewClient(
    tensorlake.WithBaseURL("https://api.your-domain.com"),
    tensorlake.WithAPIKey("your-api-key"),
)
2. Upload a File
file, _ := os.Open("document.pdf")
defer file.Close()

uploadResp, _ := c.UploadFile(context.Background(), &tensorlake.UploadFileRequest{
    FileBytes: file,
    FileName:  "document.pdf",
    Labels:    map[string]string{"category": "invoice"},
})

fmt.Printf("File uploaded: %s\n", uploadResp.FileId)
3. Parse the Document
parseJob, _ := c.ParseDocument(context.Background(), &tensorlake.ParseDocumentRequest{
    FileSource: tensorlake.FileSource{
        FileId: uploadResp.FileId,
    },
})

// Get results with real-time updates
result, _ := c.GetParseResult(
    context.Background(),
    parseJob.ParseId,
    tensorlake.WithSSE(true),
    tensorlake.WithOnUpdate(func(name tensorlake.ParseEventName, r *tensorlake.ParseResult) {
        fmt.Printf("Status: %s - %d/%d pages\n", name, r.ParsedPagesCount, r.TotalPages)
    }),
)

// Access parsed content
for _, page := range result.Pages {
    fmt.Printf("Page %d:\n", page.PageNumber)
    // Process page content...
}

Documentation

Core APIs
Comprehensive Examples
Extract Structured Data
import "github.com/google/jsonschema-go/jsonschema"

// Define extraction schema
type InvoiceData struct {
    InvoiceNumber string     `json:"invoice_number"`
    VendorName    string     `json:"vendor_name"`
    TotalAmount   float64    `json:"total_amount"`
    LineItems     []LineItem `json:"line_items"`
}

type LineItem struct {
    Description string  `json:"description"`
    Amount      float64 `json:"amount"`
}

schema, _ := jsonschema.For[InvoiceData](nil)

// Parse with extraction
parseJob, _ := c.ParseDocument(context.Background(), &tensorlake.ParseDocumentRequest{
    FileSource: tensorlake.FileSource{FileId: fileId},
    StructuredExtractionOptions: []tensorlake.StructuredExtractionOptions{
        {
            SchemaName:        "invoice_data",
            JSONSchema:        schema,
            PartitionStrategy: tensorlake.PartitionStrategyNone,
            ProvideCitations:  true,
        },
    },
})

// Retrieve and unmarshal extracted data
result, _ := c.GetParseResult(context.Background(), parseJob.ParseId)
for _, data := range result.StructuredData {
    var extracted map[string]interface{}
    json.Unmarshal(data.Data, &extracted)
    fmt.Printf("Extracted: %+v\n", extracted)
}
Classify Pages
parseJob, err := c.ClassifyDocument(context.Background(), &tensorlake.ClassifyDocumentRequest{
    FileSource: tensorlake.FileSource{FileId: fileId},
    PageClassifications: []tensorlake.PageClassConfig{
        {
            Name:        "signature_page",
            Description: "Pages containing signatures or signature blocks",
        },
        {
            Name:        "terms_and_conditions",
            Description: "Pages with legal terms and conditions",
        },
    },
})

result, _ := c.GetParseResult(context.Background(), parseJob.ParseId)
for _, pageClass := range result.PageClasses {
    fmt.Printf("Class '%s' found on pages: %v\n", pageClass.PageClass, pageClass.PageNumbers)
}
Use Datasets for Batch Processing
// Create a reusable dataset
dataset, err := c.CreateDataset(context.Background(), &tensorlake.CreateDatasetRequest{
    Name:        "invoice-processing",
    Description: "Standard invoice parsing configuration",
    ParsingOptions: &tensorlake.ParsingOptions{
        TableOutputMode: tensorlake.TableOutputModeMarkdown,
    },
    StructuredExtractionOptions: []tensorlake.StructuredExtractionOptions{
        {
            SchemaName: "invoice",
            JSONSchema: schema,
        },
    },
})

// Process multiple files with the same configuration
fileIds := []string{"file_001", "file_002", "file_003"}
for _, fileId := range fileIds {
    parseJob, err := c.ParseDataset(context.Background(), &tensorlake.ParseDatasetRequest{
        DatasetId:  dataset.DatasetId,
        FileSource: tensorlake.FileSource{FileId: fileId},
    })
    // Process results...
}

Advanced Features

Server-Sent Events (SSE)

Get real-time progress updates for long-running parse jobs:

result, err := c.GetParseResult(
    ctx,
    parseId,
    tensorlake.WithSSE(true),
    tensorlake.WithOnUpdate(func(name tensorlake.ParseEventName, r *tensorlake.ParseResult) {
        switch name {
        case tensorlake.SSEEventParseQueued:
            fmt.Println("Job queued")
        case tensorlake.SSEEventParseUpdate:
            fmt.Printf("Progress: %d/%d pages\n", r.ParsedPagesCount, r.TotalPages)
        case tensorlake.SSEEventParseDone:
            fmt.Println("Complete!")
        case tensorlake.SSEEventParseFailed:
            fmt.Printf("Failed: %s\n", r.Error)
        }
    }),
)
Iterator Pattern

Easily iterate through paginated results:

// Iterate all files
for file, err := range c.IterFiles(ctx, 50) {
    if err != nil {
        panic(err)
    }
    fmt.Printf("File: %s\n", file.FileName)
}

// Iterate all parse jobs
for job, err := range c.IterParseJobs(ctx, 50) {
    if err != nil {
        panic(err)
    }
    fmt.Printf("Job %s: Status: %s\n", job.ParseId, job.Status)
}

// Iterate all datasets
for dataset, err := range c.IterDatasets(ctx, 50) {
    if err != nil {
        panic(err)
    }
    fmt.Printf("Dataset %s: Name: %s, Status: %s\n", dataset.DatasetId, dataset.Name, dataset.Status)
}

Supported File Types

  • Documents: PDF, DOCX
  • Spreadsheets: XLS, XLSX, XLSM, CSV
  • Presentations: PPTX, Apple Keynote
  • Images: PNG, JPG, JPEG
  • Text: Plain text, HTML

Maximum file size: 1 GB

Error Handling

All API methods return structured errors:

result, err := c.ParseDocument(ctx, request)
if err != nil {
    var apiErr *tensorlake.ErrorResponse
    if errors.As(err, &apiErr) {
        fmt.Printf("API Error: %s (Code: %s)\n", apiErr.Message, apiErr.Code)
        // Handle specific error codes
    } else {
        fmt.Printf("Network/Client Error: %v\n", err)
    }
}

Best Practices

  1. Reuse Datasets - Create datasets for frequently processed document types
  2. Use SSE - Enable SSE for large documents to track progress
  3. Batch Processing - Process similar documents with the same dataset configuration
  4. Error Handling - Always check error responses and handle retries appropriately
  5. Labels - Use labels to organize and filter files and parse jobs
  6. Iterators - Use iterator methods for efficient pagination through large result sets

Contributing

Contributions are welcome! Please feel free to submit issues or pull requests.

License

Copyright 2025 SIXT SE. Licensed under the Apache License, Version 2.0. See LICENSE for details.

Sixt logo

Documentation

Overview

Package tensorlake provides a Go SDK for the Tensorlake API.

Tensorlake enables document parsing, structured data extraction, and page classification for various document formats including PDF, DOCX, PPTX, images, and more.

Getting Started

Create a client with your API key:

c := tensorlake.NewClient(
	tensorlake.WithBaseURL("https://api.your-domain.com"),
	tensorlake.WithAPIKey("your-api-key"),
)

Uploading a File

Upload a file to the project:

file, err := os.Open("path/to/your/file.pdf")
if err != nil {
	log.Fatal(err)
}
defer file.Close()

r, err := c.UploadFile(context.Background(), &tensorlake.UploadFileRequest{
	FileBytes: file,
	FileName:  "your-file.pdf",
	Labels:    map[string]string{"category": "label-1", "subcategory": "label-2"},
})
if err != nil {
	log.Fatal(err)
}

Parsing a Document

Parse an uploaded file and retrieve the results:

// Start parsing using the file ID from upload
parseJob, err := c.ParseDocument(context.Background(), &tensorlake.ParseDocumentRequest{
	FileSource: tensorlake.FileSource{
		FileId: r.FileId,
	},
	Labels: map[string]string{"type": "invoice"},
})
if err != nil {
	log.Fatal(err)
}

// Retrieve parse results with streaming updates
result, err := c.GetParseResult(context.Background(), parseJob.ParseId,
	tensorlake.WithSSE(true),
	tensorlake.WithOnUpdate(func(eventName tensorlake.ParseEventName, r *tensorlake.ParseResult) {
		log.Printf("Parse status: %s", eventName)
	}),
)
if err != nil {
	log.Fatal(err)
}

// Access the parsed content
for _, page := range result.Pages {
	log.Printf("Page %d: %s", page.PageNumber, page.Markdown)
}

Index

Constants

View Source
const (
	// EndpointEU is the European endpoint.
	EndpointEU string = "https://api.eu.tensorlake.ai/documents/v2"
	// EndpointUS is the United States endpoint.
	EndpointUS string = "https://api.tensorlake.ai/documents/v2"
)

Variables

This section is empty.

Functions

This section is empty.

Types

type Chunk

type Chunk struct {
	Content    string `json:"content"`
	PageNumber int    `json:"page_number"` // >= 0
}

Chunk represents a chunk of the document.

type ChunkingStrategy

type ChunkingStrategy string

ChunkingStrategy determines how the document is chunked into smaller pieces.

Every text block, image, table, etc. is considered a fragment.

const (
	// ChunkingStrategyNone: No chunking is applied.
	ChunkingStrategyNone ChunkingStrategy = "none"
	// ChunkingStrategyPage: The document is chunked by page.
	ChunkingStrategyPage ChunkingStrategy = "page"
	// ChunkingStrategySection: The document is chunked into sections.
	// Title and section headers are used as chunking markers.
	ChunkingStrategySection ChunkingStrategy = "section"
	// ChunkingStrategyFragment: Each page element is converted into markdown form.
	ChunkingStrategyFragment ChunkingStrategy = "fragment"
)

type ClassifyDocumentRequest

type ClassifyDocumentRequest struct {
	FileSource
	PageClassifications []PageClassConfig `json:"page_classifications"`
	PageRange           string            `json:"page_range,omitempty"`
	MimeType            string            `json:"mime_type,omitempty"`
	Labels              map[string]string `json:"labels,omitempty"`
}

ClassifyDocumentRequest holds options for classifying a document.

type Client

type Client struct {
	// contains filtered or unexported fields
}

Client is a Tensorlake API client.

func NewClient

func NewClient(opts ...Option) *Client

NewClient creates a new Tensorlake API client.

func (*Client) ClassifyDocument

func (c *Client) ClassifyDocument(ctx context.Context, in *ClassifyDocumentRequest) (*ParseJob, error)

ClassifyDocument submits a document for page classification.

See also: Classify Document API Reference

func (*Client) CreateDataset

func (c *Client) CreateDataset(ctx context.Context, in *CreateDatasetRequest) (*CreateDatasetResponse, error)

CreateDataset creates a new dataset.

See also: Create Dataset API Reference

func (*Client) DeleteDataset

func (c *Client) DeleteDataset(ctx context.Context, datasetId string) error

DeleteDataset deletes a dataset from Tensorlake.

See also: Delete Dataset API Reference

func (*Client) DeleteFile

func (c *Client) DeleteFile(ctx context.Context, fileId string) error

DeleteFile deletes a file from Tensorlake Cloud.

See also: Delete File API Reference

func (*Client) DeleteParseJob

func (c *Client) DeleteParseJob(ctx context.Context, parseId string) error

DeleteParseJob deletes a previously submitted parse job. This will remove the parse job and its associated settings from the system. Deleting a parse job does not delete the original file used for parsing, nor does it affect any other parse jobs that may have been created from the same file.

See also: Delete Parse Job API Reference

func (*Client) ExtractDocument

func (c *Client) ExtractDocument(ctx context.Context, in *ExtractDocumentRequest) (*ParseJob, error)

ExtractDocument submits a document for structured data extraction.

See also: Extract Document API Reference

func (*Client) GetDataset

func (c *Client) GetDataset(ctx context.Context, datasetId string) (*Dataset, error)

GetDataset retrieves details for a specific dataset.

See also: Get Dataset API Reference

func (*Client) GetFileMetadata

func (c *Client) GetFileMetadata(ctx context.Context, fileId string) (*FileInfo, error)

GetFileMetadata retrieves metadata for a specific file.

See also: Get File Metadata API Reference

func (*Client) GetParseResult

func (c *Client) GetParseResult(ctx context.Context, parseId string, opts ...GetParseResultOption) (*ParseResult, error)

GetParseResult retrieves the result of a parse job. The response will include: 1) parsed content (markdown or pages); 2) structured extraction results (if schemas are provided during the parse request); 3) page classification results (if page classifications are provided during the parse request).

When the job finishes successfully, the response will contain pages (the parsed content of each page), chunks (text chunks extracted from the document), and structured data (one entry for every schema_name provided in the parse request as a key).

See also: Get Parse Result API Reference

func (*Client) IterDatasetData

func (c *Client) IterDatasetData(ctx context.Context, datasetId string, batchSize int) iter.Seq2[ParseResult, error]

IterDatasetData iterates over all dataset data in the organization.

func (*Client) IterDatasets

func (c *Client) IterDatasets(ctx context.Context, batchSize int) iter.Seq2[Dataset, error]

IterDatasets iterates over all datasets in the organization.

func (*Client) IterFiles

func (c *Client) IterFiles(ctx context.Context, batchSize int) iter.Seq2[FileInfo, error]

IterFiles iterates over all files in the project.

func (*Client) IterParseJobs

func (c *Client) IterParseJobs(ctx context.Context, batchSize int) iter.Seq2[ParseResult, error]

IterParseJobs iterates over all parse jobs in the project.

func (*Client) ListDatasetData

ListDatasetData lists all the parse jobs associated with a specific dataset. This endpoint allows you to retrieve the status and metadata of each parse job that has been submitted under the specified dataset.

See also: List Dataset Data API Reference

func (*Client) ListDatasets

func (c *Client) ListDatasets(ctx context.Context, in *ListDatasetsRequest) (*PaginationResult[Dataset], error)

ListDatasets lists all datasets in the organization.

See also: List Datasets API Reference

func (*Client) ListFiles

ListFiles lists files in the Tensorlake project.

This operation allows you to list every file that has been uploaded to the Project specified by the API key used in the request. The response will include metadata about each file, such as the file ID, name, size, and type. We use cursor-based pagination to return the files in pages. A page has the following fields:

  • Items: An array of file metadata, each containing the fields described below.
  • HasMore: A boolean indicating whether there are more files available beyond the current page.
  • NextCursor: A base64-encoded cursor for the next page of results. If HasMore is false, this field will be null.
  • PrevCursor: A base64-encoded cursor for the previous page of results. If this is the first page, this field will be null.

See also: List Files API Reference

func (*Client) ListParseJobs

ListParseJobs lists parse jobs in the Tensorlake project.

See also: List Parse Jobs API Reference

func (*Client) ParseDataset

func (c *Client) ParseDataset(ctx context.Context, in *ParseDatasetRequest) (*ParseJob, error)

ParseDataset parses a document using a dataset's configuration.

See also: Parse Dataset API Reference

func (*Client) ParseDocument

func (c *Client) ParseDocument(ctx context.Context, in *ParseDocumentRequest) (*ParseJob, error)

ParseDocument submits a document for comprehensive parsing (read, extract, and classify).

See also: Parse Document API Reference

func (*Client) ReadDocument

func (c *Client) ReadDocument(ctx context.Context, in *ReadDocumentRequest) (*ParseJob, error)

ReadDocument submits an uploaded file, an internet-reachable URL, or any kind of raw text for document parsing. If you have configured a webhook, we will notify you when the job is complete, be it a success or a failure. The API will convert the document into markdown, and provide document layout information. Once submitted, the API will return a parse response with a parse_id field. You can query the status and results of the parse operation with the Get Parse Result endpoint.

See also: Read Document API Reference

func (*Client) UpdateDataset

func (c *Client) UpdateDataset(ctx context.Context, in *UpdateDatasetRequest) (*Dataset, error)

UpdateDataset updates a dataset's settings.

See also: Update Dataset API Reference

func (*Client) UploadFile

func (c *Client) UploadFile(ctx context.Context, in *UploadFileRequest) (*FileUploadResponse, error)

UploadFile uploads a file to Tensorlake Cloud.

The file will be associated with the project specified by the API key used in the request.

The file can be of any of the following types: - PDF - Word (DOCX) - Spreadsheets (XLS, XLSX, XLSM, CSV) - Presentations (PPTX, Apple Keynote) - Images (PNG, JPG, JPEG) - Raw text (plain text, HTML)

The file type is automatically detected based on Content-Type header. In case the Content-Type header is not provided, the file extension will be used to infer the type. If the file type cannot be determined, it will default to application/octet-stream.

We only keep one copy of the file, so uploading the same file multiple times will return the same file_id.

Labels

Labels can be added to the file to help categorize the parse jobs associated with it. Labels are key-value pairs that can be used to filter and organize files. These should be provided in the labels text field in the multipart form data. Labels are optional, but they can be very useful for organizing and managing parse jobs.

Limits

There is an upload limit of 1 GB per file.

See also: Upload File API Reference

type CreateDatasetRequest

type CreateDatasetRequest struct {
	// The name of the dataset.
	//
	// The name can only contain alphanumeric characters, hyphens, and
	// underscores.
	//
	// The name must be unique within the organization and project context.
	//
	// Example:
	// "invoices-dataset"
	Name string `json:"name"`

	// 	A description of the dataset.
	//
	// This field is optional and can be used to provide additional context
	// about the dataset.
	//
	// Example:
	// "This dataset contains all invoices from 2023."
	Description string `json:"description,omitempty"`

	// The properties of this object define the configuration for the document
	// parsing process.
	//
	// Tensorlake provides sane defaults that work well for most
	// documents, so this object is not required. However, every document
	// is different, and you may want to customize the parsing process to
	// better suit your needs.
	ParsingOptions *ParsingOptions `json:"parsing_options,omitempty"`

	// The properties of this object define the configuration for structured
	// data extraction.
	//
	// If this object is present, the API will perform structured data
	// extraction on the document.
	StructuredExtractionOptions []StructuredExtractionOptions `json:"structured_extraction_options,omitempty"`

	// The properties of this object define the configuration for page
	// classify.
	//
	// If this object is present, the API will perform page classify on
	// the document.
	PageClassifications []PageClassConfig `json:"page_classifications,omitempty"`

	// The properties of this object help to extend the output of the document
	// parsing process with additional information.
	//
	// This includes summarization of tables and figures, which can help to
	// provide a more comprehensive understanding of the document.
	//
	// This object is not required, and the API will use default settings if it
	// is not present.
	EnrichmentOptions *EnrichmentOptions `json:"enrichment_options,omitempty"`
}

CreateDatasetRequest holds options for creating a dataset.

type CreateDatasetResponse

type CreateDatasetResponse struct {
	// Name is the name of the dataset.
	Name string `json:"name"`
	// DatasetId is the ID of the created dataset.
	DatasetId string `json:"dataset_id"`
	// CreatedAt is the creation date and time of the dataset.
	CreatedAt string `json:"created_at"`
}

CreateDatasetResponse represents the response from creating a dataset.

type Dataset

type Dataset struct {
	Name        string        `json:"name"`
	DatasetId   string        `json:"dataset_id"`
	Description string        `json:"description,omitempty"`
	Status      DatasetStatus `json:"status"`
	CreatedAt   string        `json:"created_at"`
	UpdatedAt   string        `json:"updated_at"`
}

Dataset represents a dataset.

type DatasetStatus

type DatasetStatus string
const (
	DatasetStatusIdle       DatasetStatus = "idle"
	DatasetStatusProcessing DatasetStatus = "processing"
)

type EnrichmentOptions

type EnrichmentOptions struct {
	// FigureSummarization enables summary generation for parsed figures.
	// The default is false.
	FigureSummarization bool `json:"figure_summarization,omitempty"`

	// FigureSummarizationPrompt is the prompt to guide the figure summarization.
	// If not provided, a default prompt will be used. It is not required to provide a prompt.
	// The prompt only has effect if [FigureSummarization] is set to `true`.
	FigureSummarizationPrompt string `json:"figure_summarization_prompt,omitempty"`

	// TableSummarization enables summary generation for parsed tables.
	// The default is false.
	TableSummarization bool `json:"table_summarization,omitempty"`

	// TableSummarizationPrompt is the prompt to guide the table summarization.
	// If not provided, a default prompt will be used. It is not required to provide a prompt.
	// The prompt only has effect if [TableSummarization] is set to `true`.
	TableSummarizationPrompt string `json:"table_summarization_prompt,omitempty"`

	// IncludeFullPageImage includes the full page image in addition to the cropped table and figure images.
	// This provides Language Models context about the table and figure they are summarizing in addition to the cropped images, and could improve the summarization quality.
	// The default is false.
	IncludeFullPageImage bool `json:"include_full_page_image,omitempty"`
}

EnrichmentOptions holds configuration for document enrichment.

type ErrorCode

type ErrorCode string

ErrorCode represents error codes for Document AI API.

These codes are used to identify specific error conditions in the API. They can be used for programmatic handling of errors.

const (
	ErrorCodeQuotaExceeded        ErrorCode = "QUOTA_EXCEEDED"
	ErrorCodeInvalidJSONSchema    ErrorCode = "INVALID_JSON_SCHEMA"
	ErrorCodeInvalidConfiguration ErrorCode = "INVALID_CONFIGURATION"
	ErrorCodeInvalidPageClass     ErrorCode = "INVALID_PAGE_CLASSIFICATION"
	ErrorCodeEntityNotFound       ErrorCode = "ENTITY_NOT_FOUND"
	ErrorCodeEntityAlreadyExists  ErrorCode = "ENTITY_ALREADY_EXISTS"
	ErrorCodeInvalidFile          ErrorCode = "INVALID_FILE"
	ErrorCodeInvalidPageRange     ErrorCode = "INVALID_PAGE_RANGE"
	ErrorCodeInvalidMimeType      ErrorCode = "INVALID_MIME_TYPE"
	ErrorCodeInvalidDatasetName   ErrorCode = "INVALID_DATASET_NAME"
	ErrorCodeInternalError        ErrorCode = "INTERNAL_ERROR"
	ErrorCodeInvalidMultipart     ErrorCode = "INVALID_MULTIPART"
	ErrorCodeMultipartStreamEnd   ErrorCode = "MULTIPART_STREAM_END"
	ErrorCodeInvalidQueryParams   ErrorCode = "INVALID_QUERY_PARAMS"
)

type ErrorResponse

type ErrorResponse struct {
	// Message is a human-readable error message.
	Message string `json:"message"`
	// Code is the error code for programmatic handling.
	Code ErrorCode `json:"code"`
	// TraceId is the trace ID of the error.
	TraceId string `json:"trace_id,omitempty"`
	// Details is the details of the error.
	Details any `json:"details,omitempty"`
}

ErrorResponse represents an error returned by the Tensorlake API.

func (*ErrorResponse) Error

func (e *ErrorResponse) Error() string

type ExtractDocumentRequest

type ExtractDocumentRequest struct {
	FileSource

	StructuredExtractionOptions []StructuredExtractionOptions `json:"structured_extraction_options"`
	PageRange                   string                        `json:"page_range,omitempty"`
	MimeType                    string                        `json:"mime_type,omitempty"`
	Labels                      map[string]string             `json:"labels,omitempty"`
}

ExtractDocumentRequest holds options for extracting structured data from a document.

type FileInfo

type FileInfo struct {
	FileId         string            `json:"file_id"`
	FileName       string            `json:"file_name"`
	MimeType       MimeType          `json:"mime_type"`
	FileSize       int64             `json:"file_size"`
	ChecksumSHA256 string            `json:"checksum_sha256,omitempty"`
	CreatedAt      string            `json:"created_at,omitempty"`
	Labels         map[string]string `json:"labels,omitempty"`
}

FileInfo represents metadata about a file.

type FileSource

type FileSource struct {
	// ID of the file previously uploaded to Tensorlake.
	// Has tensorlake- (V1) or file_ (V2) prefix.
	// Example: "file_abc123xyz"
	FileId string `json:"file_id,omitempty"`
	// External URL of the file to parse. Must be publicly accessible.
	// Examples: "https://pub-226479de18b2493f96b64c6674705dd8.r2.dev/real-estate-purchase-all-signed.pdf"
	FileURL string `json:"file_url,omitempty"`
	// The raw text content to parse.
	// Examples: "This is the document content..."
	RawText string `json:"raw_text,omitempty"`
}

FileSource represents the source of a document (FileId, FileURL, or RawText).

func (*FileSource) SourceProvided

func (fs *FileSource) SourceProvided() bool

SourceProvided checks exactly one source is provided.

type FileUploadResponse

type FileUploadResponse struct {
	// FileId is the ID of the created file.
	// Use this ID to reference the file in parse, datasets, and other operations.
	FileId string `json:"file_id"`

	// CreatedAt is the creation date and time of the file.
	// This is in RFC 3339 format.
	CreatedAt time.Time `json:"created_at"`
}

FileUploadResponse represents the response from uploading a file.

type GetParseResultOption

type GetParseResultOption func(*GetParseResultOptions)

GetParseResultOption is a function that configures the GetParseResultOptions.

func WithOnUpdate

func WithOnUpdate(onUpdate ParseResultUpdateFunc) GetParseResultOption

WithOnUpdate sets the callback function that receives intermediate parse result updates during SSE streaming. It will be called for each SSE event received.

func WithOptions added in v0.1.1

func WithOptions(enable bool) GetParseResultOption

func WithSSE

func WithSSE(enable bool) GetParseResultOption

WithSSE enables Server-Sent Events (SSE) for streaming updates.

type GetParseResultOptions

type GetParseResultOptions struct {
	// contains filtered or unexported fields
}

type JobType added in v0.1.1

type JobType string
const (
	JobTypeParse    JobType = "parse"
	JobTypeRead     JobType = "read"
	JobTypeExtract  JobType = "extract"
	JobTypeClassify JobType = "classify"
	JobTypeLegacy   JobType = "legacy"
	JobTypeDataset  JobType = "dataset"
)

type ListDatasetDataRequest

type ListDatasetDataRequest struct {
	DatasetId string `json:"-"`

	Cursor         string              `json:"cursor,omitempty"`
	Direction      PaginationDirection `json:"direction,omitempty"`
	Limit          int                 `json:"limit,omitempty"`
	Status         ParseStatus         `json:"status,omitempty"`
	ParseId        string              `json:"parse_id,omitempty"`
	FileName       string              `json:"file_name,omitempty"`
	CreatedAfter   string              `json:"created_after,omitempty"`   // RFC3339
	CreatedBefore  string              `json:"created_before,omitempty"`  // RFC3339
	FinishedAfter  string              `json:"finished_after,omitempty"`  // RFC3339
	FinishedBefore string              `json:"finished_before,omitempty"` // RFC3339
}

ListDatasetDataRequest holds options for listing dataset parse jobs.

type ListDatasetsRequest

type ListDatasetsRequest struct {
	Cursor    string
	Direction PaginationDirection
	Limit     int
	Status    DatasetStatus
	Name      string
}

ListDatasetsRequest holds options for listing datasets.

type ListFilesRequest

type ListFilesRequest struct {
	// Cursor is the cursor to use for pagination.
	// This is a base64-encoded string representing a timestamp.
	// It is used to paginate through the results.
	//
	// Optional.
	Cursor string `json:"cursor,omitempty"`

	// Direction of pagination.
	//
	// This can be either next or prev.
	// next means to get the next page of results,
	// while prev means to get the previous page of results.
	//
	// Optional.
	Direction PaginationDirection `json:"direction,omitempty"`

	// Limit is the limits for the number of results to return.
	//
	// This is a positive integer that specifies the maximum number of results
	// to return. If not provided, a default value will be used.
	//
	// Required range: x >= 0.
	Limit int `json:"limit,omitempty"`

	// FileName is the name to filter results by.
	// This is a case-sensitive substring that will be matched against the file names.
	// If provided, only files with names containing this substring will be returned.
	FileName string `json:"file_name,omitempty"`

	// CreatedAfter is the date and time to filter results by.
	// The date should be in RFC 3339 format.
	CreatedAfter string `json:"created_after,omitempty"`

	// CreatedBefore is the date and time to filter results by.
	// The date should be in RFC 3339 format.
	CreatedBefore string `json:"created_before,omitempty"`
}

ListFilesRequest holds options for listing files.

type ListParseJobsRequest

type ListParseJobsRequest struct {
	Cursor         string              `json:"cursor,omitempty"`
	Direction      PaginationDirection `json:"direction,omitempty"`
	DatasetName    string              `json:"dataset_name,omitempty"`
	Limit          int                 `json:"limit,omitempty"`
	FileName       string              `json:"file_name,omitempty"`
	Status         ParseStatus         `json:"status,omitempty"`
	CreatedAfter   string              `json:"created_after,omitempty"`
	CreatedBefore  string              `json:"created_before,omitempty"`
	FinishedAfter  string              `json:"finished_after,omitempty"`
	FinishedBefore string              `json:"finished_before,omitempty"`
}

type MimeType

type MimeType string

MimeType represents supported MIME types for document parsing.

const (
	// MimeTypeTXT represents plain text files.
	MimeTypeTXT MimeType = "text/plain"
	// MimeTypeCSV represents comma-separated values files.
	MimeTypeCSV MimeType = "text/csv"
	// MimeTypeHTML represents HTML files.
	MimeTypeHTML MimeType = "text/html"
	// MimeTypeJPEG represents JPEG image files.
	MimeTypeJPEG MimeType = "image/jpeg"
	// MimeTypePNG represents PNG image files.
	MimeTypePNG MimeType = "image/png"
	// MimeTypePDF represents Portable Document Format files.
	MimeTypePDF MimeType = "application/pdf"
	// MimeTypeDOCX represents Microsoft Word documents.
	MimeTypeDOCX MimeType = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
	// MimeTypePPTX represents Microsoft PowerPoint presentations.
	MimeTypePPTX MimeType = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
	// MimeTypeKEYNOTE represents Apple Keynote presentations.
	MimeTypeKEYNOTE MimeType = "application/vnd.apple.keynote"
	// MimeTypeXLS represents Microsoft Excel spreadsheets (legacy format).
	MimeTypeXLS MimeType = "application/vnd.ms-excel"
	// MimeTypeXLSX represents Microsoft Excel spreadsheets.
	MimeTypeXLSX MimeType = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
	// MimeTypeXLSM represents Microsoft Excel spreadsheets (macros enabled).
	MimeTypeXLSM MimeType = "application/vnd.ms-excel.sheet.macroenabled.12"
)

type ModelProvider

type ModelProvider string

ModelProvider represents the LLM provider to use for structured data extraction.

const (
	// ModelProviderTensorlake represents private models, running on Tensorlake infrastructure.
	ModelProviderTensorlake ModelProvider = "tensorlake"

	// ModelProviderGemini3 represents Google Gemini 3 models.
	ModelProviderGemini3 ModelProvider = "gemini-3"

	// ModelProviderSonnet represents Anthropic Sonnet models.
	ModelProviderSonnet ModelProvider = "sonnet"

	// ModelProviderGPT4oMini represents OpenAI GPT-4o-mini model.
	ModelProviderGPT4oMini ModelProvider = "gpt_4o_mini"
)

type OCRPipelineProvider

type OCRPipelineProvider string

OCRPipelineProvider represents the different models for OCR (Optical Character Recognition).

const (
	// OCRPipelineProviderDefault is the default OCR model (same as model01).
	OCRPipelineProviderDefault OCRPipelineProvider = ""

	// OCRPipelineProviderTensorlake01 is fast but could have lower accuracy on complex tables.
	// It's good for legal documents with footnotes.
	OCRPipelineProviderTensorlake01 OCRPipelineProvider = "model01"

	// OCRPipelineProviderTensorlake02 is slower but could have higher accuracy on complex tables.
	// It's good for financial documents with merged cells.
	OCRPipelineProviderTensorlake02 OCRPipelineProvider = "model02"

	// OCRPipelineProviderTensorlake03 is a compact model delivered to on-premise users.
	// It takes about 2 minutes to startup on Tensorlake's Cloud because it's meant
	// for testing for users who are eventually going to deploy this model on
	// dedicated hardware in their own datacenter.
	OCRPipelineProviderTensorlake03 OCRPipelineProvider = "model03"

	// OCRPipelineProviderGemini3 calls Google Gemini 3 API for OCR processing.
	OCRPipelineProviderGemini3 OCRPipelineProvider = "gemini3"
)

type Option

type Option func(*Client)

Option defines a configuration option for the Client.

func WithAPIKey

func WithAPIKey(key string) Option

WithAPIKey sets the API key to use for the client.

func WithBaseURL

func WithBaseURL(url string) Option

WithBaseURL sets the base URL to use for the client.

func WithHTTPClient

func WithHTTPClient(client *http.Client) Option

WithHTTPClient sets the HTTP client to use for the client.

type Page

type Page struct {
	// Dimensions is a 2-element vector representing the width and height of
	// the page in points.
	Dimensions []int `json:"dimensions,omitempty"`

	// PageDimensions holds the width and height of the page in points.
	PageDimensions PageDimensions `json:"page_dimensions,omitempty"`

	// Vector of text fragments extracted from the page.
	// Each fragment represents a distinct section of text, such as titles,
	// paragraphs, tables, figures, etc.
	PageFragments []PageFragment `json:"page_fragments,omitempty"`

	// 1-indexed page number in the document.
	PageNumber int `json:"page_number"`

	// If the page was classified into a specific class, this field contains
	// the reason for the classification.
	ClassificationReasons string `json:"classification_reasons,omitempty"`
}

Page represents a page in the parsed document.

type PageClass

type PageClass struct {
	// PageClass is the name of the page class given in the parse request.
	// This value should match one of the class names provided in the
	// page_classification_options field of the parse request.
	//
	// Required.
	PageClass string `json:"page_class"`

	// PageNumbers is a list of page numbers (1-indexed) where
	// the page class was detected. Required.
	PageNumbers []int `json:"page_numbers"`

	// ClassificationReasons is a map of classification reasons per page number
	// The key is the page number, and the value is the reason for the classification.
	ClassificationReasons map[int]string `json:"classification_reasons,omitempty"`
}

PageClass extracted from the document.

type PageClassConfig

type PageClassConfig struct {
	// Name is the name of the page class.
	Name string `json:"name"`

	// Description is the description of the page class to guide the model
	// to classify the pages. Describe what the model should look for in
	// the page to classify it.
	Description string `json:"description,omitempty"`
}

type PageDimensions

type PageDimensions struct {
	// Width is the width of the page in points.
	Width int `json:"width"`
	// Height is the height of the page in points.
	Height int `json:"height"`
}

PageDimensions represents the dimensions of a page.

type PageFragment

type PageFragment struct {
	FragmentType PageFragmentType    `json:"fragment_type"`
	Content      PageFragmentContent `json:"content"`
	ReadingOrder int64               `json:"reading_order,omitempty"`
	BoundingBox  map[string]float64  `json:"bbox,omitempty"`
}

PageFragment represents a fragment of a page in the parsed document.

type PageFragmentContent

type PageFragmentContent struct {
	// One of these will be set depending on the JSON input:
	Text      *PageFragmentText      `json:"text,omitempty"`
	Header    *PageFragmentHeader    `json:"header,omitempty"`
	Table     *PageFragmentTable     `json:"table,omitempty"`
	Figure    *PageFragmentFigure    `json:"figure,omitempty"`
	Signature *PageFragmentSignature `json:"signature,omitempty"`
}

type PageFragmentFigure

type PageFragmentFigure struct {
	Content string `json:"content"`
	Summary string `json:"summary,omitempty"`
}

type PageFragmentHeader

type PageFragmentHeader struct {
	Content string `json:"content"`
	Level   int    `json:"level"`
}

type PageFragmentSignature

type PageFragmentSignature struct {
	Content string `json:"content"`
}

type PageFragmentTable

type PageFragmentTable struct {
	Content  string                  `json:"content"`
	Cells    []PageFragmentTableCell `json:"cells"`
	HTML     string                  `json:"html,omitempty"`
	Markdown string                  `json:"markdown,omitempty"`
	Summary  string                  `json:"summary,omitempty"`
}

type PageFragmentTableCell

type PageFragmentTableCell struct {
	Text        string             `json:"text"`
	BoundingBox map[string]float64 `json:"bounding_box"`
}

type PageFragmentText

type PageFragmentText struct {
	Content string `json:"content"`
}

type PageFragmentType

type PageFragmentType string

PageFragmentType represents the type of a page fragment.

const (
	PageFragmentTypeSectionHeader  PageFragmentType = "section_header"
	PageFragmentTypeTitle          PageFragmentType = "title"
	PageFragmentTypeText           PageFragmentType = "text"
	PageFragmentTypeTable          PageFragmentType = "table"
	PageFragmentTypeFigure         PageFragmentType = "figure"
	PageFragmentTypeFormula        PageFragmentType = "formula"
	PageFragmentTypeForm           PageFragmentType = "form"
	PageFragmentTypeKeyValueRegion PageFragmentType = "key_value_region"
	PageFragmentTypeDocumentIndex  PageFragmentType = "document_index"
	PageFragmentTypeListItem       PageFragmentType = "list_item"
	PageFragmentTypeTableCaption   PageFragmentType = "table_caption"
	PageFragmentTypeFigureCaption  PageFragmentType = "figure_caption"
	PageFragmentTypeFormulaCaption PageFragmentType = "formula_caption"
	PageFragmentTypePageFooter     PageFragmentType = "page_footer"
	PageFragmentTypePageHeader     PageFragmentType = "page_header"
	PageFragmentTypePageNumber     PageFragmentType = "page_number"
	PageFragmentTypeSignature      PageFragmentType = "signature"
	PageFragmentTypeStrikethrough  PageFragmentType = "strikethrough"
	PageFragmentTypeBarcode        PageFragmentType = "barcode"
)

type PaginationDirection

type PaginationDirection string
const (
	PaginationDirectionNext PaginationDirection = "next"
	PaginationDirectionPrev PaginationDirection = "prev"
)

type PaginationResult

type PaginationResult[T any] struct {
	Items      []T    `json:"items"`
	HasMore    bool   `json:"has_more"`
	NextCursor string `json:"next_cursor,omitempty"`
	PrevCursor string `json:"prev_cursor,omitempty"`
}

PaginationResult represents the result of a pagination operation.

type ParseDatasetRequest

type ParseDatasetRequest struct {
	DatasetId string `json:"-"`
	FileSource
	PageRange string            `json:"page_range,omitempty"`
	FileName  string            `json:"file_name,omitempty"`
	MimeType  MimeType          `json:"mime_type,omitempty"`
	Labels    map[string]string `json:"labels,omitempty"`
}

ParseDatasetRequest holds options for parsing a document with a dataset.

type ParseDocumentRequest

type ParseDocumentRequest struct {
	FileSource

	// ParsingOptions contains the properties of this object define
	// the configuration for the document parsing process.
	//
	// Tensorlake provides sane defaults that work well for most
	// documents, so this object is not required. However, every document
	// is different, and you may want to customize the parsing process to
	// better suit your needs.
	ParsingOptions *ParsingOptions `json:"parsing_options,omitempty"`

	// The properties of this object help to extend the output of the document
	// parsing process with additional information.
	//
	// This includes summarization of tables and figures, which can help to
	// provide a more comprehensive understanding of the document.
	//
	// This object is not required, and the API will use default settings if it
	// is not present.
	EnrichmentOptions *EnrichmentOptions `json:"enrichment_options,omitempty"`

	// StructuredExtractionOptions is the options for structured data extraction.
	//
	// The properties of this object define the configuration for structured
	// data extraction.
	//
	// If this object is present, the API will perform structured data
	// extraction on the document.
	StructuredExtractionOptions []StructuredExtractionOptions `json:"structured_extraction_options,omitempty"`

	// PageClassificationOptions is the options for page classification.
	//
	// The properties of this object define the configuration for page
	// classification.
	//
	// If this object is present, the API will perform page classification on
	// the document.
	PageClassificationOptions []PageClassConfig `json:"page_classifications,omitempty"`

	// PageRange is a comma-separated list of page numbers or
	// ranges to parse (e.g., '1,2,3-5'). Default: all pages.
	// Examples: "1-5,8,10"
	PageRange string `json:"page_range,omitempty"`

	// Additional metadata to identify the read request. The labels are
	// returned in the read response.
	Labels map[string]string `json:"labels,omitempty"`

	// MimeType is the MIME type of the file. This is used to determine how to process the file.
	MimeType MimeType `json:"mime_type,omitempty"`
}

type ParseEventName

type ParseEventName string

ParseEventName is the name of the SSE event.

const (
	SSEEventParseQueued ParseEventName = "parse_queued"
	SSEEventParseUpdate ParseEventName = "parse_update"
	SSEEventParseDone   ParseEventName = "parse_done"
	SSEEventParseFailed ParseEventName = "parse_failed"
)

The possible SSE events. See also: https://github.com/tensorlakeai/tensorlake/blob/main/src/tensorlake/documentai/_parse.py#L499

type ParseJob

type ParseJob struct {
	// ParseId is the unique identifier for the parse job.
	// This is the ID that can be used to track the status of the parse job.
	// Used in the GET /documents/v2/parse/{parse_id} endpoint to retrieve
	// the status and results of the parse job.
	ParseId string `json:"parse_id"`
	// CreatedAt is the creation date and time of the parse job.
	CreatedAt string `json:"created_at"`
}

ParseJob represents a parse job.

type ParseResult

type ParseResult struct {

	// The unique identifier for the parse job. This is the same value
	// returned from ReadDocument or ParseDocument.
	// Example: "parse_abcd1234"
	ParseId string `json:"parse_id"`

	// The number of pages that were parsed successfully.
	// This is the total number of pages that were successfully parsed
	// in the document. Required range: x >= 0. Example: 5
	ParsedPagesCount int `json:"parsed_pages_count"`

	// The current status of the parse job. This indicates whether the
	// job is pending, in progress, completed, or failed.
	// This can be used to track the progress of the parse operation.
	Status ParseStatus `json:"status"`

	// The date and time when the parse job was created.
	// The date is in RFC 3339 format. This can be used to track when
	// the parse job was initiated. Example: "2023-10-01T12:00:00Z"
	CreatedAt string `json:"created_at"`

	// Error occurred during any part of the parse execution.
	// This is only populated if the parse operation failed.
	Error string `json:"error,omitempty"`

	// The date and time when the parse job was finished.
	// The date is in RFC 3339 format.
	// This can be undefined if the parse job is still in progress or pending.
	FinishedAt string `json:"finished_at,omitempty"`

	// Labels associated with the parse job.
	//
	// These are the key-value, or json, pairs submitted with the parse
	// request.
	//
	// This can be used to categorize or tag the parse job for easier
	// identification and filtering.
	//
	// It can be undefined if no labels were provided in the request.
	Labels map[string]string `json:"labels,omitempty"`

	// TotalPages is the total number of pages in the document that was parsed.
	TotalPages int `json:"total_pages,omitempty"`

	// MessageUpdate is the message update for the parse job.
	MessageUpdate string `json:"message_update,omitempty"`

	// If the parse job was scheduled from a dataset, this field contains
	// the dataset id. This is the identifier used in URLs and API endpoints
	// to refer to the dataset.
	DatasetId string `json:"dataset_id,omitempty"`

	// Chunks of the document.
	//
	// This is a vector of Chunk objects, each containing a chunk of the
	// document.
	//
	// The number of chunks depends on the chunking strategy used during
	// parsing.
	Chunks []Chunk `json:"chunks,omitempty"`

	// List of pages parsed from the document.
	//
	// Each page has a list of fragments, which are detected objects such as
	// tables, text, figures, section headers, etc.
	//
	// We also return the detected text, the structure of the table (if it is a
	// table), and the bounding box of the object.
	Pages []Page `json:"pages"`

	// Page classes extracted from the document.
	//
	// This is a map where the keys are page class names provided in the parse
	// request under the page_classification_options field,
	// and the values are vectors of page numbers (1-indexed) where each page
	// class appears.
	//
	// This is used to categorize pages in the document based on the
	// classification options provided.
	PageClasses []PageClass `json:"page_classes,omitempty"`

	// Structured data extracted from the document.
	//
	// The structured data is a map where the keys are the schema names
	// provided in the parse request, and the values are
	// StructuredData objects containing the structured data extracted from
	// the document.
	//
	// The number of structured data objects depends on the partition strategy
	// None - one structured data object for the entire document.
	// Page - one structured data object for each page.
	StructuredData []StructuredData `json:"structured_data,omitempty"`

	// Options contains the options used for the parse job.
	// Note that this field is not always populated (original sentence was
	// truncated — confirm against the API reference).
	Options *ParseResultOptions `json:"options,omitempty"`

	// Resource usage associated with the parse job.
	//
	// This includes details such as number of pages parsed, tokens used for
	// OCR and extraction, etc.
	//
	// Usage is only populated for successful jobs.
	//
	// Billing is based on the resource usage.
	Usage Usage `json:"usage"`
}

ParseResult represents the result of a parse job.

type ParseResultOptions added in v0.1.1

type ParseResultOptions struct {
	FileSource
	FileName      string            `json:"file_name"`
	FileLabels    map[string]string `json:"file_labels"`
	MimeType      MimeType          `json:"mime_type"`
	TraceId       string            `json:"trace_id"`
	PageRange     string            `json:"page_range"`
	JobType       JobType           `json:"job_type"`
	Configuration *ParsingOptions   `json:"configuration"`
	Usage         *Usage            `json:"usage,omitempty"`
	MessageUpdate string            `json:"message_update,omitempty"`
}

ParseResultOptions contains the options used for the parse job. It includes the configuration options used for the parse job, including the file ID, file URL, raw text, mime type, and structured extraction options, etc.

type ParseResultUpdateFunc

type ParseResultUpdateFunc func(name ParseEventName, result *ParseResult)

ParseResultUpdateFunc is a callback function that receives intermediate parse result updates during SSE streaming. It will be called for each SSE event received.

type ParseStatus

type ParseStatus string

ParseStatus indicates the status of the parse job.

const (
	// ParseStatusFailure means the job has failed.
	ParseStatusFailure ParseStatus = "failure"

	// ParseStatusPending means the job is waiting to be processed.
	ParseStatusPending ParseStatus = "pending"

	// ParseStatusProcessing means the job is currently being processed.
	ParseStatusProcessing ParseStatus = "processing"

	// ParseStatusSuccessful means the job has been successfully completed and the results are available.
	ParseStatusSuccessful ParseStatus = "successful"

	// ParseStatusDetectingLayout means the job is detecting the layout of the document.
	ParseStatusDetectingLayout ParseStatus = "detecting_layout"

	// ParseStatusLayoutDetected means the layout of the document has been detected.
	ParseStatusLayoutDetected ParseStatus = "detected_layout"

	// ParseStatusExtractingData means the job is extracting the data from the document.
	ParseStatusExtractingData ParseStatus = "extracting_data"

	// ParseStatusExtractedData means the data has been extracted from the document.
	ParseStatusExtractedData ParseStatus = "extracted_data"

	// ParseStatusFormattingOutput means the output is being formatted.
	ParseStatusFormattingOutput ParseStatus = "formatting_output"

	// ParseStatusFormattedOutput means the output has been formatted.
	ParseStatusFormattedOutput ParseStatus = "formatted_output"
)

type ParsingOptions

type ParsingOptions struct {
	// Chunking strategy determines how the document is chunked into smaller pieces.
	// Different strategies can be used to optimize the parsing process.
	// Choose the one that best fits your use case. The default is `None`,
	// which means no chunking is applied.
	ChunkingStrategy ChunkingStrategy `json:"chunking_strategy,omitempty"`

	// CrossPageHeaderDetection enables header-hierarchy detection across pages.
	// When set to `true`, the parser will consider headers from different pages
	// when determining the hierarchy of headers within a single page.
	CrossPageHeaderDetection bool `json:"cross_page_header_detection,omitempty"`

	// DisableLayoutDetection disables bounding box detection for the document.
	// Leads to faster document parsing.
	DisableLayoutDetection bool `json:"disable_layout_detection,omitempty"`

	// OCRModel indicates the model to use for OCR (Optical Character Recognition).
	//
	//   - model01: It's fast but could have lower accuracy on complex tables.
	//              It's good for legal documents with footnotes.
	//   - model02: It's slower but could have higher accuracy on complex tables.
	//              It's good for financial documents with merged cells.
	//   - model03: A compact model that we deliver to on-premise users.
	//              It takes about 2 minutes to startup on Tensorlake's Cloud
	//              because it's meant for testing for users who are eventually
	//              going to deploy this model on dedicated hardware in their
	//              own datacenter.
	OCRModel OCRPipelineProvider `json:"ocr_model,omitempty"`

	// RemoveStrikethroughLines enables the detection and removal of
	// strikethrough text in the document. This flag incurs additional billing costs.
	RemoveStrikethroughLines bool `json:"remove_strikethrough_lines,omitempty"`

	// SignatureDetection enables the detection of signatures in the document.
	// This flag incurs additional billing costs.
	// The default is false.
	SignatureDetection bool `json:"signature_detection,omitempty"`

	// SkewDetection enables detecting and correcting skewed or rotated pages
	// in the document. Setting this to true will increase the processing time
	// of the document. The default is false.
	SkewDetection bool `json:"skew_detection,omitempty"`

	// TableOutputMode is the format for the tables extracted from the document.
	// The default is HTML.
	TableOutputMode TableOutputMode `json:"table_output_mode,omitempty"`

	// TableParsingFormat determines which model the system uses to identify
	// and extract tables from the document. The default is tsr.
	TableParsingFormat TableParsingFormat `json:"table_parsing_format,omitempty"`

	// IgnoreSections contain a set of page fragment types to ignore during parsing.
	//
	// This can be used to skip certain types of content that are not relevant
	// for the parsing process, such as headers, footers, or other
	// non-essential elements.
	//
	// The default is an empty set.
	IgnoreSections []PageFragmentType `json:"ignore_sections,omitempty"`

	// IncludeImages embeds images from the document in the markdown.
	// The default is false.
	IncludeImages bool `json:"include_images,omitempty"`

	// BarcodeDetection enables barcode detection in the document.
	// Setting this to true will increase the processing time of the document.
	// The default is false.
	BarcodeDetection bool `json:"barcode_detection,omitempty"`
}

ParsingOptions holds configuration for document parsing.

type PartitionStrategy

type PartitionStrategy string

PartitionStrategy determines how documents are partitioned before structured data extraction.

The API will return one structured data object per partition.

const (
	// PartitionStrategyNone: No partitioning is applied.
	// The entire document is treated as a single unit for extraction.
	PartitionStrategyNone PartitionStrategy = "none"

	// PartitionStrategyPage: The document is partitioned by individual pages.
	// Each page is treated as a separate unit for extraction.
	PartitionStrategyPage PartitionStrategy = "page"

	// PartitionStrategySection: The document is partitioned into sections based on
	// detected section headers. Each section is treated as a separate unit for extraction.
	PartitionStrategySection PartitionStrategy = "section"

	// PartitionStrategyFragment: The document is partitioned by individual page elements.
	// Each fragment is treated as a separate unit for extraction.
	PartitionStrategyFragment PartitionStrategy = "fragment"

	// PartitionStrategyPatterns: The document is partitioned based on user-defined
	// start and end patterns.
	PartitionStrategyPatterns PartitionStrategy = "patterns"
)

type ReadDocumentRequest

type ReadDocumentRequest struct {
	FileSource

	// ParsingOptions contains the properties of this object define
	// the configuration for the document parsing process.
	//
	// Tensorlake provides sane defaults that work well for most
	// documents, so this object is not required. However, every document
	// is different, and you may want to customize the parsing process to
	// better suit your needs.
	ParsingOptions *ParsingOptions `json:"parsing_options,omitempty"`

	// The properties of this object help to extend the output of the document
	// parsing process with additional information.
	//
	// This includes summarization of tables and figures, which can help to
	// provide a more comprehensive understanding of the document.
	//
	// This object is not required, and the API will use default settings if it
	// is not present.
	EnrichmentOptions *EnrichmentOptions `json:"enrichment_options,omitempty"`

	// Additional metadata to identify the read request. The labels are
	// returned in the read response.
	Labels map[string]string `json:"labels,omitempty"`

	// FileName is the name of the file. Only populated when using file_id.
	// Examples: "document.pdf"
	FileName string `json:"file_name,omitempty"`

	// PageRange is a comma-separated list of page numbers or
	// ranges to parse (e.g., '1,2,3-5'). Default: all pages.
	// Examples: "1-5,8,10"
	PageRange string `json:"page_range,omitempty"`

	// MimeType is the MIME type of the file. This is used to determine how to process the file.
	MimeType MimeType `json:"mime_type,omitempty"`
}

ReadDocumentRequest holds the input parameters for reading/parsing a document.

type StructuredData

type StructuredData struct {
	// Data is a JSON object containing the structured data extracted from the document.
	// The schema is specified in the StructuredExtractionOptions.JSONSchema field.
	Data json.RawMessage `json:"data"`
	// PageNumbers contains either an integer or an array of integers regarding page numbers.
	// Example: [1, 2, 3] or 1
	PageNumbers UnionValues[int] `json:"page_numbers"`
	// SchemaName is the name of the schema used to extract the structured data.
	// It is specified in the StructuredExtractionOptions.SchemaName field.
	SchemaName string `json:"schema_name,omitempty"`
}

StructuredData extracted from the document. The structured data is a map where the keys are the schema names provided in the parse request, and the values are StructuredData objects containing the structured data extracted from the document.

type StructuredExtractionOptions

type StructuredExtractionOptions struct {

	// The name of the schema. This is used to tag the structured data output
	// with a name in the response.
	SchemaName string `json:"schema_name"`

	// 	The JSON schema to guide structured data extraction from the file.
	//
	// This schema should be a valid JSON schema that defines the structure of
	// the data to be extracted.
	//
	// The API supports a subset of the JSON schema specification.
	//
	// This value must be provided if structured_extraction is present in the
	// request.
	JSONSchema *jsonschema.Schema `json:"json_schema"` // Can be any JSON schema structure

	// Strategy to partition the document before structured data extraction.
	// The API will return one structured data object per partition. This is
	// useful when you want to extract certain fields from every page.
	PartitionStrategy PartitionStrategy `json:"partition_strategy,omitempty"`

	// 	The model provider to use for structured data extraction.
	//
	// The default is tensorlake, which uses our private model, and runs on
	// our servers.
	ModelProvider ModelProvider `json:"model_provider,omitempty"`

	// Filter the pages of the document to be used for structured data
	// extraction by providing a list of page classes.
	PageClasses []string `json:"page_classes,omitempty"`

	// The prompt to use for structured data extraction.
	//
	// If not provided, the default prompt will be used.
	Prompt string `json:"prompt,omitempty"`

	// Flag to enable visual citations in the structured data output.
	// It returns the bounding boxes of the coordinates of the document
	// where the structured data was extracted from.
	ProvideCitations bool `json:"provide_citations,omitempty"`

	// Boolean flag to skip converting the document blob to OCR text before
	// structured data extraction.
	//
	// If set to true, the API will skip the OCR step and directly extract
	// structured data from the document.
	SkipOCR bool `json:"skip_ocr,omitempty"`
}

StructuredExtractionOptions holds configuration for structured data extraction.

type TableOutputMode

type TableOutputMode string

TableOutputMode is the format for tables extracted from the document.

const (
	// TableOutputModeHTML outputs tables as HTML strings.
	TableOutputModeHTML TableOutputMode = "html"
	// TableOutputModeMarkdown outputs tables as Markdown strings.
	TableOutputModeMarkdown TableOutputMode = "markdown"
)

type TableParsingFormat

type TableParsingFormat string

TableParsingFormat determines which model the system uses to identify and extract tables from the document.

const (
	// TableParsingFormatTSR identifies the structure of the table first,
	// then the cells of the tables. Better suited for clean, grid-like tables.
	TableParsingFormatTSR TableParsingFormat = "tsr"
	// TableParsingFormatVLM uses a vision language model to identify
	// and extract the cells of the tables. Better suited for tables
	// with merged cells or irregular structures.
	TableParsingFormatVLM TableParsingFormat = "vlm"
)

type UnionValues

type UnionValues[T any] []T

UnionValues is a union of values of type T. It can be a single value or an array of values.

func (UnionValues[T]) MarshalJSON

func (v UnionValues[T]) MarshalJSON() ([]byte, error)

MarshalJSON marshals a UnionValues into a JSON array.

func (*UnionValues[T]) UnmarshalJSON

func (v *UnionValues[T]) UnmarshalJSON(b []byte) error

UnmarshalJSON unmarshals a JSON array or a single value into a UnionValues.

type UpdateDatasetRequest

type UpdateDatasetRequest struct {
	DatasetId                   string                        `json:"-"`
	Description                 string                        `json:"description,omitempty"`
	ParsingOptions              *ParsingOptions               `json:"parsing_options,omitempty"`
	StructuredExtractionOptions []StructuredExtractionOptions `json:"structured_extraction_options,omitempty"`
	PageClassifications         []PageClassConfig             `json:"page_classifications,omitempty"`
	EnrichmentOptions           *EnrichmentOptions            `json:"enrichment_options,omitempty"`
}

UpdateDatasetRequest holds options for updating a dataset.

type UploadFileRequest

type UploadFileRequest struct {
	// FileBytes is the reader for the file to upload.
	//
	// Required.
	FileBytes io.Reader `json:"file_bytes"`

	// FileName is the name of the file to upload.
	//
	// Optional.
	FileName string `json:"file_name"`

	// Labels are the labels to add to the file.
	//
	// Optional.
	Labels map[string]string `json:"labels,omitempty"`
}

UploadFileRequest holds options for uploading a file.

type Usage added in v0.1.1

type Usage struct {
	PagesParsed                  int `json:"pages_parsed"`
	SignatureDetectedPages       int `json:"signature_detected_pages"`
	StrikethroughDetectedPages   int `json:"strikethrough_detected_pages"`
	OCRInputTokenUsed            int `json:"ocr_input_token_used"`
	OCROutputTokenUsed           int `json:"ocr_output_token_used"`
	ExtractionInputTokenUsed     int `json:"extraction_input_token_used"`
	ExtractionOutputTokenUsed    int `json:"extraction_output_token_used"`
	SummarizationInputTokenUsed  int `json:"summarization_input_token_used"`
	SummarizationOutputTokenUsed int `json:"summarization_output_token_used"`
}

Usage contains resource usage associated with the parse job. This includes details such as number of pages parsed, tokens used for OCR and extraction, etc. Usage is only populated for successful jobs. Billing is based on the resource usage.

Directories

Path Synopsis
internal
sse

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL