model

package
v0.1.2 (Latest)
Warning

This package is not in the latest version of its module.

Go to latest
Published: May 6, 2026 License: Apache-2.0 Imports: 3 Imported by: 0

Documentation

Overview

Package model defines the interfaces for different AI model types.

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func CosineSimilarity

func CosineSimilarity(a, b []float64) float64

CosineSimilarity calculates the cosine similarity between two embeddings.

func DotProduct

func DotProduct(a, b []float64) float64

DotProduct calculates the dot product between two embeddings.

func EuclideanDistance

func EuclideanDistance(a, b []float64) float64

EuclideanDistance calculates the Euclidean distance between two embeddings.

Types

type EmbedCallOptions

// EmbedCallOptions contains the options for embedding generation.
type EmbedCallOptions struct {
	// Values is the list of texts to embed.
	Values []string

	// Dimensions is the desired embedding dimensions (if model supports it).
	// NOTE(review): nil presumably leaves the choice to the model — confirm.
	Dimensions *int

	// ProviderOptions are provider-specific options.
	ProviderOptions map[string]any

	// Headers are additional HTTP headers.
	Headers map[string]string
}

EmbedCallOptions contains the options for embedding generation.

type EmbedResult

// EmbedResult contains the result of an embedding call.
type EmbedResult struct {
	// Embeddings contains the generated embeddings.
	Embeddings []Embedding

	// Usage contains usage information.
	Usage EmbeddingUsage

	// Warnings contains any warnings from the embedding process.
	// Mirrors ai-sdk's EmbeddingModelV3 `warnings: SharedV3Warning[]`.
	Warnings []stream.Warning

	// Response contains provider-specific response data.
	Response EmbeddingResponse
}

EmbedResult contains the result of an embedding call.

type Embedding

// Embedding represents a single embedding.
type Embedding struct {
	// Values is the embedding vector.
	Values []float64

	// Index is the index of the input text this embedding corresponds to.
	// NOTE(review): presumably a position in EmbedCallOptions.Values — confirm.
	Index int
}

Embedding represents a single embedding.

type EmbeddingModel

// EmbeddingModel is the interface for text embedding models.
type EmbeddingModel interface {
	// ID returns the model identifier.
	ID() string

	// Provider returns the provider identifier.
	Provider() string

	// MaxEmbeddingsPerCall returns the maximum number of texts that can be embedded in a single call.
	MaxEmbeddingsPerCall() int

	// Dimensions returns the embedding dimensions (0 if variable or unknown).
	Dimensions() int

	// Embed generates embeddings for the provided texts.
	// The texts and per-call settings are supplied through opts.
	Embed(ctx context.Context, opts EmbedCallOptions) (*EmbedResult, error)
}

EmbeddingModel is the interface for text embedding models.

type EmbeddingResponse

// EmbeddingResponse contains provider-specific response metadata.
type EmbeddingResponse struct {
	// ID is the response identifier.
	ID string

	// Model is the model used for generation.
	Model string

	// Headers contains response headers.
	Headers map[string]string
}

EmbeddingResponse contains provider-specific response metadata.

type EmbeddingUsage

// EmbeddingUsage contains usage information for embedding generation.
type EmbeddingUsage struct {
	// Tokens is the total number of tokens used.
	Tokens int
}

EmbeddingUsage contains usage information for embedding generation.

type GeneratedImage

// GeneratedImage represents a single generated image.
type GeneratedImage struct {
	// Base64 is the base64-encoded image data.
	// NOTE(review): whether Base64 and URL can both be set is not stated here — confirm.
	Base64 string

	// URL is the URL of the generated image (if available).
	URL string

	// MimeType is the MIME type of the image (e.g., "image/png").
	MimeType string

	// Seed is the seed used for generation (if available).
	Seed *int64

	// RevisedPrompt is the revised prompt used for generation (if available).
	RevisedPrompt string
}

GeneratedImage represents a single generated image.

type ImageCallOptions

// ImageCallOptions contains the options for image generation.
type ImageCallOptions struct {
	// Prompt is the text description of the image to generate.
	Prompt string

	// N is the number of images to generate.
	N int

	// Size is the size of the generated images (e.g., "1024x1024").
	// NOTE(review): how Size and AspectRatio interact when both are set
	// is not stated here — confirm against the provider implementations.
	Size string

	// AspectRatio is the aspect ratio (e.g., "16:9", "1:1").
	AspectRatio string

	// Seed for deterministic generation (if supported).
	Seed *int64

	// ProviderOptions are provider-specific options.
	ProviderOptions map[string]any

	// Headers are additional HTTP headers.
	Headers map[string]string

	// Files is an optional list of input images for image-to-image or
	// editing workflows. Each entry is raw bytes in any common image
	// encoding; providers detect the MIME type from the magic bytes
	// before forwarding the payload (ai-sdk ImageModelV3 `files`).
	Files [][]byte

	// Mask is an optional mask image for inpainting where non-zero
	// pixels indicate regions to regenerate (ai-sdk ImageModelV3 `mask`).
	Mask []byte
}

ImageCallOptions contains the options for image generation.

type ImageModel

// ImageModel is the interface for image generation models.
type ImageModel interface {
	// ID returns the model identifier.
	ID() string

	// Provider returns the provider identifier.
	Provider() string

	// MaxImagesPerCall returns the maximum number of images that can be generated in a single call.
	MaxImagesPerCall() int

	// Generate generates images based on the provided options.
	Generate(ctx context.Context, opts ImageCallOptions) (*ImageResult, error)
}

ImageModel is the interface for image generation models.

type ImageResponse

// ImageResponse contains provider-specific response metadata.
type ImageResponse struct {
	// ID is the response identifier.
	ID string

	// Model is the model used for generation.
	Model string

	// Timestamp is the creation timestamp.
	// NOTE(review): the unit (e.g. Unix seconds vs. milliseconds) is not
	// stated here — confirm before comparing or formatting this value.
	Timestamp int64

	// Headers contains response headers.
	Headers map[string]string
}

ImageResponse contains provider-specific response metadata.

type ImageResult

// ImageResult contains the result of an image generation call.
type ImageResult struct {
	// Images contains the generated images.
	Images []GeneratedImage

	// Warnings contains any warnings from the generation process.
	Warnings []stream.Warning

	// Usage contains usage information (if available).
	Usage *ImageUsage

	// Response contains provider-specific response data.
	Response ImageResponse

	// ProviderMetadata contains provider-specific metadata returned by
	// the model (ai-sdk ImageModelV3 `providerMetadata`). Keys are
	// provider IDs, values are provider-defined payloads.
	ProviderMetadata map[string]any
}

ImageResult contains the result of an image generation call.

type ImageUsage

// ImageUsage contains usage information for image generation.
type ImageUsage struct {
	// TotalTokens is the total number of tokens used (for models that use tokens).
	TotalTokens int

	// Steps is the number of diffusion steps (for diffusion models).
	Steps int
}

ImageUsage contains usage information for image generation.

type LanguageModel

// LanguageModel is the interface for text generation models.
// It is a type alias for stream.Model, so the two names are interchangeable.
type LanguageModel = stream.Model

LanguageModel is the interface for text generation models. This is an alias for stream.Model for consistency with the existing codebase.

type LanguageModelInfo

// LanguageModelInfo contains metadata about a language model.
type LanguageModelInfo struct {
	// ID is the model identifier.
	ID string

	// Provider is the provider identifier.
	Provider string

	// MaxTokens is the maximum number of tokens the model can generate.
	MaxTokens int

	// ContextWindow is the maximum context window size.
	ContextWindow int

	// SupportsTools indicates if the model supports tool/function calling.
	SupportsTools bool

	// SupportsVision indicates if the model supports image inputs.
	SupportsVision bool

	// SupportsStreaming indicates if the model supports streaming responses.
	SupportsStreaming bool

	// SupportsReasoning indicates if the model supports extended thinking.
	SupportsReasoning bool
}

LanguageModelInfo contains metadata about a language model.

type RankedDocument

// RankedDocument represents a document with its relevance score.
type RankedDocument struct {
	// Index is the original index of the document.
	Index int

	// Score is the relevance score (higher is more relevant).
	Score float64

	// Document is the document text (if RerankCallOptions.ReturnDocuments was true).
	Document string
}

RankedDocument represents a document with its relevance score.

type RerankCallOptions

// RerankCallOptions contains the options for reranking.
type RerankCallOptions struct {
	// Query is the query to rank documents against.
	Query string

	// Documents is the list of documents to rerank.
	Documents []string

	// TopN is the number of top results to return (0 means return all).
	TopN int

	// ReturnDocuments specifies whether to include document text in results.
	ReturnDocuments bool

	// ProviderOptions are provider-specific options.
	ProviderOptions map[string]any

	// Headers are additional HTTP headers.
	Headers map[string]string
}

RerankCallOptions contains the options for reranking.

type RerankResponse

// RerankResponse contains provider-specific response metadata.
type RerankResponse struct {
	// ID is the response identifier.
	ID string

	// Model is the model used for reranking.
	Model string

	// Headers contains response headers.
	Headers map[string]string
}

RerankResponse contains provider-specific response metadata.

type RerankResult

// RerankResult contains the result of a reranking call.
type RerankResult struct {
	// Results contains the reranked documents.
	Results []RankedDocument

	// Usage contains usage information.
	Usage RerankUsage

	// Warnings contains any warnings from the reranking process.
	// Mirrors ai-sdk's RerankingModelV3 `warnings: SharedV3Warning[]`.
	Warnings []stream.Warning

	// Response contains provider-specific response data.
	Response RerankResponse
}

RerankResult contains the result of a reranking call.

type RerankUsage

// RerankUsage contains usage information for reranking.
type RerankUsage struct {
	// SearchUnits is the number of search units used.
	SearchUnits int

	// Tokens is the total number of tokens processed.
	Tokens int
}

RerankUsage contains usage information for reranking.

type RerankingModel

// RerankingModel is the interface for document reranking models.
type RerankingModel interface {
	// ID returns the model identifier.
	ID() string

	// Provider returns the provider identifier.
	Provider() string

	// MaxDocumentsPerCall returns the maximum number of documents that can be reranked in a single call.
	MaxDocumentsPerCall() int

	// Rerank reranks documents based on their relevance to a query.
	// The query and candidate documents are supplied through opts.
	Rerank(ctx context.Context, opts RerankCallOptions) (*RerankResult, error)
}

RerankingModel is the interface for document reranking models.

type SpeechCallOptions

// SpeechCallOptions contains the options for speech generation.
type SpeechCallOptions struct {
	// Text is the text to convert to speech.
	Text string

	// Voice is the voice to use for generation.
	Voice string

	// OutputFormat is the desired output format (e.g., "mp3", "wav", "opus").
	OutputFormat string

	// Speed is the speed of the generated audio (0.25 to 4.0, 1.0 is normal).
	// NOTE(review): nil presumably means the provider default — confirm.
	Speed *float64

	// ProviderOptions are provider-specific options.
	ProviderOptions map[string]any

	// Headers are additional HTTP headers.
	Headers map[string]string
}

SpeechCallOptions contains the options for speech generation.

type SpeechModel

// SpeechModel is the interface for text-to-speech models.
type SpeechModel interface {
	// ID returns the model identifier.
	ID() string

	// Provider returns the provider identifier.
	Provider() string

	// Generate generates speech from text.
	// The text and voice settings are supplied through opts.
	Generate(ctx context.Context, opts SpeechCallOptions) (*SpeechResult, error)
}

SpeechModel is the interface for text-to-speech models.

type SpeechResponse

// SpeechResponse contains provider-specific response metadata.
type SpeechResponse struct {
	// ID is the response identifier.
	ID string

	// Model is the model used for generation.
	Model string

	// Headers contains response headers.
	Headers map[string]string
}

SpeechResponse contains provider-specific response metadata.

type SpeechResult

// SpeechResult contains the result of a speech generation call.
type SpeechResult struct {
	// Audio is the generated audio data.
	Audio []byte

	// AudioReader provides streaming access to the audio data.
	// NOTE(review): whether Audio and AudioReader are both populated, and
	// who is responsible for draining the reader, is not stated here — confirm.
	AudioReader io.Reader

	// MimeType is the MIME type of the audio (e.g., "audio/mpeg").
	MimeType string

	// Duration is the duration of the audio in seconds (if available).
	Duration *float64

	// Warnings contains any warnings from the generation process.
	Warnings []stream.Warning

	// Usage contains usage information (if available).
	Usage *SpeechUsage

	// Response contains provider-specific response data.
	Response SpeechResponse
}

SpeechResult contains the result of a speech generation call.

type SpeechUsage

// SpeechUsage contains usage information for speech generation.
type SpeechUsage struct {
	// Characters is the number of characters processed.
	Characters int

	// Seconds is the duration of generated audio in seconds.
	Seconds float64
}

SpeechUsage contains usage information for speech generation.

type TranscribeCallOptions

// TranscribeCallOptions contains the options for transcription.
//
// Audio, AudioReader and AudioURL are alternative ways to supply the input.
// NOTE(review): precedence when more than one is set is not stated here — confirm.
type TranscribeCallOptions struct {
	// Audio is the audio data to transcribe.
	Audio []byte

	// AudioReader provides streaming access to audio data (alternative to Audio).
	AudioReader io.Reader

	// AudioURL is a URL to the audio file (alternative to Audio/AudioReader).
	AudioURL string

	// MimeType is the MIME type of the audio (e.g., "audio/wav", "audio/mpeg").
	MimeType string

	// Filename is the filename of the audio (used for format detection).
	Filename string

	// Language is the language code of the audio (e.g., "en", "es").
	// If not provided, the model will attempt to detect the language.
	Language string

	// Prompt is an optional hint for the model (can improve accuracy).
	Prompt string

	// ProviderOptions are provider-specific options.
	ProviderOptions map[string]any

	// Headers are additional HTTP headers.
	Headers map[string]string
}

TranscribeCallOptions contains the options for transcription.

type TranscriptionModel

// TranscriptionModel is the interface for speech-to-text models.
type TranscriptionModel interface {
	// ID returns the model identifier.
	ID() string

	// Provider returns the provider identifier.
	Provider() string

	// Transcribe transcribes audio to text.
	// The audio input and hints are supplied through opts.
	Transcribe(ctx context.Context, opts TranscribeCallOptions) (*TranscriptionResult, error)
}

TranscriptionModel is the interface for speech-to-text models.

type TranscriptionResponse

// TranscriptionResponse contains provider-specific response metadata.
type TranscriptionResponse struct {
	// ID is the response identifier.
	ID string

	// Model is the model used for transcription.
	Model string

	// Headers contains response headers.
	Headers map[string]string
}

TranscriptionResponse contains provider-specific response metadata.

type TranscriptionResult

// TranscriptionResult contains the result of a transcription call.
type TranscriptionResult struct {
	// Text is the transcribed text.
	Text string

	// Segments contains detailed segment information (if available).
	Segments []TranscriptionSegment

	// Language is the detected language code.
	Language string

	// Duration is the duration of the audio in seconds (if available).
	Duration *float64

	// Warnings contains any warnings from the transcription process.
	Warnings []stream.Warning

	// Usage contains usage information (if available).
	Usage *TranscriptionUsage

	// Response contains provider-specific response data.
	Response TranscriptionResponse
}

TranscriptionResult contains the result of a transcription call.

type TranscriptionSegment

// TranscriptionSegment represents a segment of transcribed audio.
type TranscriptionSegment struct {
	// ID is the segment identifier.
	ID int

	// Text is the transcribed text for this segment.
	Text string

	// Start is the start time in seconds.
	Start float64

	// End is the end time in seconds.
	End float64

	// Confidence is the confidence score (0 to 1).
	Confidence float64

	// Words contains word-level information (if available).
	Words []TranscriptionWord
}

TranscriptionSegment represents a segment of transcribed audio.

type TranscriptionUsage

// TranscriptionUsage contains usage information for transcription.
type TranscriptionUsage struct {
	// DurationSeconds is the duration of audio processed in seconds.
	DurationSeconds float64
}

TranscriptionUsage contains usage information for transcription.

type TranscriptionWord

// TranscriptionWord represents a single word in the transcription.
type TranscriptionWord struct {
	// Word is the transcribed word.
	Word string

	// Start is the start time in seconds.
	Start float64

	// End is the end time in seconds.
	End float64

	// Confidence is the confidence score (0 to 1).
	Confidence float64
}

TranscriptionWord represents a single word in the transcription.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL