Documentation
¶
Index ¶
- Variables
- func EstimateEmbedTokens(inputs []string) int64
- func EstimateTokens(messages []Message) int64
- func Float64Ptr(v float64) *float64
- func IntPtr(v int) *int
- func IsFatal(err error) bool
- func IsRetryable(err error) bool
- type AccountConfig
- type Auth
- type Candidate
- type CandidateError
- type ChatRequest
- type ChatResponse
- type Choice
- type Config
- type Delta
- type EmbedCandidate
- type EmbedProviderRequest
- type EmbedProviderResponse
- type EmbedRequest
- type EmbedResponse
- type EmbedUsage
- type EmbeddingProvider
- type ErrPartialBatch
- type HealthConfig
- type HealthState
- type HealthTracker
- type InputTokenBreakdown
- type Limits
- type Message
- type Meter
- type ModelMapping
- type ModelRef
- type Option
- type Part
- type PartType
- type Policy
- type Provider
- type ProviderRequest
- type ProviderResponse
- type ProviderStream
- type QuotaInitializer
- type QuotaStore
- type QuotaUnit
- type RateLimiter
- func (rl *RateLimiter) Allow(accountID, model string) bool
- func (rl *RateLimiter) Reset()
- func (rl *RateLimiter) ResetAccount(accountID string)
- func (rl *RateLimiter) SetAccountDefault(accountID string, limits Limits)
- func (rl *RateLimiter) SetLimit(accountID string, rpm int)
- func (rl *RateLimiter) SetModelLimits(accountID, model string, limits Limits)
- type Reservation
- type ResultEvent
- type RouteEvent
- type Router
- func (r *Router) ChatCompletion(ctx context.Context, req ChatRequest) (ChatResponse, error)
- func (r *Router) ChatCompletionStream(ctx context.Context, req ChatRequest) (*RouterStream, error)
- func (r *Router) Embed(ctx context.Context, req EmbedRequest) (EmbedResponse, error)
- func (r *Router) EmbedBatch(ctx context.Context, req EmbedRequest) (EmbedResponse, error)
- type RouterError
- type RouterStream
- type RoutingInfo
- type SpendTracker
- type StreamChunk
- type StreamDelta
- type Usage
Constants ¶
This section is empty.
Variables ¶
var (
	ErrNoCandidates   = errors.New("inferrouter: no candidates available")
	ErrNoFreeQuota    = errors.New("inferrouter: no free quota remaining")
	ErrQuotaExceeded  = errors.New("inferrouter: quota exceeded")
	ErrRateLimited    = errors.New("inferrouter: rate limited by provider")
	ErrAuthFailed     = errors.New("inferrouter: authentication failed")
	ErrInvalidRequest = errors.New("inferrouter: invalid request")
	ErrModelNotFound  = errors.New("inferrouter: model not found")
	ErrAllFailed      = errors.New("inferrouter: all candidates failed")
	ErrRPMExceeded    = errors.New("inferrouter: requests per minute limit exceeded")

	// ErrMultimodalUnavailable is returned when a request contains media parts
	// but no multimodal-capable candidate is available (all filtered out or
	// unhealthy). Not retryable with text-only fallback — callers should catch
	// this explicitly and either degrade (strip media) or fail the request.
	ErrMultimodalUnavailable = errors.New("inferrouter: no multimodal-capable candidates available")

	// ErrNoEmbeddingProviders is returned by Router.Embed/EmbedBatch when no
	// configured provider implements EmbeddingProvider for the requested model.
	// Symmetric to ErrMultimodalUnavailable — a specific failure mode distinct
	// from generic ErrNoCandidates.
	ErrNoEmbeddingProviders = errors.New("inferrouter: no embedding providers for model")

	// ErrBatchTooLarge is returned by Router.Embed (single-call path, NOT
	// EmbedBatch) when len(req.Inputs) exceeds the selected provider's
	// MaxBatchSize. Callers should use EmbedBatch for automatic splitting.
	ErrBatchTooLarge = errors.New("inferrouter: batch exceeds provider max size")

	// ErrInvalidConfig is returned by NewRouter for structural config problems
	// that cannot be expressed via YAML schema alone (e.g. embedding alias
	// with multiple models — see RFC §3.6 single-model invariant).
	ErrInvalidConfig = errors.New("inferrouter: invalid config")
)
Sentinel errors.
Functions ¶
func EstimateEmbedTokens ¶
EstimateEmbedTokens provides a rough token count estimate for an embedding batch. Used for quota pre-reservation sizing; because providers typically don't return per-input token counts for embeddings, this same estimate is also the amount committed on the successful call.
Uses the same character-per-token heuristic as chat (~4 chars/token). Per-request overhead is omitted since embeddings have no system prompts or role scaffolding.
func EstimateTokens ¶
EstimateTokens provides a rough token count estimate for messages. Handles both legacy Content strings and multi-part messages including image/audio/video; for media parts, byte-size heuristics are used.
func Float64Ptr ¶
Float64Ptr returns a pointer to the given float64.
func IsRetryable ¶
IsRetryable returns true if the error can be retried with another candidate.
Types ¶
type AccountConfig ¶
type AccountConfig struct {
Provider string `yaml:"provider"`
ID string `yaml:"id"`
Auth Auth `yaml:"auth"`
DailyFree int64 `yaml:"daily_free"`
QuotaUnit QuotaUnit `yaml:"quota_unit"`
PaidEnabled bool `yaml:"paid_enabled"`
MaxDailySpend float64 `yaml:"max_daily_spend"`
// Deprecated: use CostPerInputToken and CostPerOutputToken instead.
CostPerToken float64 `yaml:"cost_per_token"`
CostPerInputToken float64 `yaml:"cost_per_input_token"`
CostPerOutputToken float64 `yaml:"cost_per_output_token"`
// Per-modality input costs. If zero for a given modality, CostPerInputToken
// is used as the fallback rate (text input rate as baseline).
CostPerAudioInputToken float64 `yaml:"cost_per_audio_input_token"`
CostPerImageInputToken float64 `yaml:"cost_per_image_input_token"`
CostPerVideoInputToken float64 `yaml:"cost_per_video_input_token"`
// CostPerEmbeddingInputToken is the per-input-token cost for embedding
// operations on this account. Typically differs from CostPerInputToken —
// text-embedding-004 on Gemini costs ~$0.025/1M input tokens while chat
// models cost an order of magnitude more. Reusing CostPerInputToken for
// embeddings would inflate cost estimates and break free-first policy
// ordering for accounts that mix chat and embedding billing.
//
// Zero means this account does not support paid embeddings. Combined
// with DailyFree=0 this disables embeddings for the account entirely
// (router will skip it as an embed candidate).
CostPerEmbeddingInputToken float64 `yaml:"cost_per_embedding_input_token"`
// RPM is the default requests-per-minute limit for this account (0 = unlimited).
// Applied to all models unless overridden by ModelLimits.
RPM int `yaml:"rpm"`
// ModelLimits configures per-model rate limits for this account.
// When set, each model has independent RPM/RPH/RPD budgets.
// Models not listed fall back to the account-level RPM.
ModelLimits map[string]Limits `yaml:"model_limits"`
}
AccountConfig configures a single provider account.
type Auth ¶
type Auth struct {
APIKey string `yaml:"api_key" json:"api_key"`
}
Auth holds authentication credentials for a provider account.
type Candidate ¶
type Candidate struct {
Provider Provider
AccountID string
Auth Auth
Model string
Free bool
Remaining int64 // remaining quota (tokens or requests)
QuotaUnit QuotaUnit // unit of the quota
Health HealthState
// Deprecated: use CostPerInputToken/CostPerOutputToken.
CostPerToken float64
CostPerInputToken float64
CostPerOutputToken float64
// Per-modality input rates. Zero means "fall back to CostPerInputToken".
CostPerAudioInputToken float64
CostPerImageInputToken float64
CostPerVideoInputToken float64
MaxDailySpend float64 // max daily dollar spend (0 = unlimited)
CurrentSpend float64 // current daily dollar spend
}
Candidate represents a possible route for a request.
func (Candidate) BlendedCost ¶
BlendedCost returns an estimated cost per token for sorting. Assumes ~3:1 input:output ratio typical for chat.
type CandidateError ¶
CandidateError records the error from a single candidate attempt.
func (*CandidateError) Error ¶
func (e *CandidateError) Error() string
func (*CandidateError) Unwrap ¶
func (e *CandidateError) Unwrap() error
type ChatRequest ¶
type ChatRequest struct {
Model string `json:"model"`
Messages []Message `json:"messages"`
Temperature *float64 `json:"temperature,omitempty"`
MaxTokens *int `json:"max_tokens,omitempty"`
TopP *float64 `json:"top_p,omitempty"`
Stream bool `json:"stream,omitempty"`
Stop []string `json:"stop,omitempty"`
}
ChatRequest represents a chat completion request.
type ChatResponse ¶
type ChatResponse struct {
ID string `json:"id"`
Choices []Choice `json:"choices"`
Usage Usage `json:"usage"`
Model string `json:"model"`
Routing RoutingInfo
}
ChatResponse represents a chat completion response.
type Choice ¶
type Choice struct {
Index int `json:"index"`
Message Message `json:"message"`
FinishReason string `json:"finish_reason"`
}
Choice represents a single completion choice.
type Config ¶
type Config struct {
AllowPaid bool `yaml:"allow_paid"`
DefaultModel string `yaml:"default_model"`
Models []ModelMapping `yaml:"models"`
Accounts []AccountConfig `yaml:"accounts"`
}
Config is the top-level router configuration.
func LoadConfig ¶
LoadConfig reads and parses a YAML config file. Environment variables in the format ${VAR} are expanded before parsing.
func (*Config) NormalizeCosts ¶
func (c *Config) NormalizeCosts()
NormalizeCosts applies backward compatibility for cost fields. If CostPerToken is set and the new fields are zero, it is used for both.
type EmbedCandidate ¶
type EmbedCandidate struct {
Provider EmbeddingProvider
AccountID string
Auth Auth
Model string
Free bool
Remaining int64
QuotaUnit QuotaUnit
Health HealthState
Cost float64 // CostPerEmbeddingInputToken
MaxDailySpend float64
CurrentSpend float64
}
EmbedCandidate is a possible (provider, account, model) tuple for an embedding request. Symmetric to Candidate for chat, but references EmbeddingProvider and uses the embedding-specific cost field.
type EmbedProviderRequest ¶
type EmbedProviderRequest struct {
Auth Auth
Model string
Inputs []string
TaskType string
OutputDimensionality int
}
EmbedProviderRequest is what the router passes to an EmbeddingProvider adapter.
The router guarantees len(Inputs) <= provider.MaxBatchSize() before calling Embed — providers do not need to split internally.
type EmbedProviderResponse ¶
type EmbedProviderResponse struct {
Embeddings [][]float32
Model string
Usage EmbedUsage
}
EmbedProviderResponse is what an EmbeddingProvider adapter returns.
Embeddings must be in the same order as EmbedProviderRequest.Inputs.
type EmbedRequest ¶
type EmbedRequest struct {
// Model is the alias or concrete model name (e.g. "text-embedding-004").
Model string
// Inputs is the batch of texts to embed. Order is preserved in the response.
Inputs []string
// TaskType influences embedding quality for some models. For
// text-embedding-004, valid values include:
// RETRIEVAL_QUERY, RETRIEVAL_DOCUMENT, SEMANTIC_SIMILARITY,
// CLASSIFICATION, CLUSTERING, QUESTION_ANSWERING,
// FACT_VERIFICATION, CODE_RETRIEVAL_QUERY.
//
// An empty string defers to the provider default. Consumers doing RAG
// should set this explicitly — indexing uses RETRIEVAL_DOCUMENT and
// queries use RETRIEVAL_QUERY (asymmetric retrieval).
TaskType string
// OutputDimensionality optionally truncates the embedding to a smaller
// size (Matryoshka representation). 0 means native size of the model.
// text-embedding-004 supports [1..768]. Ignored by providers that
// don't support truncation.
OutputDimensionality int
}
EmbedRequest is the public API request for text embeddings.
Unlike ChatRequest, embeddings are pure input — no temperature, no streaming, no multimodal media parts. Inputs is a batch of texts to embed in a single logical request; the router may split this across provider API calls if the batch exceeds the selected provider's MaxBatchSize.
type EmbedResponse ¶
type EmbedResponse struct {
Embeddings [][]float32
Model string
Usage EmbedUsage
Routing RoutingInfo
}
EmbedResponse is the public API response.
Invariants (contract for consumers):
- len(Embeddings) equals the number of successfully processed inputs. On happy path that equals len(req.Inputs). On ErrPartialBatch it equals ErrPartialBatch.ProcessedInputs (see errors.go).
- Embeddings[i] corresponds to req.Inputs[i] for i < len(Embeddings). Order is preserved strictly — consumers may use indices directly.
- Model holds the actual resolved model (not alias). Consumers may compare resp.Model against their expected model as a last-line defense against configuration drift.
type EmbedUsage ¶
EmbedUsage tracks embedding-specific usage.
Embeddings have only input tokens — no completion tokens, no cached tokens, no per-modality breakdown. Reusing the chat Usage type would force zero-filled fields that obscure the semantic difference.
type EmbeddingProvider ¶
type EmbeddingProvider interface {
// Name returns the provider identifier. For providers that implement
// both Provider and EmbeddingProvider, this must match Provider.Name().
Name() string
// SupportsEmbeddingModel reports whether this provider can handle the
// given embedding model name (e.g. "text-embedding-004").
SupportsEmbeddingModel(model string) bool
// Embed generates embeddings for a batch of input texts.
//
// The router guarantees len(req.Inputs) <= MaxBatchSize() before
// calling. Providers do not need to split internally.
//
// The returned Embeddings must preserve the order of req.Inputs.
Embed(ctx context.Context, req EmbedProviderRequest) (EmbedProviderResponse, error)
// MaxBatchSize returns the maximum number of inputs the provider
// accepts in a single Embed call.
//
// Known values:
// - Gemini text-embedding-004 / gemini-embedding-001: 100
// - OpenAI text-embedding-3-small / text-embedding-3-large: 2048
MaxBatchSize() int
}
EmbeddingProvider is an OPTIONAL capability interface.
Providers implement this interface if they support text embedding. A single Provider struct may implement both Provider (for chat) and EmbeddingProvider (for embeddings), or only one of them. The router discovers embedding capability via type assertion at NewRouter time.
Chat-only providers (e.g. openaicompat in its current form, gonka) do not implement this interface — this is honest via compile-time absence, not via a runtime "return ErrNotSupported" stub.
See RFC docs/proposals/inferrouter-embeddings.md §3.1.
type ErrPartialBatch ¶
ErrPartialBatch is returned by Router.EmbedBatch when the operation successfully processed some inputs before encountering an unrecoverable error on a later batch.
Contract (critical for consumer correctness):
- ProcessedInputs is the exact count of successfully processed inputs from the start of req.Inputs. Ordering is preserved.
- The accompanying EmbedResponse (returned via multi-return alongside this error) contains Embeddings[0..ProcessedInputs-1] — valid vectors for req.Inputs[0..ProcessedInputs-1], in original order.
- Usage reflects actual tokens consumed on the successful part only.
- Quota reservations for successful batches are COMMITTED; the failing batch's reservation is ROLLED BACK; unattempted remainder is not reserved. Consumer pays only for successful work.
Consumer retry pattern:
resp, err := router.EmbedBatch(ctx, req)
var partial *ErrPartialBatch
if errors.As(err, &partial) {
persist(resp.Embeddings) // valid prefix
return retryWith(req.Inputs[partial.ProcessedInputs:])
}
func (*ErrPartialBatch) Error ¶
func (e *ErrPartialBatch) Error() string
func (*ErrPartialBatch) Unwrap ¶
func (e *ErrPartialBatch) Unwrap() error
type HealthConfig ¶
type HealthConfig struct {
FailureThreshold int // failures to trip circuit (default: 3)
FailureWindow time.Duration // window for counting failures (default: 5min)
UnhealthyPeriod time.Duration // cooldown before half-open (default: 30s)
}
HealthConfig configures circuit breaker behavior.
func DefaultHealthConfig ¶
func DefaultHealthConfig() HealthConfig
DefaultHealthConfig returns the default circuit breaker settings.
type HealthState ¶
type HealthState int
HealthState describes the health of a provider account.
const ( HealthHealthy HealthState = iota HealthUnhealthy HealthHalfOpen )
func (HealthState) String ¶
func (h HealthState) String() string
type HealthTracker ¶
type HealthTracker struct {
// contains filtered or unexported fields
}
HealthTracker tracks per-account health using a circuit breaker pattern.
func NewHealthTracker ¶
func NewHealthTracker() *HealthTracker
NewHealthTracker creates a new HealthTracker with default config.
func NewHealthTrackerWithConfig ¶
func NewHealthTrackerWithConfig(cfg HealthConfig) *HealthTracker
NewHealthTrackerWithConfig creates a new HealthTracker with custom config.
func (*HealthTracker) GetHealth ¶
func (h *HealthTracker) GetHealth(accountID string) HealthState
GetHealth returns the current health state for an account.
func (*HealthTracker) RecordFailure ¶
func (h *HealthTracker) RecordFailure(accountID string)
RecordFailure records a failed request for an account.
func (*HealthTracker) RecordSuccess ¶
func (h *HealthTracker) RecordSuccess(accountID string)
RecordSuccess records a successful request for an account.
func (*HealthTracker) Reset ¶
func (h *HealthTracker) Reset()
Reset clears health state for all accounts, returning them to healthy.
func (*HealthTracker) ResetAccount ¶
func (h *HealthTracker) ResetAccount(accountID string)
ResetAccount clears health state for a single account.
type InputTokenBreakdown ¶
type InputTokenBreakdown struct {
Text int64 `json:"text"`
Audio int64 `json:"audio"`
Image int64 `json:"image"`
Video int64 `json:"video"`
}
InputTokenBreakdown splits PromptTokens by modality.
type Limits ¶
type Limits struct {
RPM int `yaml:"rpm"` // requests per minute
RPH int `yaml:"rph"` // requests per hour
RPD int `yaml:"rpd"` // requests per day
}
Limits defines rate limits for a provider account or model. Zero values mean unlimited for that window.
type Message ¶
type Message struct {
Role string `json:"role"`
Content string `json:"content,omitempty"`
Parts []Part `json:"parts,omitempty"`
}
Message represents a chat message.
For text-only messages, set Content. For multimodal messages (image/audio/video), set Parts. If Parts is non-empty, it takes precedence over Content.
type Meter ¶
type Meter interface {
// OnRoute is called when a routing decision is made.
OnRoute(event RouteEvent)
// OnResult is called when a provider returns a result.
OnResult(event ResultEvent)
}
Meter observes routing events for monitoring/logging.
type ModelMapping ¶
ModelMapping defines a model alias.
type Option ¶
type Option func(*Router)
Option configures a Router.
func WithHealthConfig ¶
func WithHealthConfig(cfg HealthConfig) Option
WithHealthConfig sets health tracker configuration.
func WithHealthTracker ¶
func WithHealthTracker(h *HealthTracker) Option
WithHealthTracker sets the health tracker.
func WithQuotaStore ¶
func WithQuotaStore(qs QuotaStore) Option
WithQuotaStore sets the quota store.
func WithRateLimiter ¶
func WithRateLimiter(rl *RateLimiter) Option
WithRateLimiter sets a custom rate limiter.
func WithSpendTracker ¶
func WithSpendTracker(s *SpendTracker) Option
WithSpendTracker sets the spend tracker.
type Part ¶
type Part struct {
Type PartType `json:"type"`
Text string `json:"text,omitempty"`
MIMEType string `json:"mime_type,omitempty"`
Data []byte `json:"data,omitempty"`
}
Part is a single content element in a multimodal Message.
For Type=PartText, set Text. For media parts, set MIMEType and Data (raw bytes). Provider adapters handle base64 encoding internally — callers pass raw bytes.
type Policy ¶
type Policy interface {
// Select orders candidates by priority. Returns ordered slice (highest priority first).
Select(candidates []Candidate) []Candidate
}
Policy selects and orders candidates for a given request.
type Provider ¶
type Provider interface {
// Name returns the provider identifier (e.g. "gemini", "openai", "grok").
Name() string
// SupportsModel returns true if this provider can handle the given model.
SupportsModel(model string) bool
// SupportsMultimodal reports whether this provider accepts media parts
// (image/audio/video) in messages. Text-only providers return false.
SupportsMultimodal() bool
// ChatCompletion performs a synchronous chat completion.
ChatCompletion(ctx context.Context, req ProviderRequest) (ProviderResponse, error)
// ChatCompletionStream performs a streaming chat completion.
ChatCompletionStream(ctx context.Context, req ProviderRequest) (ProviderStream, error)
}
Provider is the interface that LLM provider adapters must implement.
type ProviderRequest ¶
type ProviderRequest struct {
Auth Auth
Model string
Messages []Message
Temperature *float64
MaxTokens *int
TopP *float64
Stop []string
Stream bool
// HasMedia is precomputed by the router so providers don't need to
// rewalk Messages/Parts (important on the streaming path where buildUsage
// fires per chunk).
HasMedia bool
}
ProviderRequest is the request sent to a provider adapter.
type ProviderResponse ¶
type ProviderResponse struct {
ID string
Content string
FinishReason string
Usage Usage
Model string
}
ProviderResponse is the response from a provider adapter.
type ProviderStream ¶
type ProviderStream interface {
// Next returns the next chunk. Returns io.EOF when done.
Next() (StreamChunk, error)
// Close releases resources and signals completion.
Close() error
}
ProviderStream is the interface for streaming responses.
type QuotaInitializer ¶
type QuotaInitializer interface {
SetQuota(accountID string, dailyLimit int64, unit QuotaUnit) error
}
QuotaInitializer is an optional interface that QuotaStore implementations can implement to support automatic initialization from config.
type QuotaStore ¶
type QuotaStore interface {
// Reserve attempts to reserve quota for a request. Returns a Reservation on success.
Reserve(ctx context.Context, accountID string, amount int64, unit QuotaUnit, idempotencyKey string) (Reservation, error)
// Commit finalizes a reservation with the actual usage.
Commit(ctx context.Context, reservation Reservation, actualAmount int64) error
// Rollback releases a reservation that was not used.
Rollback(ctx context.Context, reservation Reservation) error
// Remaining returns the remaining free quota for an account.
Remaining(ctx context.Context, accountID string) (int64, error)
}
QuotaStore manages per-account quota reservations.
type RateLimiter ¶
type RateLimiter struct {
// contains filtered or unexported fields
}
RateLimiter enforces per-(account, model) rate limits using sliding windows. Thread-safe. Supports RPM, RPH, and RPD simultaneously.
Lookup order: model-specific limits first, then account-level defaults. This allows Cerebras-style configs where each model has independent limits, and simpler configs where one RPM applies to all models on an account.
func (*RateLimiter) Allow ¶
func (rl *RateLimiter) Allow(accountID, model string) bool
Allow checks if a request is permitted for the given (account, model) pair. Checks model-specific limits first. If none configured, falls back to account defaults. Returns true and records the request if under all limits. Returns false if any limit is exceeded.
func (*RateLimiter) Reset ¶
func (rl *RateLimiter) Reset()
Reset clears all rate limiter state (preserves configured limits).
func (*RateLimiter) ResetAccount ¶
func (rl *RateLimiter) ResetAccount(accountID string)
ResetAccount clears state for all models under an account.
func (*RateLimiter) SetAccountDefault ¶
func (rl *RateLimiter) SetAccountDefault(accountID string, limits Limits)
SetAccountDefault configures fallback rate limits for models without explicit limits.
func (*RateLimiter) SetLimit ¶
func (rl *RateLimiter) SetLimit(accountID string, rpm int)
SetLimit is a convenience method for backward compatibility. Equivalent to SetAccountDefault with RPM only.
func (*RateLimiter) SetModelLimits ¶
func (rl *RateLimiter) SetModelLimits(accountID, model string, limits Limits)
SetModelLimits configures rate limits for a specific (account, model) pair.
type Reservation ¶
Reservation represents a reserved quota allocation.
type ResultEvent ¶
type ResultEvent struct {
Provider string
AccountID string
Model string
Free bool
Success bool
Duration time.Duration
Usage Usage
Error error
DollarCost float64 // actual dollar cost for this request
}
ResultEvent describes the outcome of a provider call.
type RouteEvent ¶
type RouteEvent struct {
Provider string
AccountID string
Model string
Free bool
AttemptNum int
EstimatedIn int64
}
RouteEvent describes a routing decision.
type Router ¶
type Router struct {
// contains filtered or unexported fields
}
Router routes LLM requests across multiple providers and accounts.
func NewRouter ¶
NewRouter creates a new Router with the given config and providers. Default components (FreeFirstPolicy, MemoryQuotaStore, NoopMeter) are used unless overridden via options.
func (*Router) ChatCompletion ¶
func (r *Router) ChatCompletion(ctx context.Context, req ChatRequest) (ChatResponse, error)
ChatCompletion performs a synchronous chat completion with automatic routing.
func (*Router) ChatCompletionStream ¶
func (r *Router) ChatCompletionStream(ctx context.Context, req ChatRequest) (*RouterStream, error)
ChatCompletionStream performs a streaming chat completion with automatic routing.
func (*Router) Embed ¶
func (r *Router) Embed(ctx context.Context, req EmbedRequest) (EmbedResponse, error)
Embed performs a synchronous embedding request against a single candidate batch. This is the low-level escape hatch for callers that manage their own batching. For automatic batch splitting (which you probably want), use EmbedBatch.
Returns ErrBatchTooLarge if len(req.Inputs) exceeds the selected provider's MaxBatchSize — callers should switch to EmbedBatch instead.
func (*Router) EmbedBatch ¶
func (r *Router) EmbedBatch(ctx context.Context, req EmbedRequest) (EmbedResponse, error)
EmbedBatch performs an embedding request with automatic batch splitting. req.Inputs is split into sub-batches of at most MaxBatchSize (per the first candidate provider) and each sub-batch goes through the full candidate selection + reservation workflow.
Happy path: returns EmbedResponse with Embeddings of len(req.Inputs), Usage summed across sub-batches, Routing reflecting the LAST successful sub-batch (typically all sub-batches route the same way).
Partial failure path: returns EmbedResponse with a valid prefix of Embeddings (the successfully processed portion) AND a non-nil *ErrPartialBatch error. Consumer pattern:
resp, err := router.EmbedBatch(ctx, req)
var partial *ErrPartialBatch
if errors.As(err, &partial) {
persist(resp.Embeddings) // valid prefix, len == partial.ProcessedInputs
return retryWith(req.Inputs[partial.ProcessedInputs:])
}
Full failure path (no successful sub-batches): returns zero-value EmbedResponse with a non-*ErrPartialBatch error (RouterError or sentinel).
type RouterError ¶
type RouterError struct {
Err error
Provider string
AccountID string
Model string
Attempts int
Tried []CandidateError // per-candidate errors (populated on ErrAllFailed)
}
RouterError wraps an error with routing context.
func (*RouterError) Error ¶
func (e *RouterError) Error() string
func (*RouterError) Unwrap ¶
func (e *RouterError) Unwrap() error
type RouterStream ¶
type RouterStream struct {
// contains filtered or unexported fields
}
RouterStream wraps a ProviderStream with quota commit on close.
func (*RouterStream) Close ¶
func (s *RouterStream) Close() error
Close releases the stream and commits quota.
func (*RouterStream) Next ¶
func (s *RouterStream) Next() (StreamChunk, error)
Next returns the next chunk from the stream.
type RoutingInfo ¶
RoutingInfo describes which provider/account served the request.
type SpendTracker ¶
type SpendTracker struct {
// contains filtered or unexported fields
}
SpendTracker tracks per-account dollar spend with daily reset.
func NewSpendTracker ¶
func NewSpendTracker() *SpendTracker
NewSpendTracker creates a new SpendTracker.
func (*SpendTracker) GetSpend ¶
func (s *SpendTracker) GetSpend(accountID string) float64
GetSpend returns the current daily spend for an account.
func (*SpendTracker) RecordSpend ¶
func (s *SpendTracker) RecordSpend(accountID string, dollars float64)
RecordSpend records dollar spend for an account.
type StreamChunk ¶
type StreamChunk struct {
ID string `json:"id"`
Choices []StreamDelta `json:"choices"`
Model string `json:"model"`
Usage *Usage `json:"usage,omitempty"`
}
StreamChunk represents a single chunk in a streaming response.
type StreamDelta ¶
type StreamDelta struct {
Index int `json:"index"`
Delta Delta `json:"delta"`
FinishReason string `json:"finish_reason,omitempty"`
}
StreamDelta represents a delta in a streaming choice.
type Usage ¶
type Usage struct {
PromptTokens int64 `json:"prompt_tokens"`
CompletionTokens int64 `json:"completion_tokens"`
TotalTokens int64 `json:"total_tokens"`
// CachedTokens is the subset of PromptTokens served from provider-side
// context cache. Orthogonal to modality. Observability-only — not
// subtracted from cost calculation (providers already price cached
// tokens server-side; subtracting would double-count the discount).
CachedTokens int64 `json:"cached_tokens,omitempty"`
// InputBreakdown splits PromptTokens by modality. Nil for providers
// that don't report it. When non-nil, Text+Audio+Image+Video == PromptTokens.
InputBreakdown *InputTokenBreakdown `json:"input_breakdown,omitempty"`
}
Usage represents token usage information.
Source Files
¶
Directories
¶
| Path | Synopsis |
|---|---|
| examples | |
| examples/basic | command |
| examples/embeddings | command |
| examples/gonka | command |
| examples/multi-provider | command |
| provider | |
| postgres | module |