Documentation
¶
Overview ¶
Package serve provides an OpenAI-compatible HTTP API server for model inference.
Index ¶
- type BatchConfig
- type BatchHandler
- type BatchRequest
- type BatchResult
- type BatchScheduler
- type ChatCompletionChoice
- type ChatCompletionRequest
- type ChatCompletionResponse
- type ChatMessage
- type CompletionChoice
- type CompletionRequest
- type CompletionResponse
- type EmbeddingObject
- type EmbeddingRequest
- type EmbeddingResponse
- type ModelDeleteResponse
- type ModelListResponse
- type ModelObject
- type Server
- type ServerMetrics
- type ServerOption
- type UsageInfo
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type BatchConfig ¶
type BatchConfig struct {
MaxBatchSize int
BatchTimeout time.Duration
Handler BatchHandler
}
BatchConfig configures the batch scheduler.
type BatchHandler ¶
type BatchHandler func(ctx context.Context, reqs []BatchRequest) []BatchResult
BatchHandler processes a batch of requests and returns results. The results slice must have the same length as the requests slice.
type BatchRequest ¶
BatchRequest represents a single inference request in a batch.
type BatchResult ¶
BatchResult holds the result for a single request in a batch.
type BatchScheduler ¶
type BatchScheduler struct {
// contains filtered or unexported fields
}
BatchScheduler collects incoming requests into batches for efficient processing.
func NewBatchScheduler ¶
func NewBatchScheduler(config BatchConfig) *BatchScheduler
NewBatchScheduler creates a new batch scheduler.
func (*BatchScheduler) Start ¶
func (s *BatchScheduler) Start()
Start begins the batch collection loop.
func (*BatchScheduler) Stop ¶
func (s *BatchScheduler) Stop()
Stop gracefully shuts down the scheduler.
func (*BatchScheduler) Submit ¶
func (s *BatchScheduler) Submit(ctx context.Context, req BatchRequest) (BatchResult, error)
Submit adds a request to the next batch and waits for the result.
type ChatCompletionChoice ¶
type ChatCompletionChoice struct {
Index int `json:"index"`
Message ChatMessage `json:"message"`
FinishReason string `json:"finish_reason"`
}
ChatCompletionChoice is a single choice in the response.
type ChatCompletionRequest ¶
type ChatCompletionRequest struct {
Model string `json:"model"`
Messages []ChatMessage `json:"messages"`
Temperature *float64 `json:"temperature,omitempty"`
TopP *float64 `json:"top_p,omitempty"`
MaxTokens *int `json:"max_tokens,omitempty"`
Stream bool `json:"stream"`
}
ChatCompletionRequest represents the OpenAI-compatible /v1/chat/completions request body.
type ChatCompletionResponse ¶
type ChatCompletionResponse struct {
ID string `json:"id"`
Object string `json:"object"`
Created int64 `json:"created"`
Model string `json:"model"`
Choices []ChatCompletionChoice `json:"choices"`
Usage UsageInfo `json:"usage"`
}
ChatCompletionResponse is the non-streaming response.
type ChatMessage ¶
ChatMessage is a single message in the chat.
type CompletionChoice ¶
type CompletionChoice struct {
Index int `json:"index"`
Text string `json:"text"`
FinishReason string `json:"finish_reason"`
}
CompletionChoice is a single choice in the completion response.
type CompletionRequest ¶
type CompletionRequest struct {
Model string `json:"model"`
Prompt string `json:"prompt"`
Temperature *float64 `json:"temperature,omitempty"`
MaxTokens *int `json:"max_tokens,omitempty"`
Stream bool `json:"stream"`
}
CompletionRequest represents the OpenAI-compatible /v1/completions request body.
type CompletionResponse ¶
type CompletionResponse struct {
ID string `json:"id"`
Object string `json:"object"`
Created int64 `json:"created"`
Model string `json:"model"`
Choices []CompletionChoice `json:"choices"`
Usage UsageInfo `json:"usage"`
}
CompletionResponse is the non-streaming completion response.
type EmbeddingObject ¶
type EmbeddingObject struct {
Object string `json:"object"`
Embedding []float32 `json:"embedding"`
Index int `json:"index"`
}
EmbeddingObject is a single embedding in the response.
type EmbeddingRequest ¶
type EmbeddingRequest struct {
Model string `json:"model"`
Input interface{} `json:"input"` // string or []string
}
EmbeddingRequest is the /v1/embeddings request body.
type EmbeddingResponse ¶
type EmbeddingResponse struct {
Object string `json:"object"`
Data []EmbeddingObject `json:"data"`
Model string `json:"model"`
Usage UsageInfo `json:"usage"`
}
EmbeddingResponse is the /v1/embeddings response.
type ModelDeleteResponse ¶
type ModelDeleteResponse struct {
ID string `json:"id"`
Object string `json:"object"`
Deleted bool `json:"deleted"`
}
ModelDeleteResponse is the DELETE /v1/models/:id response.
type ModelListResponse ¶
type ModelListResponse struct {
Object string `json:"object"`
Data []ModelObject `json:"data"`
}
ModelListResponse is the /v1/models response.
type ModelObject ¶
type ModelObject struct {
ID string `json:"id"`
Object string `json:"object"`
Created int64 `json:"created"`
OwnedBy string `json:"owned_by"`
Architecture string `json:"architecture,omitempty"`
}
ModelObject represents a model in the /v1/models response.
type Server ¶
type Server struct {
// contains filtered or unexported fields
}
Server wraps a loaded model and serves OpenAI-compatible HTTP endpoints.
func NewServer ¶
func NewServer(m *inference.Model, opts ...ServerOption) *Server
NewServer creates a Server for the given model.
type ServerMetrics ¶
type ServerMetrics struct {
// contains filtered or unexported fields
}
ServerMetrics records serving metrics using a runtime.Collector.
func NewServerMetrics ¶
func NewServerMetrics(c runtime.Collector) *ServerMetrics
NewServerMetrics creates a ServerMetrics backed by the given collector.
func (*ServerMetrics) RecordRequest ¶
func (m *ServerMetrics) RecordRequest(tokens int, latency time.Duration)
RecordRequest records a completed request's metrics.
type ServerOption ¶
type ServerOption func(*Server)
ServerOption configures the server.
func WithBatchScheduler ¶
func WithBatchScheduler(bs *BatchScheduler) ServerOption
WithBatchScheduler attaches a batch scheduler for non-streaming requests. When set, incoming completion requests are routed through the scheduler to be grouped into batches for higher throughput.
func WithDraftModel ¶
func WithDraftModel(draft *inference.Model) ServerOption
WithDraftModel enables speculative decoding using the given draft model. When set, completion requests use speculative decode with the draft model proposing tokens and the target model verifying them.
func WithLogger ¶
func WithLogger(l log.Logger) ServerOption
WithLogger sets the logger for request logging.
func WithMetrics ¶
func WithMetrics(c runtime.Collector) ServerOption
WithMetrics sets the metrics collector for token rate and request tracking.