serve

package
v1.2.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 15, 2026 License: Apache-2.0 Imports: 15 Imported by: 0

Documentation

Overview

Package serve provides an OpenAI-compatible HTTP API server for model inference.

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type BatchConfig

type BatchConfig struct {
	MaxBatchSize int
	BatchTimeout time.Duration
	Handler      BatchHandler
}

BatchConfig configures the batch scheduler.

type BatchHandler

type BatchHandler func(ctx context.Context, reqs []BatchRequest) []BatchResult

BatchHandler processes a batch of requests and returns results. The results slice must have the same length as the requests slice.

type BatchRequest

type BatchRequest struct {
	Prompt string
	Phase  string // "prefill" or "decode"
}

BatchRequest represents a single inference request in a batch.

type BatchResult

type BatchResult struct {
	Value string
	Err   error
}

BatchResult holds the result for a single request in a batch.

type BatchScheduler

type BatchScheduler struct {
	// contains filtered or unexported fields
}

BatchScheduler collects incoming requests into batches for efficient processing.

func NewBatchScheduler

func NewBatchScheduler(config BatchConfig) *BatchScheduler

NewBatchScheduler creates a new batch scheduler.

func (*BatchScheduler) Start

func (s *BatchScheduler) Start()

Start begins the batch collection loop.

func (*BatchScheduler) Stop

func (s *BatchScheduler) Stop()

Stop gracefully shuts down the scheduler.

func (*BatchScheduler) Submit

func (s *BatchScheduler) Submit(ctx context.Context, req BatchRequest) (string, error)

Submit adds a request to the next batch and waits for the result, returning the result value or an error. (Signature reconstructed from context; the original listing omitted it.)

type ChatCompletionChoice

type ChatCompletionChoice struct {
	Index        int         `json:"index"`
	Message      ChatMessage `json:"message"`
	FinishReason string      `json:"finish_reason"`
}

ChatCompletionChoice is a single choice in the response.

type ChatCompletionRequest

type ChatCompletionRequest struct {
	Model       string        `json:"model"`
	Messages    []ChatMessage `json:"messages"`
	Temperature *float64      `json:"temperature,omitempty"`
	TopP        *float64      `json:"top_p,omitempty"`
	MaxTokens   *int          `json:"max_tokens,omitempty"`
	Stream      bool          `json:"stream"`
}

ChatCompletionRequest represents the OpenAI chat completion request.

type ChatCompletionResponse

type ChatCompletionResponse struct {
	ID      string                 `json:"id"`
	Object  string                 `json:"object"`
	Created int64                  `json:"created"`
	Model   string                 `json:"model"`
	Choices []ChatCompletionChoice `json:"choices"`
	Usage   UsageInfo              `json:"usage"`
}

ChatCompletionResponse is the non-streaming response.

type ChatMessage

type ChatMessage struct {
	Role    string `json:"role"`
	Content string `json:"content"`
}

ChatMessage is a single message in the chat.

type CompletionChoice

type CompletionChoice struct {
	Index        int    `json:"index"`
	Text         string `json:"text"`
	FinishReason string `json:"finish_reason"`
}

CompletionChoice is a single choice in the completion response.

type CompletionRequest

type CompletionRequest struct {
	Model       string   `json:"model"`
	Prompt      string   `json:"prompt"`
	Temperature *float64 `json:"temperature,omitempty"`
	MaxTokens   *int     `json:"max_tokens,omitempty"`
	Stream      bool     `json:"stream"`
}

CompletionRequest represents the OpenAI completion request.

type CompletionResponse

type CompletionResponse struct {
	ID      string             `json:"id"`
	Object  string             `json:"object"`
	Created int64              `json:"created"`
	Model   string             `json:"model"`
	Choices []CompletionChoice `json:"choices"`
	Usage   UsageInfo          `json:"usage"`
}

CompletionResponse is the non-streaming completion response.

type EmbeddingObject

type EmbeddingObject struct {
	Object    string    `json:"object"`
	Embedding []float32 `json:"embedding"`
	Index     int       `json:"index"`
}

EmbeddingObject is a single embedding in the response.

type EmbeddingRequest

type EmbeddingRequest struct {
	Model string      `json:"model"`
	Input interface{} `json:"input"` // string or []string
}

EmbeddingRequest represents the OpenAI embeddings request.

type EmbeddingResponse

type EmbeddingResponse struct {
	Object string            `json:"object"`
	Data   []EmbeddingObject `json:"data"`
	Model  string            `json:"model"`
	Usage  UsageInfo         `json:"usage"`
}

EmbeddingResponse is the /v1/embeddings response.

type ModelDeleteResponse

type ModelDeleteResponse struct {
	ID      string `json:"id"`
	Object  string `json:"object"`
	Deleted bool   `json:"deleted"`
}

ModelDeleteResponse is the DELETE /v1/models/:id response.

type ModelListResponse

type ModelListResponse struct {
	Object string        `json:"object"`
	Data   []ModelObject `json:"data"`
}

ModelListResponse is the /v1/models response.

type ModelObject

type ModelObject struct {
	ID           string `json:"id"`
	Object       string `json:"object"`
	Created      int64  `json:"created"`
	OwnedBy      string `json:"owned_by"`
	Architecture string `json:"architecture,omitempty"`
}

ModelObject represents a model in the /v1/models response.

type Server

type Server struct {
	// contains filtered or unexported fields
}

Server wraps a loaded model and serves OpenAI-compatible HTTP endpoints.

func NewServer

func NewServer(m *inference.Model, opts ...ServerOption) *Server

NewServer creates a Server for the given model.

func (*Server) Close

func (s *Server) Close(_ context.Context) error

Close implements shutdown.Closer for graceful shutdown integration.

func (*Server) Handler

func (s *Server) Handler() http.Handler

Handler returns the HTTP handler for this server.

type ServerMetrics

type ServerMetrics struct {
	// contains filtered or unexported fields
}

ServerMetrics records serving metrics using a runtime.Collector.

func NewServerMetrics

func NewServerMetrics(c runtime.Collector) *ServerMetrics

NewServerMetrics creates a ServerMetrics backed by the given collector.

func (*ServerMetrics) RecordRequest

func (m *ServerMetrics) RecordRequest(tokens int, latency time.Duration)

RecordRequest records a completed request's metrics.

type ServerOption

type ServerOption func(*Server)

ServerOption configures the server.

func WithBatchScheduler

func WithBatchScheduler(bs *BatchScheduler) ServerOption

WithBatchScheduler attaches a batch scheduler for non-streaming requests. When set, incoming completion requests are routed through the scheduler to be grouped into batches for higher throughput.

func WithDraftModel

func WithDraftModel(draft *inference.Model) ServerOption

WithDraftModel enables speculative decoding using the given draft model. When set, completion requests use speculative decode with the draft model proposing tokens and the target model verifying them.

func WithLogger

func WithLogger(l log.Logger) ServerOption

WithLogger sets the logger for request logging.

func WithMetrics

func WithMetrics(c runtime.Collector) ServerOption

WithMetrics sets the metrics collector for token rate and request tracking.

type UsageInfo

type UsageInfo struct {
	PromptTokens     int `json:"prompt_tokens"`
	CompletionTokens int `json:"completion_tokens"`
	TotalTokens      int `json:"total_tokens"`
}

UsageInfo reports token counts.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL