schema

package
v0.0.11

Published: Feb 1, 2026 License: Apache-2.0 Imports: 6 Imported by: 0

Documentation

Index

Constants

const (
	CompletionFinishReasonMaxTokens = "max_tokens"
	CompletionFinishReasonStop      = "stop"
	CompletionFinishReasonEOS       = "eos"
)

const (
	CompletionStreamDeltaType = "completion.delta"
	CompletionStreamDoneType  = "completion.done"
	CompletionStreamErrorType = "completion.error"

	ModelPullProgressType = "model.pull.progress"
	ModelPullCompleteType = "model.pull.complete"
	ModelPullErrorType    = "model.pull.error"
)
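
These constants tag events on streaming endpoints. A minimal dispatch sketch, assuming a hypothetical event envelope with Type and Data fields (this package defines the type strings, not the envelope):

import (
	"encoding/json"
	"fmt"
)

// streamEvent is an assumed wrapper; adjust to the actual wire format.
type streamEvent struct {
	Type string          `json:"type"`
	Data json.RawMessage `json:"data"`
}

// handleEvent decodes one streamed completion event.
func handleEvent(ev streamEvent) error {
	switch ev.Type {
	case CompletionStreamDeltaType:
		var chunk CompletionChunk
		if err := json.Unmarshal(ev.Data, &chunk); err != nil {
			return err
		}
		fmt.Print(chunk.Text) // emit each delta as it arrives
	case CompletionStreamDoneType:
		fmt.Println()
	case CompletionStreamErrorType:
		return fmt.Errorf("stream error: %s", ev.Data)
	}
	return nil
}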

Variables

This section is empty.

Functions

func SpanName

func SpanName(args ...string) string

SpanName returns a span name consisting of the given parts prefixed with "llamacpp" and joined by periods. For example, SpanName("ListModels") returns "llamacpp.ListModels", and SpanName("Model", "Load") returns "llamacpp.Model.Load".

Types

type CachedModel

type CachedModel struct {
	ServerModel
	Model
	LoadedAt time.Time     `json:"loaded_at,omitzero"`
	Runtime  *ModelRuntime `json:"runtime,omitempty"`
}

CachedModel represents a model loaded in memory. For server builds, it embeds ServerModel, which provides the model Handle and an RWMutex.

func (CachedModel) String

func (m CachedModel) String() string

type ChatChunk added in v0.0.5

type ChatChunk struct {
	Message ChatMessage `json:"message"`
}

ChatChunk contains a streamed chat chunk.

func (ChatChunk) String added in v0.0.5

func (r ChatChunk) String() string

type ChatMessage added in v0.0.5

type ChatMessage struct {
	Role    string `json:"role"`    // "system", "user", "assistant", or "tool"
	Content string `json:"content"` // The message content
}

ChatMessage represents a single message in a conversation.

type ChatRequest added in v0.0.5

type ChatRequest struct {
	CompletionRequest
	Messages []ChatMessage `json:"messages"`
}

ChatRequest contains parameters for chat completion. It embeds CompletionRequest to reuse sampling and model options.
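
Sampling and model options are set through the embedded CompletionRequest; a construction sketch (the model name is illustrative):

temp := float32(0.7)
req := ChatRequest{
	CompletionRequest: CompletionRequest{
		Model:       "qwen2.5-7b-instruct", // illustrative model name
		Temperature: &temp,
	},
	Messages: []ChatMessage{
		{Role: "system", Content: "You are a helpful assistant."},
		{Role: "user", Content: "Hello!"},
	},
}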

func (ChatRequest) String added in v0.0.5

func (r ChatRequest) String() string

type ChatResponse added in v0.0.5

type ChatResponse struct {
	Model        string       `json:"model"`                   // Model used
	Thinking     *ChatMessage `json:"thinking,omitempty"`      // Optional reasoning message
	Message      ChatMessage  `json:"message"`                 // Assistant message
	Usage        Usage        `json:"usage"`                   // Token usage
	FinishReason string       `json:"finish_reason,omitempty"` // Reason generation ended
}

ChatResponse contains the generated assistant message.
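
A sketch of checking why generation ended, assuming finish_reason carries one of the CompletionFinishReason* values from Constants:

if resp.FinishReason == CompletionFinishReasonMaxTokens {
	// generation hit the token limit; consider raising MaxTokens
}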

func (ChatResponse) String added in v0.0.5

func (r ChatResponse) String() string

type CompletionChunk

type CompletionChunk struct {
	Text string `json:"text"` // Chunk text
}

CompletionChunk contains a streamed completion chunk.

func (CompletionChunk) String

func (r CompletionChunk) String() string

type CompletionRequest

type CompletionRequest struct {
	Model         string   `json:"model"`                    // Model name
	Prompt        string   `json:"prompt"`                   // Prompt to complete
	MaxTokens     *int32   `json:"max_tokens,omitempty"`     // Max tokens to generate
	Temperature   *float32 `json:"temperature,omitempty"`    // Sampling temperature
	TopP          *float32 `json:"top_p,omitempty"`          // Nucleus sampling
	TopK          *int32   `json:"top_k,omitempty"`          // Top-k sampling
	RepeatPenalty *float32 `json:"repeat_penalty,omitempty"` // Penalize repeats (1.0 = disabled)
	RepeatLastN   *int32   `json:"repeat_last_n,omitempty"`  // Repeat penalty window size
	Seed          *uint32  `json:"seed,omitempty"`           // RNG seed
	Stop          []string `json:"stop,omitempty"`           // Stop words
	PrefixCache   *bool    `json:"prefix_cache,omitempty"`   // Enable prefix caching
}

CompletionRequest contains parameters for text completion.
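
The optional fields are pointers so that nil means "use the server default". A construction sketch (the generic ptr helper is not part of this package):

func ptr[T any](v T) *T { return &v }

req := CompletionRequest{
	Model:     "my-model.gguf", // illustrative name
	Prompt:    "Once upon a time",
	MaxTokens: ptr(int32(128)),
	TopP:      ptr(float32(0.9)),
	Seed:      ptr(uint32(42)),
	Stop:      []string{"\n\n"},
}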

func (CompletionRequest) String

func (r CompletionRequest) String() string

type CompletionResponse

type CompletionResponse struct {
	Model        string `json:"model"`                   // Model used
	Text         string `json:"text"`                    // Completion text
	Usage        Usage  `json:"usage"`                   // Token usage
	FinishReason string `json:"finish_reason,omitempty"` // Reason generation ended
}

CompletionResponse contains the generated completion.

func (CompletionResponse) String

func (r CompletionResponse) String() string

type ContextRequest

type ContextRequest struct {
	LoadModelRequest
	ContextSize   *uint32 `json:"context_size,omitempty"`   // Context size (nil = from model)
	BatchSize     *uint32 `json:"batch_size,omitempty"`     // Logical batch size (nil = default)
	UBatchSize    *uint32 `json:"ubatch_size,omitempty"`    // Physical/micro batch size (nil = default, must equal batch_size for encoder models)
	Threads       *int32  `json:"threads,omitempty"`        // Number of threads (nil = default)
	AttentionType *int32  `json:"attention_type,omitempty"` // Attention type: -1=auto, 0=causal, 1=non-causal (nil = auto)
	FlashAttn     *int32  `json:"flash_attn,omitempty"`     // Flash attention: -1=auto, 0=disabled, 1=enabled (nil = auto)
	Embeddings    *bool   `json:"embeddings,omitempty"`     // Enable embeddings extraction (nil = false)
	KVUnified     *bool   `json:"kv_unified,omitempty"`     // Use unified KV cache (nil = default, required for BERT)
}

ContextRequest contains parameters for creating an inference context.
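
For example, a 4096-token context with embeddings enabled (a sketch reusing the ptr helper from the CompletionRequest example):

ctxReq := ContextRequest{
	LoadModelRequest: LoadModelRequest{Name: "my-model.gguf"}, // illustrative
	ContextSize:      ptr(uint32(4096)),
	Embeddings:       ptr(true), // enable embedding extraction
}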

func (ContextRequest) String

func (r ContextRequest) String() string

type DetokenizeRequest

type DetokenizeRequest struct {
	Model          string  `json:"model"`                     // Model name or path (must be loaded)
	Tokens         []Token `json:"tokens"`                    // Tokens to detokenize
	RemoveSpecial  *bool   `json:"remove_special,omitempty"`  // Remove BOS/EOS tokens (default: false)
	UnparseSpecial *bool   `json:"unparse_special,omitempty"` // Render special tokens as text (default: true)
}

DetokenizeRequest contains parameters for detokenizing tokens.

func (DetokenizeRequest) String

func (r DetokenizeRequest) String() string

type DetokenizeResponse

type DetokenizeResponse struct {
	Text string `json:"text"`
}

DetokenizeResponse contains the result of detokenization.

func (DetokenizeResponse) String

func (r DetokenizeResponse) String() string

type EmbedRequest

type EmbedRequest struct {
	Model     string   `json:"model"`               // Model name
	Input     []string `json:"input"`               // Text(s) to embed
	Normalize *bool    `json:"normalize,omitempty"` // L2-normalize embeddings (default: true)
}

EmbedRequest contains parameters for generating embeddings.

func (EmbedRequest) String

func (r EmbedRequest) String() string

type EmbedResponse

type EmbedResponse struct {
	Model      string      `json:"model"`      // Model used
	Embeddings [][]float32 `json:"embeddings"` // One embedding vector per input
	Dimension  int         `json:"dimension"`  // Embedding dimension
	Usage      Usage       `json:"usage"`      // Token usage
}

EmbedResponse contains the generated embeddings.
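
A sketch of building a request and reading the result (how the response is obtained from the server is outside this package):

req := EmbedRequest{
	Model: "nomic-embed-text", // illustrative model name
	Input: []string{"hello", "world"},
}
// ...send req to the server, obtaining resp EmbedResponse...
for i, vec := range resp.Embeddings {
	fmt.Printf("input %d: %d dims\n", i, len(vec)) // len(vec) == resp.Dimension
}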

func (EmbedResponse) String

func (r EmbedResponse) String() string

type GPUDevice

type GPUDevice struct {
	ID               int32  `json:"id"`
	Name             string `json:"name"`
	FreeMemoryBytes  int64  `json:"free_memory_bytes"`  // -1 if unknown
	TotalMemoryBytes int64  `json:"total_memory_bytes"` // -1 if unknown
}

GPUDevice represents information about a single GPU device.

func (GPUDevice) String

func (d GPUDevice) String() string

type GPUInfo

type GPUInfo struct {
	Backend string      `json:"backend"` // "Metal", "CUDA", "Vulkan", "CPU"
	Devices []GPUDevice `json:"devices"`
}

GPUInfo represents the GPU/accelerator configuration.

func (GPUInfo) String

func (i GPUInfo) String() string

type LoadModelRequest

type LoadModelRequest struct {
	Name   string `json:"name"`                 // Model name or path to load
	Load   *bool  `json:"load,omitempty"`       // Load (true) or unload (false) model (nil = load)
	Gpu    *int32 `json:"gpu,omitempty"`        // Main GPU index (nil = default)
	Layers *int32 `json:"gpu_layers,omitempty"` // Number of layers to offload to GPU (nil = default, -1 = all)
	Mmap   *bool  `json:"use_mmap,omitempty"`   // Use memory mapping for model loading (nil = default)
	Mlock  *bool  `json:"use_mlock,omitempty"`  // Lock model in memory (nil = default)
}

LoadModelRequest contains the parameters for loading a model into memory.
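
Because Load is a pointer, the same type expresses both loading and unloading (a sketch; the name is illustrative, ptr as above):

load := LoadModelRequest{
	Name:   "my-model.gguf",
	Layers: ptr(int32(-1)), // offload all layers to the GPU
}
unload := LoadModelRequest{
	Name: "my-model.gguf",
	Load: ptr(false), // false unloads; nil or true loads
}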

func (LoadModelRequest) String

func (r LoadModelRequest) String() string

type Model

type Model struct {
	// Identity
	Path         string `json:"path,omitempty"`
	Name         string `json:"name,omitempty"`
	Architecture string `json:"architecture,omitempty"`
	Description  string `json:"description,omitempty"`

	// Chat template
	ChatTemplate string `json:"chatTemplate,omitempty"`

	// Dimensions
	ContextSize   int32 `json:"contextSize,omitempty"`
	EmbeddingSize int32 `json:"embeddingSize,omitempty"`
	LayerCount    int32 `json:"layerCount,omitempty"`
	HeadCount     int32 `json:"headCount,omitempty"`
	HeadKVCount   int32 `json:"headKVCount,omitempty"`

	// Raw metadata key/value pairs from the model
	Meta map[string]any `json:"meta,omitempty"`
}

Model represents model metadata and capabilities (excluding load params).

func NewModelFromGGUF

func NewModelFromGGUF(basePath, relPath string, ctx *gguf.Context) (*Model, error)

NewModelFromGGUF builds a schema Model from a GGUF file context. The relPath is the relative path from basePath to the model file. This is a lightweight way to get model metadata without loading the full model.
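
A usage sketch, assuming the gguf package provides some way to open a file and obtain a *gguf.Context (the gguf.Open call below is hypothetical):

ctx, err := gguf.Open(filepath.Join(base, rel)) // hypothetical constructor
if err != nil {
	return err
}
m, err := NewModelFromGGUF(base, rel, ctx)
if err != nil {
	return err
}
fmt.Println(m.Name, m.Architecture)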

func (Model) String

func (m Model) String() string

type ModelPullProgress

type ModelPullProgress struct {
	Filename      string  `json:"model"`
	BytesReceived uint64  `json:"bytes_received"`
	TotalBytes    uint64  `json:"total_bytes,omitempty"`
	Percentage    float64 `json:"percent,omitempty"`
}

ModelPullProgress represents progress information during a model download.

type ModelRuntime

type ModelRuntime struct {
	NLayer    int32  `json:"layerCount,omitempty"`
	NHead     int32  `json:"headCount,omitempty"`
	NHeadKV   int32  `json:"headKVCount,omitempty"`
	NEmbd     int32  `json:"embeddingSize,omitempty"`
	NCtxTrain int32  `json:"contextSize,omitempty"`
	NParams   uint64 `json:"paramCount,omitempty"`
	ModelSize uint64 `json:"modelSizeBytes,omitempty"`
}

ModelRuntime represents runtime statistics for a loaded model.

func (ModelRuntime) String added in v0.0.3

func (m ModelRuntime) String() string

type PullModelRequest

type PullModelRequest struct {
	URL string `json:"url"` // URL to download the model from (supports hf:// and https://)
}

PullModelRequest contains the parameters for downloading a model from a URL.
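
For example (the hf:// path layout is illustrative), paired with the model.pull.* stream types from Constants and the streamEvent envelope assumed there:

pull := PullModelRequest{URL: "hf://org/repo/model.gguf"}

switch ev.Type {
case ModelPullProgressType:
	var p ModelPullProgress
	if err := json.Unmarshal(ev.Data, &p); err != nil {
		return err
	}
	fmt.Printf("%s: %.1f%%\n", p.Filename, p.Percentage)
case ModelPullCompleteType:
	fmt.Println("download complete")
case ModelPullErrorType:
	return fmt.Errorf("pull failed: %s", ev.Data)
}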

func (PullModelRequest) String

func (r PullModelRequest) String() string

type ServerModel added in v0.0.3

type ServerModel struct {
	sync.RWMutex
	Handle *llamacpp.Model
}

ServerModel represents a model loaded in memory on the server. It includes the C model handle and synchronization primitives.
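
A caller might take the read lock for concurrent inference and the write lock when swapping or unloading the handle; a sketch:

// withModel runs fn while holding the shared (read) lock.
func withModel(m *ServerModel, fn func(h *llamacpp.Model) error) error {
	m.RLock()
	defer m.RUnlock()
	return fn(m.Handle)
}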

type Token added in v0.0.3

type Token = int32

Token is a token ID (a type alias for int32).

type TokenizeRequest

type TokenizeRequest struct {
	Model        string `json:"model"`                   // Model name or path (must be loaded)
	Text         string `json:"text"`                    // Text to tokenize
	AddSpecial   *bool  `json:"add_special,omitempty"`   // Add BOS/EOS tokens (default: true)
	ParseSpecial *bool  `json:"parse_special,omitempty"` // Parse special tokens in text (default: false)
}

TokenizeRequest contains parameters for tokenizing text.

func (TokenizeRequest) String

func (r TokenizeRequest) String() string

type TokenizeResponse

type TokenizeResponse struct {
	Tokens []Token `json:"tokens"`
}

TokenizeResponse contains the result of tokenization.
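
Tokenization round-trips through DetokenizeRequest; a sketch (transport omitted, model name illustrative):

tok := TokenizeRequest{Model: "my-model.gguf", Text: "Hello, world"}
// ...send tok, obtaining tokResp TokenizeResponse...
detok := DetokenizeRequest{
	Model:  tok.Model,
	Tokens: tokResp.Tokens,
}
// The resulting DetokenizeResponse.Text should match the input, modulo
// any BOS/EOS tokens added by add_special.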

func (TokenizeResponse) String

func (r TokenizeResponse) String() string

type Usage

type Usage struct {
	InputTokens  int `json:"input_tokens"`  // Tokens in input (prompt/text to embed)
	OutputTokens int `json:"output_tokens"` // Tokens generated (0 for embeddings)
}

Usage tracks token usage for requests.

func (Usage) TotalTokens

func (u Usage) TotalTokens() int

TotalTokens returns the sum of input and output tokens.
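
For example:

u := Usage{InputTokens: 12, OutputTokens: 34}
_ = u.TotalTokens() // 46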
