Documentation ¶
Index ¶
- Variables
- func ApplySuppressTokens(logits []float32, suppress []int32)
- func CheckMemoryBudget(modelPath string, cfg ModelConfig, requestedSeqLen int) (int, error)
- func EstimateLayerBytes(l *Layer) int64
- func EstimateRuntimeBytes(cfg ModelConfig, seqLen int) int64
- func FormatChat(cfg ModelConfig, system, user string) string
- func FormatMessages(cfg ModelConfig, messages []Message, opts ...FormatOptions) string
- func Forward(m *Model, token int32, pos int, kv *memory.MultiLayerKVCache, rs *RunState) []float32
- func ForwardAttention(layer *Layer, rs *RunState, kv *memory.MultiLayerKVCache, ...)
- func ForwardBatch(m *Model, tokens []int32, startPos int, kv *memory.MultiLayerKVCache, ...) []float32
- func ForwardFromLayer(m *Model, startLayer, pos int, kv *memory.MultiLayerKVCache, rs *RunState) []float32
- func ForwardMLA(layer *Layer, rs *RunState, kv *memory.MultiLayerKVCache, l, pos int, ...)
- func ForwardMoEFFN(layer *Layer, rs *RunState, input []float32, cfg ModelConfig, pool *blas.Pool)
- func ForwardMoEFFNDispatch(layer *Layer, rs *RunState, input []float32, cfg ModelConfig, pool *blas.Pool)
- func ForwardMoEFFN_OAI(layer *Layer, rs *RunState, input []float32, cfg ModelConfig, pool *blas.Pool)
- func ForwardRange(m *Model, token int32, pos, startLayer, endLayer int, ...) []float32
- func ForwardSSMLayer(layer *Layer, rs *RunState, ssm *SSMRunState, ssmState *memory.SSMLayerState, ...) []float32
- func PinLayerToRAM(l *Layer)
- func RegisterArchitecture(name string, desc ArchDescriptor)
- func TrimStopText(text string, cfg ModelConfig) string
- type ArchDescriptor
- type BatchState
- type CoreKind
- type FFNKind
- type FormatOptions
- type GenerateConfig
- type GenerateResult
- type Layer
- type LayerSpec
- type Message
- type Model
- type ModelConfig
- type NormKind
- type Pipeline
- func (p *Pipeline) Chat(system, user string, cfg GenerateConfig) (string, float64, error)
- func (p *Pipeline) ChatMessages(messages []Message, cfg GenerateConfig) (string, float64, error)
- func (p *Pipeline) FreeForGPU()
- func (p *Pipeline) Generate(prompt []int32, cfg GenerateConfig) ([]int32, error)
- func (p *Pipeline) GenerateDetailed(prompt string, cfg GenerateConfig) (*GenerateResult, error)
- func (p *Pipeline) GenerateText(prompt string, cfg GenerateConfig) (string, float64, error)
- func (p *Pipeline) GenerateTextWithStopStrings(prompt string, cfg GenerateConfig) (string, float64, error)
- func (p *Pipeline) RebuildBuffers()
- type ResKind
- type RunState
- func (rs *RunState) ApplyRoPEFast(vec []float32, pos int)
- func (rs *RunState) ApplyRoPEFastDim(vec []float32, pos, ropeDim int)
- func (rs *RunState) PrecomputeRoPE(maxSeqLen, ropeDim, headDim int, freqBase float32)
- func (rs *RunState) PrecomputeSWARoPE(maxSeqLen, ropeDim, headDim int, freqBase float32)
- func (rs *RunState) PrecomputeYaRNRoPE(maxSeqLen, ropeDim, headDim int, freqBase, factor float32, origMaxPos int, ...)
- func (rs *RunState) RoPETables() (cosTable, sinTable []float32)
- func (rs *RunState) RoPETablesSWA() (cosTable, sinTable []float32)
- func (rs *RunState) SetRopeNeox(neox bool)
- type SSMRunState
- type Tokenizer
- type WorkerPool
Constants ¶
This section is empty.
Variables ¶
var CPUDiag cpuDiagState
CPUDiag provides structured CPU forward-pass diagnostic logging, mirroring the GPU diagnostic system. Enable with DLGO_CPU_DIAG=1 or the same layer/position filter syntax as DLGO_GPU_DIAG.

Enable:

    DLGO_CPU_DIAG=1      (all layers, all positions)
    DLGO_CPU_DIAG=L0     (layer 0 only)
    DLGO_CPU_DIAG=L0:P0  (layer 0, position 0 only)
    DLGO_CPU_DIAG=L0-L5  (layers 0 through 5)

Verbosity:

    DLGO_CPU_DIAG_V=1  (default: print first 8 elements)
    DLGO_CPU_DIAG_V=2  (print first 16 elements + norms)
    DLGO_CPU_DIAG_V=3  (print first 32 elements + full stats)
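For example, to trace only layer 0 at position 0 with extra verbosity, set both variables in the environment before launching the process (the binary name here is illustrative):

    DLGO_CPU_DIAG=L0:P0 DLGO_CPU_DIAG_V=2 ./chat-demo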
var DebugForward bool
DebugForward enables per-layer norm prints for the current forward pass.
var DiagLayer, DiagPos int
DiagLayer and DiagPos track the current layer/position for functions that don't have direct access to them (e.g. MoE FFN dispatch). Only set when CPUDiag is enabled.
Functions ¶
func ApplySuppressTokens ¶
func ApplySuppressTokens(logits []float32, suppress []int32)
ApplySuppressTokens sets logits to -inf for suppressed EOG tokens.
func CheckMemoryBudget ¶
func CheckMemoryBudget(modelPath string, cfg ModelConfig, requestedSeqLen int) (int, error)
CheckMemoryBudget checks whether loading the given model at the requested context length will fit in available RAM. Returns an adjusted (possibly reduced) maxSeqLen and an error only if even the minimum context won't fit.
Model weights are memory-mapped and demand-paged by the OS — they do NOT consume heap RAM. Only runtime buffers (KV cache, RunState, BatchState) need actual RAM. The budget is: 85% of total physical RAM minus current usage. This ensures any model can load regardless of size; throughput degrades gracefully via mmap paging but the system never crashes.
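A usage sketch under stated assumptions (the llm import alias and the model path are illustrative):

    adjusted, err := llm.CheckMemoryBudget("/models/example.gguf", cfg, 32768)
    if err != nil {
        log.Fatalf("model will not fit even at minimum context: %v", err)
    }
    if adjusted < 32768 {
        log.Printf("context reduced to %d tokens to fit the RAM budget", adjusted)
    }
    // EstimateRuntimeBytes previews the heap cost of the runtime buffers alone.
    need := llm.EstimateRuntimeBytes(cfg, adjusted)
    log.Printf("runtime buffers: %.2f GiB", float64(need)/(1<<30))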
func EstimateLayerBytes ¶
func EstimateLayerBytes(l *Layer) int64
EstimateLayerBytes returns the approximate size in bytes of a layer's weight data.
func EstimateRuntimeBytes ¶
func EstimateRuntimeBytes(cfg ModelConfig, seqLen int) int64
EstimateRuntimeBytes estimates heap bytes needed for KV cache, RunState, and BatchState at a given context length, WITHOUT counting the model weights (which are memory-mapped and paged in by the OS on demand).
func FormatChat ¶
func FormatChat(cfg ModelConfig, system, user string) string
FormatChat formats a single-turn chat prompt (system + user) for the model.
func FormatMessages ¶
func FormatMessages(cfg ModelConfig, messages []Message, opts ...FormatOptions) string
FormatMessages formats a multi-turn conversation for the model.
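A sketch of multi-turn formatting. The Message field names (Role, Content) are assumptions, since they are not shown in these docs:

    think := false
    prompt := llm.FormatMessages(cfg, []llm.Message{
        {Role: "system", Content: "You are concise."}, // field names assumed
        {Role: "user", Content: "Summarize RoPE in one sentence."},
    }, llm.FormatOptions{
        ReasoningEffort: "low",
        EnableThinking:  &think, // disable <think> blocks on thinking models
    })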
func ForwardAttention ¶
func ForwardAttention(
    layer *Layer, rs *RunState, kv *memory.MultiLayerKVCache,
    l, pos, numHeads, numKVHeads, headDim, kvMul int,
    cfg ModelConfig, pool *blas.Pool,
)
ForwardAttention runs one full-attention layer. Writes the result to rs.AttnProj.
func ForwardBatch ¶
func ForwardBatch(m *Model, tokens []int32, startPos int, kv *memory.MultiLayerKVCache, rs *RunState, bs *BatchState) []float32
ForwardBatch processes multiple tokens in a single pass (prefill). Returns logits for the last position only. Fills the KV cache for all positions. If the number of tokens exceeds bs.maxPos, the prompt is processed in chunks of bs.maxPos, correctly maintaining KV cache and SSM state across chunks.
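The intended split between batched prefill and per-token decode looks roughly like this, with a loaded Model and allocated cache/state in scope (sampleGreedy is a hypothetical stand-in for a real sampler):

    // Prefill: one batched pass over the whole prompt fills the KV cache.
    logits := llm.ForwardBatch(m, promptTokens, 0, kv, rs, bs)
    pos := len(promptTokens)
    // Decode: one token at a time, reusing the cache.
    for i := 0; i < maxNewTokens; i++ {
        next := sampleGreedy(logits) // hypothetical sampler
        logits = llm.Forward(m, next, pos, kv, rs)
        pos++
    }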
func ForwardFromLayer ¶
func ForwardFromLayer(m *Model, startLayer, pos int, kv *memory.MultiLayerKVCache, rs *RunState) []float32
ForwardFromLayer resumes forward pass from a given layer. rs.X must already contain the hidden state from the previous layer.
func ForwardMLA ¶
func ForwardMLA(
    layer *Layer, rs *RunState, kv *memory.MultiLayerKVCache,
    l, pos int, cfg ModelConfig, pool *blas.Pool,
)
ForwardMLA runs one Multi-head Latent Attention layer (DeepSeek-V2/GLM-4). Uses absorbed key/value approach to avoid expanding K for every cached position.
func ForwardMoEFFN ¶
func ForwardMoEFFN(layer *Layer, rs *RunState, input []float32, cfg ModelConfig, pool *blas.Pool)
ForwardMoEFFN runs the Mixture-of-Experts FFN. Result written to rs.FFNOut.
func ForwardMoEFFNDispatch ¶
func ForwardMoEFFNDispatch(layer *Layer, rs *RunState, input []float32, cfg ModelConfig, pool *blas.Pool)
ForwardMoEFFNDispatch routes to the correct MoE implementation based on FFN type.
func ForwardMoEFFN_OAI ¶
func ForwardMoEFFN_OAI(layer *Layer, rs *RunState, input []float32, cfg ModelConfig, pool *blas.Pool)
ForwardMoEFFN_OAI runs the gpt-oss MoE with SOFTMAX_WEIGHT gating and SwiGLU_OAI activation.
func ForwardRange ¶
func ForwardRange(m *Model, token int32, pos, startLayer, endLayer int, kv *memory.MultiLayerKVCache, rs *RunState) []float32
ForwardRange performs a forward pass through layers [startLayer, endLayer). If startLayer == 0, token embedding is done. If endLayer == numLayers, the final norm and output projection are included.
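For example, one token's forward pass can be split into two layer ranges; this is a sketch, assuming split is a valid layer index and that, as with ForwardFromLayer, the second call resumes from the hidden state left in rs.X:

    // Layers [0, split): embeds the token and leaves the hidden state in rs.X.
    llm.ForwardRange(m, token, pos, 0, split, kv, rs)
    // Layers [split, NumLayers): resumes from rs.X and produces final logits.
    logits := llm.ForwardRange(m, token, pos, split, m.Config.NumLayers, kv, rs)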
func ForwardSSMLayer ¶
func ForwardSSMLayer(
    layer *Layer, rs *RunState, ssm *SSMRunState, ssmState *memory.SSMLayerState,
    xnorm []float32, cfg ModelConfig, pool *blas.Pool,
) []float32
ForwardSSMLayer runs one Gated Delta Net layer for single-token autoregressive inference.
Implements the recurrent delta rule with error correction:
    S[h]   = exp(g[h]) * S[h]                   // decay
    v_pred = S^T @ k                            // predict value from key
    delta  = v - v_pred                         // error signal
    S[h]  += sigmoid(beta[h]) * outer(k, delta) // error-corrected update
    out[h] = S^T @ (q / sqrt(headKDim))         // scaled output
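As a concrete illustration, a minimal single-head sketch of this recurrence over plain slices (imports: math; the names and row-major state layout are illustrative, not the package's internals):

    // deltaRuleStep performs one Gated Delta Net update for a single head.
    // S is the kDim x vDim state (row-major), q and k are [kDim], v is [vDim];
    // g is the log-decay and beta the raw learning-rate logit.
    func deltaRuleStep(S, q, k, v, out []float32, kDim, vDim int, g, beta float32) {
        decay := float32(math.Exp(float64(g)))
        lr := float32(1 / (1 + math.Exp(float64(-beta)))) // sigmoid(beta)
        scale := float32(1 / math.Sqrt(float64(kDim)))
        vPred := make([]float32, vDim)
        for i := 0; i < kDim; i++ { // decay state, then v_pred = S^T @ k
            row := S[i*vDim : (i+1)*vDim]
            for j := range row {
                row[j] *= decay
                vPred[j] += row[j] * k[i]
            }
        }
        for i := 0; i < kDim; i++ { // S += sigmoid(beta) * outer(k, v - v_pred)
            row := S[i*vDim : (i+1)*vDim]
            for j := range row {
                row[j] += lr * k[i] * (v[j] - vPred[j])
            }
        }
        for j := range out {
            out[j] = 0
        }
        for i := 0; i < kDim; i++ { // out = S^T @ (q / sqrt(kDim))
            row := S[i*vDim : (i+1)*vDim]
            qi := q[i] * scale
            for j := range row {
                out[j] += row[j] * qi
            }
        }
    }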
func PinLayerToRAM ¶
func PinLayerToRAM(l *Layer)
PinLayerToRAM copies a layer's weight data from the mmap'd region into heap-allocated slices, ensuring the layer stays in physical RAM and avoiding page faults during inference.
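For instance, pinning the first few layers (a plausible pattern, not one prescribed by the package):

    // Pin the first four layers so early-layer weights never page out.
    for i := range m.Layers[:4] {
        llm.PinLayerToRAM(&m.Layers[i])
    }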
func RegisterArchitecture ¶
func RegisterArchitecture(name string, desc ArchDescriptor)
RegisterArchitecture registers or overwrites an architecture descriptor. Use for extensibility when adding support for new model families.
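A sketch of registering a custom family ("myarch" is illustrative); unknown names otherwise fall back to the defaults described under GetArchDescriptor:

    llm.RegisterArchitecture("myarch", llm.ArchDescriptor{
        RopeNeox:       true,  // NeoX-style RoPE
        FFNGelu:        false, // SwiGLU FFN
        EmbedScaleMode: "none",
        ChatTemplate:   "chatml",
    })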
func TrimStopText ¶
func TrimStopText(text string, cfg ModelConfig) string
TrimStopText removes trailing stop strings and whitespace from generated text.
Types ¶
type ArchDescriptor ¶
type ArchDescriptor struct {
RopeNeox bool // true = NeoX-style RoPE, false = interleaved
FFNGelu bool // true = GeGLU (Gemma), false = SwiGLU (LLaMA/Qwen)
EmbedScaleMode string // "none" or "sqrt_dim"
ChatTemplate string // "chatml", "llama2", "llama3", "gemma", "phi"
SupportsThinking bool // true = model uses <think> blocks (Qwen3/3.5)
}
ArchDescriptor describes architecture-specific behavior for an LLM.
func GetArchDescriptor ¶
func GetArchDescriptor(arch string) ArchDescriptor
GetArchDescriptor returns the descriptor for the given architecture. Unknown architectures receive a default descriptor (interleaved RoPE, SwiGLU, no embed scale, chatml).
type BatchState ¶
type BatchState struct {
XBatch []float32 // [maxPos * dim]
XNormBatch []float32 // [maxPos * dim]
QBatch []float32 // [maxPos * qDim]
KBatch []float32 // [maxPos * kvDim]
VBatch []float32 // [maxPos * kvDim]
AttnBatch []float32 // [maxPos * qDim]
ProjBatch []float32 // [maxPos * dim]
FFNInBatch []float32 // [maxPos * dim]
NormBatch []float32 // [maxPos * dim]
GateBatch []float32 // [maxPos * ffnDim]
UpBatch []float32 // [maxPos * ffnDim]
HidBatch []float32 // [maxPos * ffnDim]
FFNBatch []float32 // [maxPos * dim]
Q8Buf []byte // pre-allocated Q8 quantization buffer
// GatedQ attention: Wq outputs 2*qDim (interleaved Q + gate per head)
QFullBatch []float32 // [maxPos * 2*qDim]
QGateBatch []float32 // [maxPos * qDim]
// SSM batch buffers: matmuls batched across positions, recurrence per-position
SSMQKVBatch []float32 // [maxPos * qkvDim]
SSMZBatch []float32 // [maxPos * valueDim]
SSMAlphaBatch []float32 // [maxPos * numHeads]
SSMBetaBatch []float32 // [maxPos * numHeads]
SSMYBatch []float32 // [maxPos * valueDim]
// MoE batch buffers
MoERouterBatch []float32 // [maxPos * expertCount] router logits
MoEExpDim int // expert FFN hidden dim
MoEShDim int // shared expert FFN hidden dim
// Score buffers sized to maxSeqLen (full context) — each worker needs one
// score vector covering ALL positions for correct causal attention.
ScoreBufs [][]float32 // [numWorkers][maxSeqLen]
// KGather/VGather for SIMD batched attention.
// Layout: [numKVHeads * maxSeqLen * headDim]
// Nil when maxSeqLen is too large (SIMD disabled, non-SIMD path used instead).
KGather []float32
VGather []float32
// contains filtered or unexported fields
}
BatchState holds pre-allocated buffers for batch (prefill) forward passes.
func NewBatchState ¶
func NewBatchState(cfg ModelConfig, maxPos int, maxSeqLenHint ...int) *BatchState
NewBatchState allocates batch buffers for up to maxPos positions. maxSeqLenHint (optional) specifies the total context length for score/gather buffer sizing; defaults to maxPos when not provided.
type FormatOptions ¶
type FormatOptions struct {
ReasoningEffort string // "low", "medium", "high" (default: "medium")
EnableThinking *bool // nil = auto (enabled for thinking models), false = disable
}
FormatOptions controls template-level formatting (e.g. reasoning effort).
type GenerateConfig ¶
type GenerateConfig struct {
MaxTokens int
Sampler ops.SamplerConfig
Seed int64
Stream func(token string) // called for each generated token (nil = no streaming)
Grammar *grammar.Grammar // optional grammar constraint (nil = unconstrained)
ThinkingPhase bool // suppress EOS during <think> block (set automatically by Chat/GenerateText)
}
GenerateConfig controls text generation behavior.
func DefaultGenerateConfig ¶
func DefaultGenerateConfig() GenerateConfig
DefaultGenerateConfig returns sensible defaults.
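A streaming sketch, assuming p is an initialized *Pipeline:

    cfg := llm.DefaultGenerateConfig()
    cfg.MaxTokens = 256
    cfg.Stream = func(tok string) { fmt.Print(tok) } // print each token as it arrives
    text, tokPerSec, err := p.GenerateText("Explain KV caching briefly.", cfg)
    if err != nil {
        log.Fatal(err)
    }
    fmt.Printf("\n%.1f tok/s (%d chars)\n", tokPerSec, len(text))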
type GenerateResult ¶
type GenerateResult struct {
Text string
Tokens []int32
TokensPerSec float64
PrefillTimeMs float64
GenerateTimeMs float64
TotalTokens int
PromptTokens int
}
GenerateResult holds detailed output from a generation run.
type Layer ¶
type Layer struct {
Spec LayerSpec // architectural choices, resolved at load time
// Attention
AttnNorm []float32 // [dim] norm weight
AttnNormBias []float32 // [dim] optional LayerNorm bias (Phi-2)
Wq *core.QuantizedTensor // [qDim × dim]
Wk *core.QuantizedTensor // [kvDim × dim]
Wv *core.QuantizedTensor // [kvDim × dim]
Wo *core.QuantizedTensor // [dim × qDim]
Bq []float32 // [qDim] optional (Qwen)
Bk []float32 // [kvDim] optional
Bv []float32 // [kvDim] optional
Bo []float32 // [dim] optional attn_output bias (Phi-2)
AttnQNorm []float32 // [headDim] optional QK norm (Qwen3/Gemma3)
AttnKNorm []float32 // [headDim] optional QK norm (Qwen3/Gemma3)
AttnGate *core.QuantizedTensor // [dim × dim] optional gated attention (Qwen3.5)
AttnSinks []float32 // [numKVHeads] attention sink weights (gpt-oss)
PostAttnNorm []float32 // [dim] optional post-attention norm (Gemma 3)
FFNNorm []float32 // [dim] norm weight (nil = parallel attn+FFN)
FFNGate *core.QuantizedTensor // [ffnDim × dim] w1 (nil = plain MLP)
FFNUp *core.QuantizedTensor // [ffnDim × dim] w3
FFNDown *core.QuantizedTensor // [dim × ffnDim] w2
FFNUpBias []float32 // [ffnDim] optional (Phi-2)
FFNDownBias []float32 // [dim] optional (Phi-2)
PostFFNNorm []float32 // [dim] optional post-FFN norm (Gemma 3)
// MoE (Mixture of Experts) — packed expert weights
FFNRouter *core.QuantizedTensor // [expertCount × dim] router/gating network
FFNGateExps *core.QuantizedTensor // [expertCount*expertFFNDim × dim] packed expert gate
FFNUpExps *core.QuantizedTensor // [expertCount*expertFFNDim × dim] packed expert up
FFNGateUpExps *core.QuantizedTensor // [expertCount*2*expertFFNDim × dim] fused gate+up (interleaved per expert)
FFNDownExps *core.QuantizedTensor // [expertCount*dim × expertFFNDim] packed expert down
FFNGateExpsBias []float32 // [expertCount*expertFFNDim] packed expert gate bias
FFNUpExpsBias []float32 // [expertCount*expertFFNDim] packed expert up bias
FFNDownExpsBias []float32 // [expertCount*dim] packed expert down bias
FFNRouterBias []float32 // [expertCount] router logit bias (DeepSeek-V2/GLM-4)
// MLA (Multi-head Latent Attention) — DeepSeek-V2/GLM-4
WqA *core.QuantizedTensor // [qLORARank × dim] Q down-projection
WqANorm []float32 // [qLORARank] norm between Q projections
WqB *core.QuantizedTensor // [numHeads*(qkNope+qkRope) × qLORARank] Q up-projection
WkvA *core.QuantizedTensor // [kvLORARank+qkRope × dim] KV down-projection (includes rope keys)
WkvANorm []float32 // [kvLORARank] norm for KV compressed
WkB *core.QuantizedTensor // [numHeads*qkNope × kvLORARank] K up-projection (3D packed per head)
WvB *core.QuantizedTensor // [numHeads*vHeadDim × kvLORARank] V up-projection (3D packed per head)
// Gated Delta Net weights — only for Qwen3.5 linear attention layers
SSMInProj *core.QuantizedTensor // [dim × qkvDim] fused QKV in-projection (stored via attn_qkv when not standard attention)
SSMConv1dW []float32 // [channels × convKernel] depthwise conv weights (flat)
SSMA []float32 // [numHeads] log(-A) decay parameter
SSMAlpha *core.QuantizedTensor // [dim × numHeads] dt/alpha projection
SSMBeta *core.QuantizedTensor // [dim × numHeads] beta/learning-rate projection
SSMFusedBA *core.QuantizedTensor // [dim × 2*numHeads] fused beta+alpha (interleaved per KV group)
SSMDtBias []float32 // [numHeads] dt bias
SSMNorm []float32 // [headVDim] per-head RMSNorm weight (shared across heads)
SSMOut *core.QuantizedTensor // [dim × valueDim] output projection
}
Layer holds the weights for one transformer block.
type LayerSpec ¶
type LayerSpec struct {
Norm NormKind
Core CoreKind
Residual ResKind
FFN FFNKind
GatedQ bool // Fused Q+gate projection (Qwen3.5 attention layers)
QKNorm bool // Per-head QK normalization (Gemma 3, Qwen3)
SlidingWindow int // >0 = use sliding window attention with this size
}
LayerSpec captures all architectural choices for one transformer layer. Resolved once at load time from tensor presence; the forward pass dispatches on these fields via switch statements that compile to jump tables.
type Model ¶
type Model struct {
Config ModelConfig
TokenEmbed *core.QuantizedTensor // [vocabSize × dim]
OutputNorm []float32 // [dim]
OutputNormBias []float32 // [dim] optional LayerNorm bias (Phi-2)
Output *core.QuantizedTensor // [vocabSize × dim] (may tie with TokenEmbed)
OutputBias []float32 // [vocabSize] optional
Layers []Layer
MmapFile *mmap.MappedFile // underlying mmap'd GGUF file (nil if loaded via ReadAt)
}
Model holds all weights for a decoder-only transformer LLM.
type ModelConfig ¶
type ModelConfig struct {
Architecture string
VocabSize int
ContextLength int
EmbeddingDim int
NumLayers int
FFNDim int
NumHeads int
NumKVHeads int
HeadDim int
RMSNormEps float32
RopeFreqBase float32
RopeNeox bool
RopeDim int // partial RoPE: 0 = full headDim, else only first RopeDim dims
RopeScaleType int // 0=none, 1=linear, 2=yarn
RopeScaleFactor float32 // scaling factor for extended context
RopeOrigMaxPos int // original max position embeddings (for YaRN)
RopeYaRNBetaFast float32 // YaRN beta_fast (default 32)
RopeYaRNBetaSlow float32 // YaRN beta_slow (default 1)
RopeYaRNExtFactor float32 // YaRN ext_factor: 0=disable ramp, 1=full ramp (default 1)
RopeYaRNAttnFactor float32 // YaRN attn_factor: magnitude scaling base (default 1)
RopeFreqBaseSWA float32 // Separate RoPE frequency base for SWA layers (0 = same as RopeFreqBase)
SlidingWindow int // sliding window attention size (0 = disabled)
SlidingWindowPattern int // 0=all layers, N=alternating (every Nth layer is full)
BOS int32
EOS int32
StopTokens []int32
SuppressTokens []int32 // logit = -inf before sampling (matching llama.cpp's EOG bias)
AddBOS bool
FFNGelu bool // true = GeGLU (Gemma), false = SwiGLU (LLaMA/Qwen)
EmbedScale float32 // non-zero = scale embeddings (Gemma: sqrt(dim))
ChatTemplate string // chat format: "chatml", "llama2", "llama3", "gemma", "phi"
// Gemma 2 soft-capping
AttnLogitSoftcap float32 // 0=disabled; >0 = tanh(logit/cap)*cap before softmax
FinalLogitSoftcap float32 // 0=disabled; >0 = tanh(logit/cap)*cap on final logits
// Qwen3.5 hybrid Mamba/Attention
FullAttentionInterval int // 0 = all attention; N = every Nth layer is attention
SSMConvKernel int
SSMInnerSize int
SSMStateSize int
SSMTimeStepRank int
SSMGroupCount int
SSMTiledVOrder bool // true = GGUF V heads in tiled order (Qwen3.5); false = grouped (Qwen3Next)
// MoE (Mixture of Experts)
ExpertCount int // 0 = dense (no MoE); >0 = number of experts per layer
ExpertUsedCount int // top-K experts selected per token
ExpertFFNDim int // hidden dim per expert
ExpertGatingFunc int // 0=none, 1=softmax, 2=sigmoid, 3=softmax_weight (top-k raw then softmax)
ExpertWeightsNorm bool // normalize selected expert weights by sum
ExpertWeightsScale float32 // scale factor for expert weights (0 = no scaling)
// MLA (Multi-head Latent Attention) — DeepSeek-V2/GLM-4
QLORARank int // Q compression rank (attn_q_a output dim)
KVLORARank int // KV compression rank (compressed KV without rope)
QKNopeDim int // Non-positional K dim per head (k_b output per head)
QKRopeDim int // Positional (RoPE) K dim per head
VHeadDim int // V dim per head (v_b output per head)
LeadingDenseCount int // Number of initial dense layers before MoE
}
ModelConfig holds all architecture parameters for a decoder-only transformer LLM. Auto-populated from GGUF metadata.
func ParseConfig ¶
func ParseConfig(md map[string]interface{}) (ModelConfig, error)
ParseConfig extracts a ModelConfig from GGUF metadata.
type Pipeline ¶
type Pipeline struct {
Model *Model
Tokenizer *Tokenizer
KVCache *memory.MultiLayerKVCache
RunState *RunState
BatchState *BatchState
MaxSeqLen int
}
Pipeline bundles a loaded model, tokenizer, KV cache, and run state for inference.
func NewPipeline ¶
NewPipeline loads a GGUF model and creates a ready-to-use inference pipeline with automatic tokenizer extraction from GGUF metadata.
func (*Pipeline) Chat ¶
func (p *Pipeline) Chat(system, user string, cfg GenerateConfig) (string, float64, error)
Chat formats a user message (with optional system prompt) using the model's chat template, then generates a response. Returns generated text and tok/s.
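End to end, usage looks roughly like this; the NewPipeline parameters shown are assumptions, since its signature is not listed in these docs:

    p, err := llm.NewPipeline("/models/example.gguf", 8192) // parameters assumed
    if err != nil {
        log.Fatal(err)
    }
    reply, tokPerSec, err := p.Chat("You are helpful.", "What is RoPE?", llm.DefaultGenerateConfig())
    if err != nil {
        log.Fatal(err)
    }
    fmt.Printf("%s\n(%.1f tok/s)\n", reply, tokPerSec)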
func (*Pipeline) ChatMessages ¶
func (p *Pipeline) ChatMessages(messages []Message, cfg GenerateConfig) (string, float64, error)
ChatMessages formats a multi-turn conversation and generates the assistant's next response. Returns generated text and tok/s.
func (*Pipeline) FreeForGPU ¶
func (p *Pipeline) FreeForGPU()
FreeForGPU releases CPU-side KV cache, RunState, and BatchState that are unused when a GPU pipeline handles all inference. Only Model, Tokenizer, and MaxSeqLen are retained (needed by the scheduler for tokenization and config lookups). Call this after the GPU pipeline is successfully created.
func (*Pipeline) Generate ¶
func (p *Pipeline) Generate(prompt []int32, cfg GenerateConfig) ([]int32, error)
Generate produces text from a prompt using the loaded model.
func (*Pipeline) GenerateDetailed ¶
func (p *Pipeline) GenerateDetailed(prompt string, cfg GenerateConfig) (*GenerateResult, error)
GenerateDetailed is like GenerateText but returns detailed timing information.
func (*Pipeline) GenerateText ¶
func (p *Pipeline) GenerateText(prompt string, cfg GenerateConfig) (string, float64, error)
GenerateText is a convenience method that takes a text prompt, encodes it, generates tokens, and decodes the result. Returns the generated text and tokens-per-second throughput.
func (*Pipeline) GenerateTextWithStopStrings ¶
func (p *Pipeline) GenerateTextWithStopStrings(prompt string, cfg GenerateConfig) (string, float64, error)
GenerateTextWithStopStrings is like GenerateText but also handles text-level stop string detection for multi-token stop sequences.
func (*Pipeline) RebuildBuffers ¶
func (p *Pipeline) RebuildBuffers()
RebuildBuffers re-creates KV cache, RunState, and BatchState using the current MaxSeqLen. Used to restore CPU-side buffers after FreeForGPU when GPU pipeline creation fails and we fall back to CPU inference.
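The intended flow, sketched with a hypothetical GPU constructor (gpu.NewPipeline is a stand-in, not a real API in this package):

    // Optimistically free CPU-side buffers, then try to bring up the GPU pipeline.
    p.FreeForGPU()
    gp, err := gpu.NewPipeline(p.Model) // hypothetical GPU entry point
    if err != nil {
        p.RebuildBuffers() // GPU creation failed; restore buffers, fall back to CPU
    }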
type RunState ¶
type RunState struct {
X []float32 // [dim] current activation
XNorm []float32 // [dim] normalized activation
Q []float32 // [qDim] query projection
K []float32 // [kvDim] key projection
V []float32 // [kvDim] value projection
AttnOut []float32 // [qDim] attention output
AttnProj []float32 // [dim] output projection
FFNIn []float32 // [dim] FFN input (after residual)
FFNNorm []float32 // [dim] FFN normalized
Gate []float32 // [ffnDim] gate projection
Up []float32 // [ffnDim] up projection
Hidden []float32 // [ffnDim] gated hidden
FFNOut []float32 // [dim] FFN output
Logits []float32 // [vocabSize] output logits
Scores []float32 // [maxSeqLen] attention scores scratch (legacy)
HeadScores [][]float32 // [numHeads][maxSeqLen] per-head score buffers for parallel attention
// Qwen3.5 gated attention: Wq outputs interleaved [Q,gate] per head
QFull []float32 // [2*qDim] fused Q+gate output (nil for non-gated models)
QGate []float32 // [qDim] attention gate values (nil for non-gated models)
// SSM (Gated Delta Net) scratch buffers — nil for pure transformer models
SSMRun *SSMRunState
SSMState *memory.SSMStateCache
// MoE (Mixture of Experts) scratch buffers — nil for dense models
MoELogits []float32 // [expertCount] router logits
MoEGates [][]float32 // [nUsed][expertFFNDim] per-expert gate (parallel)
MoEUps [][]float32 // [nUsed][expertFFNDim] per-expert up (parallel)
MoEHiddens [][]float32 // [nUsed][expertFFNDim] per-expert hidden (parallel)
MoEExpertOuts [][]float32 // [nUsed][dim] per-expert output (parallel)
MoEShGate []float32 // [sharedFFNDim] shared expert gate
MoEShUp []float32 // [sharedFFNDim] shared expert up
MoEShHidden []float32 // [sharedFFNDim] shared expert hidden
MoEShOut []float32 // [dim] shared expert output
// MLA (Multi-head Latent Attention) scratch buffers
MLAQComp []float32 // [qLORARank] compressed Q intermediate
MLAQAbsorbed []float32 // [numHeads * kvLORARank] absorbed key vectors per head
MLAAttnKV []float32 // [numHeads * kvLORARank] weighted KV sum per head
// Worker pool for parallel matmul
Pool *blas.Pool
// contains filtered or unexported fields
}
RunState holds pre-allocated buffers for inference, avoiding per-token allocations.
func NewRunState ¶
func NewRunState(cfg ModelConfig, maxSeqLen int) *RunState
NewRunState allocates all buffers for a model.
func (*RunState) ApplyRoPEFast ¶
func (rs *RunState) ApplyRoPEFast(vec []float32, pos int)
ApplyRoPEFast applies precomputed RoPE to vec in-place. vec must be one head's worth [headDim]; pos is the token position. Only the first ropeDim dimensions are rotated; the rest pass through unchanged.
func (*RunState) ApplyRoPEFastDim ¶
func (rs *RunState) ApplyRoPEFastDim(vec []float32, pos, ropeDim int)
ApplyRoPEFastDim applies RoPE to a vector of exactly ropeDim elements (e.g. the rope portion of MLA Q/K heads).
func (*RunState) PrecomputeRoPE ¶
func (rs *RunState) PrecomputeRoPE(maxSeqLen, ropeDim, headDim int, freqBase float32)
PrecomputeRoPE fills RunState with precomputed cos/sin tables for RoPE. maxSeqLen is the maximum sequence length, ropeDim and headDim are the rotation and head dimensions, and freqBase is the RoPE frequency base. ropeDim may be less than headDim for partial RoPE (e.g. Phi-2: 32 of 80).
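Putting the RoPE helpers together (the head layout within rs.Q is an assumption):

    rs.PrecomputeRoPE(4096, 128, 128, 10000.0) // full-dim RoPE, theta = 10000
    rs.SetRopeNeox(true)                       // NeoX split-half pairing
    head := rs.Q[:128]                         // first head's query slice (layout assumed)
    rs.ApplyRoPEFast(head, pos)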
func (*RunState) PrecomputeSWARoPE ¶
func (rs *RunState) PrecomputeSWARoPE(maxSeqLen, ropeDim, headDim int, freqBase float32)
PrecomputeSWARoPE builds separate cos/sin tables for SWA layers with a different frequency base (e.g. gpt-oss ISWA uses theta=10000 for SWA, theta=150000 for full attention).
func (*RunState) PrecomputeYaRNRoPE ¶
func (rs *RunState) PrecomputeYaRNRoPE(maxSeqLen, ropeDim, headDim int, freqBase, factor float32, origMaxPos int, betaFast, betaSlow, extFactor, attnFactor float32)
PrecomputeYaRNRoPE precomputes RoPE tables with YaRN extended-context scaling.
func (*RunState) RoPETables ¶
func (rs *RunState) RoPETables() (cosTable, sinTable []float32)
RoPETables returns the precomputed cos/sin tables for GPU upload.
func (*RunState) RoPETablesSWA ¶
func (rs *RunState) RoPETablesSWA() (cosTable, sinTable []float32)
RoPETablesSWA returns the SWA-specific RoPE tables (nil if not set).
func (*RunState) SetRopeNeox ¶
func (rs *RunState) SetRopeNeox(neox bool)
SetRopeNeox sets whether to use NeoX-style (split-half) RoPE pairing. Call after PrecomputeRoPE if needed.
type SSMRunState ¶
type SSMRunState struct {
QKV []float32 // [qkvDim] in-projection output (goes through conv)
Z []float32 // [valueDim] gate projection output
Alpha []float32 // [numHeads] raw alpha (decay param)
Beta []float32 // [numHeads] raw beta (learning rate)
FusedBA []float32 // [2*numHeads] fused beta+alpha output (interleaved per KV group)
Y []float32 // [valueDim] attention/SSM output
}
SSMRunState holds pre-allocated scratch buffers for SSM (Gated Delta Net) layers.
type Tokenizer ¶
type Tokenizer struct {
// Backward-compatible fields
Tokens []string
TokenToID map[string]int32
BOS int32
EOS int32
AddBOS bool
PreBOS int32 // token to prepend before BOS (e.g., [gMASK]); -1 = unused
// BPE/SPM-specific fields
MergeRanks map[[2]string]int // GPT-2: merge pair -> priority (lower = merge first)
Scores []float32 // SentencePiece: token scores for merge ordering
ModelType string // "llama" or "gpt2"
SpecialTokens map[string]int32 // special/control token text -> id
// contains filtered or unexported fields
}
Tokenizer encodes text to token IDs and decodes token IDs back to text. Supports both SentencePiece (LLaMA) and GPT-2 BPE (Qwen) tokenization.
func NewTokenizerFromGGUF ¶
func NewTokenizerFromGGUF(md map[string]interface{}, cfg ModelConfig) (*Tokenizer, error)
NewTokenizerFromGGUF extracts vocabulary and tokenizer config from GGUF metadata. Auto-detects tokenizer type based on tokenizer.ggml.model: "llama" -> SentencePiece, "gpt2" -> GPT-2 BPE. Falls back to heuristic if model key is missing.
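A loading sketch; how the GGUF metadata map md is obtained is outside this package:

    cfg, err := llm.ParseConfig(md)
    if err != nil {
        log.Fatal(err)
    }
    tok, err := llm.NewTokenizerFromGGUF(md, cfg)
    if err != nil {
        log.Fatal(err)
    }
    fmt.Println(tok.DecodeToken(cfg.BOS)) // BOS text varies by model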
func (*Tokenizer) DecodeToken ¶
DecodeToken converts a single token ID to its string representation. Special tokens like thinking markers are preserved as their raw text.
type WorkerPool ¶
type WorkerPool struct {
// contains filtered or unexported fields
}
WorkerPool manages persistent goroutines for parallel inference. Each worker has its own task channel; work is distributed round-robin.
func NewWorkerPool ¶
func NewWorkerPool(n int) *WorkerPool
NewWorkerPool creates a pool with n persistent workers. Callers must call Shutdown when done.
func (*WorkerPool) Dispatch ¶
func (wp *WorkerPool) Dispatch(total, numActive int, work func(workerID, start, end int))
Dispatch distributes work across numActive workers and blocks until complete. total is the total number of items; work(workerID, start, end) processes [start, end).
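A self-contained sketch using the documented pool API (imports: runtime):

    pool := llm.NewWorkerPool(runtime.NumCPU())
    defer pool.Shutdown()

    in := make([]float32, 1<<16)
    out := make([]float32, len(in))
    pool.Dispatch(len(in), runtime.NumCPU(), func(workerID, start, end int) {
        for i := start; i < end; i++ { // each worker owns its [start, end) slice
            out[i] = in[i] * 2
        }
    })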
func (*WorkerPool) DispatchAsync ¶
func (wp *WorkerPool) DispatchAsync(total, numActive int, work func(workerID, start, end int))
DispatchAsync distributes work without waiting. Call Wait() to block until all async work completes.
func (*WorkerPool) Shutdown ¶
func (wp *WorkerPool) Shutdown()
Shutdown stops all workers and waits for them to exit.
func (*WorkerPool) Wait ¶
func (wp *WorkerPool) Wait()
Wait blocks until all work dispatched via DispatchAsync has completed.