llm

package
v0.0.0-...-6093f96
Published: Apr 24, 2026 License: Apache-2.0 Imports: 22 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

var CPUDiag cpuDiagState

CPUDiag provides structured CPU forward-pass diagnostic logging, mirroring the GPU diagnostic system. Enable with DLGO_CPU_DIAG=1 or with the same layer/position filter syntax as DLGO_GPU_DIAG.

Enable: DLGO_CPU_DIAG=1 (all layers, all positions)

DLGO_CPU_DIAG=L0          (layer 0 only)
DLGO_CPU_DIAG=L0:P0       (layer 0, position 0 only)
DLGO_CPU_DIAG=L0-L5       (layers 0 through 5)

Verbosity: DLGO_CPU_DIAG_V=1 (default: print first 8 elements)

DLGO_CPU_DIAG_V=2  (print first 16 elements + norms)
DLGO_CPU_DIAG_V=3  (print first 32 elements + full stats)
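
The filters compose with the verbosity level. From Go, they could be set before inference starts; a minimal sketch that assumes the variables are read lazily, when the first instrumented pass runs, rather than at package init:

	os.Setenv("DLGO_CPU_DIAG", "L0-L5") // layers 0 through 5
	os.Setenv("DLGO_CPU_DIAG_V", "2")   // first 16 elements + norms
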
var DebugForward bool

DebugForward enables per-layer norm prints for the current forward pass.

var DiagLayer, DiagPos int

DiagLayer and DiagPos track the current layer/position for functions that don't have direct access to them (e.g. MoE FFN dispatch). Only set when CPUDiag is enabled.

Functions

func ApplySuppressTokens

func ApplySuppressTokens(logits []float32, suppress []int32)

ApplySuppressTokens sets logits to -inf for suppressed EOG tokens.

func CheckMemoryBudget

func CheckMemoryBudget(modelPath string, cfg ModelConfig, requestedSeqLen int) (int, error)

CheckMemoryBudget checks whether loading the given model at the requested context length will fit in available RAM. Returns an adjusted (possibly reduced) maxSeqLen and an error only if even the minimum context won't fit.

Model weights are memory-mapped and demand-paged by the OS — they do NOT consume heap RAM. Only runtime buffers (KV cache, RunState, BatchState) need actual RAM. The budget is: 85% of total physical RAM minus current usage. This ensures any model can load regardless of size; throughput degrades gracefully via mmap paging but the system never crashes.
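
A loader might clamp the requested context before allocating buffers; a minimal sketch (m is a *Model from LoadModel; the path and length are placeholders):

	requested := 32768
	adjusted, err := CheckMemoryBudget("model.gguf", m.Config, requested)
	if err != nil {
		log.Fatalf("even the minimum context does not fit: %v", err)
	}
	if adjusted < requested {
		log.Printf("context reduced from %d to %d to stay within the RAM budget", requested, adjusted)
	}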

func EstimateLayerBytes

func EstimateLayerBytes(l *Layer) int64

EstimateLayerBytes returns the approximate size in bytes of a layer's weight data.

func EstimateRuntimeBytes

func EstimateRuntimeBytes(cfg ModelConfig, seqLen int) int64

EstimateRuntimeBytes estimates heap bytes needed for KV cache, RunState, and BatchState at a given context length, WITHOUT counting the model weights (which are memory-mapped and paged in by the OS on demand).

func FormatChat

func FormatChat(cfg ModelConfig, system, user string) string

FormatChat formats a single-turn chat prompt (system + user) for the model.

func FormatMessages

func FormatMessages(cfg ModelConfig, messages []Message, opts ...FormatOptions) string

FormatMessages formats a multi-turn conversation for the model.
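
For example, formatting a short conversation with thinking disabled (cfg is the loaded model's Config):

	noThink := false
	prompt := FormatMessages(cfg, []Message{
		{Role: "system", Content: "You are terse."},
		{Role: "user", Content: "Summarize RoPE in one sentence."},
	}, FormatOptions{EnableThinking: &noThink})
	fmt.Println(prompt)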

func Forward

func Forward(m *Model, token int32, pos int, kv *memory.MultiLayerKVCache, rs *RunState) []float32

Forward performs a single-token forward pass through the model.

func ForwardAttention

func ForwardAttention(
	layer *Layer, rs *RunState, kv *memory.MultiLayerKVCache,
	l, pos, numHeads, numKVHeads, headDim, kvMul int,
	cfg ModelConfig, pool *blas.Pool,
)

ForwardAttention runs one full-attention layer. Writes the result to rs.AttnProj.

func ForwardBatch

func ForwardBatch(m *Model, tokens []int32, startPos int, kv *memory.MultiLayerKVCache, rs *RunState, bs *BatchState) []float32

ForwardBatch processes multiple tokens in a single pass (prefill). Returns logits for the last position only. Fills the KV cache for all positions. If the number of tokens exceeds bs.maxPos, the prompt is processed in chunks of bs.maxPos, correctly maintaining KV cache and SSM state across chunks.
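
A typical prefill-then-decode loop, sketched (pickToken stands in for whatever sampling the caller does):

	logits := ForwardBatch(m, prompt, 0, kv, rs, bs) // fills KV cache for all prompt positions
	next := pickToken(logits)
	for pos := len(prompt); pos < len(prompt)+maxNew; pos++ {
		logits = Forward(m, next, pos, kv, rs) // single-token decode step
		next = pickToken(logits)
	}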

func ForwardFromLayer

func ForwardFromLayer(m *Model, startLayer, pos int, kv *memory.MultiLayerKVCache, rs *RunState) []float32

ForwardFromLayer resumes forward pass from a given layer. rs.X must already contain the hidden state from the previous layer.

func ForwardMLA

func ForwardMLA(
	layer *Layer, rs *RunState, kv *memory.MultiLayerKVCache,
	l, pos int, cfg ModelConfig, pool *blas.Pool,
)

ForwardMLA runs one Multi-head Latent Attention layer (DeepSeek-V2/GLM-4). Uses absorbed key/value approach to avoid expanding K for every cached position.

func ForwardMoEFFN

func ForwardMoEFFN(layer *Layer, rs *RunState, input []float32, cfg ModelConfig, pool *blas.Pool)

ForwardMoEFFN runs the Mixture-of-Experts FFN. Result written to rs.FFNOut.

func ForwardMoEFFNDispatch

func ForwardMoEFFNDispatch(layer *Layer, rs *RunState, input []float32, cfg ModelConfig, pool *blas.Pool)

ForwardMoEFFNDispatch routes to the correct MoE implementation based on FFN type.

func ForwardMoEFFN_OAI

func ForwardMoEFFN_OAI(layer *Layer, rs *RunState, input []float32, cfg ModelConfig, pool *blas.Pool)

ForwardMoEFFN_OAI runs the gpt-oss MoE with SOFTMAX_WEIGHT gating and SwiGLU_OAI activation.

func ForwardRange

func ForwardRange(m *Model, token int32, pos, startLayer, endLayer int, kv *memory.MultiLayerKVCache, rs *RunState) []float32

ForwardRange performs a forward pass through layers [startLayer, endLayer). If startLayer == 0, token embedding is done. If endLayer == numLayers, the final norm and output projection are included.
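
Chaining two ranges is equivalent to one full pass; as with ForwardFromLayer, the hidden state is carried between calls in rs.X:

	half := m.Config.NumLayers / 2
	ForwardRange(m, tok, pos, 0, half, kv, rs)                         // embedding + layers [0, half)
	logits := ForwardRange(m, tok, pos, half, m.Config.NumLayers, kv, rs) // layers [half, N) + norm + output
	_ = logits // identical to Forward(m, tok, pos, kv, rs)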

func ForwardSSMLayer

func ForwardSSMLayer(
	layer *Layer,
	rs *RunState,
	ssm *SSMRunState,
	ssmState *memory.SSMLayerState,
	xnorm []float32,
	cfg ModelConfig,
	pool *blas.Pool,
) []float32

ForwardSSMLayer runs one Gated Delta Net layer for single-token autoregressive inference.

Implements the recurrent delta rule with error correction:

S[h] = exp(g[h]) * S[h]                         // decay
v_pred = S^T @ k                                 // predict value from key
delta  = v - v_pred                              // error signal
S[h]  += sigmoid(beta[h]) * outer(k, delta)      // error-corrected update
out[h] = S^T @ (q / sqrt(headKDim))              // scaled output
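
In scalar Go, one head's step looks like the sketch below (S is the headK×headV state flattened row-major; g, beta, k, v, q, and out are this head's parameters and slices; sigmoid is a placeholder helper; the real implementation vectorizes this):

	decay := float32(math.Exp(float64(g))) // g = log-decay for this head
	for i := range S {
		S[i] *= decay
	}
	b := sigmoid(beta)
	for j := 0; j < headV; j++ {
		var pred float32
		for i := 0; i < headK; i++ {
			pred += S[i*headV+j] * k[i] // v_pred = S^T @ k
		}
		d := v[j] - pred // error signal
		for i := 0; i < headK; i++ {
			S[i*headV+j] += b * k[i] * d // error-corrected outer-product update
		}
	}
	scale := 1 / float32(math.Sqrt(float64(headK)))
	for j := 0; j < headV; j++ {
		var acc float32
		for i := 0; i < headK; i++ {
			acc += S[i*headV+j] * q[i]
		}
		out[j] = acc * scale // out = S^T @ (q / sqrt(headKDim))
	}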

func PinLayerToRAM

func PinLayerToRAM(l *Layer)

PinLayerToRAM copies a layer's weight data from the mmap'd region into heap-allocated slices, ensuring the layer stays in physical RAM and avoids page faults during inference.
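
For example, pinning the first few layers trades heap for latency (the count here is illustrative):

	for i := 0; i < 4 && i < len(m.Layers); i++ {
		PinLayerToRAM(&m.Layers[i]) // copy weights out of the mmap region into heap
	}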

func RegisterArchitecture

func RegisterArchitecture(name string, desc ArchDescriptor)

RegisterArchitecture registers or overwrites an architecture descriptor. Use it to add support for new model families.

func TrimStopText

func TrimStopText(text string, cfg ModelConfig) string

TrimStopText removes trailing stop strings and whitespace from generated text.

Types

type ArchDescriptor

type ArchDescriptor struct {
	RopeNeox         bool   // true = NeoX-style RoPE, false = interleaved
	FFNGelu          bool   // true = GeGLU (Gemma), false = SwiGLU (LLaMA/Qwen)
	EmbedScaleMode   string // "none" or "sqrt_dim"
	ChatTemplate     string // "chatml", "llama2", "llama3", "gemma", "phi"
	SupportsThinking bool   // true = model uses <think> blocks (Qwen3/3.5)
}

ArchDescriptor describes architecture-specific behavior for an LLM.

func GetArchDescriptor

func GetArchDescriptor(arch string) ArchDescriptor

GetArchDescriptor returns the descriptor for the given architecture. Unknown architectures receive a default descriptor (interleaved RoPE, SwiGLU, no embed scale, chatml).
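
Registering a new family overrides these defaults; for example (the name is illustrative):

	RegisterArchitecture("myarch", ArchDescriptor{
		RopeNeox:       true,
		EmbedScaleMode: "none",
		ChatTemplate:   "chatml",
	})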

type BatchState

type BatchState struct {
	XBatch     []float32 // [maxPos * dim]
	XNormBatch []float32 // [maxPos * dim]
	QBatch     []float32 // [maxPos * qDim]
	KBatch     []float32 // [maxPos * kvDim]
	VBatch     []float32 // [maxPos * kvDim]
	AttnBatch  []float32 // [maxPos * qDim]
	ProjBatch  []float32 // [maxPos * dim]
	FFNInBatch []float32 // [maxPos * dim]
	NormBatch  []float32 // [maxPos * dim]
	GateBatch  []float32 // [maxPos * ffnDim]
	UpBatch    []float32 // [maxPos * ffnDim]
	HidBatch   []float32 // [maxPos * ffnDim]
	FFNBatch   []float32 // [maxPos * dim]

	Q8Buf []byte // pre-allocated Q8 quantization buffer

	// GatedQ attention: Wq outputs 2*qDim (interleaved Q + gate per head)
	QFullBatch []float32 // [maxPos * 2*qDim]
	QGateBatch []float32 // [maxPos * qDim]

	// SSM batch buffers: matmuls batched across positions, recurrence per-position
	SSMQKVBatch   []float32 // [maxPos * qkvDim]
	SSMZBatch     []float32 // [maxPos * valueDim]
	SSMAlphaBatch []float32 // [maxPos * numHeads]
	SSMBetaBatch  []float32 // [maxPos * numHeads]
	SSMYBatch     []float32 // [maxPos * valueDim]

	// MoE batch buffers
	MoERouterBatch []float32 // [maxPos * expertCount] router logits
	MoEExpDim      int       // expert FFN hidden dim
	MoEShDim       int       // shared expert FFN hidden dim

	// Score buffers sized to maxSeqLen (full context) — each worker needs one
	// score vector covering ALL positions for correct causal attention.
	ScoreBufs [][]float32 // [numWorkers][maxSeqLen]

	// KGather/VGather for SIMD batched attention.
	// Layout: [numKVHeads * maxSeqLen * headDim]
	// Nil when maxSeqLen is too large (SIMD disabled, non-SIMD path used instead).
	KGather []float32
	VGather []float32
	// contains filtered or unexported fields
}

BatchState holds pre-allocated buffers for batch (prefill) forward passes.

func NewBatchState

func NewBatchState(cfg ModelConfig, maxPos int, maxSeqLenHint ...int) *BatchState

NewBatchState allocates batch buffers for up to maxPos positions. maxSeqLenHint (optional) specifies the total context length for score/gather buffer sizing; defaults to maxPos when not provided.
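
For instance, to prefill in 512-token chunks while keeping score/gather buffers sized for the full context (values are illustrative):

	bs := NewBatchState(m.Config, 512, 8192) // 512-token chunks; scores/gather sized for an 8192 context
	_ = ForwardBatch(m, prompt, 0, kv, rs, bs)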

type CoreKind

type CoreKind uint8

CoreKind selects the per-layer compute core.

const (
	CoreAttention CoreKind = iota // Standard grouped-query attention
	CoreSSM                       // Gated Delta Network (Qwen3.5 linear attention)
	CoreMLA                       // Multi-head Latent Attention (DeepSeek-V2/GLM-4)
)

type FFNKind

type FFNKind uint8

FFNKind selects the feed-forward network variant.

const (
	FFNSwiGLU    FFNKind = iota // gate·SiLU ⊙ up → down (LLaMA, Qwen)
	FFNGeGLU                    // gate·GELU ⊙ up → down (Gemma)
	FFNPlain                    // up → GELU → down (Phi-2)
	FFNMoE                      // Mixture of Experts: route to top-K experts, each SwiGLU
	FFNMoESwiOAI                // MoE with SwiGLU_OAI activation (gpt-oss)
)

type FormatOptions

type FormatOptions struct {
	ReasoningEffort string // "low", "medium", "high" (default: "medium")
	EnableThinking  *bool  // nil = auto (enabled for thinking models), false = disable
}

FormatOptions controls template-level formatting (e.g. reasoning effort).

type GenerateConfig

type GenerateConfig struct {
	MaxTokens     int
	Sampler       ops.SamplerConfig
	Seed          int64
	Stream        func(token string) // called for each generated token (nil = no streaming)
	Grammar       *grammar.Grammar   // optional grammar constraint (nil = unconstrained)
	ThinkingPhase bool               // suppress EOS during <think> block (set automatically by Chat/GenerateText)
}

GenerateConfig controls text generation behavior.

func DefaultGenerateConfig

func DefaultGenerateConfig() GenerateConfig

DefaultGenerateConfig returns sensible defaults.
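
A streaming setup might look like this (p is a *Pipeline; the prompt is a placeholder):

	gc := DefaultGenerateConfig()
	gc.MaxTokens = 256
	gc.Stream = func(tok string) { fmt.Print(tok) } // tokens arrive as they are sampled
	if _, tps, err := p.GenerateText("Why is the sky blue?", gc); err == nil {
		fmt.Printf("\n%.1f tok/s\n", tps)
	}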

type GenerateResult

type GenerateResult struct {
	Text           string
	Tokens         []int32
	TokensPerSec   float64
	PrefillTimeMs  float64
	GenerateTimeMs float64
	TotalTokens    int
	PromptTokens   int
}

GenerateResult holds detailed output from a generation run.

type Layer

type Layer struct {
	Spec LayerSpec // architectural choices, resolved at load time

	// Attention
	AttnNorm     []float32             // [dim] norm weight
	AttnNormBias []float32             // [dim] optional LayerNorm bias (Phi-2)
	Wq           *core.QuantizedTensor // [qDim × dim]
	Wk           *core.QuantizedTensor // [kvDim × dim]
	Wv           *core.QuantizedTensor // [kvDim × dim]
	Wo           *core.QuantizedTensor // [dim × qDim]
	Bq           []float32             // [qDim] optional (Qwen)
	Bk           []float32             // [kvDim] optional
	Bv           []float32             // [kvDim] optional
	Bo           []float32             // [dim] optional attn_output bias (Phi-2)
	AttnQNorm    []float32             // [headDim] optional QK norm (Qwen3/Gemma3)
	AttnKNorm    []float32             // [headDim] optional QK norm (Qwen3/Gemma3)
	AttnGate     *core.QuantizedTensor // [dim × dim] optional gated attention (Qwen3.5)

	AttnSinks []float32 // [numKVHeads] attention sink weights (gpt-oss)

	PostAttnNorm []float32             // [dim] optional post-attention norm (Gemma 3)
	FFNNorm      []float32             // [dim] norm weight (nil = parallel attn+FFN)
	FFNGate      *core.QuantizedTensor // [ffnDim × dim] w1 (nil = plain MLP)
	FFNUp        *core.QuantizedTensor // [ffnDim × dim] w3
	FFNDown      *core.QuantizedTensor // [dim × ffnDim] w2
	FFNUpBias    []float32             // [ffnDim] optional (Phi-2)
	FFNDownBias  []float32             // [dim] optional (Phi-2)
	PostFFNNorm  []float32             // [dim] optional post-FFN norm (Gemma 3)

	// MoE (Mixture of Experts) — packed expert weights
	FFNRouter       *core.QuantizedTensor // [expertCount × dim] router/gating network
	FFNGateExps     *core.QuantizedTensor // [expertCount*expertFFNDim × dim] packed expert gate
	FFNUpExps       *core.QuantizedTensor // [expertCount*expertFFNDim × dim] packed expert up
	FFNGateUpExps   *core.QuantizedTensor // [expertCount*2*expertFFNDim × dim] fused gate+up (interleaved per expert)
	FFNDownExps     *core.QuantizedTensor // [expertCount*dim × expertFFNDim] packed expert down
	FFNGateExpsBias []float32             // [expertCount*expertFFNDim] packed expert gate bias
	FFNUpExpsBias   []float32             // [expertCount*expertFFNDim] packed expert up bias
	FFNDownExpsBias []float32             // [expertCount*dim] packed expert down bias
	FFNGateShared   *core.QuantizedTensor // [sharedFFNDim × dim] shared expert gate
	FFNUpShared     *core.QuantizedTensor // [sharedFFNDim × dim] shared expert up
	FFNDownShared   *core.QuantizedTensor // [dim × sharedFFNDim] shared expert down
	FFNRouterShared []float32             // [dim] shared expert gate weight (sigmoid of dot product)
	FFNRouterBias   []float32             // [expertCount] router logit bias (DeepSeek-V2/GLM-4)

	// MLA (Multi-head Latent Attention) — DeepSeek-V2/GLM-4
	WqA      *core.QuantizedTensor // [qLORARank × dim] Q down-projection
	WqANorm  []float32             // [qLORARank] norm between Q projections
	WqB      *core.QuantizedTensor // [numHeads*(qkNope+qkRope) × qLORARank] Q up-projection
	WkvA     *core.QuantizedTensor // [kvLORARank+qkRope × dim] KV down-projection (includes rope keys)
	WkvANorm []float32             // [kvLORARank] norm for KV compressed
	WkB      *core.QuantizedTensor // [numHeads*qkNope × kvLORARank] K up-projection (3D packed per head)
	WvB      *core.QuantizedTensor // [numHeads*vHeadDim × kvLORARank] V up-projection (3D packed per head)

	// Gated Delta Net weights — only for Qwen3.5 linear attention layers
	SSMInProj  *core.QuantizedTensor // [dim × qkvDim] fused QKV in-projection (stored via attn_qkv when not standard attention)
	SSMConv1dW []float32             // [channels × convKernel] depthwise conv weights (flat)
	SSMA       []float32             // [numHeads] log(-A) decay parameter
	SSMAlpha   *core.QuantizedTensor // [dim × numHeads] dt/alpha projection
	SSMBeta    *core.QuantizedTensor // [dim × numHeads] beta/learning-rate projection
	SSMFusedBA *core.QuantizedTensor // [dim × 2*numHeads] fused beta+alpha (interleaved per KV group)
	SSMDtBias  []float32             // [numHeads] dt bias
	SSMNorm    []float32             // [headVDim] per-head RMSNorm weight (shared across heads)
	SSMOut     *core.QuantizedTensor // [dim × valueDim] output projection
}

Layer holds the weights for one transformer block.

type LayerSpec

type LayerSpec struct {
	Norm          NormKind
	Core          CoreKind
	Residual      ResKind
	FFN           FFNKind
	GatedQ        bool // Fused Q+gate projection (Qwen3.5 attention layers)
	QKNorm        bool // Per-head QK normalization (Gemma 3, Qwen3)
	SlidingWindow int  // >0 = use sliding window attention with this size
}

LayerSpec captures all architectural choices for one transformer layer. Resolved once at load time from tensor presence; the forward pass dispatches on these fields via switch statements that compile to jump tables.

type Message

type Message struct {
	Role    string
	Content string
}

Message represents a single chat message with role and content.

type Model

type Model struct {
	Config         ModelConfig
	TokenEmbed     *core.QuantizedTensor // [vocabSize × dim]
	OutputNorm     []float32             // [dim]
	OutputNormBias []float32             // [dim] optional LayerNorm bias (Phi-2)
	Output         *core.QuantizedTensor // [vocabSize × dim] (may tie with TokenEmbed)
	OutputBias     []float32             // [vocabSize] optional
	Layers         []Layer

	MmapFile *mmap.MappedFile // underlying mmap'd GGUF file (nil if loaded via ReadAt)
}

Model holds all weights for a decoder-only transformer LLM.

func LoadModel

func LoadModel(path string) (*Model, error)

LoadModel opens a GGUF file, parses config from metadata, and loads all tensors.

func (*Model) Close

func (m *Model) Close()

Close releases the memory-mapped file backing the model weights. Must be called when the model is no longer needed.

type ModelConfig

type ModelConfig struct {
	Architecture         string
	VocabSize            int
	ContextLength        int
	EmbeddingDim         int
	NumLayers            int
	FFNDim               int
	NumHeads             int
	NumKVHeads           int
	HeadDim              int
	RMSNormEps           float32
	RopeFreqBase         float32
	RopeNeox             bool
	RopeDim              int     // partial RoPE: 0 = full headDim, else only first RopeDim dims
	RopeScaleType        int     // 0=none, 1=linear, 2=yarn
	RopeScaleFactor      float32 // scaling factor for extended context
	RopeOrigMaxPos       int     // original max position embeddings (for YaRN)
	RopeYaRNBetaFast     float32 // YaRN beta_fast (default 32)
	RopeYaRNBetaSlow     float32 // YaRN beta_slow (default 1)
	RopeYaRNExtFactor    float32 // YaRN ext_factor: 0=disable ramp, 1=full ramp (default 1)
	RopeYaRNAttnFactor   float32 // YaRN attn_factor: magnitude scaling base (default 1)
	RopeFreqBaseSWA      float32 // Separate RoPE frequency base for SWA layers (0 = same as RopeFreqBase)
	SlidingWindow        int     // sliding window attention size (0 = disabled)
	SlidingWindowPattern int     // 0=all layers, N=alternating (every Nth layer is full)
	BOS                  int32
	EOS                  int32
	StopTokens           []int32
	SuppressTokens       []int32 // logit = -inf before sampling (matching llama.cpp's EOG bias)
	AddBOS               bool
	FFNGelu              bool    // true = GeGLU (Gemma), false = SwiGLU (LLaMA/Qwen)
	EmbedScale           float32 // non-zero = scale embeddings (Gemma: sqrt(dim))
	ChatTemplate         string  // chat format: "chatml", "llama2", "llama3", "gemma", "phi"

	// Gemma 2 soft-capping
	AttnLogitSoftcap  float32 // 0=disabled; >0 = tanh(logit/cap)*cap before softmax
	FinalLogitSoftcap float32 // 0=disabled; >0 = tanh(logit/cap)*cap on final logits

	// Qwen3.5 hybrid Mamba/Attention
	FullAttentionInterval int // 0 = all attention; N = every Nth layer is attention
	SSMConvKernel         int
	SSMInnerSize          int
	SSMStateSize          int
	SSMTimeStepRank       int
	SSMGroupCount         int
	SSMTiledVOrder        bool // true = GGUF V heads in tiled order (Qwen3.5); false = grouped (Qwen3Next)

	// MoE (Mixture of Experts)
	ExpertCount        int     // 0 = dense (no MoE); >0 = number of experts per layer
	ExpertUsedCount    int     // top-K experts selected per token
	ExpertFFNDim       int     // hidden dim per expert
	SharedExpertFFNDim int     // hidden dim for shared expert (0 = no shared expert)
	ExpertGatingFunc   int     // 0=none, 1=softmax, 2=sigmoid, 3=softmax_weight (top-k raw then softmax)
	ExpertWeightsNorm  bool    // normalize selected expert weights by sum
	ExpertWeightsScale float32 // scale factor for expert weights (0 = no scaling)

	// MLA (Multi-head Latent Attention) — DeepSeek-V2/GLM-4
	QLORARank         int // Q compression rank (attn_q_a output dim)
	KVLORARank        int // KV compression rank (compressed KV without rope)
	QKNopeDim         int // Non-positional K dim per head (k_b output per head)
	QKRopeDim         int // Positional (RoPE) K dim per head
	VHeadDim          int // V dim per head (v_b output per head)
	LeadingDenseCount int // Number of initial dense layers before MoE
}

ModelConfig holds all architecture parameters for a decoder-only transformer LLM. Auto-populated from GGUF metadata.

func ParseConfig

func ParseConfig(md map[string]interface{}) (ModelConfig, error)

ParseConfig extracts a ModelConfig from GGUF metadata.

type NormKind

type NormKind uint8

NormKind selects the pre-layer normalization variant.

const (
	NormRMS   NormKind = iota // RMSNorm (LLaMA, Qwen, Gemma, SmolLM2, Qwen3.5)
	NormLayer                 // LayerNorm with bias (Phi-2)
)

type Pipeline

type Pipeline struct {
	Model      *Model
	Tokenizer  *Tokenizer
	KVCache    *memory.MultiLayerKVCache
	RunState   *RunState
	BatchState *BatchState
	MaxSeqLen  int
}

Pipeline bundles a loaded model, tokenizer, KV cache, and run state for inference.

func NewPipeline

func NewPipeline(modelPath string, maxSeqLen int) (*Pipeline, error)

NewPipeline loads a GGUF model and creates a ready-to-use inference pipeline with automatic tokenizer extraction from GGUF metadata.

func (*Pipeline) Chat

func (p *Pipeline) Chat(system, user string, cfg GenerateConfig) (string, float64, error)

Chat formats a user message (with optional system prompt) using the model's chat template, then generates a response. Returns generated text and tok/s.
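
End to end, with the model path as a placeholder:

	p, err := NewPipeline("model.gguf", 8192)
	if err != nil {
		log.Fatal(err)
	}
	defer p.Model.Close()
	reply, tps, err := p.Chat("You are concise.", "What does a KV cache store?", DefaultGenerateConfig())
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("%s\n(%.1f tok/s)\n", reply, tps)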

func (*Pipeline) ChatMessages

func (p *Pipeline) ChatMessages(messages []Message, cfg GenerateConfig) (string, float64, error)

ChatMessages formats a multi-turn conversation and generates the assistant's next response. Returns generated text and tok/s.

func (*Pipeline) FreeForGPU

func (p *Pipeline) FreeForGPU()

FreeForGPU releases CPU-side KV cache, RunState, and BatchState that are unused when a GPU pipeline handles all inference. Only Model, Tokenizer, and MaxSeqLen are retained (needed by the scheduler for tokenization and config lookups). Call this after the GPU pipeline is successfully created.

func (*Pipeline) Generate

func (p *Pipeline) Generate(prompt []int32, cfg GenerateConfig) ([]int32, error)

Generate produces text from a prompt using the loaded model.

func (*Pipeline) GenerateDetailed

func (p *Pipeline) GenerateDetailed(prompt string, cfg GenerateConfig) (*GenerateResult, error)

GenerateDetailed is like GenerateText but returns detailed timing information.

func (*Pipeline) GenerateText

func (p *Pipeline) GenerateText(prompt string, cfg GenerateConfig) (string, float64, error)

GenerateText is a convenience method that takes a text prompt, encodes it, generates tokens, and decodes the result. Returns the generated text and token/second throughput.

func (*Pipeline) GenerateTextWithStopStrings

func (p *Pipeline) GenerateTextWithStopStrings(prompt string, cfg GenerateConfig) (string, float64, error)

GenerateTextWithStopStrings is like GenerateText but also handles text-level stop string detection for multi-token stop sequences.

func (*Pipeline) RebuildBuffers

func (p *Pipeline) RebuildBuffers()

RebuildBuffers re-creates KV cache, RunState, and BatchState using the current MaxSeqLen. Used to restore CPU-side buffers after FreeForGPU when GPU pipeline creation fails and we fall back to CPU inference.
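
The intended pairing with FreeForGPU, sketched (initGPU is a hypothetical stand-in for the caller's GPU pipeline setup):

	p.FreeForGPU()                            // assume the GPU path will own inference
	if err := initGPU(p.Model); err != nil {  // initGPU is hypothetical
		p.RebuildBuffers()                    // GPU setup failed: restore CPU buffers
	}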

type ResKind

type ResKind uint8

ResKind selects the residual connection + FFN-norm wiring.

const (
	ResStandard    ResKind = iota // Separate FFNNorm; optional PostAttnNorm/PostFFNNorm (LLaMA, Gemma, Qwen)
	ResPostAttnFFN                // PostAttnNorm doubles as FFN norm (Qwen3.5)
	ResParallel                   // Parallel attn+FFN on same pre-norm; X += attn + FFN (Phi-2)
)
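
Schematically, with x the residual stream and optional Gemma-style post-norms elided:

	ResStandard:    x += Attn(AttnNorm(x)); x += FFN(FFNNorm(x))
	ResPostAttnFFN: x += Attn(AttnNorm(x)); x += FFN(PostAttnNorm(x))
	ResParallel:    n = AttnNorm(x);        x += Attn(n) + FFN(n)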

type RunState

type RunState struct {
	X          []float32   // [dim] current activation
	XNorm      []float32   // [dim] normalized activation
	Q          []float32   // [qDim] query projection
	K          []float32   // [kvDim] key projection
	V          []float32   // [kvDim] value projection
	AttnOut    []float32   // [qDim] attention output
	AttnProj   []float32   // [dim] output projection
	FFNIn      []float32   // [dim] FFN input (after residual)
	FFNNorm    []float32   // [dim] FFN normalized
	Gate       []float32   // [ffnDim] gate projection
	Up         []float32   // [ffnDim] up projection
	Hidden     []float32   // [ffnDim] gated hidden
	FFNOut     []float32   // [dim] FFN output
	Logits     []float32   // [vocabSize] output logits
	Scores     []float32   // [maxSeqLen] attention scores scratch (legacy)
	HeadScores [][]float32 // [numHeads][maxSeqLen] per-head score buffers for parallel attention

	// Qwen3.5 gated attention: Wq outputs interleaved [Q,gate] per head
	QFull []float32 // [2*qDim] fused Q+gate output (nil for non-gated models)
	QGate []float32 // [qDim] attention gate values (nil for non-gated models)

	// SSM (Gated Delta Net) scratch buffers — nil for pure transformer models
	SSMRun   *SSMRunState
	SSMState *memory.SSMStateCache

	// MoE (Mixture of Experts) scratch buffers — nil for dense models
	MoELogits     []float32   // [expertCount] router logits
	MoEGates      [][]float32 // [nUsed][expertFFNDim] per-expert gate (parallel)
	MoEUps        [][]float32 // [nUsed][expertFFNDim] per-expert up (parallel)
	MoEHiddens    [][]float32 // [nUsed][expertFFNDim] per-expert hidden (parallel)
	MoEExpertOuts [][]float32 // [nUsed][dim] per-expert output (parallel)
	MoEShGate     []float32   // [sharedFFNDim] shared expert gate
	MoEShUp       []float32   // [sharedFFNDim] shared expert up
	MoEShHidden   []float32   // [sharedFFNDim] shared expert hidden
	MoEShOut      []float32   // [dim] shared expert output

	// MLA (Multi-head Latent Attention) scratch buffers
	MLAQComp     []float32 // [qLORARank] compressed Q intermediate
	MLAQAbsorbed []float32 // [numHeads * kvLORARank] absorbed key vectors per head
	MLAAttnKV    []float32 // [numHeads * kvLORARank] weighted KV sum per head

	// Worker pool for parallel matmul
	Pool *blas.Pool
	// contains filtered or unexported fields
}

RunState holds pre-allocated buffers for inference, avoiding per-token allocations.

func NewRunState

func NewRunState(cfg ModelConfig, maxSeqLen int) *RunState

NewRunState allocates all buffers for a model.

func (*RunState) ApplyRoPEFast

func (rs *RunState) ApplyRoPEFast(vec []float32, pos int)

ApplyRoPEFast applies precomputed RoPE to vec in-place. vec must be one head's worth [headDim]; pos is the token position. Only the first ropeDim dimensions are rotated; the rest pass through unchanged.

func (*RunState) ApplyRoPEFastDim

func (rs *RunState) ApplyRoPEFastDim(vec []float32, pos, ropeDim int)

ApplyRoPEFastDim applies RoPE to a vector of exactly ropeDim elements (e.g. the rope portion of MLA Q/K heads).

func (*RunState) PrecomputeRoPE

func (rs *RunState) PrecomputeRoPE(maxSeqLen, ropeDim, headDim int, freqBase float32)

PrecomputeRoPE fills RunState with precomputed cos/sin tables for RoPE. maxSeqLen: maximum sequence length; ropeDim, headDim: dimensions; freqBase: RoPE frequency base. ropeDim may be less than headDim for partial RoPE (e.g. Phi-2: 32 of 80).
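
The Phi-2 numbers from above, as a call (the 10000 base is the conventional default, assumed here):

	rs.PrecomputeRoPE(maxSeqLen, 32, 80, 10000) // rotate 32 of 80 dims per head
	rs.ApplyRoPEFast(rs.Q[:80], pos)            // first head: dims [0,32) rotated, rest untouched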

func (*RunState) PrecomputeSWARoPE

func (rs *RunState) PrecomputeSWARoPE(maxSeqLen, ropeDim, headDim int, freqBase float32)

PrecomputeSWARoPE builds separate cos/sin tables for SWA layers with a different frequency base (e.g. gpt-oss ISWA uses theta=10000 for SWA, theta=150000 for full attention).

func (*RunState) PrecomputeYaRNRoPE

func (rs *RunState) PrecomputeYaRNRoPE(maxSeqLen, ropeDim, headDim int,
	freqBase, factor float32, origMaxPos int, betaFast, betaSlow, extFactor, attnFactor float32)

PrecomputeYaRNRoPE precomputes RoPE tables with YaRN extended-context scaling.

func (*RunState) RoPETables

func (rs *RunState) RoPETables() (cosTable, sinTable []float32)

RoPETables returns the precomputed cos/sin tables for GPU upload.

func (*RunState) RoPETablesSWA

func (rs *RunState) RoPETablesSWA() (cosTable, sinTable []float32)

RoPETablesSWA returns the SWA-specific RoPE tables (nil if not set).

func (*RunState) SetRopeNeox

func (rs *RunState) SetRopeNeox(neox bool)

SetRopeNeox sets whether to use NeoX-style (split-half) RoPE pairing. Call after PrecomputeRoPE if needed.

type SSMRunState

type SSMRunState struct {
	QKV     []float32 // [qkvDim] in-projection output (goes through conv)
	Z       []float32 // [valueDim] gate projection output
	Alpha   []float32 // [numHeads] raw alpha (decay param)
	Beta    []float32 // [numHeads] raw beta (learning rate)
	FusedBA []float32 // [2*numHeads] fused beta+alpha output (interleaved per KV group)
	Y       []float32 // [valueDim] attention/SSM output
}

SSMRunState holds pre-allocated scratch buffers for SSM (Gated Delta Net) layers.

type Tokenizer

type Tokenizer struct {
	// Backward-compatible fields
	Tokens    []string
	TokenToID map[string]int32
	BOS       int32
	EOS       int32
	AddBOS    bool
	PreBOS    int32 // token to prepend before BOS (e.g., [gMASK]); -1 = unused

	// BPE/SPM-specific fields
	MergeRanks    map[[2]string]int // GPT-2: merge pair -> priority (lower = merge first)
	Scores        []float32         // SentencePiece: token scores for merge ordering
	ModelType     string            // "llama" or "gpt2"
	SpecialTokens map[string]int32  // special/control token text -> id
	// contains filtered or unexported fields
}

Tokenizer encodes text to token IDs and decodes token IDs back to text. Supports both SentencePiece (LLaMA) and GPT-2 BPE (Qwen) tokenization.

func NewTokenizerFromGGUF

func NewTokenizerFromGGUF(md map[string]interface{}, cfg ModelConfig) (*Tokenizer, error)

NewTokenizerFromGGUF extracts vocabulary and tokenizer config from GGUF metadata. Auto-detects tokenizer type based on tokenizer.ggml.model: "llama" -> SentencePiece, "gpt2" -> GPT-2 BPE. Falls back to heuristic if model key is missing.

func (*Tokenizer) Decode

func (t *Tokenizer) Decode(tokens []int32) string

Decode converts token IDs back to text.

func (*Tokenizer) DecodeToken

func (t *Tokenizer) DecodeToken(id int32) string

DecodeToken converts a single token ID to its string representation. Special tokens like thinking markers are preserved as their raw text.

func (*Tokenizer) Encode

func (t *Tokenizer) Encode(text string) []int32

Encode converts text to a sequence of token IDs.
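
Encode and Decode round-trip ordinary text (t is a *Tokenizer from NewTokenizerFromGGUF):

	ids := t.Encode("Hello, world")
	fmt.Println(ids)           // token IDs, model-dependent
	fmt.Println(t.Decode(ids)) // "Hello, world"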

func (*Tokenizer) VocabSize

func (t *Tokenizer) VocabSize() int

VocabSize returns the vocabulary size.

type WorkerPool

type WorkerPool struct {
	// contains filtered or unexported fields
}

WorkerPool manages persistent goroutines for parallel inference. Each worker has its own task channel; work is distributed round-robin.

func NewWorkerPool

func NewWorkerPool(n int) *WorkerPool

NewWorkerPool creates a pool with n persistent workers. Callers must call Shutdown when done.

func (*WorkerPool) Dispatch

func (wp *WorkerPool) Dispatch(total, numActive int, work func(workerID, start, end int))

Dispatch distributes work across numActive workers and blocks until complete. total is the total number of items; work(workerID, start, end) processes [start, end).
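
A minimal use, scaling a slice in parallel:

	pool := NewWorkerPool(runtime.NumCPU())
	defer pool.Shutdown()
	pool.Dispatch(len(xs), runtime.NumCPU(), func(workerID, start, end int) {
		for i := start; i < end; i++ {
			xs[i] *= 2 // each worker owns a disjoint [start, end)
		}
	})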

func (*WorkerPool) DispatchAsync

func (wp *WorkerPool) DispatchAsync(total, numActive int, work func(workerID, start, end int))

DispatchAsync distributes work without waiting. Call Wait() to block until all async work completes.

func (*WorkerPool) Shutdown

func (wp *WorkerPool) Shutdown()

Shutdown stops all workers and waits for them to exit.

func (*WorkerPool) Wait

func (wp *WorkerPool) Wait()

Wait blocks until all work dispatched via DispatchAsync has completed.
