Documentation
¶
Index ¶
- func WithCache[T tensor.Numeric](ctx context.Context, cache CacheProvider[T]) context.Context
- func WithKVCache[T tensor.Numeric](ctx context.Context, cache *KVCache[T]) context.Context
- type BatchRequest
- type BatchResult
- type Block
- type BlockPool
- type CacheProvider
- type GPUAllocator
- type GPUKVCache
- func (c *GPUKVCache) Append(layerIdx int, k, v []float32, seqPos int) error
- func (c *GPUKVCache) AppendGPU(layerIdx int, kSrc, vSrc unsafe.Pointer, stream unsafe.Pointer) error
- func (c *GPUKVCache) Close() error
- func (c *GPUKVCache) DevicePointerArrays() (kPtrs, vPtrs unsafe.Pointer, err error)
- func (c *GPUKVCache) GPUCounterPtr() unsafe.Pointer
- func (c *GPUKVCache) Pointers(layerIdx int) (kPtr, vPtr unsafe.Pointer, seqLen int)
- func (c *GPUKVCache) Reset()
- func (c *GPUKVCache) SeqLen() int
- func (c *GPUKVCache) SyncCounterFromGPU() error
- type Generator
- func (gen *Generator[T]) BatchGenerate(ctx context.Context, requests []BatchRequest) []BatchResult
- func (gen *Generator[T]) BatchGenerateStream(ctx context.Context, requests []BatchRequest, streams []TokenStream) []error
- func (gen *Generator[T]) Config() ModelConfig
- func (gen *Generator[T]) Engine() compute.Engine[T]
- func (gen *Generator[T]) Generate(ctx context.Context, prompt string, sc SamplingConfig) (string, error)
- func (gen *Generator[T]) GenerateStream(ctx context.Context, prompt string, sc SamplingConfig, stream TokenStream) error
- func (gen *Generator[T]) Graph() *graph.Graph[T]
- func (gen *Generator[T]) Tokenizer() tokenizer.Tokenizer
- type GeneratorOption
- type KVCache
- type LayerKV
- type ModelConfig
- type PagedKVCache
- func (c *PagedKVCache[T]) Append(layer int, newK, newV *tensor.TensorNumeric[T]) error
- func (c *PagedKVCache[T]) Free()
- func (c *PagedKVCache[T]) Get(layer int) (*LayerKV[T], bool)
- func (c *PagedKVCache[T]) GetKV(layer int) (*LayerKV[T], bool)
- func (c *PagedKVCache[T]) NumLayers() int
- func (c *PagedKVCache[T]) Reset()
- func (c *PagedKVCache[T]) SeqLen() int
- func (c *PagedKVCache[T]) Truncate(newSeqLen int)
- func (c *PagedKVCache[T]) Update(layer int, newK, newV *tensor.TensorNumeric[T]) error
- type SamplingConfig
- type SpeculativeGenerator
- type TensorCache
- func (c *TensorCache[T]) Free()
- func (c *TensorCache[T]) GPUCounterPtr() unsafe.Pointer
- func (c *TensorCache[T]) Get(layer int) (*LayerKV[T], bool)
- func (c *TensorCache[T]) GetFullBuffer(layer int) (k, v *tensor.TensorNumeric[T])
- func (c *TensorCache[T]) KVSeqLenPtr() unsafe.Pointer
- func (c *TensorCache[T]) MaxSeqLen() int
- func (c *TensorCache[T]) Reset()
- func (c *TensorCache[T]) SeqLen() int
- func (c *TensorCache[T]) SyncCounterFromGPU() error
- func (c *TensorCache[T]) Truncate(newSeqLen int)
- func (c *TensorCache[T]) Update(layer int, newK, newV *tensor.TensorNumeric[T]) error
- type TensorCacheOption
- type TokenStream
- type TokenStreamFunc
- type TracingCacheProvider
- func (t *TracingCacheProvider[T]) Get(layer int) (*LayerKV[T], bool)
- func (t *TracingCacheProvider[T]) Reset()
- func (t *TracingCacheProvider[T]) SeqLen() int
- func (t *TracingCacheProvider[T]) Truncate(newSeqLen int)
- func (t *TracingCacheProvider[T]) Update(layer int, newK, newV *tensor.TensorNumeric[T]) error
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
Types ¶
type BatchRequest ¶
type BatchRequest struct {
Prompt string
Sampling SamplingConfig
}
BatchRequest represents a single generation request in a batch.
type BatchResult ¶
BatchResult holds the output for a single request in a batch.
type Block ¶
type Block[T tensor.Numeric] struct {
	K    []T
	V    []T
	Used int // number of token positions written (0..blockSize)
}
Block holds pre-allocated key and value data for a fixed number of token positions across all layers. K and V each have numLayers * blockSize * headDim elements laid out as [layer][position][headDim] in row-major order.
type BlockPool ¶
BlockPool manages a fixed-size pool of pre-allocated KV cache blocks. Blocks are allocated at startup and recycled via Alloc/Free. All methods are safe for concurrent use.
func NewBlockPool ¶
func NewBlockPool[T tensor.Numeric](numLayers, blockSize, headDim, maxMemoryMB int) (*BlockPool[T], error)
NewBlockPool creates a pool of blocks sized to fit within maxMemoryMB. Each block holds K and V data for blockSize token positions across numLayers, with headDim elements per position per layer. The element size is assumed to be 4 bytes (float32).
func (*BlockPool[T]) Alloc ¶
Alloc returns a free block from the pool. Returns an error if the pool is exhausted. The returned block has Used reset to 0.
type CacheProvider ¶
type CacheProvider[T tensor.Numeric] interface {
	Update(layer int, newK, newV *tensor.TensorNumeric[T]) error
	Get(layer int) (*LayerKV[T], bool)
	SeqLen() int
	Reset()
	Truncate(newSeqLen int)
}
CacheProvider is the interface implemented by both KVCache (pre-allocated) and PagedKVCache (block-based). Attention layers use this interface to store and retrieve cached key-value tensors during generation.
type GPUAllocator ¶
type GPUAllocator interface {
// Alloc allocates size bytes of device memory and returns a device pointer.
Alloc(size int) (unsafe.Pointer, error)
// Free releases device memory previously returned by Alloc.
Free(ptr unsafe.Pointer) error
// Memcpy copies size bytes between host and device memory.
// kind follows the gpuapi convention: 0 = HostToDevice, 1 = DeviceToHost.
Memcpy(dst, src unsafe.Pointer, size int, kind int) error
}
GPUAllocator abstracts GPU memory operations so that GPUKVCache can be tested without a real GPU device. Production code passes a thin wrapper around gpuapi.Runtime; tests supply a mock.
type GPUKVCache ¶
type GPUKVCache struct {
// contains filtered or unexported fields
}
GPUKVCache manages GPU-resident key/value buffers for all attention layers during megakernel inference. Memory is allocated once at construction and reused across generation steps.
func NewGPUKVCache ¶
func NewGPUKVCache(alloc GPUAllocator, numLayers, maxSeqLen, numHeads, headDim int) (*GPUKVCache, error)
NewGPUKVCache allocates GPU buffers for numLayers attention layers. Each layer gets two buffers (K and V) of size maxSeqLen * numHeads * headDim float32 elements.
func (*GPUKVCache) Append ¶
func (c *GPUKVCache) Append(layerIdx int, k, v []float32, seqPos int) error
Append copies new K/V float32 data to the correct position in the GPU buffer for the given layer. k and v must each have length numHeads * headDim (one token's worth of data). seqPos is the sequence position to write at; it must equal the current seqLen (enforcing sequential append).
func (*GPUKVCache) AppendGPU ¶
func (c *GPUKVCache) AppendGPU(layerIdx int, kSrc, vSrc unsafe.Pointer, stream unsafe.Pointer) error
AppendGPU copies one token's K/V data from GPU-resident src pointers into the KV cache using the offset_memcpy kernel. The kernel reads gpuCounter on the GPU to compute the write offset, eliminating any D2H copy per token. After writing K and V for the last layer, it increments the GPU counter via the increment_counter kernel and advances the CPU seqLen for compatibility.
kSrc and vSrc must each point to numHeads*headDim float32 values on the GPU. stream is the CUDA stream for async execution.
func (*GPUKVCache) Close ¶
func (c *GPUKVCache) Close() error
Close frees all GPU memory held by the cache. The cache must not be used after Close is called.
func (*GPUKVCache) DevicePointerArrays ¶
func (c *GPUKVCache) DevicePointerArrays() (kPtrs, vPtrs unsafe.Pointer, err error)
DevicePointerArrays returns GPU-resident arrays of float* pointers for K and V buffers across all layers. These can be passed directly to the megakernel. The arrays are allocated once and cached.
func (*GPUKVCache) GPUCounterPtr ¶
func (c *GPUKVCache) GPUCounterPtr() unsafe.Pointer
GPUCounterPtr returns the device pointer to the GPU-resident int32 position counter. Kernels (offset_memcpy, rope_select, increment_counter) use this pointer to read/write the current sequence position on the GPU, enabling CUDA graph capture of the decode loop.
func (*GPUKVCache) Pointers ¶
func (c *GPUKVCache) Pointers(layerIdx int) (kPtr, vPtr unsafe.Pointer, seqLen int)
Pointers returns the device pointers for the K and V buffers of the given layer, along with the current sequence length. The megakernel reads from these pointers directly.
func (*GPUKVCache) Reset ¶
func (c *GPUKVCache) Reset()
Reset resets the sequence position to zero without freeing GPU memory. Buffers are reused for the next generation. The GPU counter is also zeroed so that GPU-side kernels see the reset position.
func (*GPUKVCache) SeqLen ¶
func (c *GPUKVCache) SeqLen() int
SeqLen returns the current cached sequence length.
func (*GPUKVCache) SyncCounterFromGPU ¶
func (c *GPUKVCache) SyncCounterFromGPU() error
SyncCounterFromGPU performs a D2H copy of the GPU counter to update the CPU seqLen. Call this after the decode loop completes, not per token.
type Generator ¶
Generator produces text autoregressively using a loaded model graph.
func NewGenerator ¶
func NewGenerator[T tensor.Numeric]( g *graph.Graph[T], tok tokenizer.Tokenizer, eng compute.Engine[T], cfg ModelConfig, opts ...GeneratorOption, ) *Generator[T]
NewGenerator creates a Generator from a model graph, tokenizer, engine, and config.
func (*Generator[T]) BatchGenerate ¶
func (gen *Generator[T]) BatchGenerate(ctx context.Context, requests []BatchRequest) []BatchResult
BatchGenerate runs multiple generation requests concurrently. Each request gets its own KV cache and sampling state. This provides throughput gains when the model graph is configured with WithParallel(true) or when generation is I/O bound.
For true batched tensor operations (batch dimension > 1 in a single forward pass), the model graph and attention layers need native batch support, which is not yet implemented. This function provides request-level parallelism as an interim solution.
func (*Generator[T]) BatchGenerateStream ¶
func (gen *Generator[T]) BatchGenerateStream(ctx context.Context, requests []BatchRequest, streams []TokenStream) []error
BatchGenerateStream runs multiple streaming generation requests concurrently. Each request gets its own KV cache, sampling state, and token stream.
func (*Generator[T]) Config ¶
func (gen *Generator[T]) Config() ModelConfig
Config returns the model configuration.
func (*Generator[T]) Generate ¶
func (gen *Generator[T]) Generate(ctx context.Context, prompt string, sc SamplingConfig) (string, error)
Generate produces text from a prompt using the given sampling configuration. It tokenizes the prompt, runs the autoregressive loop with KV caching, and returns the generated text (excluding the prompt).
func (*Generator[T]) GenerateStream ¶
func (gen *Generator[T]) GenerateStream(ctx context.Context, prompt string, sc SamplingConfig, stream TokenStream) error
GenerateStream produces text from a prompt, delivering each token to the stream as it is generated. The final output matches what Generate would return.
type GeneratorOption ¶
type GeneratorOption func(*generatorOptions)
GeneratorOption configures a Generator.
func WithGeneratorKVDtype ¶
func WithGeneratorKVDtype(dtype string) GeneratorOption
WithGeneratorKVDtype sets the KV cache storage dtype. Supported: "fp32" (default), "fp16".
func WithPagedKV ¶
func WithPagedKV(maxMemoryMB, headDim int) GeneratorOption
WithPagedKV enables paged KV caching with the given memory budget in MB. When enabled, the Generator allocates blocks from a shared BlockPool instead of pre-allocating the full maxSeqLen per sequence. headDim is the per-position storage size: for GQA models pass numKVHeads * actualHeadDim so the pool can store all KV heads per position.
type KVCache ¶
KVCache stores key-value tensors for all attention layers during autoregressive generation. Buffers are pre-allocated to maxSeqLen on first Update, and subsequent Updates copy data at the cursor position with zero allocation.
func GetKVCache ¶
GetKVCache extracts the KVCache from the context, if present. It handles both direct *KVCache storage and CacheProvider interface storage. Deprecated: Use GetCache for CacheProvider-based caching.
func NewKVCache ¶
NewKVCache creates a KVCache for the specified number of layers and maximum sequence length. Backing buffers are lazily allocated on the first Update call for each layer (when batch and dim become known).
func (*KVCache[T]) Get ¶
Get returns the cached key-value pair for the given layer as tensors covering [0:cursor] on the sequence axis. For batch=1, the returned tensors are zero-copy views over the pre-allocated buffer. For batch>1, data is compacted into a contiguous slice. Returns false if the layer has not been populated yet.
func (*KVCache[T]) Reset ¶
func (c *KVCache[T]) Reset()
Reset clears all cached data and resets cursors to zero. The pre-allocated buffers are retained for reuse.
func (*KVCache[T]) SeqLen ¶
SeqLen returns the current cached sequence length. Returns 0 if the cache is empty.
func (*KVCache[T]) Truncate ¶
Truncate rolls back the cache to the given sequence length. If newSeqLen >= current SeqLen, this is a no-op.
func (*KVCache[T]) Update ¶
func (c *KVCache[T]) Update(layer int, newK, newV *tensor.TensorNumeric[T]) error
Update appends new key and value tensors to the cache for the given layer. Tensors are expected to have shape [batch, seq_len, dim]. Data is copied into the pre-allocated buffer at the current cursor position. After the initial allocation, Update performs zero heap allocations.
type LayerKV ¶
type LayerKV[T tensor.Numeric] struct {
	Key   *tensor.TensorNumeric[T]
	Value *tensor.TensorNumeric[T]
}
LayerKV holds the cached key and value tensors for a single attention layer.
type ModelConfig ¶
type ModelConfig struct {
VocabSize int // Total tokens in vocabulary
MaxSeqLen int // Maximum sequence length the model supports
EOSTokenID int // End-of-sequence token ID
BOSTokenID int // Beginning-of-sequence token ID
NumLayers int // Number of transformer layers (for KV cache sizing)
}
ModelConfig holds model architecture parameters needed for generation.
type PagedKVCache ¶
PagedKVCache stores key-value tensors for autoregressive generation using block-level allocation from a BlockPool. Instead of pre-allocating the full maxSeqLen per sequence, blocks of blockSize tokens are allocated on demand, reducing memory waste for concurrent sequences of varying length.
Each sequence gets its own PagedKVCache. The cache accepts tensors with arbitrary first dimensions (channels). GQA attention stores KV as [batchSize*numKVHeads, seqLen, headDim]; the pool's headDim must equal channels * dim to accommodate the full per-position data.
func NewPagedKVCache ¶
func NewPagedKVCache[T tensor.Numeric](pool *BlockPool[T], numLayers int) *PagedKVCache[T]
NewPagedKVCache creates a paged KV cache backed by the given block pool.
func (*PagedKVCache[T]) Append ¶
func (c *PagedKVCache[T]) Append(layer int, newK, newV *tensor.TensorNumeric[T]) error
Append writes new key and value data for the given layer. The tensors must have shape [channels, seqLen, dim] where channels*dim equals the pool's headDim. For standard caching channels=1; for GQA caching channels equals batchSize*numKVHeads. Data is written into the current block; a new block is allocated from the pool when the current one fills up.
func (*PagedKVCache[T]) Free ¶
func (c *PagedKVCache[T]) Free()
Free returns all allocated blocks to the pool and resets the cache.
func (*PagedKVCache[T]) Get ¶
func (c *PagedKVCache[T]) Get(layer int) (*LayerKV[T], bool)
Get returns the cached KV for the given layer. This is an alias for GetKV that satisfies the CacheProvider interface.
func (*PagedKVCache[T]) GetKV ¶
func (c *PagedKVCache[T]) GetKV(layer int) (*LayerKV[T], bool)
GetKV returns the cached key and value tensors for the given layer, gathered into contiguous [channels, seqLen, dim] tensors. Returns false if the layer is out of range or the cache is empty for that layer.
func (*PagedKVCache[T]) NumLayers ¶
func (c *PagedKVCache[T]) NumLayers() int
NumLayers returns the number of layers in the cache.
func (*PagedKVCache[T]) Reset ¶
func (c *PagedKVCache[T]) Reset()
Reset clears the cache and returns all blocks to the pool.
func (*PagedKVCache[T]) SeqLen ¶
func (c *PagedKVCache[T]) SeqLen() int
SeqLen returns the number of token positions stored in the cache, based on layer 0's cursor. Returns 0 if the cache is empty.
func (*PagedKVCache[T]) Truncate ¶
func (c *PagedKVCache[T]) Truncate(newSeqLen int)
Truncate rolls back the cache to the given sequence length. Blocks beyond the new length are returned to the pool.
func (*PagedKVCache[T]) Update ¶
func (c *PagedKVCache[T]) Update(layer int, newK, newV *tensor.TensorNumeric[T]) error
Update appends new key and value data for the given layer. This is an alias for Append that satisfies the CacheProvider interface.
type SamplingConfig ¶
type SamplingConfig struct {
Temperature float64 // Divide logits by this value; 0 = greedy
TopK int // Keep only top K tokens; 0 = disabled
TopP float64 // Keep tokens with cumulative prob >= P; 1.0 = disabled
RepetitionPenalty float64 // Penalize repeated tokens; 1.0 = disabled
MaxNewTokens int // Maximum number of tokens to generate
StopTokenIDs []int // Stop when any of these token IDs are generated
StopStrings []string // Stop when output contains any of these strings
}
SamplingConfig controls how tokens are selected during generation.
func DefaultSamplingConfig ¶
func DefaultSamplingConfig() SamplingConfig
DefaultSamplingConfig returns a SamplingConfig with sensible defaults.
type SpeculativeGenerator ¶
SpeculativeGenerator implements speculative decoding using a small draft model and a large target model. The draft model proposes N tokens greedily, then the target model verifies all N in a single batched forward pass. Accepted tokens are emitted; on first mismatch the target's token is used.
func NewSpeculativeGenerator ¶
func NewSpeculativeGenerator[T tensor.Numeric]( draftGraph, targetGraph *graph.Graph[T], tok tokenizer.Tokenizer, engine compute.Engine[T], draftCfg, targetCfg ModelConfig, draftLen int, ) *SpeculativeGenerator[T]
NewSpeculativeGenerator creates a speculative generator with separate draft and target model graphs. draftLen controls how many tokens the draft model proposes per verification step (typically 2-8).
func (*SpeculativeGenerator[T]) Generate ¶
func (sg *SpeculativeGenerator[T]) Generate(ctx context.Context, prompt string, sc SamplingConfig) (string, error)
Generate produces text from a prompt using speculative decoding with greedy sampling. The draft model proposes tokens, the target model verifies them.
func (*SpeculativeGenerator[T]) WithAdaptive ¶
func (sg *SpeculativeGenerator[T]) WithAdaptive(enabled bool) *SpeculativeGenerator[T]
WithAdaptive enables or disables adaptive draft length adjustment. When enabled (default), the draft length is adjusted based on acceptance rate.
type TensorCache ¶
TensorCache is a KV cache that keeps tensors in pre-allocated buffers. On the first Update for a layer, it allocates [batch, maxSeqLen, dim] memory (GPU or CPU depending on the source tensor). Subsequent Updates append new K/V data via direct memcpy at the correct offset, avoiding per-token allocation overhead.
func NewTensorCache ¶
func NewTensorCache[T tensor.Numeric](engine compute.Engine[T], numLayers, maxSeqLen int, opts ...TensorCacheOption) *TensorCache[T]
NewTensorCache creates a TensorCache backed by the given engine. numLayers should match the model's transformer layer count. maxSeqLen limits the total cached sequence length. If the engine implements GPUStreamAccessor, async memcpy is used for KV cache updates (required for CUDA graph capture compatibility).
func (*TensorCache[T]) Free ¶
func (c *TensorCache[T]) Free()
Free releases all pre-allocated GPU buffers. CPU buffers are left to GC.
func (*TensorCache[T]) GPUCounterPtr ¶
func (c *TensorCache[T]) GPUCounterPtr() unsafe.Pointer
GPUCounterPtr returns the device pointer to the GPU-resident int32 position counter. Returns nil if no GPU counter is allocated (CPU-only cache). Kernels (offset_memcpy, rope_select, increment_counter) use this pointer to read/write the current sequence position on the GPU, enabling CUDA graph capture of the decode loop.
func (*TensorCache[T]) Get ¶
func (c *TensorCache[T]) Get(layer int) (*LayerKV[T], bool)
Get returns the cached key-value pair for the given layer. For GPU-backed layers, returns a view into the pre-allocated buffer. For CPU-backed layers, returns a tensor wrapping the buffer slice. Returns false if the layer index is out of range or the layer is empty.
func (*TensorCache[T]) GetFullBuffer ¶
func (c *TensorCache[T]) GetFullBuffer(layer int) (k, v *tensor.TensorNumeric[T])
GetFullBuffer returns GPU-backed KV tensors spanning the full pre-allocated buffer (maxSeqLen capacity) for the given layer. The shape is [batch, maxSeqLen, dim]. This is used by flash_attention_decode which reads the actual KV length from a GPU-resident counter, so it needs the buffer with its full stride rather than a seqLen-trimmed view. Returns nil if the layer is CPU-backed or not yet initialized.
func (*TensorCache[T]) KVSeqLenPtr ¶
func (c *TensorCache[T]) KVSeqLenPtr() unsafe.Pointer
KVSeqLenPtr returns the device pointer to the GPU-resident int32 KV sequence length counter. Returns nil if not allocated (CPU-only cache). The flash_attention_decode kernel reads this pointer at runtime so the KV length is not frozen by CUDA graph capture.
func (*TensorCache[T]) MaxSeqLen ¶
func (c *TensorCache[T]) MaxSeqLen() int
MaxSeqLen returns the maximum sequence length (buffer capacity).
func (*TensorCache[T]) Reset ¶
func (c *TensorCache[T]) Reset()
Reset clears sequence lengths to zero. Pre-allocated buffers are kept for reuse; only data pointers are logically invalidated. The GPU counters are also zeroed so that GPU-side kernels see the reset position.
func (*TensorCache[T]) SeqLen ¶
func (c *TensorCache[T]) SeqLen() int
SeqLen returns the current cached sequence length (from layer 0).
func (*TensorCache[T]) SyncCounterFromGPU ¶
func (c *TensorCache[T]) SyncCounterFromGPU() error
SyncCounterFromGPU performs a D2H copy of the GPU counter to update the CPU seqLen across all layers. Call this after the decode loop completes to bring the CPU-side cursor back in sync with the GPU counter.
func (*TensorCache[T]) Truncate ¶
func (c *TensorCache[T]) Truncate(newSeqLen int)
Truncate rolls back the cache to the given sequence length. Pre-allocated buffers are kept; the data beyond newSeqLen is simply ignored. GPU-resident counters (gpuCounter and kvSeqLenCounter) are also reset to match newSeqLen so that GPU-side kernels (offset_memcpy, rope_select, flash_attention_decode) see the correct position after rollback.
func (*TensorCache[T]) Update ¶
func (c *TensorCache[T]) Update(layer int, newK, newV *tensor.TensorNumeric[T]) error
Update appends new key and value tensors to the cache for the given layer. Tensors must be 3D with shape [batch, seqLen, dim]. On the first call for a layer, pre-allocated buffers are created. Subsequent calls copy new data directly into the buffers at the current sequence offset.
type TensorCacheOption ¶
type TensorCacheOption func(*tensorCacheOptions)
TensorCacheOption configures a TensorCache.
func WithKVDtype ¶
func WithKVDtype(dtype string) TensorCacheOption
WithKVDtype sets the KV cache storage dtype. Supported values: "fp32" (default), "fp16". FP16 mode halves KV memory bandwidth but requires GPU and CUDA conversion kernels.
type TokenStream ¶
type TokenStream interface {
// OnToken is called for each decoded token during streaming generation.
// When done is true, generation is complete (token may be empty).
// Returning a non-nil error stops generation.
OnToken(token string, done bool) error
}
TokenStream receives tokens as they are generated.
type TokenStreamFunc ¶
TokenStreamFunc adapts a function to the TokenStream interface.
type TracingCacheProvider ¶
TracingCacheProvider wraps a real CacheProvider and records KV cache operations into a Tracer during a tracing compilation pass. This allows the tracing compiler to capture the full attention dataflow including cache reads and writes.
func NewTracingCacheProvider ¶
func NewTracingCacheProvider[T tensor.Numeric](real CacheProvider[T], tracer *compute.Tracer[T]) *TracingCacheProvider[T]
NewTracingCacheProvider creates a TracingCacheProvider wrapping the given real cache and recording ops into the tracer.
func (*TracingCacheProvider[T]) Get ¶
func (t *TracingCacheProvider[T]) Get(layer int) (*LayerKV[T], bool)
Get delegates to the real cache and records KVCacheGetK/V ops.
func (*TracingCacheProvider[T]) Reset ¶
func (t *TracingCacheProvider[T]) Reset()
Reset delegates to the real cache.
func (*TracingCacheProvider[T]) SeqLen ¶
func (t *TracingCacheProvider[T]) SeqLen() int
SeqLen delegates to the real cache.
func (*TracingCacheProvider[T]) Truncate ¶
func (t *TracingCacheProvider[T]) Truncate(newSeqLen int)
Truncate delegates to the real cache.
func (*TracingCacheProvider[T]) Update ¶
func (t *TracingCacheProvider[T]) Update(layer int, newK, newV *tensor.TensorNumeric[T]) error
Update delegates to the real cache and records KVCacheAppendK/V ops.