Documentation
Overview ¶
Package graph provides a computational graph abstraction.
Index ¶
- Variables
- func ApplyBatchSchedule[T tensor.Numeric](plan *ExecutionPlan[T], captureStart, captureEnd int, schedule *BatchSchedule) (newStart, newEnd int)
- func Checkpoint[T tensor.Numeric](...) *checkpointNode[T]
- func ParallelForward[T tensor.Numeric](ctx context.Context, g *Graph[T], inputs ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error)
- type BatchGroup
- type BatchSchedule
- type BufferArena
- type BufferLayout
- type Builder
- type CUDAGraphExecutor
- type CheckpointedSegment
- type EmbeddedFrozenProvider
- type ExecutionPlan
- func (p *ExecutionPlan[T]) BufferLayout() *BufferLayout
- func (p *ExecutionPlan[T]) ComputeLastUse()
- func (p *ExecutionPlan[T]) EnsureCaptureInputsGPU(start, end int, gpuSlotCache map[int]*tensor.TensorNumeric[T])
- func (p *ExecutionPlan[T]) EnsureSlotsGPU(gpuSlotCache map[int]*tensor.TensorNumeric[T])
- func (p *ExecutionPlan[T]) FrozenSlots() []FrozenSlot[T]
- func (p *ExecutionPlan[T]) HasLastUse() bool
- func (p *ExecutionPlan[T]) HasPreallocatedBuffers() bool
- func (p *ExecutionPlan[T]) InputSlots() []int
- func (p *ExecutionPlan[T]) InstructionCount() int
- func (p *ExecutionPlan[T]) InstructionOpName(i int) string
- func (p *ExecutionPlan[T]) InstructionOutputIdx(i int) int
- func (p *ExecutionPlan[T]) Instructions() []InstructionMeta
- func (p *ExecutionPlan[T]) LastUse(slotIdx int) int
- func (p *ExecutionPlan[T]) OutputSlot() int
- func (p *ExecutionPlan[T]) OutputTensor() *tensor.TensorNumeric[T]
- func (p *ExecutionPlan[T]) PreUploadFrozenWeights() error
- func (p *ExecutionPlan[T]) PreallocateBuffers()
- func (p *ExecutionPlan[T]) PrepareSlots(inputs ...*tensor.TensorNumeric[T]) error
- func (p *ExecutionPlan[T]) Run(ctx context.Context, inputs ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error)
- func (p *ExecutionPlan[T]) RunBatched(ctx context.Context, schedule *BatchSchedule, ...) (*tensor.TensorNumeric[T], error)
- func (p *ExecutionPlan[T]) RunInstructionRange(ctx context.Context, start, end int) error
- func (p *ExecutionPlan[T]) RunInstructions(ctx context.Context, inputs ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error)
- func (p *ExecutionPlan[T]) ScratchSlot(idx int) *tensor.TensorNumeric[T]
- func (p *ExecutionPlan[T]) SetArenaFreeFn(fn func(ptr unsafe.Pointer, byteSize int))
- func (p *ExecutionPlan[T]) SetMegakernelFn(...)
- func (p *ExecutionPlan[T]) SetScratchSlot(idx int, t *tensor.TensorNumeric[T])
- func (p *ExecutionPlan[T]) SlotCount() int
- func (p *ExecutionPlan[T]) SlotShapes() [][]int
- type FrozenSlot
- type Graph
- func (g *Graph[T]) AddKVPair(input StatefulInputNode[T], output Node[T])
- func (g *Graph[T]) Backward(ctx context.Context, mode types.BackwardMode, ...) error
- func (g *Graph[T]) ClearMemo()
- func (g *Graph[T]) Compile(ctx context.Context, inputs ...*tensor.TensorNumeric[T]) (*ExecutionPlan[T], error)
- func (g *Graph[T]) CompileTraced(ctx context.Context, inputs ...*tensor.TensorNumeric[T]) (*ExecutionPlan[T], error)
- func (g *Graph[T]) ConstantTensors() []*tensor.TensorNumeric[T]
- func (g *Graph[T]) Dependencies(n Node[T]) []Node[T]
- func (g *Graph[T]) EngineProxy() *compute.EngineProxy[T]
- func (g *Graph[T]) Forward(ctx context.Context, inputs ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error)
- func (g *Graph[T]) GetAllNodes() []Node[T]
- func (g *Graph[T]) GetDependencies() map[Node[T]][]Node[T]
- func (g *Graph[T]) GetNodeMetadata(n Node[T]) map[string]interface{}
- func (g *Graph[T]) GetTopologicalOrder() ([]Node[T], error)
- func (g *Graph[T]) Inputs() []Node[T]
- func (g *Graph[T]) LoadCheckpoint(path string) (map[string]interface{}, error)
- func (g *Graph[T]) LoadParameters(params map[string][]T) error
- func (g *Graph[T]) LoadParametersFromFile(path string) error
- func (g *Graph[T]) Nodes() []Node[T]
- func (g *Graph[T]) Output() Node[T]
- func (g *Graph[T]) Parameters() []*Parameter[T]
- func (g *Graph[T]) ResetStatefulNodes()
- func (g *Graph[T]) SaveCheckpoint(path string, optimizerState map[string]interface{}) error
- func (g *Graph[T]) SaveParameters(path string) error
- func (g *Graph[T]) SetEngineProxy(proxy *compute.EngineProxy[T])
- func (g *Graph[T]) WithParallel(enabled bool)
- func (g *Graph[T]) WithPool(pool TensorReleaser[T])
- type Instruction
- type InstructionMeta
- type NoParameters
- type Node
- type Parameter
- type Resettable
- type StatefulInputNode
- type TensorReleaser
- type Transposer
Constants ¶
This section is empty.
Variables ¶
var ErrInvalidInputCount = errors.New("invalid number of inputs")
ErrInvalidInputCount is returned when the number of inputs to a node is incorrect.
Functions ¶
func ApplyBatchSchedule ¶ added in v0.3.0
func ApplyBatchSchedule[T tensor.Numeric](plan *ExecutionPlan[T], captureStart, captureEnd int, schedule *BatchSchedule) (newStart, newEnd int)
ApplyBatchSchedule expands the CUDAGraphExecutor's capture region to include adjacent batchable instructions that were previously excluded. It returns a revised pair of (captureStart, captureEnd) for the executor to use.
The expansion logic: scan outward from the current [captureStart, captureEnd) and absorb any contiguous batchable ops at the boundaries, provided they are also capturable (not in nonCapturableOps).
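The outward scan can be illustrated with a minimal standalone sketch (this is not the package's implementation; the op names and the batchable/nonCapturable sets below are hypothetical):

```go
package main

import "fmt"

// expandCaptureRegion grows [start, end) outward, absorbing contiguous ops
// at the boundaries that are both batchable and capturable.
func expandCaptureRegion(ops []string, start, end int, batchable, nonCapturable map[string]bool) (int, int) {
	// Grow leftward while the preceding op qualifies.
	for start > 0 && batchable[ops[start-1]] && !nonCapturable[ops[start-1]] {
		start--
	}
	// Grow rightward while the next op qualifies.
	for end < len(ops) && batchable[ops[end]] && !nonCapturable[ops[end]] {
		end++
	}
	return start, end
}

func main() {
	ops := []string{"EmbeddingLookup", "Add", "Mul", "MatMul", "Add", "D2HCopy"}
	batchable := map[string]bool{"Add": true, "Mul": true}
	nonCapturable := map[string]bool{"EmbeddingLookup": true, "D2HCopy": true}
	s, e := expandCaptureRegion(ops, 3, 4, batchable, nonCapturable)
	fmt.Println(s, e) // absorbs the adjacent Add/Mul ops: 1 5
}
```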
func Checkpoint ¶ added in v0.3.0
func Checkpoint[T tensor.Numeric](
	segmentFn func(ctx context.Context, inputs []*tensor.TensorNumeric[T]) ([]*tensor.TensorNumeric[T], error),
	outputShape []int,
) *checkpointNode[T]
Checkpoint wraps a subgraph segment function for gradient checkpointing. The returned function can be used in place of the original segment: it produces the same output during forward, but discards intermediate activations. During backward, intermediates are recomputed from the saved inputs.
Usage:
	checkpointed := graph.Checkpoint[float32](func(ctx context.Context, inputs []*tensor.TensorNumeric[float32]) ([]*tensor.TensorNumeric[float32], error) {
		// ... build and run a subgraph segment ...
		return outputs, nil
	})
	outputNode := checkpointed(builder, inputNodes)
func ParallelForward ¶
func ParallelForward[T tensor.Numeric](ctx context.Context, g *Graph[T], inputs ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error)
ParallelForward executes the forward pass with dependency-aware parallelism. Independent nodes are dispatched to a goroutine pool concurrently. The result is identical to sequential Forward.
Types ¶
type BatchGroup ¶ added in v0.3.0
type BatchGroup struct {
// Start is the inclusive start index of the group in the instruction list.
Start int
// End is the exclusive end index of the group in the instruction list.
End int
}
BatchGroup describes a contiguous range of instructions [Start, End) in an ExecutionPlan that can be dispatched together as a single logical unit. All instructions in a group are batchable ops (elementwise) whose data dependencies are entirely self-contained within the group or reference frozen/input slots outside it.
func (BatchGroup) Size ¶ added in v0.3.0
func (g BatchGroup) Size() int
Size returns the number of instructions in the group.
type BatchSchedule ¶ added in v0.3.0
type BatchSchedule struct {
// Groups is the ordered list of batch groups, sorted by Start index.
// Groups are non-overlapping and strictly ascending.
Groups []BatchGroup
// TotalInstructions is the total instruction count for the plan this
// schedule was derived from.
TotalInstructions int
// BatchedCount is the total number of instructions covered by groups.
BatchedCount int
}
BatchSchedule holds the batch groups identified by ScheduleBatches. Instruction indices not covered by any group are executed individually.
func ScheduleBatches ¶ added in v0.3.0
func ScheduleBatches[T tensor.Numeric](plan *ExecutionPlan[T]) *BatchSchedule
ScheduleBatches analyzes the ExecutionPlan and returns a BatchSchedule describing which adjacent instruction ranges can be dispatched as batches.
A batch group is a maximal contiguous run of instructions where:
- Every instruction's OpName is in batchableOps.
- The group contains at least minBatchSize instructions.
Data dependency constraints are NOT checked here because batchable ops form linear chains by construction in transformer graphs (each feeds the next). The caller is responsible for maintaining topological order.
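The grouping rule can be sketched as a standalone example (this is illustrative, not the package's implementation; the op names, the batchable set, and the minBatchSize threshold are hypothetical):

```go
package main

import "fmt"

// Group mirrors BatchGroup: a half-open [Start, End) instruction range.
type Group struct{ Start, End int }

// scheduleBatches finds maximal contiguous runs of batchable ops and keeps
// those with at least minBatchSize instructions.
func scheduleBatches(opNames []string, batchable map[string]bool, minBatchSize int) []Group {
	var groups []Group
	i := 0
	for i < len(opNames) {
		if !batchable[opNames[i]] {
			i++
			continue
		}
		j := i
		for j < len(opNames) && batchable[opNames[j]] {
			j++
		}
		if j-i >= minBatchSize {
			groups = append(groups, Group{Start: i, End: j})
		}
		i = j
	}
	return groups
}

func main() {
	ops := []string{"MatMul", "Add", "Mul", "Add", "Softmax", "Add", "Mul"}
	batchable := map[string]bool{"Add": true, "Mul": true}
	fmt.Println(scheduleBatches(ops, batchable, 2)) // [{1 4} {5 7}]
}
```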
func (*BatchSchedule) Coverage ¶ added in v0.3.0
func (s *BatchSchedule) Coverage() float64
Coverage returns the fraction of instructions covered by batch groups, in the range [0.0, 1.0].
func (*BatchSchedule) GroupCount ¶ added in v0.3.0
func (s *BatchSchedule) GroupCount() int
GroupCount returns the number of batch groups.
type BufferArena ¶
BufferArena pre-allocates tensor buffers for use by an ExecutionPlan. All buffers are created once and reused across Run() calls. Frozen slots (parameters, constants) are not zeroed on Reset.
func NewBufferArena ¶
func NewBufferArena[T tensor.Numeric](shapes [][]int) *BufferArena[T]
NewBufferArena pre-allocates one tensor per shape.
func (*BufferArena[T]) Get ¶
func (a *BufferArena[T]) Get(idx int) *tensor.TensorNumeric[T]
Get returns the pre-allocated buffer at index idx.
func (*BufferArena[T]) Len ¶
func (a *BufferArena[T]) Len() int
Len returns the number of buffer slots.
func (*BufferArena[T]) Reset ¶
func (a *BufferArena[T]) Reset()
Reset zeros all non-frozen buffer data for the next execution step.
func (*BufferArena[T]) Set ¶
func (a *BufferArena[T]) Set(idx int, t *tensor.TensorNumeric[T], freeze bool)
Set replaces the buffer at idx with the given tensor and optionally marks it as frozen (skip during Reset).
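The allocate-once/reuse pattern with frozen slots can be shown with a simplified stand-in type (a sketch of the idea only, not the package's BufferArena):

```go
package main

import "fmt"

// arena holds buffers allocated once and reused across runs; frozen slots
// (weights, constants) are skipped on Reset.
type arena struct {
	bufs   [][]float32
	frozen []bool
}

func newArena(sizes []int) *arena {
	a := &arena{bufs: make([][]float32, len(sizes)), frozen: make([]bool, len(sizes))}
	for i, n := range sizes {
		a.bufs[i] = make([]float32, n) // allocated once, reused every step
	}
	return a
}

// Reset zeros only the non-frozen buffers for the next execution step.
func (a *arena) Reset() {
	for i, b := range a.bufs {
		if a.frozen[i] {
			continue
		}
		for j := range b {
			b[j] = 0
		}
	}
}

func main() {
	a := newArena([]int{2, 2})
	a.frozen[0] = true // slot 0 holds frozen weights
	a.bufs[0][0], a.bufs[1][0] = 1.5, 3.0
	a.Reset()
	fmt.Println(a.bufs[0][0], a.bufs[1][0]) // frozen data survives: 1.5 0
}
```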
type BufferLayout ¶
type BufferLayout struct {
// Offsets[i] is the element offset of slot i into the contiguous buffer.
// A value of -1 means the slot is not part of the contiguous buffer
// (e.g. frozen or input slots that are managed externally).
Offsets []int
// Sizes[i] is the element count for slot i (product of its shape dims).
// Zero for slots without a known shape.
Sizes []int
// TotalElements is the sum of all slot sizes that are part of the
// contiguous buffer.
TotalElements int
}
BufferLayout describes a contiguous pre-allocated buffer with fixed offsets for each slot in an ExecutionPlan. CUDA graph capture requires that device memory addresses remain stable across runs; this layout ensures every intermediate tensor occupies the same offset in every execution.
func ComputeBufferLayout ¶
func ComputeBufferLayout(slotShapes [][]int, frozenIdx []int, inputIdx []int) BufferLayout
ComputeBufferLayout computes element offsets for each slot based on the slot shapes from compilation. Frozen and input slots are excluded (offset -1) since they are managed externally (model weights are constant, inputs change).
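The offset-assignment rule can be sketched as follows (an illustrative simplification, not the package's implementation; the excluded set stands in for the frozen and input index lists):

```go
package main

import "fmt"

// computeLayout assigns each includable slot a fixed element offset into one
// contiguous buffer; externally managed slots get offset -1.
func computeLayout(slotShapes [][]int, excluded map[int]bool) (offsets, sizes []int, total int) {
	offsets = make([]int, len(slotShapes))
	sizes = make([]int, len(slotShapes))
	for i, shape := range slotShapes {
		n := 0
		if shape != nil {
			n = 1
			for _, d := range shape {
				n *= d // element count is the product of the shape dims
			}
		}
		sizes[i] = n
		if excluded[i] || n == 0 {
			offsets[i] = -1 // frozen/input slots are managed externally
			continue
		}
		offsets[i] = total
		total += n
	}
	return offsets, sizes, total
}

func main() {
	shapes := [][]int{{4, 8}, {8}, {2, 3}}
	offsets, sizes, total := computeLayout(shapes, map[int]bool{1: true}) // slot 1 frozen
	fmt.Println(offsets, sizes, total) // [0 -1 32] [32 8 6] 38
}
```

Because every intermediate slot gets a stable offset, replays of a captured CUDA graph see the same device addresses on every run.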
type Builder ¶
Builder provides a fluent API for constructing a computation graph.
func NewBuilder ¶
NewBuilder creates a new graph builder.
func (*Builder[T]) Parameters ¶
Parameters returns all the trainable parameters in the graph. The returned slice is sorted by parameter name for deterministic ordering.
type CUDAGraphExecutor ¶
CUDAGraphExecutor captures and replays a CUDA graph for an ExecutionPlan. It splits the plan into three regions:
- Pre-capture: instructions that trigger D2H copies or have dynamic state
- Capture region: GPU-only, position-independent instructions
- Post-capture: any trailing non-capturable instructions
During replay, the pre- and post-capture regions run normally while the capture region is replayed from the captured graph with near-zero launch overhead.
func NewCUDAGraphExecutor ¶
func NewCUDAGraphExecutor[T tensor.Numeric](plan *ExecutionPlan[T], streamPtr unsafe.Pointer, warmups int, onCaptured func(), snapshotCache func(ctx context.Context) func()) *CUDAGraphExecutor[T]
NewCUDAGraphExecutor creates a graph executor for the given plan. The optional onCaptured callback is invoked after a successful capture, allowing the caller to protect arena allocations from being reclaimed. The optional snapshotCache callback is called before the capture region to snapshot KV cache state; it returns a restore function that is invoked on capture failure, preventing double KV cache updates in the RunInstructions fallback.
func (*CUDAGraphExecutor[T]) Destroy ¶
func (g *CUDAGraphExecutor[T]) Destroy()
Destroy releases the CUDA graph resources.
func (*CUDAGraphExecutor[T]) Run ¶
func (g *CUDAGraphExecutor[T]) Run(ctx context.Context, inputs ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error)
Run executes the plan, using graph capture/replay when available.
type CheckpointedSegment ¶ added in v0.3.0
CheckpointedSegment wraps a subgraph segment for gradient checkpointing. During the forward pass, it executes the segment normally but discards intermediate activations, keeping only the final output. During the backward pass, it re-runs the forward pass to recompute the intermediates before computing gradients.
This trades compute for memory: peak memory is reduced because intermediate tensors from checkpointed layers are not held simultaneously.
type EmbeddedFrozenProvider ¶
type EmbeddedFrozenProvider[T tensor.Numeric] interface {
	EmbeddedFrozen() []*tensor.TensorNumeric[T]
}
EmbeddedFrozenProvider is implemented by nodes that carry frozen data internally (e.g. Gather with embedded weights). Compile detects this interface and creates synthetic frozen slots so the megakernel emitter can reference the data via frozen_%d pointers.
type ExecutionPlan ¶
ExecutionPlan is a compiled, flat instruction sequence that replaces the interpreted node-by-node Forward() loop. Node outputs are stored in an indexed slot array instead of a map, eliminating map lookups.
func (*ExecutionPlan[T]) BufferLayout ¶
func (p *ExecutionPlan[T]) BufferLayout() *BufferLayout
BufferLayout returns the computed buffer layout, or nil if buffers have not been pre-allocated.
func (*ExecutionPlan[T]) ComputeLastUse ¶ added in v0.3.0
func (p *ExecutionPlan[T]) ComputeLastUse()
ComputeLastUse performs lifetime analysis on the execution plan. For each slot, it records the index of the last instruction that reads from it. Frozen slots, input slots, and the output slot are marked -1 (not freeable). This must be called after Compile/CompileTraced before intra-pass reuse.
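The lifetime analysis amounts to one backward-free pass over the instruction list; a minimal standalone sketch (simplified stand-in types, not the package's implementation; the keep set stands in for frozen/input/output slots):

```go
package main

import "fmt"

// instr is a simplified stand-in for a plan instruction: input slot indices
// and an output slot index.
type instr struct {
	inputs []int
	output int
}

// computeLastUse records, for each slot, the index of the last instruction
// that reads it; slots in keep (frozen, input, output) stay pinned at -1.
func computeLastUse(instrs []instr, slotCount int, keep map[int]bool) []int {
	lastUse := make([]int, slotCount)
	for i := range lastUse {
		lastUse[i] = -1
	}
	for i, ins := range instrs {
		for _, in := range ins.inputs {
			if !keep[in] {
				lastUse[in] = i // later reads overwrite earlier ones
			}
		}
	}
	return lastUse
}

func main() {
	// Slot 0 is an input and slot 3 is the output: both pinned at -1.
	instrs := []instr{
		{inputs: []int{0}, output: 1},
		{inputs: []int{1}, output: 2},
		{inputs: []int{1, 2}, output: 3},
	}
	fmt.Println(computeLastUse(instrs, 4, map[int]bool{0: true, 3: true}))
	// [-1 2 2 -1]: slots 1 and 2 are both last read by instruction 2
}
```

Once an instruction index passes a slot's last use, that slot's buffer can be returned to the arena free-list (see SetArenaFreeFn).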
func (*ExecutionPlan[T]) EnsureCaptureInputsGPU ¶
func (p *ExecutionPlan[T]) EnsureCaptureInputsGPU(start, end int, gpuSlotCache map[int]*tensor.TensorNumeric[T])
EnsureCaptureInputsGPU uploads CPU-resident slots that are inputs to instructions in [start, end) to GPU. Unlike EnsureSlotsGPU, this includes frozen scalar constants. Ops like Range/Pow that need host scalars are typically outside the capture region; within the capture region, all data must be GPU-resident to avoid cudaMemcpy during stream capture.
func (*ExecutionPlan[T]) EnsureSlotsGPU ¶
func (p *ExecutionPlan[T]) EnsureSlotsGPU(gpuSlotCache map[int]*tensor.TensorNumeric[T])
EnsureSlotsGPU uploads any CPU-resident scratch slot tensors to GPU. If a pre-allocated GPU tensor exists for the slot (from a previous capture), the CPU data is copied into it to preserve device addresses for CUDA graph replay. Otherwise a new GPU tensor is allocated and stored in gpuSlotCache for reuse.
This is called after pre-capture instructions run (e.g. EmbeddingLookup with quantized embedding tables that produce CPU tensors) to ensure the capture region sees only GPU-resident data.
func (*ExecutionPlan[T]) FrozenSlots ¶
func (p *ExecutionPlan[T]) FrozenSlots() []FrozenSlot[T]
FrozenSlots returns the frozen (constant/parameter) slots and their data.
func (*ExecutionPlan[T]) HasLastUse ¶ added in v0.3.0
func (p *ExecutionPlan[T]) HasLastUse() bool
HasLastUse returns true if lifetime analysis has been computed.
func (*ExecutionPlan[T]) HasPreallocatedBuffers ¶
func (p *ExecutionPlan[T]) HasPreallocatedBuffers() bool
HasPreallocatedBuffers reports whether buffers have been pre-allocated.
func (*ExecutionPlan[T]) InputSlots ¶
func (p *ExecutionPlan[T]) InputSlots() []int
InputSlots returns the slot indices that receive graph inputs.
func (*ExecutionPlan[T]) InstructionCount ¶
func (p *ExecutionPlan[T]) InstructionCount() int
InstructionCount returns the number of instructions in the plan.
func (*ExecutionPlan[T]) InstructionOpName ¶
func (p *ExecutionPlan[T]) InstructionOpName(i int) string
InstructionOpName returns the operation name of instruction at index i.
func (*ExecutionPlan[T]) InstructionOutputIdx ¶
func (p *ExecutionPlan[T]) InstructionOutputIdx(i int) int
InstructionOutputIdx returns the output slot index of instruction at index i.
func (*ExecutionPlan[T]) Instructions ¶
func (p *ExecutionPlan[T]) Instructions() []InstructionMeta
Instructions returns exported metadata for each compute instruction in the plan. The order matches the execution order.
func (*ExecutionPlan[T]) LastUse ¶ added in v0.3.0
func (p *ExecutionPlan[T]) LastUse(slotIdx int) int
LastUse returns the last-use instruction index for a slot, or -1 if the slot is not tracked (frozen, input, output, or never used).
func (*ExecutionPlan[T]) OutputSlot ¶
func (p *ExecutionPlan[T]) OutputSlot() int
OutputSlot returns the slot index that holds the final output.
func (*ExecutionPlan[T]) OutputTensor ¶
func (p *ExecutionPlan[T]) OutputTensor() *tensor.TensorNumeric[T]
OutputTensor returns the tensor currently in the output slot. Used by CUDAGraphExecutor to read the result after graph replay.
func (*ExecutionPlan[T]) PreUploadFrozenWeights ¶
func (p *ExecutionPlan[T]) PreUploadFrozenWeights() error
PreUploadFrozenWeights uploads all frozen (parameter/constant) slot tensors that have CPU-backed storage to the GPU. The uploaded tensor replaces the original in both the canonical slots array and any initialized scratch slots. This must be called BEFORE warmup runs and BEFORE EnsureSlotsGPU so that frozen weights are already GPU-resident when graph capture begins, avoiding synchronous H2D copies on the capturing stream (which cause cuda error 901).
func (*ExecutionPlan[T]) PreallocateBuffers ¶
func (p *ExecutionPlan[T]) PreallocateBuffers()
PreallocateBuffers creates pre-allocated tensors for all intermediate slots in the execution plan based on the slot shapes determined during compilation. After calling this method, RunInstructions will copy each Forward() result into the pre-allocated buffer, keeping memory addresses stable across runs.
Frozen and input slots are excluded since they are managed externally.
func (*ExecutionPlan[T]) PrepareSlots ¶
func (p *ExecutionPlan[T]) PrepareSlots(inputs ...*tensor.TensorNumeric[T]) error
PrepareSlots initializes the scratch slot array and populates input slots. Must be called before RunInstructionRange.
func (*ExecutionPlan[T]) Run ¶
func (p *ExecutionPlan[T]) Run(ctx context.Context, inputs ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error)
Run executes the compiled plan. It sets input tensors into the slot array, executes each instruction in sequence, and returns the output.
Not safe for concurrent use. The generator calls Run() sequentially per token.
func (*ExecutionPlan[T]) RunBatched ¶ added in v0.3.0
func (p *ExecutionPlan[T]) RunBatched(ctx context.Context, schedule *BatchSchedule, inputs ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error)
RunBatched executes the ExecutionPlan using the provided BatchSchedule. Instructions within a batch group are dispatched sequentially but counted as a single logical dispatch unit. Instructions outside groups are executed individually as usual.
The practical effect is that all instructions in a batch group share a single Go-side dispatch entry point, reducing stack frame overhead and enabling future work to submit them as a single GPU stream command.
RunBatched is semantically equivalent to plan.RunInstructions; it produces the same output for the same inputs.
func (*ExecutionPlan[T]) RunInstructionRange ¶
func (p *ExecutionPlan[T]) RunInstructionRange(ctx context.Context, start, end int) error
RunInstructionRange executes instructions [start, end) using the shared slot array. The caller must have already populated input slots. This is used by CUDAGraphExecutor to split execution into capturable and non-capturable regions.
func (*ExecutionPlan[T]) RunInstructions ¶
func (p *ExecutionPlan[T]) RunInstructions(ctx context.Context, inputs ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error)
RunInstructions executes the instruction loop directly, bypassing the megakernel/graph capture hook. Used by CUDAGraphExecutor during warmup and capture phases.
func (*ExecutionPlan[T]) ScratchSlot ¶
func (p *ExecutionPlan[T]) ScratchSlot(idx int) *tensor.TensorNumeric[T]
ScratchSlot returns the tensor at the given scratch slot index, or nil.
func (*ExecutionPlan[T]) SetArenaFreeFn ¶ added in v0.3.0
func (p *ExecutionPlan[T]) SetArenaFreeFn(fn func(ptr unsafe.Pointer, byteSize int))
SetArenaFreeFn sets the callback used to return intermediate tensor memory to the arena free-list when a slot reaches its last use. The callback receives the GPU device pointer and byte size of the allocation.
func (*ExecutionPlan[T]) SetMegakernelFn ¶
func (p *ExecutionPlan[T]) SetMegakernelFn(fn func(context.Context, []*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error))
SetMegakernelFn sets an optional megakernel function that, when set, replaces the per-instruction execution loop in Run(). This allows a fused kernel to transparently handle the entire plan execution.
func (*ExecutionPlan[T]) SetScratchSlot ¶
func (p *ExecutionPlan[T]) SetScratchSlot(idx int, t *tensor.TensorNumeric[T])
SetScratchSlot sets the tensor at the given scratch slot index.
func (*ExecutionPlan[T]) SlotCount ¶ added in v0.3.0
func (p *ExecutionPlan[T]) SlotCount() int
SlotCount returns the number of slots in the plan.
func (*ExecutionPlan[T]) SlotShapes ¶
func (p *ExecutionPlan[T]) SlotShapes() [][]int
SlotShapes returns the shape of each slot as determined during compilation. Nil entries indicate slots that were not populated during the warmup pass.
type FrozenSlot ¶
type FrozenSlot[T tensor.Numeric] struct {
	SlotIdx int
	Data    *tensor.TensorNumeric[T]
}
FrozenSlot describes a slot that holds frozen (constant) data such as model weights. The Data field holds the tensor from the warmup pass.
type Graph ¶
Graph represents a computation graph with a defined execution order.
func FoldConstantTransposes ¶
FoldConstantTransposes removes Transpose nodes whose sole input is a constant (Parameter/Constant node). The transpose is pre-applied to the constant data and all consumers of the Transpose node are rewired to use the pre-transposed constant directly.
If the graph has no foldable transposes, the original graph is returned. The original graph should not be used after this call if a new graph is returned.
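Pre-applying a transpose to constant data can be sketched in isolation (an illustrative 2-D, row-major example only; the package folds arbitrary constant transposes through the Transposer interface):

```go
package main

import "fmt"

// transpose2D transposes a row-major rows×cols matrix stored in a flat
// slice, as constant folding would do once at graph-rewrite time.
func transpose2D(data []float32, rows, cols int) []float32 {
	out := make([]float32, len(data))
	for r := 0; r < rows; r++ {
		for c := 0; c < cols; c++ {
			out[c*rows+r] = data[r*cols+c]
		}
	}
	return out
}

func main() {
	// A 2×3 constant; after folding, consumers read the 3×2 result
	// directly and the runtime Transpose node disappears from the graph.
	w := []float32{1, 2, 3, 4, 5, 6}
	fmt.Println(transpose2D(w, 2, 3)) // [1 4 2 5 3 6]
}
```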
func (*Graph[T]) AddKVPair ¶
func (g *Graph[T]) AddKVPair(input StatefulInputNode[T], output Node[T])
AddKVPair registers a stateful input node that should receive the output of another node after each forward pass. Used for ONNX KV cache feedback.
func (*Graph[T]) Backward ¶
func (g *Graph[T]) Backward(ctx context.Context, mode types.BackwardMode, initialGradient *tensor.TensorNumeric[T]) error
Backward executes the backward pass of the entire graph. It is safe for concurrent use; callers will be serialized.
func (*Graph[T]) ClearMemo ¶
func (g *Graph[T]) ClearMemo()
ClearMemo releases intermediate tensors from the last forward pass. Call this after Backward to free GPU device memory between training steps. Input tensors and parameter values are not released.
func (*Graph[T]) Compile ¶
func (g *Graph[T]) Compile(ctx context.Context, inputs ...*tensor.TensorNumeric[T]) (*ExecutionPlan[T], error)
Compile pre-compiles the graph into a flat ExecutionPlan. It runs one Forward() pass to determine tensor shapes, then assigns buffer indices and creates instruction kernels for each node.
func (*Graph[T]) CompileTraced ¶
func (g *Graph[T]) CompileTraced(ctx context.Context, inputs ...*tensor.TensorNumeric[T]) (*ExecutionPlan[T], error)
CompileTraced produces a primitive-op ExecutionPlan by tracing through the graph's Forward pass with the EngineProxy recording every engine call. Unlike Compile (which creates one instruction per graph node), CompileTraced decomposes composite nodes into their constituent engine calls, enabling the megakernel emitter to see only primitive operations.
func (*Graph[T]) ConstantTensors ¶
func (g *Graph[T]) ConstantTensors() []*tensor.TensorNumeric[T]
ConstantTensors returns all constant/parameter weight tensors in the graph. Includes tensors from Parameter/Constant nodes, tensors embedded in nodes that implement EmbeddedFrozenProvider (e.g. LM head, gather), and all Parameter values from every node (e.g. attention and FFN weights). Call after graph construction to collect tensors for GPU pre-upload.
func (*Graph[T]) Dependencies ¶
Dependencies returns the dependencies of a given node.
func (*Graph[T]) EngineProxy ¶
func (g *Graph[T]) EngineProxy() *compute.EngineProxy[T]
EngineProxy returns the EngineProxy if one was set, or nil.
func (*Graph[T]) Forward ¶
func (g *Graph[T]) Forward(ctx context.Context, inputs ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error)
Forward executes the forward pass of the entire graph. It is safe for concurrent use; callers will be serialized. When parallel mode is enabled via WithParallel(true), independent nodes are executed concurrently using a goroutine pool.
func (*Graph[T]) GetAllNodes ¶
GetAllNodes returns all nodes in the graph in their current order.
func (*Graph[T]) GetDependencies ¶
GetDependencies returns the dependency map for all nodes in the graph.
func (*Graph[T]) GetNodeMetadata ¶
GetNodeMetadata returns metadata for a specific node including its type, attributes, and shape.
func (*Graph[T]) GetTopologicalOrder ¶
GetTopologicalOrder returns the nodes in topological order for execution.
func (*Graph[T]) LoadCheckpoint ¶ added in v0.3.0
LoadCheckpoint restores graph parameters from a checkpoint file and returns the optimizer state. The file must have been created by SaveCheckpoint.
func (*Graph[T]) LoadParameters ¶ added in v0.3.0
LoadParameters sets parameter values by name from the provided map. Returns an error if a name is not found in the graph or the slice length does not match the parameter's value tensor.
func (*Graph[T]) LoadParametersFromFile ¶ added in v0.3.0
LoadParametersFromFile reads parameter values from a JSON file and loads them into the graph. The file must have been created by SaveParameters.
func (*Graph[T]) Parameters ¶
Parameters returns all the trainable parameters in the graph. The returned slice is sorted by parameter name for deterministic ordering.
func (*Graph[T]) ResetStatefulNodes ¶
func (g *Graph[T]) ResetStatefulNodes()
ResetStatefulNodes resets all nodes that implement the Resettable interface. Call this before starting a new generation sequence to clear accumulated state from previous runs.
func (*Graph[T]) SaveCheckpoint ¶ added in v0.3.0
SaveCheckpoint serializes graph parameters and optimizer state to a JSON file.
func (*Graph[T]) SaveParameters ¶ added in v0.3.0
SaveParameters serializes all graph parameters to a JSON file. Parameter values are converted to float64 for JSON compatibility.
func (*Graph[T]) SetEngineProxy ¶
func (g *Graph[T]) SetEngineProxy(proxy *compute.EngineProxy[T])
SetEngineProxy stores a reference to the EngineProxy used by this graph's layers.
func (*Graph[T]) WithParallel ¶
WithParallel enables or disables parallel execution of independent nodes. When enabled, Forward delegates to ParallelForward for concurrent execution. Default is false (sequential) for backward compatibility.
func (*Graph[T]) WithPool ¶
func (g *Graph[T]) WithPool(pool TensorReleaser[T])
WithPool sets a tensor pool for intermediate buffer reuse during Forward. When set, the executor releases intermediate tensors back to the pool as soon as all their consumers have executed.
type Instruction ¶
type Instruction[T tensor.Numeric] struct {
	Forward   func(ctx context.Context, inputs []*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error)
	InputIdx  []int          // indices into the slot array
	OutputIdx int            // index into the slot array
	OpName    string         // for error reporting
	ExtraArgs map[string]any // optional extra arguments (e.g. layer index for KV cache ops)
}
Instruction is a single pre-resolved operation in a compiled execution plan. It holds a direct function that calls node.Forward() with pre-computed buffer indices, eliminating dependency map lookups and memo operations.
type InstructionMeta ¶
type InstructionMeta struct {
OpName string // operation type (e.g. "Add", "MatMulNBits", "RMSNorm")
InputIdx []int // slot indices for inputs
OutputIdx int // slot index for the output
ExtraArgs map[string]any // optional extra arguments (e.g. layer index for KV cache ops)
}
InstructionMeta is the exported metadata for a single compiled instruction. It contains everything needed by a code generator without exposing the Forward() closure.
type NoParameters ¶
NoParameters is a utility type for nodes that have no trainable parameters.
func (*NoParameters[T]) Parameters ¶
func (n *NoParameters[T]) Parameters() []*Parameter[T]
Parameters returns an empty slice of parameters.
type Node ¶
type Node[T tensor.Numeric] interface {
	// OpType returns the operation type of the node, e.g., "ReLU", "Dense".
	OpType() string
	// Attributes returns a map of the node's non-tensor attributes.
	Attributes() map[string]interface{}
	// Forward computes the output of the node given the inputs.
	Forward(ctx context.Context, inputs ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error)
	// Backward computes the gradients of the node with respect to its inputs.
	Backward(ctx context.Context, mode types.BackwardMode, outputGradient *tensor.TensorNumeric[T], inputs ...*tensor.TensorNumeric[T]) ([]*tensor.TensorNumeric[T], error)
	// Parameters returns the trainable parameters of the node.
	Parameters() []*Parameter[T]
	// OutputShape returns the shape of the output tensor.
	OutputShape() []int
}
Node represents a node in the computation graph.
type Parameter ¶
type Parameter[T tensor.Numeric] struct {
	Name     string
	Value    *tensor.TensorNumeric[T]
	Gradient *tensor.TensorNumeric[T]
}
Parameter represents a trainable parameter in the graph.
func NewParameter ¶
func NewParameter[T tensor.Numeric](name string, value *tensor.TensorNumeric[T], newTensorFn func([]int, []T) (*tensor.TensorNumeric[T], error)) (*Parameter[T], error)
NewParameter creates a new parameter.
func (*Parameter[T]) AddGradient ¶
func (p *Parameter[T]) AddGradient(grad *tensor.TensorNumeric[T]) error
AddGradient adds the given gradient to the parameter's gradient.
func (*Parameter[T]) ClearGradient ¶
func (p *Parameter[T]) ClearGradient()
ClearGradient resets the parameter's gradient to zero.
type Resettable ¶
type Resettable interface {
Reset()
}
Resettable is implemented by nodes that carry state between forward passes and need to be reset before a new generation sequence (e.g. position ID counters, attention mask accumulators, KV cache buffers).
type StatefulInputNode ¶
type StatefulInputNode[T tensor.Numeric] interface {
	SetStored(t *tensor.TensorNumeric[T])
}
StatefulInputNode is implemented by graph input nodes that carry state between forward passes (e.g., KV cache inputs in ONNX models).
type TensorReleaser ¶
type TensorReleaser[T tensor.Numeric] interface {
	Release(t *tensor.TensorNumeric[T])
}
TensorReleaser can release tensors back to a pool for reuse.
type Transposer ¶
type Transposer[T tensor.Numeric] interface {
	Transpose(ctx context.Context, a *tensor.TensorNumeric[T], axes []int, dst ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error)
}
Transposer is the minimal interface needed by FoldConstantTransposes to pre-apply transpose operations on constant tensors. The signature matches compute.Engine[T].Transpose (variadic dst for buffer reuse).