Why Go
- Case Studies
  
  Common problems companies solve with Go
- Use Cases
  
  Stories about how and why companies use Go
- Security
  
  How Go can help keep you secure by default
Learn
Docs
- Effective Go
  
  Tips for writing clear, performant, and idiomatic Go code
- Go User Manual
  
  A complete introduction to building software with Go
- Standard library
  
  Reference documentation for Go's standard library
- Release Notes
  
  Learn what's new in each Go release
Packages
Community
- Recorded Talks
  
  Videos from prior events
- Meetups
  
  Meet other local Go developers
- Conferences
  
  Learn and network with Go developers from around the world
- Go blog
  
  The Go project's official blog
- Go project
  
  Get help and stay informed from Go
- Get connected

compute

package

Go to main page

Versions in this module

v0

v0.2.0

Mar 16, 2026

v0.1.0

Mar 16, 2026

Changes in this version

+ var ErrMemoryLimitExceeded = errors.New("memory limit exceeded")

+ func FusedRMSNorm(input, weight *tensor.TensorNumeric[float32], epsilon float32) (output, scales *tensor.TensorNumeric[float32], err error)

+ func FusedRoPE(input, cosAngles, sinAngles *tensor.TensorNumeric[float32], rotaryDim int) (*tensor.TensorNumeric[float32], error)

+ func FusedSiLUGate(gate, up *tensor.TensorNumeric[float32]) (*tensor.TensorNumeric[float32], error)

+ type CPUEngine struct

+ func NewCPUEngine[T tensor.Numeric](ops numeric.Arithmetic[T]) *CPUEngine[T]

+ func (e *CPUEngine[T]) Add(ctx context.Context, a, b *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (e *CPUEngine[T]) AddScalar(_ context.Context, a *tensor.TensorNumeric[T], scalar T, ...) (*tensor.TensorNumeric[T], error)

+ func (e *CPUEngine[T]) Close(_ context.Context) error

+ func (e *CPUEngine[T]) Concat(_ context.Context, tensors []*tensor.TensorNumeric[T], axis int, ...) (*tensor.TensorNumeric[T], error)

+ func (e *CPUEngine[T]) Copy(_ context.Context, dst, src *tensor.TensorNumeric[T]) error

+ func (e *CPUEngine[T]) Cos(_ context.Context, a *tensor.TensorNumeric[T], dst ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error)

+ func (e *CPUEngine[T]) Div(ctx context.Context, a, b *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (e *CPUEngine[T]) DivScalar(_ context.Context, a *tensor.TensorNumeric[T], scalar T, ...) (*tensor.TensorNumeric[T], error)

+ func (e *CPUEngine[T]) Exp(_ context.Context, a *tensor.TensorNumeric[T], dst ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error)

+ func (e *CPUEngine[T]) Fill(_ context.Context, t *tensor.TensorNumeric[T], value T) error

+ func (e *CPUEngine[T]) Gather(_ context.Context, params *tensor.TensorNumeric[T], ...) error

+ func (e *CPUEngine[T]) Log(_ context.Context, a *tensor.TensorNumeric[T], dst ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error)

+ func (e *CPUEngine[T]) MatMul(ctx context.Context, a, b *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (e *CPUEngine[T]) MemoryTracker() *MemoryTracker

+ func (e *CPUEngine[T]) Mul(ctx context.Context, a, b *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (e *CPUEngine[T]) MulScalar(_ context.Context, a *tensor.TensorNumeric[T], scalar T, ...) (*tensor.TensorNumeric[T], error)

+ func (e *CPUEngine[T]) OneHot(_ context.Context, input *tensor.TensorNumeric[int], depth int, ...) (*tensor.TensorNumeric[T], error)

+ func (e *CPUEngine[T]) Ops() numeric.Arithmetic[T]

+ func (e *CPUEngine[T]) Pow(ctx context.Context, base, exponent *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (e *CPUEngine[T]) RandomUniform(_ context.Context, t *tensor.TensorNumeric[T], minVal, maxVal T) error

+ func (e *CPUEngine[T]) ReduceMean(ctx context.Context, a *tensor.TensorNumeric[T], axis int, keepDims bool, ...) (*tensor.TensorNumeric[T], error)

+ func (e *CPUEngine[T]) ReduceSum(ctx context.Context, a *tensor.TensorNumeric[T], axis int, keepDims bool, ...) (*tensor.TensorNumeric[T], error)

+ func (e *CPUEngine[T]) Repeat(_ context.Context, a *tensor.TensorNumeric[T], axis int, repetitions int, ...) (*tensor.TensorNumeric[T], error)

+ func (e *CPUEngine[T]) Reshape(_ context.Context, a *tensor.TensorNumeric[T], shape []int, ...) (*tensor.TensorNumeric[T], error)

+ func (e *CPUEngine[T]) Rsqrt(ctx context.Context, a *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (e *CPUEngine[T]) ScatterAdd(_ context.Context, dEmbeddingTable *tensor.TensorNumeric[T], ...) error

+ func (e *CPUEngine[T]) SetCollector(c metrics.Collector)

+ func (e *CPUEngine[T]) SetLogger(l log.Logger)

+ func (e *CPUEngine[T]) SetMemoryLimit(bytes int64)

+ func (e *CPUEngine[T]) Sin(_ context.Context, a *tensor.TensorNumeric[T], dst ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error)

+ func (e *CPUEngine[T]) Softmax(_ context.Context, a *tensor.TensorNumeric[T], axis int, ...) (*tensor.TensorNumeric[T], error)

+ func (e *CPUEngine[T]) Split(_ context.Context, a *tensor.TensorNumeric[T], numSplits int, axis int) ([]*tensor.TensorNumeric[T], error)

+ func (e *CPUEngine[T]) Sqrt(ctx context.Context, a *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (e *CPUEngine[T]) Sub(ctx context.Context, a, b *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (e *CPUEngine[T]) Sum(_ context.Context, a *tensor.TensorNumeric[T], axis int, keepDims bool, ...) (*tensor.TensorNumeric[T], error)

+ func (e *CPUEngine[T]) Tanh(ctx context.Context, a *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (e *CPUEngine[T]) TanhPrime(ctx context.Context, a, upstream *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (e *CPUEngine[T]) Transpose(_ context.Context, a *tensor.TensorNumeric[T], axes []int, ...) (*tensor.TensorNumeric[T], error)

+ func (e *CPUEngine[T]) UnaryOp(ctx context.Context, a *tensor.TensorNumeric[T], op func(T) T, ...) (*tensor.TensorNumeric[T], error)

+ func (e *CPUEngine[T]) Zero(_ context.Context, a *tensor.TensorNumeric[T]) error

+ func (e *CPUEngine[T]) Zeros(ctx context.Context, a *tensor.TensorNumeric[T], shape []int) error

+ type DType int

+ const DTypeF32

+ const DTypeFP16

+ const DTypeFP8

+ type Engine interface

+ Add func(ctx context.Context, a, b *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ AddScalar func(ctx context.Context, a *tensor.TensorNumeric[T], scalar T, ...) (*tensor.TensorNumeric[T], error)

+ Concat func(ctx context.Context, tensors []*tensor.TensorNumeric[T], axis int, ...) (*tensor.TensorNumeric[T], error)

+ Copy func(ctx context.Context, dst, src *tensor.TensorNumeric[T]) error

+ Cos func(ctx context.Context, a *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ Div func(ctx context.Context, a, b *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ DivScalar func(ctx context.Context, a *tensor.TensorNumeric[T], scalar T, ...) (*tensor.TensorNumeric[T], error)

+ Exp func(ctx context.Context, a *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ Fill func(ctx context.Context, t *tensor.TensorNumeric[T], value T) error

+ Gather func(ctx context.Context, params *tensor.TensorNumeric[T], ...) error

+ Log func(ctx context.Context, a *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ MatMul func(ctx context.Context, a, b *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ Mul func(ctx context.Context, a, b *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ MulScalar func(ctx context.Context, a *tensor.TensorNumeric[T], scalar T, ...) (*tensor.TensorNumeric[T], error)

+ OneHot func(ctx context.Context, input *tensor.TensorNumeric[int], depth int, ...) (*tensor.TensorNumeric[T], error)

+ Ops func() numeric.Arithmetic[T]

+ Pow func(ctx context.Context, base, exponent *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ RandomUniform func(ctx context.Context, t *tensor.TensorNumeric[T], minVal, maxVal T) error

+ ReduceMean func(ctx context.Context, a *tensor.TensorNumeric[T], axis int, keepDims bool, ...) (*tensor.TensorNumeric[T], error)

+ ReduceSum func(ctx context.Context, a *tensor.TensorNumeric[T], axis int, keepDims bool, ...) (*tensor.TensorNumeric[T], error)

+ Repeat func(ctx context.Context, a *tensor.TensorNumeric[T], axis int, repetitions int, ...) (*tensor.TensorNumeric[T], error)

+ Reshape func(ctx context.Context, a *tensor.TensorNumeric[T], shape []int, ...) (*tensor.TensorNumeric[T], error)

+ Rsqrt func(ctx context.Context, a *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ ScatterAdd func(ctx context.Context, dEmbeddingTable *tensor.TensorNumeric[T], ...) error

+ Sin func(ctx context.Context, a *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ Softmax func(ctx context.Context, a *tensor.TensorNumeric[T], axis int, ...) (*tensor.TensorNumeric[T], error)

+ Split func(ctx context.Context, a *tensor.TensorNumeric[T], numSplits int, axis int) ([]*tensor.TensorNumeric[T], error)

+ Sqrt func(ctx context.Context, a *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ Sub func(ctx context.Context, a, b *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ Sum func(ctx context.Context, a *tensor.TensorNumeric[T], axis int, keepDims bool, ...) (*tensor.TensorNumeric[T], error)

+ Tanh func(ctx context.Context, a *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ TanhPrime func(ctx context.Context, a, upstream *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ Transpose func(ctx context.Context, a *tensor.TensorNumeric[T], axes []int, ...) (*tensor.TensorNumeric[T], error)

+ UnaryOp func(ctx context.Context, a *tensor.TensorNumeric[T], op func(T) T, ...) (*tensor.TensorNumeric[T], error)

+ Zero func(ctx context.Context, a *tensor.TensorNumeric[T]) error

+ Zeros func(ctx context.Context, a *tensor.TensorNumeric[T], shape []int) error

+ type EngineProxy struct

+ func NewEngineProxy[T tensor.Numeric](real Engine[T]) *EngineProxy[T]

+ func (p *EngineProxy[T]) Add(ctx context.Context, a, b *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (p *EngineProxy[T]) AddScalar(ctx context.Context, a *tensor.TensorNumeric[T], scalar T, ...) (*tensor.TensorNumeric[T], error)

+ func (p *EngineProxy[T]) ArenaUsedBytes() int

+ func (p *EngineProxy[T]) Concat(ctx context.Context, tensors []*tensor.TensorNumeric[T], axis int, ...) (*tensor.TensorNumeric[T], error)

+ func (p *EngineProxy[T]) Copy(ctx context.Context, dst, src *tensor.TensorNumeric[T]) error

+ func (p *EngineProxy[T]) Cos(ctx context.Context, a *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (p *EngineProxy[T]) Div(ctx context.Context, a, b *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (p *EngineProxy[T]) DivScalar(ctx context.Context, a *tensor.TensorNumeric[T], scalar T, ...) (*tensor.TensorNumeric[T], error)

+ func (p *EngineProxy[T]) Exp(ctx context.Context, a *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (p *EngineProxy[T]) Fill(ctx context.Context, t *tensor.TensorNumeric[T], value T) error

+ func (p *EngineProxy[T]) FusedRMSNormGPU(input, weight *tensor.TensorNumeric[float32], epsilon float32) (*tensor.TensorNumeric[float32], *tensor.TensorNumeric[float32], error)

+ func (p *EngineProxy[T]) GPUFusedAddRMSNorm(input, residual, weight *tensor.TensorNumeric[T], eps float32) (normed *tensor.TensorNumeric[T], residualOut *tensor.TensorNumeric[T], ...)

+ func (p *EngineProxy[T]) Gather(ctx context.Context, params *tensor.TensorNumeric[T], ...) error

+ func (p *EngineProxy[T]) Log(ctx context.Context, a *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (p *EngineProxy[T]) MatMul(ctx context.Context, a, b *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (p *EngineProxy[T]) MatMulTransposeB(ctx context.Context, a, b *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (p *EngineProxy[T]) Mul(ctx context.Context, a, b *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (p *EngineProxy[T]) MulScalar(ctx context.Context, a *tensor.TensorNumeric[T], scalar T, ...) (*tensor.TensorNumeric[T], error)

+ func (p *EngineProxy[T]) OneHot(ctx context.Context, input *tensor.TensorNumeric[int], depth int, ...) (*tensor.TensorNumeric[T], error)

+ func (p *EngineProxy[T]) Ops() numeric.Arithmetic[T]

+ func (p *EngineProxy[T]) Pow(ctx context.Context, base, exponent *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (p *EngineProxy[T]) RandomUniform(ctx context.Context, t *tensor.TensorNumeric[T], minVal, maxVal T) error

+ func (p *EngineProxy[T]) Real() Engine[T]

+ func (p *EngineProxy[T]) ReduceMean(ctx context.Context, a *tensor.TensorNumeric[T], axis int, keepDims bool, ...) (*tensor.TensorNumeric[T], error)

+ func (p *EngineProxy[T]) ReduceSum(ctx context.Context, a *tensor.TensorNumeric[T], axis int, keepDims bool, ...) (*tensor.TensorNumeric[T], error)

+ func (p *EngineProxy[T]) Repeat(ctx context.Context, a *tensor.TensorNumeric[T], axis int, repetitions int, ...) (*tensor.TensorNumeric[T], error)

+ func (p *EngineProxy[T]) ResetPool()

+ func (p *EngineProxy[T]) Reshape(ctx context.Context, a *tensor.TensorNumeric[T], shape []int, ...) (*tensor.TensorNumeric[T], error)

+ func (p *EngineProxy[T]) Rsqrt(ctx context.Context, a *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (p *EngineProxy[T]) ScatterAdd(ctx context.Context, dEmbeddingTable *tensor.TensorNumeric[T], ...) error

+ func (p *EngineProxy[T]) SetArenaResetFloor(floor int)

+ func (p *EngineProxy[T]) Sin(ctx context.Context, a *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (p *EngineProxy[T]) Softmax(ctx context.Context, a *tensor.TensorNumeric[T], axis int, ...) (*tensor.TensorNumeric[T], error)

+ func (p *EngineProxy[T]) Split(ctx context.Context, a *tensor.TensorNumeric[T], numSplits int, axis int) ([]*tensor.TensorNumeric[T], error)

+ func (p *EngineProxy[T]) Sqrt(ctx context.Context, a *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (p *EngineProxy[T]) StartTracing(tracer TraceRecorder[T])

+ func (p *EngineProxy[T]) StopTracing()

+ func (p *EngineProxy[T]) Sub(ctx context.Context, a, b *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (p *EngineProxy[T]) Sum(ctx context.Context, a *tensor.TensorNumeric[T], axis int, keepDims bool, ...) (*tensor.TensorNumeric[T], error)

+ func (p *EngineProxy[T]) Tanh(ctx context.Context, a *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (p *EngineProxy[T]) TanhPrime(ctx context.Context, a, upstream *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (p *EngineProxy[T]) Transpose(ctx context.Context, a *tensor.TensorNumeric[T], axes []int, ...) (*tensor.TensorNumeric[T], error)

+ func (p *EngineProxy[T]) UnaryOp(ctx context.Context, a *tensor.TensorNumeric[T], op func(T) T, ...) (*tensor.TensorNumeric[T], error)

+ func (p *EngineProxy[T]) Zero(ctx context.Context, a *tensor.TensorNumeric[T]) error

+ func (p *EngineProxy[T]) Zeros(ctx context.Context, a *tensor.TensorNumeric[T], shape []int) error

+ type FP16ToF32Converter interface

+ ConvertFP16ToF32 func(t *tensor.TensorNumeric[float32]) (*tensor.TensorNumeric[float32], error)

+ type FailableTensor struct

+ func NewFailableTensor[T tensor.Numeric](t *tensor.TensorNumeric[T]) *FailableTensor[T]

+ func (f *FailableTensor[T]) Set(value T, indices ...int) error

+ func (f *FailableTensor[T]) SetFailOnSet(fail bool)

+ func (f *FailableTensor[T]) SetFailOnSetAfter(count int)

+ type FailableZeroer struct

+ func NewFailableZeroer[T tensor.Numeric](engine *TestableEngine[T]) *FailableZeroer[T]

+ func (f *FailableZeroer[T]) SetFailOnZero(fail bool)

+ func (f *FailableZeroer[T]) Zero(ctx context.Context, a *tensor.TensorNumeric[T]) error

+ type FusedAddRMSNormProvider interface

+ GPUFusedAddRMSNorm func(input, residual, weight *tensor.TensorNumeric[T], eps float32) (normed *tensor.TensorNumeric[T], residualOut *tensor.TensorNumeric[T], ...)

+ type FusedNormAddProvider interface

+ GPUFusedNormAdd func(input, weight, residual *tensor.TensorNumeric[T], eps float32) (*tensor.TensorNumeric[T], error)

+ type FusedQKNormRoPEProvider interface

+ GPUFusedQKNormRoPE func(input *tensor.TensorNumeric[T], weightQ, weightK *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ type FusedRMSNormer interface

+ FusedRMSNormGPU func(input, weight *tensor.TensorNumeric[float32], epsilon float32) (output, scales *tensor.TensorNumeric[float32], err error)

+ type FusedRoPEProvider interface

+ GPUFusedRoPE func(input, cosAngles, sinAngles *tensor.TensorNumeric[T], rotaryDim int) (*tensor.TensorNumeric[T], error)

+ type FusedScaledSoftmaxProvider interface

+ GPUScaledSoftmax func(input *tensor.TensorNumeric[T], scale float32, axis int) (*tensor.TensorNumeric[T], error)

+ type FusedSwiGLUProvider interface

+ GPUFusedSwiGLU func(w1, w3 *tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error)

+ type GPUArgmaxer interface

+ GPUArgmax func(t *tensor.TensorNumeric[float32]) (int, error)

+ type GPUEngine struct

+ func NewGPUEngine[T tensor.Numeric](ops numeric.Arithmetic[T], deviceID ...int) (*GPUEngine[T], error)

+ func (e *GPUEngine[T]) Add(ctx context.Context, a, b *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) AddScalar(ctx context.Context, a *tensor.TensorNumeric[T], scalar T, ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) ArenaUsedBytes() int

+ func (e *GPUEngine[T]) BatchNormForwardInference(_ context.Context, x, scale, bias, mean, variance *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) BatchNormForwardTraining(_ context.Context, x, scale, bias *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], *tensor.TensorNumeric[T], *tensor.TensorNumeric[T], ...)

+ func (e *GPUEngine[T]) Close() error

+ func (e *GPUEngine[T]) Concat(ctx context.Context, tensors []*tensor.TensorNumeric[T], axis int, ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) Conv2dBackwardData(_ context.Context, w *tensor.TensorNumeric[T], dy *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) Conv2dBackwardFilter(_ context.Context, x *tensor.TensorNumeric[T], dy *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) Conv2dForward(_ context.Context, x, w *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) ConvertFP16ToF32(t *tensor.TensorNumeric[float32]) (*tensor.TensorNumeric[float32], error)

+ func (e *GPUEngine[T]) Copy(ctx context.Context, dst, src *tensor.TensorNumeric[T]) error

+ func (e *GPUEngine[T]) Cos(ctx context.Context, a *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) CudnnActivationBackward(_ context.Context, x, y, dy *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) CudnnActivationForward(_ context.Context, x *tensor.TensorNumeric[T], mode gpuapi.ActivationMode) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) CudnnBatchNormBackward(_ context.Context, x, dy, scale *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], *tensor.TensorNumeric[T], *tensor.TensorNumeric[T], ...)

+ func (e *GPUEngine[T]) CudnnPoolingBackward(_ context.Context, x, y, dy *tensor.TensorNumeric[T], mode gpuapi.PoolingMode, ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) CudnnPoolingForward(_ context.Context, x *tensor.TensorNumeric[T], mode gpuapi.PoolingMode, ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) CudnnSoftmaxForward(_ context.Context, x *tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) DTypeValue() DType

+ func (e *GPUEngine[T]) DeviceID() int

+ func (e *GPUEngine[T]) Div(ctx context.Context, a, b *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) DivScalar(ctx context.Context, a *tensor.TensorNumeric[T], scalar T, ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) Exp(ctx context.Context, a *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) Fill(ctx context.Context, t *tensor.TensorNumeric[T], value T) error

+ func (e *GPUEngine[T]) FusedRMSNormGPU(input, weight *tensor.TensorNumeric[float32], epsilon float32) (*tensor.TensorNumeric[float32], *tensor.TensorNumeric[float32], error)

+ func (e *GPUEngine[T]) GPUArgmax(t *tensor.TensorNumeric[float32]) (int, error)

+ func (e *GPUEngine[T]) GPUFusedAddRMSNorm(input, residual *tensor.TensorNumeric[T], weight *tensor.TensorNumeric[T], ...) (normed *tensor.TensorNumeric[T], residualOut *tensor.TensorNumeric[T], ...)

+ func (e *GPUEngine[T]) GPUFusedNormAdd(input, weight, residual *tensor.TensorNumeric[T], eps float32) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) GPUFusedQKNormRoPE(input *tensor.TensorNumeric[T], weightQ, weightK *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) GPUFusedRoPE(input, cosAngles, sinAngles *tensor.TensorNumeric[T], rotaryDim int) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) GPUFusedSwiGLU(w1, w3 *tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) GPUScaledSoftmax(input *tensor.TensorNumeric[T], scale float32, axis int) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) GPUStream() gpuapi.Stream

+ func (e *GPUEngine[T]) Gather(ctx context.Context, params *tensor.TensorNumeric[T], ...) error

+ func (e *GPUEngine[T]) IsManagedMemory() bool

+ func (e *GPUEngine[T]) Log(ctx context.Context, a *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) MatMul(ctx context.Context, a, b *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) MatMulTransposeB(ctx context.Context, a, b *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) Mul(ctx context.Context, a, b *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) MulScalar(ctx context.Context, a *tensor.TensorNumeric[T], scalar T, ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) OOMFallbackCount() int64

+ func (e *GPUEngine[T]) OneHot(ctx context.Context, input *tensor.TensorNumeric[int], depth int, ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) Ops() numeric.Arithmetic[T]

+ func (e *GPUEngine[T]) Pow(ctx context.Context, base, exponent *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) RandomUniform(ctx context.Context, t *tensor.TensorNumeric[T], minVal, maxVal T) error

+ func (e *GPUEngine[T]) ReduceMean(ctx context.Context, a *tensor.TensorNumeric[T], axis int, keepDims bool, ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) ReduceSum(ctx context.Context, a *tensor.TensorNumeric[T], axis int, keepDims bool, ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) Repeat(ctx context.Context, a *tensor.TensorNumeric[T], axis int, repetitions int, ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) ResetPool()

+ func (e *GPUEngine[T]) Reshape(ctx context.Context, a *tensor.TensorNumeric[T], shape []int, ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) Rsqrt(ctx context.Context, a *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) ScatterAdd(ctx context.Context, dEmbeddingTable *tensor.TensorNumeric[T], ...) error

+ func (e *GPUEngine[T]) SetArenaResetFloor(floor int)

+ func (e *GPUEngine[T]) SetDType(d DType)

+ func (e *GPUEngine[T]) SetLogger(l log.Logger)

+ func (e *GPUEngine[T]) Sin(ctx context.Context, a *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) Softmax(ctx context.Context, a *tensor.TensorNumeric[T], axis int, ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) Split(ctx context.Context, a *tensor.TensorNumeric[T], numSplits int, axis int) ([]*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) Sqrt(ctx context.Context, a *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) Stream() unsafe.Pointer

+ func (e *GPUEngine[T]) Sub(ctx context.Context, a, b *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) Sum(ctx context.Context, a *tensor.TensorNumeric[T], axis int, keepDims bool, ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) Sync() error

+ func (e *GPUEngine[T]) Tanh(ctx context.Context, a *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) TanhPrime(ctx context.Context, a, upstream *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) Transpose(ctx context.Context, a *tensor.TensorNumeric[T], axes []int, ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) UnaryOp(ctx context.Context, a *tensor.TensorNumeric[T], op func(T) T, ...) (*tensor.TensorNumeric[T], error)

+ func (e *GPUEngine[T]) UploadWeights(tensors []*tensor.TensorNumeric[float32]) error

+ func (e *GPUEngine[T]) Zero(ctx context.Context, a *tensor.TensorNumeric[T]) error

+ func (e *GPUEngine[T]) Zeros(ctx context.Context, a *tensor.TensorNumeric[T], shape []int) error

+ type GPUStreamAccessor interface

+ GPUStream func() gpuapi.Stream

+ type MemoryTracker struct

+ func NewMemoryTracker(limit int64) *MemoryTracker

+ func (m *MemoryTracker) Alloc(bytes int64) error

+ func (m *MemoryTracker) Allocated() int64

+ func (m *MemoryTracker) Free(bytes int64)

+ func (m *MemoryTracker) Limit() int64

+ type PoolResetter interface

+ ResetPool func()

+ type StreamProvider interface

+ Stream func() unsafe.Pointer

+ type TensorArena struct

+ func (a *TensorArena) Get(n int) []float32

+ func (a *TensorArena) Put(buf []float32)

+ func (a *TensorArena) Reset()

+ type TensorPool struct

+ func NewTensorPool[T tensor.Numeric]() *TensorPool[T]

+ func (p *TensorPool[T]) Acquire(shape []int) (*tensor.TensorNumeric[T], error)

+ func (p *TensorPool[T]) Len() int

+ func (p *TensorPool[T]) Release(t *tensor.TensorNumeric[T])

+ type TestableEngine struct

+ func NewTestableEngine[T tensor.Numeric](ops numeric.Arithmetic[T]) *TestableEngine[T]

+ func (e *TestableEngine[T]) TestableMatMul(_ context.Context, a, b *tensor.TensorNumeric[T], result *FailableTensor[T]) error

+ func (e *TestableEngine[T]) TestableSum(ctx context.Context, a *tensor.TensorNumeric[T], axis int, _ bool, ...) error

+ func (e *TestableEngine[T]) TestableTranspose(_ context.Context, a *tensor.TensorNumeric[T], result *FailableTensor[T]) error

+ type TraceRecorder interface

+ Record func(opName string, inputs []*tensor.TensorNumeric[T], ...)

+ RecordGather func(params *tensor.TensorNumeric[T], indices *tensor.TensorNumeric[int], ...)

+ RecordMultiOutput func(opName string, inputs []*tensor.TensorNumeric[T], ...)

+ type TracedOp struct

+ ExtraArgs map[string]any

+ InputIDs []int

+ OpName string

+ OutputID int

+ OutputIDs []int

+ type Tracer struct

+ func NewTracer[T tensor.Numeric](frozenTensors []*tensor.TensorNumeric[T]) *Tracer[T]

+ func (t *Tracer[T]) FrozenSlots() []int

+ func (t *Tracer[T]) HasOpaqueOps() bool

+ func (t *Tracer[T]) MarkOpaque()

+ func (t *Tracer[T]) NextSlot() int

+ func (t *Tracer[T]) Record(opName string, inputs []*tensor.TensorNumeric[T], ...)

+ func (t *Tracer[T]) RecordGather(params *tensor.TensorNumeric[T], indices *tensor.TensorNumeric[int], ...)

+ func (t *Tracer[T]) RecordMultiOutput(opName string, inputs []*tensor.TensorNumeric[T], ...)

+ func (t *Tracer[T]) SlotFor(tn *tensor.TensorNumeric[T]) int

+ func (t *Tracer[T]) SlotShapes() map[int][]int

+ func (t *Tracer[T]) TracedOps() []TracedOp

+ type TransposeBMatMuler interface

+ MatMulTransposeB func(ctx context.Context, a, b *tensor.TensorNumeric[T], ...) (*tensor.TensorNumeric[T], error)

+ type WeightUploader interface

+ UploadWeights func(tensors []*tensor.TensorNumeric[float32]) error