Documentation
¶
Overview ¶
Package tensor provides a multi-dimensional array (tensor) implementation.
Index ¶
- func AssertClose[T Numeric](t *testing.T, expected, actual *TensorNumeric[T], tolerance float64)
- func AssertEquals[T Numeric](t *testing.T, expected, actual *TensorNumeric[T])
- func BroadcastIndex(index int, shape, outputShape []int, broadcast bool) int
- func BroadcastShapes(a, b []int) (shape []int, broadcastA, broadcastB bool, err error)
- func ConvertInt64ToInt(s []int64) []int
- func ConvertIntToInt64(s []int) []int64
- func DequantizeQ4K(raw []byte, dst []float32)
- func DequantizeQ5K(raw []byte, dst []float32)
- func DequantizeQ6K(raw []byte, dst []float32)
- func Equals[T Numeric](a, b *TensorNumeric[T]) bool
- func Float32ToBytes(f []float32) ([]byte, error)
- func Int8ToBytes(i []int8) ([]byte, error)
- func Ones[T Numeric](size int) []T
- func Product(s []int) int
- func Q4GPUDataOffset(totalBlocks int) int
- func Q4GPUScaleOffset() int
- func SameShape(a, b []int) bool
- func ShapesEqual(a, b []int) bool
- func Uint8ToBytes(u []uint8) ([]byte, error)
- type Addable
- type BFloat16Storage
- func (s *BFloat16Storage) DeviceType() device.Type
- func (s *BFloat16Storage) GPUPtr() (unsafe.Pointer, int, int)
- func (s *BFloat16Storage) Len() int
- func (s *BFloat16Storage) RawBytes() []byte
- func (s *BFloat16Storage) Set(data []float32)
- func (s *BFloat16Storage) SetGPUPtr(ptr unsafe.Pointer, byteSize, deviceID int)
- func (s *BFloat16Storage) Slice() []float32
- type CPUStorage
- type FP8E4M3Storage
- func (s *FP8E4M3Storage) DeviceType() device.Type
- func (s *FP8E4M3Storage) GPUPtr() (unsafe.Pointer, int, int)
- func (s *FP8E4M3Storage) Len() int
- func (s *FP8E4M3Storage) RawBytes() []byte
- func (s *FP8E4M3Storage) Scale() float32
- func (s *FP8E4M3Storage) ScaleGPUPtr() unsafe.Pointer
- func (s *FP8E4M3Storage) Set(data []float32)
- func (s *FP8E4M3Storage) SetGPUPtr(ptr unsafe.Pointer, byteSize, deviceID int)
- func (s *FP8E4M3Storage) SetScaleGPUPtr(ptr unsafe.Pointer)
- func (s *FP8E4M3Storage) Slice() []float32
- type FP8E5M2Storage
- type Float
- type Float16Storage
- func (s *Float16Storage) DeviceType() device.Type
- func (s *Float16Storage) GPUPtr() (unsafe.Pointer, int, int)
- func (s *Float16Storage) Len() int
- func (s *Float16Storage) RawBytes() []byte
- func (s *Float16Storage) Set(data []float32)
- func (s *Float16Storage) SetGPUByteSize(byteSize int)
- func (s *Float16Storage) SetGPUPtr(ptr unsafe.Pointer, byteSize, deviceID int)
- func (s *Float16Storage) Slice() []float32
- func (s *Float16Storage) SubSlice(offset, length int) *Float16Storage
- type GPUStorage
- func NewGPUStorage[T Numeric](length int, deviceID ...int) (*GPUStorage[T], error)
- func NewGPUStorageFromPool[T Numeric](devPtr unsafe.Pointer, length int, pool gpuapi.MemPool, deviceID int) (*GPUStorage[T], error)
- func NewGPUStorageFromPtr[T Numeric](devPtr unsafe.Pointer, length int, deviceID ...int) (*GPUStorage[T], error)
- func NewGPUStorageFromSlice[T Numeric](data []T, deviceID ...int) (*GPUStorage[T], error)
- func NewGPUStorageView[T Numeric](parent *GPUStorage[T], offsetElems, length int) *GPUStorage[T]
- func NewGPUStorageViewFromPtr[T Numeric](devPtr unsafe.Pointer, length int, deviceID int) *GPUStorage[T]
- func NewManagedGPUStorage[T Numeric](pool gpuapi.MemPool, length int, deviceID ...int) (*GPUStorage[T], error)
- func (s *GPUStorage[T]) CopyFromDevice(src *GPUStorage[T], dstOffsetElems, srcOffsetElems, numElems int) error
- func (s *GPUStorage[T]) CopyFromDeviceAsync(src *GPUStorage[T], dstOffsetElems, srcOffsetElems, numElems int, ...) error
- func (s *GPUStorage[T]) CopyFromHost(data []T, dstOffsetElems int) error
- func (s *GPUStorage[T]) CopyFromHostAsync(data []T, dstOffsetElems int, stream gpuapi.Stream) error
- func (s *GPUStorage[T]) CopyTo(dst []T) error
- func (s *GPUStorage[T]) DeviceID() int
- func (s *GPUStorage[T]) DeviceType() device.Type
- func (s *GPUStorage[T]) Free() error
- func (s *GPUStorage[T]) Len() int
- func (s *GPUStorage[T]) Managed() bool
- func (s *GPUStorage[T]) Ptr() unsafe.Pointer
- func (s *GPUStorage[T]) Set(data []T)
- func (s *GPUStorage[T]) Slice() []T
- func (s *GPUStorage[T]) SubSlice(offsetElems, length int) *GPUStorage[T]
- func (s *GPUStorage[T]) TrySet(data []T) error
- func (s *GPUStorage[T]) TrySlice() ([]T, error)
- func (s *GPUStorage[T]) View(length int) *GPUStorage[T]
- type Numeric
- type Q4KStorage
- func (q *Q4KStorage) Dequantize(dst []float32)
- func (q *Q4KStorage) DeviceType() device.Type
- func (q *Q4KStorage) GPUPtr() (unsafe.Pointer, int, int)
- func (q *Q4KStorage) Len() int
- func (q *Q4KStorage) NumBlocks() int
- func (q *Q4KStorage) RawBytes() []byte
- func (q *Q4KStorage) Set(_ []float32)
- func (q *Q4KStorage) SetGPUPtr(ptr unsafe.Pointer, byteSize, deviceID int)
- func (q *Q4KStorage) Slice() []float32
- type Q4Storage
- func (q *Q4Storage) BlockData(i int) *byte
- func (q *Q4Storage) BlockPtr(i int) *byte
- func (q *Q4Storage) BlockScaleF32(i int) float32
- func (q *Q4Storage) ByteSize() int
- func (q *Q4Storage) Dequantize(dst []float32)
- func (q *Q4Storage) DeviceType() device.Type
- func (q *Q4Storage) GPUPtr() (unsafe.Pointer, int, int)
- func (q *Q4Storage) Len() int
- func (q *Q4Storage) NumBlocks() int
- func (q *Q4Storage) RawBytes() []byte
- func (q *Q4Storage) RawBytesGPU(blocksPerRow int) []byte
- func (q *Q4Storage) Set(_ []float32)
- func (q *Q4Storage) SetGPUPtr(ptr unsafe.Pointer, byteSize, deviceID int)
- func (q *Q4Storage) Slice() []float32
- type Q5KStorage
- type Q6KStorage
- type Q8Storage
- func (q *Q8Storage) BlockQuants(i int) []int8
- func (q *Q8Storage) BlockScale(i int) float32
- func (q *Q8Storage) ByteSize() int
- func (q *Q8Storage) Dequantize(dst []float32)
- func (q *Q8Storage) DequantizeBlock(blockIdx int, dst *[32]float32)
- func (q *Q8Storage) DequantizeRange(dst []float32, start, count int)
- func (q *Q8Storage) DeviceType() device.Type
- func (q *Q8Storage) GPUPtr() (unsafe.Pointer, int, int)
- func (q *Q8Storage) Len() int
- func (q *Q8Storage) NumBlocks() int
- func (q *Q8Storage) RawBytes() []byte
- func (q *Q8Storage) Set(_ []float32)
- func (q *Q8Storage) SetGPUPtr(ptr unsafe.Pointer, byteSize, deviceID int)
- func (q *Q8Storage) Slice() []float32
- type Storage
- type Tensor
- type TensorBool
- type TensorNumeric
- func New[T Numeric](shape []int, data []T) (*TensorNumeric[T], error)
- func NewFromBytes[T Numeric](shape []int, data []byte) (*TensorNumeric[T], error)
- func NewWithStorage[T Numeric](shape []int, s Storage[T]) (*TensorNumeric[T], error)
- func ToCPU[T Numeric](t *TensorNumeric[T]) *TensorNumeric[T]
- func ToGPU[T Numeric](t *TensorNumeric[T]) (*TensorNumeric[T], error)
- func ToGPUDevice[T Numeric](t *TensorNumeric[T], deviceID int) (*TensorNumeric[T], error)
- func (t *TensorNumeric[T]) At(indices ...int) (T, error)
- func (t *TensorNumeric[T]) Bytes() ([]byte, error)
- func (t *TensorNumeric[T]) Copy() *TensorNumeric[T]
- func (t *TensorNumeric[T]) DType() reflect.Type
- func (t *TensorNumeric[T]) Data() []T
- func (t *TensorNumeric[T]) Dims() int
- func (t *TensorNumeric[T]) Each(fn func(T))
- func (t *TensorNumeric[T]) GetStorage() Storage[T]
- func (t *TensorNumeric[T]) Release()
- func (t *TensorNumeric[T]) Reshape(newShape []int) (*TensorNumeric[T], error)
- func (t *TensorNumeric[T]) Set(value T, indices ...int) error
- func (t *TensorNumeric[T]) SetData(data []T)
- func (t *TensorNumeric[T]) SetShape(shape []int)
- func (t *TensorNumeric[T]) SetStorage(s Storage[T])
- func (t *TensorNumeric[T]) SetStrides(strides []int)
- func (t *TensorNumeric[T]) Shape() []int
- func (t *TensorNumeric[T]) ShapeEquals(other *TensorNumeric[T]) bool
- func (t *TensorNumeric[T]) Size() int
- func (t *TensorNumeric[T]) Slice(ranges ...[2]int) (*TensorNumeric[T], error)
- func (t *TensorNumeric[T]) Strides() []int
- func (t *TensorNumeric[T]) String() string
- type TensorString
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func AssertClose ¶
func AssertClose[T Numeric](t *testing.T, expected, actual *TensorNumeric[T], tolerance float64)
AssertClose checks if two tensors are close enough and fails the test if they are not.
func AssertEquals ¶
func AssertEquals[T Numeric](t *testing.T, expected, actual *TensorNumeric[T])
AssertEquals checks if two tensors are equal and fails the test if they are not.
func BroadcastIndex ¶
BroadcastIndex computes the index in the original tensor for a given index in the broadcasted tensor.
func BroadcastShapes ¶
BroadcastShapes computes the resulting shape of a broadcast operation between two shapes.
func ConvertInt64ToInt ¶
ConvertInt64ToInt converts a slice of int64 to a slice of int.
func ConvertIntToInt64 ¶
ConvertIntToInt64 converts a slice of int to a slice of int64.
func DequantizeQ4K ¶
DequantizeQ4K dequantizes one Q4_K super-block (144 bytes) into 256 float32 values. Each 32 bytes of quantized data produces 64 output values: low nibbles map to the first 32 positions and high nibbles map to the next 32 positions. This matches llama.cpp's dequantize_row_q4_K.
func DequantizeQ5K ¶
DequantizeQ5K dequantizes one Q5_K super-block (176 bytes) into 256 float32 values. Same split ordering as Q4_K, but each element has an extra high bit from qh. For each group of 64 elements (32 bytes of ql):
low nibbles  + qh bit (2*group)   -> positions j..j+31
high nibbles + qh bit (2*group+1) -> positions j+32..j+63
This matches llama.cpp's dequantize_row_q5_K.
func DequantizeQ6K ¶
DequantizeQ6K dequantizes one Q6_K super-block (210 bytes) into 256 float32 values. Each 128-element half uses 64 ql bytes + 32 qh bytes to produce 4 groups of 32:
low nibbles of ql[0:32]   + qh bits 0-1 -> positions 0-31
low nibbles of ql[32:64]  + qh bits 2-3 -> positions 32-63
high nibbles of ql[0:32]  + qh bits 4-5 -> positions 64-95
high nibbles of ql[32:64] + qh bits 6-7 -> positions 96-127
This matches llama.cpp's dequantize_row_q6_K.
func Equals ¶
func Equals[T Numeric](a, b *TensorNumeric[T]) bool
Equals checks if two tensors are equal.
func Float32ToBytes ¶
Float32ToBytes converts a float32 slice to a byte slice.
func Int8ToBytes ¶
Int8ToBytes converts an int8 slice to a byte slice.
func Q4GPUDataOffset ¶
Q4GPUDataOffset returns the byte offset from the start of RawBytesGPU output where the packed data region begins, given the total number of blocks.
func Q4GPUScaleOffset ¶
func Q4GPUScaleOffset() int
Q4GPUScaleOffset returns the byte offset from the start of RawBytesGPU output where the scale region begins (always 0).
func ShapesEqual ¶
ShapesEqual compares two shapes and returns true if they are equal.
func Uint8ToBytes ¶
Uint8ToBytes converts a uint8 slice to a byte slice.
Types ¶
type Addable ¶
type Addable interface {
~int | ~int8 | ~int16 | ~int32 | ~int64 |
~uint | ~uint32 | ~uint64 |
~float32 | ~float64
}
Addable defines the constraint for numeric types that support the built-in arithmetic operators directly (e.g., +, -, *, /) and zero literals. This intentionally excludes custom minifloat types like float8.Float8, float16.Float16, and float16.BFloat16, which are defined types that do not support Go's built-in operators without explicit conversion helpers.
type BFloat16Storage ¶
type BFloat16Storage struct {
// contains filtered or unexported fields
}
BFloat16Storage holds float32 tensor data in BFloat16 format on CPU. It implements Storage[float32] so that models can use BF16 weights with FP32 activations (mixed-precision inference). The raw BF16 bytes can be uploaded to GPU for use with cublasGemmEx.
func NewBFloat16Storage ¶
func NewBFloat16Storage(src []float32) *BFloat16Storage
NewBFloat16Storage converts float32 data to BFloat16 format.
func NewBFloat16StorageFromRaw ¶
func NewBFloat16StorageFromRaw(data []uint16) *BFloat16Storage
NewBFloat16StorageFromRaw creates a BFloat16Storage from pre-encoded uint16 values.
func (*BFloat16Storage) DeviceType ¶
func (s *BFloat16Storage) DeviceType() device.Type
DeviceType returns device.CPU.
func (*BFloat16Storage) GPUPtr ¶
func (s *BFloat16Storage) GPUPtr() (unsafe.Pointer, int, int)
GPUPtr returns the cached GPU device pointer, byte size, and device ID. Returns nil if no GPU copy exists.
func (*BFloat16Storage) Len ¶
func (s *BFloat16Storage) Len() int
Len returns the number of logical float32 elements.
func (*BFloat16Storage) RawBytes ¶
func (s *BFloat16Storage) RawBytes() []byte
RawBytes returns the raw BF16 data as a byte slice (2 bytes per element).
func (*BFloat16Storage) Set ¶
func (s *BFloat16Storage) Set(data []float32)
Set encodes float32 data into BFloat16 format.
func (*BFloat16Storage) SetGPUPtr ¶
func (s *BFloat16Storage) SetGPUPtr(ptr unsafe.Pointer, byteSize, deviceID int)
SetGPUPtr stores a pre-uploaded GPU device pointer for the raw BF16 bytes.
func (*BFloat16Storage) Slice ¶
func (s *BFloat16Storage) Slice() []float32
Slice decodes BFloat16 data to float32.
type CPUStorage ¶
type CPUStorage[T Numeric] struct {
// contains filtered or unexported fields
}
CPUStorage is a CPU-backed Storage implementation wrapping a Go slice. Slice() returns the underlying slice with zero copy.
func NewCPUStorage ¶
func NewCPUStorage[T Numeric](data []T) *CPUStorage[T]
NewCPUStorage creates a new CPUStorage wrapping the provided data slice.
func (*CPUStorage[T]) DeviceType ¶
func (s *CPUStorage[T]) DeviceType() device.Type
DeviceType returns device.CPU.
func (*CPUStorage[T]) Set ¶
func (s *CPUStorage[T]) Set(data []T)
Set replaces the underlying data slice.
func (*CPUStorage[T]) Slice ¶
func (s *CPUStorage[T]) Slice() []T
Slice returns the underlying data slice directly (zero copy).
type FP8E4M3Storage ¶
type FP8E4M3Storage struct {
// contains filtered or unexported fields
}
FP8E4M3Storage holds FP8 E4M3 quantized tensor data on CPU. Uses per-tensor absmax scaling: fp8_value = float32_value / scale.
func NewFP8E4M3Storage ¶
func NewFP8E4M3Storage(src []float32) *FP8E4M3Storage
NewFP8E4M3Storage quantizes float32 data into FP8 E4M3 format with absmax scaling.
func (*FP8E4M3Storage) DeviceType ¶
func (s *FP8E4M3Storage) DeviceType() device.Type
DeviceType returns device.CPU.
func (*FP8E4M3Storage) GPUPtr ¶
func (s *FP8E4M3Storage) GPUPtr() (unsafe.Pointer, int, int)
GPUPtr returns the cached GPU device pointer, byte size, and device ID. Returns nil if no GPU copy exists.
func (*FP8E4M3Storage) Len ¶
func (s *FP8E4M3Storage) Len() int
Len returns the number of logical float32 elements.
func (*FP8E4M3Storage) RawBytes ¶
func (s *FP8E4M3Storage) RawBytes() []byte
RawBytes returns the raw FP8 data as a byte slice (1 byte per element).
func (*FP8E4M3Storage) Scale ¶
func (s *FP8E4M3Storage) Scale() float32
Scale returns the per-tensor scale factor.
func (*FP8E4M3Storage) ScaleGPUPtr ¶
func (s *FP8E4M3Storage) ScaleGPUPtr() unsafe.Pointer
ScaleGPUPtr returns the GPU device pointer for the per-tensor scale factor.
func (*FP8E4M3Storage) Set ¶
func (s *FP8E4M3Storage) Set(data []float32)
Set encodes float32 data into FP8 E4M3 format.
func (*FP8E4M3Storage) SetGPUPtr ¶
func (s *FP8E4M3Storage) SetGPUPtr(ptr unsafe.Pointer, byteSize, deviceID int)
SetGPUPtr stores a pre-uploaded GPU device pointer for the raw FP8 bytes.
func (*FP8E4M3Storage) SetScaleGPUPtr ¶
func (s *FP8E4M3Storage) SetScaleGPUPtr(ptr unsafe.Pointer)
SetScaleGPUPtr stores the GPU device pointer for the per-tensor scale factor.
func (*FP8E4M3Storage) Slice ¶
func (s *FP8E4M3Storage) Slice() []float32
Slice decodes FP8 E4M3 data to float32 by multiplying by the scale factor.
type FP8E5M2Storage ¶
type FP8E5M2Storage struct {
// contains filtered or unexported fields
}
FP8E5M2Storage holds FP8 E5M2 quantized tensor data on CPU. Uses per-tensor absmax scaling: fp8_value = float32_value / scale.
func NewFP8E5M2Storage ¶
func NewFP8E5M2Storage(src []float32) *FP8E5M2Storage
NewFP8E5M2Storage quantizes float32 data into FP8 E5M2 format with absmax scaling.
func (*FP8E5M2Storage) DeviceType ¶
func (s *FP8E5M2Storage) DeviceType() device.Type
DeviceType returns device.CPU.
func (*FP8E5M2Storage) Len ¶
func (s *FP8E5M2Storage) Len() int
Len returns the number of logical float32 elements.
func (*FP8E5M2Storage) Scale ¶
func (s *FP8E5M2Storage) Scale() float32
Scale returns the per-tensor scale factor.
func (*FP8E5M2Storage) Set ¶
func (s *FP8E5M2Storage) Set(data []float32)
Set encodes float32 data into FP8 E5M2 format.
func (*FP8E5M2Storage) Slice ¶
func (s *FP8E5M2Storage) Slice() []float32
Slice decodes FP8 E5M2 data to float32 by multiplying by the scale factor.
type Float16Storage ¶
type Float16Storage struct {
// contains filtered or unexported fields
}
Float16Storage holds IEEE 754 half-precision (FP16) tensor data. Each element is stored as 2 bytes in little-endian order. This is a native FP16 storage type — no per-tensor scaling is needed.
func NewFloat16StorageFromF32 ¶
func NewFloat16StorageFromF32(src []float32) *Float16Storage
NewFloat16StorageFromF32 converts float32 data to FP16 and returns a Float16Storage.
func NewFloat16StorageFromRaw ¶ added in v0.2.0
func NewFloat16StorageFromRaw(raw []byte, numElems int) *Float16Storage
NewFloat16StorageFromRaw creates a Float16Storage from pre-encoded FP16 bytes. The raw slice must contain numElems * 2 bytes in little-endian IEEE 754 half-precision format. A copy is made so the caller can reuse raw.
func NewFloat16StorageGPU ¶
func NewFloat16StorageGPU(ptr unsafe.Pointer, numElems, deviceID int) *Float16Storage
NewFloat16StorageGPU creates a Float16Storage backed by a GPU device pointer. The storage has no host data; only the GPU pointer is set.
func (*Float16Storage) DeviceType ¶
func (s *Float16Storage) DeviceType() device.Type
DeviceType returns device.CPU.
func (*Float16Storage) GPUPtr ¶
func (s *Float16Storage) GPUPtr() (unsafe.Pointer, int, int)
GPUPtr returns the cached GPU device pointer, byte size, and device ID. Returns nil if no GPU copy exists.
func (*Float16Storage) Len ¶
func (s *Float16Storage) Len() int
Len returns the number of logical FP16 elements.
func (*Float16Storage) RawBytes ¶
func (s *Float16Storage) RawBytes() []byte
RawBytes returns the raw FP16 data as a byte slice (2 bytes per element).
func (*Float16Storage) Set ¶
func (s *Float16Storage) Set(data []float32)
Set encodes float32 data into FP16 format.
func (*Float16Storage) SetGPUByteSize ¶
func (s *Float16Storage) SetGPUByteSize(byteSize int)
SetGPUByteSize updates the GPU byte size. This is useful when the GPU allocation size differs from the logical byte size (e.g. padded allocations).
func (*Float16Storage) SetGPUPtr ¶
func (s *Float16Storage) SetGPUPtr(ptr unsafe.Pointer, byteSize, deviceID int)
SetGPUPtr stores a pre-uploaded GPU device pointer for the raw FP16 bytes.
func (*Float16Storage) Slice ¶
func (s *Float16Storage) Slice() []float32
Slice decodes FP16 data to float32. Returns a zero-filled slice if no host data is available (GPU-only storage).
func (*Float16Storage) SubSlice ¶
func (s *Float16Storage) SubSlice(offset, length int) *Float16Storage
SubSlice returns a zero-copy view into the storage from element offset for length elements. The caller must ensure the parent outlives the returned view.
type GPUStorage ¶
type GPUStorage[T Numeric] struct {
// contains filtered or unexported fields
}
GPUStorage is a GPU device-backed Storage implementation. Slice() copies data from the GPU to a new CPU slice (not zero-copy). Set() copies data from a CPU slice to the GPU. Each GPUStorage tracks which device it resides on via deviceID. When managed is true, the storage uses unified memory (cudaMallocManaged) and TrySlice/TrySet access the pointer directly without Memcpy.
Shared ownership: when View() is called, the returned GPUStorage shares the same refcount. Free() decrements the refcount; only the last Free actually releases memory (back to pool or via cudaFree). This avoids both double-free and GC-dependent cleanup for reshape/transpose views.
func NewGPUStorage ¶
func NewGPUStorage[T Numeric](length int, deviceID ...int) (*GPUStorage[T], error)
NewGPUStorage allocates GPU device memory for the given number of elements on the specified device. An optional deviceID selects the GPU (default 0).
func NewGPUStorageFromPool ¶
func NewGPUStorageFromPool[T Numeric](devPtr unsafe.Pointer, length int, pool gpuapi.MemPool, deviceID int) (*GPUStorage[T], error)
NewGPUStorageFromPool wraps a GPU device pointer allocated from a MemPool. When Free() is called, the pointer is returned to the pool instead of being freed via cudaFree. Uses reference counting so views can safely share the allocation without double-free or GC-dependent cleanup.
func NewGPUStorageFromPtr ¶
func NewGPUStorageFromPtr[T Numeric](devPtr unsafe.Pointer, length int, deviceID ...int) (*GPUStorage[T], error)
NewGPUStorageFromPtr wraps an existing GPU device pointer as a GPUStorage. A GC finalizer ensures the device memory is freed if Release() is not called. An optional deviceID records which device the pointer belongs to (default 0).
func NewGPUStorageFromSlice ¶
func NewGPUStorageFromSlice[T Numeric](data []T, deviceID ...int) (*GPUStorage[T], error)
NewGPUStorageFromSlice allocates GPU device memory, copies data from a CPU slice, and returns a GPUStorage on the specified device. An optional deviceID selects the GPU (default 0).
func NewGPUStorageView ¶
func NewGPUStorageView[T Numeric](parent *GPUStorage[T], offsetElems, length int) *GPUStorage[T]
NewGPUStorageView creates a non-owning view into an existing GPUStorage starting at offsetElems elements from the beginning. The returned storage shares the parent's device memory -- no finalizer is set, so the parent must outlive the view.
func NewGPUStorageViewFromPtr ¶
func NewGPUStorageViewFromPtr[T Numeric](devPtr unsafe.Pointer, length int, deviceID int) *GPUStorage[T]
NewGPUStorageViewFromPtr creates a non-owning GPUStorage that wraps a raw device pointer. Free() is a no-op — the caller retains ownership of the memory. This is used for scratchpad buffers where the compute engine owns the allocation and the tensor is a temporary view into it.
func NewManagedGPUStorage ¶
func NewManagedGPUStorage[T Numeric](pool gpuapi.MemPool, length int, deviceID ...int) (*GPUStorage[T], error)
NewManagedGPUStorage allocates unified (managed) GPU memory via pool.AllocManaged. The returned storage is host-accessible: TrySlice and TrySet skip Memcpy. This is beneficial on hardware with coherent unified memory (e.g. DGX Spark NVLink-C2C). On backends that do not support managed memory, AllocManaged returns an error.
func (*GPUStorage[T]) CopyFromDevice ¶
func (s *GPUStorage[T]) CopyFromDevice(src *GPUStorage[T], dstOffsetElems, srcOffsetElems, numElems int) error
CopyFromDevice copies numElems elements from src (at srcOffsetElems) into s (at dstOffsetElems) using a synchronous device-to-device memcpy. Both storages must reside on the same device.
func (*GPUStorage[T]) CopyFromDeviceAsync ¶
func (s *GPUStorage[T]) CopyFromDeviceAsync(src *GPUStorage[T], dstOffsetElems, srcOffsetElems, numElems int, stream gpuapi.Stream) error
CopyFromDeviceAsync copies numElems elements from src (at srcOffsetElems) into s (at dstOffsetElems) using an asynchronous device-to-device memcpy on the given stream. Both storages must reside on the same device.
func (*GPUStorage[T]) CopyFromHost ¶
func (s *GPUStorage[T]) CopyFromHost(data []T, dstOffsetElems int) error
CopyFromHost copies elements from a CPU slice into s starting at dstOffsetElems using a synchronous host-to-device memcpy.
func (*GPUStorage[T]) CopyFromHostAsync ¶
func (s *GPUStorage[T]) CopyFromHostAsync(data []T, dstOffsetElems int, stream gpuapi.Stream) error
CopyFromHostAsync copies elements from a CPU slice into s starting at dstOffsetElems using an asynchronous host-to-device memcpy on the given stream. The caller must ensure the source slice remains valid until the stream is synchronized.
func (*GPUStorage[T]) CopyTo ¶
func (s *GPUStorage[T]) CopyTo(dst []T) error
CopyTo copies GPU device memory into an existing CPU slice without allocating. The destination must have at least Len() elements. Returns an error on failure.
func (*GPUStorage[T]) DeviceID ¶
func (s *GPUStorage[T]) DeviceID() int
DeviceID returns the GPU device ordinal this storage resides on.
func (*GPUStorage[T]) DeviceType ¶
func (s *GPUStorage[T]) DeviceType() device.Type
DeviceType returns the device type for this storage.
func (*GPUStorage[T]) Free ¶
func (s *GPUStorage[T]) Free() error
Free releases the GPU device memory. After calling Free, the storage must not be used. For refcounted storage (pool-backed with views), the refcount is decremented and memory is only returned to the pool when it reaches 0. For legacy (non-refcounted) views, Free is a no-op.
func (*GPUStorage[T]) Managed ¶
func (s *GPUStorage[T]) Managed() bool
Managed returns true if this storage uses unified (managed) memory.
func (*GPUStorage[T]) Ptr ¶
func (s *GPUStorage[T]) Ptr() unsafe.Pointer
Ptr returns the raw GPU device pointer.
func (*GPUStorage[T]) Set ¶
func (s *GPUStorage[T]) Set(data []T)
Set copies data from a CPU slice to the GPU, replacing the current contents. On error, logs a warning instead of panicking.
func (*GPUStorage[T]) Slice ¶
func (s *GPUStorage[T]) Slice() []T
Slice copies device memory to a new CPU slice and returns it. On error, logs a warning and returns a zero-valued slice.
func (*GPUStorage[T]) SubSlice ¶
func (s *GPUStorage[T]) SubSlice(offsetElems, length int) *GPUStorage[T]
SubSlice returns a non-owning GPUStorage view into a sub-range of the receiver's device buffer, starting at offsetElems for length elements. No data is copied (no D2H transfer). The caller must ensure the parent outlives the returned view.
func (*GPUStorage[T]) TrySet ¶
func (s *GPUStorage[T]) TrySet(data []T) error
TrySet copies data from a CPU slice to the GPU, replacing the current contents. If the new slice has a different length, the old device memory is freed and new memory is allocated. For managed storage, data is written directly to the unified pointer without Memcpy. Returns an error on failure.
func (*GPUStorage[T]) TrySlice ¶
func (s *GPUStorage[T]) TrySlice() ([]T, error)
TrySlice copies device memory to a new CPU slice. For managed storage, the data is read directly from the unified pointer without a D2H Memcpy. Returns an error if the copy fails.
func (*GPUStorage[T]) View ¶
func (s *GPUStorage[T]) View(length int) *GPUStorage[T]
View returns a GPUStorage sharing the same device pointer but with a different element count. If the parent has a refcount (pool-backed), the view shares it and Free() on any copy decrements; only the last Free returns memory to the pool. For non-refcounted storage the view uses the legacy no-op Free behavior.
type Numeric ¶
type Numeric interface {
~int | ~int8 | ~int16 | ~int32 | ~int64 |
~uint | uint8 | ~uint32 | ~uint64 |
~float32 | ~float64 |
float8.Float8 |
float16.Float16 |
float16.BFloat16
}
Numeric defines the constraint for numeric types that can be used in Tensors.
type Q4KStorage ¶
type Q4KStorage struct {
// contains filtered or unexported fields
}
Q4KStorage holds Q4_K quantized tensor data on CPU.
func NewQ4KStorageFromRaw ¶
func NewQ4KStorageFromRaw(raw []byte, numElements int) (*Q4KStorage, error)
NewQ4KStorageFromRaw creates Q4KStorage from raw super-block data.
func (*Q4KStorage) Dequantize ¶
func (q *Q4KStorage) Dequantize(dst []float32)
Dequantize unpacks all Q4_K super-blocks into dst.
func (*Q4KStorage) DeviceType ¶
func (q *Q4KStorage) DeviceType() device.Type
func (*Q4KStorage) GPUPtr ¶
func (q *Q4KStorage) GPUPtr() (unsafe.Pointer, int, int)
GPUPtr returns the cached GPU device pointer, byte size, and device ID. Returns nil if no GPU copy exists.
func (*Q4KStorage) Len ¶
func (q *Q4KStorage) Len() int
func (*Q4KStorage) NumBlocks ¶
func (q *Q4KStorage) NumBlocks() int
NumBlocks returns the number of Q4_K super-blocks.
func (*Q4KStorage) RawBytes ¶
func (q *Q4KStorage) RawBytes() []byte
RawBytes returns the raw Q4_K super-block data for GPU upload. The layout is contiguous super-blocks, each 144 bytes.
func (*Q4KStorage) Set ¶
func (q *Q4KStorage) Set(_ []float32)
func (*Q4KStorage) SetGPUPtr ¶
func (q *Q4KStorage) SetGPUPtr(ptr unsafe.Pointer, byteSize, deviceID int)
SetGPUPtr stores a pre-uploaded GPU device pointer for the raw bytes.
func (*Q4KStorage) Slice ¶
func (q *Q4KStorage) Slice() []float32
type Q4Storage ¶
type Q4Storage struct {
// contains filtered or unexported fields
}
Q4Storage holds Q4_0 quantized tensor data on CPU.
func MergeQ4Storage ¶
MergeQ4Storage concatenates multiple Q4Storage objects into one. Used to merge Q/K/V or Gate/Up weight matrices row-wise for single-GEMV optimization during inference decode.
func NewQ4StorageFromRaw ¶
NewQ4StorageFromRaw creates Q4Storage from raw block data in the standard Q4_0 format: 18 bytes per block (2 bytes float16 scale LE + 16 bytes packed nibbles). numElements is the number of logical float32 elements the data represents.
func QuantizeQ4 ¶
QuantizeQ4 quantizes a float32 slice into Q4_0 format. The input is padded to a multiple of 32 if necessary.
func (*Q4Storage) BlockPtr ¶
BlockPtr returns an unsafe pointer to block i's q4Block struct (18 bytes). The layout is: 2 bytes float16 scale (LE) + 16 bytes packed nibble data. Blocks are contiguous in memory with 18-byte stride.
func (*Q4Storage) BlockScaleF32 ¶
BlockScaleF32 returns the dequantization scale for block i as float32.
func (*Q4Storage) ByteSize ¶
ByteSize returns the raw byte size of the quantized data. Each block is 18 bytes (2 byte scale + 16 bytes packed data).
func (*Q4Storage) Dequantize ¶
Dequantize unpacks Q4_0 blocks into dst. len(dst) must be >= q.Len(). Low nibbles map to the first half (positions 0-15) and high nibbles map to the second half (positions 16-31), matching llama.cpp's dequantize_row_q4_0.
func (*Q4Storage) DeviceType ¶
DeviceType returns device.CPU.
func (*Q4Storage) GPUPtr ¶
GPUPtr returns the cached GPU device pointer, byte size, and device ID. Returns nil if no GPU copy exists.
func (*Q4Storage) RawBytes ¶
RawBytes serializes Q4_0 blocks as contiguous bytes for GPU upload. Each block is 18 bytes: 2 bytes little-endian float16 scale + 16 bytes packed data.
func (*Q4Storage) RawBytesGPU ¶
RawBytesGPU serializes Q4_0 blocks in a GPU-optimized separated layout. The layout is global (not per-row), so it works regardless of how the weight matrix is logically viewed (before or after virtual transpose):
[all_scales: N * 2 bytes] [padding to 16-byte align] [all_data: N * 16 bytes]
The kernel indexes by block_idx = row * blocks_per_row + bi, which is the same linear block index regardless of the row definition.
blocksPerRow is unused but kept for API compatibility.
func (*Q4Storage) SetGPUPtr ¶
SetGPUPtr stores a pre-uploaded GPU device pointer for the raw bytes. byteSize must match len(RawBytes()). The caller retains ownership of the pointer.
type Q5KStorage ¶
type Q5KStorage struct {
// contains filtered or unexported fields
}
Q5KStorage holds Q5_K quantized tensor data on CPU.
func NewQ5KStorageFromRaw ¶
func NewQ5KStorageFromRaw(raw []byte, numElements int) (*Q5KStorage, error)
NewQ5KStorageFromRaw creates Q5KStorage from raw super-block data.
func (*Q5KStorage) Dequantize ¶
func (q *Q5KStorage) Dequantize(dst []float32)
func (*Q5KStorage) DeviceType ¶
func (q *Q5KStorage) DeviceType() device.Type
func (*Q5KStorage) Len ¶
func (q *Q5KStorage) Len() int
func (*Q5KStorage) Set ¶
func (q *Q5KStorage) Set(_ []float32)
func (*Q5KStorage) Slice ¶
func (q *Q5KStorage) Slice() []float32
type Q6KStorage ¶
type Q6KStorage struct {
// contains filtered or unexported fields
}
Q6KStorage holds Q6_K quantized tensor data on CPU.
func NewQ6KStorageFromRaw ¶
func NewQ6KStorageFromRaw(raw []byte, numElements int) (*Q6KStorage, error)
NewQ6KStorageFromRaw creates Q6KStorage from raw super-block data.
func (*Q6KStorage) Dequantize ¶
func (q *Q6KStorage) Dequantize(dst []float32)
func (*Q6KStorage) DeviceType ¶
func (q *Q6KStorage) DeviceType() device.Type
func (*Q6KStorage) Len ¶
func (q *Q6KStorage) Len() int
func (*Q6KStorage) Set ¶
func (q *Q6KStorage) Set(_ []float32)
func (*Q6KStorage) Slice ¶
func (q *Q6KStorage) Slice() []float32
type Q8Storage ¶
type Q8Storage struct {
// contains filtered or unexported fields
}
Q8Storage holds Q8_0 quantized tensor data on CPU.
func NewQ8StorageFromBlocks ¶
NewQ8StorageFromBlocks creates Q8Storage from pre-decoded block data. scales has one entry per block. quants has 32 int8 values per block (flattened). numElements is the number of logical float32 elements.
func QuantizeQ8 ¶
QuantizeQ8 quantizes a float32 slice into Q8_0 format.
func (*Q8Storage) BlockQuants ¶
BlockQuants returns the int8 quantized values for block i.
func (*Q8Storage) BlockScale ¶
BlockScale returns the float32 scale for block i.
func (*Q8Storage) Dequantize ¶
Dequantize unpacks Q8_0 blocks into dst.
func (*Q8Storage) DequantizeBlock ¶
DequantizeBlock unpacks a single Q8_0 block into a 32-element buffer.
func (*Q8Storage) DequantizeRange ¶
DequantizeRange unpacks Q8_0 blocks covering the range [start, start+count) into dst, which must have length >= count.
func (*Q8Storage) DeviceType ¶
DeviceType returns device.CPU.
func (*Q8Storage) GPUPtr ¶
GPUPtr returns the cached GPU device pointer, byte size, and device ID. The returned pointer is nil if no GPU copy exists.
func (*Q8Storage) RawBytes ¶
RawBytes serializes Q8_0 blocks as contiguous bytes for GPU upload. Each block is 36 bytes: 4 bytes little-endian float32 scale + 32 bytes int8 data.
type Storage ¶
type Storage[T Numeric] interface {
// Len returns the number of elements.
Len() int
// Slice returns a CPU-accessible []T.
Slice() []T
// Set replaces the storage contents from a CPU slice.
Set(data []T)
// DeviceType returns the device type this storage resides on.
DeviceType() device.Type
}
Storage abstracts over CPU and GPU tensor data storage. For CPU storage, Slice() returns the underlying slice directly (zero copy). For GPU storage, Slice() copies device memory to a new host slice.
type Tensor ¶
type Tensor interface {
Shape() []int
DType() reflect.Type
// contains filtered or unexported methods
}
Tensor is an interface that all concrete tensor types must implement. This allows the graph to be type-agnostic at a high level.
type TensorBool ¶
type TensorBool struct {
// contains filtered or unexported fields
}
TensorBool represents an n-dimensional array of booleans.
func NewBool ¶
func NewBool(shape []int, data []bool) (*TensorBool, error)
NewBool creates a new TensorBool with the given shape and initializes it with the provided data.
func (*TensorBool) Bytes ¶
func (t *TensorBool) Bytes() ([]byte, error)
Bytes returns the underlying data of the tensor as a byte slice.
func (*TensorBool) DType ¶
func (t *TensorBool) DType() reflect.Type
DType returns the reflect.Type of the tensor's elements.
func (*TensorBool) Data ¶
func (t *TensorBool) Data() []bool
Data returns a slice representing the underlying data of the tensor.
func (*TensorBool) Shape ¶
func (t *TensorBool) Shape() []int
Shape returns a copy of the tensor's shape.
type TensorNumeric ¶
type TensorNumeric[T Numeric] struct {
// contains filtered or unexported fields
}
TensorNumeric represents an n-dimensional array of a generic numeric type T.
Note: The name includes the package term "Tensor" which may appear as stutter (tensor.TensorNumeric). This is intentional for clarity and API stability.
func New ¶
func New[T Numeric](shape []int, data []T) (*TensorNumeric[T], error)
New creates a new TensorNumeric with the given shape and initializes it with the provided data.
func NewFromBytes ¶
func NewFromBytes[T Numeric](shape []int, data []byte) (*TensorNumeric[T], error)
NewFromBytes creates a new tensor from bytes data with the given shape.
func NewWithStorage ¶
func NewWithStorage[T Numeric](shape []int, s Storage[T]) (*TensorNumeric[T], error)
NewWithStorage creates a TensorNumeric backed by the given Storage. This allows creating tensors with GPUStorage or any other Storage implementation.
func ToCPU ¶
func ToCPU[T Numeric](t *TensorNumeric[T]) *TensorNumeric[T]
ToCPU creates a new tensor with CPUStorage containing the same data as the source tensor. Shape and strides are preserved. The source tensor is not modified.
func ToGPU ¶
func ToGPU[T Numeric](t *TensorNumeric[T]) (*TensorNumeric[T], error)
ToGPU creates a new tensor with GPUStorage on device 0 containing the same data as the source tensor. Shape and strides are preserved. The source tensor is not modified.
func ToGPUDevice ¶
func ToGPUDevice[T Numeric](t *TensorNumeric[T], deviceID int) (*TensorNumeric[T], error)
ToGPUDevice creates a new tensor with GPUStorage on the specified device containing the same data as the source tensor. If the source tensor is already on a GPU, a peer-to-peer D2D copy is used when the devices differ; if on the same device, a D2D copy is performed. If the source is CPU-backed, an H2D copy targets the specified device.
func (*TensorNumeric[T]) At ¶
func (t *TensorNumeric[T]) At(indices ...int) (T, error)
At retrieves the value at the specified indices. It returns an error if the number of indices does not match the tensor's dimensions or if any index is out of bounds.
func (*TensorNumeric[T]) Bytes ¶
func (t *TensorNumeric[T]) Bytes() ([]byte, error)
Bytes returns the underlying data of the tensor as a byte slice.
func (*TensorNumeric[T]) Copy ¶
func (t *TensorNumeric[T]) Copy() *TensorNumeric[T]
Copy creates a deep copy of the tensor.
func (*TensorNumeric[T]) DType ¶
func (t *TensorNumeric[T]) DType() reflect.Type
DType returns the reflect.Type of the tensor's elements.
func (*TensorNumeric[T]) Data ¶
func (t *TensorNumeric[T]) Data() []T
Data returns a slice representing the underlying data of the tensor. For views, this returns only the data visible through the view.
func (*TensorNumeric[T]) Dims ¶
func (t *TensorNumeric[T]) Dims() int
Dims returns the number of dimensions of the tensor.
func (*TensorNumeric[T]) Each ¶
func (t *TensorNumeric[T]) Each(fn func(T))
Each applies a function to each element of the tensor.
func (*TensorNumeric[T]) GetStorage ¶
func (t *TensorNumeric[T]) GetStorage() Storage[T]
GetStorage returns the underlying storage of the tensor.
func (*TensorNumeric[T]) Release ¶
func (t *TensorNumeric[T]) Release()
Release frees any external resources held by this tensor's storage. For CPU tensors this is a no-op. For GPU tensors it frees device memory. After calling Release the tensor must not be used.
func (*TensorNumeric[T]) Reshape ¶
func (t *TensorNumeric[T]) Reshape(newShape []int) (*TensorNumeric[T], error)
Reshape returns a new TensorNumeric with a different shape that shares the same underlying data. The new shape must have the same total number of elements as the original tensor. This operation is a "view" and does not copy the data.
func (*TensorNumeric[T]) Set ¶
func (t *TensorNumeric[T]) Set(value T, indices ...int) error
Set updates the value at the specified indices. It returns an error if the number of indices does not match the tensor's dimensions, if any index is out of bounds, or if the tensor is a read-only view.
func (*TensorNumeric[T]) SetData ¶
func (t *TensorNumeric[T]) SetData(data []T)
SetData sets the underlying data of the tensor.
func (*TensorNumeric[T]) SetShape ¶
func (t *TensorNumeric[T]) SetShape(shape []int)
SetShape sets the tensor's shape.
func (*TensorNumeric[T]) SetStorage ¶
func (t *TensorNumeric[T]) SetStorage(s Storage[T])
SetStorage replaces the underlying storage of the tensor.
func (*TensorNumeric[T]) SetStrides ¶
func (t *TensorNumeric[T]) SetStrides(strides []int)
SetStrides sets the tensor's strides.
func (*TensorNumeric[T]) Shape ¶
func (t *TensorNumeric[T]) Shape() []int
Shape returns a copy of the tensor's shape.
func (*TensorNumeric[T]) ShapeEquals ¶
func (t *TensorNumeric[T]) ShapeEquals(other *TensorNumeric[T]) bool
ShapeEquals returns true if the shapes of two tensors are identical.
func (*TensorNumeric[T]) Size ¶
func (t *TensorNumeric[T]) Size() int
Size returns the total number of elements in the tensor.
func (*TensorNumeric[T]) Slice ¶
func (t *TensorNumeric[T]) Slice(ranges ...[2]int) (*TensorNumeric[T], error)
Slice creates a new TensorNumeric view for the specified range. A slice is defined by a start and end index for each dimension. The returned tensor shares the same underlying data.
func (*TensorNumeric[T]) Strides ¶
func (t *TensorNumeric[T]) Strides() []int
Strides returns a copy of the tensor's strides.
func (*TensorNumeric[T]) String ¶
func (t *TensorNumeric[T]) String() string
String returns a string representation of the tensor.
type TensorString ¶
type TensorString struct {
// contains filtered or unexported fields
}
TensorString represents an n-dimensional array of strings.
func NewString ¶
func NewString(shape []int, data []string) (*TensorString, error)
NewString creates a new TensorString with the given shape and initializes it with the provided data.
func (*TensorString) DType ¶
func (t *TensorString) DType() reflect.Type
DType returns the reflect.Type of the tensor's elements.
func (*TensorString) Data ¶
func (t *TensorString) Data() []string
Data returns a slice representing the underlying data of the tensor.
func (*TensorString) Shape ¶
func (t *TensorString) Shape() []int
Shape returns a copy of the tensor's shape.