metrics

package
v1.0.20 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jul 29, 2022 License: Apache-2.0 Imports: 17 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
var (
	// DutyCycleNodeGpu reports the percent of time when the GPU was actively processing,
	// aggregated per node. Labeled by GPU make, accelerator ID, and model.
	DutyCycleNodeGpu = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "duty_cycle_gpu_node",
			Help: "Percent of time when the GPU was actively processing",
		},
		[]string{"make", "accelerator_id", "model"})

	// MemoryTotalNodeGpu reports the total memory available on the GPU, in bytes,
	// aggregated per node. Labeled by GPU make, accelerator ID, and model.
	MemoryTotalNodeGpu = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "memory_total_gpu_node",
			Help: "Total memory available on the GPU in bytes",
		},
		[]string{"make", "accelerator_id", "model"})

	// MemoryUsedNodeGpu reports allocated GPU memory, in bytes, aggregated per node.
	// Labeled by GPU make, accelerator ID, and model.
	MemoryUsedNodeGpu = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "memory_used_gpu_node",
			Help: "Allocated GPU memory in bytes",
		},
		[]string{"make", "accelerator_id", "model"})

	// DutyCycle reports the percent of time when the GPU was actively processing,
	// per container. Labeled by the container's namespace/pod/container identity
	// plus the GPU make, accelerator ID, and model.
	DutyCycle = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "duty_cycle",
			Help: "Percent of time when the GPU was actively processing",
		},
		[]string{"namespace", "pod", "container", "make", "accelerator_id", "model"})

	// MemoryTotal reports the total memory available on the GPU, in bytes,
	// per container. Labeled by the container's namespace/pod/container identity
	// plus the GPU make, accelerator ID, and model.
	MemoryTotal = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "memory_total",
			Help: "Total memory available on the GPU in bytes",
		},
		[]string{"namespace", "pod", "container", "make", "accelerator_id", "model"})

	// MemoryUsed reports allocated GPU memory, in bytes, per container.
	// Labeled by the container's namespace/pod/container identity plus the
	// GPU make, accelerator ID, and model.
	MemoryUsed = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "memory_used",
			Help: "Allocated GPU memory in bytes",
		},
		[]string{"namespace", "pod", "container", "make", "accelerator_id", "model"})

	// AcceleratorRequests reports the number of accelerator devices requested by
	// the container. Labeled by namespace/pod/container identity and the
	// Kubernetes resource name under which the devices were requested.
	AcceleratorRequests = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "request",
			Help: "Number of accelerator devices requested by the container",
		},
		[]string{"namespace", "pod", "container", "resource_name"})
)

Functions

func AverageGPUUtilization

func AverageGPUUtilization(uuid string, since time.Duration) (uint, error)

AverageGPUUtilization reports the average GPU utilization of the device identified by uuid over the period given by since.

func DeviceFromName

func DeviceFromName(deviceName string) (*nvml.Device, error)

DeviceFromName returns the device object for a given device name.

func DiscoverGPUDevices

func DiscoverGPUDevices() error

DiscoverGPUDevices discovers GPUs attached to the node, and updates the `gpuDevices` map.

func GetAllGpuDevices

func GetAllGpuDevices() map[string]*nvml.Device

func GetDevicesForAllContainers

func GetDevicesForAllContainers() (map[ContainerID][]string, error)

GetDevicesForAllContainers returns a map with container as the key and the list of devices allocated to that container as the value. It will skip time-shared GPU devices when time-sharing solution is enabled.

Types

type ContainerID

type ContainerID struct {
	// contains filtered or unexported fields
}

ContainerID uniquely identifies a container.

type MetricServer

type MetricServer struct {
	// contains filtered or unexported fields
}

MetricServer exposes GPU metrics for all containers and nodes in prometheus format on the specified port.

func NewMetricServer

func NewMetricServer(collectionInterval, port int, metricsEndpointPath string) *MetricServer

func (*MetricServer) Start

func (m *MetricServer) Start() error

Start performs necessary initializations and starts the metric server.

func (*MetricServer) Stop

func (m *MetricServer) Stop()

Stop performs cleanup operations and stops the metric server.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL