metrics

package
v1.0.20 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jul 29, 2022 License: Apache-2.0 Imports: 17 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
var (
	// DutyCycleNodeGpu reports the percent of time when the GPU was actively processing,
	// aggregated per node. Labeled by GPU make, accelerator ID, and model.
	DutyCycleNodeGpu = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "duty_cycle_gpu_node",
			Help: "Percent of time when the GPU was actively processing",
		},
		[]string{"make", "accelerator_id", "model"})

	// MemoryTotalNodeGpu reports the total memory available on the GPU, in bytes,
	// aggregated per node. Labeled by GPU make, accelerator ID, and model.
	MemoryTotalNodeGpu = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "memory_total_gpu_node",
			Help: "Total memory available on the GPU in bytes",
		},
		[]string{"make", "accelerator_id", "model"})

	// MemoryUsedNodeGpu reports allocated GPU memory, in bytes, aggregated per node.
	// Labeled by GPU make, accelerator ID, and model.
	MemoryUsedNodeGpu = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "memory_used_gpu_node",
			Help: "Allocated GPU memory in bytes",
		},
		[]string{"make", "accelerator_id", "model"})

	// DutyCycle reports the percent of time when the GPU was actively processing,
	// per container. Labeled by the container's namespace/pod/container identity
	// plus the GPU make, accelerator ID, and model.
	DutyCycle = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "duty_cycle",
			Help: "Percent of time when the GPU was actively processing",
		},
		[]string{"namespace", "pod", "container", "make", "accelerator_id", "model"})

	// MemoryTotal reports the total memory available on the GPU, in bytes,
	// per container. Labeled by the container's namespace/pod/container identity
	// plus the GPU make, accelerator ID, and model.
	MemoryTotal = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "memory_total",
			Help: "Total memory available on the GPU in bytes",
		},
		[]string{"namespace", "pod", "container", "make", "accelerator_id", "model"})

	// MemoryUsed reports allocated GPU memory, in bytes, per container.
	// Labeled by the container's namespace/pod/container identity plus the
	// GPU make, accelerator ID, and model.
	MemoryUsed = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "memory_used",
			Help: "Allocated GPU memory in bytes",
		},
		[]string{"namespace", "pod", "container", "make", "accelerator_id", "model"})

	// AcceleratorRequests reports the number of accelerator devices requested by
	// the container. Labeled by namespace/pod/container identity and the
	// Kubernetes resource name under which the devices were requested.
	AcceleratorRequests = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "request",
			Help: "Number of accelerator devices requested by the container",
		},
		[]string{"namespace", "pod", "container", "resource_name"})
)

Functions

func AverageGPUUtilization

func AverageGPUUtilization(uuid string, since time.Duration) (uint, error)

AverageGPUUtilization reports the average GPU utilization of the device identified by uuid over the period given by since.

func DeviceFromName

func DeviceFromName(deviceName string) (*nvml.Device, error)

DeviceFromName returns the device object for a given device name.

func DiscoverGPUDevices

func DiscoverGPUDevices() error

DiscoverGPUDevices discovers GPUs attached to the node, and updates the `gpuDevices` map.

func GetAllGpuDevices

func GetAllGpuDevices() map[string]*nvml.Device

func GetDevicesForAllContainers

func GetDevicesForAllContainers() (map[ContainerID][]string, error)

GetDevicesForAllContainers returns a map with container as the key and the list of devices allocated to that container as the value. It will skip time-shared GPU devices when time-sharing solution is enabled.

Types

type ContainerID

type ContainerID struct {
	// contains filtered or unexported fields
}

ContainerID uniquely identifies a container.

type MetricServer

type MetricServer struct {
	// contains filtered or unexported fields
}

MetricServer exposes GPU metrics for all containers and nodes in prometheus format on the specified port.

func NewMetricServer

func NewMetricServer(collectionInterval, port int, metricsEndpointPath string) *MetricServer

func (*MetricServer) Start

func (m *MetricServer) Start() error

Start performs necessary initializations and starts the metric server.

func (*MetricServer) Stop

func (m *MetricServer) Stop()

Stop performs cleanup operations and stops the metric server.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL