dcgm

package
v0.0.0-...-7c92211 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 1, 2025 License: Apache-2.0 Imports: 22 Imported by: 6

Documentation

Overview

Package dcgm provides bindings for NVIDIA's Data Center GPU Manager (DCGM)

Package dcgm provides bindings for NVIDIA's Data Center GPU Manager (DCGM)

Index

Constants

View Source
const (
	// Embedded starts the hostengine within this process.
	Embedded mode = iota
	// Standalone connects to an already running nv-hostengine.
	Standalone
	// StartHostengine starts and connects to nv-hostengine, and terminates it before exiting.
	StartHostengine
)

Constants for the DCGM hostengine running modes: Embedded, Standalone, or StartHostengine.

View Source
const (
	// Field types. Each value is the character code of a single letter.

	// DCGM_FT_BINARY is the type for binary data
	DCGM_FT_BINARY = uint('b')
	// DCGM_FT_DOUBLE is the type for floating-point numbers
	DCGM_FT_DOUBLE = uint('d')
	// DCGM_FT_INT64 is the type for 64-bit integers
	DCGM_FT_INT64 = uint('i')
	// DCGM_FT_STRING is the type for strings
	DCGM_FT_STRING = uint('s')
	// DCGM_FT_TIMESTAMP is the type for timestamps
	DCGM_FT_TIMESTAMP = uint('t')

	// Blank (sentinel) values. For each value kind, the NOT_FOUND, NOT_SUPPORTED and
	// NOT_PERMISSIONED sentinels are the BLANK value plus 1, 2 and 3 respectively.

	// DCGM_FT_INT32_BLANK is the blank value for 32-bit integers (0x7ffffff0)
	DCGM_FT_INT32_BLANK = int64(2147483632)
	// DCGM_FT_INT32_NOT_FOUND is the value for not found in 32-bit integers
	DCGM_FT_INT32_NOT_FOUND = DCGM_FT_INT32_BLANK + 1
	// DCGM_FT_INT32_NOT_SUPPORTED is the value for not supported in 32-bit integers
	DCGM_FT_INT32_NOT_SUPPORTED = DCGM_FT_INT32_BLANK + 2
	// DCGM_FT_INT32_NOT_PERMISSIONED is the value for not permissioned in 32-bit integers
	DCGM_FT_INT32_NOT_PERMISSIONED = DCGM_FT_INT32_BLANK + 3
	// DCGM_FT_INT64_BLANK is the blank value for 64-bit integers (0x7ffffffffffffff0)
	DCGM_FT_INT64_BLANK = int64(9223372036854775792)
	// DCGM_FT_INT64_NOT_FOUND is the value for not found in 64-bit integers
	DCGM_FT_INT64_NOT_FOUND = DCGM_FT_INT64_BLANK + 1
	// DCGM_FT_INT64_NOT_SUPPORTED is the value for not supported in 64-bit integers
	DCGM_FT_INT64_NOT_SUPPORTED = DCGM_FT_INT64_BLANK + 2
	// DCGM_FT_INT64_NOT_PERMISSIONED is the value for not permissioned in 64-bit integers
	DCGM_FT_INT64_NOT_PERMISSIONED = DCGM_FT_INT64_BLANK + 3
	// DCGM_FT_FP64_BLANK is the blank value for floating-point numbers (2^47)
	DCGM_FT_FP64_BLANK = 140737488355328.0
	// DCGM_FT_FP64_NOT_FOUND is the value for not found in floating-point numbers
	DCGM_FT_FP64_NOT_FOUND = float64(DCGM_FT_FP64_BLANK + 1.0)
	// DCGM_FT_FP64_NOT_SUPPORTED is the value for not supported in floating-point numbers
	DCGM_FT_FP64_NOT_SUPPORTED = float64(DCGM_FT_FP64_BLANK + 2.0)
	// DCGM_FT_FP64_NOT_PERMISSIONED is the value for not permissioned in floating-point numbers
	DCGM_FT_FP64_NOT_PERMISSIONED = float64(DCGM_FT_FP64_BLANK + 3.0)
	// DCGM_FT_STR_BLANK is the blank value for strings
	DCGM_FT_STR_BLANK = "<<<NULL>>>"
	// DCGM_FT_STR_NOT_FOUND is the value for not found in strings
	DCGM_FT_STR_NOT_FOUND = "<<<NOT_FOUND>>>"
	// DCGM_FT_STR_NOT_SUPPORTED is the value for not supported in strings
	DCGM_FT_STR_NOT_SUPPORTED = "<<<NOT_SUPPORTED>>>"
	// DCGM_FT_STR_NOT_PERMISSIONED is the value for not permissioned in strings
	DCGM_FT_STR_NOT_PERMISSIONED = "<<<NOT_PERMISSIONED>>>"

	// DCGM status codes: zero for OK, negative for every other status.
	// NOTE(review): these appear to mirror the C dcgmReturn_t enum — confirm against the DCGM headers.
	// NOTE(review): the values -2 and -10 are not defined in this block — confirm that is intentional.

	// DCGM_ST_OK is the status code for OK
	DCGM_ST_OK = 0
	// DCGM_ST_BADPARAM is the status code for BAD PARAM
	DCGM_ST_BADPARAM = -1
	// DCGM_ST_GENERIC_ERROR is the status code for GENERIC ERROR
	DCGM_ST_GENERIC_ERROR = -3
	// DCGM_ST_MEMORY is the status code for MEMORY
	DCGM_ST_MEMORY = -4
	// DCGM_ST_NOT_CONFIGURED is the status code for NOT CONFIGURED
	DCGM_ST_NOT_CONFIGURED = -5
	// DCGM_ST_NOT_SUPPORTED is the status code for NOT SUPPORTED
	DCGM_ST_NOT_SUPPORTED = -6
	// DCGM_ST_INIT_ERROR is the status code for INIT ERROR
	DCGM_ST_INIT_ERROR = -7
	// DCGM_ST_NVML_ERROR is the status code for NVML ERROR
	DCGM_ST_NVML_ERROR = -8
	// DCGM_ST_PENDING is the status code for PENDING
	DCGM_ST_PENDING = -9
	// DCGM_ST_TIMEOUT is the status code for TIMEOUT
	DCGM_ST_TIMEOUT = -11
	// DCGM_ST_VER_MISMATCH is the status code for VER MISMATCH
	DCGM_ST_VER_MISMATCH = -12
	// DCGM_ST_UNKNOWN_FIELD is the status code for UNKNOWN FIELD
	DCGM_ST_UNKNOWN_FIELD = -13
	// DCGM_ST_NO_DATA is the status code for NO DATA
	DCGM_ST_NO_DATA = -14
	// DCGM_ST_STALE_DATA is the status code for STALE DATA
	DCGM_ST_STALE_DATA = -15
	// DCGM_ST_NOT_WATCHED is the status code for NOT WATCHED
	DCGM_ST_NOT_WATCHED = -16
	// DCGM_ST_NO_PERMISSION is the status code for NO PERMISSION
	DCGM_ST_NO_PERMISSION = -17
	// DCGM_ST_GPU_IS_LOST is the status code for GPU IS LOST
	DCGM_ST_GPU_IS_LOST = -18
	// DCGM_ST_RESET_REQUIRED is the status code for RESET REQUIRED
	DCGM_ST_RESET_REQUIRED = -19
	// DCGM_ST_FUNCTION_NOT_FOUND is the status code for FUNCTION NOT FOUND
	DCGM_ST_FUNCTION_NOT_FOUND = -20
	// DCGM_ST_CONNECTION_NOT_VALID is the status code for CONNECTION NOT VALID
	DCGM_ST_CONNECTION_NOT_VALID = -21
	// DCGM_ST_GPU_NOT_SUPPORTED is the status code for GPU NOT SUPPORTED
	DCGM_ST_GPU_NOT_SUPPORTED = -22
	// DCGM_ST_GROUP_INCOMPATIBLE is the status code for GROUP INCOMPATIBLE
	DCGM_ST_GROUP_INCOMPATIBLE = -23
	// DCGM_ST_MAX_LIMIT is the status code for MAX LIMIT
	DCGM_ST_MAX_LIMIT = -24
	// DCGM_ST_LIBRARY_NOT_FOUND is the status code for LIBRARY NOT FOUND
	DCGM_ST_LIBRARY_NOT_FOUND = -25
	// DCGM_ST_DUPLICATE_KEY is the status code for DUPLICATE KEY
	DCGM_ST_DUPLICATE_KEY = -26
	// DCGM_ST_GPU_IN_SYNC_BOOST_GROUP is the status code for GPU IN SYNC BOOST GROUP
	DCGM_ST_GPU_IN_SYNC_BOOST_GROUP = -27
	// DCGM_ST_GPU_NOT_IN_SYNC_BOOST_GROUP is the status code for GPU NOT IN SYNC BOOST GROUP
	DCGM_ST_GPU_NOT_IN_SYNC_BOOST_GROUP = -28
	// DCGM_ST_REQUIRES_ROOT is the status code for REQUIRES ROOT
	DCGM_ST_REQUIRES_ROOT = -29
	// DCGM_ST_NVVS_ERROR is the status code for NVVS ERROR
	DCGM_ST_NVVS_ERROR = -30
	// DCGM_ST_INSUFFICIENT_SIZE is the status code for INSUFFICIENT SIZE
	DCGM_ST_INSUFFICIENT_SIZE = -31
	// DCGM_ST_FIELD_UNSUPPORTED_BY_API is the status code for FIELD UNSUPPORTED BY API
	DCGM_ST_FIELD_UNSUPPORTED_BY_API = -32
	// DCGM_ST_MODULE_NOT_LOADED is the status code for MODULE NOT LOADED
	DCGM_ST_MODULE_NOT_LOADED = -33
	// DCGM_ST_IN_USE is the status code for IN USE
	DCGM_ST_IN_USE = -34
	// DCGM_ST_GROUP_IS_EMPTY is the status code for GROUP IS EMPTY
	DCGM_ST_GROUP_IS_EMPTY = -35
	// DCGM_ST_PROFILING_NOT_SUPPORTED is the status code for PROFILING NOT SUPPORTED
	DCGM_ST_PROFILING_NOT_SUPPORTED = -36
	// DCGM_ST_PROFILING_LIBRARY_ERROR is the status code for PROFILING LIBRARY ERROR
	DCGM_ST_PROFILING_LIBRARY_ERROR = -37
	// DCGM_ST_PROFILING_MULTI_PASS is the status code for PROFILING MULTI PASS
	DCGM_ST_PROFILING_MULTI_PASS = -38
	// DCGM_ST_DIAG_ALREADY_RUNNING is the status code for DIAG ALREADY RUNNING
	DCGM_ST_DIAG_ALREADY_RUNNING = -39
	// DCGM_ST_DIAG_BAD_JSON is the status code for DIAG BAD JSON
	DCGM_ST_DIAG_BAD_JSON = -40
	// DCGM_ST_DIAG_BAD_LAUNCH is the status code for DIAG BAD LAUNCH
	DCGM_ST_DIAG_BAD_LAUNCH = -41
	// DCGM_ST_DIAG_UNUSED is the status code for DIAG UNUSED
	DCGM_ST_DIAG_UNUSED = -42
	// DCGM_ST_DIAG_THRESHOLD_EXCEEDED is the status code for DIAG THRESHOLD EXCEEDED
	DCGM_ST_DIAG_THRESHOLD_EXCEEDED = -43
	// DCGM_ST_INSUFFICIENT_DRIVER_VERSION is the status code for INSUFFICIENT DRIVER VERSION
	DCGM_ST_INSUFFICIENT_DRIVER_VERSION = -44
	// DCGM_ST_INSTANCE_NOT_FOUND is the status code for INSTANCE NOT FOUND
	DCGM_ST_INSTANCE_NOT_FOUND = -45
	// DCGM_ST_COMPUTE_INSTANCE_NOT_FOUND is the status code for COMPUTE INSTANCE NOT FOUND
	DCGM_ST_COMPUTE_INSTANCE_NOT_FOUND = -46
	// DCGM_ST_CHILD_NOT_KILLED is the status code for CHILD NOT KILLED
	DCGM_ST_CHILD_NOT_KILLED = -47
	// DCGM_ST_3RD_PARTY_LIBRARY_ERROR is the status code for 3RD PARTY LIBRARY ERROR
	DCGM_ST_3RD_PARTY_LIBRARY_ERROR = -48
	// DCGM_ST_INSUFFICIENT_RESOURCES is the status code for INSUFFICIENT RESOURCES
	DCGM_ST_INSUFFICIENT_RESOURCES = -49
	// DCGM_ST_PLUGIN_EXCEPTION is the status code for PLUGIN EXCEPTION
	DCGM_ST_PLUGIN_EXCEPTION = -50
	// DCGM_ST_NVVS_ISOLATE_ERROR is the status code for NVVS ISOLATE ERROR
	DCGM_ST_NVVS_ISOLATE_ERROR = -51
	// DCGM_ST_NVVS_BINARY_NOT_FOUND is the status code for NVVS BINARY NOT FOUND
	DCGM_ST_NVVS_BINARY_NOT_FOUND = -52
	// DCGM_ST_NVVS_KILLED is the status code for NVVS KILLED
	DCGM_ST_NVVS_KILLED = -53
	// DCGM_ST_PAUSED is the status code for PAUSED
	DCGM_ST_PAUSED = -54
	// DCGM_ST_ALREADY_INITIALIZED is the status code for ALREADY INITIALIZED
	DCGM_ST_ALREADY_INITIALIZED = -55
	// DCGM_ST_NVML_NOT_LOADED is the status code for NVML NOT LOADED
	DCGM_ST_NVML_NOT_LOADED = -56
	// DCGM_ST_NVML_DRIVER_TIMEOUT is the status code for NVML DRIVER TIMEOUT
	DCGM_ST_NVML_DRIVER_TIMEOUT = -57
	// DCGM_ST_NVVS_NO_AVAILABLE_TEST is the status code for NVVS NO AVAILABLE TEST
	DCGM_ST_NVVS_NO_AVAILABLE_TEST = -58
)

Field type constants, blank (sentinel) values, and DCGM_ST_* status codes

View Source
const (
	// MAX_NUM_CPU_CORES represents the maximum number of CPU cores supported
	MAX_NUM_CPU_CORES = uint(C.DCGM_MAX_NUM_CPU_CORES)

	// MAX_NUM_CPUS represents the maximum number of CPUs supported
	MAX_NUM_CPUS = uint(C.DCGM_MAX_NUM_CPUS)

	// CHAR_BIT represents the number of bits in a byte
	CHAR_BIT = uint(C.CHAR_BIT)

	// MAX_CPU_CORE_BITMASK_COUNT represents the maximum count of CPU core bitmasks.
	// The expression evaluates to 16.
	// NOTE(review): presumably 1024 core bits / 8 bits per byte / 8 bytes per uint64 word — confirm.
	MAX_CPU_CORE_BITMASK_COUNT = uint(1024 / 8 / 8)
)
View Source
const (
	// PerfStateMax represents the highest performance state (P0)
	PerfStateMax = 0

	// PerfStateMin represents the lowest performance state (P15)
	PerfStateMin = 15

	// PerfStateUnknown represents an unknown performance state
	PerfStateUnknown = 32
)
View Source
const (
	// MAX_NUM_DEVICES represents the maximum number of GPU devices supported
	MAX_NUM_DEVICES = uint(C.DCGM_MAX_NUM_DEVICES)

	// MAX_HIERARCHY_INFO represents the maximum size of the MIG hierarchy information
	MAX_HIERARCHY_INFO = uint(C.DCGM_MAX_HIERARCHY_INFO)
)
View Source
const (
	// DbePolicy represents a Double-bit ECC error policy condition
	DbePolicy = policyCondition("Double-bit ECC error")

	// PCIePolicy represents a PCI error policy condition
	PCIePolicy = policyCondition("PCI error")

	// MaxRtPgPolicy represents a Maximum Retired Pages Limit policy condition
	MaxRtPgPolicy = policyCondition("Max Retired Pages Limit")

	// ThermalPolicy represents a Thermal Limit policy condition
	ThermalPolicy = policyCondition("Thermal Limit")

	// PowerPolicy represents a Power Limit policy condition
	PowerPolicy = policyCondition("Power Limit")

	// NvlinkPolicy represents an NVLink error policy condition
	NvlinkPolicy = policyCondition("Nvlink Error")

	// XidPolicy represents an XID error policy condition
	XidPolicy = policyCondition("XID Error")
)

Policy condition types

View Source
const (
	// DCGM_NVSDM_MOCK_YAML is the name of the environment variable for enabling NVSDM mock configuration
	DCGM_NVSDM_MOCK_YAML = "DCGM_NVSDM_MOCK_YAML"
	// DCGM_DBG_FILE is the name of the environment variable that makes DCGM write debug logs to a specific file
	DCGM_DBG_FILE = "__DCGM_DBG_FILE"
	// DCGM_DBG_LVL is the name of the environment variable that sets the DCGM logging level
	DCGM_DBG_LVL = "__DCGM_DBG_LVL"
)
View Source
const (
	// DCGM_FV_FLAG_LIVE_DATA is a flag for the DCGM fields.
	// NOTE(review): presumably intended for the flags argument of EntitiesGetLatestValues — confirm.
	DCGM_FV_FLAG_LIVE_DATA = uint(0x00000001)
)

DCGM_FV_FLAG_LIVE_DATA is a flag for the DCGM fields.

View Source
const (
	// DCGM_GROUP_MAX_ENTITIES represents the maximum number of entities allowed in a group.
	// Its value is taken from the C constant DCGM_GROUP_MAX_ENTITIES_V2.
	DCGM_GROUP_MAX_ENTITIES int = C.DCGM_GROUP_MAX_ENTITIES_V2
)

DCGM_GROUP_MAX_ENTITIES represents the maximum number of entities allowed in a group

View Source
const DIAG_RESULT_STRING_SIZE = 1024

DIAG_RESULT_STRING_SIZE represents the maximum size of diagnostic result strings

Variables

View Source
var ErrInvalidMode = errors.New("invalid mode")

ErrInvalidMode represents an error indicating that an invalid mode was used

Functions

func AddEntityToGroup

func AddEntityToGroup(groupID GroupHandle, entityGroupID Field_Entity_Group, entityID uint) (err error)

AddEntityToGroup adds an entity to an existing group

func AddLinkEntityToGroup

func AddLinkEntityToGroup(groupID GroupHandle, index, parentID uint) (err error)

AddLinkEntityToGroup adds a link entity to the group

func AddToGroup

func AddToGroup(groupID GroupHandle, gpuID uint) (err error)

AddToGroup adds a GPU to an existing group

func CreateFakeEntities

func CreateFakeEntities(entities []MigHierarchyInfo) ([]uint, error)

CreateFakeEntities creates test entities with the specified MIG hierarchy information. This function is intended for testing purposes only. Returns a slice of Entity IDs for the created entities and any error encountered.

func DestroyGroup

func DestroyGroup(groupID GroupHandle) (err error)

DestroyGroup destroys an existing GPU group

func FieldGroupDestroy

func FieldGroupDestroy(fieldsGroup FieldHandle) (err error)

FieldGroupDestroy destroys a previously created field group. Returns an error if the group cannot be destroyed.

func FieldsInit

func FieldsInit() int

FieldsInit initializes the DCGM fields module. Returns an integer status code.

func FieldsTerm

func FieldsTerm() int

FieldsTerm terminates the DCGM fields module. Returns an integer status code.

func FindFirstNonAsciiIndex

func FindFirstNonAsciiIndex(value [4096]byte) int

FindFirstNonAsciiIndex returns the index of the first non-ASCII character in the byte array. Returns 4096 if no non-ASCII character is found.

func Fv2_Blob

func Fv2_Blob(fv FieldValue_v2) [4096]byte

Fv2_Blob returns the raw field value of a FieldValue_v2 as a byte array.

func Fv2_String

func Fv2_String(fv FieldValue_v2) string

Fv2_String returns the string value of a FieldValue_v2.

func GetAllDeviceCount

func GetAllDeviceCount() (uint, error)

GetAllDeviceCount returns the count of all GPUs in the system

func GetEntityGroupEntities

func GetEntityGroupEntities(entityGroup Field_Entity_Group) ([]uint, error)

GetEntityGroupEntities returns all entities of the specified group type

func GetSupportedDevices

func GetSupportedDevices() ([]uint, error)

GetSupportedDevices returns a list of DCGM-supported GPU IDs

func HealthSet

func HealthSet(groupID GroupHandle, systems HealthSystem) (err error)

HealthSet enables the DCGM health check system for the given systems. It configures which health watch systems should be monitored for the specified group.

func Init

func Init(m mode, args ...string) (cleanup func(), err error)

Init starts DCGM in the specified mode. Mode can be: Embedded (start hostengine within this process), Standalone (connect to an already running nv-hostengine), or StartHostengine (start and connect to nv-hostengine, terminate before exiting). Returns a cleanup function and any error encountered.

func InjectFieldValue

func InjectFieldValue(gpu uint, fieldID Short, fieldType uint, status int, ts int64, value any) error

InjectFieldValue injects a test value for a specific field into DCGM's field manager. This function is intended for testing purposes only.

Parameters:

  • gpu: The GPU ID to inject the field value for
  • fieldID: The DCGM field identifier
  • fieldType: The type of the field (e.g., DCGM_FT_INT64, DCGM_FT_DOUBLE)
  • status: The status code for the field
  • ts: The timestamp for the field value
  • value: The value to inject (must match fieldType)

Returns an error if the injection fails

func IsCurrentField

func IsCurrentField(fieldName string) bool

IsCurrentField returns true if the given field name is a current field

func IsInt32Blank

func IsInt32Blank(value int) bool

IsInt32Blank checks if an integer value represents DCGM's "blank" or sentinel value (0x7ffffff0). These values indicate that no valid data is available for the field.

func IsInt64Blank

func IsInt64Blank(value int64) bool

IsInt64Blank checks if an integer value represents DCGM's "blank" or sentinel value (0x7ffffffffffffff0). These values indicate that no valid data is available for the field.

func IsLegacyField

func IsLegacyField(fieldName string) bool

IsLegacyField returns true if the given field name is a legacy field

func ListenForPolicyViolations

func ListenForPolicyViolations(ctx context.Context, typ ...policyCondition) (<-chan PolicyViolation, error)

ListenForPolicyViolations sets up monitoring for the specified policy conditions on all GPUs. Returns a channel that receives policy violations and any error encountered.

func ListenForPolicyViolationsForGroup

func ListenForPolicyViolationsForGroup(ctx context.Context, group GroupHandle, typ ...policyCondition) (<-chan PolicyViolation, error)

ListenForPolicyViolationsForGroup sets up policy monitoring for the specified GPU group. Returns a channel that receives policy violations and any error encountered.

func Shutdown

func Shutdown() (err error)

Shutdown stops DCGM and destroys all connections. Returns an error if DCGM is not initialized.

func UpdateAllFields

func UpdateAllFields() error

UpdateAllFields forces an update of all field values. Returns an error if the update fails.

func ViolationRegistration

func ViolationRegistration(data unsafe.Pointer) int

ViolationRegistration is a Go callback function for dcgmPolicyRegister(), wrapped in C.violationNotify().

func WatchFieldsWithGroup

func WatchFieldsWithGroup(fieldsGroup FieldHandle, group GroupHandle) error

WatchFieldsWithGroup starts monitoring fields using default parameters. fieldsGroup is the handle of the field group to watch. group is the group handle to associate with the watch. Returns an error if the watch operation fails.

func WatchFieldsWithGroupEx

func WatchFieldsWithGroupEx(
	fieldsGroup FieldHandle, group GroupHandle, updateFreq int64, maxKeepAge float64, maxKeepSamples int32,
) error

WatchFieldsWithGroupEx starts monitoring fields with custom parameters. fieldsGroup is the handle of the field group to watch. group is the group handle to associate with the watch. updateFreq is the update frequency in microseconds. maxKeepAge is the maximum age of samples to keep in seconds. maxKeepSamples is the maximum number of samples to keep. Returns an error if the watch operation fails.

Types

type CPUHierarchyCPU_v1

type CPUHierarchyCPU_v1 struct {
	// CPUID is the unique identifier for this CPU
	CPUID uint
	// OwnedCores is a bitmask array representing the cores owned by this CPU
	OwnedCores []uint64
}

CPUHierarchyCPU_v1 represents information about a single CPU and its owned cores

type CPUHierarchy_v1

type CPUHierarchy_v1 struct {
	// Version is the version number of the hierarchy structure
	Version uint
	// NumCPUs is the number of CPUs in the system
	NumCPUs uint
	// CPUs contains information about each CPU in the system
	CPUs [MAX_NUM_CPUS]CPUHierarchyCPU_v1
}

CPUHierarchy_v1 represents version 1 of the CPU hierarchy information

func GetCPUHierarchy

func GetCPUHierarchy() (hierarchy CPUHierarchy_v1, err error)

GetCPUHierarchy retrieves the CPU hierarchy information from DCGM

type ClockInfo

type ClockInfo struct {
	Cores  int64 // MHz
	Memory int64 // MHz
}

ClockInfo contains GPU clock frequencies

type DbePolicyCondition

type DbePolicyCondition struct {
	// Location specifies where the ECC error occurred
	Location string
	// NumErrors indicates the number of errors detected
	NumErrors uint
}

DbePolicyCondition contains details about a Double-bit ECC error

type Device

type Device struct {
	GPU           uint
	DCGMSupported string
	UUID          string
	Power         uint // W
	PCI           PCIInfo
	Identifiers   DeviceIdentifiers
	Topology      []P2PLink
	CPUAffinity   string
}

Device represents a GPU device and its properties

func GetDeviceInfo

func GetDeviceInfo(gpuID uint) (Device, error)

GetDeviceInfo returns detailed information about the specified GPU

type DeviceHealth

type DeviceHealth struct {
	// GPU is the ID of the GPU device
	GPU uint
	// Status indicates the overall health status of the GPU
	Status string
	// Watches contains the status of individual health watch systems
	Watches []SystemWatch
}

DeviceHealth represents the health status of a GPU device

func HealthCheckByGpuId

func HealthCheckByGpuId(gpuID uint) (DeviceHealth, error)

HealthCheckByGpuId performs a health check on the specified GPU

type DeviceIdentifiers

type DeviceIdentifiers struct {
	Brand               string
	Model               string
	Serial              string
	Vbios               string
	InforomImageVersion string
	DriverVersion       string
}

DeviceIdentifiers contains various identification information for a GPU device

type DeviceStatus

type DeviceStatus struct {
	Power       float64 // W
	Temperature int64   // °C
	Utilization UtilizationInfo
	Memory      MemoryInfo
	Clocks      ClockInfo
	PCI         PCIStatusInfo
	Performance PerfState
	FanSpeed    int64 // %
}

DeviceStatus contains comprehensive GPU device status information

func GetDeviceStatus

func GetDeviceStatus(gpuID uint) (DeviceStatus, error)

GetDeviceStatus returns current status information about the specified GPU

type DiagErrorDetail

type DiagErrorDetail struct {
	// Message contains a human-readable description of the error
	Message string
	// Code identifies the specific type of error
	Code HealthCheckErrorCode
}

DiagErrorDetail contains detailed information about a health check error

type DiagResult

type DiagResult struct {
	// Status indicates the test result: "pass", "fail", "warn", "skip", or "notrun"
	Status string
	// TestName is the name of the diagnostic test that was run
	TestName string
	// TestOutput contains any additional output or messages from the test
	TestOutput string
	// ErrorCode is the numeric error code if the test failed
	ErrorCode uint
	// ErrorMessage contains a detailed error message if the test failed
	ErrorMessage string
}

DiagResult represents the result of a single diagnostic test

type DiagResults

type DiagResults struct {
	// Software contains the results of software-related diagnostic tests
	Software []DiagResult
}

DiagResults contains the results of all diagnostic tests

func RunDiag

func RunDiag(diagType DiagType, groupID GroupHandle) (DiagResults, error)

RunDiag runs diagnostic tests on a group of GPUs with the specified diagnostic level. Parameters:

  • diagType: The type/level of diagnostic test to run (Quick, Medium, Long, or Extended)
  • groupID: The group of GPUs to run diagnostics on

Returns:

  • DiagResults containing the results of all diagnostic tests
  • error if the diagnostics failed to run

type DiagType

type DiagType int

DiagType represents the type of diagnostic test to run

const (
	// DiagQuick represents a quick diagnostic test that performs basic health checks
	DiagQuick DiagType = 1

	// DiagMedium represents a medium-length diagnostic test that performs more comprehensive checks
	DiagMedium DiagType = 2

	// DiagLong represents a long diagnostic test that performs extensive health checks
	DiagLong DiagType = 3

	// DiagExtended represents an extended diagnostic test that performs the most thorough system checks
	DiagExtended DiagType = 4
)

type ECCErrorsInfo

type ECCErrorsInfo struct {
	SingleBit int64
	DoubleBit int64
}

ECCErrorsInfo contains ECC memory error counts

type Error

type Error struct {
	Code C.dcgmReturn_t // dcgmReturn_t value of error
	// contains filtered or unexported fields
}

Error represents an error returned by the DCGM library

func (*Error) Error

func (e *Error) Error() string

type FieldHandle

type FieldHandle struct {
	// contains filtered or unexported fields
}

FieldHandle represents a handle to a DCGM field group

func FieldGroupCreate

func FieldGroupCreate(fieldsGroupName string, fields []Short) (fieldsId FieldHandle, err error)

FieldGroupCreate creates a new field group with the specified fields. fieldsGroupName is the name for the new group. fields is a slice of field IDs to include in the group. Returns the field group handle and any error encountered.

func (*FieldHandle) GetHandle

func (f *FieldHandle) GetHandle() uintptr

GetHandle returns the internal DCGM field group handle as a uintptr

func (*FieldHandle) SetHandle

func (f *FieldHandle) SetHandle(val uintptr)

SetHandle sets the internal DCGM field group handle to the provided value

type FieldMeta

type FieldMeta struct {
	FieldID     Short              // Unique identifier for the field
	FieldType   byte               // Type of the field (e.g., integer, float, string)
	Size        byte               // Size of the field in bytes
	Tag         string             // Human-readable tag/name for the field
	Scope       int                // Scope of the field
	NvmlFieldID int                // Corresponding NVML field identifier
	EntityLevel Field_Entity_Group // Entity level/group this field belongs to
}

FieldMeta represents metadata about a DCGM field, including its identifier, type, size, and other attributes. This struct is used to describe the characteristics and properties of fields that can be monitored or queried through DCGM.

func FieldGetByID

func FieldGetByID(fieldId Short) FieldMeta

FieldGetByID retrieves field metadata for the specified field ID.

func ToFieldMeta

func ToFieldMeta(fieldInfo C.dcgm_field_meta_p) FieldMeta

ToFieldMeta converts a C DCGM field metadata structure to a Go FieldMeta struct.

type FieldValue_v1

type FieldValue_v1 struct {
	Version   uint       // Version number carried with the value; NOTE(review): exact meaning not shown here — confirm
	FieldID   Short      // DCGM field identifier
	FieldType uint       // Type of the field (e.g., DCGM_FT_INT64, DCGM_FT_DOUBLE)
	Status    int        // Status code for the field
	TS        int64      // Timestamp for the field value
	Value     [4096]byte // Raw field value; read it through Int64, Float64, String or Blob
}

FieldValue_v1 represents a field value in version 1

func EntityGetLatestValues

func EntityGetLatestValues(entityGroup Field_Entity_Group, entityId uint, fields []Short) ([]FieldValue_v1, error)

EntityGetLatestValues retrieves the latest values for specified fields of any entity. entityGroup specifies the type of entity to query. entityId is the ID of the entity. fields is a slice of field IDs to retrieve. Returns a slice of field values and any error encountered.

func GetLatestValuesForFields

func GetLatestValuesForFields(gpu uint, fields []Short) ([]FieldValue_v1, error)

GetLatestValuesForFields retrieves the most recent values for the specified fields. gpu is the ID of the GPU to query. fields is a slice of field IDs to retrieve. Returns a slice of field values and any error encountered.

func LinkGetLatestValues

func LinkGetLatestValues(index, parentId uint, fields []Short) ([]FieldValue_v1, error)

LinkGetLatestValues retrieves the latest values for specified fields of a link entity. index is the link index. parentId is the ID of the parent entity. fields is a slice of field IDs to retrieve. Returns a slice of field values and any error encountered.

func (FieldValue_v1) Blob

func (fv FieldValue_v1) Blob() [4096]byte

Blob returns the raw field value as a byte array.

func (FieldValue_v1) Float64

func (fv FieldValue_v1) Float64() float64

Float64 returns the field value as a float64.

func (FieldValue_v1) Int64

func (fv FieldValue_v1) Int64() int64

Int64 returns the field value as an int64.

func (FieldValue_v1) String

func (fv FieldValue_v1) String() string

String returns the field value as a string.

type FieldValue_v2

type FieldValue_v2 struct {
	Version       uint               // Version number carried with the value; NOTE(review): exact meaning not shown here — confirm
	EntityGroupId Field_Entity_Group // Type of the entity this value belongs to
	EntityID      uint               // Identifier of the entity this value belongs to
	FieldID       Short              // DCGM field identifier
	FieldType     uint               // Type of the field (e.g., DCGM_FT_INT64, DCGM_FT_DOUBLE)
	Status        int                // Status code for the field
	TS            int64              // Timestamp for the field value
	Value         [4096]byte         // Raw field value; read it through Int64, Float64, String or Blob
	StringValue   *string            // NOTE(review): presumably set only for string-typed fields and nil otherwise — confirm
}

FieldValue_v2 represents a field value in version 2

func EntitiesGetLatestValues

func EntitiesGetLatestValues(entities []GroupEntityPair, fields []Short, flags uint) ([]FieldValue_v2, error)

EntitiesGetLatestValues retrieves the latest values for specified fields across multiple entities. entities is a slice of entity pairs to query. fields is a slice of field IDs to retrieve. flags specify additional options for the query. Returns a slice of field values and any error encountered.

func GetValuesSince

func GetValuesSince(gpuGroup GroupHandle, fieldGroup FieldHandle, sinceTime time.Time) ([]FieldValue_v2, time.Time, error)

GetValuesSince reads and returns field values for a specified group of entities, such as GPUs, that have been updated since a given timestamp. It allows for targeted data retrieval based on time criteria.

gpuGroup is a GroupHandle that identifies the group of entities to operate on. It can be obtained from CreateGroup for a specific group of GPUs, or use GroupAllGPUs() to target all GPUs.

fieldGroup is a FieldHandle representing the group of fields for which data is requested.

sinceTime is a time.Time value representing the timestamp from which to request updated values. A zero value (time.Time{}) requests all available data.

Returns []FieldValue_v2 slice containing the requested field values, a time.Time indicating the time of the latest data retrieval, and an error if there is any issue during the operation.

func (FieldValue_v2) Blob

func (fv FieldValue_v2) Blob() [4096]byte

Blob returns the raw field value as a byte array.

func (FieldValue_v2) Float64

func (fv FieldValue_v2) Float64() float64

Float64 returns the field value as a float64.

func (FieldValue_v2) Int64

func (fv FieldValue_v2) Int64() int64

Int64 returns the field value as an int64.

func (FieldValue_v2) String

func (fv FieldValue_v2) String() string

String returns the field value as a string.

type Field_Entity_Group

type Field_Entity_Group uint

Field_Entity_Group represents the type of DCGM entity

const (
	// FE_NONE represents no entity type
	FE_NONE Field_Entity_Group = iota
	// FE_GPU represents a GPU device entity
	FE_GPU
	// FE_VGPU represents a virtual GPU entity
	FE_VGPU
	// FE_SWITCH represents an NVSwitch entity
	FE_SWITCH
	// FE_GPU_I represents a GPU instance entity
	FE_GPU_I
	// FE_GPU_CI represents a GPU compute instance entity
	FE_GPU_CI
	// FE_LINK represents an NVLink entity
	FE_LINK
	// FE_CPU represents a CPU entity
	FE_CPU
	// FE_CPU_CORE represents a CPU core entity
	FE_CPU_CORE
	// FE_COUNT represents the total number of entity types
	FE_COUNT
)

func (Field_Entity_Group) String

func (e Field_Entity_Group) String() string

String returns a string representation of the Field_Entity_Group

type GroupEntityPair

type GroupEntityPair struct {
	// EntityGroupId specifies the type of the entity
	EntityGroupId Field_Entity_Group
	// EntityId is the unique identifier for this entity
	EntityId uint
}

GroupEntityPair represents a DCGM entity and its group identifier

type GroupHandle

type GroupHandle struct {
	// contains filtered or unexported fields
}

GroupHandle represents a handle to a DCGM GPU group

func CreateGroup

func CreateGroup(groupName string) (goGroupId GroupHandle, err error)

CreateGroup creates a new empty GPU group with the specified name

func CreateGroupWithContext

func CreateGroupWithContext(ctx context.Context, groupName string) (GroupHandle, error)

CreateGroupWithContext creates a new group with a context

func GroupAllGPUs

func GroupAllGPUs() GroupHandle

GroupAllGPUs returns a GroupHandle representing all GPUs in the system

func NewDefaultGroup

func NewDefaultGroup(groupName string) (GroupHandle, error)

NewDefaultGroup creates a new group with default GPUs and the specified name

func WatchFields

func WatchFields(gpuID uint, fieldsGroup FieldHandle, groupName string) (groupId GroupHandle, err error)

WatchFields starts monitoring the specified fields for a GPU. gpuID is the ID of the GPU to monitor. fieldsGroup is the handle of the field group to watch. groupName is a name for the watch group. Returns a group handle and any error encountered.

func WatchPidFields

func WatchPidFields() (GroupHandle, error)

WatchPidFields configures DCGM to start recording stats for GPU processes. Must be called before GetProcessInfo.

func WatchPidFieldsEx

func WatchPidFieldsEx(updateFreq, maxKeepAge time.Duration, maxKeepSamples int, gpus ...uint) (GroupHandle, error)

WatchPidFieldsEx is the same as WatchPidFields, but allows for modifying the update frequency, max samples, max sample age, and the GPUs on which to enable watches.

func (*GroupHandle) GetHandle

func (g *GroupHandle) GetHandle() uintptr

GetHandle returns the internal group handle value

func (*GroupHandle) SetHandle

func (g *GroupHandle) SetHandle(val uintptr)

SetHandle sets the internal group handle value

type GroupInfo

type GroupInfo struct {
	Version    uint32
	GroupName  string
	EntityList []GroupEntityPair
}

GroupInfo contains information about a DCGM group

func GetGroupInfo

func GetGroupInfo(groupID GroupHandle) (*GroupInfo, error)

GetGroupInfo retrieves information about a DCGM group

type HealthCheckErrorCode

type HealthCheckErrorCode uint

HealthCheckErrorCode enumerates the error codes reported by passive and active health checks.

// Health check error codes. Every value is assigned explicitly (no iota).
//
// NOTE(review): two pairs of constants share a numeric value, so they cannot
// be told apart at runtime or used as distinct switch cases:
//   - DCGM_FR_VOLATILE_SBE_DETECTED_TS and DCGM_FR_PENDING_PAGE_RETIREMENTS are both 6
//   - DCGM_FR_PERSISTENCE_MODE and DCGM_FR_BAD_NVLINK_ENV are both 29
//
// Confirm the intended values against the DCGM C header (dcgm_errors.h).
const (
	// DCGM_FR_OK No error
	DCGM_FR_OK HealthCheckErrorCode = 0
	// DCGM_FR_UNKNOWN Unknown error code
	DCGM_FR_UNKNOWN HealthCheckErrorCode = 1
	// DCGM_FR_UNRECOGNIZED Unrecognized error code
	DCGM_FR_UNRECOGNIZED HealthCheckErrorCode = 2
	// DCGM_FR_PCI_REPLAY_RATE Unacceptable rate of PCI errors
	DCGM_FR_PCI_REPLAY_RATE HealthCheckErrorCode = 3
	// DCGM_FR_VOLATILE_DBE_DETECTED Unacceptable rate of volatile double bit errors
	DCGM_FR_VOLATILE_DBE_DETECTED HealthCheckErrorCode = 4
	// DCGM_FR_VOLATILE_SBE_DETECTED Unacceptable rate of volatile single bit errors
	DCGM_FR_VOLATILE_SBE_DETECTED HealthCheckErrorCode = 5
	// DCGM_FR_VOLATILE_SBE_DETECTED_TS Unacceptable rate of volatile single bit errors with a timestamp
	// NOTE(review): has the same value (6) as DCGM_FR_PENDING_PAGE_RETIREMENTS.
	DCGM_FR_VOLATILE_SBE_DETECTED_TS HealthCheckErrorCode = 6
	// DCGM_FR_PENDING_PAGE_RETIREMENTS Pending page retirements detected
	// NOTE(review): has the same value (6) as DCGM_FR_VOLATILE_SBE_DETECTED_TS.
	DCGM_FR_PENDING_PAGE_RETIREMENTS HealthCheckErrorCode = 6
	// DCGM_FR_RETIRED_PAGES_LIMIT Unacceptable total page retirements detected
	DCGM_FR_RETIRED_PAGES_LIMIT HealthCheckErrorCode = 7
	// DCGM_FR_RETIRED_PAGES_DBE_LIMIT Unacceptable total page retirements due to uncorrectable errors
	DCGM_FR_RETIRED_PAGES_DBE_LIMIT HealthCheckErrorCode = 8
	// DCGM_FR_CORRUPT_INFOROM Corrupt inforom found
	DCGM_FR_CORRUPT_INFOROM HealthCheckErrorCode = 9
	// DCGM_FR_CLOCK_THROTTLE_THERMAL Clocks being throttled due to overheating
	DCGM_FR_CLOCK_THROTTLE_THERMAL HealthCheckErrorCode = 10
	// DCGM_FR_POWER_UNREADABLE Cannot get a reading for power from NVML
	DCGM_FR_POWER_UNREADABLE HealthCheckErrorCode = 11
	// DCGM_FR_CLOCK_THROTTLE_POWER Clock being throttled due to power restrictions
	DCGM_FR_CLOCK_THROTTLE_POWER HealthCheckErrorCode = 12
	// DCGM_FR_NVLINK_ERROR_THRESHOLD Unacceptable rate of NVLink errors
	DCGM_FR_NVLINK_ERROR_THRESHOLD HealthCheckErrorCode = 13
	// DCGM_FR_NVLINK_DOWN NVLink is down
	DCGM_FR_NVLINK_DOWN HealthCheckErrorCode = 14
	// DCGM_FR_NVSWITCH_FATAL_ERROR Fatal errors on the NVSwitch
	DCGM_FR_NVSWITCH_FATAL_ERROR HealthCheckErrorCode = 15
	// DCGM_FR_NVSWITCH_NON_FATAL_ERROR Non-fatal errors on the NVSwitch
	DCGM_FR_NVSWITCH_NON_FATAL_ERROR HealthCheckErrorCode = 16
	// DCGM_FR_NVSWITCH_DOWN NVSwitch is down
	DCGM_FR_NVSWITCH_DOWN HealthCheckErrorCode = 17
	// DCGM_FR_NO_ACCESS_TO_FILE Cannot access a file
	DCGM_FR_NO_ACCESS_TO_FILE HealthCheckErrorCode = 18
	// DCGM_FR_NVML_API Error occurred on an NVML API - NOT USED: DEPRECATED
	DCGM_FR_NVML_API HealthCheckErrorCode = 19
	// DCGM_FR_DEVICE_COUNT_MISMATCH Device count mismatch
	DCGM_FR_DEVICE_COUNT_MISMATCH HealthCheckErrorCode = 20
	// DCGM_FR_BAD_PARAMETER Bad parameter passed to API
	DCGM_FR_BAD_PARAMETER HealthCheckErrorCode = 21
	// DCGM_FR_CANNOT_OPEN_LIB Cannot open a library that must be accessed
	DCGM_FR_CANNOT_OPEN_LIB HealthCheckErrorCode = 22
	// DCGM_FR_DENYLISTED_DRIVER A driver on the denylist (nouveau) is active
	DCGM_FR_DENYLISTED_DRIVER HealthCheckErrorCode = 23
	// DCGM_FR_NVML_LIB_BAD NVML library is missing expected functions - NOT USED: DEPRECATED
	DCGM_FR_NVML_LIB_BAD HealthCheckErrorCode = 24
	// DCGM_FR_GRAPHICS_PROCESSES presumably indicates that graphics processes are active on the GPU
	// NOTE(review): the original comment had no description; confirm against dcgm_errors.h.
	DCGM_FR_GRAPHICS_PROCESSES HealthCheckErrorCode = 25
	// DCGM_FR_HOSTENGINE_CONN Bad connection to nv-hostengine - NOT USED: DEPRECATED
	DCGM_FR_HOSTENGINE_CONN HealthCheckErrorCode = 26
	// DCGM_FR_FIELD_QUERY Field query failed
	DCGM_FR_FIELD_QUERY HealthCheckErrorCode = 27
	// DCGM_FR_BAD_CUDA_ENV The environment has variables that hurt CUDA
	DCGM_FR_BAD_CUDA_ENV HealthCheckErrorCode = 28
	// DCGM_FR_PERSISTENCE_MODE Persistence mode is disabled
	// NOTE(review): has the same value (29) as DCGM_FR_BAD_NVLINK_ENV.
	DCGM_FR_PERSISTENCE_MODE HealthCheckErrorCode = 29
	// DCGM_FR_BAD_NVLINK_ENV The environment has variables that hurt NVLink
	// NOTE(review): has the same value (29) as DCGM_FR_PERSISTENCE_MODE.
	DCGM_FR_BAD_NVLINK_ENV HealthCheckErrorCode = 29
	// DCGM_FR_LOW_BANDWIDTH The bandwidth is unacceptably low
	DCGM_FR_LOW_BANDWIDTH HealthCheckErrorCode = 30
	// DCGM_FR_HIGH_LATENCY Latency is too high
	DCGM_FR_HIGH_LATENCY HealthCheckErrorCode = 31
	// DCGM_FR_CANNOT_GET_FIELD_TAG Cannot find a tag for a field
	DCGM_FR_CANNOT_GET_FIELD_TAG HealthCheckErrorCode = 32
	// DCGM_FR_FIELD_VIOLATION The value for the specified error field is above 0
	DCGM_FR_FIELD_VIOLATION HealthCheckErrorCode = 33
	// DCGM_FR_FIELD_THRESHOLD The value for the specified field is above the threshold
	DCGM_FR_FIELD_THRESHOLD HealthCheckErrorCode = 34
	// DCGM_FR_FIELD_VIOLATION_DBL The value for the specified error field is above 0
	DCGM_FR_FIELD_VIOLATION_DBL HealthCheckErrorCode = 35
	// DCGM_FR_FIELD_THRESHOLD_DBL The value for the specified field is above the threshold
	DCGM_FR_FIELD_THRESHOLD_DBL HealthCheckErrorCode = 36
	// DCGM_FR_UNSUPPORTED_FIELD_TYPE Field type cannot be supported
	DCGM_FR_UNSUPPORTED_FIELD_TYPE HealthCheckErrorCode = 37
	// DCGM_FR_FIELD_THRESHOLD_TS The value for the specified field is above the threshold
	DCGM_FR_FIELD_THRESHOLD_TS HealthCheckErrorCode = 38
	// DCGM_FR_FIELD_THRESHOLD_TS_DBL The value for the specified field is above the threshold
	DCGM_FR_FIELD_THRESHOLD_TS_DBL HealthCheckErrorCode = 39
	// DCGM_FR_THERMAL_VIOLATIONS Thermal violations detected
	DCGM_FR_THERMAL_VIOLATIONS HealthCheckErrorCode = 40
	// DCGM_FR_THERMAL_VIOLATIONS_TS Thermal violations detected with a timestamp
	DCGM_FR_THERMAL_VIOLATIONS_TS HealthCheckErrorCode = 41
	// DCGM_FR_TEMP_VIOLATION presumably reports a temperature violation
	// NOTE(review): the original description ("Non-benign clock throttling is occurring")
	// was identical to DCGM_FR_THROTTLING_VIOLATION's; confirm against dcgm_errors.h.
	DCGM_FR_TEMP_VIOLATION HealthCheckErrorCode = 42
	// DCGM_FR_THROTTLING_VIOLATION Non-benign clock throttling is occurring
	DCGM_FR_THROTTLING_VIOLATION HealthCheckErrorCode = 43
	// DCGM_FR_INTERNAL An internal error was detected
	DCGM_FR_INTERNAL HealthCheckErrorCode = 44
	// DCGM_FR_PCIE_GENERATION PCIe generation is too low
	DCGM_FR_PCIE_GENERATION HealthCheckErrorCode = 45
	// DCGM_FR_PCIE_WIDTH PCIe width is too low
	DCGM_FR_PCIE_WIDTH HealthCheckErrorCode = 46
	// DCGM_FR_ABORTED Test was aborted by a user signal
	DCGM_FR_ABORTED HealthCheckErrorCode = 47
	// DCGM_FR_TEST_DISABLED Test was disabled by a user signal
	DCGM_FR_TEST_DISABLED HealthCheckErrorCode = 48
	// DCGM_FR_CANNOT_GET_STAT Cannot get telemetry for a needed value
	DCGM_FR_CANNOT_GET_STAT HealthCheckErrorCode = 49
	// DCGM_FR_STRESS_LEVEL Stress level is too low (bad performance)
	DCGM_FR_STRESS_LEVEL HealthCheckErrorCode = 50
	// DCGM_FR_CUDA_API presumably indicates an error returned by a CUDA API call
	// NOTE(review): the original comment had no description; confirm against dcgm_errors.h.
	DCGM_FR_CUDA_API HealthCheckErrorCode = 51
	// DCGM_FR_FAULTY_MEMORY Faulty memory detected on this GPU
	DCGM_FR_FAULTY_MEMORY HealthCheckErrorCode = 52
	// DCGM_FR_CANNOT_SET_WATCHES Unable to set field watches in DCGM - NOT USED: DEPRECATED
	DCGM_FR_CANNOT_SET_WATCHES HealthCheckErrorCode = 53
	// DCGM_FR_CUDA_UNBOUND CUDA context is no longer bound
	DCGM_FR_CUDA_UNBOUND HealthCheckErrorCode = 54
	// DCGM_FR_ECC_DISABLED ECC memory is disabled right now
	DCGM_FR_ECC_DISABLED HealthCheckErrorCode = 55
	// DCGM_FR_MEMORY_ALLOC Cannot allocate memory on the GPU
	DCGM_FR_MEMORY_ALLOC HealthCheckErrorCode = 56
	// DCGM_FR_CUDA_DBE CUDA detected unrecoverable double-bit error
	DCGM_FR_CUDA_DBE HealthCheckErrorCode = 57
	// DCGM_FR_MEMORY_MISMATCH Memory error detected
	DCGM_FR_MEMORY_MISMATCH HealthCheckErrorCode = 58
	// DCGM_FR_CUDA_DEVICE No CUDA device discoverable for existing GPU
	DCGM_FR_CUDA_DEVICE HealthCheckErrorCode = 59
	// DCGM_FR_ECC_UNSUPPORTED ECC memory is unsupported by this SKU
	DCGM_FR_ECC_UNSUPPORTED HealthCheckErrorCode = 60
	// DCGM_FR_ECC_PENDING ECC memory is in a pending state - NOT USED: DEPRECATED
	DCGM_FR_ECC_PENDING HealthCheckErrorCode = 61
	// DCGM_FR_MEMORY_BANDWIDTH Memory bandwidth is too low
	DCGM_FR_MEMORY_BANDWIDTH HealthCheckErrorCode = 62
	// DCGM_FR_TARGET_POWER The target power is too low
	DCGM_FR_TARGET_POWER HealthCheckErrorCode = 63
	// DCGM_FR_API_FAIL The specified API call failed
	DCGM_FR_API_FAIL HealthCheckErrorCode = 64
	// DCGM_FR_API_FAIL_GPU The specified API call failed for the specified GPU
	DCGM_FR_API_FAIL_GPU HealthCheckErrorCode = 65
	// DCGM_FR_CUDA_CONTEXT Cannot create a CUDA context on this GPU
	DCGM_FR_CUDA_CONTEXT HealthCheckErrorCode = 66
	// DCGM_FR_DCGM_API DCGM API failure
	DCGM_FR_DCGM_API HealthCheckErrorCode = 67
	// DCGM_FR_CONCURRENT_GPUS Need multiple GPUs to run this test
	DCGM_FR_CONCURRENT_GPUS HealthCheckErrorCode = 68
	// DCGM_FR_TOO_MANY_ERRORS More errors than fit in the return struct - NOT USED: DEPRECATED
	DCGM_FR_TOO_MANY_ERRORS HealthCheckErrorCode = 69
	// DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD NVLink CRC error threshold violation
	DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD HealthCheckErrorCode = 70
	// DCGM_FR_NVLINK_ERROR_CRITICAL NVLink error for a field that should always be 0
	DCGM_FR_NVLINK_ERROR_CRITICAL HealthCheckErrorCode = 71
	// DCGM_FR_ENFORCED_POWER_LIMIT The enforced power limit is too low to hit the target
	DCGM_FR_ENFORCED_POWER_LIMIT HealthCheckErrorCode = 72
	// DCGM_FR_MEMORY_ALLOC_HOST Cannot allocate memory on the host
	DCGM_FR_MEMORY_ALLOC_HOST HealthCheckErrorCode = 73
	// DCGM_FR_GPU_OP_MODE Bad GPU operating mode for running plugin - NOT USED: DEPRECATED
	DCGM_FR_GPU_OP_MODE HealthCheckErrorCode = 74
	// DCGM_FR_NO_MEMORY_CLOCKS No memory clocks with the needed MHz found - NOT USED: DEPRECATED
	DCGM_FR_NO_MEMORY_CLOCKS HealthCheckErrorCode = 75
	// DCGM_FR_NO_GRAPHICS_CLOCKS No graphics clocks with the needed MHz found - NOT USED: DEPRECATED
	DCGM_FR_NO_GRAPHICS_CLOCKS HealthCheckErrorCode = 76
	// DCGM_FR_HAD_TO_RESTORE_STATE Note that we had to restore a GPU's state
	DCGM_FR_HAD_TO_RESTORE_STATE HealthCheckErrorCode = 77
	// DCGM_FR_L1TAG_UNSUPPORTED L1TAG test is unsupported by this SKU
	DCGM_FR_L1TAG_UNSUPPORTED HealthCheckErrorCode = 78
	// DCGM_FR_L1TAG_MISCOMPARE L1TAG test failed on a miscompare
	DCGM_FR_L1TAG_MISCOMPARE HealthCheckErrorCode = 79
	// DCGM_FR_ROW_REMAP_FAILURE Row remapping failed (Ampere or newer GPUs)
	DCGM_FR_ROW_REMAP_FAILURE HealthCheckErrorCode = 80
	// DCGM_FR_UNCONTAINED_ERROR Uncontained error - XID 95
	DCGM_FR_UNCONTAINED_ERROR HealthCheckErrorCode = 81
	// DCGM_FR_EMPTY_GPU_LIST No GPU information given to plugin
	DCGM_FR_EMPTY_GPU_LIST HealthCheckErrorCode = 82
	// DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS Pending page retirements due to a DBE
	DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS HealthCheckErrorCode = 83
	// DCGM_FR_UNCORRECTABLE_ROW_REMAP Uncorrectable row remapping
	DCGM_FR_UNCORRECTABLE_ROW_REMAP HealthCheckErrorCode = 84
	// DCGM_FR_PENDING_ROW_REMAP Row remapping is pending
	DCGM_FR_PENDING_ROW_REMAP HealthCheckErrorCode = 85
	// DCGM_FR_BROKEN_P2P_MEMORY_DEVICE P2P copy test detected an error writing to this GPU
	DCGM_FR_BROKEN_P2P_MEMORY_DEVICE HealthCheckErrorCode = 86
	// DCGM_FR_BROKEN_P2P_WRITER_DEVICE P2P copy test detected an error writing from this GPU
	DCGM_FR_BROKEN_P2P_WRITER_DEVICE HealthCheckErrorCode = 87
	// DCGM_FR_NVSWITCH_NVLINK_DOWN An NvLink is down for the specified NVSwitch
	DCGM_FR_NVSWITCH_NVLINK_DOWN HealthCheckErrorCode = 88
	// DCGM_FR_EUD_BINARY_PERMISSIONS EUD binary permissions are incorrect
	DCGM_FR_EUD_BINARY_PERMISSIONS HealthCheckErrorCode = 89
	// DCGM_FR_EUD_NON_ROOT_USER EUD plugin is not running as root
	DCGM_FR_EUD_NON_ROOT_USER HealthCheckErrorCode = 90
	// DCGM_FR_EUD_SPAWN_FAILURE EUD plugin failed to spawn the EUD binary
	DCGM_FR_EUD_SPAWN_FAILURE HealthCheckErrorCode = 91
	// DCGM_FR_EUD_TIMEOUT EUD plugin timed out
	DCGM_FR_EUD_TIMEOUT HealthCheckErrorCode = 92
	// DCGM_FR_EUD_ZOMBIE EUD process remains running after the plugin considers it finished
	DCGM_FR_EUD_ZOMBIE HealthCheckErrorCode = 93
	// DCGM_FR_EUD_NON_ZERO_EXIT_CODE EUD process exited with a non-zero exit code
	DCGM_FR_EUD_NON_ZERO_EXIT_CODE HealthCheckErrorCode = 94
	// DCGM_FR_EUD_TEST_FAILED EUD test failed
	DCGM_FR_EUD_TEST_FAILED HealthCheckErrorCode = 95
	// DCGM_FR_FILE_CREATE_PERMISSIONS We cannot create a file in this directory.
	DCGM_FR_FILE_CREATE_PERMISSIONS HealthCheckErrorCode = 96
	// DCGM_FR_PAUSE_RESUME_FAILED Pause/Resume failed
	DCGM_FR_PAUSE_RESUME_FAILED HealthCheckErrorCode = 97
	// DCGM_FR_PCIE_H_REPLAY_VIOLATION PCIe H replay violation
	DCGM_FR_PCIE_H_REPLAY_VIOLATION HealthCheckErrorCode = 98
	// DCGM_FR_GPU_EXPECTED_NVLINKS_UP Expected nvlinks up per gpu
	DCGM_FR_GPU_EXPECTED_NVLINKS_UP HealthCheckErrorCode = 99
	// DCGM_FR_NVSWITCH_EXPECTED_NVLINKS_UP Expected nvlinks up per nvswitch
	DCGM_FR_NVSWITCH_EXPECTED_NVLINKS_UP HealthCheckErrorCode = 100
	// DCGM_FR_XID_ERROR XID error detected
	DCGM_FR_XID_ERROR HealthCheckErrorCode = 101
	// DCGM_FR_SBE_VIOLATION Single bit error detected
	DCGM_FR_SBE_VIOLATION HealthCheckErrorCode = 102
	// DCGM_FR_DBE_VIOLATION Double bit error detected
	DCGM_FR_DBE_VIOLATION HealthCheckErrorCode = 103
	// DCGM_FR_PCIE_REPLAY_VIOLATION PCIe replay errors detected
	DCGM_FR_PCIE_REPLAY_VIOLATION HealthCheckErrorCode = 104
	// DCGM_FR_SBE_THRESHOLD_VIOLATION SBE threshold violated
	DCGM_FR_SBE_THRESHOLD_VIOLATION HealthCheckErrorCode = 105
	// DCGM_FR_DBE_THRESHOLD_VIOLATION DBE threshold violated
	DCGM_FR_DBE_THRESHOLD_VIOLATION HealthCheckErrorCode = 106
	// DCGM_FR_PCIE_REPLAY_THRESHOLD_VIOLATION PCIe replay count violated
	DCGM_FR_PCIE_REPLAY_THRESHOLD_VIOLATION HealthCheckErrorCode = 107
	// DCGM_FR_CUDA_FM_NOT_INITIALIZED The fabricmanager is not initialized
	DCGM_FR_CUDA_FM_NOT_INITIALIZED HealthCheckErrorCode = 108
	// DCGM_FR_SXID_ERROR NvSwitch fatal error detected
	DCGM_FR_SXID_ERROR HealthCheckErrorCode = 109
	// DCGM_FR_GFLOPS_THRESHOLD_VIOLATION GPU GFLOPs threshold violated
	DCGM_FR_GFLOPS_THRESHOLD_VIOLATION HealthCheckErrorCode = 110
	// DCGM_FR_NAN_VALUE NaN value detected on this GPU
	DCGM_FR_NAN_VALUE HealthCheckErrorCode = 111
	// DCGM_FR_FABRIC_MANAGER_TRAINING_ERROR Fabric Manager did not finish training
	DCGM_FR_FABRIC_MANAGER_TRAINING_ERROR HealthCheckErrorCode = 112
	// DCGM_FR_BROKEN_P2P_PCIE_MEMORY_DEVICE P2P copy test detected an error writing to this GPU over PCIE
	DCGM_FR_BROKEN_P2P_PCIE_MEMORY_DEVICE HealthCheckErrorCode = 113
	// DCGM_FR_BROKEN_P2P_PCIE_WRITER_DEVICE P2P copy test detected an error writing from this GPU over PCIE
	DCGM_FR_BROKEN_P2P_PCIE_WRITER_DEVICE HealthCheckErrorCode = 114
	// DCGM_FR_BROKEN_P2P_NVLINK_MEMORY_DEVICE P2P copy test detected an error writing to this GPU over NVLink
	DCGM_FR_BROKEN_P2P_NVLINK_MEMORY_DEVICE HealthCheckErrorCode = 115
	// DCGM_FR_BROKEN_P2P_NVLINK_WRITER_DEVICE P2P copy test detected an error writing from this GPU over NVLink
	DCGM_FR_BROKEN_P2P_NVLINK_WRITER_DEVICE HealthCheckErrorCode = 116
	// DCGM_FR_ERROR_SENTINEL MUST BE THE LAST ERROR CODE
	// It is one greater than the highest real code above (116).
	DCGM_FR_ERROR_SENTINEL HealthCheckErrorCode = 117
)

type HealthResponse

// HealthResponse contains the results of a health check operation.
// It is the value returned by HealthCheck.
type HealthResponse struct {
	// OverallHealth indicates the aggregate health status across all watches
	OverallHealth HealthResult
	// Incidents contains details about any health issues detected
	Incidents []Incident
}

HealthResponse contains the results of a health check operation

func HealthCheck

func HealthCheck(groupID GroupHandle) (HealthResponse, error)

HealthCheck checks the configured watches for any errors/failures/warnings that have occurred since the last time this check was invoked. On the first call, stateful information about all of the enabled watches within a group is created but no error results are provided. On subsequent calls, any error information will be returned.

type HealthResult

type HealthResult uint

HealthResult is the result of a health check.

// Health check results. The values are assigned explicitly and spaced by ten
// (0, 10, 20), so a larger number denotes a more severe result.
const (
	// DCGM_HEALTH_RESULT_PASS All results within this system are reporting normal
	DCGM_HEALTH_RESULT_PASS HealthResult = 0
	// DCGM_HEALTH_RESULT_WARN A warning has been issued, refer to the response for more information
	DCGM_HEALTH_RESULT_WARN HealthResult = 10
	// DCGM_HEALTH_RESULT_FAIL A failure has been issued, refer to the response for more information
	DCGM_HEALTH_RESULT_FAIL HealthResult = 20
)

type HealthSystem

type HealthSystem uint

HealthSystem is the system to watch for health checks.

// Health watch systems. Each individual watch occupies its own bit
// (0x1 through 0x800), and DCGM_HEALTH_WATCH_ALL is the mask 0xFFFFFFFF.
// NOTE(review): presumably the values are meant to be combined with bitwise
// OR into a single HealthSystem mask — confirm against the callers.
const (
	// DCGM_HEALTH_WATCH_PCIE PCIe health check
	DCGM_HEALTH_WATCH_PCIE HealthSystem = 0x1
	// DCGM_HEALTH_WATCH_NVLINK NVLink health check
	DCGM_HEALTH_WATCH_NVLINK HealthSystem = 0x2
	// DCGM_HEALTH_WATCH_PMU PMU health check
	DCGM_HEALTH_WATCH_PMU HealthSystem = 0x4
	// DCGM_HEALTH_WATCH_MCU MCU health check
	DCGM_HEALTH_WATCH_MCU HealthSystem = 0x8
	// DCGM_HEALTH_WATCH_MEM Memory health check
	DCGM_HEALTH_WATCH_MEM HealthSystem = 0x10
	// DCGM_HEALTH_WATCH_SM SM health check
	DCGM_HEALTH_WATCH_SM HealthSystem = 0x20
	// DCGM_HEALTH_WATCH_INFOROM Inforom health check
	DCGM_HEALTH_WATCH_INFOROM HealthSystem = 0x40
	// DCGM_HEALTH_WATCH_THERMAL Thermal health check
	DCGM_HEALTH_WATCH_THERMAL HealthSystem = 0x80
	// DCGM_HEALTH_WATCH_POWER Power health check
	DCGM_HEALTH_WATCH_POWER HealthSystem = 0x100
	// DCGM_HEALTH_WATCH_DRIVER Driver health check
	DCGM_HEALTH_WATCH_DRIVER HealthSystem = 0x200
	// DCGM_HEALTH_WATCH_NVSWITCH_NONFATAL NVSwitch non-fatal health check
	DCGM_HEALTH_WATCH_NVSWITCH_NONFATAL HealthSystem = 0x400
	// DCGM_HEALTH_WATCH_NVSWITCH_FATAL NVSwitch fatal health check
	DCGM_HEALTH_WATCH_NVSWITCH_FATAL HealthSystem = 0x800
	// DCGM_HEALTH_WATCH_ALL All health checks
	DCGM_HEALTH_WATCH_ALL HealthSystem = 0xFFFFFFFF
)

func HealthGet

func HealthGet(groupID GroupHandle) (HealthSystem, error)

HealthGet retrieves the current state of the DCGM health check system. It returns which health watch systems are currently enabled for the specified group.

type Incident

// Incident represents a health check incident that occurred.
// HealthResponse.Incidents holds a slice of these.
type Incident struct {
	// System identifies which health watch system detected the incident
	System HealthSystem
	// Health indicates the severity of the incident
	Health HealthResult
	// Error contains detailed information about the incident
	Error DiagErrorDetail
	// EntityInfo identifies the GPU or component where the incident occurred
	EntityInfo GroupEntityPair
}

Incident represents a health check incident that occurred

type Link_State uint

Link_State represents the state of an NVLINK connection

// NVLINK link states. Values are assigned with iota, from
// LS_NOT_SUPPORTED = 0 through LS_UP = 3.
const (
	// LS_NOT_SUPPORTED indicates the link is unsupported (Default for GPUs)
	LS_NOT_SUPPORTED Link_State = iota
	// LS_DISABLED indicates the link is supported but disabled (Default for NvSwitches)
	LS_DISABLED
	// LS_DOWN indicates the link is down (inactive)
	LS_DOWN
	// LS_UP indicates the link is up (active)
	LS_UP
)

type MemoryInfo

// MemoryInfo contains GPU memory usage and error information.
// It is used as the Memory field of ProcessInfo.
type MemoryInfo struct {
	GlobalUsed int64         // used global memory — NOTE(review): unit is not stated here; confirm
	ECCErrors  ECCErrorsInfo // ECC error information (see ECCErrorsInfo)
}

MemoryInfo contains GPU memory usage and error information

type MetricGroup

// MetricGroup represents a group of metrics for a specific GPU.
// GetSupportedMetricGroups returns a slice of these for a given GPU ID.
type MetricGroup struct {
	Major    uint   // presumably the major ID of the metric group — TODO confirm
	Minor    uint   // presumably the minor ID of the metric group — TODO confirm
	FieldIds []uint // field IDs listed for this group
}

MetricGroup represents a group of metrics for a specific GPU

func GetSupportedMetricGroups

func GetSupportedMetricGroups(gpuID uint) ([]MetricGroup, error)

GetSupportedMetricGroups returns all supported metric groups for the specified GPU

type MigEntityInfo

// MigEntityInfo contains information about a MIG entity.
// It is used as the Info field of MigHierarchyInfo_v2.
type MigEntityInfo struct {
	// GpuUuid is the UUID of the parent GPU
	GpuUuid string
	// NvmlGpuIndex is the NVML index of the parent GPU
	NvmlGpuIndex uint
	// NvmlInstanceId is the NVML GPU instance ID
	NvmlInstanceId uint
	// NvmlComputeInstanceId is the NVML compute instance ID
	NvmlComputeInstanceId uint
	// NvmlMigProfileId is the NVML MIG profile ID
	NvmlMigProfileId uint
	// NvmlProfileSlices is the number of slices in the MIG profile
	NvmlProfileSlices uint
}

MigEntityInfo contains information about a MIG entity

type MigHierarchyInfo

// MigHierarchyInfo represents the Multi-Instance GPU (MIG) hierarchy
// information for a GPU entity and its relationship to other entities.
// Unlike MigHierarchyInfo_v2, it carries a MigProfile rather than a
// MigEntityInfo.
type MigHierarchyInfo struct {
	// Entity represents the current GPU entity in the hierarchy
	Entity GroupEntityPair
	// Parent represents the parent GPU entity in the hierarchy
	Parent GroupEntityPair
	// SliceProfile defines the MIG profile configuration for this entity
	SliceProfile MigProfile
}

MigHierarchyInfo represents the Multi-Instance GPU (MIG) hierarchy information for a GPU entity and its relationship to other entities

type MigHierarchyInfo_v2

// MigHierarchyInfo_v2 represents version 2 of MIG hierarchy information.
// It is the element type of MigHierarchy_v2.EntityList.
type MigHierarchyInfo_v2 struct {
	// Entity contains the entity information
	Entity GroupEntityPair
	// Parent contains the parent entity information
	Parent GroupEntityPair
	// Info contains detailed MIG entity information
	Info MigEntityInfo
}

MigHierarchyInfo_v2 represents version 2 of MIG hierarchy information

type MigHierarchy_v2

// MigHierarchy_v2 represents version 2 of the complete MIG hierarchy.
// It is the value returned by GetGPUInstanceHierarchy.
type MigHierarchy_v2 struct {
	// Version is the version number of the hierarchy structure
	Version uint
	// Count is the number of valid entries in EntityList
	Count uint
	// EntityList contains the MIG hierarchy information for each entity.
	// It is a fixed-size array of C.DCGM_MAX_HIERARCHY_INFO elements, so
	// consult Count to know how many entries are valid.
	EntityList [C.DCGM_MAX_HIERARCHY_INFO]MigHierarchyInfo_v2
}

MigHierarchy_v2 represents version 2 of the complete MIG hierarchy

func GetGPUInstanceHierarchy

func GetGPUInstanceHierarchy() (hierarchy MigHierarchy_v2, err error)

GetGPUInstanceHierarchy retrieves the complete MIG hierarchy information

type MigProfile

type MigProfile int

MigProfile represents the Multi-Instance GPU (MIG) profile type

// MIG profile values. All values are assigned explicitly.
//
// The numbering does not follow slice size: among GPU instance profiles,
// Slice7 is 5, Slice8 is 6 and Slice6 is 7; the compute instance profiles
// repeat that ordering (Slice7 = 34, Slice8 = 35, Slice6 = 36). Values 11-29
// are unused here; compute instance profiles start at 30.
// NOTE(review): the values presumably mirror the DCGM C enum — confirm before
// relying on numeric ordering.
const (
	// MigProfileNone indicates no MIG profile is set (for GPUs)
	MigProfileNone MigProfile = 0 /*!< No profile (for GPUs) */
	// MigProfileGPUInstanceSlice1 represents GPU instance slice 1
	MigProfileGPUInstanceSlice1 MigProfile = 1 /*!< GPU instance slice 1 */
	// MigProfileGPUInstanceSlice2 represents GPU instance slice 2
	MigProfileGPUInstanceSlice2 MigProfile = 2 /*!< GPU instance slice 2 */
	// MigProfileGPUInstanceSlice3 represents GPU instance slice 3
	MigProfileGPUInstanceSlice3 MigProfile = 3 /*!< GPU instance slice 3 */
	// MigProfileGPUInstanceSlice4 represents GPU instance slice 4
	MigProfileGPUInstanceSlice4 MigProfile = 4 /*!< GPU instance slice 4 */
	// MigProfileGPUInstanceSlice7 represents GPU instance slice 7
	MigProfileGPUInstanceSlice7 MigProfile = 5 /*!< GPU instance slice 7 */
	// MigProfileGPUInstanceSlice8 represents GPU instance slice 8
	MigProfileGPUInstanceSlice8 MigProfile = 6 /*!< GPU instance slice 8 */
	// MigProfileGPUInstanceSlice6 represents GPU instance slice 6
	MigProfileGPUInstanceSlice6 MigProfile = 7 /*!< GPU instance slice 6 */
	// MigProfileGPUInstanceSlice1Rev1 represents GPU instance slice 1 revision 1
	MigProfileGPUInstanceSlice1Rev1 MigProfile = 8 /*!< GPU instance slice 1 revision 1 */
	// MigProfileGPUInstanceSlice2Rev1 represents GPU instance slice 2 revision 1
	MigProfileGPUInstanceSlice2Rev1 MigProfile = 9 /*!< GPU instance slice 2 revision 1 */
	// MigProfileGPUInstanceSlice1Rev2 represents GPU instance slice 1 revision 2
	MigProfileGPUInstanceSlice1Rev2 MigProfile = 10 /*!< GPU instance slice 1 revision 2 */
	// MigProfileComputeInstanceSlice1 represents compute instance slice 1
	MigProfileComputeInstanceSlice1 MigProfile = 30 /*!< compute instance slice 1 */
	// MigProfileComputeInstanceSlice2 represents compute instance slice 2
	MigProfileComputeInstanceSlice2 MigProfile = 31 /*!< compute instance slice 2 */
	// MigProfileComputeInstanceSlice3 represents compute instance slice 3
	MigProfileComputeInstanceSlice3 MigProfile = 32 /*!< compute instance slice 3 */
	// MigProfileComputeInstanceSlice4 represents compute instance slice 4
	MigProfileComputeInstanceSlice4 MigProfile = 33 /*!< compute instance slice 4 */
	// MigProfileComputeInstanceSlice7 represents compute instance slice 7
	MigProfileComputeInstanceSlice7 MigProfile = 34 /*!< compute instance slice 7 */
	// MigProfileComputeInstanceSlice8 represents compute instance slice 8
	MigProfileComputeInstanceSlice8 MigProfile = 35 /*!< compute instance slice 8 */
	// MigProfileComputeInstanceSlice6 represents compute instance slice 6
	MigProfileComputeInstanceSlice6 MigProfile = 36 /*!< compute instance slice 6 */
	// MigProfileComputeInstanceSlice1Rev1 represents compute instance slice 1 revision 1
	MigProfileComputeInstanceSlice1Rev1 MigProfile = 37 /*!< compute instance slice 1 revision 1 */
)

type NvLinkStatus

// NvLinkStatus contains information about an NVLINK connection status.
// GetNvLinkLinkStatus returns a slice of these.
type NvLinkStatus struct {
	// ParentId is the ID of the parent entity (GPU or NVSwitch)
	ParentId uint
	// ParentType is the type of the parent entity
	ParentType Field_Entity_Group
	// State is the current state of the NVLINK
	State Link_State
	// Index is the link index number
	Index uint
}

NvLinkStatus contains information about an NVLINK connection status

func GetNvLinkLinkStatus

func GetNvLinkLinkStatus() ([]NvLinkStatus, error)

GetNvLinkLinkStatus returns the status of all NVLink connections

type NvlinkPolicyCondition

// NvlinkPolicyCondition contains details about an NVLink error.
// NOTE(review): presumably carried in PolicyViolation.Data for NVLink policy
// violations — confirm against the policy code.
type NvlinkPolicyCondition struct {
	// FieldId identifies the specific NVLink field that had an error
	FieldId uint16
	// Counter indicates the number of errors detected
	Counter uint
}

NvlinkPolicyCondition contains details about an NVLink error

// P2PLink contains information about a peer-to-peer connection.
// GetDeviceTopology returns a slice of these for a given GPU ID.
type P2PLink struct {
	// GPU is the ID of the GPU
	GPU uint
	// BusID is the PCIe bus ID of the GPU
	BusID string
	// Link is the type of P2P connection
	Link P2PLinkType
}

P2PLink contains information about a peer-to-peer connection

func GetDeviceTopology

func GetDeviceTopology(gpuID uint) ([]P2PLink, error)

GetDeviceTopology returns the topology (connectivity) information for the specified GPU

type P2PLinkType

type P2PLinkType uint

P2PLinkType represents the type of peer-to-peer connection between GPUs

// Peer-to-peer link types. Values are assigned with iota, from
// P2PLinkUnknown = 0 through FourNVLINKLinks = 10.
// NOTE(review): the four NVLINK constants do not carry the P2PLink prefix
// used by the other members of this group.
const (
	// P2PLinkUnknown represents an unknown link type
	P2PLinkUnknown P2PLinkType = iota
	// P2PLinkCrossCPU represents a connection across different CPUs
	P2PLinkCrossCPU
	// P2PLinkSameCPU represents a connection within the same CPU
	P2PLinkSameCPU
	// P2PLinkHostBridge represents a connection through the host bridge
	P2PLinkHostBridge
	// P2PLinkMultiSwitch represents a connection through multiple PCIe switches
	P2PLinkMultiSwitch
	// P2PLinkSingleSwitch represents a connection through a single PCIe switch
	P2PLinkSingleSwitch
	// P2PLinkSameBoard represents a connection on the same board
	P2PLinkSameBoard
	// SingleNVLINKLink represents a single NVLINK connection
	SingleNVLINKLink
	// TwoNVLINKLinks represents two NVLINK connections
	TwoNVLINKLinks
	// ThreeNVLINKLinks represents three NVLINK connections
	ThreeNVLINKLinks
	// FourNVLINKLinks represents four NVLINK connections
	FourNVLINKLinks
)

func (P2PLinkType) PCIPaths

func (l P2PLinkType) PCIPaths() string

PCIPaths returns a string representation of the P2P link type

type PCIInfo

// PCIInfo contains PCI bus related information for a GPU device.
// BAR1 and FBTotal are expressed in MB and Bandwidth in MB/s, as noted on
// each field; BusID is a string identifier.
type PCIInfo struct {
	BusID     string
	BAR1      uint  // MB
	FBTotal   uint  // MB
	Bandwidth int64 // MB/s
}

PCIInfo contains PCI bus related information for a GPU device

type PCIStatusInfo

// PCIStatusInfo contains PCI bus status information.
// It is used as the PCI field of ProcessInfo.
// NOTE(review): BAR1Used is annotated as MB but FBUsed carries no unit —
// presumably MB as well; confirm.
type PCIStatusInfo struct {
	BAR1Used   int64 // MB
	Throughput PCIThroughputInfo
	FBUsed     int64
}

PCIStatusInfo contains PCI bus status information

type PCIThroughputInfo

// PCIThroughputInfo contains PCI bus transfer metrics.
// It is used as the Throughput field of PCIStatusInfo.
// NOTE(review): Rx and Tx are annotated as MB; Replays has no unit and is
// presumably a replay count — confirm.
type PCIThroughputInfo struct {
	Rx      int64 // MB
	Tx      int64 // MB
	Replays int64
}

PCIThroughputInfo contains PCI bus transfer metrics

type PciPolicyCondition

// PciPolicyCondition contains details about a PCI error.
// NOTE(review): presumably carried in PolicyViolation.Data for PCI policy
// violations — confirm against the policy code.
type PciPolicyCondition struct {
	// ReplayCounter indicates the number of PCI replays
	ReplayCounter uint
}

PciPolicyCondition contains details about a PCI error

type PerfState

type PerfState uint

PerfState represents the performance state (P-state) of a GPU

func (PerfState) String

func (p PerfState) String() string

String returns a string representation of the performance state

type PolicyViolation

// PolicyViolation represents a detected violation of a policy condition.
type PolicyViolation struct {
	// Condition specifies the type of policy that was violated
	// (its type, policyCondition, is unexported)
	Condition policyCondition
	// Timestamp indicates when the violation occurred
	Timestamp time.Time
	// Data contains violation-specific details. It is typed any;
	// NOTE(review): presumably it holds one of the *PolicyCondition structs
	// matching Condition — confirm before type-asserting.
	Data any
}

PolicyViolation represents a detected violation of a policy condition

type PowerPolicyCondition

// PowerPolicyCondition contains details about a power violation.
// NOTE(review): presumably carried in PolicyViolation.Data for power policy
// violations — confirm against the policy code.
type PowerPolicyCondition struct {
	// PowerViolation indicates the severity of the power violation
	PowerViolation uint
}

PowerPolicyCondition contains details about a power violation

type ProcessInfo

// ProcessInfo contains comprehensive information about a GPU process.
// GetProcessInfo returns a slice of these (per-GPU statistics) for a
// process ID.
type ProcessInfo struct {
	// GPU is the ID of the GPU being used
	GPU uint
	// PID is the process ID
	PID uint
	// Name is the name of the process
	Name string
	// ProcessUtilization contains process-specific utilization metrics
	ProcessUtilization ProcessUtilInfo
	// PCI contains PCI bus statistics
	PCI PCIStatusInfo
	// Memory contains memory usage statistics
	Memory MemoryInfo
	// GpuUtilization contains GPU utilization metrics
	GpuUtilization UtilizationInfo
	// Clocks contains GPU clock frequencies
	Clocks ClockInfo
	// Violations contains throttling statistics
	Violations ViolationTime
	// XIDErrors contains XID error information
	XIDErrors XIDErrorInfo
}

ProcessInfo contains comprehensive information about a GPU process

func GetProcessInfo

func GetProcessInfo(group GroupHandle, pid uint) ([]ProcessInfo, error)

GetProcessInfo returns detailed per-GPU statistics for the specified process

type ProcessUtilInfo

// ProcessUtilInfo contains utilization metrics for a GPU process.
// It is used as the ProcessUtilization field of ProcessInfo.
// NOTE(review): EnergyConsumed, SmUtil and MemUtil are pointers; presumably
// nil means the value is unavailable — confirm before dereferencing.
type ProcessUtilInfo struct {
	// StartTime is when the process started using the GPU
	StartTime Time
	// EndTime is when the process stopped using the GPU (0 if still running)
	EndTime Time
	// EnergyConsumed is the energy consumed by the process in Joules
	EnergyConsumed *uint64
	// SmUtil is the GPU SM (Streaming Multiprocessor) utilization percentage
	SmUtil *float64
	// MemUtil is the GPU memory utilization percentage
	MemUtil *float64
}

ProcessUtilInfo contains utilization metrics for a GPU process

type RetiredPagesPolicyCondition

// RetiredPagesPolicyCondition contains details about retired memory pages.
// NOTE(review): presumably carried in PolicyViolation.Data for retired-page
// policy violations — confirm against the policy code.
type RetiredPagesPolicyCondition struct {
	// SbePages indicates the number of pages retired due to single-bit errors
	SbePages uint
	// DbePages indicates the number of pages retired due to double-bit errors
	DbePages uint
}

RetiredPagesPolicyCondition contains details about retired memory pages

type Short

type Short C.ushort

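Short is a defined type whose underlying type is C.ushort. Because it is declared without `=`, it is a distinct type and not a true Go alias, so values must be converted explicitly. It is primarily used for DCGM field identifiers and field collections in the DCGM API bindings. This type provides a direct mapping to the C unsigned short type used in the underlying DCGM C API.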

const (
	// DCGM_FI_UNKNOWN represents a NULL field
	DCGM_FI_UNKNOWN Short = 0
	// DCGM_FI_DRIVER_VERSION represents the driver version string
	DCGM_FI_DRIVER_VERSION Short = 1
	// DCGM_FI_NVML_VERSION represents the underlying NVML version string
	DCGM_FI_NVML_VERSION Short = 2
	// DCGM_FI_PROCESS_NAME represents the process name
	DCGM_FI_PROCESS_NAME Short = 3
	// DCGM_FI_DEV_COUNT represents the number of devices on the node
	DCGM_FI_DEV_COUNT Short = 4
	// DCGM_FI_CUDA_DRIVER_VERSION represents the CUDA driver version. Retrieves a number with the major value in the thousands place and the minor value in the hundreds place. (e.g. CUDA 11.1 = 11100)
	DCGM_FI_CUDA_DRIVER_VERSION Short = 5
	// DCGM_FI_DEV_NAME represents the name of the GPU device
	DCGM_FI_DEV_NAME Short = 50
	// DCGM_FI_DEV_BRAND represents the device brand
	DCGM_FI_DEV_BRAND Short = 51
	// DCGM_FI_DEV_NVML_INDEX represents the NVML index of this GPU
	DCGM_FI_DEV_NVML_INDEX Short = 52
	// DCGM_FI_DEV_SERIAL represents the device serial number
	DCGM_FI_DEV_SERIAL Short = 53
	// DCGM_FI_DEV_UUID represents the UUID corresponding to the device
	DCGM_FI_DEV_UUID Short = 54
	// DCGM_FI_DEV_MINOR_NUMBER represents the device node minor number (/dev/nvidia#)
	DCGM_FI_DEV_MINOR_NUMBER Short = 55
	// DCGM_FI_DEV_OEM_INFOROM_VER represents the OEM inforom version
	DCGM_FI_DEV_OEM_INFOROM_VER Short = 56
	// DCGM_FI_DEV_PCI_BUSID represents the PCI attributes for the device
	DCGM_FI_DEV_PCI_BUSID Short = 57
	// DCGM_FI_DEV_PCI_COMBINED_ID represents the combined 16-bit device id and 16-bit vendor id
	DCGM_FI_DEV_PCI_COMBINED_ID Short = 58
	// DCGM_FI_DEV_PCI_SUBSYS_ID represents the 32-bit Sub System Device ID
	DCGM_FI_DEV_PCI_SUBSYS_ID Short = 59
	// DCGM_FI_GPU_TOPOLOGY_PCI represents the topology of all GPUs on the system via PCI (static)
	DCGM_FI_GPU_TOPOLOGY_PCI Short = 60
	// DCGM_FI_GPU_TOPOLOGY_NVLINK represents the topology of all GPUs on the system via NVLINK (static)
	DCGM_FI_GPU_TOPOLOGY_NVLINK Short = 61
	// DCGM_FI_GPU_TOPOLOGY_AFFINITY represents the affinity of all GPUs on the system (static)
	DCGM_FI_GPU_TOPOLOGY_AFFINITY Short = 62
	// DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY represents the CUDA compute capability for the device. The major version is the upper 32 bits and the minor version is the lower 32 bits
	DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY Short = 63
	// DCGM_FI_DEV_COMPUTE_MODE represents the compute mode for the device
	DCGM_FI_DEV_COMPUTE_MODE Short = 65
	// DCGM_FI_DEV_PERSISTENCE_MODE represents the persistence mode for the device. Boolean: 0 is disabled, 1 is enabled
	DCGM_FI_DEV_PERSISTENCE_MODE Short = 66
	// DCGM_FI_DEV_MIG_MODE represents the MIG mode for the device. Boolean: 0 is disabled, 1 is enabled
	DCGM_FI_DEV_MIG_MODE Short = 67
	// DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR represents the string that CUDA_VISIBLE_DEVICES should be set to for this entity (including MIG)
	DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR Short = 68
	// DCGM_FI_DEV_MIG_MAX_SLICES represents the maximum number of MIG slices supported by this GPU
	DCGM_FI_DEV_MIG_MAX_SLICES Short = 69
	// DCGM_FI_DEV_CPU_AFFINITY_0 represents the device CPU affinity for CPUs 0-63
	DCGM_FI_DEV_CPU_AFFINITY_0 Short = 70
	// DCGM_FI_DEV_CPU_AFFINITY_1 represents the device CPU affinity for CPUs 64-127
	DCGM_FI_DEV_CPU_AFFINITY_1 Short = 71
	// DCGM_FI_DEV_CPU_AFFINITY_2 represents the device CPU affinity for CPUs 128-191
	DCGM_FI_DEV_CPU_AFFINITY_2 Short = 72
	// DCGM_FI_DEV_CPU_AFFINITY_3 represents the device CPU affinity for CPUs 192-255
	DCGM_FI_DEV_CPU_AFFINITY_3 Short = 73
	// DCGM_FI_DEV_CC_MODE represents the ConfidentialCompute/AmpereProtectedMemory status. 0 = disabled, 1 = enabled
	DCGM_FI_DEV_CC_MODE Short = 74
	// DCGM_FI_DEV_MIG_ATTRIBUTES represents the attributes for the given MIG device handles
	DCGM_FI_DEV_MIG_ATTRIBUTES Short = 75
	// DCGM_FI_DEV_MIG_GI_INFO represents the GPU instance profile information
	DCGM_FI_DEV_MIG_GI_INFO Short = 76
	// DCGM_FI_DEV_MIG_CI_INFO represents the compute instance profile information
	DCGM_FI_DEV_MIG_CI_INFO Short = 77
	// DCGM_FI_DEV_ECC_INFOROM_VER represents the ECC inforom version
	DCGM_FI_DEV_ECC_INFOROM_VER Short = 80
	// DCGM_FI_DEV_POWER_INFOROM_VER represents the power management object inforom version
	DCGM_FI_DEV_POWER_INFOROM_VER Short = 81
	// DCGM_FI_DEV_INFOROM_IMAGE_VER represents the inforom image version
	DCGM_FI_DEV_INFOROM_IMAGE_VER Short = 82
	// DCGM_FI_DEV_INFOROM_CONFIG_CHECK represents the inforom configuration checksum
	DCGM_FI_DEV_INFOROM_CONFIG_CHECK Short = 83
	// DCGM_FI_DEV_INFOROM_CONFIG_VALID represents whether the inforom configuration is valid. Reads the infoROM from the flash and verifies the checksums
	DCGM_FI_DEV_INFOROM_CONFIG_VALID Short = 84
	// DCGM_FI_DEV_VBIOS_VERSION represents the VBIOS version of the device
	DCGM_FI_DEV_VBIOS_VERSION Short = 85
	// DCGM_FI_DEV_MEM_AFFINITY_0 represents the device memory node affinity for nodes 0-63
	DCGM_FI_DEV_MEM_AFFINITY_0 Short = 86
	// DCGM_FI_DEV_MEM_AFFINITY_1 represents the device memory node affinity for nodes 64-127
	DCGM_FI_DEV_MEM_AFFINITY_1 Short = 87
	// DCGM_FI_DEV_MEM_AFFINITY_2 represents the device memory node affinity for nodes 128-191
	DCGM_FI_DEV_MEM_AFFINITY_2 Short = 88
	// DCGM_FI_DEV_MEM_AFFINITY_3 represents the device memory node affinity for nodes 192-255
	DCGM_FI_DEV_MEM_AFFINITY_3 Short = 89
	// DCGM_FI_DEV_BAR1_TOTAL represents the total BAR1 memory of the GPU in MB
	DCGM_FI_DEV_BAR1_TOTAL Short = 90
	// DCGM_FI_SYNC_BOOST represents the sync boost settings on the node (Deprecated)
	DCGM_FI_SYNC_BOOST Short = 91
	// DCGM_FI_DEV_BAR1_USED represents the used BAR1 memory of the GPU in MB
	DCGM_FI_DEV_BAR1_USED Short = 92
	// DCGM_FI_DEV_BAR1_FREE represents the free BAR1 memory of the GPU in MB
	DCGM_FI_DEV_BAR1_FREE Short = 93
	// DCGM_FI_DEV_GPM_SUPPORT represents the GPM support for the device
	DCGM_FI_DEV_GPM_SUPPORT Short = 94
	// DCGM_FI_DEV_SM_CLOCK represents the SM clock for the device
	DCGM_FI_DEV_SM_CLOCK Short = 100
	// DCGM_FI_DEV_MEM_CLOCK represents the memory clock for the device
	DCGM_FI_DEV_MEM_CLOCK Short = 101
	// DCGM_FI_DEV_VIDEO_CLOCK represents the video encoder/decoder clock for the device
	DCGM_FI_DEV_VIDEO_CLOCK Short = 102
	// DCGM_FI_DEV_APP_SM_CLOCK represents the SM application clocks
	DCGM_FI_DEV_APP_SM_CLOCK Short = 110
	// DCGM_FI_DEV_APP_MEM_CLOCK represents the memory application clocks
	DCGM_FI_DEV_APP_MEM_CLOCK Short = 111
	// DCGM_FI_DEV_CLOCKS_EVENT_REASONS represents the current clock event reasons (bitmask of DCGM_CLOCKS_EVENT_REASON_*)
	DCGM_FI_DEV_CLOCKS_EVENT_REASONS Short = 112
	// DCGM_FI_DEV_CLOCK_THROTTLE_REASONS represents the current clock throttle reasons (Deprecated: Use DCGM_FI_DEV_CLOCKS_EVENT_REASONS instead)
	DCGM_FI_DEV_CLOCK_THROTTLE_REASONS Short = DCGM_FI_DEV_CLOCKS_EVENT_REASONS
	// DCGM_FI_DEV_MAX_SM_CLOCK represents the maximum supported SM clock for the device
	DCGM_FI_DEV_MAX_SM_CLOCK Short = 113
	// DCGM_FI_DEV_MAX_MEM_CLOCK represents the maximum supported memory clock for the device
	DCGM_FI_DEV_MAX_MEM_CLOCK Short = 114
	// DCGM_FI_DEV_MAX_VIDEO_CLOCK represents the maximum supported video encoder/decoder clock for the device
	DCGM_FI_DEV_MAX_VIDEO_CLOCK Short = 115
	// DCGM_FI_DEV_AUTOBOOST represents the auto-boost setting for the device (1 = enabled, 0 = disabled)
	DCGM_FI_DEV_AUTOBOOST Short = 120
	// DCGM_FI_DEV_SUPPORTED_CLOCKS represents the supported clocks for the device
	DCGM_FI_DEV_SUPPORTED_CLOCKS Short = 130
	// DCGM_FI_DEV_MEMORY_TEMP represents the memory temperature for the device
	DCGM_FI_DEV_MEMORY_TEMP Short = 140
	// DCGM_FI_DEV_GPU_TEMP represents the current temperature readings for the device, in degrees C
	DCGM_FI_DEV_GPU_TEMP Short = 150
	// DCGM_FI_DEV_MEM_MAX_OP_TEMP represents the maximum operating temperature for the memory of this GPU
	DCGM_FI_DEV_MEM_MAX_OP_TEMP Short = 151
	// DCGM_FI_DEV_GPU_MAX_OP_TEMP represents the maximum operating temperature for this GPU
	DCGM_FI_DEV_GPU_MAX_OP_TEMP Short = 152
	// DCGM_FI_DEV_GPU_TEMP_LIMIT represents the thermal margin temperature (distance to nearest slowdown threshold) for this GPU
	DCGM_FI_DEV_GPU_TEMP_LIMIT Short = 153
	// DCGM_FI_DEV_POWER_USAGE represents the power usage for the device in Watts
	DCGM_FI_DEV_POWER_USAGE Short = 155
	// DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION represents the total energy consumption for the GPU in mJ since the driver was last reloaded
	DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION Short = 156
	// DCGM_FI_DEV_POWER_USAGE_INSTANT represents the current instantaneous power usage of the device in Watts
	DCGM_FI_DEV_POWER_USAGE_INSTANT Short = 157
	// DCGM_FI_DEV_SLOWDOWN_TEMP represents the slowdown temperature for the device
	DCGM_FI_DEV_SLOWDOWN_TEMP Short = 158
	// DCGM_FI_DEV_SHUTDOWN_TEMP represents the shutdown temperature for the device
	DCGM_FI_DEV_SHUTDOWN_TEMP Short = 159
	// DCGM_FI_DEV_POWER_MGMT_LIMIT represents the current power limit for the device
	DCGM_FI_DEV_POWER_MGMT_LIMIT Short = 160
	// DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN represents the minimum power management limit for the device
	DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN Short = 161
	// DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX represents the maximum power management limit for the device
	DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX Short = 162
	// DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF represents the default power management limit for the device
	DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF Short = 163
	// DCGM_FI_DEV_ENFORCED_POWER_LIMIT represents the effective power limit that the driver enforces after taking into account all limiters
	DCGM_FI_DEV_ENFORCED_POWER_LIMIT Short = 164
	// DCGM_FI_DEV_REQUESTED_POWER_PROFILE_MASK represents the requested workload power profile mask (Blackwell and newer)
	DCGM_FI_DEV_REQUESTED_POWER_PROFILE_MASK Short = 165
	// DCGM_FI_DEV_ENFORCED_POWER_PROFILE_MASK represents the enforced workload power profile mask (Blackwell and newer)
	DCGM_FI_DEV_ENFORCED_POWER_PROFILE_MASK Short = 166
	// DCGM_FI_DEV_VALID_POWER_PROFILE_MASK represents the valid workload power profile mask (Blackwell and newer)
	DCGM_FI_DEV_VALID_POWER_PROFILE_MASK Short = 167
	// DCGM_FI_DEV_FABRIC_MANAGER_STATUS is the value for fabric manager status
	DCGM_FI_DEV_FABRIC_MANAGER_STATUS Short = 170
	// DCGM_FI_DEV_FABRIC_MANAGER_ERROR_CODE is the value for fabric manager error code
	// NOTE: this is not populated unless the fabric manager completed startup
	DCGM_FI_DEV_FABRIC_MANAGER_ERROR_CODE Short = 171
	// DCGM_FI_DEV_FABRIC_CLUSTER_UUID is the value for fabric cluster UUID
	DCGM_FI_DEV_FABRIC_CLUSTER_UUID Short = 172
	// DCGM_FI_DEV_FABRIC_CLIQUE_ID is the value for fabric clique ID
	DCGM_FI_DEV_FABRIC_CLIQUE_ID Short = 173
	// DCGM_FI_DEV_PSTATE is the value for P-state
	DCGM_FI_DEV_PSTATE Short = 190
	// DCGM_FI_DEV_FAN_SPEED is the value for fan speed
	DCGM_FI_DEV_FAN_SPEED Short = 191
	// DCGM_FI_DEV_PCIE_TX_THROUGHPUT represents the PCIe transmit throughput in KB/s
	DCGM_FI_DEV_PCIE_TX_THROUGHPUT Short = 200
	// DCGM_FI_DEV_PCIE_RX_THROUGHPUT represents the PCIe receive throughput in KB/s
	DCGM_FI_DEV_PCIE_RX_THROUGHPUT Short = 201
	// DCGM_FI_DEV_PCIE_REPLAY_COUNTER represents the PCIe replay counter value
	DCGM_FI_DEV_PCIE_REPLAY_COUNTER Short = 202
	// DCGM_FI_DEV_GPU_UTIL represents the GPU utilization in percent
	DCGM_FI_DEV_GPU_UTIL Short = 203
	// DCGM_FI_DEV_MEM_COPY_UTIL represents the memory copy utilization in percent
	DCGM_FI_DEV_MEM_COPY_UTIL Short = 204
	// DCGM_FI_DEV_ACCOUNTING_DATA represents the process accounting information
	DCGM_FI_DEV_ACCOUNTING_DATA Short = 205
	// DCGM_FI_DEV_ENC_UTIL represents the encoder utilization in percent
	DCGM_FI_DEV_ENC_UTIL Short = 206
	// DCGM_FI_DEV_DEC_UTIL represents the decoder utilization in percent
	DCGM_FI_DEV_DEC_UTIL Short = 207
	// DCGM_FI_DEV_XID_ERRORS is the value for XID errors
	DCGM_FI_DEV_XID_ERRORS Short = 230
	// DCGM_FI_DEV_PCIE_MAX_LINK_GEN is the value for PCIe max link generation
	DCGM_FI_DEV_PCIE_MAX_LINK_GEN Short = 235
	// DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH is the value for PCIe max link width
	DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH Short = 236
	// DCGM_FI_DEV_PCIE_LINK_GEN is the value for PCIe link generation
	DCGM_FI_DEV_PCIE_LINK_GEN Short = 237
	// DCGM_FI_DEV_PCIE_LINK_WIDTH is the value for PCIe link width
	DCGM_FI_DEV_PCIE_LINK_WIDTH Short = 238
	// DCGM_FI_DEV_POWER_VIOLATION is the value for power violation time in microseconds
	DCGM_FI_DEV_POWER_VIOLATION Short = 240
	// DCGM_FI_DEV_THERMAL_VIOLATION is the value for thermal violation time in microseconds
	DCGM_FI_DEV_THERMAL_VIOLATION Short = 241
	// DCGM_FI_DEV_SYNC_BOOST_VIOLATION is the value for sync boost violation time in microseconds
	DCGM_FI_DEV_SYNC_BOOST_VIOLATION Short = 242
	// DCGM_FI_DEV_BOARD_LIMIT_VIOLATION is the value for board limit violation time in microseconds
	DCGM_FI_DEV_BOARD_LIMIT_VIOLATION Short = 243
	// DCGM_FI_DEV_LOW_UTIL_VIOLATION is the value for low utilization violation time in microseconds
	DCGM_FI_DEV_LOW_UTIL_VIOLATION Short = 244
	// DCGM_FI_DEV_RELIABILITY_VIOLATION is the value for reliability violation time in microseconds
	DCGM_FI_DEV_RELIABILITY_VIOLATION Short = 245
	// DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION is the value for total application clocks violation time in microseconds
	DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION Short = 246
	// DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION is the value for total base clocks violation time in microseconds
	DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION Short = 247
	// DCGM_FI_DEV_FB_TOTAL is the value for framebuffer total
	DCGM_FI_DEV_FB_TOTAL Short = 250
	// DCGM_FI_DEV_FB_FREE is the value for framebuffer free
	DCGM_FI_DEV_FB_FREE Short = 251
	// DCGM_FI_DEV_FB_USED is the value for framebuffer used
	DCGM_FI_DEV_FB_USED Short = 252
	// DCGM_FI_DEV_FB_RESERVED is the value for framebuffer reserved
	DCGM_FI_DEV_FB_RESERVED Short = 253
	// DCGM_FI_DEV_FB_USED_PERCENT is the value for framebuffer used percent
	DCGM_FI_DEV_FB_USED_PERCENT Short = 254
	// DCGM_FI_DEV_C2C_LINK_COUNT is the value for C2C link count
	DCGM_FI_DEV_C2C_LINK_COUNT Short = 285
	// DCGM_FI_DEV_C2C_LINK_STATUS is the value for C2C link status
	DCGM_FI_DEV_C2C_LINK_STATUS Short = 286
	// DCGM_FI_DEV_C2C_MAX_BANDWIDTH is the value for C2C max bandwidth
	DCGM_FI_DEV_C2C_MAX_BANDWIDTH Short = 287
	// DCGM_FI_DEV_ECC_CURRENT is the value for ECC current
	DCGM_FI_DEV_ECC_CURRENT Short = 300
	// DCGM_FI_DEV_ECC_PENDING is the value for ECC pending
	DCGM_FI_DEV_ECC_PENDING Short = 301
	// DCGM_FI_DEV_ECC_SBE_VOL_TOTAL represents the total number of single-bit ECC errors detected since the last counter reset
	DCGM_FI_DEV_ECC_SBE_VOL_TOTAL Short = 310
	// DCGM_FI_DEV_ECC_DBE_VOL_TOTAL represents the total number of double-bit ECC errors detected since the last counter reset
	DCGM_FI_DEV_ECC_DBE_VOL_TOTAL Short = 311
	// DCGM_FI_DEV_ECC_SBE_AGG_TOTAL represents the total number of single-bit ECC errors detected since the last counter reset (aggregate)
	DCGM_FI_DEV_ECC_SBE_AGG_TOTAL Short = 312
	// DCGM_FI_DEV_ECC_DBE_AGG_TOTAL represents the total number of double-bit ECC errors detected since the last counter reset (aggregate)
	DCGM_FI_DEV_ECC_DBE_AGG_TOTAL Short = 313
	// DCGM_FI_DEV_ECC_SBE_VOL_L1 represents the number of single-bit ECC errors detected in L1 cache since the last counter reset
	DCGM_FI_DEV_ECC_SBE_VOL_L1 Short = 314
	// DCGM_FI_DEV_ECC_DBE_VOL_L1 represents the number of double-bit ECC errors detected in L1 cache since the last counter reset
	DCGM_FI_DEV_ECC_DBE_VOL_L1 Short = 315
	// DCGM_FI_DEV_ECC_SBE_VOL_L2 represents the number of single-bit ECC errors detected in L2 cache since the last counter reset
	DCGM_FI_DEV_ECC_SBE_VOL_L2 Short = 316
	// DCGM_FI_DEV_ECC_DBE_VOL_L2 represents the number of double-bit ECC errors detected in L2 cache since the last counter reset
	DCGM_FI_DEV_ECC_DBE_VOL_L2 Short = 317
	// DCGM_FI_DEV_ECC_SBE_VOL_DEV represents the number of single-bit ECC errors detected in device memory since the last counter reset
	DCGM_FI_DEV_ECC_SBE_VOL_DEV Short = 318
	// DCGM_FI_DEV_ECC_DBE_VOL_DEV represents the number of double-bit ECC errors detected in device memory since the last counter reset
	DCGM_FI_DEV_ECC_DBE_VOL_DEV Short = 319
	// DCGM_FI_DEV_ECC_SBE_VOL_REG represents the number of single-bit ECC errors detected in register file since the last counter reset
	DCGM_FI_DEV_ECC_SBE_VOL_REG Short = 320
	// DCGM_FI_DEV_ECC_DBE_VOL_REG represents the number of double-bit ECC errors detected in register file since the last counter reset
	DCGM_FI_DEV_ECC_DBE_VOL_REG Short = 321
	// DCGM_FI_DEV_ECC_SBE_VOL_TEX represents the number of single-bit ECC errors detected in texture memory since the last counter reset
	DCGM_FI_DEV_ECC_SBE_VOL_TEX Short = 322
	// DCGM_FI_DEV_ECC_DBE_VOL_TEX represents the number of double-bit ECC errors detected in texture memory since the last counter reset
	DCGM_FI_DEV_ECC_DBE_VOL_TEX Short = 323
	// DCGM_FI_DEV_ECC_SBE_AGG_L1 represents the aggregate number of single-bit ECC errors detected in L1 cache
	DCGM_FI_DEV_ECC_SBE_AGG_L1 Short = 324
	// DCGM_FI_DEV_ECC_DBE_AGG_L1 represents the aggregate number of double-bit ECC errors detected in L1 cache
	DCGM_FI_DEV_ECC_DBE_AGG_L1 Short = 325
	// DCGM_FI_DEV_ECC_SBE_AGG_L2 represents the aggregate number of single-bit ECC errors detected in L2 cache
	DCGM_FI_DEV_ECC_SBE_AGG_L2 Short = 326
	// DCGM_FI_DEV_ECC_DBE_AGG_L2 represents the aggregate number of double-bit ECC errors detected in L2 cache
	DCGM_FI_DEV_ECC_DBE_AGG_L2 Short = 327
	// DCGM_FI_DEV_ECC_SBE_AGG_DEV represents the aggregate number of single-bit ECC errors detected in device memory
	DCGM_FI_DEV_ECC_SBE_AGG_DEV Short = 328
	// DCGM_FI_DEV_ECC_DBE_AGG_DEV represents the aggregate number of double-bit ECC errors detected in device memory
	DCGM_FI_DEV_ECC_DBE_AGG_DEV Short = 329
	// DCGM_FI_DEV_ECC_SBE_AGG_REG represents the aggregate number of single-bit ECC errors detected in register file
	DCGM_FI_DEV_ECC_SBE_AGG_REG Short = 330
	// DCGM_FI_DEV_ECC_DBE_AGG_REG represents the aggregate number of double-bit ECC errors detected in register file
	DCGM_FI_DEV_ECC_DBE_AGG_REG Short = 331
	// DCGM_FI_DEV_ECC_SBE_AGG_TEX represents the aggregate number of single-bit ECC errors detected in texture memory
	DCGM_FI_DEV_ECC_SBE_AGG_TEX Short = 332
	// DCGM_FI_DEV_ECC_DBE_AGG_TEX represents the aggregate number of double-bit ECC errors detected in texture memory
	DCGM_FI_DEV_ECC_DBE_AGG_TEX Short = 333
	// DCGM_FI_DEV_ECC_SBE_VOL_SHM represents the number of single-bit ECC errors detected in shared memory since the last counter reset
	DCGM_FI_DEV_ECC_SBE_VOL_SHM Short = 334
	// DCGM_FI_DEV_ECC_DBE_VOL_SHM represents the number of double-bit ECC errors detected in shared memory since the last counter reset
	DCGM_FI_DEV_ECC_DBE_VOL_SHM Short = 335
	// DCGM_FI_DEV_ECC_SBE_VOL_CBU represents the number of single-bit ECC errors detected in CBU since the last counter reset
	DCGM_FI_DEV_ECC_SBE_VOL_CBU Short = 336
	// DCGM_FI_DEV_ECC_DBE_VOL_CBU represents the number of double-bit ECC errors detected in CBU since the last counter reset
	DCGM_FI_DEV_ECC_DBE_VOL_CBU Short = 337
	// DCGM_FI_DEV_ECC_SBE_AGG_SHM represents the aggregate number of single-bit ECC errors detected in shared memory
	DCGM_FI_DEV_ECC_SBE_AGG_SHM Short = 338
	// DCGM_FI_DEV_ECC_DBE_AGG_SHM represents the aggregate number of double-bit ECC errors detected in shared memory
	DCGM_FI_DEV_ECC_DBE_AGG_SHM Short = 339
	// DCGM_FI_DEV_ECC_SBE_AGG_CBU represents the aggregate number of single-bit ECC errors detected in CBU
	DCGM_FI_DEV_ECC_SBE_AGG_CBU Short = 340
	// DCGM_FI_DEV_ECC_DBE_AGG_CBU represents the aggregate number of double-bit ECC errors detected in CBU
	DCGM_FI_DEV_ECC_DBE_AGG_CBU Short = 341
	// DCGM_FI_DEV_ECC_SBE_VOL_SRM represents the number of single-bit ECC errors detected in SRM since the last counter reset
	DCGM_FI_DEV_ECC_SBE_VOL_SRM Short = 342
	// DCGM_FI_DEV_ECC_DBE_VOL_SRM represents the number of double-bit ECC errors detected in SRM since the last counter reset
	DCGM_FI_DEV_ECC_DBE_VOL_SRM Short = 343
	// DCGM_FI_DEV_ECC_SBE_AGG_SRM represents the aggregate number of single-bit ECC errors detected in SRM
	DCGM_FI_DEV_ECC_SBE_AGG_SRM Short = 344
	// DCGM_FI_DEV_ECC_DBE_AGG_SRM represents the aggregate number of double-bit ECC errors detected in SRM
	DCGM_FI_DEV_ECC_DBE_AGG_SRM Short = 345
	// DCGM_FI_DEV_DIAG_MEMORY_RESULT is the value for the DCGM diagnostic memory result
	DCGM_FI_DEV_DIAG_MEMORY_RESULT Short = 350
	// DCGM_FI_DEV_DIAG_DIAGNOSTIC_RESULT is the value for the DCGM diagnostic test result
	DCGM_FI_DEV_DIAG_DIAGNOSTIC_RESULT Short = 351
	// DCGM_FI_DEV_DIAG_PCIE_RESULT is the value for the DCGM diagnostic PCIe result
	DCGM_FI_DEV_DIAG_PCIE_RESULT Short = 352
	// DCGM_FI_DEV_DIAG_TARGETED_STRESS_RESULT is the value for the DCGM diagnostic targeted stress result
	DCGM_FI_DEV_DIAG_TARGETED_STRESS_RESULT Short = 353
	// DCGM_FI_DEV_DIAG_TARGETED_POWER_RESULT is the value for the DCGM diagnostic targeted power result
	DCGM_FI_DEV_DIAG_TARGETED_POWER_RESULT Short = 354
	// DCGM_FI_DEV_DIAG_MEMORY_BANDWIDTH_RESULT is the value for the DCGM diagnostic memory bandwidth result
	DCGM_FI_DEV_DIAG_MEMORY_BANDWIDTH_RESULT Short = 355
	// DCGM_FI_DEV_DIAG_MEMTEST_RESULT is the value for the DCGM diagnostic memtest result
	DCGM_FI_DEV_DIAG_MEMTEST_RESULT Short = 356
	// DCGM_FI_DEV_DIAG_PULSE_TEST_RESULT is the value for the DCGM diagnostic pulse test result
	DCGM_FI_DEV_DIAG_PULSE_TEST_RESULT Short = 357
	// DCGM_FI_DEV_DIAG_EUD_RESULT is the value for the DCGM diagnostic EUD result
	DCGM_FI_DEV_DIAG_EUD_RESULT Short = 358
	// DCGM_FI_DEV_DIAG_CPU_EUD_RESULT is the value for the DCGM diagnostic CPU EUD result
	DCGM_FI_DEV_DIAG_CPU_EUD_RESULT Short = 359
	// DCGM_FI_DEV_DIAG_SOFTWARE_RESULT is the value for the DCGM diagnostic software result
	DCGM_FI_DEV_DIAG_SOFTWARE_RESULT Short = 360
	// DCGM_FI_DEV_DIAG_NVBANDWIDTH_RESULT is the value for the DCGM diagnostic NVBandwidth result
	DCGM_FI_DEV_DIAG_NVBANDWIDTH_RESULT Short = 361
	// DCGM_FI_DEV_DIAG_STATUS is the value for the DCGM diagnostic status
	DCGM_FI_DEV_DIAG_STATUS Short = 362
	// DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_MAX is the value for ECC banks remap rows avail max
	DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_MAX Short = 385
	// DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_HIGH is the value for ECC banks remap rows avail high
	DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_HIGH Short = 386
	// DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_PARTIAL is the value for ECC banks remap rows avail partial
	DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_PARTIAL Short = 387
	// DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_LOW is the value for ECC banks remap rows avail low
	DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_LOW Short = 388
	// DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_NONE is the value for ECC banks remap rows avail none
	DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_NONE Short = 389
	// DCGM_FI_DEV_RETIRED_SBE is the value for ECC retired SBE
	DCGM_FI_DEV_RETIRED_SBE Short = 390
	// DCGM_FI_DEV_RETIRED_DBE is the value for ECC retired DBE
	DCGM_FI_DEV_RETIRED_DBE Short = 391
	// DCGM_FI_DEV_RETIRED_PENDING is the value for ECC retired pending
	DCGM_FI_DEV_RETIRED_PENDING Short = 392
	// DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS is the value for ECC uncorrectable remapped rows
	DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS Short = 393
	// DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS is the value for ECC correctable remapped rows
	DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS Short = 394
	// DCGM_FI_DEV_ROW_REMAP_FAILURE is the value for ECC row remap failure
	DCGM_FI_DEV_ROW_REMAP_FAILURE Short = 395
	// DCGM_FI_DEV_ROW_REMAP_PENDING is the value for ECC row remap pending
	DCGM_FI_DEV_ROW_REMAP_PENDING Short = 396
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 is the value for NVLink CRC FLIT error count L0
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 Short = 400
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 is the value for NVLink CRC FLIT error count L1
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 Short = 401
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 is the value for NVLink CRC FLIT error count L2
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 Short = 402
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 is the value for NVLink CRC FLIT error count L3
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 Short = 403
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 is the value for NVLink CRC FLIT error count L4
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 Short = 404
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 is the value for NVLink CRC FLIT error count L5
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 Short = 405
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL is the value for NVLink CRC FLIT error count total
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL Short = 409
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 is the value for NVLink CRC DATA error count L0
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 Short = 410
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 is the value for NVLink CRC DATA error count L1
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 Short = 411
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 is the value for NVLink CRC DATA error count L2
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 Short = 412
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 is the value for NVLink CRC DATA error count L3
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 Short = 413
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 is the value for NVLink CRC DATA error count L4
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 Short = 414
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 is the value for NVLink CRC DATA error count L5
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 Short = 415
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL is the value for NVLink CRC DATA error count total
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL Short = 419
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 is the value for NVLink replay error count L0
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 Short = 420
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 is the value for NVLink replay error count L1
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 Short = 421
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 is the value for NVLink replay error count L2
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 Short = 422
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 is the value for NVLink replay error count L3
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 Short = 423
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 is the value for NVLink replay error count L4
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 Short = 424
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 is the value for NVLink replay error count L5
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 Short = 425
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL is the value for NVLink replay error count total
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL Short = 429
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 is the value for NVLink recovery error count L0
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 Short = 430
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 is the value for NVLink recovery error count L1
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 Short = 431
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 is the value for NVLink recovery error count L2
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 Short = 432
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 is the value for NVLink recovery error count L3
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 Short = 433
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 is the value for NVLink recovery error count L4
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 Short = 434
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 is the value for NVLink recovery error count L5
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 Short = 435
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL is the value for NVLink recovery error count total
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL Short = 439
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L0 is the value for NVLink bandwidth L0
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L0 Short = 440
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L1 is the value for NVLink bandwidth L1
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L1 Short = 441
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L2 is the value for NVLink bandwidth L2
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L2 Short = 442
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L3 is the value for NVLink bandwidth L3
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L3 Short = 443
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L4 is the value for NVLink bandwidth L4
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L4 Short = 444
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L5 is the value for NVLink bandwidth L5
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L5 Short = 445
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL is the value for NVLink bandwidth total
	DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL Short = 449
	// DCGM_FI_DEV_GPU_NVLINK_ERRORS is the value for GPU NVLink error information
	DCGM_FI_DEV_GPU_NVLINK_ERRORS Short = 450
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6 is the value for NVLink CRC FLIT error count L6
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6 Short = 451
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7 is the value for NVLink CRC FLIT error count L7
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7 Short = 452
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8 is the value for NVLink CRC FLIT error count L8
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8 Short = 453
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9 is the value for NVLink CRC FLIT error count L9
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9 Short = 454
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10 is the value for NVLink CRC FLIT error count L10
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10 Short = 455
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11 is the value for NVLink CRC FLIT error count L11
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11 Short = 456
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6 is the value for NVLink CRC DATA error count L6
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6 Short = 457
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7 is the value for NVLink CRC DATA error count L7
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7 Short = 458
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8 is the value for NVLink CRC DATA error count L8
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8 Short = 459
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9 is the value for NVLink CRC DATA error count L9
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9 Short = 460
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10 is the value for NVLink CRC DATA error count L10
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10 Short = 461
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11 is the value for NVLink CRC DATA error count L11
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11 Short = 462
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6 is the value for NVLink replay error count L6
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6 Short = 463
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7 is the value for NVLink replay error count L7
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7 Short = 464
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8 is the value for NVLink replay error count L8
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8 Short = 465
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9 is the value for NVLink replay error count L9
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9 Short = 466
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10 is the value for NVLink replay error count L10
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10 Short = 467
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11 is the value for NVLink replay error count L11
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11 Short = 468
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6 is the value for NVLink recovery error count L6
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6 Short = 469
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7 is the value for NVLink recovery error count L7
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7 Short = 470
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8 is the value for NVLink recovery error count L8
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8 Short = 471
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9 is the value for NVLink recovery error count L9
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9 Short = 472
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10 is the value for NVLink recovery error count L10
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10 Short = 473
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11 is the value for NVLink recovery error count L11
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11 Short = 474
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L6 is the value for NVLink bandwidth L6
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L6 Short = 475
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L7 is the value for NVLink bandwidth L7
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L7 Short = 476
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L8 is the value for NVLink bandwidth L8
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L8 Short = 477
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L9 is the value for NVLink bandwidth L9
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L9 Short = 478
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L10 is the value for NVLink bandwidth L10
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L10 Short = 479
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L11 is the value for NVLink bandwidth L11
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L11 Short = 480
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L12 is the value for NVLink CRC FLIT error count L12
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L12 Short = 406
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L13 is the value for NVLink CRC FLIT error count L13
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L13 Short = 407
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L14 is the value for NVLink CRC FLIT error count L14
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L14 Short = 408
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L15 is the value for NVLink CRC FLIT error count L15
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L15 Short = 481
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L16 is the value for NVLink CRC FLIT error count L16
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L16 Short = 482
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L17 is the value for NVLink CRC FLIT error count L17
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L17 Short = 483
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L12 is the value for NVLink CRC DATA error count L12
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L12 Short = 416
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L13 is the value for NVLink CRC DATA error count L13
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L13 Short = 417
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L14 is the value for NVLink CRC DATA error count L14
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L14 Short = 418
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L15 is the value for NVLink CRC DATA error count L15
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L15 Short = 484
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L16 is the value for NVLink CRC DATA error count L16
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L16 Short = 485
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L17 is the value for NVLink CRC DATA error count L17
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L17 Short = 486
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L12 is the value for NVLink replay error count L12
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L12 Short = 426
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L13 is the value for NVLink replay error count L13
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L13 Short = 427
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L14 is the value for NVLink replay error count L14
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L14 Short = 428
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L15 is the value for NVLink replay error count L15
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L15 Short = 487
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L16 is the value for NVLink replay error count L16
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L16 Short = 488
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L17 is the value for NVLink replay error count L17
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L17 Short = 489
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L12 is the value for NVLink recovery error count L12
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L12 Short = 436
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L13 is the value for NVLink recovery error count L13
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L13 Short = 437
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L14 is the value for NVLink recovery error count L14
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L14 Short = 438
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L15 is the value for NVLink recovery error count L15
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L15 Short = 491
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L16 is the value for NVLink recovery error count L16
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L16 Short = 492
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L17 is the value for NVLink recovery error count L17
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L17 Short = 493
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L12 is the value for NVLink bandwidth L12
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L12 Short = 446
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L13 is the value for NVLink bandwidth L13
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L13 Short = 447
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L14 is the value for NVLink bandwidth L14
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L14 Short = 448
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L15 is the value for NVLink bandwidth L15
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L15 Short = 494
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L16 is the value for NVLink bandwidth L16
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L16 Short = 495
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L17 is the value for NVLink bandwidth L17
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L17 Short = 496
	// DCGM_FI_DEV_NVLINK_ERROR_DL_CRC is the value for NVLink error DL CRC
	DCGM_FI_DEV_NVLINK_ERROR_DL_CRC Short = 497
	// DCGM_FI_DEV_NVLINK_ERROR_DL_RECOVERY is the value for NVLink error DL recovery
	DCGM_FI_DEV_NVLINK_ERROR_DL_RECOVERY Short = 498
	// DCGM_FI_DEV_NVLINK_ERROR_DL_REPLAY is the value for NVLink error DL replay
	DCGM_FI_DEV_NVLINK_ERROR_DL_REPLAY Short = 499
	// DCGM_FI_DEV_VIRTUAL_MODE is the value for virtual mode
	DCGM_FI_DEV_VIRTUAL_MODE Short = 500
	// DCGM_FI_DEV_SUPPORTED_TYPE_INFO is the value for supported type info
	DCGM_FI_DEV_SUPPORTED_TYPE_INFO Short = 501
	// DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS is the value for creatable VGPU type IDs
	DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS Short = 502
	// DCGM_FI_DEV_VGPU_INSTANCE_IDS is the value for VGPU instance IDs
	DCGM_FI_DEV_VGPU_INSTANCE_IDS Short = 503
	// DCGM_FI_DEV_VGPU_UTILIZATIONS is the value for VGPU utilizations
	DCGM_FI_DEV_VGPU_UTILIZATIONS Short = 504
	// DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION is the value for VGPU per process utilization
	DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION Short = 505
	// DCGM_FI_DEV_ENC_STATS is the value for encoder stats
	DCGM_FI_DEV_ENC_STATS Short = 506
	// DCGM_FI_DEV_FBC_STATS is the value for frame buffer capture (FBC) stats
	DCGM_FI_DEV_FBC_STATS Short = 507
	// DCGM_FI_DEV_FBC_SESSIONS_INFO is the value for frame buffer capture (FBC) sessions info
	DCGM_FI_DEV_FBC_SESSIONS_INFO Short = 508
	// DCGM_FI_DEV_SUPPORTED_VGPU_TYPE_IDS is the value for supported VGPU type IDs
	DCGM_FI_DEV_SUPPORTED_VGPU_TYPE_IDS Short = 509
	// DCGM_FI_DEV_VGPU_TYPE_INFO is the value for VGPU type info
	DCGM_FI_DEV_VGPU_TYPE_INFO Short = 510
	// DCGM_FI_DEV_VGPU_TYPE_NAME is the value for VGPU type name
	DCGM_FI_DEV_VGPU_TYPE_NAME Short = 511
	// DCGM_FI_DEV_VGPU_TYPE_CLASS is the value for VGPU type class
	DCGM_FI_DEV_VGPU_TYPE_CLASS Short = 512
	// DCGM_FI_DEV_VGPU_TYPE_LICENSE is the value for VGPU type license
	DCGM_FI_DEV_VGPU_TYPE_LICENSE Short = 513
	// DCGM_FI_DEV_VGPU_VM_ID represents the VGPU VM ID
	DCGM_FI_DEV_VGPU_VM_ID Short = 520
	// DCGM_FI_DEV_VGPU_VM_NAME represents the VGPU VM name
	DCGM_FI_DEV_VGPU_VM_NAME Short = 521
	// DCGM_FI_DEV_VGPU_TYPE represents the VGPU type
	DCGM_FI_DEV_VGPU_TYPE Short = 522
	// DCGM_FI_DEV_VGPU_UUID represents the VGPU UUID
	DCGM_FI_DEV_VGPU_UUID Short = 523
	// DCGM_FI_DEV_VGPU_DRIVER_VERSION represents the VGPU driver version
	DCGM_FI_DEV_VGPU_DRIVER_VERSION Short = 524
	// DCGM_FI_DEV_VGPU_MEMORY_USAGE represents the VGPU memory usage
	DCGM_FI_DEV_VGPU_MEMORY_USAGE Short = 525
	// DCGM_FI_DEV_VGPU_LICENSE_STATUS represents the VGPU license status
	DCGM_FI_DEV_VGPU_LICENSE_STATUS Short = 526
	// DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT represents the VGPU frame rate limit
	DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT Short = 527
	// DCGM_FI_DEV_VGPU_ENC_STATS represents the VGPU encoder statistics
	DCGM_FI_DEV_VGPU_ENC_STATS Short = 528
	// DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO represents the VGPU encoder sessions information
	DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO Short = 529
	// DCGM_FI_DEV_VGPU_FBC_STATS represents the VGPU frame buffer capture statistics
	DCGM_FI_DEV_VGPU_FBC_STATS Short = 530
	// DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO represents the VGPU frame buffer capture sessions information
	DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO Short = 531
	// DCGM_FI_DEV_VGPU_INSTANCE_LICENSE_STATE represents the VGPU instance license state
	DCGM_FI_DEV_VGPU_INSTANCE_LICENSE_STATE Short = 532
	// DCGM_FI_DEV_VGPU_PCI_ID represents the VGPU PCI ID
	DCGM_FI_DEV_VGPU_PCI_ID Short = 533
	// DCGM_FI_DEV_VGPU_VM_GPU_INSTANCE_ID represents the VGPU VM GPU instance ID
	DCGM_FI_DEV_VGPU_VM_GPU_INSTANCE_ID Short = 534
	// DCGM_FI_FIRST_VGPU_FIELD_ID is the value for the first VGPU field ID
	DCGM_FI_FIRST_VGPU_FIELD_ID Short = 520
	// DCGM_FI_LAST_VGPU_FIELD_ID is the value for the last VGPU field ID
	DCGM_FI_LAST_VGPU_FIELD_ID Short = 570
	// DCGM_FI_DEV_PLATFORM_INFINIBAND_GUID is the value for platform InfiniBand GUID
	DCGM_FI_DEV_PLATFORM_INFINIBAND_GUID Short = 571
	// DCGM_FI_DEV_PLATFORM_CHASSIS_SERIAL_NUMBER is the value for platform chassis serial number
	DCGM_FI_DEV_PLATFORM_CHASSIS_SERIAL_NUMBER Short = 572
	// DCGM_FI_DEV_PLATFORM_CHASSIS_SLOT_NUMBER is the value for platform chassis slot number
	DCGM_FI_DEV_PLATFORM_CHASSIS_SLOT_NUMBER Short = 573
	// DCGM_FI_DEV_PLATFORM_TRAY_INDEX is the value for platform tray index
	DCGM_FI_DEV_PLATFORM_TRAY_INDEX Short = 574
	// DCGM_FI_DEV_PLATFORM_HOST_ID is the value for platform host ID
	DCGM_FI_DEV_PLATFORM_HOST_ID Short = 575
	// DCGM_FI_DEV_PLATFORM_PEER_TYPE is the value for platform peer type
	DCGM_FI_DEV_PLATFORM_PEER_TYPE Short = 576
	// DCGM_FI_DEV_PLATFORM_MODULE_ID is the value for platform module ID
	DCGM_FI_DEV_PLATFORM_MODULE_ID Short = 577
	// DCGM_FI_FIRST_NVSWITCH_FIELD_ID is the value for the first NVSwitch field ID
	DCGM_FI_FIRST_NVSWITCH_FIELD_ID Short = 700
	// DCGM_FI_DEV_NVSWITCH_VOLTAGE_MVOLT represents the NVSwitch voltage in millivolts
	DCGM_FI_DEV_NVSWITCH_VOLTAGE_MVOLT Short = 701
	// DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ represents the NVSwitch IDDQ current
	DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ Short = 702
	// DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_REV represents the NVSwitch IDDQ current revision
	DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_REV Short = 703
	// DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_DVDD represents the NVSwitch IDDQ current for DVDD
	DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_DVDD Short = 704
	// DCGM_FI_DEV_NVSWITCH_POWER_VDD represents the NVSwitch VDD power consumption in watts
	DCGM_FI_DEV_NVSWITCH_POWER_VDD Short = 705
	// DCGM_FI_DEV_NVSWITCH_POWER_DVDD represents the NVSwitch DVDD power consumption in watts
	DCGM_FI_DEV_NVSWITCH_POWER_DVDD Short = 706
	// DCGM_FI_DEV_NVSWITCH_POWER_HVDD represents the NVSwitch HVDD power consumption in watts
	DCGM_FI_DEV_NVSWITCH_POWER_HVDD Short = 707
	// DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_TX represents the NVSwitch Tx Throughput Counter for ports 0-17 in KB/s
	DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_TX Short = 780
	// DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_RX represents the NVSwitch Rx Throughput Counter for ports 0-17 in KB/s
	DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_RX Short = 781
	// DCGM_FI_DEV_NVSWITCH_LINK_FATAL_ERRORS represents the number of fatal errors for ports 0-17
	DCGM_FI_DEV_NVSWITCH_LINK_FATAL_ERRORS Short = 782
	// DCGM_FI_DEV_NVSWITCH_LINK_NON_FATAL_ERRORS represents the number of non-fatal errors for ports 0-17
	DCGM_FI_DEV_NVSWITCH_LINK_NON_FATAL_ERRORS Short = 783
	// DCGM_FI_DEV_NVSWITCH_LINK_REPLAY_ERRORS represents the number of replay errors for ports 0-17
	DCGM_FI_DEV_NVSWITCH_LINK_REPLAY_ERRORS Short = 784
	// DCGM_FI_DEV_NVSWITCH_LINK_RECOVERY_ERRORS represents the number of recovery errors for ports 0-17
	DCGM_FI_DEV_NVSWITCH_LINK_RECOVERY_ERRORS Short = 785
	// DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS represents the number of FLIT errors for ports 0-17
	DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS Short = 786
	// DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS represents the number of CRC errors for ports 0-17
	DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS Short = 787
	// DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS represents the number of ECC errors for ports 0-17
	DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS Short = 788
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC0 is the value for Nvlink lane latency low lane0 counter
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC0 Short = 789
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC1 is the value for Nvlink lane latency low lane1 counter
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC1 Short = 790
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC2 is the value for Nvlink lane latency low lane2 counter
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC2 Short = 791
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC3 is the value for Nvlink lane latency low lane3 counter
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC3 Short = 792
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC0 is the value for Nvlink lane latency medium lane0 counter
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC0 Short = 793
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC1 is the value for Nvlink lane latency medium lane1 counter
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC1 Short = 794
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC2 is the value for Nvlink lane latency medium lane2 counter
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC2 Short = 795
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC3 is the value for Nvlink lane latency medium lane3 counter
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC3 Short = 796
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC0 is the value for Nvlink lane latency high lane0 counter
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC0 Short = 797
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC1 is the value for Nvlink lane latency high lane1 counter
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC1 Short = 798
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC2 is the value for Nvlink lane latency high lane2 counter
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC2 Short = 799
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC3 is the value for Nvlink lane latency high lane3 counter
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC3 Short = 800
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC0 is the value for Nvlink lane latency panic lane0 counter
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC0 Short = 801
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC1 is the value for Nvlink lane latency panic lane1 counter
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC1 Short = 802
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC2 is the value for Nvlink lane latency panic lane2 counter
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC2 Short = 803
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC3 is the value for Nvlink lane latency panic lane3 counter
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC3 Short = 804
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC0 represents the latency counter for virtual channel 0 on the NVSwitch link
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC0 Short = 805
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC1 represents the latency counter for virtual channel 1 on the NVSwitch link
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC1 Short = 806
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC2 represents the latency counter for virtual channel 2 on the NVSwitch link
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC2 Short = 807
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC3 represents the latency counter for virtual channel 3 on the NVSwitch link
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC3 Short = 808
	// DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE0 represents the number of CRC errors on lane 0 on ports 0-17
	DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE0 Short = 809
	// DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE1 represents the number of CRC errors on lane 1 on ports 0-17
	DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE1 Short = 810
	// DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE2 represents the number of CRC errors on lane 2 on ports 0-17
	DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE2 Short = 811
	// DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE3 represents the number of CRC errors on lane 3 on ports 0-17
	DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE3 Short = 812
	// DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE0 represents the number of ECC errors on lane 0 on ports 0-17
	DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE0 Short = 813
	// DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE1 represents the number of ECC errors on lane 1 on ports 0-17
	DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE1 Short = 814
	// DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE2 represents the number of ECC errors on lane 2 on ports 0-17
	DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE2 Short = 815
	// DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE3 represents the number of ECC errors on lane 3 on ports 0-17
	DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE3 Short = 816
	// DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE4 represents the number of CRC errors on lane 4 on ports 0-17
	DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE4 Short = 817
	// DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE5 represents the number of CRC errors on lane 5 on ports 0-17
	DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE5 Short = 818
	// DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE6 represents the number of CRC errors on lane 6 on ports 0-17
	DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE6 Short = 819
	// DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE7 represents the number of CRC errors on lane 7 on ports 0-17
	DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE7 Short = 820
	// DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE4 represents the number of ECC errors on lane 4 on ports 0-17
	DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE4 Short = 821
	// DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE5 represents the number of ECC errors on lane 5 on ports 0-17
	DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE5 Short = 822
	// DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE6 represents the number of ECC errors on lane 6 on ports 0-17
	DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE6 Short = 823
	// DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE7 represents the number of ECC errors on lane 7 on ports 0-17
	DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE7 Short = 824
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L0 represents the transmit bandwidth for NVLink lane 0 in KB/s
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L0 Short = 825
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L1 represents the transmit bandwidth for NVLink lane 1 in KB/s
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L1 Short = 826
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L2 represents the transmit bandwidth for NVLink lane 2 in KB/s
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L2 Short = 827
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L3 represents the transmit bandwidth for NVLink lane 3 in KB/s
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L3 Short = 828
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L4 represents the transmit bandwidth for NVLink lane 4 in KB/s
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L4 Short = 829
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L5 represents the transmit bandwidth for NVLink lane 5 in KB/s
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L5 Short = 830
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L6 represents the transmit bandwidth for NVLink lane 6 in KB/s
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L6 Short = 831
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L7 represents the transmit bandwidth for NVLink lane 7 in KB/s
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L7 Short = 832
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L8 represents the transmit bandwidth for NVLink lane 8 in KB/s
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L8 Short = 833
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L9 represents the transmit bandwidth for NVLink lane 9 in KB/s
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L9 Short = 834
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L10 represents the transmit bandwidth for NVLink lane 10 in KB/s
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L10 Short = 835
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L11 represents the transmit bandwidth for NVLink lane 11 in KB/s
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L11 Short = 836
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L12 represents the NV Link TX Bandwidth Counter for Lane 12
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L12 Short = 837
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L13 represents the NV Link TX Bandwidth Counter for Lane 13
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L13 Short = 838
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L14 represents the NV Link TX Bandwidth Counter for Lane 14
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L14 Short = 839
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L15 represents the NV Link TX Bandwidth Counter for Lane 15
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L15 Short = 840
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L16 represents the NV Link TX Bandwidth Counter for Lane 16
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L16 Short = 841
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L17 represents the NV Link TX Bandwidth Counter for Lane 17
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L17 Short = 842
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_TOTAL represents the NV Link Bandwidth Counter total for all TX Lanes
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_TOTAL Short = 843
	// DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS represents the NVSwitch fatal error information.
	// Note: value field indicates the specific SXid reported
	DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS Short = 856
	// DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS represents the NVSwitch non fatal error information.
	DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS Short = 857
	// DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT represents the NVSwitch current temperature.
	DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT Short = 858
	// DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SLOWDOWN represents the NVSwitch limit slowdown temperature
	DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SLOWDOWN Short = 859
	// DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SHUTDOWN represents the NVSwitch limit shutdown temperature
	DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SHUTDOWN Short = 860
	// DCGM_FI_DEV_NVSWITCH_THROUGHPUT_TX represents the NVSwitch throughput Tx
	DCGM_FI_DEV_NVSWITCH_THROUGHPUT_TX Short = 861
	// DCGM_FI_DEV_NVSWITCH_THROUGHPUT_RX represents the NVSwitch throughput Rx
	DCGM_FI_DEV_NVSWITCH_THROUGHPUT_RX Short = 862
	// DCGM_FI_DEV_NVSWITCH_PHYS_ID represents the NVSwitch physical ID
	DCGM_FI_DEV_NVSWITCH_PHYS_ID Short = 863
	// DCGM_FI_DEV_NVSWITCH_RESET_REQUIRED represents the NVSwitch reset required
	DCGM_FI_DEV_NVSWITCH_RESET_REQUIRED Short = 864
	// DCGM_FI_DEV_NVSWITCH_LINK_ID represents the NVSwitch link ID
	DCGM_FI_DEV_NVSWITCH_LINK_ID Short = 865
	// DCGM_FI_DEV_NVSWITCH_PCIE_DOMAIN represents the NVSwitch PCIe domain
	DCGM_FI_DEV_NVSWITCH_PCIE_DOMAIN Short = 866
	// DCGM_FI_DEV_NVSWITCH_PCIE_BUS represents the NVSwitch PCIe bus
	DCGM_FI_DEV_NVSWITCH_PCIE_BUS Short = 867
	// DCGM_FI_DEV_NVSWITCH_PCIE_DEVICE represents the NVSwitch PCIe device
	DCGM_FI_DEV_NVSWITCH_PCIE_DEVICE Short = 868
	// DCGM_FI_DEV_NVSWITCH_PCIE_FUNCTION represents the NVSwitch PCIe function
	DCGM_FI_DEV_NVSWITCH_PCIE_FUNCTION Short = 869
	// DCGM_FI_DEV_NVSWITCH_LINK_STATUS represents the NVSwitch link status (UNKNOWN: -1, OFF: 0, SAFE: 1, ACTIVE: 2, ERROR: 3)
	DCGM_FI_DEV_NVSWITCH_LINK_STATUS Short = 870
	// DCGM_FI_DEV_NVSWITCH_LINK_TYPE represents the NVSwitch link type GPU/Switch
	DCGM_FI_DEV_NVSWITCH_LINK_TYPE Short = 871
	// DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DOMAIN represents the NVSwitch remote PCIe domain
	DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DOMAIN Short = 872
	// DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_BUS represents the NVSwitch remote PCIe bus
	DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_BUS Short = 873
	// DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DEVICE represents the NVSwitch remote PCIe device
	DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DEVICE Short = 874
	// DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_FUNCTION represents the NVSwitch remote PCIe function
	DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_FUNCTION Short = 875
	// DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_ID represents the NVSwitch link device link ID
	DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_ID Short = 876
	// DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_SID represents the NVSwitch link device link SID
	DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_SID Short = 877
	// DCGM_FI_DEV_NVSWITCH_DEVICE_UUID represents the NVSwitch device UUID
	DCGM_FI_DEV_NVSWITCH_DEVICE_UUID Short = 878
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L0 represents the receive bandwidth for NVLink lane 0 in KB/s
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L0 Short = 879
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L1 represents the receive bandwidth for NVLink lane 1 in KB/s
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L1 Short = 880
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L2 represents the receive bandwidth for NVLink lane 2 in KB/s
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L2 Short = 881
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L3 represents the receive bandwidth for NVLink lane 3 in KB/s
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L3 Short = 882
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L4 represents the receive bandwidth for NVLink lane 4 in KB/s
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L4 Short = 883
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L5 represents the receive bandwidth for NVLink lane 5 in KB/s
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L5 Short = 884
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L6 represents the receive bandwidth for NVLink lane 6 in KB/s
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L6 Short = 885
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L7 represents the receive bandwidth for NVLink lane 7 in KB/s
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L7 Short = 886
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L8 represents the receive bandwidth for NVLink lane 8 in KB/s
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L8 Short = 887
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L9 represents the receive bandwidth for NVLink lane 9 in KB/s
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L9 Short = 888
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L10 represents the receive bandwidth for NVLink lane 10 in KB/s
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L10 Short = 889
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L11 represents the receive bandwidth for NVLink lane 11 in KB/s
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L11 Short = 890
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L12 represents the receive bandwidth for NVLink lane 12 in KB/s
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L12 Short = 891
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L13 represents the receive bandwidth for NVLink lane 13 in KB/s
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L13 Short = 892
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L14 represents the receive bandwidth for NVLink lane 14 in KB/s
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L14 Short = 893
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L15 represents the receive bandwidth for NVLink lane 15 in KB/s
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L15 Short = 894
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L16 represents the receive bandwidth for NVLink lane 16 in KB/s
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L16 Short = 895
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L17 represents the receive bandwidth for NVLink lane 17 in KB/s
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L17 Short = 896
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_TOTAL represents the total receive bandwidth for all NVLink lanes in KB/s
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_TOTAL Short = 897

	// DCGM_FI_PROF_GR_ENGINE_ACTIVE represents the percentage of time the graphics engine was active
	DCGM_FI_PROF_GR_ENGINE_ACTIVE Short = 1001
	// DCGM_FI_PROF_SM_ACTIVE represents the percentage of time the streaming multiprocessors (SM) were active
	DCGM_FI_PROF_SM_ACTIVE Short = 1002
	// DCGM_FI_PROF_SM_OCCUPANCY represents the percentage of streaming multiprocessors (SM) warps residency
	DCGM_FI_PROF_SM_OCCUPANCY Short = 1003
	// DCGM_FI_PROF_PIPE_TENSOR_ACTIVE represents the percentage of time the tensor (HMMA) pipe was active
	DCGM_FI_PROF_PIPE_TENSOR_ACTIVE Short = 1004
	// DCGM_FI_PROF_DRAM_ACTIVE represents the percentage of time the device memory interface was active
	DCGM_FI_PROF_DRAM_ACTIVE Short = 1005
	// DCGM_FI_PROF_PIPE_FP64_ACTIVE represents the percentage of time the FP64 pipe was active
	DCGM_FI_PROF_PIPE_FP64_ACTIVE Short = 1006
	// DCGM_FI_PROF_PIPE_FP32_ACTIVE represents the percentage of time the FP32 pipe was active
	DCGM_FI_PROF_PIPE_FP32_ACTIVE Short = 1007
	// DCGM_FI_PROF_PIPE_FP16_ACTIVE represents the percentage of time the FP16 pipe was active
	DCGM_FI_PROF_PIPE_FP16_ACTIVE Short = 1008
	// DCGM_FI_PROF_PCIE_TX_BYTES represents the number of bytes transmitted through PCIe TX (in bytes)
	DCGM_FI_PROF_PCIE_TX_BYTES Short = 1009
	// DCGM_FI_PROF_PCIE_RX_BYTES represents the number of bytes received through PCIe RX (in bytes)
	DCGM_FI_PROF_PCIE_RX_BYTES Short = 1010
	// DCGM_FI_PROF_NVLINK_TX_BYTES represents the number of bytes transmitted through NVLink TX (in bytes)
	DCGM_FI_PROF_NVLINK_TX_BYTES Short = 1011
	// DCGM_FI_PROF_NVLINK_RX_BYTES represents the number of bytes received through NVLink RX (in bytes)
	DCGM_FI_PROF_NVLINK_RX_BYTES Short = 1012
	// DCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE represents the percentage of time the IMMA tensor pipe was active
	DCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE Short = 1013
	// DCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE represents the percentage of time the HMMA tensor pipe was active
	DCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE Short = 1014
	// DCGM_FI_PROF_PIPE_TENSOR_DFMA_ACTIVE represents the percentage of time the DFMA tensor pipe was active
	DCGM_FI_PROF_PIPE_TENSOR_DFMA_ACTIVE Short = 1015
	// DCGM_FI_PROF_PIPE_INT_ACTIVE represents the ratio of cycles the integer pipe is active
	DCGM_FI_PROF_PIPE_INT_ACTIVE Short = 1016
	// DCGM_FI_PROF_NVDEC0_ACTIVE represents the ratio of cycles the NVDEC engine 0 is active
	DCGM_FI_PROF_NVDEC0_ACTIVE Short = 1017
	// DCGM_FI_PROF_NVDEC1_ACTIVE represents the ratio of cycles the NVDEC engine 1 is active
	DCGM_FI_PROF_NVDEC1_ACTIVE Short = 1018
	// DCGM_FI_PROF_NVDEC2_ACTIVE represents the ratio of cycles the NVDEC engine 2 is active
	DCGM_FI_PROF_NVDEC2_ACTIVE Short = 1019
	// DCGM_FI_PROF_NVDEC3_ACTIVE represents the ratio of cycles the NVDEC engine 3 is active
	DCGM_FI_PROF_NVDEC3_ACTIVE Short = 1020
	// DCGM_FI_PROF_NVDEC4_ACTIVE represents the ratio of cycles the NVDEC engine 4 is active
	DCGM_FI_PROF_NVDEC4_ACTIVE Short = 1021
	// DCGM_FI_PROF_NVDEC5_ACTIVE represents the ratio of cycles the NVDEC engine 5 is active
	DCGM_FI_PROF_NVDEC5_ACTIVE Short = 1022
	// DCGM_FI_PROF_NVDEC6_ACTIVE represents the ratio of cycles the NVDEC engine 6 is active
	DCGM_FI_PROF_NVDEC6_ACTIVE Short = 1023
	// DCGM_FI_PROF_NVDEC7_ACTIVE represents the ratio of cycles the NVDEC engine 7 is active
	DCGM_FI_PROF_NVDEC7_ACTIVE Short = 1024

	// DCGM_FI_PROF_NVJPG0_ACTIVE represents the ratio of cycles the NVJPG engine 0 is active
	DCGM_FI_PROF_NVJPG0_ACTIVE Short = 1025
	// DCGM_FI_PROF_NVJPG1_ACTIVE represents the ratio of cycles the NVJPG engine 1 is active
	DCGM_FI_PROF_NVJPG1_ACTIVE Short = 1026
	// DCGM_FI_PROF_NVJPG2_ACTIVE represents the ratio of cycles the NVJPG engine 2 is active
	DCGM_FI_PROF_NVJPG2_ACTIVE Short = 1027
	// DCGM_FI_PROF_NVJPG3_ACTIVE represents the ratio of cycles the NVJPG engine 3 is active
	DCGM_FI_PROF_NVJPG3_ACTIVE Short = 1028
	// DCGM_FI_PROF_NVJPG4_ACTIVE represents the ratio of cycles the NVJPG engine 4 is active
	DCGM_FI_PROF_NVJPG4_ACTIVE Short = 1029
	// DCGM_FI_PROF_NVJPG5_ACTIVE represents the ratio of cycles the NVJPG engine 5 is active
	DCGM_FI_PROF_NVJPG5_ACTIVE Short = 1030
	// DCGM_FI_PROF_NVJPG6_ACTIVE represents the ratio of cycles the NVJPG engine 6 is active
	DCGM_FI_PROF_NVJPG6_ACTIVE Short = 1031
	// DCGM_FI_PROF_NVJPG7_ACTIVE represents the ratio of cycles the NVJPG engine 7 is active
	DCGM_FI_PROF_NVJPG7_ACTIVE Short = 1032

	// DCGM_FI_PROF_NVOFA0_ACTIVE represents the ratio of cycles the NVOFA engine 0 is active
	DCGM_FI_PROF_NVOFA0_ACTIVE Short = 1033
	// DCGM_FI_PROF_NVOFA1_ACTIVE represents the ratio of cycles the NVOFA engine 1 is active
	DCGM_FI_PROF_NVOFA1_ACTIVE Short = 1034

	// DCGM_FI_PROF_NVLINK_L0_TX_BYTES represents the number of bytes transmitted through NVLink lane 0 in KB/s
	DCGM_FI_PROF_NVLINK_L0_TX_BYTES Short = 1040
	// DCGM_FI_PROF_NVLINK_L0_RX_BYTES represents the number of bytes received through NVLink lane 0 in KB/s
	DCGM_FI_PROF_NVLINK_L0_RX_BYTES Short = 1041
	// DCGM_FI_PROF_NVLINK_L1_TX_BYTES represents the number of bytes transmitted through NVLink lane 1 in KB/s
	DCGM_FI_PROF_NVLINK_L1_TX_BYTES Short = 1042
	// DCGM_FI_PROF_NVLINK_L1_RX_BYTES represents the number of bytes received through NVLink lane 1 in KB/s
	DCGM_FI_PROF_NVLINK_L1_RX_BYTES Short = 1043
	// DCGM_FI_PROF_NVLINK_L2_TX_BYTES represents the number of bytes transmitted through NVLink lane 2 in KB/s
	DCGM_FI_PROF_NVLINK_L2_TX_BYTES Short = 1044
	// DCGM_FI_PROF_NVLINK_L2_RX_BYTES represents the number of bytes received through NVLink lane 2 in KB/s
	DCGM_FI_PROF_NVLINK_L2_RX_BYTES Short = 1045
	// DCGM_FI_PROF_NVLINK_L3_TX_BYTES represents the number of bytes transmitted through NVLink lane 3 in KB/s
	DCGM_FI_PROF_NVLINK_L3_TX_BYTES Short = 1046
	// DCGM_FI_PROF_NVLINK_L3_RX_BYTES represents the number of bytes received through NVLink lane 3 in KB/s
	DCGM_FI_PROF_NVLINK_L3_RX_BYTES Short = 1047
	// DCGM_FI_PROF_NVLINK_L4_TX_BYTES represents the number of bytes transmitted through NVLink lane 4 in KB/s
	DCGM_FI_PROF_NVLINK_L4_TX_BYTES Short = 1048
	// DCGM_FI_PROF_NVLINK_L4_RX_BYTES represents the number of bytes received through NVLink lane 4 in KB/s
	DCGM_FI_PROF_NVLINK_L4_RX_BYTES Short = 1049
	// DCGM_FI_PROF_NVLINK_L5_TX_BYTES represents the number of bytes transmitted through NVLink lane 5 in KB/s
	DCGM_FI_PROF_NVLINK_L5_TX_BYTES Short = 1050
	// DCGM_FI_PROF_NVLINK_L5_RX_BYTES represents the number of bytes received through NVLink lane 5 in KB/s
	DCGM_FI_PROF_NVLINK_L5_RX_BYTES Short = 1051
	// DCGM_FI_PROF_NVLINK_L6_TX_BYTES represents the number of bytes transmitted through NVLink lane 6 in KB/s
	DCGM_FI_PROF_NVLINK_L6_TX_BYTES Short = 1052
	// DCGM_FI_PROF_NVLINK_L6_RX_BYTES represents the number of bytes received through NVLink lane 6 in KB/s
	DCGM_FI_PROF_NVLINK_L6_RX_BYTES Short = 1053
	// DCGM_FI_PROF_NVLINK_L7_TX_BYTES represents the number of bytes transmitted through NVLink lane 7 in KB/s
	DCGM_FI_PROF_NVLINK_L7_TX_BYTES Short = 1054
	// DCGM_FI_PROF_NVLINK_L7_RX_BYTES represents the number of bytes received through NVLink lane 7 in KB/s
	DCGM_FI_PROF_NVLINK_L7_RX_BYTES Short = 1055
	// DCGM_FI_PROF_NVLINK_L8_TX_BYTES represents the number of bytes transmitted through NVLink lane 8 in KB/s
	DCGM_FI_PROF_NVLINK_L8_TX_BYTES Short = 1056
	// DCGM_FI_PROF_NVLINK_L8_RX_BYTES represents the number of bytes received through NVLink lane 8 in KB/s
	DCGM_FI_PROF_NVLINK_L8_RX_BYTES Short = 1057
	// DCGM_FI_PROF_NVLINK_L9_TX_BYTES represents the number of bytes transmitted through NVLink lane 9 in KB/s
	DCGM_FI_PROF_NVLINK_L9_TX_BYTES Short = 1058
	// DCGM_FI_PROF_NVLINK_L9_RX_BYTES represents the number of bytes received through NVLink lane 9 in KB/s
	DCGM_FI_PROF_NVLINK_L9_RX_BYTES Short = 1059
	// DCGM_FI_PROF_NVLINK_L10_TX_BYTES represents the number of bytes transmitted through NVLink lane 10 in KB/s
	DCGM_FI_PROF_NVLINK_L10_TX_BYTES Short = 1060
	// DCGM_FI_PROF_NVLINK_L10_RX_BYTES represents the number of bytes received through NVLink lane 10 in KB/s
	DCGM_FI_PROF_NVLINK_L10_RX_BYTES Short = 1061
	// DCGM_FI_PROF_NVLINK_L11_TX_BYTES represents the number of bytes transmitted through NVLink lane 11 in KB/s
	DCGM_FI_PROF_NVLINK_L11_TX_BYTES Short = 1062
	// DCGM_FI_PROF_NVLINK_L11_RX_BYTES represents the number of bytes received through NVLink lane 11 in KB/s
	DCGM_FI_PROF_NVLINK_L11_RX_BYTES Short = 1063
	// DCGM_FI_PROF_NVLINK_L12_TX_BYTES represents the number of bytes transmitted through NVLink lane 12 in KB/s
	DCGM_FI_PROF_NVLINK_L12_TX_BYTES Short = 1064
	// DCGM_FI_PROF_NVLINK_L12_RX_BYTES represents the number of bytes received through NVLink lane 12 in KB/s
	DCGM_FI_PROF_NVLINK_L12_RX_BYTES Short = 1065
	// DCGM_FI_PROF_NVLINK_L13_TX_BYTES represents the number of bytes transmitted through NVLink lane 13 in KB/s
	DCGM_FI_PROF_NVLINK_L13_TX_BYTES Short = 1066
	// DCGM_FI_PROF_NVLINK_L13_RX_BYTES represents the number of bytes received through NVLink lane 13 in KB/s
	DCGM_FI_PROF_NVLINK_L13_RX_BYTES Short = 1067
	// DCGM_FI_PROF_NVLINK_L14_TX_BYTES represents the number of bytes transmitted through NVLink lane 14 in KB/s
	DCGM_FI_PROF_NVLINK_L14_TX_BYTES Short = 1068
	// DCGM_FI_PROF_NVLINK_L14_RX_BYTES represents the number of bytes received through NVLink lane 14 in KB/s
	DCGM_FI_PROF_NVLINK_L14_RX_BYTES Short = 1069
	// DCGM_FI_PROF_NVLINK_L15_TX_BYTES represents the number of bytes transmitted through NVLink lane 15 in KB/s
	DCGM_FI_PROF_NVLINK_L15_TX_BYTES Short = 1070
	// DCGM_FI_PROF_NVLINK_L15_RX_BYTES represents the number of bytes received through NVLink lane 15 in KB/s
	DCGM_FI_PROF_NVLINK_L15_RX_BYTES Short = 1071
	// DCGM_FI_PROF_NVLINK_L16_TX_BYTES represents the number of bytes transmitted through NVLink lane 16 in KB/s
	DCGM_FI_PROF_NVLINK_L16_TX_BYTES Short = 1072
	// DCGM_FI_PROF_C2C_TX_ALL_BYTES represents C2C (Chip-to-Chip) interface metric
	DCGM_FI_PROF_C2C_TX_ALL_BYTES Short = 1076
	// DCGM_FI_PROF_C2C_TX_DATA_BYTES represents C2C (Chip-to-Chip) interface metric
	DCGM_FI_PROF_C2C_TX_DATA_BYTES Short = 1077
	// DCGM_FI_PROF_C2C_RX_ALL_BYTES represents C2C (Chip-to-Chip) interface metric
	DCGM_FI_PROF_C2C_RX_ALL_BYTES Short = 1078
	// DCGM_FI_PROF_C2C_RX_DATA_BYTES represents C2C (Chip-to-Chip) interface metric
	DCGM_FI_PROF_C2C_RX_DATA_BYTES Short = 1079

	// DCGM_FI_DEV_CPU_UTIL_TOTAL represents the total CPU utilization
	DCGM_FI_DEV_CPU_UTIL_TOTAL Short = 1100
	// DCGM_FI_DEV_CPU_UTIL_USER represents the CPU utilization, user
	DCGM_FI_DEV_CPU_UTIL_USER Short = 1101
	// DCGM_FI_DEV_CPU_UTIL_NICE represents the CPU utilization, nice
	DCGM_FI_DEV_CPU_UTIL_NICE Short = 1102
	// DCGM_FI_DEV_CPU_UTIL_SYS represents the CPU utilization, system time
	DCGM_FI_DEV_CPU_UTIL_SYS Short = 1103
	// DCGM_FI_DEV_CPU_UTIL_IRQ represents the CPU utilization, interrupt servicing
	DCGM_FI_DEV_CPU_UTIL_IRQ Short = 1104
	// DCGM_FI_DEV_CPU_TEMP_CURRENT represents the current CPU temperature in degrees Celsius
	DCGM_FI_DEV_CPU_TEMP_CURRENT Short = 1110
	// DCGM_FI_DEV_CPU_TEMP_WARNING represents the CPU temperature warning threshold in degrees Celsius
	DCGM_FI_DEV_CPU_TEMP_WARNING Short = 1111
	// DCGM_FI_DEV_CPU_TEMP_SHUTDOWN represents the CPU temperature shutdown threshold in degrees Celsius
	DCGM_FI_DEV_CPU_TEMP_SHUTDOWN Short = 1112
	// DCGM_FI_DEV_CPU_CLOCK_CURRENT represents the current CPU clock frequency in MHz
	DCGM_FI_DEV_CPU_CLOCK_CURRENT Short = 1120
	// DCGM_FI_DEV_CPU_POWER_CURRENT represents the current CPU power usage
	DCGM_FI_DEV_CPU_POWER_CURRENT Short = 1130
	// DCGM_FI_DEV_CPU_POWER_LIMIT represents the CPU power limit
	DCGM_FI_DEV_CPU_POWER_LIMIT Short = 1131
	// DCGM_FI_DEV_SYSIO_POWER_UTIL_CURRENT represents the SoC power utilization
	DCGM_FI_DEV_SYSIO_POWER_UTIL_CURRENT Short = 1132
	// DCGM_FI_DEV_MODULE_POWER_UTIL_CURRENT represents the Module power utilization
	DCGM_FI_DEV_MODULE_POWER_UTIL_CURRENT Short = 1133
	// DCGM_FI_DEV_CPU_VENDOR represents the CPU vendor
	DCGM_FI_DEV_CPU_VENDOR Short = 1140
	// DCGM_FI_DEV_CPU_MODEL represents the CPU model
	DCGM_FI_DEV_CPU_MODEL Short = 1141
	// DCGM_FI_DEV_NVLINK_COUNT_TX_PACKETS represents the NVLink transmitted packet count
	DCGM_FI_DEV_NVLINK_COUNT_TX_PACKETS Short = 1200
	// DCGM_FI_DEV_NVLINK_COUNT_TX_BYTES represents the NVLink transmitted byte count
	DCGM_FI_DEV_NVLINK_COUNT_TX_BYTES Short = 1201
	// DCGM_FI_DEV_NVLINK_COUNT_RX_PACKETS represents the NVLink received packet count
	DCGM_FI_DEV_NVLINK_COUNT_RX_PACKETS Short = 1202
	// DCGM_FI_DEV_NVLINK_COUNT_RX_BYTES represents the NVLink received byte count
	DCGM_FI_DEV_NVLINK_COUNT_RX_BYTES Short = 1203
	// DCGM_FI_DEV_NVLINK_COUNT_RX_MALFORMED_PACKET_ERRORS represents the NVLink count of malformed-packet errors on receive
	DCGM_FI_DEV_NVLINK_COUNT_RX_MALFORMED_PACKET_ERRORS Short = 1204
	// DCGM_FI_DEV_NVLINK_COUNT_RX_BUFFER_OVERRUN_ERRORS represents the NVLink count of buffer-overrun errors on receive
	DCGM_FI_DEV_NVLINK_COUNT_RX_BUFFER_OVERRUN_ERRORS Short = 1205
	// DCGM_FI_DEV_NVLINK_COUNT_RX_ERRORS represents the NVLink count of receive errors
	DCGM_FI_DEV_NVLINK_COUNT_RX_ERRORS Short = 1206
	// DCGM_FI_DEV_NVLINK_COUNT_RX_REMOTE_ERRORS represents the NVLink count of remote errors on receive
	DCGM_FI_DEV_NVLINK_COUNT_RX_REMOTE_ERRORS Short = 1207
	// DCGM_FI_DEV_NVLINK_COUNT_RX_GENERAL_ERRORS represents the NVLink count of general errors on receive
	DCGM_FI_DEV_NVLINK_COUNT_RX_GENERAL_ERRORS Short = 1208
	// DCGM_FI_DEV_NVLINK_COUNT_LOCAL_LINK_INTEGRITY_ERRORS represents the NVLink count of local link integrity errors
	DCGM_FI_DEV_NVLINK_COUNT_LOCAL_LINK_INTEGRITY_ERRORS Short = 1209
	// DCGM_FI_DEV_NVLINK_COUNT_TX_DISCARDS represents the NVLink count of transmit discards
	DCGM_FI_DEV_NVLINK_COUNT_TX_DISCARDS Short = 1210
	// DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_SUCCESSFUL_EVENTS represents the NVLink count of successful link recovery events
	DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_SUCCESSFUL_EVENTS Short = 1211
	// DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_FAILED_EVENTS represents the NVLink count of failed link recovery events
	DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_FAILED_EVENTS Short = 1212
	// DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_EVENTS represents the NVLink count of link recovery events
	DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_EVENTS Short = 1213
	// DCGM_FI_DEV_NVLINK_COUNT_RX_SYMBOL_ERRORS represents the NVLink count of symbol errors on receive
	DCGM_FI_DEV_NVLINK_COUNT_RX_SYMBOL_ERRORS Short = 1214
	// DCGM_FI_DEV_NVLINK_COUNT_SYMBOL_BER represents the NVLink symbol bit error rate (BER)
	DCGM_FI_DEV_NVLINK_COUNT_SYMBOL_BER Short = 1215
	// DCGM_FI_DEV_CONNECTX_HEALTH represents a health state of ConnectX
	DCGM_FI_DEV_CONNECTX_HEALTH Short = 1300
	// DCGM_FI_DEV_CONNECTX_ACTIVE_PCIE_LINK_WIDTH is the value of an active PCIe link width
	DCGM_FI_DEV_CONNECTX_ACTIVE_PCIE_LINK_WIDTH Short = 1301
	// DCGM_FI_DEV_CONNECTX_ACTIVE_PCIE_LINK_SPEED is the value of an active PCIe link speed
	DCGM_FI_DEV_CONNECTX_ACTIVE_PCIE_LINK_SPEED Short = 1302
	// DCGM_FI_DEV_CONNECTX_EXPECT_PCIE_LINK_WIDTH is the value of an expected PCIe link width
	DCGM_FI_DEV_CONNECTX_EXPECT_PCIE_LINK_WIDTH Short = 1303
	// DCGM_FI_DEV_CONNECTX_EXPECT_PCIE_LINK_SPEED is the value of an expected PCIe link speed
	DCGM_FI_DEV_CONNECTX_EXPECT_PCIE_LINK_SPEED Short = 1304
	// DCGM_FI_DEV_CONNECTX_CORRECTABLE_ERR_STATUS is the value of a correctable error status
	DCGM_FI_DEV_CONNECTX_CORRECTABLE_ERR_STATUS Short = 1305
	// DCGM_FI_DEV_CONNECTX_CORRECTABLE_ERR_MASK is the value of a correctable error mask
	DCGM_FI_DEV_CONNECTX_CORRECTABLE_ERR_MASK Short = 1306
	// DCGM_FI_DEV_CONNECTX_UNCORRECTABLE_ERR_STATUS is the value of an uncorrectable error status
	DCGM_FI_DEV_CONNECTX_UNCORRECTABLE_ERR_STATUS Short = 1307
	// DCGM_FI_DEV_CONNECTX_UNCORRECTABLE_ERR_MASK is the value of an uncorrectable error mask
	DCGM_FI_DEV_CONNECTX_UNCORRECTABLE_ERR_MASK Short = 1308
	// DCGM_FI_DEV_CONNECTX_UNCORRECTABLE_ERR_SEVERITY is the value of an uncorrectable error severity
	DCGM_FI_DEV_CONNECTX_UNCORRECTABLE_ERR_SEVERITY Short = 1309
	// DCGM_FI_DEV_CONNECTX_DEVICE_TEMPERATURE is the value of a device temperature
	DCGM_FI_DEV_CONNECTX_DEVICE_TEMPERATURE Short = 1310
	// DCGM_FI_DEV_LAST_CONNECTX_FIELD_ID represents the last field ID for ConnectX fields
	DCGM_FI_DEV_LAST_CONNECTX_FIELD_ID Short = 1399
	// DCGM_FI_MAX_FIELDS represents 1 greater than the maximum field ID above; that is, 1 greater than the maximum field ID that could be allocated
	DCGM_FI_MAX_FIELDS Short = 1311
)

func GetFieldID

func GetFieldID(fieldName string) (Short, bool)

GetFieldID returns the DCGM field ID for a given field name and whether it was found. It first checks the current field IDs, then falls back to legacy field IDs if not found.

func GetFieldIDOrPanic

func GetFieldIDOrPanic(fieldName string) Short

GetFieldIDOrPanic returns the DCGM field ID for a given field name. It panics if the field name is not found in either the current or the legacy map.

type Status

// Status reports the current resource utilization of the DCGM hostengine
// process.
type Status struct {
	// Memory represents the current memory usage of the DCGM hostengine in kilobytes
	Memory int64
	// CPU represents the current CPU utilization of the DCGM hostengine as a percentage (0-100)
	CPU float64
}

Status represents the current resource utilization of the DCGM hostengine process

func Introspect

func Introspect() (Status, error)

Introspect returns memory and CPU usage statistics for the DCGM hostengine

type SystemWatch

// SystemWatch describes one health watch system together with its status
// and, when that status is not healthy, the associated error message.
type SystemWatch struct {
	// Type identifies the type of health watch system
	Type string
	// Status indicates the current health status
	Status string
	// Error contains any error message if status is not healthy
	Error string
}

SystemWatch represents a health watch system and its status

type ThermalPolicyCondition

// ThermalPolicyCondition carries the details of a thermal violation.
type ThermalPolicyCondition struct {
	// ThermalViolation indicates the severity of the thermal violation
	ThermalViolation uint
}

ThermalPolicyCondition contains details about a thermal violation

type Time

// Time is a Unix timestamp expressed in seconds.
type Time uint64

Time represents a Unix timestamp in seconds

func (Time) String

func (t Time) String() string

String returns a human-readable string representation of the timestamp. Returns "Running" if the timestamp is 0, otherwise returns the formatted time.

type UtilizationInfo

// UtilizationInfo holds GPU utilization metrics; every field is a percentage.
type UtilizationInfo struct {
	GPU     int64 // GPU utilization, in percent
	Memory  int64 // memory utilization, in percent
	Encoder int64 // encoder utilization, in percent
	Decoder int64 // decoder utilization, in percent
}

UtilizationInfo contains GPU utilization metrics

type ViolationTime

// ViolationTime breaks down, per throttling cause, the time the GPU spent
// throttled.
//
// NOTE(review): every field is a pointer; presumably nil means the value is
// unavailable — confirm against the code that populates this struct.
type ViolationTime struct {
	// Power is time spent throttling due to power constraints
	Power *uint64
	// Thermal is time spent throttling due to thermal constraints
	Thermal *uint64
	// Reliability is time spent throttling due to reliability constraints
	Reliability *uint64
	// BoardLimit is time spent throttling due to board limit constraints
	BoardLimit *uint64
	// LowUtilization is time spent throttling due to low utilization
	LowUtilization *uint64
	// SyncBoost is time spent throttling due to sync boost
	SyncBoost *uint64
}

ViolationTime measures the amount of time (in ms) the GPU was at reduced clocks

type XIDErrorInfo

// XIDErrorInfo summarizes XID errors: how many occurred and when.
//
// NOTE(review): presumably len(Timestamp) equals NumErrors — confirm against
// the code that populates this struct.
type XIDErrorInfo struct {
	// NumErrors is the number of XID errors that occurred
	NumErrors int
	// Timestamp contains the timestamps of when XID errors occurred
	Timestamp []uint64
}

XIDErrorInfo contains information about XID errors

type XidPolicyCondition

// XidPolicyCondition carries the details of an XID error.
type XidPolicyCondition struct {
	// ErrNum is the XID error number
	ErrNum uint
}

XidPolicyCondition contains details about an XID error

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL