Documentation
¶
Overview ¶
Package dcgm provides bindings for NVIDIA's Data Center GPU Manager (DCGM).
Index ¶
- Constants
- Variables
- func AddEntityToGroup(groupID GroupHandle, entityGroupID Field_Entity_Group, entityID uint) (err error)
- func AddLinkEntityToGroup(groupID GroupHandle, index, parentID uint) (err error)
- func AddToGroup(groupID GroupHandle, gpuID uint) (err error)
- func CreateFakeEntities(entities []MigHierarchyInfo) ([]uint, error)
- func DestroyGroup(groupID GroupHandle) (err error)
- func FieldGroupDestroy(fieldsGroup FieldHandle) (err error)
- func FieldsInit() int
- func FieldsTerm() int
- func FindFirstNonAsciiIndex(value [4096]byte) int
- func Fv2_Blob(fv FieldValue_v2) [4096]byte
- func Fv2_String(fv FieldValue_v2) string
- func GetAllDeviceCount() (uint, error)
- func GetEntityGroupEntities(entityGroup Field_Entity_Group) ([]uint, error)
- func GetSupportedDevices() ([]uint, error)
- func HealthSet(groupID GroupHandle, systems HealthSystem) (err error)
- func Init(m mode, args ...string) (cleanup func(), err error)
- func InjectFieldValue(gpu uint, fieldID Short, fieldType uint, status int, ts int64, value any) error
- func IsCurrentField(fieldName string) bool
- func IsInt32Blank(value int) bool
- func IsInt64Blank(value int64) bool
- func IsLegacyField(fieldName string) bool
- func ListenForPolicyViolations(ctx context.Context, typ ...policyCondition) (<-chan PolicyViolation, error)
- func ListenForPolicyViolationsForGroup(ctx context.Context, group GroupHandle, typ ...policyCondition) (<-chan PolicyViolation, error)
- func Shutdown() (err error)
- func UpdateAllFields() error
- func ViolationRegistration(data unsafe.Pointer) int
- func WatchFieldsWithGroup(fieldsGroup FieldHandle, group GroupHandle) error
- func WatchFieldsWithGroupEx(fieldsGroup FieldHandle, group GroupHandle, updateFreq int64, ...) error
- type CPUHierarchyCPU_v1
- type CPUHierarchy_v1
- type ClockInfo
- type DbePolicyCondition
- type Device
- type DeviceHealth
- type DeviceIdentifiers
- type DeviceStatus
- type DiagErrorDetail
- type DiagResult
- type DiagResults
- type DiagType
- type ECCErrorsInfo
- type Error
- type FieldHandle
- type FieldMeta
- type FieldValue_v1
- type FieldValue_v2
- type Field_Entity_Group
- type GroupEntityPair
- type GroupHandle
- func CreateGroup(groupName string) (goGroupId GroupHandle, err error)
- func CreateGroupWithContext(ctx context.Context, groupName string) (GroupHandle, error)
- func GroupAllGPUs() GroupHandle
- func NewDefaultGroup(groupName string) (GroupHandle, error)
- func WatchFields(gpuID uint, fieldsGroup FieldHandle, groupName string) (groupId GroupHandle, err error)
- func WatchPidFields() (GroupHandle, error)
- func WatchPidFieldsEx(updateFreq, maxKeepAge time.Duration, maxKeepSamples int, gpus ...uint) (GroupHandle, error)
- type GroupInfo
- type HealthCheckErrorCode
- type HealthResponse
- type HealthResult
- type HealthSystem
- type Incident
- type Link_State
- type MemoryInfo
- type MetricGroup
- type MigEntityInfo
- type MigHierarchyInfo
- type MigHierarchyInfo_v2
- type MigHierarchy_v2
- type MigProfile
- type NvLinkStatus
- type NvlinkPolicyCondition
- type P2PLink
- type P2PLinkType
- type PCIInfo
- type PCIStatusInfo
- type PCIThroughputInfo
- type PciPolicyCondition
- type PerfState
- type PolicyViolation
- type PowerPolicyCondition
- type ProcessInfo
- type ProcessUtilInfo
- type RetiredPagesPolicyCondition
- type Short
- type Status
- type SystemWatch
- type ThermalPolicyCondition
- type Time
- type UtilizationInfo
- type ViolationTime
- type XIDErrorInfo
- type XidPolicyCondition
Constants ¶
const ( Embedded mode = iota Standalone StartHostengine )
Constants for the DCGM hostengine running modes: Embedded, Standalone, or StartHostengine.
const ( // DCGM_FT_BINARY is the type for binary data DCGM_FT_BINARY = uint('b') // DCGM_FT_DOUBLE is the type for floating-point numbers DCGM_FT_DOUBLE = uint('d') // DCGM_FT_INT64 is the type for 64-bit integers DCGM_FT_INT64 = uint('i') // DCGM_FT_STRING is the type for strings DCGM_FT_STRING = uint('s') // DCGM_FT_TIMESTAMP is the type for timestamps DCGM_FT_TIMESTAMP = uint('t') // DCGM_FT_INT32_BLANK is the blank value for 32-bit integers DCGM_FT_INT32_BLANK = int64(2147483632) // DCGM_FT_INT32_NOT_FOUND is the value for not found in 32-bit integers DCGM_FT_INT32_NOT_FOUND = DCGM_FT_INT32_BLANK + 1 // DCGM_FT_INT32_NOT_SUPPORTED is the value for not supported in 32-bit integers DCGM_FT_INT32_NOT_SUPPORTED = DCGM_FT_INT32_BLANK + 2 // DCGM_FT_INT32_NOT_PERMISSIONED is the value for not permissioned in 32-bit integers DCGM_FT_INT32_NOT_PERMISSIONED = DCGM_FT_INT32_BLANK + 3 // DCGM_FT_INT64_BLANK is the blank value for 64-bit integers DCGM_FT_INT64_BLANK = int64(9223372036854775792) // DCGM_FT_INT64_NOT_FOUND is the value for not found in 64-bit integers DCGM_FT_INT64_NOT_FOUND = DCGM_FT_INT64_BLANK + 1 // DCGM_FT_INT64_NOT_SUPPORTED is the value for not supported in 64-bit integers DCGM_FT_INT64_NOT_SUPPORTED = DCGM_FT_INT64_BLANK + 2 // DCGM_FT_INT64_NOT_PERMISSIONED is the value for not permissioned in 64-bit integers DCGM_FT_INT64_NOT_PERMISSIONED = DCGM_FT_INT64_BLANK + 3 // DCGM_FT_FP64_BLANK is the blank value for floating-point numbers DCGM_FT_FP64_BLANK = 140737488355328.0 // DCGM_FT_FP64_NOT_FOUND is the value for not found in floating-point numbers DCGM_FT_FP64_NOT_FOUND = float64(DCGM_FT_FP64_BLANK + 1.0) // DCGM_FT_FP64_NOT_SUPPORTED is the value for not supported in floating-point numbers DCGM_FT_FP64_NOT_SUPPORTED = float64(DCGM_FT_FP64_BLANK + 2.0) // DCGM_FT_FP64_NOT_PERMISSIONED is the value for not permissioned in floating-point numbers DCGM_FT_FP64_NOT_PERMISSIONED = float64(DCGM_FT_FP64_BLANK + 3.0) // DCGM_FT_STR_BLANK is the blank value 
for strings DCGM_FT_STR_BLANK = "<<<NULL>>>" // DCGM_FT_STR_NOT_FOUND is the value for not found in strings DCGM_FT_STR_NOT_FOUND = "<<<NOT_FOUND>>>" // DCGM_FT_STR_NOT_SUPPORTED is the value for not supported in strings DCGM_FT_STR_NOT_SUPPORTED = "<<<NOT_SUPPORTED>>>" // DCGM_FT_STR_NOT_PERMISSIONED is the value for not permissioned in strings DCGM_FT_STR_NOT_PERMISSIONED = "<<<NOT_PERMISSIONED>>>" // DCGM_ST_OK is the value for ECC OK DCGM_ST_OK = 0 // DCGM_ST_BADPARAM is the value for ECC BAD PARAM DCGM_ST_BADPARAM = -1 // DCGM_ST_GENERIC_ERROR is the value for ECC GENERIC ERROR DCGM_ST_GENERIC_ERROR = -3 // DCGM_ST_MEMORY is the value for ECC MEMORY DCGM_ST_MEMORY = -4 // DCGM_ST_NOT_CONFIGURED is the value for ECC NOT CONFIGURED DCGM_ST_NOT_CONFIGURED = -5 // DCGM_ST_NOT_SUPPORTED is the value for ECC NOT SUPPORTED DCGM_ST_NOT_SUPPORTED = -6 // DCGM_ST_INIT_ERROR is the value for ECC INIT ERROR DCGM_ST_INIT_ERROR = -7 // DCGM_ST_NVML_ERROR is the value for ECC NVML ERROR DCGM_ST_NVML_ERROR = -8 // DCGM_ST_PENDING is the value for ECC PENDING DCGM_ST_PENDING = -9 // DCGM_ST_TIMEOUT is the value for ECC TIMEOUT DCGM_ST_TIMEOUT = -11 // DCGM_ST_VER_MISMATCH is the value for ECC VER MISMATCH DCGM_ST_VER_MISMATCH = -12 // DCGM_ST_UNKNOWN_FIELD is the value for ECC UNKNOWN FIELD DCGM_ST_UNKNOWN_FIELD = -13 // DCGM_ST_NO_DATA is the value for ECC NO DATA DCGM_ST_NO_DATA = -14 // DCGM_ST_STALE_DATA is the value for ECC STALE DATA DCGM_ST_STALE_DATA = -15 // DCGM_ST_NOT_WATCHED is the value for ECC NOT WATCHED DCGM_ST_NOT_WATCHED = -16 // DCGM_ST_NO_PERMISSION is the value for ECC NO PERMISSION DCGM_ST_NO_PERMISSION = -17 // DCGM_ST_GPU_IS_LOST is the value for ECC GPU IS LOST DCGM_ST_GPU_IS_LOST = -18 // DCGM_ST_RESET_REQUIRED is the value for ECC RESET REQUIRED DCGM_ST_RESET_REQUIRED = -19 // DCGM_ST_FUNCTION_NOT_FOUND is the value for ECC FUNCTION NOT FOUND DCGM_ST_FUNCTION_NOT_FOUND = -20 // DCGM_ST_CONNECTION_NOT_VALID is the value for ECC CONNECTION NOT VALID 
DCGM_ST_CONNECTION_NOT_VALID = -21 // DCGM_ST_GPU_NOT_SUPPORTED is the value for ECC GPU NOT SUPPORTED DCGM_ST_GPU_NOT_SUPPORTED = -22 // DCGM_ST_GROUP_INCOMPATIBLE is the value for ECC GROUP INCOMPATIBLE DCGM_ST_GROUP_INCOMPATIBLE = -23 // DCGM_ST_MAX_LIMIT is the value for ECC MAX LIMIT DCGM_ST_MAX_LIMIT = -24 // DCGM_ST_LIBRARY_NOT_FOUND is the value for ECC LIBRARY NOT FOUND DCGM_ST_LIBRARY_NOT_FOUND = -25 // DCGM_ST_DUPLICATE_KEY is the value for ECC DUPLICATE KEY DCGM_ST_DUPLICATE_KEY = -26 // DCGM_ST_GPU_IN_SYNC_BOOST_GROUP is the value for ECC GPU IN SYNC BOOST GROUP DCGM_ST_GPU_IN_SYNC_BOOST_GROUP = -27 // DCGM_ST_GPU_NOT_IN_SYNC_BOOST_GROUP is the value for ECC GPU NOT IN SYNC BOOST GROUP DCGM_ST_GPU_NOT_IN_SYNC_BOOST_GROUP = -28 // DCGM_ST_REQUIRES_ROOT is the value for ECC REQUIRES ROOT DCGM_ST_REQUIRES_ROOT = -29 // DCGM_ST_NVVS_ERROR is the value for ECC NVVS ERROR DCGM_ST_NVVS_ERROR = -30 // DCGM_ST_INSUFFICIENT_SIZE is the value for ECC INSUFFICIENT SIZE DCGM_ST_INSUFFICIENT_SIZE = -31 // DCGM_ST_FIELD_UNSUPPORTED_BY_API is the value for ECC FIELD UNSUPPORTED BY API DCGM_ST_FIELD_UNSUPPORTED_BY_API = -32 // DCGM_ST_MODULE_NOT_LOADED is the value for ECC MODULE NOT LOADED DCGM_ST_MODULE_NOT_LOADED = -33 // DCGM_ST_IN_USE is the value for ECC IN USE DCGM_ST_IN_USE = -34 // DCGM_ST_GROUP_IS_EMPTY is the value for ECC GROUP IS EMPTY DCGM_ST_GROUP_IS_EMPTY = -35 // DCGM_ST_PROFILING_NOT_SUPPORTED is the value for ECC PROFILING NOT SUPPORTED DCGM_ST_PROFILING_NOT_SUPPORTED = -36 // DCGM_ST_PROFILING_LIBRARY_ERROR is the value for ECC PROFILING LIBRARY ERROR DCGM_ST_PROFILING_LIBRARY_ERROR = -37 // DCGM_ST_PROFILING_MULTI_PASS is the value for ECC PROFILING MULTI PASS DCGM_ST_PROFILING_MULTI_PASS = -38 // DCGM_ST_DIAG_ALREADY_RUNNING is the value for ECC DIAG ALREADY RUNNING DCGM_ST_DIAG_ALREADY_RUNNING = -39 // DCGM_ST_DIAG_BAD_JSON is the value for ECC DIAG BAD JSON DCGM_ST_DIAG_BAD_JSON = -40 // DCGM_ST_DIAG_BAD_LAUNCH is the value for ECC DIAG BAD 
LAUNCH DCGM_ST_DIAG_BAD_LAUNCH = -41 // DCGM_ST_DIAG_UNUSED is the value for ECC DIAG UNUSED DCGM_ST_DIAG_UNUSED = -42 // DCGM_ST_DIAG_THRESHOLD_EXCEEDED is the value for ECC DIAG THRESHOLD EXCEEDED DCGM_ST_DIAG_THRESHOLD_EXCEEDED = -43 // DCGM_ST_INSUFFICIENT_DRIVER_VERSION is the value for ECC INSUFFICIENT DRIVER VERSION DCGM_ST_INSUFFICIENT_DRIVER_VERSION = -44 // DCGM_ST_INSTANCE_NOT_FOUND is the value for ECC INSTANCE NOT FOUND DCGM_ST_INSTANCE_NOT_FOUND = -45 // DCGM_ST_COMPUTE_INSTANCE_NOT_FOUND is the value for ECC COMPUTE INSTANCE NOT FOUND DCGM_ST_COMPUTE_INSTANCE_NOT_FOUND = -46 // DCGM_ST_CHILD_NOT_KILLED is the value for ECC CHILD NOT KILLED DCGM_ST_CHILD_NOT_KILLED = -47 // DCGM_ST_3RD_PARTY_LIBRARY_ERROR is the value for ECC 3RD PARTY LIBRARY ERROR DCGM_ST_3RD_PARTY_LIBRARY_ERROR = -48 // DCGM_ST_INSUFFICIENT_RESOURCES is the value for ECC INSUFFICIENT RESOURCES DCGM_ST_INSUFFICIENT_RESOURCES = -49 // DCGM_ST_PLUGIN_EXCEPTION is the value for ECC PLUGIN EXCEPTION DCGM_ST_PLUGIN_EXCEPTION = -50 // DCGM_ST_NVVS_ISOLATE_ERROR is the value for ECC NVVS ISOLATE ERROR DCGM_ST_NVVS_ISOLATE_ERROR = -51 // DCGM_ST_NVVS_BINARY_NOT_FOUND is the value for ECC NVVS BINARY NOT FOUND DCGM_ST_NVVS_BINARY_NOT_FOUND = -52 // DCGM_ST_NVVS_KILLED is the value for ECC NVVS KILLED DCGM_ST_NVVS_KILLED = -53 // DCGM_ST_PAUSED is the value for ECC PAUSED DCGM_ST_PAUSED = -54 // DCGM_ST_ALREADY_INITIALIZED is the value for ECC ALREADY INITIALIZED DCGM_ST_ALREADY_INITIALIZED = -55 // DCGM_ST_NVML_NOT_LOADED is the value for ECC NVML NOT LOADED DCGM_ST_NVML_NOT_LOADED = -56 // DCGM_ST_NVML_DRIVER_TIMEOUT is the value for ECC NVML DRIVER TIMEOUT DCGM_ST_NVML_DRIVER_TIMEOUT = -57 // DCGM_ST_NVVS_NO_AVAILABLE_TEST is the value for ECC NVVS NO AVAILABLE TEST DCGM_ST_NVVS_NO_AVAILABLE_TEST = -58 )
FieldType constants
const ( // MAX_NUM_CPU_CORES represents the maximum number of CPU cores supported MAX_NUM_CPU_CORES = uint(C.DCGM_MAX_NUM_CPU_CORES) // MAX_NUM_CPUS represents the maximum number of CPUs supported MAX_NUM_CPUS = uint(C.DCGM_MAX_NUM_CPUS) // CHAR_BIT represents the number of bits in a byte CHAR_BIT = uint(C.CHAR_BIT) // MAX_CPU_CORE_BITMASK_COUNT represents the maximum count of CPU core bitmasks MAX_CPU_CORE_BITMASK_COUNT = uint(1024 / 8 / 8) )
const ( // PerfStateMax represents the highest performance state (P0) PerfStateMax = 0 // PerfStateMin represents the lowest performance state (P15) PerfStateMin = 15 // PerfStateUnknown represents an unknown performance state PerfStateUnknown = 32 )
const ( // MAX_NUM_DEVICES represents the maximum number of GPU devices supported MAX_NUM_DEVICES = uint(C.DCGM_MAX_NUM_DEVICES) // MAX_HIERARCHY_INFO represents the maximum size of the MIG hierarchy information MAX_HIERARCHY_INFO = uint(C.DCGM_MAX_HIERARCHY_INFO) )
const ( // DbePolicy represents a Double-bit ECC error policy condition DbePolicy = policyCondition("Double-bit ECC error") // PCIePolicy represents a PCI error policy condition PCIePolicy = policyCondition("PCI error") // MaxRtPgPolicy represents a Maximum Retired Pages Limit policy condition MaxRtPgPolicy = policyCondition("Max Retired Pages Limit") // ThermalPolicy represents a Thermal Limit policy condition ThermalPolicy = policyCondition("Thermal Limit") // PowerPolicy represents a Power Limit policy condition PowerPolicy = policyCondition("Power Limit") // NvlinkPolicy represents an NVLink error policy condition NvlinkPolicy = policyCondition("Nvlink Error") // XidPolicy represents an XID error policy condition XidPolicy = policyCondition("XID Error") )
Policy condition types
const ( // DCGM_NVSDM_MOCK_YAML is the environment variable that enables the NVSDM mock configuration DCGM_NVSDM_MOCK_YAML = "DCGM_NVSDM_MOCK_YAML" // DCGM_DBG_FILE is the environment variable that makes DCGM write debug logs to a specific file DCGM_DBG_FILE = "__DCGM_DBG_FILE" // DCGM_DBG_LVL is the environment variable that controls the DCGM logging level DCGM_DBG_LVL = "__DCGM_DBG_LVL" )
const (
DCGM_FV_FLAG_LIVE_DATA = uint(0x00000001)
)
DCGM_FV_FLAG_LIVE_DATA is a flag for the DCGM fields.
const (
DCGM_GROUP_MAX_ENTITIES int = C.DCGM_GROUP_MAX_ENTITIES_V2
)
DCGM_GROUP_MAX_ENTITIES represents the maximum number of entities allowed in a group
const DIAG_RESULT_STRING_SIZE = 1024
DIAG_RESULT_STRING_SIZE represents the maximum size of diagnostic result strings
Variables ¶
var ErrInvalidMode = errors.New("invalid mode")
ErrInvalidMode represents an error indicating that an invalid mode was used
Functions ¶
func AddEntityToGroup ¶
func AddEntityToGroup(groupID GroupHandle, entityGroupID Field_Entity_Group, entityID uint) (err error)
AddEntityToGroup adds an entity to an existing group
func AddLinkEntityToGroup ¶
func AddLinkEntityToGroup(groupID GroupHandle, index, parentID uint) (err error)
AddLinkEntityToGroup adds a link entity to the group
func AddToGroup ¶
func AddToGroup(groupID GroupHandle, gpuID uint) (err error)
AddToGroup adds a GPU to an existing group
func CreateFakeEntities ¶
func CreateFakeEntities(entities []MigHierarchyInfo) ([]uint, error)
CreateFakeEntities creates test entities with the specified MIG hierarchy information. This function is intended for testing purposes only. Returns a slice of Entity IDs for the created entities and any error encountered.
func DestroyGroup ¶
func DestroyGroup(groupID GroupHandle) (err error)
DestroyGroup destroys an existing GPU group
func FieldGroupDestroy ¶
func FieldGroupDestroy(fieldsGroup FieldHandle) (err error)
FieldGroupDestroy destroys a previously created field group. Returns an error if the group cannot be destroyed.
func FieldsInit ¶
func FieldsInit() int
FieldsInit initializes the DCGM fields module. Returns an integer status code.
func FieldsTerm ¶
func FieldsTerm() int
FieldsTerm terminates the DCGM fields module. Returns an integer status code.
func FindFirstNonAsciiIndex ¶
FindFirstNonAsciiIndex returns the index of the first non-ASCII character in the byte array. Returns 4096 if no non-ASCII character is found.
func Fv2_Blob ¶
func Fv2_Blob(fv FieldValue_v2) [4096]byte
Fv2_Blob returns the raw field value of a FieldValue_v2 as a byte array.
func Fv2_String ¶
func Fv2_String(fv FieldValue_v2) string
Fv2_String returns the string value of a FieldValue_v2.
func GetAllDeviceCount ¶
GetAllDeviceCount returns the count of all GPUs in the system
func GetEntityGroupEntities ¶
func GetEntityGroupEntities(entityGroup Field_Entity_Group) ([]uint, error)
GetEntityGroupEntities returns all entities of the specified group type
func GetSupportedDevices ¶
GetSupportedDevices returns a list of DCGM-supported GPU IDs
func HealthSet ¶
func HealthSet(groupID GroupHandle, systems HealthSystem) (err error)
HealthSet enables the DCGM health check system for the given systems. It configures which health watch systems should be monitored for the specified group.
func Init ¶
Init starts DCGM in the specified mode. Mode can be: - Embedded: Start hostengine within this process; - Standalone: Connect to an already running nv-hostengine; - StartHostengine: Start and connect to nv-hostengine, terminate before exiting. Returns a cleanup function and any error encountered.
func InjectFieldValue ¶
func InjectFieldValue(gpu uint, fieldID Short, fieldType uint, status int, ts int64, value any) error
InjectFieldValue injects a test value for a specific field into DCGM's field manager. This function is intended for testing purposes only.
Parameters:
- gpu: The GPU ID to inject the field value for
- fieldID: The DCGM field identifier
- fieldType: The type of the field (e.g., DCGM_FT_INT64, DCGM_FT_DOUBLE)
- status: The status code for the field
- ts: The timestamp for the field value
- value: The value to inject (must match fieldType)
Returns an error if the injection fails
func IsCurrentField ¶
IsCurrentField returns true if the given field name is a current field
func IsInt32Blank ¶
IsInt32Blank checks if an integer value represents DCGM's "blank" or sentinel value (0x7ffffff0). These values indicate that no valid data is available for the field.
func IsInt64Blank ¶
IsInt64Blank checks if an integer value represents DCGM's "blank" or sentinel value (0x7ffffffffffffff0). These values indicate that no valid data is available for the field.
func IsLegacyField ¶
IsLegacyField returns true if the given field name is a legacy field
func ListenForPolicyViolations ¶
func ListenForPolicyViolations(ctx context.Context, typ ...policyCondition) (<-chan PolicyViolation, error)
ListenForPolicyViolations sets up monitoring for the specified policy conditions on all GPUs. Returns a channel that receives policy violations and any error encountered.
func ListenForPolicyViolationsForGroup ¶
func ListenForPolicyViolationsForGroup(ctx context.Context, group GroupHandle, typ ...policyCondition) (<-chan PolicyViolation, error)
ListenForPolicyViolationsForGroup sets up policy monitoring for the specified GPU group. Returns a channel that receives policy violations and any error encountered.
func Shutdown ¶
func Shutdown() (err error)
Shutdown stops DCGM and destroys all connections. Returns an error if DCGM is not initialized.
func UpdateAllFields ¶
func UpdateAllFields() error
UpdateAllFields forces an update of all field values. Returns an error if the update fails.
func ViolationRegistration ¶
ViolationRegistration is a Go callback function for dcgmPolicyRegister(), wrapped in C.violationNotify().
func WatchFieldsWithGroup ¶
func WatchFieldsWithGroup(fieldsGroup FieldHandle, group GroupHandle) error
WatchFieldsWithGroup starts monitoring fields using default parameters. fieldsGroup is the handle of the field group to watch. group is the group handle to associate with the watch. Returns an error if the watch operation fails.
func WatchFieldsWithGroupEx ¶
func WatchFieldsWithGroupEx( fieldsGroup FieldHandle, group GroupHandle, updateFreq int64, maxKeepAge float64, maxKeepSamples int32, ) error
WatchFieldsWithGroupEx starts monitoring fields with custom parameters. fieldsGroup is the handle of the field group to watch. group is the group handle to associate with the watch. updateFreq is the update frequency in microseconds. maxKeepAge is the maximum age of samples to keep in seconds. maxKeepSamples is the maximum number of samples to keep. Returns an error if the watch operation fails.
Types ¶
type CPUHierarchyCPU_v1 ¶
type CPUHierarchyCPU_v1 struct { // CPUID is the unique identifier for this CPU CPUID uint // OwnedCores is a bitmask array representing the cores owned by this CPU OwnedCores []uint64 }
CPUHierarchyCPU_v1 represents information about a single CPU and its owned cores
type CPUHierarchy_v1 ¶
type CPUHierarchy_v1 struct { // Version is the version number of the hierarchy structure Version uint // NumCPUs is the number of CPUs in the system NumCPUs uint // CPUs contains information about each CPU in the system CPUs [MAX_NUM_CPUS]CPUHierarchyCPU_v1 }
CPUHierarchy_v1 represents version 1 of the CPU hierarchy information
func GetCPUHierarchy ¶
func GetCPUHierarchy() (hierarchy CPUHierarchy_v1, err error)
GetCPUHierarchy retrieves the CPU hierarchy information from DCGM
type DbePolicyCondition ¶
type DbePolicyCondition struct { // Location specifies where the ECC error occurred Location string // NumErrors indicates the number of errors detected NumErrors uint }
DbePolicyCondition contains details about a Double-bit ECC error
type Device ¶
type Device struct { GPU uint DCGMSupported string UUID string Power uint // W PCI PCIInfo Identifiers DeviceIdentifiers Topology []P2PLink CPUAffinity string }
Device represents a GPU device and its properties
func GetDeviceInfo ¶
GetDeviceInfo returns detailed information about the specified GPU
type DeviceHealth ¶
type DeviceHealth struct { // GPU is the ID of the GPU device GPU uint // Status indicates the overall health status of the GPU Status string // Watches contains the status of individual health watch systems Watches []SystemWatch }
DeviceHealth represents the health status of a GPU device
func HealthCheckByGpuId ¶
func HealthCheckByGpuId(gpuID uint) (DeviceHealth, error)
HealthCheckByGpuId performs a health check on the specified GPU
type DeviceIdentifiers ¶
type DeviceIdentifiers struct { Brand string Model string Serial string Vbios string InforomImageVersion string DriverVersion string }
DeviceIdentifiers contains various identification information for a GPU device
type DeviceStatus ¶
type DeviceStatus struct { Power float64 // W Temperature int64 // °C Utilization UtilizationInfo Memory MemoryInfo Clocks ClockInfo PCI PCIStatusInfo Performance PerfState FanSpeed int64 // % }
DeviceStatus contains comprehensive GPU device status information
func GetDeviceStatus ¶
func GetDeviceStatus(gpuID uint) (DeviceStatus, error)
GetDeviceStatus returns current status information about the specified GPU
type DiagErrorDetail ¶
type DiagErrorDetail struct { // Message contains a human-readable description of the error Message string // Code identifies the specific type of error Code HealthCheckErrorCode }
DiagErrorDetail contains detailed information about a health check error
type DiagResult ¶
type DiagResult struct { // Status indicates the test result: "pass", "fail", "warn", "skip", or "notrun" Status string // TestName is the name of the diagnostic test that was run TestName string // TestOutput contains any additional output or messages from the test TestOutput string // ErrorCode is the numeric error code if the test failed ErrorCode uint // ErrorMessage contains a detailed error message if the test failed ErrorMessage string }
DiagResult represents the result of a single diagnostic test
type DiagResults ¶
type DiagResults struct { // Software contains the results of software-related diagnostic tests Software []DiagResult }
DiagResults contains the results of all diagnostic tests
func RunDiag ¶
func RunDiag(diagType DiagType, groupID GroupHandle) (DiagResults, error)
RunDiag runs diagnostic tests on a group of GPUs with the specified diagnostic level. Parameters:
- diagType: The type/level of diagnostic test to run (Quick, Medium, Long, or Extended)
- groupID: The group of GPUs to run diagnostics on
Returns:
- DiagResults containing the results of all diagnostic tests
- error if the diagnostics failed to run
type DiagType ¶
type DiagType int
DiagType represents the type of diagnostic test to run
const ( // DiagQuick represents a quick diagnostic test that performs basic health checks DiagQuick DiagType = 1 // DiagMedium represents a medium-length diagnostic test that performs more comprehensive checks DiagMedium DiagType = 2 // DiagLong represents a long diagnostic test that performs extensive health checks DiagLong DiagType = 3 // DiagExtended represents an extended diagnostic test that performs the most thorough system checks DiagExtended DiagType = 4 )
type ECCErrorsInfo ¶
ECCErrorsInfo contains ECC memory error counts
type Error ¶
type Error struct { Code C.dcgmReturn_t // dcgmReturn_t value of error // contains filtered or unexported fields }
Error represents an error returned by the DCGM library
type FieldHandle ¶
type FieldHandle struct {
// contains filtered or unexported fields
}
FieldHandle represents a handle to a DCGM field group
func FieldGroupCreate ¶
func FieldGroupCreate(fieldsGroupName string, fields []Short) (fieldsId FieldHandle, err error)
FieldGroupCreate creates a new field group with the specified fields. fieldsGroupName is the name for the new group. fields is a slice of field IDs to include in the group. Returns the field group handle and any error encountered.
func (*FieldHandle) GetHandle ¶
func (f *FieldHandle) GetHandle() uintptr
GetHandle returns the internal DCGM field group handle as a uintptr
func (*FieldHandle) SetHandle ¶
func (f *FieldHandle) SetHandle(val uintptr)
SetHandle sets the internal DCGM field group handle to the provided value
type FieldMeta ¶
type FieldMeta struct { FieldID Short // Unique identifier for the field FieldType byte // Type of the field (e.g., integer, float, string) Size byte // Size of the field in bytes Tag string // Human-readable tag/name for the field Scope int // Scope of the field NvmlFieldID int // Corresponding NVML field identifier EntityLevel Field_Entity_Group // Entity level/group this field belongs to }
FieldMeta represents metadata about a DCGM field, including its identifier, type, size, and other attributes. This struct is used to describe the characteristics and properties of fields that can be monitored or queried through DCGM.
func FieldGetByID ¶
FieldGetByID retrieves field metadata for the specified field ID.
func ToFieldMeta ¶
func ToFieldMeta(fieldInfo C.dcgm_field_meta_p) FieldMeta
ToFieldMeta converts a C DCGM field metadata structure to a Go FieldMeta struct.
type FieldValue_v1 ¶
type FieldValue_v1 struct { Version uint FieldID Short FieldType uint Status int TS int64 Value [4096]byte }
FieldValue_v1 represents a field value in version 1
func EntityGetLatestValues ¶
func EntityGetLatestValues(entityGroup Field_Entity_Group, entityId uint, fields []Short) ([]FieldValue_v1, error)
EntityGetLatestValues retrieves the latest values for specified fields of any entity. entityGroup specifies the type of entity to query. entityId is the ID of the entity. fields is a slice of field IDs to retrieve. Returns a slice of field values and any error encountered.
func GetLatestValuesForFields ¶
func GetLatestValuesForFields(gpu uint, fields []Short) ([]FieldValue_v1, error)
GetLatestValuesForFields retrieves the most recent values for the specified fields. gpu is the ID of the GPU to query. fields is a slice of field IDs to retrieve. Returns a slice of field values and any error encountered.
func LinkGetLatestValues ¶
func LinkGetLatestValues(index, parentId uint, fields []Short) ([]FieldValue_v1, error)
LinkGetLatestValues retrieves the latest values for specified fields of a link entity. index is the link index. parentId is the ID of the parent entity. fields is a slice of field IDs to retrieve. Returns a slice of field values and any error encountered.
func (FieldValue_v1) Blob ¶
func (fv FieldValue_v1) Blob() [4096]byte
Blob returns the raw field value as a byte array.
func (FieldValue_v1) Float64 ¶
func (fv FieldValue_v1) Float64() float64
Float64 returns the field value as a float64.
func (FieldValue_v1) Int64 ¶
func (fv FieldValue_v1) Int64() int64
Int64 returns the field value as an int64.
func (FieldValue_v1) String ¶
func (fv FieldValue_v1) String() string
String returns the field value as a string.
type FieldValue_v2 ¶
type FieldValue_v2 struct { Version uint EntityGroupId Field_Entity_Group EntityID uint FieldID Short FieldType uint Status int TS int64 Value [4096]byte StringValue *string }
FieldValue_v2 represents a field value in version 2
func EntitiesGetLatestValues ¶
func EntitiesGetLatestValues(entities []GroupEntityPair, fields []Short, flags uint) ([]FieldValue_v2, error)
EntitiesGetLatestValues retrieves the latest values for specified fields across multiple entities. entities is a slice of entity pairs to query. fields is a slice of field IDs to retrieve. flags specify additional options for the query. Returns a slice of field values and any error encountered.
func GetValuesSince ¶
func GetValuesSince(gpuGroup GroupHandle, fieldGroup FieldHandle, sinceTime time.Time) ([]FieldValue_v2, time.Time, error)
GetValuesSince reads and returns field values for a specified group of entities, such as GPUs, that have been updated since a given timestamp. It allows for targeted data retrieval based on time criteria.
gpuGroup is a GroupHandle that identifies the group of entities to operate on. It can be obtained from CreateGroup for a specific group of GPUs, or use GroupAllGPUs() to target all GPUs.
fieldGroup is a FieldHandle representing the group of fields for which data is requested.
sinceTime is a time.Time value representing the timestamp from which to request updated values. A zero value (time.Time{}) requests all available data.
Returns []FieldValue_v2 slice containing the requested field values, a time.Time indicating the time of the latest data retrieval, and an error if there is any issue during the operation.
func (FieldValue_v2) Blob ¶
func (fv FieldValue_v2) Blob() [4096]byte
Blob returns the raw field value as a byte array.
func (FieldValue_v2) Float64 ¶
func (fv FieldValue_v2) Float64() float64
Float64 returns the field value as a float64.
func (FieldValue_v2) Int64 ¶
func (fv FieldValue_v2) Int64() int64
Int64 returns the field value as an int64.
func (FieldValue_v2) String ¶
func (fv FieldValue_v2) String() string
String returns the field value as a string.
type Field_Entity_Group ¶
type Field_Entity_Group uint
Field_Entity_Group represents the type of DCGM entity
const ( // FE_NONE represents no entity type FE_NONE Field_Entity_Group = iota // FE_GPU represents a GPU device entity FE_GPU // FE_VGPU represents a virtual GPU entity FE_VGPU // FE_SWITCH represents an NVSwitch entity FE_SWITCH // FE_GPU_I represents a GPU instance entity FE_GPU_I // FE_GPU_CI represents a GPU compute instance entity FE_GPU_CI // FE_LINK represents an NVLink entity FE_LINK // FE_CPU represents a CPU entity FE_CPU // FE_CPU_CORE represents a CPU core entity FE_CPU_CORE // FE_COUNT represents the total number of entity types FE_COUNT )
func (Field_Entity_Group) String ¶
func (e Field_Entity_Group) String() string
String returns a string representation of the Field_Entity_Group
type GroupEntityPair ¶
type GroupEntityPair struct { // EntityGroupId specifies the type of the entity EntityGroupId Field_Entity_Group // EntityId is the unique identifier for this entity EntityId uint }
GroupEntityPair represents a DCGM entity and its group identifier
type GroupHandle ¶
type GroupHandle struct {
// contains filtered or unexported fields
}
GroupHandle represents a handle to a DCGM GPU group
func CreateGroup ¶
func CreateGroup(groupName string) (goGroupId GroupHandle, err error)
CreateGroup creates a new empty GPU group with the specified name
func CreateGroupWithContext ¶
func CreateGroupWithContext(ctx context.Context, groupName string) (GroupHandle, error)
CreateGroupWithContext creates a new group with a context
func GroupAllGPUs ¶
func GroupAllGPUs() GroupHandle
GroupAllGPUs returns a GroupHandle representing all GPUs in the system
func NewDefaultGroup ¶
func NewDefaultGroup(groupName string) (GroupHandle, error)
NewDefaultGroup creates a new group with default GPUs and the specified name
func WatchFields ¶
func WatchFields(gpuID uint, fieldsGroup FieldHandle, groupName string) (groupId GroupHandle, err error)
WatchFields starts monitoring the specified fields for a GPU. gpuID is the ID of the GPU to monitor. fieldsGroup is the handle of the field group to watch. groupName is a name for the watch group. Returns a group handle and any error encountered.
func WatchPidFields ¶
func WatchPidFields() (GroupHandle, error)
WatchPidFields configures DCGM to start recording stats for GPU processes. It must be called before GetProcessInfo.
func WatchPidFieldsEx ¶
func WatchPidFieldsEx(updateFreq, maxKeepAge time.Duration, maxKeepSamples int, gpus ...uint) (GroupHandle, error)
WatchPidFieldsEx is the same as WatchPidFields, but allows for modifying the update frequency, max samples, max sample age, and the GPUs on which to enable watches.
func (*GroupHandle) GetHandle ¶
func (g *GroupHandle) GetHandle() uintptr
GetHandle returns the internal group handle value
func (*GroupHandle) SetHandle ¶
func (g *GroupHandle) SetHandle(val uintptr)
SetHandle sets the internal group handle value
type GroupInfo ¶
type GroupInfo struct { Version uint32 GroupName string EntityList []GroupEntityPair }
GroupInfo contains information about a DCGM group
func GetGroupInfo ¶
func GetGroupInfo(groupID GroupHandle) (*GroupInfo, error)
GetGroupInfo retrieves information about a DCGM group
type HealthCheckErrorCode ¶
type HealthCheckErrorCode uint
HealthCheckErrorCode represents the error codes for passive and active health checks.
const ( // DCGM_FR_OK No error DCGM_FR_OK HealthCheckErrorCode = 0 // DCGM_FR_UNKNOWN Unknown error code DCGM_FR_UNKNOWN HealthCheckErrorCode = 1 // DCGM_FR_UNRECOGNIZED Unrecognized error code DCGM_FR_UNRECOGNIZED HealthCheckErrorCode = 2 // DCGM_FR_PCI_REPLAY_RATE Unacceptable rate of PCI errors DCGM_FR_PCI_REPLAY_RATE HealthCheckErrorCode = 3 // DCGM_FR_VOLATILE_DBE_DETECTED Unacceptable rate of volatile double bit errors DCGM_FR_VOLATILE_DBE_DETECTED HealthCheckErrorCode = 4 // DCGM_FR_VOLATILE_SBE_DETECTED Unacceptable rate of volatile single bit errors DCGM_FR_VOLATILE_SBE_DETECTED HealthCheckErrorCode = 5 // DCGM_FR_VOLATILE_SBE_DETECTED_TS Unacceptable rate of volatile single bit errors with a timestamp DCGM_FR_VOLATILE_SBE_DETECTED_TS HealthCheckErrorCode = 6 // DCGM_FR_PENDING_PAGE_RETIREMENTS Pending page retirements detected DCGM_FR_PENDING_PAGE_RETIREMENTS HealthCheckErrorCode = 6 // DCGM_FR_RETIRED_PAGES_LIMIT Unacceptable total page retirements detected DCGM_FR_RETIRED_PAGES_LIMIT HealthCheckErrorCode = 7 // DCGM_FR_RETIRED_PAGES_DBE_LIMIT Unacceptable total page retirements due to uncorrectable errors DCGM_FR_RETIRED_PAGES_DBE_LIMIT HealthCheckErrorCode = 8 // DCGM_FR_CORRUPT_INFOROM Corrupt inforom found DCGM_FR_CORRUPT_INFOROM HealthCheckErrorCode = 9 // DCGM_FR_CLOCK_THROTTLE_THERMAL Clocks being throttled due to overheating DCGM_FR_CLOCK_THROTTLE_THERMAL HealthCheckErrorCode = 10 // DCGM_FR_POWER_UNREADABLE Cannot get a reading for power from NVML DCGM_FR_POWER_UNREADABLE HealthCheckErrorCode = 11 // DCGM_FR_CLOCK_THROTTLE_POWER Clock being throttled due to power restrictions DCGM_FR_CLOCK_THROTTLE_POWER HealthCheckErrorCode = 12 // DCGM_FR_NVLINK_ERROR_THRESHOLD Unacceptable rate of NVLink errors DCGM_FR_NVLINK_ERROR_THRESHOLD HealthCheckErrorCode = 13 // DCGM_FR_NVLINK_DOWN NVLink is down DCGM_FR_NVLINK_DOWN HealthCheckErrorCode = 14 // DCGM_FR_NVSWITCH_FATAL_ERROR Fatal errors on the NVSwitch DCGM_FR_NVSWITCH_FATAL_ERROR HealthCheckErrorCode 
= 15 // DCGM_FR_NVSWITCH_NON_FATAL_ERROR Non-fatal errors on the NVSwitch DCGM_FR_NVSWITCH_NON_FATAL_ERROR HealthCheckErrorCode = 16 // DCGM_FR_NVSWITCH_DOWN NVSwitch is down DCGM_FR_NVSWITCH_DOWN HealthCheckErrorCode = 17 // DCGM_FR_NO_ACCESS_TO_FILE Cannot access a file DCGM_FR_NO_ACCESS_TO_FILE HealthCheckErrorCode = 18 // DCGM_FR_NVML_API Error occurred on an NVML API - NOT USED: DEPRECATED DCGM_FR_NVML_API HealthCheckErrorCode = 19 // DCGM_FR_DEVICE_COUNT_MISMATCH Device count mismatch DCGM_FR_DEVICE_COUNT_MISMATCH HealthCheckErrorCode = 20 // DCGM_FR_BAD_PARAMETER Bad parameter passed to API DCGM_FR_BAD_PARAMETER HealthCheckErrorCode = 21 // DCGM_FR_CANNOT_OPEN_LIB Cannot open a library that must be accessed DCGM_FR_CANNOT_OPEN_LIB HealthCheckErrorCode = 22 // DCGM_FR_DENYLISTED_DRIVER A driver on the denylist (nouveau) is active DCGM_FR_DENYLISTED_DRIVER HealthCheckErrorCode = 23 // DCGM_FR_NVML_LIB_BAD NVML library is missing expected functions - NOT USED: DEPRECATED DCGM_FR_NVML_LIB_BAD HealthCheckErrorCode = 24 // DCGM_FR_GRAPHICS_PROCESSES HealthCheckErrorCode = 25 DCGM_FR_GRAPHICS_PROCESSES HealthCheckErrorCode = 25 // DCGM_FR_HOSTENGINE_CONN Bad connection to nv-hostengine - NOT USED: DEPRECATED DCGM_FR_HOSTENGINE_CONN HealthCheckErrorCode = 26 // DCGM_FR_FIELD_QUERY Field query failed DCGM_FR_FIELD_QUERY HealthCheckErrorCode = 27 // DCGM_FR_BAD_CUDA_ENV The environment has variables that hurt CUDA DCGM_FR_BAD_CUDA_ENV HealthCheckErrorCode = 28 // DCGM_FR_PERSISTENCE_MODE Persistence mode is disabled DCGM_FR_PERSISTENCE_MODE HealthCheckErrorCode = 29 // DCGM_FR_BAD_NVLINK_ENV The environment has variables that hurt NVLink DCGM_FR_BAD_NVLINK_ENV HealthCheckErrorCode = 29 // DCGM_FR_LOW_BANDWIDTH The bandwidth is unacceptably low DCGM_FR_LOW_BANDWIDTH HealthCheckErrorCode = 30 // DCGM_FR_HIGH_LATENCY Latency is too high DCGM_FR_HIGH_LATENCY HealthCheckErrorCode = 31 // DCGM_FR_CANNOT_GET_FIELD_TAG Cannot find a tag for a field 
DCGM_FR_CANNOT_GET_FIELD_TAG HealthCheckErrorCode = 32 // DCGM_FR_FIELD_VIOLATION The value for the specified error field is above 0 DCGM_FR_FIELD_VIOLATION HealthCheckErrorCode = 33 // DCGM_FR_FIELD_THRESHOLD The value for the specified field is above the threshold DCGM_FR_FIELD_THRESHOLD HealthCheckErrorCode = 34 // DCGM_FR_FIELD_VIOLATION_DBL The value for the specified error field is above 0 DCGM_FR_FIELD_VIOLATION_DBL HealthCheckErrorCode = 35 // DCGM_FR_FIELD_THRESHOLD_DBL The value for the specified field is above the threshold DCGM_FR_FIELD_THRESHOLD_DBL HealthCheckErrorCode = 36 // DCGM_FR_UNSUPPORTED_FIELD_TYPE Field type cannot be supported DCGM_FR_UNSUPPORTED_FIELD_TYPE HealthCheckErrorCode = 37 // DCGM_FR_FIELD_THRESHOLD_TS The value for the specified field is above the threshold DCGM_FR_FIELD_THRESHOLD_TS HealthCheckErrorCode = 38 // DCGM_FR_FIELD_THRESHOLD_TS_DBL The value for the specified field is above the threshold DCGM_FR_FIELD_THRESHOLD_TS_DBL HealthCheckErrorCode = 39 // DCGM_FR_THERMAL_VIOLATIONS Thermal violations detected DCGM_FR_THERMAL_VIOLATIONS HealthCheckErrorCode = 40 // DCGM_FR_THERMAL_VIOLATIONS_TS Thermal violations detected with a timestamp DCGM_FR_THERMAL_VIOLATIONS_TS HealthCheckErrorCode = 41 // DCGM_FR_TEMP_VIOLATION Non-benign clock throttling is occurring DCGM_FR_TEMP_VIOLATION HealthCheckErrorCode = 42 // DCGM_FR_THROTTLING_VIOLATION Non-benign clock throttling is occurring DCGM_FR_THROTTLING_VIOLATION HealthCheckErrorCode = 43 // DCGM_FR_INTERNAL An internal error was detected DCGM_FR_INTERNAL HealthCheckErrorCode = 44 // DCGM_FR_PCIE_GENERATION PCIe generation is too low DCGM_FR_PCIE_GENERATION HealthCheckErrorCode = 45 // DCGM_FR_PCIE_WIDTH PCIe width is too low DCGM_FR_PCIE_WIDTH HealthCheckErrorCode = 46 // DCGM_FR_ABORTED Test was aborted by a user signal DCGM_FR_ABORTED HealthCheckErrorCode = 47 // DCGM_FR_TEST_DISABLED Test was disabled by a user signal DCGM_FR_TEST_DISABLED HealthCheckErrorCode = 48 // 
DCGM_FR_CANNOT_GET_STAT Cannot get telemetry for a needed value DCGM_FR_CANNOT_GET_STAT HealthCheckErrorCode = 49 // DCGM_FR_STRESS_LEVEL Stress level is too low (bad performance) DCGM_FR_STRESS_LEVEL HealthCheckErrorCode = 50 // DCGM_FR_CUDA_API HealthCheckErrorCode = 51 DCGM_FR_CUDA_API HealthCheckErrorCode = 51 // DCGM_FR_FAULTY_MEMORY Faulty memory detected on this GPU DCGM_FR_FAULTY_MEMORY HealthCheckErrorCode = 52 // DCGM_FR_CANNOT_SET_WATCHES Unable to set field watches in DCGM - NOT USED: DEPRECATED DCGM_FR_CANNOT_SET_WATCHES HealthCheckErrorCode = 53 // DCGM_FR_CUDA_UNBOUND CUDA context is no longer bound DCGM_FR_CUDA_UNBOUND HealthCheckErrorCode = 54 // DCGM_FR_ECC_DISABLED ECC memory is disabled right now DCGM_FR_ECC_DISABLED HealthCheckErrorCode = 55 // DCGM_FR_MEMORY_ALLOC Cannot allocate memory on the GPU DCGM_FR_MEMORY_ALLOC HealthCheckErrorCode = 56 // DCGM_FR_CUDA_DBE CUDA detected unrecovable double-bit error DCGM_FR_CUDA_DBE HealthCheckErrorCode = 57 // DCGM_FR_MEMORY_MISMATCH Memory error detected DCGM_FR_MEMORY_MISMATCH HealthCheckErrorCode = 58 // DCGM_FR_CUDA_DEVICE No CUDA device discoverable for existing GPU DCGM_FR_CUDA_DEVICE HealthCheckErrorCode = 59 // DCGM_FR_ECC_UNSUPPORTED ECC memory is unsupported by this SKU DCGM_FR_ECC_UNSUPPORTED HealthCheckErrorCode = 60 // DCGM_FR_ECC_PENDING ECC memory is in a pending state - NOT USED: DEPRECATED DCGM_FR_ECC_PENDING HealthCheckErrorCode = 61 // DCGM_FR_MEMORY_BANDWIDTH Memory bandwidth is too low DCGM_FR_MEMORY_BANDWIDTH HealthCheckErrorCode = 62 // DCGM_FR_TARGET_POWER The target power is too low DCGM_FR_TARGET_POWER HealthCheckErrorCode = 63 // DCGM_FR_API_FAIL The specified API call failed DCGM_FR_API_FAIL HealthCheckErrorCode = 64 // DCGM_FR_API_FAIL_GPU The specified API call failed for the specified GPU DCGM_FR_API_FAIL_GPU HealthCheckErrorCode = 65 // DCGM_FR_CUDA_CONTEXT Cannot create a CUDA context on this GPU DCGM_FR_CUDA_CONTEXT HealthCheckErrorCode = 66 // DCGM_FR_DCGM_API DCGM API 
failure DCGM_FR_DCGM_API HealthCheckErrorCode = 67 // DCGM_FR_CONCURRENT_GPUS Need multiple GPUs to run this test DCGM_FR_CONCURRENT_GPUS HealthCheckErrorCode = 68 // DCGM_FR_TOO_MANY_ERRORS More errors than fit in the return struct - NOT USED: DEPRECATED DCGM_FR_TOO_MANY_ERRORS HealthCheckErrorCode = 69 // DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD NVLink CRC error threshold violation DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD HealthCheckErrorCode = 70 // DCGM_FR_NVLINK_ERROR_CRITICAL NVLink error for a field that should always be 0 DCGM_FR_NVLINK_ERROR_CRITICAL HealthCheckErrorCode = 71 // DCGM_FR_ENFORCED_POWER_LIMIT The enforced power limit is too low to hit the target DCGM_FR_ENFORCED_POWER_LIMIT HealthCheckErrorCode = 72 // DCGM_FR_MEMORY_ALLOC_HOST Cannot allocate memory on the host DCGM_FR_MEMORY_ALLOC_HOST HealthCheckErrorCode = 73 // DCGM_FR_GPU_OP_MODE Bad GPU operating mode for running plugin - NOT USED: DEPRECATED DCGM_FR_GPU_OP_MODE HealthCheckErrorCode = 74 // DCGM_FR_NO_MEMORY_CLOCKS No memory clocks with the needed MHz found - NOT USED: DEPRECATED DCGM_FR_NO_MEMORY_CLOCKS HealthCheckErrorCode = 75 // DCGM_FR_NO_GRAPHICS_CLOCKS No graphics clocks with the needed MHz found - NOT USED: DEPRECATED DCGM_FR_NO_GRAPHICS_CLOCKS HealthCheckErrorCode = 76 // DCGM_FR_HAD_TO_RESTORE_STATE Note that we had to restore a GPU's state DCGM_FR_HAD_TO_RESTORE_STATE HealthCheckErrorCode = 77 // DCGM_FR_L1TAG_UNSUPPORTED L1TAG test is unsupported by this SKU DCGM_FR_L1TAG_UNSUPPORTED HealthCheckErrorCode = 78 // DCGM_FR_L1TAG_MISCOMPARE L1TAG test failed on a miscompare DCGM_FR_L1TAG_MISCOMPARE HealthCheckErrorCode = 79 // DCGM_FR_ROW_REMAP_FAILURE Row remapping failed (Ampere or newer GPUs) DCGM_FR_ROW_REMAP_FAILURE HealthCheckErrorCode = 80 // DCGM_FR_UNCONTAINED_ERROR Uncontained error - XID 95 DCGM_FR_UNCONTAINED_ERROR HealthCheckErrorCode = 81 // DCGM_FR_EMPTY_GPU_LIST No GPU information given to plugin DCGM_FR_EMPTY_GPU_LIST HealthCheckErrorCode = 82 // 
DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS Pending page retirements due to a DBE DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS HealthCheckErrorCode = 83 // DCGM_FR_UNCORRECTABLE_ROW_REMAP Uncorrectable row remapping DCGM_FR_UNCORRECTABLE_ROW_REMAP HealthCheckErrorCode = 84 // DCGM_FR_PENDING_ROW_REMAP Row remapping is pending DCGM_FR_PENDING_ROW_REMAP HealthCheckErrorCode = 85 // DCGM_FR_BROKEN_P2P_MEMORY_DEVICE P2P copy test detected an error writing to this GPU DCGM_FR_BROKEN_P2P_MEMORY_DEVICE HealthCheckErrorCode = 86 // DCGM_FR_BROKEN_P2P_WRITER_DEVICE P2P copy test detected an error writing from this GPU DCGM_FR_BROKEN_P2P_WRITER_DEVICE HealthCheckErrorCode = 87 // DCGM_FR_NVSWITCH_NVLINK_DOWN An NvLink is down for the specified NVSwitch DCGM_FR_NVSWITCH_NVLINK_DOWN HealthCheckErrorCode = 88 // DCGM_FR_EUD_BINARY_PERMISSIONS EUD binary permissions are incorrect DCGM_FR_EUD_BINARY_PERMISSIONS HealthCheckErrorCode = 89 // DCGM_FR_EUD_NON_ROOT_USER EUD plugin is not running as root DCGM_FR_EUD_NON_ROOT_USER HealthCheckErrorCode = 90 // DCGM_FR_EUD_SPAWN_FAILURE EUD plugin failed to spawn the EUD binary DCGM_FR_EUD_SPAWN_FAILURE HealthCheckErrorCode = 91 // DCGM_FR_EUD_TIMEOUT EUD plugin timed out DCGM_FR_EUD_TIMEOUT HealthCheckErrorCode = 92 // DCGM_FR_EUD_ZOMBIE EUD process remains running after the plugin considers it finished DCGM_FR_EUD_ZOMBIE HealthCheckErrorCode = 93 // DCGM_FR_EUD_NON_ZERO_EXIT_CODE EUD process exited with a non-zero exit code DCGM_FR_EUD_NON_ZERO_EXIT_CODE HealthCheckErrorCode = 94 // DCGM_FR_EUD_TEST_FAILED EUD test failed DCGM_FR_EUD_TEST_FAILED HealthCheckErrorCode = 95 // DCGM_FR_FILE_CREATE_PERMISSIONS We cannot create a file in this directory. 
DCGM_FR_FILE_CREATE_PERMISSIONS HealthCheckErrorCode = 96 // DCGM_FR_PAUSE_RESUME_FAILED Pause/Resume failed DCGM_FR_PAUSE_RESUME_FAILED HealthCheckErrorCode = 97 // DCGM_FR_PCIE_H_REPLAY_VIOLATION PCIe H replay violation DCGM_FR_PCIE_H_REPLAY_VIOLATION HealthCheckErrorCode = 98 // DCGM_FR_GPU_EXPECTED_NVLINKS_UP Expected nvlinks up per gpu DCGM_FR_GPU_EXPECTED_NVLINKS_UP HealthCheckErrorCode = 99 // DCGM_FR_NVSWITCH_EXPECTED_NVLINKS_UP Expected nvlinks up per nvswitch DCGM_FR_NVSWITCH_EXPECTED_NVLINKS_UP HealthCheckErrorCode = 100 // DCGM_FR_XID_ERROR XID error detected DCGM_FR_XID_ERROR HealthCheckErrorCode = 101 // DCGM_FR_SBE_VIOLATION Single bit error detected DCGM_FR_SBE_VIOLATION HealthCheckErrorCode = 102 // DCGM_FR_DBE_VIOLATION Double bit error detected DCGM_FR_DBE_VIOLATION HealthCheckErrorCode = 103 // DCGM_FR_PCIE_REPLAY_VIOLATION PCIe replay errors detected DCGM_FR_PCIE_REPLAY_VIOLATION HealthCheckErrorCode = 104 // DCGM_FR_SBE_THRESHOLD_VIOLATION SBE threshold violated DCGM_FR_SBE_THRESHOLD_VIOLATION HealthCheckErrorCode = 105 // DCGM_FR_DBE_THRESHOLD_VIOLATION DBE threshold violated DCGM_FR_DBE_THRESHOLD_VIOLATION HealthCheckErrorCode = 106 // DCGM_FR_PCIE_REPLAY_THRESHOLD_VIOLATION PCIe replay count violated DCGM_FR_PCIE_REPLAY_THRESHOLD_VIOLATION HealthCheckErrorCode = 107 // DCGM_FR_CUDA_FM_NOT_INITIALIZED The fabricmanager is not initialized DCGM_FR_CUDA_FM_NOT_INITIALIZED HealthCheckErrorCode = 108 // DCGM_FR_SXID_ERROR NvSwitch fatal error detected DCGM_FR_SXID_ERROR HealthCheckErrorCode = 109 // DCGM_FR_GFLOPS_THRESHOLD_VIOLATION GPU GFLOPs threshold violated DCGM_FR_GFLOPS_THRESHOLD_VIOLATION HealthCheckErrorCode = 110 // DCGM_FR_NAN_VALUE NaN value detected on this GPU DCGM_FR_NAN_VALUE HealthCheckErrorCode = 111 // DCGM_FR_FABRIC_MANAGER_TRAINING_ERROR Fabric Manager did not finish training DCGM_FR_FABRIC_MANAGER_TRAINING_ERROR HealthCheckErrorCode = 112 // DCGM_FR_BROKEN_P2P_PCIE_MEMORY_DEVICE P2P copy test detected an error writing to 
this GPU over PCIE DCGM_FR_BROKEN_P2P_PCIE_MEMORY_DEVICE HealthCheckErrorCode = 113 // DCGM_FR_BROKEN_P2P_PCIE_WRITER_DEVICE P2P copy test detected an error writing from this GPU over PCIE DCGM_FR_BROKEN_P2P_PCIE_WRITER_DEVICE HealthCheckErrorCode = 114 // DCGM_FR_BROKEN_P2P_NVLINK_MEMORY_DEVICE P2P copy test detected an error writing to this GPU over NVLink DCGM_FR_BROKEN_P2P_NVLINK_MEMORY_DEVICE HealthCheckErrorCode = 115 // DCGM_FR_BROKEN_P2P_NVLINK_WRITER_DEVICE P2P copy test detected an error writing from this GPU over NVLink DCGM_FR_BROKEN_P2P_NVLINK_WRITER_DEVICE HealthCheckErrorCode = 116 // DCGM_FR_ERROR_SENTINEL MUST BE THE LAST ERROR CODE DCGM_FR_ERROR_SENTINEL HealthCheckErrorCode = 117 )
type HealthResponse ¶
type HealthResponse struct { // OverallHealth indicates the aggregate health status across all watches OverallHealth HealthResult // Incidents contains details about any health issues detected Incidents []Incident }
HealthResponse contains the results of a health check operation
func HealthCheck ¶
func HealthCheck(groupID GroupHandle) (HealthResponse, error)
HealthCheck checks the configured watches for any errors/failures/warnings that have occurred since the last time this check was invoked. On the first call, stateful information about all of the enabled watches within a group is created but no error results are provided. On subsequent calls, any error information will be returned.
type HealthResult ¶
type HealthResult uint
HealthResult is the result of a health check.
const ( // DCGM_HEALTH_RESULT_PASS All results within this system are reporting normal DCGM_HEALTH_RESULT_PASS HealthResult = 0 // DCGM_HEALTH_RESULT_WARN A warning has been issued, refer to the response for more information DCGM_HEALTH_RESULT_WARN HealthResult = 10 // DCGM_HEALTH_RESULT_FAIL A failure has been issued, refer to the response for more information DCGM_HEALTH_RESULT_FAIL HealthResult = 20 )
type HealthSystem ¶
type HealthSystem uint
HealthSystem is the system to watch for health checks.
const ( // DCGM_HEALTH_WATCH_PCIE PCIe health check DCGM_HEALTH_WATCH_PCIE HealthSystem = 0x1 // DCGM_HEALTH_WATCH_NVLINK NVLink health check DCGM_HEALTH_WATCH_NVLINK HealthSystem = 0x2 // DCGM_HEALTH_WATCH_PMU PMU health check DCGM_HEALTH_WATCH_PMU HealthSystem = 0x4 // DCGM_HEALTH_WATCH_MCU MCU health check DCGM_HEALTH_WATCH_MCU HealthSystem = 0x8 // DCGM_HEALTH_WATCH_MEM Memory health check DCGM_HEALTH_WATCH_MEM HealthSystem = 0x10 // DCGM_HEALTH_WATCH_SM SM health check DCGM_HEALTH_WATCH_SM HealthSystem = 0x20 // DCGM_HEALTH_WATCH_INFOROM Inforom health check DCGM_HEALTH_WATCH_INFOROM HealthSystem = 0x40 // DCGM_HEALTH_WATCH_THERMAL Thermal health check DCGM_HEALTH_WATCH_THERMAL HealthSystem = 0x80 // DCGM_HEALTH_WATCH_POWER Power health check DCGM_HEALTH_WATCH_POWER HealthSystem = 0x100 // DCGM_HEALTH_WATCH_DRIVER Driver health check DCGM_HEALTH_WATCH_DRIVER HealthSystem = 0x200 // DCGM_HEALTH_WATCH_NVSWITCH_NONFATAL NVSwitch non-fatal health check DCGM_HEALTH_WATCH_NVSWITCH_NONFATAL HealthSystem = 0x400 // DCGM_HEALTH_WATCH_NVSWITCH_FATAL NVSwitch fatal health check DCGM_HEALTH_WATCH_NVSWITCH_FATAL HealthSystem = 0x800 // DCGM_HEALTH_WATCH_ALL All health checks DCGM_HEALTH_WATCH_ALL HealthSystem = 0xFFFFFFFF )
func HealthGet ¶
func HealthGet(groupID GroupHandle) (HealthSystem, error)
HealthGet retrieves the current state of the DCGM health check system. It returns which health watch systems are currently enabled for the specified group.
type Incident ¶
type Incident struct { // System identifies which health watch system detected the incident System HealthSystem // Health indicates the severity of the incident Health HealthResult // Error contains detailed information about the incident Error DiagErrorDetail // EntityInfo identifies the GPU or component where the incident occurred EntityInfo GroupEntityPair }
Incident represents a health check incident that occurred
type Link_State ¶
type Link_State uint
Link_State represents the state of an NVLINK connection
const ( // LS_NOT_SUPPORTED indicates the link is unsupported (Default for GPUs) LS_NOT_SUPPORTED Link_State = iota // LS_DISABLED indicates the link is supported but disabled (Default for NvSwitches) LS_DISABLED // LS_DOWN indicates the link is down (inactive) LS_DOWN // LS_UP indicates the link is up (active) LS_UP )
type MemoryInfo ¶
type MemoryInfo struct { GlobalUsed int64 ECCErrors ECCErrorsInfo }
MemoryInfo contains GPU memory usage and error information
type MetricGroup ¶
MetricGroup represents a group of metrics for a specific GPU
func GetSupportedMetricGroups ¶
func GetSupportedMetricGroups(gpuID uint) ([]MetricGroup, error)
GetSupportedMetricGroups returns all supported metric groups for the specified GPU
type MigEntityInfo ¶
type MigEntityInfo struct { // GpuUuid is the UUID of the parent GPU GpuUuid string // NvmlGpuIndex is the NVML index of the parent GPU NvmlGpuIndex uint // NvmlInstanceId is the NVML GPU instance ID NvmlInstanceId uint // NvmlComputeInstanceId is the NVML compute instance ID NvmlComputeInstanceId uint // NvmlMigProfileId is the NVML MIG profile ID NvmlMigProfileId uint // NvmlProfileSlices is the number of slices in the MIG profile NvmlProfileSlices uint }
MigEntityInfo contains information about a MIG entity
type MigHierarchyInfo ¶
type MigHierarchyInfo struct { // Entity represents the current GPU entity in the hierarchy Entity GroupEntityPair // Parent represents the parent GPU entity in the hierarchy Parent GroupEntityPair // SliceProfile defines the MIG profile configuration for this entity SliceProfile MigProfile }
MigHierarchyInfo represents the Multi-Instance GPU (MIG) hierarchy information for a GPU entity and its relationship to other entities
type MigHierarchyInfo_v2 ¶
type MigHierarchyInfo_v2 struct { // Entity contains the entity information Entity GroupEntityPair // Parent contains the parent entity information Parent GroupEntityPair // Info contains detailed MIG entity information Info MigEntityInfo }
MigHierarchyInfo_v2 represents version 2 of MIG hierarchy information
type MigHierarchy_v2 ¶
type MigHierarchy_v2 struct { // Version is the version number of the hierarchy structure Version uint // Count is the number of valid entries in EntityList Count uint // EntityList contains the MIG hierarchy information for each entity EntityList [C.DCGM_MAX_HIERARCHY_INFO]MigHierarchyInfo_v2 }
MigHierarchy_v2 represents version 2 of the complete MIG hierarchy
func GetGPUInstanceHierarchy ¶
func GetGPUInstanceHierarchy() (hierarchy MigHierarchy_v2, err error)
GetGPUInstanceHierarchy retrieves the complete MIG hierarchy information
type MigProfile ¶
type MigProfile int
MigProfile represents the Multi-Instance GPU (MIG) profile type
const ( // MigProfileNone indicates no MIG profile is set (for GPUs) MigProfileNone MigProfile = 0 /*!< No profile (for GPUs) */ // MigProfileGPUInstanceSlice1 represents GPU instance slice 1 MigProfileGPUInstanceSlice1 MigProfile = 1 /*!< GPU instance slice 1 */ // MigProfileGPUInstanceSlice2 represents GPU instance slice 2 MigProfileGPUInstanceSlice2 MigProfile = 2 /*!< GPU instance slice 2 */ // MigProfileGPUInstanceSlice3 represents GPU instance slice 3 MigProfileGPUInstanceSlice3 MigProfile = 3 /*!< GPU instance slice 3 */ // MigProfileGPUInstanceSlice4 represents GPU instance slice 4 MigProfileGPUInstanceSlice4 MigProfile = 4 /*!< GPU instance slice 4 */ // MigProfileGPUInstanceSlice7 represents GPU instance slice 7 MigProfileGPUInstanceSlice7 MigProfile = 5 /*!< GPU instance slice 7 */ // MigProfileGPUInstanceSlice8 represents GPU instance slice 8 MigProfileGPUInstanceSlice8 MigProfile = 6 /*!< GPU instance slice 8 */ // MigProfileGPUInstanceSlice6 represents GPU instance slice 6 MigProfileGPUInstanceSlice6 MigProfile = 7 /*!< GPU instance slice 6 */ // MigProfileGPUInstanceSlice1Rev1 represents GPU instance slice 1 revision 1 MigProfileGPUInstanceSlice1Rev1 MigProfile = 8 /*!< GPU instance slice 1 revision 1 */ // MigProfileGPUInstanceSlice2Rev1 represents GPU instance slice 2 revision 1 MigProfileGPUInstanceSlice2Rev1 MigProfile = 9 /*!< GPU instance slice 2 revision 1 */ // MigProfileGPUInstanceSlice1Rev2 represents GPU instance slice 1 revision 2 MigProfileGPUInstanceSlice1Rev2 MigProfile = 10 /*!< GPU instance slice 1 revision 2 */ // MigProfileComputeInstanceSlice1 represents compute instance slice 1 MigProfileComputeInstanceSlice1 MigProfile = 30 /*!< compute instance slice 1 */ // MigProfileComputeInstanceSlice2 represents compute instance slice 2 MigProfileComputeInstanceSlice2 MigProfile = 31 /*!< compute instance slice 2 */ // MigProfileComputeInstanceSlice3 represents compute instance slice 3 MigProfileComputeInstanceSlice3 MigProfile = 32 /*!< 
compute instance slice 3 */ // MigProfileComputeInstanceSlice4 represents compute instance slice 4 MigProfileComputeInstanceSlice4 MigProfile = 33 /*!< compute instance slice 4*/ // MigProfileComputeInstanceSlice7 represents compute instance slice 7 MigProfileComputeInstanceSlice7 MigProfile = 34 /*!< compute instance slice 7 */ // MigProfileComputeInstanceSlice8 represents compute instance slice 8 MigProfileComputeInstanceSlice8 MigProfile = 35 /*!< compute instance slice 8 */ // MigProfileComputeInstanceSlice6 represents compute instance slice 6 MigProfileComputeInstanceSlice6 MigProfile = 36 /*!< compute instance slice 6 */ // MigProfileComputeInstanceSlice1Rev1 represents compute instance slice 1 revision 1 MigProfileComputeInstanceSlice1Rev1 MigProfile = 37 /*!< compute instance slice 1 revision 1 */ )
type NvLinkStatus ¶
type NvLinkStatus struct { // ParentId is the ID of the parent entity (GPU or NVSwitch) ParentId uint // ParentType is the type of the parent entity ParentType Field_Entity_Group // State is the current state of the NVLINK State Link_State // Index is the link index number Index uint }
NvLinkStatus contains information about an NVLINK connection status
func GetNvLinkLinkStatus ¶
func GetNvLinkLinkStatus() ([]NvLinkStatus, error)
GetNvLinkLinkStatus returns the status of all NVLink connections
type NvlinkPolicyCondition ¶
type NvlinkPolicyCondition struct { // FieldId identifies the specific NVLink field that had an error FieldId uint16 // Counter indicates the number of errors detected Counter uint }
NvlinkPolicyCondition contains details about an NVLink error
type P2PLink ¶
type P2PLink struct { // GPU is the ID of the GPU GPU uint // BusID is the PCIe bus ID of the GPU BusID string // Link is the type of P2P connection Link P2PLinkType }
P2PLink contains information about a peer-to-peer connection
func GetDeviceTopology ¶
GetDeviceTopology returns the topology (connectivity) information for the specified GPU
type P2PLinkType ¶
type P2PLinkType uint
P2PLinkType represents the type of peer-to-peer connection between GPUs
const ( // P2PLinkUnknown represents an unknown link type P2PLinkUnknown P2PLinkType = iota // P2PLinkCrossCPU represents a connection across different CPUs P2PLinkCrossCPU // P2PLinkSameCPU represents a connection within the same CPU P2PLinkSameCPU // P2PLinkHostBridge represents a connection through the host bridge P2PLinkHostBridge // P2PLinkMultiSwitch represents a connection through multiple PCIe switches P2PLinkMultiSwitch // P2PLinkSingleSwitch represents a connection through a single PCIe switch P2PLinkSingleSwitch // P2PLinkSameBoard represents a connection on the same board P2PLinkSameBoard // SingleNVLINKLink represents a single NVLINK connection SingleNVLINKLink // TwoNVLINKLinks represents two NVLINK connections TwoNVLINKLinks // ThreeNVLINKLinks represents three NVLINK connections ThreeNVLINKLinks // FourNVLINKLinks represents four NVLINK connections FourNVLINKLinks )
func (P2PLinkType) PCIPaths ¶
func (l P2PLinkType) PCIPaths() string
PCIPaths returns a string representation of the P2P link type
type PCIStatusInfo ¶
type PCIStatusInfo struct { BAR1Used int64 // MB Throughput PCIThroughputInfo FBUsed int64 }
PCIStatusInfo contains PCI bus status information
type PCIThroughputInfo ¶
PCIThroughputInfo contains PCI bus transfer metrics
type PciPolicyCondition ¶
type PciPolicyCondition struct { // ReplayCounter indicates the number of PCI replays ReplayCounter uint }
PciPolicyCondition contains details about a PCI error
type PolicyViolation ¶
type PolicyViolation struct { // Condition specifies the type of policy that was violated Condition policyCondition // Timestamp indicates when the violation occurred Timestamp time.Time // Data contains violation-specific details Data any }
PolicyViolation represents a detected violation of a policy condition
type PowerPolicyCondition ¶
type PowerPolicyCondition struct { // PowerViolation indicates the severity of the power violation PowerViolation uint }
PowerPolicyCondition contains details about a power violation
type ProcessInfo ¶
type ProcessInfo struct { // GPU is the ID of the GPU being used GPU uint // PID is the process ID PID uint // Name is the name of the process Name string // ProcessUtilization contains process-specific utilization metrics ProcessUtilization ProcessUtilInfo // PCI contains PCI bus statistics PCI PCIStatusInfo // Memory contains memory usage statistics Memory MemoryInfo // GpuUtilization contains GPU utilization metrics GpuUtilization UtilizationInfo // Clocks contains GPU clock frequencies Clocks ClockInfo // Violations contains throttling statistics Violations ViolationTime // XIDErrors contains XID error information XIDErrors XIDErrorInfo }
ProcessInfo contains comprehensive information about a GPU process
func GetProcessInfo ¶
func GetProcessInfo(group GroupHandle, pid uint) ([]ProcessInfo, error)
GetProcessInfo returns detailed per-GPU statistics for the specified process
type ProcessUtilInfo ¶
type ProcessUtilInfo struct { // StartTime is when the process started using the GPU StartTime Time // EndTime is when the process stopped using the GPU (0 if still running) EndTime Time // EnergyConsumed is the energy consumed by the process in Joules EnergyConsumed *uint64 // SmUtil is the GPU SM (Streaming Multiprocessor) utilization percentage SmUtil *float64 // MemUtil is the GPU memory utilization percentage MemUtil *float64 }
ProcessUtilInfo contains utilization metrics for a GPU process
type RetiredPagesPolicyCondition ¶
type RetiredPagesPolicyCondition struct { // SbePages indicates the number of pages retired due to single-bit errors SbePages uint // DbePages indicates the number of pages retired due to double-bit errors DbePages uint }
RetiredPagesPolicyCondition contains details about retired memory pages
type Short ¶
Short is an alias for the C.ushort type. It is primarily used for DCGM field identifiers and field collections in the DCGM API bindings. This type provides a direct mapping to the C unsigned short type used in the underlying DCGM C API.
const ( // DCGM_FI_UNKNOWN represents a NULL field DCGM_FI_UNKNOWN Short = 0 // DCGM_FI_DRIVER_VERSION represents the driver version string DCGM_FI_DRIVER_VERSION Short = 1 // DCGM_FI_NVML_VERSION represents the underlying NVML version string DCGM_FI_NVML_VERSION Short = 2 // DCGM_FI_PROCESS_NAME represents the process name DCGM_FI_PROCESS_NAME Short = 3 // DCGM_FI_DEV_COUNT represents the number of devices on the node DCGM_FI_DEV_COUNT Short = 4 // DCGM_FI_CUDA_DRIVER_VERSION represents the CUDA driver version. Retrieves a number with the major value in the thousands place and the minor value in the hundreds place. (e.g. CUDA 11.1 = 11100) DCGM_FI_CUDA_DRIVER_VERSION Short = 5 // DCGM_FI_DEV_NAME represents the name of the GPU device DCGM_FI_DEV_NAME Short = 50 // DCGM_FI_DEV_BRAND represents the device brand DCGM_FI_DEV_BRAND Short = 51 // DCGM_FI_DEV_NVML_INDEX represents the NVML index of this GPU DCGM_FI_DEV_NVML_INDEX Short = 52 // DCGM_FI_DEV_SERIAL represents the device serial number DCGM_FI_DEV_SERIAL Short = 53 // DCGM_FI_DEV_UUID represents the UUID corresponding to the device DCGM_FI_DEV_UUID Short = 54 // DCGM_FI_DEV_MINOR_NUMBER represents the device node minor number (/dev/nvidia#) DCGM_FI_DEV_MINOR_NUMBER Short = 55 // DCGM_FI_DEV_OEM_INFOROM_VER represents the OEM inforom version DCGM_FI_DEV_OEM_INFOROM_VER Short = 56 // DCGM_FI_DEV_PCI_BUSID represents the PCI attributes for the device DCGM_FI_DEV_PCI_BUSID Short = 57 // DCGM_FI_DEV_PCI_COMBINED_ID represents the combined 16-bit device id and 16-bit vendor id DCGM_FI_DEV_PCI_COMBINED_ID Short = 58 // DCGM_FI_DEV_PCI_SUBSYS_ID represents the 32-bit Sub System Device ID DCGM_FI_DEV_PCI_SUBSYS_ID Short = 59 // DCGM_FI_GPU_TOPOLOGY_PCI represents the topology of all GPUs on the system via PCI (static) DCGM_FI_GPU_TOPOLOGY_PCI Short = 60 // DCGM_FI_GPU_TOPOLOGY_NVLINK represents the topology of all GPUs on the system via NVLINK (static) DCGM_FI_GPU_TOPOLOGY_NVLINK Short = 61 // 
DCGM_FI_GPU_TOPOLOGY_AFFINITY represents the affinity of all GPUs on the system (static) DCGM_FI_GPU_TOPOLOGY_AFFINITY Short = 62 // DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY represents the CUDA compute capability for the device. The major version is the upper 32 bits and the minor version is the lower 32 bits DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY Short = 63 // DCGM_FI_DEV_COMPUTE_MODE represents the compute mode for the device DCGM_FI_DEV_COMPUTE_MODE Short = 65 // DCGM_FI_DEV_PERSISTENCE_MODE represents the persistence mode for the device. Boolean: 0 is disabled, 1 is enabled DCGM_FI_DEV_PERSISTENCE_MODE Short = 66 // DCGM_FI_DEV_MIG_MODE represents the MIG mode for the device. Boolean: 0 is disabled, 1 is enabled DCGM_FI_DEV_MIG_MODE Short = 67 // DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR represents the string that CUDA_VISIBLE_DEVICES should be set to for this entity (including MIG) DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR Short = 68 // DCGM_FI_DEV_MIG_MAX_SLICES represents the maximum number of MIG slices supported by this GPU DCGM_FI_DEV_MIG_MAX_SLICES Short = 69 // DCGM_FI_DEV_CPU_AFFINITY_0 represents the device CPU affinity for CPUs 0-63 DCGM_FI_DEV_CPU_AFFINITY_0 Short = 70 // DCGM_FI_DEV_CPU_AFFINITY_1 represents the device CPU affinity for CPUs 64-127 DCGM_FI_DEV_CPU_AFFINITY_1 Short = 71 // DCGM_FI_DEV_CPU_AFFINITY_2 represents the device CPU affinity for CPUs 128-191 DCGM_FI_DEV_CPU_AFFINITY_2 Short = 72 // DCGM_FI_DEV_CPU_AFFINITY_3 represents the device CPU affinity for CPUs 192-255 DCGM_FI_DEV_CPU_AFFINITY_3 Short = 73 // DCGM_FI_DEV_CC_MODE represents the ConfidentialCompute/AmpereProtectedMemory status. 
0 = disabled, 1 = enabled DCGM_FI_DEV_CC_MODE Short = 74 // DCGM_FI_DEV_MIG_ATTRIBUTES represents the attributes for the given MIG device handles DCGM_FI_DEV_MIG_ATTRIBUTES Short = 75 // DCGM_FI_DEV_MIG_GI_INFO represents the GPU instance profile information DCGM_FI_DEV_MIG_GI_INFO Short = 76 // DCGM_FI_DEV_MIG_CI_INFO represents the compute instance profile information DCGM_FI_DEV_MIG_CI_INFO Short = 77 // DCGM_FI_DEV_ECC_INFOROM_VER represents the ECC inforom version DCGM_FI_DEV_ECC_INFOROM_VER Short = 80 // DCGM_FI_DEV_POWER_INFOROM_VER represents the power management object inforom version DCGM_FI_DEV_POWER_INFOROM_VER Short = 81 // DCGM_FI_DEV_INFOROM_IMAGE_VER represents the inforom image version DCGM_FI_DEV_INFOROM_IMAGE_VER Short = 82 // DCGM_FI_DEV_INFOROM_CONFIG_CHECK represents the inforom configuration checksum DCGM_FI_DEV_INFOROM_CONFIG_CHECK Short = 83 // DCGM_FI_DEV_INFOROM_CONFIG_VALID represents whether the inforom configuration is valid. Reads the infoROM from the flash and verifies the checksums DCGM_FI_DEV_INFOROM_CONFIG_VALID Short = 84 // DCGM_FI_DEV_VBIOS_VERSION represents the VBIOS version of the device DCGM_FI_DEV_VBIOS_VERSION Short = 85 // DCGM_FI_DEV_MEM_AFFINITY_0 represents the device memory node affinity for nodes 0-63 DCGM_FI_DEV_MEM_AFFINITY_0 Short = 86 // DCGM_FI_DEV_MEM_AFFINITY_1 represents the device memory node affinity for nodes 64-127 DCGM_FI_DEV_MEM_AFFINITY_1 Short = 87 // DCGM_FI_DEV_MEM_AFFINITY_2 represents the device memory node affinity for nodes 128-191 DCGM_FI_DEV_MEM_AFFINITY_2 Short = 88 // DCGM_FI_DEV_MEM_AFFINITY_3 represents the device memory node affinity for nodes 192-255 DCGM_FI_DEV_MEM_AFFINITY_3 Short = 89 // DCGM_FI_DEV_BAR1_TOTAL represents the total BAR1 memory of the GPU in MB DCGM_FI_DEV_BAR1_TOTAL Short = 90 // DCGM_FI_SYNC_BOOST represents the sync boost settings on the node (Deprecated) DCGM_FI_SYNC_BOOST Short = 91 // DCGM_FI_DEV_BAR1_USED represents the used BAR1 memory of the GPU in MB 
DCGM_FI_DEV_BAR1_USED Short = 92 // DCGM_FI_DEV_BAR1_FREE represents the free BAR1 memory of the GPU in MB DCGM_FI_DEV_BAR1_FREE Short = 93 // DCGM_FI_DEV_GPM_SUPPORT represents the GPM support for the device DCGM_FI_DEV_GPM_SUPPORT Short = 94 // DCGM_FI_DEV_SM_CLOCK represents the SM clock for the device DCGM_FI_DEV_SM_CLOCK Short = 100 // DCGM_FI_DEV_MEM_CLOCK represents the memory clock for the device DCGM_FI_DEV_MEM_CLOCK Short = 101 // DCGM_FI_DEV_VIDEO_CLOCK represents the video encoder/decoder clock for the device DCGM_FI_DEV_VIDEO_CLOCK Short = 102 // DCGM_FI_DEV_APP_SM_CLOCK represents the SM application clocks DCGM_FI_DEV_APP_SM_CLOCK Short = 110 // DCGM_FI_DEV_APP_MEM_CLOCK represents the memory application clocks DCGM_FI_DEV_APP_MEM_CLOCK Short = 111 // DCGM_FI_DEV_CLOCKS_EVENT_REASONS represents the current clock event reasons (bitmask of DCGM_CLOCKS_EVENT_REASON_*) DCGM_FI_DEV_CLOCKS_EVENT_REASONS Short = 112 // DCGM_FI_DEV_CLOCK_THROTTLE_REASONS represents the current clock throttle reasons (Deprecated: Use DCGM_FI_DEV_CLOCKS_EVENT_REASONS instead) DCGM_FI_DEV_CLOCK_THROTTLE_REASONS Short = DCGM_FI_DEV_CLOCKS_EVENT_REASONS // DCGM_FI_DEV_MAX_SM_CLOCK represents the maximum supported SM clock for the device DCGM_FI_DEV_MAX_SM_CLOCK Short = 113 // DCGM_FI_DEV_MAX_MEM_CLOCK represents the maximum supported memory clock for the device DCGM_FI_DEV_MAX_MEM_CLOCK Short = 114 // DCGM_FI_DEV_MAX_VIDEO_CLOCK represents the maximum supported video encoder/decoder clock for the device DCGM_FI_DEV_MAX_VIDEO_CLOCK Short = 115 // DCGM_FI_DEV_AUTOBOOST represents the auto-boost setting for the device (1 = enabled, 0 = disabled) DCGM_FI_DEV_AUTOBOOST Short = 120 // DCGM_FI_DEV_SUPPORTED_CLOCKS represents the supported clocks for the device DCGM_FI_DEV_SUPPORTED_CLOCKS Short = 130 // DCGM_FI_DEV_MEMORY_TEMP represents the memory temperature for the device DCGM_FI_DEV_MEMORY_TEMP Short = 140 // DCGM_FI_DEV_GPU_TEMP represents the current temperature readings for the 
device, in degrees C DCGM_FI_DEV_GPU_TEMP Short = 150 // DCGM_FI_DEV_MEM_MAX_OP_TEMP represents the maximum operating temperature for the memory of this GPU DCGM_FI_DEV_MEM_MAX_OP_TEMP Short = 151 // DCGM_FI_DEV_GPU_MAX_OP_TEMP represents the maximum operating temperature for this GPU DCGM_FI_DEV_GPU_MAX_OP_TEMP Short = 152 // DCGM_FI_DEV_GPU_TEMP_LIMIT represents the thermal margin temperature (distance to nearest slowdown threshold) for this GPU DCGM_FI_DEV_GPU_TEMP_LIMIT Short = 153 // DCGM_FI_DEV_POWER_USAGE represents the power usage for the device in Watts DCGM_FI_DEV_POWER_USAGE Short = 155 // DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION represents the total energy consumption for the GPU in mJ since the driver was last reloaded DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION Short = 156 // DCGM_FI_DEV_POWER_USAGE_INSTANT represents the current instantaneous power usage of the device in Watts DCGM_FI_DEV_POWER_USAGE_INSTANT Short = 157 // DCGM_FI_DEV_SLOWDOWN_TEMP represents the slowdown temperature for the device DCGM_FI_DEV_SLOWDOWN_TEMP Short = 158 // DCGM_FI_DEV_SHUTDOWN_TEMP represents the shutdown temperature for the device DCGM_FI_DEV_SHUTDOWN_TEMP Short = 159 // DCGM_FI_DEV_POWER_MGMT_LIMIT represents the current power limit for the device DCGM_FI_DEV_POWER_MGMT_LIMIT Short = 160 // DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN represents the minimum power management limit for the device DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN Short = 161 // DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX represents the maximum power management limit for the device DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX Short = 162 // DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF represents the default power management limit for the device DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF Short = 163 // DCGM_FI_DEV_ENFORCED_POWER_LIMIT represents the effective power limit that the driver enforces after taking into account all limiters DCGM_FI_DEV_ENFORCED_POWER_LIMIT Short = 164 // DCGM_FI_DEV_REQUESTED_POWER_PROFILE_MASK represents the requested workload power profile 
mask (Blackwell and newer) DCGM_FI_DEV_REQUESTED_POWER_PROFILE_MASK Short = 165 // DCGM_FI_DEV_ENFORCED_POWER_PROFILE_MASK represents the enforced workload power profile mask (Blackwell and newer) DCGM_FI_DEV_ENFORCED_POWER_PROFILE_MASK Short = 166 // DCGM_FI_DEV_VALID_POWER_PROFILE_MASK represents the valid workload power profile mask (Blackwell and newer) DCGM_FI_DEV_VALID_POWER_PROFILE_MASK Short = 167 // DCGM_FI_DEV_FABRIC_MANAGER_STATUS is the value for fabric manager status DCGM_FI_DEV_FABRIC_MANAGER_STATUS Short = 170 // DCGM_FI_DEV_FABRIC_MANAGER_ERROR_CODE is the value for fabric manager error code // NOTE: this is not populated unless the fabric manager completed startup DCGM_FI_DEV_FABRIC_MANAGER_ERROR_CODE Short = 171 // DCGM_FI_DEV_FABRIC_CLUSTER_UUID is the value for fabric cluster UUID DCGM_FI_DEV_FABRIC_CLUSTER_UUID Short = 172 // DCGM_FI_DEV_FABRIC_CLIQUE_ID is the value for fabric clique ID DCGM_FI_DEV_FABRIC_CLIQUE_ID Short = 173 // DCGM_FI_DEV_PSTATE is the value for P-state DCGM_FI_DEV_PSTATE Short = 190 // DCGM_FI_DEV_FAN_SPEED is the value for fan speed DCGM_FI_DEV_FAN_SPEED Short = 191 // DCGM_FI_DEV_PCIE_TX_THROUGHPUT represents the PCIe transmit throughput in KB/s DCGM_FI_DEV_PCIE_TX_THROUGHPUT Short = 200 // DCGM_FI_DEV_PCIE_RX_THROUGHPUT represents the PCIe receive throughput in KB/s DCGM_FI_DEV_PCIE_RX_THROUGHPUT Short = 201 // DCGM_FI_DEV_PCIE_REPLAY_COUNTER represents the PCIe replay counter value DCGM_FI_DEV_PCIE_REPLAY_COUNTER Short = 202 // DCGM_FI_DEV_GPU_UTIL represents the GPU utilization in percent DCGM_FI_DEV_GPU_UTIL Short = 203 // DCGM_FI_DEV_MEM_COPY_UTIL represents the memory copy utilization in percent DCGM_FI_DEV_MEM_COPY_UTIL Short = 204 // DCGM_FI_DEV_ACCOUNTING_DATA represents the process accounting information DCGM_FI_DEV_ACCOUNTING_DATA Short = 205 // DCGM_FI_DEV_ENC_UTIL represents the encoder utilization in percent DCGM_FI_DEV_ENC_UTIL Short = 206 // DCGM_FI_DEV_DEC_UTIL represents the decoder utilization in 
percent DCGM_FI_DEV_DEC_UTIL Short = 207 // DCGM_FI_DEV_XID_ERRORS is the value for XID errors DCGM_FI_DEV_XID_ERRORS Short = 230 // DCGM_FI_DEV_PCIE_MAX_LINK_GEN is the value for PCIe max link generation DCGM_FI_DEV_PCIE_MAX_LINK_GEN Short = 235 // DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH is the value for PCIe max link width DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH Short = 236 // DCGM_FI_DEV_PCIE_LINK_GEN is the value for PCIe link generation DCGM_FI_DEV_PCIE_LINK_GEN Short = 237 // DCGM_FI_DEV_PCIE_LINK_WIDTH is the value for PCIe link width DCGM_FI_DEV_PCIE_LINK_WIDTH Short = 238 // DCGM_FI_DEV_POWER_VIOLATION is the value for power violation time in microseconds DCGM_FI_DEV_POWER_VIOLATION Short = 240 // DCGM_FI_DEV_THERMAL_VIOLATION is the value for thermal violation time in microseconds DCGM_FI_DEV_THERMAL_VIOLATION Short = 241 // DCGM_FI_DEV_SYNC_BOOST_VIOLATION is the value for sync boost violation time in microseconds DCGM_FI_DEV_SYNC_BOOST_VIOLATION Short = 242 // DCGM_FI_DEV_BOARD_LIMIT_VIOLATION is the value for board limit violation time in microseconds DCGM_FI_DEV_BOARD_LIMIT_VIOLATION Short = 243 // DCGM_FI_DEV_LOW_UTIL_VIOLATION is the value for low utilization violation time in microseconds DCGM_FI_DEV_LOW_UTIL_VIOLATION Short = 244 // DCGM_FI_DEV_RELIABILITY_VIOLATION is the value for reliability violation time in microseconds DCGM_FI_DEV_RELIABILITY_VIOLATION Short = 245 // DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION is the value for total application clocks violation time in microseconds DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION Short = 246 // DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION is the value for total base clocks violation time in microseconds DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION Short = 247 // DCGM_FI_DEV_FB_TOTAL is the value for framebuffer total DCGM_FI_DEV_FB_TOTAL Short = 250 // DCGM_FI_DEV_FB_FREE is the value for framebuffer free DCGM_FI_DEV_FB_FREE Short = 251 // DCGM_FI_DEV_FB_USED is the value for framebuffer used DCGM_FI_DEV_FB_USED Short = 252 // 
DCGM_FI_DEV_FB_RESERVED is the value for framebuffer reserved DCGM_FI_DEV_FB_RESERVED Short = 253 // DCGM_FI_DEV_FB_USED_PERCENT is the value for framebuffer used percent DCGM_FI_DEV_FB_USED_PERCENT Short = 254 // DCGM_FI_DEV_C2C_LINK_COUNT is the value for C2C link count DCGM_FI_DEV_C2C_LINK_COUNT Short = 285 // DCGM_FI_DEV_C2C_LINK_STATUS is the value for C2C link status DCGM_FI_DEV_C2C_LINK_STATUS Short = 286 // DCGM_FI_DEV_C2C_MAX_BANDWIDTH is the value for C2C max bandwidth DCGM_FI_DEV_C2C_MAX_BANDWIDTH Short = 287 // DCGM_FI_DEV_ECC_CURRENT is the value for ECC current DCGM_FI_DEV_ECC_CURRENT Short = 300 // DCGM_FI_DEV_ECC_PENDING is the value for ECC pending DCGM_FI_DEV_ECC_PENDING Short = 301 // DCGM_FI_DEV_ECC_SBE_VOL_TOTAL represents the total number of single-bit ECC errors detected since the last counter reset DCGM_FI_DEV_ECC_SBE_VOL_TOTAL Short = 310 // DCGM_FI_DEV_ECC_DBE_VOL_TOTAL represents the total number of double-bit ECC errors detected since the last counter reset DCGM_FI_DEV_ECC_DBE_VOL_TOTAL Short = 311 // DCGM_FI_DEV_ECC_SBE_AGG_TOTAL represents the total number of single-bit ECC errors detected since the last counter reset (aggregate) DCGM_FI_DEV_ECC_SBE_AGG_TOTAL Short = 312 // DCGM_FI_DEV_ECC_DBE_AGG_TOTAL represents the total number of double-bit ECC errors detected since the last counter reset (aggregate) DCGM_FI_DEV_ECC_DBE_AGG_TOTAL Short = 313 // DCGM_FI_DEV_ECC_SBE_VOL_L1 represents the number of single-bit ECC errors detected in L1 cache since the last counter reset DCGM_FI_DEV_ECC_SBE_VOL_L1 Short = 314 // DCGM_FI_DEV_ECC_DBE_VOL_L1 represents the number of double-bit ECC errors detected in L1 cache since the last counter reset DCGM_FI_DEV_ECC_DBE_VOL_L1 Short = 315 // DCGM_FI_DEV_ECC_SBE_VOL_L2 represents the number of single-bit ECC errors detected in L2 cache since the last counter reset DCGM_FI_DEV_ECC_SBE_VOL_L2 Short = 316 // DCGM_FI_DEV_ECC_DBE_VOL_L2 represents the number of double-bit ECC errors detected in L2 cache since 
the last counter reset DCGM_FI_DEV_ECC_DBE_VOL_L2 Short = 317 // DCGM_FI_DEV_ECC_SBE_VOL_DEV represents the number of single-bit ECC errors detected in device memory since the last counter reset DCGM_FI_DEV_ECC_SBE_VOL_DEV Short = 318 // DCGM_FI_DEV_ECC_DBE_VOL_DEV represents the number of double-bit ECC errors detected in device memory since the last counter reset DCGM_FI_DEV_ECC_DBE_VOL_DEV Short = 319 // DCGM_FI_DEV_ECC_SBE_VOL_REG represents the number of single-bit ECC errors detected in register file since the last counter reset DCGM_FI_DEV_ECC_SBE_VOL_REG Short = 320 // DCGM_FI_DEV_ECC_DBE_VOL_REG represents the number of double-bit ECC errors detected in register file since the last counter reset DCGM_FI_DEV_ECC_DBE_VOL_REG Short = 321 // DCGM_FI_DEV_ECC_SBE_VOL_TEX represents the number of single-bit ECC errors detected in texture memory since the last counter reset DCGM_FI_DEV_ECC_SBE_VOL_TEX Short = 322 // DCGM_FI_DEV_ECC_DBE_VOL_TEX represents the number of double-bit ECC errors detected in texture memory since the last counter reset DCGM_FI_DEV_ECC_DBE_VOL_TEX Short = 323 // DCGM_FI_DEV_ECC_SBE_AGG_L1 represents the aggregate number of single-bit ECC errors detected in L1 cache DCGM_FI_DEV_ECC_SBE_AGG_L1 Short = 324 // DCGM_FI_DEV_ECC_DBE_AGG_L1 represents the aggregate number of double-bit ECC errors detected in L1 cache DCGM_FI_DEV_ECC_DBE_AGG_L1 Short = 325 // DCGM_FI_DEV_ECC_SBE_AGG_L2 represents the aggregate number of single-bit ECC errors detected in L2 cache DCGM_FI_DEV_ECC_SBE_AGG_L2 Short = 326 // DCGM_FI_DEV_ECC_DBE_AGG_L2 represents the aggregate number of double-bit ECC errors detected in L2 cache DCGM_FI_DEV_ECC_DBE_AGG_L2 Short = 327 // DCGM_FI_DEV_ECC_SBE_AGG_DEV represents the aggregate number of single-bit ECC errors detected in device memory DCGM_FI_DEV_ECC_SBE_AGG_DEV Short = 328 // DCGM_FI_DEV_ECC_DBE_AGG_DEV represents the aggregate number of double-bit ECC errors detected in device memory DCGM_FI_DEV_ECC_DBE_AGG_DEV Short = 329 
// DCGM_FI_DEV_ECC_SBE_AGG_REG represents the aggregate number of single-bit ECC errors detected in register file DCGM_FI_DEV_ECC_SBE_AGG_REG Short = 330 // DCGM_FI_DEV_ECC_DBE_AGG_REG represents the aggregate number of double-bit ECC errors detected in register file DCGM_FI_DEV_ECC_DBE_AGG_REG Short = 331 // DCGM_FI_DEV_ECC_SBE_AGG_TEX represents the aggregate number of single-bit ECC errors detected in texture memory DCGM_FI_DEV_ECC_SBE_AGG_TEX Short = 332 // DCGM_FI_DEV_ECC_DBE_AGG_TEX represents the aggregate number of double-bit ECC errors detected in texture memory DCGM_FI_DEV_ECC_DBE_AGG_TEX Short = 333 // DCGM_FI_DEV_ECC_SBE_VOL_SHM represents the number of single-bit ECC errors detected in shared memory since the last counter reset DCGM_FI_DEV_ECC_SBE_VOL_SHM Short = 334 // DCGM_FI_DEV_ECC_DBE_VOL_SHM represents the number of double-bit ECC errors detected in shared memory since the last counter reset DCGM_FI_DEV_ECC_DBE_VOL_SHM Short = 335 // DCGM_FI_DEV_ECC_SBE_VOL_CBU represents the number of single-bit ECC errors detected in CBU since the last counter reset DCGM_FI_DEV_ECC_SBE_VOL_CBU Short = 336 // DCGM_FI_DEV_ECC_DBE_VOL_CBU represents the number of double-bit ECC errors detected in CBU since the last counter reset DCGM_FI_DEV_ECC_DBE_VOL_CBU Short = 337 // DCGM_FI_DEV_ECC_SBE_AGG_SHM represents the aggregate number of single-bit ECC errors detected in shared memory DCGM_FI_DEV_ECC_SBE_AGG_SHM Short = 338 // DCGM_FI_DEV_ECC_DBE_AGG_SHM represents the aggregate number of double-bit ECC errors detected in shared memory DCGM_FI_DEV_ECC_DBE_AGG_SHM Short = 339 // DCGM_FI_DEV_ECC_SBE_AGG_CBU represents the aggregate number of single-bit ECC errors detected in CBU DCGM_FI_DEV_ECC_SBE_AGG_CBU Short = 340 // DCGM_FI_DEV_ECC_DBE_AGG_CBU represents the aggregate number of double-bit ECC errors detected in CBU DCGM_FI_DEV_ECC_DBE_AGG_CBU Short = 341 // DCGM_FI_DEV_ECC_SBE_VOL_SRM represents the number of single-bit ECC errors detected in SRM since the last 
counter reset DCGM_FI_DEV_ECC_SBE_VOL_SRM Short = 342 // DCGM_FI_DEV_ECC_DBE_VOL_SRM represents the number of double-bit ECC errors detected in SRM since the last counter reset DCGM_FI_DEV_ECC_DBE_VOL_SRM Short = 343 // DCGM_FI_DEV_ECC_SBE_AGG_SRM represents the aggregate number of single-bit ECC errors detected in SRM DCGM_FI_DEV_ECC_SBE_AGG_SRM Short = 344 // DCGM_FI_DEV_ECC_DBE_AGG_SRM represents the aggregate number of double-bit ECC errors detected in SRM DCGM_FI_DEV_ECC_DBE_AGG_SRM Short = 345 // DCGM_FI_DEV_DIAG_MEMORY_RESULT is the value for diag memory result DCGM_FI_DEV_DIAG_MEMORY_RESULT Short = 350 // DCGM_FI_DEV_DIAG_DIAGNOSTIC_RESULT is the value for diag diagnostic result DCGM_FI_DEV_DIAG_DIAGNOSTIC_RESULT Short = 351 // DCGM_FI_DEV_DIAG_PCIE_RESULT is the value for diag PCIe result DCGM_FI_DEV_DIAG_PCIE_RESULT Short = 352 // DCGM_FI_DEV_DIAG_TARGETED_STRESS_RESULT is the value for diag targeted stress result DCGM_FI_DEV_DIAG_TARGETED_STRESS_RESULT Short = 353 // DCGM_FI_DEV_DIAG_TARGETED_POWER_RESULT is the value for diag targeted power result DCGM_FI_DEV_DIAG_TARGETED_POWER_RESULT Short = 354 // DCGM_FI_DEV_DIAG_MEMORY_BANDWIDTH_RESULT is the value for diag memory bandwidth result DCGM_FI_DEV_DIAG_MEMORY_BANDWIDTH_RESULT Short = 355 // DCGM_FI_DEV_DIAG_MEMTEST_RESULT is the value for diag memtest result DCGM_FI_DEV_DIAG_MEMTEST_RESULT Short = 356 // DCGM_FI_DEV_DIAG_PULSE_TEST_RESULT is the value for diag pulse test result DCGM_FI_DEV_DIAG_PULSE_TEST_RESULT Short = 357 // DCGM_FI_DEV_DIAG_EUD_RESULT is the value for diag EUD result DCGM_FI_DEV_DIAG_EUD_RESULT Short = 358 // DCGM_FI_DEV_DIAG_CPU_EUD_RESULT is the value for diag CPU EUD result DCGM_FI_DEV_DIAG_CPU_EUD_RESULT Short = 359 // DCGM_FI_DEV_DIAG_SOFTWARE_RESULT is the value for diag software result DCGM_FI_DEV_DIAG_SOFTWARE_RESULT Short = 360 // DCGM_FI_DEV_DIAG_NVBANDWIDTH_RESULT is the value for diag NVBandwidth result DCGM_FI_DEV_DIAG_NVBANDWIDTH_RESULT Short = 361 // DCGM_FI_DEV_DIAG_STATUS is the 
value for diag status DCGM_FI_DEV_DIAG_STATUS Short = 362 // DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_MAX is the value for ECC banks remap rows avail max DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_MAX Short = 385 // DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_HIGH is the value for ECC banks remap rows avail high DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_HIGH Short = 386 // DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_PARTIAL is the value for ECC banks remap rows avail partial DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_PARTIAL Short = 387 // DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_LOW is the value for ECC banks remap rows avail low DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_LOW Short = 388 // DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_NONE is the value for ECC banks remap rows avail none DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_NONE Short = 389 // DCGM_FI_DEV_RETIRED_SBE is the value for ECC retired SBE DCGM_FI_DEV_RETIRED_SBE Short = 390 // DCGM_FI_DEV_RETIRED_DBE is the value for ECC retired DBE DCGM_FI_DEV_RETIRED_DBE Short = 391 // DCGM_FI_DEV_RETIRED_PENDING is the value for ECC retired pending DCGM_FI_DEV_RETIRED_PENDING Short = 392 // DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS is the value for ECC uncorrectable remapped rows DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS Short = 393 // DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS is the value for ECC correctable remapped rows DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS Short = 394 // DCGM_FI_DEV_ROW_REMAP_FAILURE is the value for ECC row remap failure DCGM_FI_DEV_ROW_REMAP_FAILURE Short = 395 // DCGM_FI_DEV_ROW_REMAP_PENDING is the value for ECC row remap pending DCGM_FI_DEV_ROW_REMAP_PENDING Short = 396 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 is the value for NVLink CRC FLIT error count L0 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 Short = 400 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 is the value for NVLink CRC FLIT error count L1 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 Short = 401 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 is the value for NVLink CRC FLIT error count L2 
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 Short = 402 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 is the value for NVLink CRC FLIT error count L3 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 Short = 403 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 is the value for NVLink CRC FLIT error count L4 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 Short = 404 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 is the value for NVLink CRC FLIT error count L5 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 Short = 405 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL is the value for NVLink CRC FLIT error count total DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL Short = 409 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 is the value for NVLink CRC DATA error count L0 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 Short = 410 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 is the value for NVLink CRC DATA error count L1 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 Short = 411 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 is the value for NVLink CRC DATA error count L2 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 Short = 412 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 is the value for NVLink CRC DATA error count L3 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 Short = 413 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 is the value for NVLink CRC DATA error count L4 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 Short = 414 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 is the value for NVLink CRC DATA error count L5 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 Short = 415 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL is the value for NVLink CRC DATA error count total DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL Short = 419 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 is the value for NVLink replay error count L0 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 Short = 420 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 is the value for NVLink replay error count L1 
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 Short = 421 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 is the value for NVLink replay error count L2 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 Short = 422 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 is the value for NVLink replay error count L3 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 Short = 423 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 is the value for NVLink replay error count L4 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 Short = 424 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 is the value for NVLink replay error count L5 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 Short = 425 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL is the value for NVLink replay error count total DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL Short = 429 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 is the value for NVLink recovery error count L0 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 Short = 430 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 is the value for NVLink recovery error count L1 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 Short = 431 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 is the value for NVLink recovery error count L2 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 Short = 432 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 is the value for NVLink recovery error count L3 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 Short = 433 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 is the value for NVLink recovery error count L4 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 Short = 434 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 is the value for NVLink recovery error count L5 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 Short = 435 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL is the value for NVLink recovery error count total DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL Short = 439 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L0 is the value for NVLink bandwidth L0 DCGM_FI_DEV_NVLINK_BANDWIDTH_L0 Short = 440 // 
DCGM_FI_DEV_NVLINK_BANDWIDTH_L1 is the value for NVLink bandwidth L1 DCGM_FI_DEV_NVLINK_BANDWIDTH_L1 Short = 441 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L2 is the value for NVLink bandwidth L2 DCGM_FI_DEV_NVLINK_BANDWIDTH_L2 Short = 442 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L3 is the value for NVLink bandwidth L3 DCGM_FI_DEV_NVLINK_BANDWIDTH_L3 Short = 443 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L4 is the value for NVLink bandwidth L4 DCGM_FI_DEV_NVLINK_BANDWIDTH_L4 Short = 444 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L5 is the value for NVLink bandwidth L5 DCGM_FI_DEV_NVLINK_BANDWIDTH_L5 Short = 445 // DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL is the value for NVLink bandwidth total DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL Short = 449 // DCGM_FI_DEV_GPU_NVLINK_ERRORS is the value for GPU NVLink error information DCGM_FI_DEV_GPU_NVLINK_ERRORS Short = 450 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6 is the value for NVLink CRC FLIT error count L6 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6 Short = 451 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7 is the value for NVLink CRC FLIT error count L7 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7 Short = 452 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8 is the value for NVLink CRC FLIT error count L8 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8 Short = 453 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9 is the value for NVLink CRC FLIT error count L9 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9 Short = 454 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10 is the value for NVLink CRC FLIT error count L10 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10 Short = 455 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11 is the value for NVLink CRC FLIT error count L11 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11 Short = 456 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6 is the value for NVLink CRC DATA error count L6 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6 Short = 457 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7 is the value for NVLink 
CRC DATA error count L7 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7 Short = 458 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8 is the value for NVLink CRC DATA error count L8 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8 Short = 459 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9 is the value for NVLink CRC DATA error count L9 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9 Short = 460 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10 is the value for NVLink CRC DATA error count L10 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10 Short = 461 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11 is the value for NVLink CRC DATA error count L11 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11 Short = 462 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6 is the value for NVLink replay error count L6 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6 Short = 463 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7 is the value for NVLink replay error count L7 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7 Short = 464 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8 is the value for NVLink replay error count L8 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8 Short = 465 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9 is the value for NVLink replay error count L9 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9 Short = 466 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10 is the value for NVLink replay error count L10 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10 Short = 467 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11 is the value for NVLink replay error count L11 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11 Short = 468 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6 is the value for NVLink recovery error count L6 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6 Short = 469 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7 is the value for NVLink recovery error count L7 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7 Short = 470 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8 is the value for NVLink recovery error count L8 
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8 Short = 471 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9 is the value for NVLink recovery error count L9 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9 Short = 472 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10 is the value for NVLink recovery error count L10 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10 Short = 473 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11 is the value for NVLink recovery error count L11 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11 Short = 474 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L6 is the value for NVLink bandwidth L6 DCGM_FI_DEV_NVLINK_BANDWIDTH_L6 Short = 475 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L7 is the value for NVLink bandwidth L7 DCGM_FI_DEV_NVLINK_BANDWIDTH_L7 Short = 476 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L8 is the value for NVLink bandwidth L8 DCGM_FI_DEV_NVLINK_BANDWIDTH_L8 Short = 477 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L9 is the value for NVLink bandwidth L9 DCGM_FI_DEV_NVLINK_BANDWIDTH_L9 Short = 478 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L10 is the value for NVLink bandwidth L10 DCGM_FI_DEV_NVLINK_BANDWIDTH_L10 Short = 479 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L11 is the value for NVLink bandwidth L11 DCGM_FI_DEV_NVLINK_BANDWIDTH_L11 Short = 480 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L12 is the value for NVLink CRC FLIT error count L12 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L12 Short = 406 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L13 is the value for NVLink CRC FLIT error count L13 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L13 Short = 407 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L14 is the value for NVLink CRC FLIT error count L14 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L14 Short = 408 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L15 is the value for NVLink CRC FLIT error count L15 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L15 Short = 481 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L16 is the value for NVLink CRC FLIT error count L16 
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L16 Short = 482 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L17 is the value for NVLink CRC FLIT error count L17 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L17 Short = 483 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L12 is the value for NVLink CRC DATA error count L12 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L12 Short = 416 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L13 is the value for NVLink CRC DATA error count L13 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L13 Short = 417 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L14 is the value for NVLink CRC DATA error count L14 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L14 Short = 418 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L15 is the value for NVLink CRC DATA error count L15 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L15 Short = 484 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L16 is the value for NVLink CRC DATA error count L16 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L16 Short = 485 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L17 is the value for NVLink CRC DATA error count L17 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L17 Short = 486 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L12 is the value for NVLink replay error count L12 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L12 Short = 426 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L13 is the value for NVLink replay error count L13 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L13 Short = 427 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L14 is the value for NVLink replay error count L14 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L14 Short = 428 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L15 is the value for NVLink replay error count L15 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L15 Short = 487 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L16 is the value for NVLink replay error count L16 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L16 Short = 488 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L17 is the value for NVLink replay error count L17 
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L17 Short = 489 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L12 is the value for NVLink recovery error count L12 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L12 Short = 436 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L13 is the value for NVLink recovery error count L13 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L13 Short = 437 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L14 is the value for NVLink recovery error count L14 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L14 Short = 438 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L15 is the value for NVLink recovery error count L15 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L15 Short = 491 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L16 is the value for NVLink recovery error count L16 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L16 Short = 492 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L17 is the value for NVLink recovery error count L17 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L17 Short = 493 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L12 is the value for NVLink bandwidth L12 DCGM_FI_DEV_NVLINK_BANDWIDTH_L12 Short = 446 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L13 is the value for NVLink bandwidth L13 DCGM_FI_DEV_NVLINK_BANDWIDTH_L13 Short = 447 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L14 is the value for NVLink bandwidth L14 DCGM_FI_DEV_NVLINK_BANDWIDTH_L14 Short = 448 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L15 is the value for NVLink bandwidth L15 DCGM_FI_DEV_NVLINK_BANDWIDTH_L15 Short = 494 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L16 is the value for NVLink bandwidth L16 DCGM_FI_DEV_NVLINK_BANDWIDTH_L16 Short = 495 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L17 is the value for NVLink bandwidth L17 DCGM_FI_DEV_NVLINK_BANDWIDTH_L17 Short = 496 // DCGM_FI_DEV_NVLINK_ERROR_DL_CRC is the value for NVLink error DL CRC DCGM_FI_DEV_NVLINK_ERROR_DL_CRC Short = 497 // DCGM_FI_DEV_NVLINK_ERROR_DL_RECOVERY is the value for NVLink error DL recovery DCGM_FI_DEV_NVLINK_ERROR_DL_RECOVERY Short = 498 // 
DCGM_FI_DEV_NVLINK_ERROR_DL_REPLAY is the value for NVLink error DL replay DCGM_FI_DEV_NVLINK_ERROR_DL_REPLAY Short = 499 // DCGM_FI_DEV_VIRTUAL_MODE is the value for virtual mode DCGM_FI_DEV_VIRTUAL_MODE Short = 500 // DCGM_FI_DEV_SUPPORTED_TYPE_INFO is the value for supported type info DCGM_FI_DEV_SUPPORTED_TYPE_INFO Short = 501 // DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS is the value for creatable VGPU type IDs DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS Short = 502 // DCGM_FI_DEV_VGPU_INSTANCE_IDS is the value for VGPU instance IDs DCGM_FI_DEV_VGPU_INSTANCE_IDS Short = 503 // DCGM_FI_DEV_VGPU_UTILIZATIONS is the value for VGPU utilizations DCGM_FI_DEV_VGPU_UTILIZATIONS Short = 504 // DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION is the value for VGPU per process utilization DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION Short = 505 // DCGM_FI_DEV_ENC_STATS is the value for enc stats DCGM_FI_DEV_ENC_STATS Short = 506 // DCGM_FI_DEV_FBC_STATS is the value for FBC stats DCGM_FI_DEV_FBC_STATS Short = 507 // DCGM_FI_DEV_FBC_SESSIONS_INFO is the value for FBC sessions info DCGM_FI_DEV_FBC_SESSIONS_INFO Short = 508 // DCGM_FI_DEV_SUPPORTED_VGPU_TYPE_IDS is the value for supported VGPU type IDs DCGM_FI_DEV_SUPPORTED_VGPU_TYPE_IDS Short = 509 // DCGM_FI_DEV_VGPU_TYPE_INFO is the value for VGPU type info DCGM_FI_DEV_VGPU_TYPE_INFO Short = 510 // DCGM_FI_DEV_VGPU_TYPE_NAME is the value for VGPU type name DCGM_FI_DEV_VGPU_TYPE_NAME Short = 511 // DCGM_FI_DEV_VGPU_TYPE_CLASS is the value for VGPU type class DCGM_FI_DEV_VGPU_TYPE_CLASS Short = 512 // DCGM_FI_DEV_VGPU_TYPE_LICENSE is the value for VGPU type license DCGM_FI_DEV_VGPU_TYPE_LICENSE Short = 513 // DCGM_FI_DEV_VGPU_VM_ID represents the VGPU VM ID DCGM_FI_DEV_VGPU_VM_ID Short = 520 // DCGM_FI_DEV_VGPU_VM_NAME represents the VGPU VM name DCGM_FI_DEV_VGPU_VM_NAME Short = 521 // DCGM_FI_DEV_VGPU_TYPE represents the VGPU type DCGM_FI_DEV_VGPU_TYPE Short = 522 // DCGM_FI_DEV_VGPU_UUID 
represents the VGPU UUID DCGM_FI_DEV_VGPU_UUID Short = 523 // DCGM_FI_DEV_VGPU_DRIVER_VERSION represents the VGPU driver version DCGM_FI_DEV_VGPU_DRIVER_VERSION Short = 524 // DCGM_FI_DEV_VGPU_MEMORY_USAGE represents the VGPU memory usage DCGM_FI_DEV_VGPU_MEMORY_USAGE Short = 525 // DCGM_FI_DEV_VGPU_LICENSE_STATUS represents the VGPU license status DCGM_FI_DEV_VGPU_LICENSE_STATUS Short = 526 // DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT represents the VGPU frame rate limit DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT Short = 527 // DCGM_FI_DEV_VGPU_ENC_STATS represents the VGPU encoder statistics DCGM_FI_DEV_VGPU_ENC_STATS Short = 528 // DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO represents the VGPU encoder sessions information DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO Short = 529 // DCGM_FI_DEV_VGPU_FBC_STATS represents the VGPU frame buffer capture statistics DCGM_FI_DEV_VGPU_FBC_STATS Short = 530 // DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO represents the VGPU frame buffer capture sessions information DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO Short = 531 // DCGM_FI_DEV_VGPU_INSTANCE_LICENSE_STATE represents the VGPU instance license state DCGM_FI_DEV_VGPU_INSTANCE_LICENSE_STATE Short = 532 // DCGM_FI_DEV_VGPU_PCI_ID represents the VGPU PCI ID DCGM_FI_DEV_VGPU_PCI_ID Short = 533 // DCGM_FI_DEV_VGPU_VM_GPU_INSTANCE_ID represents the VGPU VM GPU instance ID DCGM_FI_DEV_VGPU_VM_GPU_INSTANCE_ID Short = 534 // DCGM_FI_FIRST_VGPU_FIELD_ID is the value for the first VGPU field ID DCGM_FI_FIRST_VGPU_FIELD_ID Short = 520 // DCGM_FI_LAST_VGPU_FIELD_ID is the value for the last VGPU field ID DCGM_FI_LAST_VGPU_FIELD_ID Short = 570 // DCGM_FI_DEV_PLATFORM_INFINIBAND_GUID is the value for platform InfiniBand GUID DCGM_FI_DEV_PLATFORM_INFINIBAND_GUID Short = 571 // DCGM_FI_DEV_PLATFORM_CHASSIS_SERIAL_NUMBER is the value for platform chassis serial number DCGM_FI_DEV_PLATFORM_CHASSIS_SERIAL_NUMBER Short = 572 // DCGM_FI_DEV_PLATFORM_CHASSIS_SLOT_NUMBER is the value for platform chassis slot number 
DCGM_FI_DEV_PLATFORM_CHASSIS_SLOT_NUMBER Short = 573 // DCGM_FI_DEV_PLATFORM_TRAY_INDEX is the value for platform tray index DCGM_FI_DEV_PLATFORM_TRAY_INDEX Short = 574 // DCGM_FI_DEV_PLATFORM_HOST_ID is the value for platform host ID DCGM_FI_DEV_PLATFORM_HOST_ID Short = 575 // DCGM_FI_DEV_PLATFORM_PEER_TYPE is the value for platform peer type DCGM_FI_DEV_PLATFORM_PEER_TYPE Short = 576 // DCGM_FI_DEV_PLATFORM_MODULE_ID is the value for platform module ID DCGM_FI_DEV_PLATFORM_MODULE_ID Short = 577 // DCGM_FI_FIRST_NVSWITCH_FIELD_ID is the value for the first NVSwitch field ID DCGM_FI_FIRST_NVSWITCH_FIELD_ID Short = 700 // DCGM_FI_DEV_NVSWITCH_VOLTAGE_MVOLT represents the NVSwitch voltage in millivolts DCGM_FI_DEV_NVSWITCH_VOLTAGE_MVOLT Short = 701 // DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ represents the NVSwitch IDDQ current DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ Short = 702 // DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_REV represents the NVSwitch IDDQ current revision DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_REV Short = 703 // DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_DVDD represents the NVSwitch IDDQ current for DVDD DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_DVDD Short = 704 // DCGM_FI_DEV_NVSWITCH_POWER_VDD represents the NVSwitch VDD power consumption in watts DCGM_FI_DEV_NVSWITCH_POWER_VDD Short = 705 // DCGM_FI_DEV_NVSWITCH_POWER_DVDD represents the NVSwitch DVDD power consumption in watts DCGM_FI_DEV_NVSWITCH_POWER_DVDD Short = 706 // DCGM_FI_DEV_NVSWITCH_POWER_HVDD represents the NVSwitch HVDD power consumption in watts DCGM_FI_DEV_NVSWITCH_POWER_HVDD Short = 707 // DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_TX represents the NVSwitch Tx Throughput Counter for ports 0-17 in KB/s DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_TX Short = 780 // DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_RX represents the NVSwitch Rx Throughput Counter for ports 0-17 in KB/s DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_RX Short = 781 // DCGM_FI_DEV_NVSWITCH_LINK_FATAL_ERRORS represents the number of fatal errors for ports 0-17 
DCGM_FI_DEV_NVSWITCH_LINK_FATAL_ERRORS Short = 782 // DCGM_FI_DEV_NVSWITCH_LINK_NON_FATAL_ERRORS represents the number of non-fatal errors for ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_NON_FATAL_ERRORS Short = 783 // DCGM_FI_DEV_NVSWITCH_LINK_REPLAY_ERRORS represents the number of replay errors for ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_REPLAY_ERRORS Short = 784 // DCGM_FI_DEV_NVSWITCH_LINK_RECOVERY_ERRORS represents the number of recovery errors for ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_RECOVERY_ERRORS Short = 785 // DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS represents the number of FLIT errors for ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS Short = 786 // DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS represents the number of CRC errors for ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS Short = 787 // DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS represents the number of ECC errors for ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS Short = 788 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC0 is the value for Nvlink lane latency low lane0 counter DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC0 Short = 789 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC1 is the value for Nvlink lane latency low lane1 counter DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC1 Short = 790 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC2 is the value for Nvlink lane latency low lane2 counter DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC2 Short = 791 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC3 is the value for Nvlink lane latency low lane3 counter DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC3 Short = 792 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC0 is the value for Nvlink lane latency medium lane0 counter DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC0 Short = 793 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC1 is the value for Nvlink lane latency medium lane1 counter DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC1 Short = 794 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC2 is the value for Nvlink lane latency medium lane2 counter 
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC2 Short = 795 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC3 is the value for Nvlink lane latency medium lane3 counter DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC3 Short = 796 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC0 is the value for Nvlink lane latency high lane0 counter DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC0 Short = 797 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC1 is the value for Nvlink lane latency high lane1 counter DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC1 Short = 798 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC2 is the value for Nvlink lane latency high lane2 counter DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC2 Short = 799 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC3 is the value for Nvlink lane latency high lane3 counter DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC3 Short = 800 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC0 is the value for Nvlink lane latency panic lane0 counter DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC0 Short = 801 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC1 is the value for Nvlink lane latency panic lane1 counter DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC1 Short = 802 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC2 is the value for Nvlink lane latency panic lane2 counter DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC2 Short = 803 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC3 is the value for Nvlink lane latency panic lane3 counter DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC3 Short = 804 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC0 represents the latency counter for virtual channel 0 on the NVSwitch link DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC0 Short = 805 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC1 represents the latency counter for virtual channel 1 on the NVSwitch link DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC1 Short = 806 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC2 represents the latency counter for virtual channel 2 on the NVSwitch link 
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC2 Short = 807 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC3 represents the latency counter for virtual channel 3 on the NVSwitch link DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC3 Short = 808 // DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE0 represents the number of CRC errors on lane 0 on ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE0 Short = 809 // DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE1 represents the number of CRC errors on lane 1 on ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE1 Short = 810 // DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE2 represents the number of CRC errors on lane 2 on ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE2 Short = 811 // DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE3 represents the number of CRC errors on lane 3 on ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE3 Short = 812 // DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE0 represents the number of ECC errors on lane 0 on ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE0 Short = 813 // DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE1 represents the number of ECC errors on lane 1 on ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE1 Short = 814 // DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE2 represents the number of ECC errors on lane 2 on ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE2 Short = 815 // DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE3 represents the number of ECC errors on lane 3 on ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE3 Short = 816 // DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE4 represents the number of CRC errors on lane 4 on ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE4 Short = 817 // DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE5 represents the number of CRC errors on lane 5 on ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE5 Short = 818 // DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE6 represents the number of CRC errors on lane 6 on ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE6 Short = 
819 // DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE7 represents the number of CRC errors on lane 7 on ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE7 Short = 820 // DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE4 represents the number of ECC errors on lane 4 on ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE4 Short = 821 // DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE5 represents the number of ECC errors on lane 5 on ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE5 Short = 822 // DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE6 represents the number of ECC errors on lane 6 on ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE6 Short = 823 // DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE7 represents the number of ECC errors on lane 7 on ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE7 Short = 824 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L0 represents the transmit bandwidth for NVLink lane 0 in KB/s DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L0 Short = 825 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L1 represents the transmit bandwidth for NVLink lane 1 in KB/s DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L1 Short = 826 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L2 represents the transmit bandwidth for NVLink lane 2 in KB/s DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L2 Short = 827 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L3 represents the transmit bandwidth for NVLink lane 3 in KB/s DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L3 Short = 828 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L4 represents the transmit bandwidth for NVLink lane 4 in KB/s DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L4 Short = 829 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L5 represents the transmit bandwidth for NVLink lane 5 in KB/s DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L5 Short = 830 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L6 represents the transmit bandwidth for NVLink lane 6 in KB/s DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L6 Short = 831 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L7 represents the transmit bandwidth for NVLink lane 7 in KB/s DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L7 Short = 832 // 
DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L8 represents the transmit bandwidth for NVLink lane 8 in KB/s DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L8 Short = 833 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L9 represents the transmit bandwidth for NVLink lane 9 in KB/s DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L9 Short = 834 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L10 represents the transmit bandwidth for NVLink lane 10 in KB/s DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L10 Short = 835 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L11 represents the transmit bandwidth for NVLink lane 11 in KB/s DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L11 Short = 836 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L12 represents the NV Link TX Bandwidth Counter for Lane 12 DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L12 Short = 837 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L13 represents the NV Link TX Bandwidth Counter for Lane 13 DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L13 Short = 838 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L14 represents the NV Link TX Bandwidth Counter for Lane 14 DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L14 Short = 839 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L15 represents the NV Link TX Bandwidth Counter for Lane 15 DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L15 Short = 840 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L16 represents the NV Link TX Bandwidth Counter for Lane 16 DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L16 Short = 841 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L17 represents the NV Link TX Bandwidth Counter for Lane 17 DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L17 Short = 842 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_TOTAL represents the NV Link Bandwidth Counter total for all TX Lanes DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_TOTAL Short = 843 // DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS represents the NVSwitch fatal error information. // Note: value field indicates the specific SXid reported DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS Short = 856 // DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS represents the NVSwitch non fatal error information. 
DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS Short = 857 // DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT represents the NVSwitch current temperature. DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT Short = 858 // DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SLOWDOWN represents the NVSwitch limit slowdown temperature DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SLOWDOWN Short = 859 // DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SHUTDOWN represents the NVSwitch limit shutdown temperature DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SHUTDOWN Short = 860 // DCGM_FI_DEV_NVSWITCH_THROUGHPUT_TX represents the NVSwitch throughput Tx DCGM_FI_DEV_NVSWITCH_THROUGHPUT_TX Short = 861 // DCGM_FI_DEV_NVSWITCH_THROUGHPUT_RX represents the NVSwitch throughput Rx DCGM_FI_DEV_NVSWITCH_THROUGHPUT_RX Short = 862 // DCGM_FI_DEV_NVSWITCH_PHYS_ID represents the NVSwitch physical ID DCGM_FI_DEV_NVSWITCH_PHYS_ID Short = 863 // DCGM_FI_DEV_NVSWITCH_RESET_REQUIRED represents the NVSwitch reset required DCGM_FI_DEV_NVSWITCH_RESET_REQUIRED Short = 864 // DCGM_FI_DEV_NVSWITCH_LINK_ID represents the NVSwitch link ID DCGM_FI_DEV_NVSWITCH_LINK_ID Short = 865 // DCGM_FI_DEV_NVSWITCH_PCIE_DOMAIN represents the NVSwitch PCIe domain DCGM_FI_DEV_NVSWITCH_PCIE_DOMAIN Short = 866 // DCGM_FI_DEV_NVSWITCH_PCIE_BUS represents the NVSwitch PCIe bus DCGM_FI_DEV_NVSWITCH_PCIE_BUS Short = 867 // DCGM_FI_DEV_NVSWITCH_PCIE_DEVICE represents the NVSwitch PCIe device DCGM_FI_DEV_NVSWITCH_PCIE_DEVICE Short = 868 // DCGM_FI_DEV_NVSWITCH_PCIE_FUNCTION represents the NVSwitch PCIe function DCGM_FI_DEV_NVSWITCH_PCIE_FUNCTION Short = 869 // DCGM_FI_DEV_NVSWITCH_LINK_STATUS represents the NVSwitch link status UNKNOWN:-1 OFF:0 SAFE:1 ACTIVE:2 ERROR:3 DCGM_FI_DEV_NVSWITCH_LINK_STATUS Short = 870 // DCGM_FI_DEV_NVSWITCH_LINK_TYPE represents the NVSwitch link type GPU/Switch DCGM_FI_DEV_NVSWITCH_LINK_TYPE Short = 871 // DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DOMAIN represents the NVSwitch remote PCIe domain DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DOMAIN Short = 872 // 
DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_BUS represents the NVSwitch remote PCIe bus DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_BUS Short = 873 // DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DEVICE represents the NVSwitch remote PCIe device DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DEVICE Short = 874 // DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_FUNCTION represents the NVSwitch remote PCIe function DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_FUNCTION Short = 875 // DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_ID represents the NVSwitch link device link ID DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_ID Short = 876 // DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_SID represents the NVSwitch link device link SID DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_SID Short = 877 // DCGM_FI_DEV_NVSWITCH_DEVICE_UUID represents the NVSwitch device UUID DCGM_FI_DEV_NVSWITCH_DEVICE_UUID Short = 878 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L0 represents the receive bandwidth for NVLink lane 0 in KB/s DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L0 Short = 879 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L1 represents the receive bandwidth for NVLink lane 1 in KB/s DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L1 Short = 880 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L2 represents the receive bandwidth for NVLink lane 2 in KB/s DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L2 Short = 881 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L3 represents the receive bandwidth for NVLink lane 3 in KB/s DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L3 Short = 882 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L4 represents the receive bandwidth for NVLink lane 4 in KB/s DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L4 Short = 883 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L5 represents the receive bandwidth for NVLink lane 5 in KB/s DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L5 Short = 884 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L6 represents the receive bandwidth for NVLink lane 6 in KB/s DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L6 Short = 885 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L7 represents the receive bandwidth for NVLink lane 7 in KB/s DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L7 Short = 886 // 
DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L8 represents the receive bandwidth for NVLink lane 8 in KB/s DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L8 Short = 887 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L9 represents the receive bandwidth for NVLink lane 9 in KB/s DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L9 Short = 888 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L10 represents the receive bandwidth for NVLink lane 10 in KB/s DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L10 Short = 889 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L11 represents the receive bandwidth for NVLink lane 11 in KB/s DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L11 Short = 890 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L12 represents the receive bandwidth for NVLink lane 12 in KB/s DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L12 Short = 891 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L13 represents the receive bandwidth for NVLink lane 13 in KB/s DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L13 Short = 892 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L14 represents the receive bandwidth for NVLink lane 14 in KB/s DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L14 Short = 893 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L15 represents the receive bandwidth for NVLink lane 15 in KB/s DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L15 Short = 894 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L16 represents the receive bandwidth for NVLink lane 16 in KB/s DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L16 Short = 895 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L17 represents the receive bandwidth for NVLink lane 17 in KB/s DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L17 Short = 896 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_TOTAL represents the total receive bandwidth for all NVLink lanes in KB/s DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_TOTAL Short = 897 // DCGM_FI_PROF_GR_ENGINE_ACTIVE represents the percentage of time the graphics engine was active DCGM_FI_PROF_GR_ENGINE_ACTIVE Short = 1001 // DCGM_FI_PROF_SM_ACTIVE represents the percentage of time the streaming multiprocessors (SM) were active DCGM_FI_PROF_SM_ACTIVE Short = 1002 // DCGM_FI_PROF_SM_OCCUPANCY represents the percentage of streaming multiprocessors (SM) warps 
residency DCGM_FI_PROF_SM_OCCUPANCY Short = 1003 // DCGM_FI_PROF_PIPE_TENSOR_ACTIVE represents the percentage of time the tensor (HMMA) pipe was active DCGM_FI_PROF_PIPE_TENSOR_ACTIVE Short = 1004 // DCGM_FI_PROF_DRAM_ACTIVE represents the percentage of time the device memory interface was active DCGM_FI_PROF_DRAM_ACTIVE Short = 1005 // DCGM_FI_PROF_PIPE_FP64_ACTIVE represents the percentage of time the FP64 pipe was active DCGM_FI_PROF_PIPE_FP64_ACTIVE Short = 1006 // DCGM_FI_PROF_PIPE_FP32_ACTIVE represents the percentage of time the FP32 pipe was active DCGM_FI_PROF_PIPE_FP32_ACTIVE Short = 1007 // DCGM_FI_PROF_PIPE_FP16_ACTIVE represents the percentage of time the FP16 pipe was active DCGM_FI_PROF_PIPE_FP16_ACTIVE Short = 1008 // DCGM_FI_PROF_PCIE_TX_BYTES represents the number of bytes transmitted through PCIe TX (in bytes) DCGM_FI_PROF_PCIE_TX_BYTES Short = 1009 // DCGM_FI_PROF_PCIE_RX_BYTES represents the number of bytes received through PCIe RX (in bytes) DCGM_FI_PROF_PCIE_RX_BYTES Short = 1010 // DCGM_FI_PROF_NVLINK_TX_BYTES represents the number of bytes transmitted through NVLink TX (in bytes) DCGM_FI_PROF_NVLINK_TX_BYTES Short = 1011 // DCGM_FI_PROF_NVLINK_RX_BYTES represents the number of bytes received through NVLink RX (in bytes) DCGM_FI_PROF_NVLINK_RX_BYTES Short = 1012 // DCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE represents the percentage of time the IMMA tensor pipe was active DCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE Short = 1013 // DCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE represents the percentage of time the HMMA tensor pipe was active DCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE Short = 1014 // DCGM_FI_PROF_PIPE_TENSOR_DFMA_ACTIVE represents the percentage of time the DFMA tensor pipe was active DCGM_FI_PROF_PIPE_TENSOR_DFMA_ACTIVE Short = 1015 // DCGM_FI_PROF_PIPE_INT_ACTIVE represents the ratio of cycles the integer pipe is active DCGM_FI_PROF_PIPE_INT_ACTIVE Short = 1016 // DCGM_FI_PROF_NVDEC0_ACTIVE represents the ratio of cycles the NVDEC engine 0 is active 
DCGM_FI_PROF_NVDEC0_ACTIVE Short = 1017 // DCGM_FI_PROF_NVDEC1_ACTIVE represents the ratio of cycles the NVDEC engine 1 is active DCGM_FI_PROF_NVDEC1_ACTIVE Short = 1018 // DCGM_FI_PROF_NVDEC2_ACTIVE represents the ratio of cycles the NVDEC engine 2 is active DCGM_FI_PROF_NVDEC2_ACTIVE Short = 1019 // DCGM_FI_PROF_NVDEC3_ACTIVE represents the ratio of cycles the NVDEC engine 3 is active DCGM_FI_PROF_NVDEC3_ACTIVE Short = 1020 // DCGM_FI_PROF_NVDEC4_ACTIVE represents the ratio of cycles the NVDEC engine 4 is active DCGM_FI_PROF_NVDEC4_ACTIVE Short = 1021 // DCGM_FI_PROF_NVDEC5_ACTIVE represents the ratio of cycles the NVDEC engine 5 is active DCGM_FI_PROF_NVDEC5_ACTIVE Short = 1022 // DCGM_FI_PROF_NVDEC6_ACTIVE represents the ratio of cycles the NVDEC engine 6 is active DCGM_FI_PROF_NVDEC6_ACTIVE Short = 1023 // DCGM_FI_PROF_NVDEC7_ACTIVE represents the ratio of cycles the NVDEC engine 7 is active DCGM_FI_PROF_NVDEC7_ACTIVE Short = 1024 // DCGM_FI_PROF_NVJPG0_ACTIVE represents the ratio of cycles the NVJPG engine 0 is active DCGM_FI_PROF_NVJPG0_ACTIVE Short = 1025 // DCGM_FI_PROF_NVJPG1_ACTIVE represents the ratio of cycles the NVJPG engine 1 is active DCGM_FI_PROF_NVJPG1_ACTIVE Short = 1026 // DCGM_FI_PROF_NVJPG2_ACTIVE represents the ratio of cycles the NVJPG engine 2 is active DCGM_FI_PROF_NVJPG2_ACTIVE Short = 1027 // DCGM_FI_PROF_NVJPG3_ACTIVE represents the ratio of cycles the NVJPG engine 3 is active DCGM_FI_PROF_NVJPG3_ACTIVE Short = 1028 // DCGM_FI_PROF_NVJPG4_ACTIVE represents the ratio of cycles the NVJPG engine 4 is active DCGM_FI_PROF_NVJPG4_ACTIVE Short = 1029 // DCGM_FI_PROF_NVJPG5_ACTIVE represents the ratio of cycles the NVJPG engine 5 is active DCGM_FI_PROF_NVJPG5_ACTIVE Short = 1030 // DCGM_FI_PROF_NVJPG6_ACTIVE represents the ratio of cycles the NVJPG engine 6 is active DCGM_FI_PROF_NVJPG6_ACTIVE Short = 1031 // DCGM_FI_PROF_NVJPG7_ACTIVE represents the ratio of cycles the NVJPG engine 7 is active DCGM_FI_PROF_NVJPG7_ACTIVE Short = 1032 // 
DCGM_FI_PROF_NVOFA0_ACTIVE represents the ratio of cycles the NVOFA engine 0 is active DCGM_FI_PROF_NVOFA0_ACTIVE Short = 1033 // DCGM_FI_PROF_NVOFA1_ACTIVE represents the ratio of cycles the NVOFA engine 1 is active DCGM_FI_PROF_NVOFA1_ACTIVE Short = 1034 // DCGM_FI_PROF_NVLINK_L0_TX_BYTES represents the number of bytes transmitted through NVLink lane 0 in KB/s DCGM_FI_PROF_NVLINK_L0_TX_BYTES Short = 1040 // DCGM_FI_PROF_NVLINK_L0_RX_BYTES represents the number of bytes received through NVLink lane 0 in KB/s DCGM_FI_PROF_NVLINK_L0_RX_BYTES Short = 1041 // DCGM_FI_PROF_NVLINK_L1_TX_BYTES represents the number of bytes transmitted through NVLink lane 1 in KB/s DCGM_FI_PROF_NVLINK_L1_TX_BYTES Short = 1042 // DCGM_FI_PROF_NVLINK_L1_RX_BYTES represents the number of bytes received through NVLink lane 1 in KB/s DCGM_FI_PROF_NVLINK_L1_RX_BYTES Short = 1043 // DCGM_FI_PROF_NVLINK_L2_TX_BYTES represents the number of bytes transmitted through NVLink lane 2 in KB/s DCGM_FI_PROF_NVLINK_L2_TX_BYTES Short = 1044 // DCGM_FI_PROF_NVLINK_L2_RX_BYTES represents the number of bytes received through NVLink lane 2 in KB/s DCGM_FI_PROF_NVLINK_L2_RX_BYTES Short = 1045 // DCGM_FI_PROF_NVLINK_L3_TX_BYTES represents the number of bytes transmitted through NVLink lane 3 in KB/s DCGM_FI_PROF_NVLINK_L3_TX_BYTES Short = 1046 // DCGM_FI_PROF_NVLINK_L3_RX_BYTES represents the number of bytes received through NVLink lane 3 in KB/s DCGM_FI_PROF_NVLINK_L3_RX_BYTES Short = 1047 // DCGM_FI_PROF_NVLINK_L4_TX_BYTES represents the number of bytes transmitted through NVLink lane 4 in KB/s DCGM_FI_PROF_NVLINK_L4_TX_BYTES Short = 1048 // DCGM_FI_PROF_NVLINK_L4_RX_BYTES represents the number of bytes received through NVLink lane 4 in KB/s DCGM_FI_PROF_NVLINK_L4_RX_BYTES Short = 1049 // DCGM_FI_PROF_NVLINK_L5_TX_BYTES represents the number of bytes transmitted through NVLink lane 5 in KB/s DCGM_FI_PROF_NVLINK_L5_TX_BYTES Short = 1050 // DCGM_FI_PROF_NVLINK_L5_RX_BYTES represents the number of bytes received 
through NVLink lane 5 in KB/s DCGM_FI_PROF_NVLINK_L5_RX_BYTES Short = 1051 // DCGM_FI_PROF_NVLINK_L6_TX_BYTES represents the number of bytes transmitted through NVLink lane 6 in KB/s DCGM_FI_PROF_NVLINK_L6_TX_BYTES Short = 1052 // DCGM_FI_PROF_NVLINK_L6_RX_BYTES represents the number of bytes received through NVLink lane 6 in KB/s DCGM_FI_PROF_NVLINK_L6_RX_BYTES Short = 1053 // DCGM_FI_PROF_NVLINK_L7_TX_BYTES represents the number of bytes transmitted through NVLink lane 7 in KB/s DCGM_FI_PROF_NVLINK_L7_TX_BYTES Short = 1054 // DCGM_FI_PROF_NVLINK_L7_RX_BYTES represents the number of bytes received through NVLink lane 7 in KB/s DCGM_FI_PROF_NVLINK_L7_RX_BYTES Short = 1055 // DCGM_FI_PROF_NVLINK_L8_TX_BYTES represents the number of bytes transmitted through NVLink lane 8 in KB/s DCGM_FI_PROF_NVLINK_L8_TX_BYTES Short = 1056 // DCGM_FI_PROF_NVLINK_L8_RX_BYTES represents the number of bytes received through NVLink lane 8 in KB/s DCGM_FI_PROF_NVLINK_L8_RX_BYTES Short = 1057 // DCGM_FI_PROF_NVLINK_L9_TX_BYTES represents the number of bytes transmitted through NVLink lane 9 in KB/s DCGM_FI_PROF_NVLINK_L9_TX_BYTES Short = 1058 // DCGM_FI_PROF_NVLINK_L9_RX_BYTES represents the number of bytes received through NVLink lane 9 in KB/s DCGM_FI_PROF_NVLINK_L9_RX_BYTES Short = 1059 // DCGM_FI_PROF_NVLINK_L10_TX_BYTES represents the number of bytes transmitted through NVLink lane 10 in KB/s DCGM_FI_PROF_NVLINK_L10_TX_BYTES Short = 1060 // DCGM_FI_PROF_NVLINK_L10_RX_BYTES represents the number of bytes received through NVLink lane 10 in KB/s DCGM_FI_PROF_NVLINK_L10_RX_BYTES Short = 1061 // DCGM_FI_PROF_NVLINK_L11_TX_BYTES represents the number of bytes transmitted through NVLink lane 11 in KB/s DCGM_FI_PROF_NVLINK_L11_TX_BYTES Short = 1062 // DCGM_FI_PROF_NVLINK_L11_RX_BYTES represents the number of bytes received through NVLink lane 11 in KB/s DCGM_FI_PROF_NVLINK_L11_RX_BYTES Short = 1063 // DCGM_FI_PROF_NVLINK_L12_TX_BYTES represents the number of bytes transmitted through NVLink 
lane 12 in KB/s DCGM_FI_PROF_NVLINK_L12_TX_BYTES Short = 1064 // DCGM_FI_PROF_NVLINK_L12_RX_BYTES represents the number of bytes received through NVLink lane 12 in KB/s DCGM_FI_PROF_NVLINK_L12_RX_BYTES Short = 1065 // DCGM_FI_PROF_NVLINK_L13_TX_BYTES represents the number of bytes transmitted through NVLink lane 13 in KB/s DCGM_FI_PROF_NVLINK_L13_TX_BYTES Short = 1066 // DCGM_FI_PROF_NVLINK_L13_RX_BYTES represents the number of bytes received through NVLink lane 13 in KB/s DCGM_FI_PROF_NVLINK_L13_RX_BYTES Short = 1067 // DCGM_FI_PROF_NVLINK_L14_TX_BYTES represents the number of bytes transmitted through NVLink lane 14 in KB/s DCGM_FI_PROF_NVLINK_L14_TX_BYTES Short = 1068 // DCGM_FI_PROF_NVLINK_L14_RX_BYTES represents the number of bytes received through NVLink lane 14 in KB/s DCGM_FI_PROF_NVLINK_L14_RX_BYTES Short = 1069 // DCGM_FI_PROF_NVLINK_L15_TX_BYTES represents the number of bytes transmitted through NVLink lane 15 in KB/s DCGM_FI_PROF_NVLINK_L15_TX_BYTES Short = 1070 // DCGM_FI_PROF_NVLINK_L15_RX_BYTES represents the number of bytes received through NVLink lane 15 in KB/s DCGM_FI_PROF_NVLINK_L15_RX_BYTES Short = 1071 // DCGM_FI_PROF_NVLINK_L16_TX_BYTES represents the number of bytes transmitted through NVLink lane 16 in KB/s DCGM_FI_PROF_NVLINK_L16_TX_BYTES Short = 1072 // DCGM_FI_PROF_C2C_TX_ALL_BYTES represents C2C (Chip-to-Chip) interface metric DCGM_FI_PROF_C2C_TX_ALL_BYTES Short = 1076 // DCGM_FI_PROF_C2C_TX_DATA_BYTES represents C2C (Chip-to-Chip) interface metric DCGM_FI_PROF_C2C_TX_DATA_BYTES Short = 1077 // DCGM_FI_PROF_C2C_RX_ALL_BYTES represents C2C (Chip-to-Chip) interface metric DCGM_FI_PROF_C2C_RX_ALL_BYTES Short = 1078 // DCGM_FI_PROF_C2C_RX_DATA_BYTES represents C2C (Chip-to-Chip) interface metric DCGM_FI_PROF_C2C_RX_DATA_BYTES Short = 1079 // DCGM_FI_DEV_CPU_UTIL_TOTAL represents the total CPU utilization, total DCGM_FI_DEV_CPU_UTIL_TOTAL Short = 1100 // DCGM_FI_DEV_CPU_UTIL_USER represents the CPU utilization, user DCGM_FI_DEV_CPU_UTIL_USER 
Short = 1101 // DCGM_FI_DEV_CPU_UTIL_NICE represents the CPU utilization, nice DCGM_FI_DEV_CPU_UTIL_NICE Short = 1102 // DCGM_FI_DEV_CPU_UTIL_SYS represents the CPU utilization, system time DCGM_FI_DEV_CPU_UTIL_SYS Short = 1103 // DCGM_FI_DEV_CPU_UTIL_IRQ represents the CPU utilization, interrupt servicing DCGM_FI_DEV_CPU_UTIL_IRQ Short = 1104 // DCGM_FI_DEV_CPU_TEMP_CURRENT represents the current CPU temperature in degrees Celsius DCGM_FI_DEV_CPU_TEMP_CURRENT Short = 1110 // DCGM_FI_DEV_CPU_TEMP_WARNING represents the CPU temperature warning threshold in degrees Celsius DCGM_FI_DEV_CPU_TEMP_WARNING Short = 1111 // DCGM_FI_DEV_CPU_TEMP_SHUTDOWN represents the CPU temperature shutdown threshold in degrees Celsius DCGM_FI_DEV_CPU_TEMP_SHUTDOWN Short = 1112 // DCGM_FI_DEV_CPU_CLOCK_CURRENT represents the current CPU clock frequency in MHz DCGM_FI_DEV_CPU_CLOCK_CURRENT Short = 1120 // DCGM_FI_DEV_CPU_POWER_CURRENT represents the current CPU power usage DCGM_FI_DEV_CPU_POWER_CURRENT Short = 1130 // DCGM_FI_DEV_CPU_POWER_LIMIT represents the CPU power limit DCGM_FI_DEV_CPU_POWER_LIMIT Short = 1131 // DCGM_FI_DEV_SYSIO_POWER_UTIL_CURRENT represents the SoC power utilization DCGM_FI_DEV_SYSIO_POWER_UTIL_CURRENT Short = 1132 // DCGM_FI_DEV_MODULE_POWER_UTIL_CURRENT represents the Module power utilization DCGM_FI_DEV_MODULE_POWER_UTIL_CURRENT Short = 1133 // DCGM_FI_DEV_CPU_VENDOR is the value for ECC DEV CPU Vendor DCGM_FI_DEV_CPU_VENDOR Short = 1140 // DCGM_FI_DEV_CPU_MODEL is the value for ECC DEV CPU Model DCGM_FI_DEV_CPU_MODEL Short = 1141 // DCGM_FI_DEV_NVLINK_COUNT_TX_PACKETS is the value for ECC DEV NVLink Count TX Packets DCGM_FI_DEV_NVLINK_COUNT_TX_PACKETS Short = 1200 // DCGM_FI_DEV_NVLINK_COUNT_TX_BYTES is the value for ECC DEV NVLink Count TX Bytes DCGM_FI_DEV_NVLINK_COUNT_TX_BYTES Short = 1201 // DCGM_FI_DEV_NVLINK_COUNT_RX_PACKETS is the value for ECC DEV NVLink Count RX Packets DCGM_FI_DEV_NVLINK_COUNT_RX_PACKETS Short = 1202 // 
DCGM_FI_DEV_NVLINK_COUNT_RX_BYTES is the value for ECC DEV NVLink Count RX Bytes DCGM_FI_DEV_NVLINK_COUNT_RX_BYTES Short = 1203 // DCGM_FI_DEV_NVLINK_COUNT_RX_MALFORMED_PACKET_ERRORS is the value for ECC DEV NVLink Count RX Malformed Packet Errors DCGM_FI_DEV_NVLINK_COUNT_RX_MALFORMED_PACKET_ERRORS Short = 1204 // DCGM_FI_DEV_NVLINK_COUNT_RX_BUFFER_OVERRUN_ERRORS is the value for ECC DEV NVLink Count RX Buffer Overrun Errors DCGM_FI_DEV_NVLINK_COUNT_RX_BUFFER_OVERRUN_ERRORS Short = 1205 // DCGM_FI_DEV_NVLINK_COUNT_RX_ERRORS is the value for ECC DEV NVLink Count RX Errors DCGM_FI_DEV_NVLINK_COUNT_RX_ERRORS Short = 1206 // DCGM_FI_DEV_NVLINK_COUNT_RX_REMOTE_ERRORS is the value for ECC DEV NVLink Count RX Remote Errors DCGM_FI_DEV_NVLINK_COUNT_RX_REMOTE_ERRORS Short = 1207 // DCGM_FI_DEV_NVLINK_COUNT_RX_GENERAL_ERRORS is the value for ECC DEV NVLink Count RX General Errors DCGM_FI_DEV_NVLINK_COUNT_RX_GENERAL_ERRORS Short = 1208 // DCGM_FI_DEV_NVLINK_COUNT_LOCAL_LINK_INTEGRITY_ERRORS is the value for ECC DEV NVLink Count Local Link Integrity Errors DCGM_FI_DEV_NVLINK_COUNT_LOCAL_LINK_INTEGRITY_ERRORS Short = 1209 // DCGM_FI_DEV_NVLINK_COUNT_TX_DISCARDS is the value for ECC DEV NVLink Count TX Discards DCGM_FI_DEV_NVLINK_COUNT_TX_DISCARDS Short = 1210 // DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_SUCCESSFUL_EVENTS is the value for ECC DEV NVLink Count Link Recovery Successful Events DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_SUCCESSFUL_EVENTS Short = 1211 // DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_FAILED_EVENTS is the value for ECC DEV NVLink Count Link Recovery Failed Events DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_FAILED_EVENTS Short = 1212 // DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_EVENTS is the value for ECC DEV NVLink Count Link Recovery Events DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_EVENTS Short = 1213 // DCGM_FI_DEV_NVLINK_COUNT_RX_SYMBOL_ERRORS is the value for ECC DEV NVLink Count RX Symbol Errors DCGM_FI_DEV_NVLINK_COUNT_RX_SYMBOL_ERRORS Short = 1214 // 
DCGM_FI_DEV_NVLINK_COUNT_SYMBOL_BER is the value for ECC DEV NVLink Count Symbol BER DCGM_FI_DEV_NVLINK_COUNT_SYMBOL_BER Short = 1215 // DCGM_FI_DEV_CONNECTX_HEALTH represents a health state of ConnectX DCGM_FI_DEV_CONNECTX_HEALTH Short = 1300 // DCGM_FI_DEV_CONNECTX_ACTIVE_PCIE_LINK_WIDTH is the value of an active PCIe link width DCGM_FI_DEV_CONNECTX_ACTIVE_PCIE_LINK_WIDTH Short = 1301 // DCGM_FI_DEV_CONNECTX_ACTIVE_PCIE_LINK_SPEED is the value of an active PCIe link speed DCGM_FI_DEV_CONNECTX_ACTIVE_PCIE_LINK_SPEED Short = 1302 // DCGM_FI_DEV_CONNECTX_EXPECT_PCIE_LINK_WIDTH is the value of an expected PCIe link width DCGM_FI_DEV_CONNECTX_EXPECT_PCIE_LINK_WIDTH Short = 1303 // DCGM_FI_DEV_CONNECTX_EXPECT_PCIE_LINK_SPEED is the value of an expected PCIe link speed DCGM_FI_DEV_CONNECTX_EXPECT_PCIE_LINK_SPEED Short = 1304 // DCGM_FI_DEV_CONNECTX_CORRECTABLE_ERR_STATUS is the value of a correctable error status DCGM_FI_DEV_CONNECTX_CORRECTABLE_ERR_STATUS Short = 1305 // DCGM_FI_DEV_CONNECTX_CORRECTABLE_ERR_MASK is the value of a correctable error mask DCGM_FI_DEV_CONNECTX_CORRECTABLE_ERR_MASK Short = 1306 // DCGM_FI_DEV_CONNECTX_UNCORRECTABLE_ERR_STATUS is the value of an uncorrectable error status DCGM_FI_DEV_CONNECTX_UNCORRECTABLE_ERR_STATUS Short = 1307 // DCGM_FI_DEV_CONNECTX_UNCORRECTABLE_ERR_MASK is the value of an uncorrectable error mask DCGM_FI_DEV_CONNECTX_UNCORRECTABLE_ERR_MASK Short = 1308 // DCGM_FI_DEV_CONNECTX_UNCORRECTABLE_ERR_SEVERITY is the value of an uncorrectable error severity DCGM_FI_DEV_CONNECTX_UNCORRECTABLE_ERR_SEVERITY Short = 1309 // DCGM_FI_DEV_CONNECTX_DEVICE_TEMPERATURE is the value of a device temperature DCGM_FI_DEV_CONNECTX_DEVICE_TEMPERATURE Short = 1310 // DCGM_FI_DEV_LAST_CONNECTX_FIELD_ID represents the last field ID for ConnectX fields DCGM_FI_DEV_LAST_CONNECTX_FIELD_ID Short = 1399 // DCGM_FI_MAX_FIELDS represents 1 greater than maximum fields above. 
This is 1 greater than the maximum field ID that could be allocated DCGM_FI_MAX_FIELDS Short = 1311 )
func GetFieldID ¶
GetFieldID returns the DCGM field ID for a given field name and whether it was found. It first checks the current field IDs, then falls back to legacy field IDs if not found.
func GetFieldIDOrPanic ¶
GetFieldIDOrPanic returns the DCGM field ID for a given field name. It panics if the field name is not found in either the current or legacy maps.
type Status ¶
// Status represents the current resource utilization of the DCGM hostengine
// process.
type Status struct {
	// Memory represents the current memory usage of the DCGM hostengine in
	// kilobytes.
	Memory int64
	// CPU represents the current CPU utilization of the DCGM hostengine as a
	// percentage (0-100).
	CPU float64
}
Status represents the current resource utilization of the DCGM hostengine process
func Introspect ¶
Introspect returns memory and CPU usage statistics for the DCGM hostengine
type SystemWatch ¶
// SystemWatch represents a health watch system and its status.
type SystemWatch struct {
	// Type identifies the type of health watch system.
	Type string
	// Status indicates the current health status.
	Status string
	// Error contains any error message if the status is not healthy.
	Error string
}
SystemWatch represents a health watch system and its status
type ThermalPolicyCondition ¶
// ThermalPolicyCondition contains details about a thermal violation.
type ThermalPolicyCondition struct {
	// ThermalViolation indicates the severity of the thermal violation.
	ThermalViolation uint
}
ThermalPolicyCondition contains details about a thermal violation
type UtilizationInfo ¶
// UtilizationInfo contains GPU utilization metrics. Every field is expressed
// as a percentage.
type UtilizationInfo struct {
	GPU     int64 // %
	Memory  int64 // %
	Encoder int64 // %
	Decoder int64 // %
}
UtilizationInfo contains GPU utilization metrics
type ViolationTime ¶
// ViolationTime measures the amount of time (in ms) the GPU was at reduced
// clocks, broken down by the reason for throttling.
//
// NOTE(review): every field is a pointer; presumably nil means the value is
// unavailable — confirm against the code that populates this struct.
type ViolationTime struct {
	// Power is time spent throttling due to power constraints.
	Power *uint64
	// Thermal is time spent throttling due to thermal constraints.
	Thermal *uint64
	// Reliability is time spent throttling due to reliability constraints.
	Reliability *uint64
	// BoardLimit is time spent throttling due to board limit constraints.
	BoardLimit *uint64
	// LowUtilization is time spent throttling due to low utilization.
	LowUtilization *uint64
	// SyncBoost is time spent throttling due to sync boost.
	SyncBoost *uint64
}
ViolationTime measures the amount of time (in ms) the GPU was at reduced clocks
type XIDErrorInfo ¶
// XIDErrorInfo contains information about XID errors.
type XIDErrorInfo struct {
	// NumErrors is the number of XID errors that occurred.
	NumErrors int
	// Timestamp contains the timestamps of when XID errors occurred.
	Timestamp []uint64
}
XIDErrorInfo contains information about XID errors
type XidPolicyCondition ¶
// XidPolicyCondition contains details about an XID error.
type XidPolicyCondition struct {
	// ErrNum is the XID error number.
	ErrNum uint
}
XidPolicyCondition contains details about an XID error