Documentation
¶
Overview ¶
Package dcgm provides bindings for NVIDIA's Data Center GPU Manager (DCGM).
Index ¶
- Constants
- Variables
- func AddEntityToGroup(groupID GroupHandle, entityGroupID Field_Entity_Group, entityID uint) (err error)
- func AddLinkEntityToGroup(groupID GroupHandle, index uint, entityGroupID Field_Entity_Group, ...) (err error)
- func AddToGroup(groupID GroupHandle, gpuID uint) (err error)
- func AttachDriver() error
- func ClearPolicyForGroup(group GroupHandle) error
- func CreateFakeEntities(entities []MigHierarchyInfo) ([]uint, error)
- func DestroyGroup(groupID GroupHandle) (err error)
- func DetachDriver() error
- func FieldGroupDestroy(fieldsGroup FieldHandle) (err error)
- func FieldsInit() int
- func FieldsTerm() int
- func FindFirstNonAsciiIndex(value [4096]byte) int
- func Fv2_Blob(fv FieldValue_v2) [4096]byte
- func Fv2_String(fv FieldValue_v2) string
- func GetAllDeviceCount() (uint, error)
- func GetAllSupportedFieldsMetadata() map[Short]FieldMeta
- func GetEntityGroupEntities(entityGroup Field_Entity_Group) ([]uint, error)
- func GetSupportedDevices() ([]uint, error)
- func HealthSet(groupID GroupHandle, systems HealthSystem) (err error)
- func Init(m mode, args ...string) (cleanup func(), err error)
- func InjectFieldValue(gpu uint, fieldID Short, fieldType uint, status int, ts int64, value any) error
- func IsCurrentField(fieldName string) bool
- func IsInt32Blank(value int) bool
- func IsInt64Blank(value int64) bool
- func IsLegacyField(fieldName string) bool
- func ListenForPolicyViolations(ctx context.Context, typ ...policyCondition) (<-chan PolicyViolation, error)
- func ListenForPolicyViolationsForGroup(ctx context.Context, group GroupHandle, typ ...policyCondition) (<-chan PolicyViolation, error)
- func SetPolicyForGroup(group GroupHandle, configs ...PolicyConfig) error
- func Shutdown() (err error)
- func UnwatchFields(fieldsGroup FieldHandle, group GroupHandle) error
- func UpdateAllFields() error
- func ViolationRegistration(data unsafe.Pointer) int
- func WatchFieldsWithGroup(fieldsGroup FieldHandle, group GroupHandle) error
- func WatchFieldsWithGroupEx(fieldsGroup FieldHandle, group GroupHandle, updateFreq int64, ...) error
- func WatchPolicyViolationsForGroup(ctx context.Context, group GroupHandle, typ ...PolicyCondition) (<-chan PolicyViolation, error)
- type CPUHierarchyCPU_v1
- type CPUHierarchy_v1
- type ClockInfo
- type DbePolicyCondition
- type DcgmBindUnbindEventState
- type Device
- type DeviceHealth
- type DeviceIdentifiers
- type DeviceStatus
- type DiagErrorDetail
- type DiagResult
- type DiagResults
- type DiagType
- type ECCErrorsInfo
- type EntityStatus
- type Error
- type FieldHandle
- type FieldMeta
- type FieldValue_v1
- func EntityGetLatestValues(entityGroup Field_Entity_Group, entityId uint, fields []Short) ([]FieldValue_v1, error)
- func GetLatestValuesForFields(gpu uint, fields []Short) ([]FieldValue_v1, error)
- func LinkGetLatestValues(index uint, parentType Field_Entity_Group, parentId uint, fields []Short) ([]FieldValue_v1, error)
- type FieldValue_v2
- type Field_Entity_Group
- type GroupEntityPair
- type GroupHandle
- func CreateGroup(groupName string) (goGroupId GroupHandle, err error)
- func CreateGroupWithContext(ctx context.Context, groupName string) (GroupHandle, error)
- func GroupAllGPUs() GroupHandle
- func NewDefaultGroup(groupName string) (GroupHandle, error)
- func WatchFields(gpuID uint, fieldsGroup FieldHandle, groupName string) (groupId GroupHandle, err error)
- func WatchPidFields() (GroupHandle, error)
- func WatchPidFieldsEx(updateFreq, maxKeepAge time.Duration, maxKeepSamples int, gpus ...uint) (GroupHandle, error)
- type GroupInfo
- type HealthCheckErrorCode
- type HealthResponse
- type HealthResult
- type HealthSystem
- type Incident
- type Link_State
- type MemoryInfo
- type MetricGroup
- type MigEntityInfo
- type MigHierarchyInfo
- type MigHierarchyInfo_v2
- type MigHierarchy_v2
- type MigProfile
- type NvLinkP2PStatus
- type NvLinkStatus
- type NvlinkPolicyCondition
- type P2PLink
- type P2PLinkType
- type PCIInfo
- type PCIStatusInfo
- type PCIThroughputInfo
- type PciPolicyCondition
- type PerfState
- type PolicyAction
- type PolicyCondition
- type PolicyConfig
- type PolicyStatus
- type PolicyValidation
- type PolicyViolation
- type PowerPolicyCondition
- type ProcessInfo
- type ProcessUtilInfo
- type RetiredPagesPolicyCondition
- type Short
- type Status
- type SystemWatch
- type ThermalPolicyCondition
- type Time
- type UtilizationInfo
- type VersionInfo
- type ViolationTime
- type XIDErrorInfo
- type XidPolicyCondition
Constants ¶
const ( Embedded mode = iota Standalone StartHostengine )
Constants for the DCGM hostengine running modes: Embedded, Standalone, or StartHostengine.
const ( // DCGM_FT_BINARY is the type for binary data DCGM_FT_BINARY = uint('b') // DCGM_FT_DOUBLE is the type for floating-point numbers DCGM_FT_DOUBLE = uint('d') // DCGM_FT_INT64 is the type for 64-bit integers DCGM_FT_INT64 = uint('i') // DCGM_FT_STRING is the type for strings DCGM_FT_STRING = uint('s') // DCGM_FT_TIMESTAMP is the type for timestamps DCGM_FT_TIMESTAMP = uint('t') // DCGM_FT_INT32_BLANK is the blank value for 32-bit integers DCGM_FT_INT32_BLANK = int64(2147483632) // DCGM_FT_INT32_NOT_FOUND is the value for not found in 32-bit integers DCGM_FT_INT32_NOT_FOUND = DCGM_FT_INT32_BLANK + 1 // DCGM_FT_INT32_NOT_SUPPORTED is the value for not supported in 32-bit integers DCGM_FT_INT32_NOT_SUPPORTED = DCGM_FT_INT32_BLANK + 2 // DCGM_FT_INT32_NOT_PERMISSIONED is the value for not permissioned in 32-bit integers DCGM_FT_INT32_NOT_PERMISSIONED = DCGM_FT_INT32_BLANK + 3 // DCGM_FT_INT64_BLANK is the blank value for 64-bit integers DCGM_FT_INT64_BLANK = int64(9223372036854775792) // DCGM_FT_INT64_NOT_FOUND is the value for not found in 64-bit integers DCGM_FT_INT64_NOT_FOUND = DCGM_FT_INT64_BLANK + 1 // DCGM_FT_INT64_NOT_SUPPORTED is the value for not supported in 64-bit integers DCGM_FT_INT64_NOT_SUPPORTED = DCGM_FT_INT64_BLANK + 2 // DCGM_FT_INT64_NOT_PERMISSIONED is the value for not permissioned in 64-bit integers DCGM_FT_INT64_NOT_PERMISSIONED = DCGM_FT_INT64_BLANK + 3 // DCGM_FT_FP64_BLANK is the blank value for floating-point numbers DCGM_FT_FP64_BLANK = 140737488355328.0 // DCGM_FT_FP64_NOT_FOUND is the value for not found in floating-point numbers DCGM_FT_FP64_NOT_FOUND = float64(DCGM_FT_FP64_BLANK + 1.0) // DCGM_FT_FP64_NOT_SUPPORTED is the value for not supported in floating-point numbers DCGM_FT_FP64_NOT_SUPPORTED = float64(DCGM_FT_FP64_BLANK + 2.0) // DCGM_FT_FP64_NOT_PERMISSIONED is the value for not permissioned in floating-point numbers DCGM_FT_FP64_NOT_PERMISSIONED = float64(DCGM_FT_FP64_BLANK + 3.0) // DCGM_FT_STR_BLANK is the blank value 
for strings DCGM_FT_STR_BLANK = "<<<NULL>>>" // DCGM_FT_STR_NOT_FOUND is the value for not found in strings DCGM_FT_STR_NOT_FOUND = "<<<NOT_FOUND>>>" // DCGM_FT_STR_NOT_SUPPORTED is the value for not supported in strings DCGM_FT_STR_NOT_SUPPORTED = "<<<NOT_SUPPORTED>>>" // DCGM_FT_STR_NOT_PERMISSIONED is the value for not permissioned in strings DCGM_FT_STR_NOT_PERMISSIONED = "<<<NOT_PERMISSIONED>>>" // DCGM_ST_OK is the value for ECC OK DCGM_ST_OK = 0 // DCGM_ST_BADPARAM is the value for ECC BAD PARAM DCGM_ST_BADPARAM = -1 // DCGM_ST_GENERIC_ERROR is the value for ECC GENERIC ERROR DCGM_ST_GENERIC_ERROR = -3 // DCGM_ST_MEMORY is the value for ECC MEMORY DCGM_ST_MEMORY = -4 // DCGM_ST_NOT_CONFIGURED is the value for ECC NOT CONFIGURED DCGM_ST_NOT_CONFIGURED = -5 // DCGM_ST_NOT_SUPPORTED is the value for ECC NOT SUPPORTED DCGM_ST_NOT_SUPPORTED = -6 // DCGM_ST_INIT_ERROR is the value for ECC INIT ERROR DCGM_ST_INIT_ERROR = -7 // DCGM_ST_NVML_ERROR is the value for ECC NVML ERROR DCGM_ST_NVML_ERROR = -8 // DCGM_ST_PENDING is the value for ECC PENDING DCGM_ST_PENDING = -9 // DCGM_ST_TIMEOUT is the value for ECC TIMEOUT DCGM_ST_TIMEOUT = -11 // DCGM_ST_VER_MISMATCH is the value for ECC VER MISMATCH DCGM_ST_VER_MISMATCH = -12 // DCGM_ST_UNKNOWN_FIELD is the value for ECC UNKNOWN FIELD DCGM_ST_UNKNOWN_FIELD = -13 // DCGM_ST_NO_DATA is the value for ECC NO DATA DCGM_ST_NO_DATA = -14 // DCGM_ST_STALE_DATA is the value for ECC STALE DATA DCGM_ST_STALE_DATA = -15 // DCGM_ST_NOT_WATCHED is the value for ECC NOT WATCHED DCGM_ST_NOT_WATCHED = -16 // DCGM_ST_NO_PERMISSION is the value for ECC NO PERMISSION DCGM_ST_NO_PERMISSION = -17 // DCGM_ST_GPU_IS_LOST is the value for ECC GPU IS LOST DCGM_ST_GPU_IS_LOST = -18 // DCGM_ST_RESET_REQUIRED is the value for ECC RESET REQUIRED DCGM_ST_RESET_REQUIRED = -19 // DCGM_ST_FUNCTION_NOT_FOUND is the value for ECC FUNCTION NOT FOUND DCGM_ST_FUNCTION_NOT_FOUND = -20 // DCGM_ST_CONNECTION_NOT_VALID is the value for ECC CONNECTION NOT VALID 
DCGM_ST_CONNECTION_NOT_VALID = -21 // DCGM_ST_GPU_NOT_SUPPORTED is the value for ECC GPU NOT SUPPORTED DCGM_ST_GPU_NOT_SUPPORTED = -22 // DCGM_ST_GROUP_INCOMPATIBLE is the value for ECC GROUP INCOMPATIBLE DCGM_ST_GROUP_INCOMPATIBLE = -23 // DCGM_ST_MAX_LIMIT is the value for ECC MAX LIMIT DCGM_ST_MAX_LIMIT = -24 // DCGM_ST_LIBRARY_NOT_FOUND is the value for ECC LIBRARY NOT FOUND DCGM_ST_LIBRARY_NOT_FOUND = -25 // DCGM_ST_DUPLICATE_KEY is the value for ECC DUPLICATE KEY DCGM_ST_DUPLICATE_KEY = -26 // DCGM_ST_GPU_IN_SYNC_BOOST_GROUP is the value for ECC GPU IN SYNC BOOST GROUP DCGM_ST_GPU_IN_SYNC_BOOST_GROUP = -27 // DCGM_ST_GPU_NOT_IN_SYNC_BOOST_GROUP is the value for ECC GPU NOT IN SYNC BOOST GROUP DCGM_ST_GPU_NOT_IN_SYNC_BOOST_GROUP = -28 // DCGM_ST_REQUIRES_ROOT is the value for ECC REQUIRES ROOT DCGM_ST_REQUIRES_ROOT = -29 // DCGM_ST_NVVS_ERROR is the value for ECC NVVS ERROR DCGM_ST_NVVS_ERROR = -30 // DCGM_ST_INSUFFICIENT_SIZE is the value for ECC INSUFFICIENT SIZE DCGM_ST_INSUFFICIENT_SIZE = -31 // DCGM_ST_FIELD_UNSUPPORTED_BY_API is the value for ECC FIELD UNSUPPORTED BY API DCGM_ST_FIELD_UNSUPPORTED_BY_API = -32 // DCGM_ST_MODULE_NOT_LOADED is the value for ECC MODULE NOT LOADED DCGM_ST_MODULE_NOT_LOADED = -33 // DCGM_ST_IN_USE is the value for ECC IN USE DCGM_ST_IN_USE = -34 // DCGM_ST_GROUP_IS_EMPTY is the value for ECC GROUP IS EMPTY DCGM_ST_GROUP_IS_EMPTY = -35 // DCGM_ST_PROFILING_NOT_SUPPORTED is the value for ECC PROFILING NOT SUPPORTED DCGM_ST_PROFILING_NOT_SUPPORTED = -36 // DCGM_ST_PROFILING_LIBRARY_ERROR is the value for ECC PROFILING LIBRARY ERROR DCGM_ST_PROFILING_LIBRARY_ERROR = -37 // DCGM_ST_PROFILING_MULTI_PASS is the value for ECC PROFILING MULTI PASS DCGM_ST_PROFILING_MULTI_PASS = -38 // DCGM_ST_DIAG_ALREADY_RUNNING is the value for ECC DIAG ALREADY RUNNING DCGM_ST_DIAG_ALREADY_RUNNING = -39 // DCGM_ST_DIAG_BAD_JSON is the value for ECC DIAG BAD JSON DCGM_ST_DIAG_BAD_JSON = -40 // DCGM_ST_DIAG_BAD_LAUNCH is the value for ECC DIAG BAD 
LAUNCH DCGM_ST_DIAG_BAD_LAUNCH = -41 // DCGM_ST_DIAG_UNUSED is the value for ECC DIAG UNUSED DCGM_ST_DIAG_UNUSED = -42 // DCGM_ST_DIAG_THRESHOLD_EXCEEDED is the value for ECC DIAG THRESHOLD EXCEEDED DCGM_ST_DIAG_THRESHOLD_EXCEEDED = -43 // DCGM_ST_INSUFFICIENT_DRIVER_VERSION is the value for ECC INSUFFICIENT DRIVER VERSION DCGM_ST_INSUFFICIENT_DRIVER_VERSION = -44 // DCGM_ST_INSTANCE_NOT_FOUND is the value for ECC INSTANCE NOT FOUND DCGM_ST_INSTANCE_NOT_FOUND = -45 // DCGM_ST_COMPUTE_INSTANCE_NOT_FOUND is the value for ECC COMPUTE INSTANCE NOT FOUND DCGM_ST_COMPUTE_INSTANCE_NOT_FOUND = -46 // DCGM_ST_CHILD_NOT_KILLED is the value for ECC CHILD NOT KILLED DCGM_ST_CHILD_NOT_KILLED = -47 // DCGM_ST_3RD_PARTY_LIBRARY_ERROR is the value for ECC 3RD PARTY LIBRARY ERROR DCGM_ST_3RD_PARTY_LIBRARY_ERROR = -48 // DCGM_ST_INSUFFICIENT_RESOURCES is the value for ECC INSUFFICIENT RESOURCES DCGM_ST_INSUFFICIENT_RESOURCES = -49 // DCGM_ST_PLUGIN_EXCEPTION is the value for ECC PLUGIN EXCEPTION DCGM_ST_PLUGIN_EXCEPTION = -50 // DCGM_ST_NVVS_ISOLATE_ERROR is the value for ECC NVVS ISOLATE ERROR DCGM_ST_NVVS_ISOLATE_ERROR = -51 // DCGM_ST_NVVS_BINARY_NOT_FOUND is the value for ECC NVVS BINARY NOT FOUND DCGM_ST_NVVS_BINARY_NOT_FOUND = -52 // DCGM_ST_NVVS_KILLED is the value for ECC NVVS KILLED DCGM_ST_NVVS_KILLED = -53 // DCGM_ST_PAUSED is the value for ECC PAUSED DCGM_ST_PAUSED = -54 // DCGM_ST_ALREADY_INITIALIZED is the value for ECC ALREADY INITIALIZED DCGM_ST_ALREADY_INITIALIZED = -55 // DCGM_ST_NVML_NOT_LOADED is the value for ECC NVML NOT LOADED DCGM_ST_NVML_NOT_LOADED = -56 // DCGM_ST_NVML_DRIVER_TIMEOUT is the value for ECC NVML DRIVER TIMEOUT DCGM_ST_NVML_DRIVER_TIMEOUT = -57 // DCGM_ST_NVVS_NO_AVAILABLE_TEST is the value for ECC NVVS NO AVAILABLE TEST DCGM_ST_NVVS_NO_AVAILABLE_TEST = -58 // DCGM_ST_UNINITIALIZED is the value for DCGM not initialized DCGM_ST_UNINITIALIZED = -59 // DCGM_ST_NO_NVVS is the value for NVVS not available DCGM_ST_NO_NVVS = -60 // 
DCGM_ST_NVVS_NOT_RUNNING is the value for NVVS not running DCGM_ST_NVVS_NOT_RUNNING = -61 // DCGM_ST_CHILD_SPAWN_FAILED is the value for child spawn failed DCGM_ST_CHILD_SPAWN_FAILED = -62 // DCGM_ST_FILE_IO_ERROR is the value for file I/O error DCGM_ST_FILE_IO_ERROR = -63 // DCGM_ST_CHILD_SIGNAL_RECEIVED is the value for child signal received DCGM_ST_CHILD_SIGNAL_RECEIVED = -64 // DCGM_ST_CALLER_ALREADY_STOPPED is the value for caller already stopped DCGM_ST_CALLER_ALREADY_STOPPED = -65 // DCGM_ST_DIAG_STOPPED is the value for diagnostic stopped DCGM_ST_DIAG_STOPPED = -66 )
FieldType constants
const ( // MAX_NUM_CPU_CORES represents the maximum number of CPU cores supported MAX_NUM_CPU_CORES = uint(C.DCGM_MAX_NUM_CPU_CORES) // MAX_NUM_CPUS represents the maximum number of CPUs supported MAX_NUM_CPUS = uint(C.DCGM_MAX_NUM_CPUS) // CHAR_BIT represents the number of bits in a byte CHAR_BIT = uint(C.CHAR_BIT) // MAX_CPU_CORE_BITMASK_COUNT represents the maximum count of CPU core bitmasks MAX_CPU_CORE_BITMASK_COUNT = uint(1024 / 8 / 8) )
const ( // PerfStateMax represents the highest performance state (P0) PerfStateMax = 0 // PerfStateMin represents the lowest performance state (P15) PerfStateMin = 15 // PerfStateUnknown represents an unknown performance state PerfStateUnknown = 32 )
const ( // MAX_NUM_DEVICES represents the maximum number of GPU devices supported MAX_NUM_DEVICES = uint(C.DCGM_MAX_NUM_DEVICES) // MAX_HIERARCHY_INFO represents the maximum size of the MIG hierarchy information MAX_HIERARCHY_INFO = uint(C.DCGM_MAX_HIERARCHY_INFO) )
const ( // DbePolicy represents a Double-bit ECC error policy condition DbePolicy = PolicyCondition("Double-bit ECC error") // PCIePolicy represents a PCI error policy condition PCIePolicy = PolicyCondition("PCI error") // MaxRtPgPolicy represents a Maximum Retired Pages Limit policy condition MaxRtPgPolicy = PolicyCondition("Max Retired Pages Limit") // ThermalPolicy represents a Thermal Limit policy condition ThermalPolicy = PolicyCondition("Thermal Limit") // PowerPolicy represents a Power Limit policy condition PowerPolicy = PolicyCondition("Power Limit") // NvlinkPolicy represents an NVLink error policy condition NvlinkPolicy = PolicyCondition("Nvlink Error") // XidPolicy represents an XID error policy condition XidPolicy = PolicyCondition("XID Error") )
Policy condition types
const ( // DefaultMaxRetiredPages is the default threshold for retired pages (matches dcgmi default) DefaultMaxRetiredPages = 10 // DefaultMaxTemperature is the default threshold for temperature in Celsius (matches dcgmi default) DefaultMaxTemperature = 100 // DefaultMaxPower is the default threshold for power in Watts (matches dcgmi default) DefaultMaxPower = 250 )
Default policy thresholds matching dcgmi defaults
const ( // DCGM_NVSDM_MOCK_YAML is the environment variable for enabling NVSDM mock configuration DCGM_NVSDM_MOCK_YAML = "DCGM_NVSDM_MOCK_YAML" // DCGM_DBG_FILE is the environment variable that enables DCGM to write debug logs to a specific file DCGM_DBG_FILE = "__DCGM_DBG_FILE" // DCGM_DBG_LVL is the environment variable that controls the DCGM logging level DCGM_DBG_LVL = "__DCGM_DBG_LVL" )
const (
DCGM_FV_FLAG_LIVE_DATA = uint(0x00000001)
)
DCGM_FV_FLAG_LIVE_DATA is a field-value request flag that asks DCGM to retrieve live data from the driver rather than a cached value.
const (
DCGM_GROUP_MAX_ENTITIES int = C.DCGM_GROUP_MAX_ENTITIES_V2
)
DCGM_GROUP_MAX_ENTITIES represents the maximum number of entities allowed in a group
const DIAG_RESULT_STRING_SIZE = 1024
DIAG_RESULT_STRING_SIZE represents the maximum size of diagnostic result strings
Variables ¶
var ErrInvalidMode = errors.New("invalid mode")
ErrInvalidMode represents an error indicating that an invalid mode was used
Functions ¶
func AddEntityToGroup ¶
func AddEntityToGroup(groupID GroupHandle, entityGroupID Field_Entity_Group, entityID uint) (err error)
AddEntityToGroup adds an entity to an existing group
func AddLinkEntityToGroup ¶
func AddLinkEntityToGroup(groupID GroupHandle, index uint, entityGroupID Field_Entity_Group, parentID uint) (err error)
AddLinkEntityToGroup adds a link entity to the group
func AddToGroup ¶
func AddToGroup(groupID GroupHandle, gpuID uint) (err error)
AddToGroup adds a GPU to an existing group
func AttachDriver ¶
func AttachDriver() error
AttachDriver attaches the driver to DCGM. This is used to reattach the driver after a DetachDriver call, typically when updating the driver without restarting DCGM. Requires DCGM 4.5.0 or later.
func ClearPolicyForGroup ¶
func ClearPolicyForGroup(group GroupHandle) error
ClearPolicyForGroup clears all policy conditions for a GPU group
func CreateFakeEntities ¶
func CreateFakeEntities(entities []MigHierarchyInfo) ([]uint, error)
CreateFakeEntities creates test entities with the specified MIG hierarchy information. This function is intended for testing purposes only. Returns a slice of Entity IDs for the created entities and any error encountered.
func DestroyGroup ¶
func DestroyGroup(groupID GroupHandle) (err error)
DestroyGroup destroys an existing GPU group
func DetachDriver ¶
func DetachDriver() error
DetachDriver detaches the driver from DCGM. This is used when you want to update the driver without restarting DCGM. After detaching, GPUs will not be accessible until AttachDriver is called. Requires DCGM 4.5.0 or later.
func FieldGroupDestroy ¶
func FieldGroupDestroy(fieldsGroup FieldHandle) (err error)
FieldGroupDestroy destroys a previously created field group. Returns an error if the group cannot be destroyed.
func FieldsInit ¶
func FieldsInit() int
FieldsInit initializes the DCGM fields module. Returns an integer status code.
func FieldsTerm ¶
func FieldsTerm() int
FieldsTerm terminates the DCGM fields module. Returns an integer status code.
func FindFirstNonAsciiIndex ¶
FindFirstNonAsciiIndex returns the index of the first non-ASCII character in the byte array. Returns 4096 if no non-ASCII character is found.
func Fv2_Blob ¶
func Fv2_Blob(fv FieldValue_v2) [4096]byte
Fv2_Blob returns the raw field value of a FieldValue_v2 as a byte array.
func Fv2_String ¶
func Fv2_String(fv FieldValue_v2) string
Fv2_String returns the string value of a FieldValue_v2.
func GetAllDeviceCount ¶
GetAllDeviceCount returns the count of all GPUs in the system
func GetAllSupportedFieldsMetadata ¶
GetAllSupportedFieldsMetadata retrieves metadata for all supported DCGM fields. It returns a map of field IDs to their corresponding FieldMeta information. Unsupported fields are excluded from the resulting map.
func GetEntityGroupEntities ¶
func GetEntityGroupEntities(entityGroup Field_Entity_Group) ([]uint, error)
GetEntityGroupEntities returns all entities of the specified group type
func GetSupportedDevices ¶
GetSupportedDevices returns a list of DCGM-supported GPU IDs
func HealthSet ¶
func HealthSet(groupID GroupHandle, systems HealthSystem) (err error)
HealthSet enables the DCGM health check system for the given systems. It configures which health watch systems should be monitored for the specified group.
func Init ¶
Init starts DCGM in the specified mode. Mode can be:
- Embedded: Start hostengine within this process
- Standalone: Connect to an already running nv-hostengine
- StartHostengine: Start and connect to nv-hostengine, terminate before exiting
Returns a cleanup function and any error encountered.
func InjectFieldValue ¶
func InjectFieldValue(gpu uint, fieldID Short, fieldType uint, status int, ts int64, value any) error
InjectFieldValue injects a test value for a specific field into DCGM's field manager. This function is intended for testing purposes only.
Parameters:
- gpu: The GPU ID to inject the field value for
- fieldID: The DCGM field identifier
- fieldType: The type of the field (e.g., DCGM_FT_INT64, DCGM_FT_DOUBLE)
- status: The status code for the field
- ts: The timestamp for the field value
- value: The value to inject (must match fieldType)
Returns an error if the injection fails
func IsCurrentField ¶
IsCurrentField returns true if the given field name is a current field
func IsInt32Blank ¶
IsInt32Blank checks if an integer value represents DCGM's "blank" or sentinel value (0x7ffffff0). These values indicate that no valid data is available for the field.
func IsInt64Blank ¶
IsInt64Blank checks if an integer value represents DCGM's "blank" or sentinel value (0x7ffffffffffffff0). These values indicate that no valid data is available for the field.
func IsLegacyField ¶
IsLegacyField returns true if the given field name is a legacy field
func ListenForPolicyViolations ¶
func ListenForPolicyViolations(ctx context.Context, typ ...policyCondition) (<-chan PolicyViolation, error)
ListenForPolicyViolations sets up monitoring for the specified policy conditions on all GPUs. Returns a channel that receives policy violations and any error encountered.
Important: The context MUST be cancelled when monitoring is no longer needed to properly clean up resources and prevent goroutine leaks. When the context is cancelled, the returned channel will be closed and all resources will be automatically cleaned up.
Example:
ctx, cancel := context.WithCancel(context.Background())
defer cancel() // Ensures cleanup happens
violations, err := dcgm.ListenForPolicyViolations(ctx, dcgm.XidPolicy)
if err != nil {
return err
}
for violation := range violations {
// Handle violation...
}
func ListenForPolicyViolationsForGroup ¶
func ListenForPolicyViolationsForGroup(ctx context.Context, group GroupHandle, typ ...policyCondition) (<-chan PolicyViolation, error)
ListenForPolicyViolationsForGroup sets up policy monitoring for the specified GPU group. Returns a channel that receives policy violations and any error encountered.
Important: The context MUST be cancelled when monitoring is no longer needed to properly clean up resources and prevent goroutine leaks. See ListenForPolicyViolations for usage example.
func SetPolicyForGroup ¶
func SetPolicyForGroup(group GroupHandle, configs ...PolicyConfig) error
SetPolicyForGroup configures policies with optional custom thresholds and actions for a GPU group
func Shutdown ¶
func Shutdown() (err error)
Shutdown stops DCGM and destroys all connections. Returns an error if DCGM is not initialized.
func UnwatchFields ¶
func UnwatchFields(fieldsGroup FieldHandle, group GroupHandle) error
UnwatchFields stops monitoring the specified fields for a GPU group. fieldsGroup is the handle to the field group to stop watching. group is the handle to the GPU group to stop watching.
func UpdateAllFields ¶
func UpdateAllFields() error
UpdateAllFields forces an update of all field values. Returns an error if the update fails.
func ViolationRegistration ¶
ViolationRegistration is the Go callback function for dcgmPolicyRegister(); it is invoked through the C wrapper C.violationNotify().
func WatchFieldsWithGroup ¶
func WatchFieldsWithGroup(fieldsGroup FieldHandle, group GroupHandle) error
WatchFieldsWithGroup starts monitoring fields using default parameters. fieldsGroup is the handle of the field group to watch. group is the group handle to associate with the watch. Returns an error if the watch operation fails.
func WatchFieldsWithGroupEx ¶
func WatchFieldsWithGroupEx( fieldsGroup FieldHandle, group GroupHandle, updateFreq int64, maxKeepAge float64, maxKeepSamples int32, ) error
WatchFieldsWithGroupEx starts monitoring fields with custom parameters. fieldsGroup is the handle of the field group to watch. group is the group handle to associate with the watch. updateFreq is the update frequency in microseconds. maxKeepAge is the maximum age of samples to keep in seconds. maxKeepSamples is the maximum number of samples to keep. Returns an error if the watch operation fails.
func WatchPolicyViolationsForGroup ¶
func WatchPolicyViolationsForGroup(ctx context.Context, group GroupHandle, typ ...PolicyCondition) (<-chan PolicyViolation, error)
WatchPolicyViolationsForGroup registers to receive violation notifications for a specific GPU group
Types ¶
type CPUHierarchyCPU_v1 ¶
type CPUHierarchyCPU_v1 struct {
// CPUID is the unique identifier for this CPU
CPUID uint
// OwnedCores is a bitmask array representing the cores owned by this CPU
OwnedCores []uint64
}
CPUHierarchyCPU_v1 represents information about a single CPU and its owned cores
type CPUHierarchy_v1 ¶
type CPUHierarchy_v1 struct {
// Version is the version number of the hierarchy structure
Version uint
// NumCPUs is the number of CPUs in the system
NumCPUs uint
// CPUs contains information about each CPU in the system
CPUs [MAX_NUM_CPUS]CPUHierarchyCPU_v1
}
CPUHierarchy_v1 represents version 1 of the CPU hierarchy information
func GetCPUHierarchy ¶
func GetCPUHierarchy() (hierarchy CPUHierarchy_v1, err error)
GetCPUHierarchy retrieves the CPU hierarchy information from DCGM
type DbePolicyCondition ¶
type DbePolicyCondition struct {
// Location specifies where the ECC error occurred
Location string
// NumErrors indicates the number of errors detected
NumErrors uint
}
DbePolicyCondition contains details about a Double-bit ECC error
type DcgmBindUnbindEventState ¶
type DcgmBindUnbindEventState int
DcgmBindUnbindEventState represents the state of GPU bind/unbind events
const ( // DcgmBUEventStateSystemReinitializing indicates the system is reinitializing (GPU unbind) DcgmBUEventStateSystemReinitializing DcgmBindUnbindEventState = 1 // DcgmBUEventStateSystemReinitializationCompleted indicates system reinitialization is complete (GPU bind) DcgmBUEventStateSystemReinitializationCompleted DcgmBindUnbindEventState = 2 )
type Device ¶
type Device struct {
GPU uint
DCGMSupported string
UUID string
Power uint // W
PCI PCIInfo
Identifiers DeviceIdentifiers
Topology []P2PLink
CPUAffinity string
}
Device represents a GPU device and its properties
func GetDeviceInfo ¶
GetDeviceInfo returns detailed information about the specified GPU
type DeviceHealth ¶
type DeviceHealth struct {
// GPU is the ID of the GPU device
GPU uint
// Status indicates the overall health status of the GPU
Status string
// Watches contains the status of individual health watch systems
Watches []SystemWatch
}
DeviceHealth represents the health status of a GPU device
func HealthCheckByGpuId ¶
func HealthCheckByGpuId(gpuID uint) (DeviceHealth, error)
HealthCheckByGpuId performs a health check on the specified GPU
type DeviceIdentifiers ¶
type DeviceIdentifiers struct {
Brand string
Model string
Serial string
Vbios string
InforomImageVersion string
DriverVersion string
}
DeviceIdentifiers contains various identification information for a GPU device
type DeviceStatus ¶
type DeviceStatus struct {
Power float64 // W
Temperature int64 // °C
Utilization UtilizationInfo
Memory MemoryInfo
Clocks ClockInfo
PCI PCIStatusInfo
Performance PerfState
FanSpeed int64 // %
}
DeviceStatus contains comprehensive GPU device status information
func GetDeviceStatus ¶
func GetDeviceStatus(gpuID uint) (DeviceStatus, error)
GetDeviceStatus returns current status information about the specified GPU
type DiagErrorDetail ¶
type DiagErrorDetail struct {
// Message contains a human-readable description of the error
Message string
// Code identifies the specific type of error
Code HealthCheckErrorCode
}
DiagErrorDetail contains detailed information about a health check error
type DiagResult ¶
type DiagResult struct {
// Status indicates the test result: "pass", "fail", "warn", "skip", or "notrun"
Status string
// TestName is the name of the diagnostic test that was run
TestName string
// TestOutput contains any additional output or messages from the test
TestOutput string
// ErrorCode is the numeric error code if the test failed
ErrorCode uint
// ErrorMessage contains a detailed error message if the test failed
ErrorMessage string
// Serial number of the tested entity
SerialNumber string
// EntityID is the ID of the tested entity
EntityID uint
}
DiagResult represents the result of a single diagnostic test
type DiagResults ¶
type DiagResults struct {
// Software contains the results of software-related diagnostic tests
Software []DiagResult
}
DiagResults contains the results of all diagnostic tests
func RunDiag ¶
func RunDiag(diagType DiagType, groupID GroupHandle) (DiagResults, error)
RunDiag runs diagnostic tests on a group of GPUs with the specified diagnostic level. Parameters:
- diagType: The type/level of diagnostic test to run (Quick, Medium, Long, or Extended)
- groupID: The group of GPUs to run diagnostics on
Returns:
- DiagResults containing the results of all diagnostic tests
- error if the diagnostics failed to run
type DiagType ¶
type DiagType int
DiagType represents the type of diagnostic test to run
const ( // DiagQuick represents a quick diagnostic test that performs basic health checks DiagQuick DiagType = 1 // DiagMedium represents a medium-length diagnostic test that performs more comprehensive checks DiagMedium DiagType = 2 // DiagLong represents a long diagnostic test that performs extensive health checks DiagLong DiagType = 3 // DiagExtended represents an extended diagnostic test that performs the most thorough system checks DiagExtended DiagType = 4 )
type ECCErrorsInfo ¶
ECCErrorsInfo contains ECC memory error counts
type EntityStatus ¶
type EntityStatus uint
EntityStatus represents the status of a GPU entity
const ( // EntityStatusUnknown - Entity has not been referenced yet EntityStatusUnknown EntityStatus = 0 // EntityStatusOk - Entity is known and OK EntityStatusOk EntityStatus = 1 // EntityStatusUnsupported - Entity is unsupported by DCGM EntityStatusUnsupported EntityStatus = 2 // EntityStatusInaccessible - Entity is inaccessible, usually due to cgroups EntityStatusInaccessible EntityStatus = 3 // EntityStatusLost - Entity has been lost. Usually set from NVML returning NVML_ERROR_GPU_IS_LOST EntityStatusLost EntityStatus = 4 // EntityStatusFake - Entity is a fake, injection-only entity for testing EntityStatusFake EntityStatus = 5 // EntityStatusDisabled - Don't collect values from this GPU EntityStatusDisabled EntityStatus = 6 // EntityStatusDetached - Entity is detached, not good for any uses EntityStatusDetached EntityStatus = 7 )
func GetGPUStatus ¶
func GetGPUStatus(gpuID uint) EntityStatus
GetGPUStatus returns the entity status of the specified GPU
func (EntityStatus) String ¶
func (e EntityStatus) String() string
String returns a string representation of the entity status
type Error ¶
type Error struct {
Code C.dcgmReturn_t // dcgmReturn_t value of error
// contains filtered or unexported fields
}
Error represents an error returned by the DCGM library
type FieldHandle ¶
type FieldHandle struct {
// contains filtered or unexported fields
}
FieldHandle represents a handle to a DCGM field group
func FieldGroupCreate ¶
func FieldGroupCreate(fieldsGroupName string, fields []Short) (fieldsId FieldHandle, err error)
FieldGroupCreate creates a new field group with the specified fields. fieldsGroupName is the name for the new group. fields is a slice of field IDs to include in the group. Returns the field group handle and any error encountered.
Important: Field groups must be destroyed using FieldGroupDestroy when no longer needed to prevent resource leaks in the DCGM library.
Example:
fieldGroup, err := dcgm.FieldGroupCreate("myFields", []dcgm.Short{dcgm.DCGM_FI_DEV_POWER_USAGE})
if err != nil {
return err
}
defer dcgm.FieldGroupDestroy(fieldGroup)
// Use the field group...
func (*FieldHandle) GetHandle ¶
func (f *FieldHandle) GetHandle() uintptr
GetHandle returns the internal DCGM field group handle as a uintptr
func (*FieldHandle) SetHandle ¶
func (f *FieldHandle) SetHandle(val uintptr)
SetHandle sets the internal DCGM field group handle to the provided value
type FieldMeta ¶
type FieldMeta struct {
FieldID Short // Unique identifier for the field
FieldType byte // Type of the field (e.g., integer, float, string)
Size byte // Size of the field in bytes
Tag string // Human-readable tag/name for the field
Scope int // Scope of the field
NvmlFieldID int // Corresponding NVML field identifier
EntityLevel Field_Entity_Group // Entity level/group this field belongs to
}
FieldMeta represents metadata about a DCGM field, including its identifier, type, size, and other attributes. This struct is used to describe the characteristics and properties of fields that can be monitored or queried through DCGM.
func FieldGetByID ¶
FieldGetByID retrieves field metadata for the specified field ID. Returns a zeroed FieldMeta if DCGM does not recognize the field ID.
func ToFieldMeta ¶
func ToFieldMeta(fieldInfo C.dcgm_field_meta_p) FieldMeta
ToFieldMeta converts a C DCGM field metadata structure to a Go FieldMeta struct. In case of an invalid fieldInfo pointer, it returns a zeroed FieldMeta.
type FieldValue_v1 ¶
type FieldValue_v1 struct {
Version uint
FieldID Short
FieldType uint
Status int
TS int64
Value [4096]byte
}
FieldValue_v1 represents a field value in version 1
func EntityGetLatestValues ¶
func EntityGetLatestValues(entityGroup Field_Entity_Group, entityId uint, fields []Short) ([]FieldValue_v1, error)
EntityGetLatestValues retrieves the latest values for specified fields of any entity. entityGroup specifies the type of entity to query. entityId is the ID of the entity. fields is a slice of field IDs to retrieve. Returns a slice of field values and any error encountered.
func GetLatestValuesForFields ¶
func GetLatestValuesForFields(gpu uint, fields []Short) ([]FieldValue_v1, error)
GetLatestValuesForFields retrieves the most recent values for the specified fields. gpu is the ID of the GPU to query. fields is a slice of field IDs to retrieve. Returns a slice of field values and any error encountered.
func LinkGetLatestValues ¶
func LinkGetLatestValues(index uint, parentType Field_Entity_Group, parentId uint, fields []Short) ([]FieldValue_v1, error)
LinkGetLatestValues retrieves the latest values for specified fields of a link entity. index is the link index. parentType is the entity group of the parent entity. parentId is the ID of the parent entity. fields is a slice of field IDs to retrieve. Returns a slice of field values and any error encountered.
func (FieldValue_v1) Blob ¶
func (fv FieldValue_v1) Blob() [4096]byte
Blob returns the raw field value as a byte array.
func (FieldValue_v1) Float64 ¶
func (fv FieldValue_v1) Float64() float64
Float64 returns the field value as a float64.
func (FieldValue_v1) Int64 ¶
func (fv FieldValue_v1) Int64() int64
Int64 returns the field value as an int64.
func (FieldValue_v1) String ¶
func (fv FieldValue_v1) String() string
String returns the field value as a string.
type FieldValue_v2 ¶
type FieldValue_v2 struct {
Version uint
EntityGroupId Field_Entity_Group
EntityID uint
FieldID Short
FieldType uint
Status int
TS int64
Value [4096]byte
StringValue *string
}
FieldValue_v2 represents a field value in version 2
func EntitiesGetLatestValues ¶
func EntitiesGetLatestValues(entities []GroupEntityPair, fields []Short, flags uint) ([]FieldValue_v2, error)
EntitiesGetLatestValues retrieves the latest values for specified fields across multiple entities. entities is a slice of entity pairs to query. fields is a slice of field IDs to retrieve. flags specify additional options for the query. Returns a slice of field values and any error encountered.
func GetValuesSince ¶
func GetValuesSince(gpuGroup GroupHandle, fieldGroup FieldHandle, sinceTime time.Time) ([]FieldValue_v2, time.Time, error)
GetValuesSince reads and returns field values for a specified group of entities, such as GPUs, that have been updated since a given timestamp. It allows for targeted data retrieval based on time criteria.
gpuGroup is a GroupHandle that identifies the group of entities to operate on. It can be obtained from CreateGroup for a specific group of GPUs, or from GroupAllGPUs() to target all GPUs.
fieldGroup is a FieldHandle representing the group of fields for which data is requested.
sinceTime is a time.Time value representing the timestamp from which to request updated values. A zero value (time.Time{}) requests all available data.
Returns []FieldValue_v2 slice containing the requested field values, a time.Time indicating the time of the latest data retrieval, and an error if there is any issue during the operation.
If the number of field values exceeds maxCallbackValues (131,072), an error is returned to prevent unbounded memory growth. To avoid this, reduce the time range, field group size, or entity count.
func (FieldValue_v2) Blob ¶
func (fv FieldValue_v2) Blob() [4096]byte
Blob returns the raw field value as a byte array.
func (FieldValue_v2) Float64 ¶
func (fv FieldValue_v2) Float64() float64
Float64 returns the field value as a float64.
func (FieldValue_v2) Int64 ¶
func (fv FieldValue_v2) Int64() int64
Int64 returns the field value as an int64.
func (FieldValue_v2) String ¶
func (fv FieldValue_v2) String() string
String returns the field value as a string.
type Field_Entity_Group ¶
type Field_Entity_Group uint
Field_Entity_Group represents the type of DCGM entity
const ( // FE_NONE represents no entity type FE_NONE Field_Entity_Group = iota // FE_GPU represents a GPU device entity FE_GPU // FE_VGPU represents a virtual GPU entity FE_VGPU // FE_SWITCH represents an NVSwitch entity FE_SWITCH // FE_GPU_I represents a GPU instance entity FE_GPU_I // FE_GPU_CI represents a GPU compute instance entity FE_GPU_CI // FE_LINK represents an NVLink entity FE_LINK // FE_CPU represents a CPU entity FE_CPU // FE_CPU_CORE represents a CPU core entity FE_CPU_CORE // FE_COUNT represents the total number of entity types FE_COUNT )
func (Field_Entity_Group) String ¶
func (e Field_Entity_Group) String() string
String returns a string representation of the Field_Entity_Group
type GroupEntityPair ¶
type GroupEntityPair struct {
// EntityGroupId specifies the type of the entity
EntityGroupId Field_Entity_Group
// EntityId is the unique identifier for this entity
EntityId uint
}
GroupEntityPair represents a DCGM entity and its group identifier
type GroupHandle ¶
type GroupHandle struct {
// contains filtered or unexported fields
}
GroupHandle represents a handle to a DCGM GPU group
func CreateGroup ¶
func CreateGroup(groupName string) (goGroupId GroupHandle, err error)
CreateGroup creates a new empty GPU group with the specified name.
Important: Groups must be destroyed using DestroyGroup when no longer needed to prevent resource leaks in the DCGM library.
Example:
group, err := dcgm.CreateGroup("myGroup")
if err != nil {
return err
}
defer dcgm.DestroyGroup(group)
// Use the group...
func CreateGroupWithContext ¶
func CreateGroupWithContext(ctx context.Context, groupName string) (GroupHandle, error)
CreateGroupWithContext creates a new group with a context
func GroupAllGPUs ¶
func GroupAllGPUs() GroupHandle
GroupAllGPUs returns a GroupHandle representing all GPUs in the system
func NewDefaultGroup ¶
func NewDefaultGroup(groupName string) (GroupHandle, error)
NewDefaultGroup creates a new group with default GPUs and the specified name
func WatchFields ¶
func WatchFields(gpuID uint, fieldsGroup FieldHandle, groupName string) (groupId GroupHandle, err error)
WatchFields starts monitoring the specified fields for a GPU. gpuID is the ID of the GPU to monitor. fieldsGroup is the handle of the field group to watch. groupName is a name for the watch group. Returns a group handle and any error encountered.
func WatchPidFields ¶
func WatchPidFields() (GroupHandle, error)
WatchPidFields configures DCGM to start recording stats for GPU processes. It must be called before GetProcessInfo.
Important: The returned GroupHandle should be cleaned up by calling DestroyGroup when monitoring is no longer needed to prevent resource leaks.
Example:
group, err := dcgm.WatchPidFields()
if err != nil {
return err
}
defer dcgm.DestroyGroup(group)
// Use GetProcessInfo with the group...
func WatchPidFieldsEx ¶
func WatchPidFieldsEx(updateFreq, maxKeepAge time.Duration, maxKeepSamples int, gpus ...uint) (GroupHandle, error)
WatchPidFieldsEx is the same as WatchPidFields, but allows for modifying the update frequency, max samples, max sample age, and the GPUs on which to enable watches.
func (*GroupHandle) GetHandle ¶
func (g *GroupHandle) GetHandle() uintptr
GetHandle returns the internal group handle value
func (*GroupHandle) SetHandle ¶
func (g *GroupHandle) SetHandle(val uintptr)
SetHandle sets the internal group handle value
type GroupInfo ¶
type GroupInfo struct {
Version uint32
GroupName string
EntityList []GroupEntityPair
}
GroupInfo contains information about a DCGM group
func GetGroupInfo ¶
func GetGroupInfo(groupID GroupHandle) (*GroupInfo, error)
GetGroupInfo retrieves information about a DCGM group
type HealthCheckErrorCode ¶
type HealthCheckErrorCode uint
HealthCheckErrorCode error codes for passive and active health checks.
const ( // DCGM_FR_OK No error DCGM_FR_OK HealthCheckErrorCode = 0 // DCGM_FR_UNKNOWN Unknown error code DCGM_FR_UNKNOWN HealthCheckErrorCode = 1 // DCGM_FR_UNRECOGNIZED Unrecognized error code DCGM_FR_UNRECOGNIZED HealthCheckErrorCode = 2 // DCGM_FR_PCI_REPLAY_RATE Unacceptable rate of PCI errors DCGM_FR_PCI_REPLAY_RATE HealthCheckErrorCode = 3 // DCGM_FR_VOLATILE_DBE_DETECTED Unacceptable rate of volatile double bit errors DCGM_FR_VOLATILE_DBE_DETECTED HealthCheckErrorCode = 4 // DCGM_FR_VOLATILE_SBE_DETECTED Unacceptable rate of volatile single bit errors DCGM_FR_VOLATILE_SBE_DETECTED HealthCheckErrorCode = 5 // DCGM_FR_VOLATILE_SBE_DETECTED_TS Unacceptable rate of volatile single bit errors with a timestamp DCGM_FR_VOLATILE_SBE_DETECTED_TS HealthCheckErrorCode = 6 // DCGM_FR_PENDING_PAGE_RETIREMENTS Pending page retirements detected DCGM_FR_PENDING_PAGE_RETIREMENTS HealthCheckErrorCode = 6 // DCGM_FR_RETIRED_PAGES_LIMIT Unacceptable total page retirements detected DCGM_FR_RETIRED_PAGES_LIMIT HealthCheckErrorCode = 7 // DCGM_FR_RETIRED_PAGES_DBE_LIMIT Unacceptable total page retirements due to uncorrectable errors DCGM_FR_RETIRED_PAGES_DBE_LIMIT HealthCheckErrorCode = 8 // DCGM_FR_CORRUPT_INFOROM Corrupt inforom found DCGM_FR_CORRUPT_INFOROM HealthCheckErrorCode = 9 // DCGM_FR_CLOCK_THROTTLE_THERMAL Clocks being throttled due to overheating DCGM_FR_CLOCK_THROTTLE_THERMAL HealthCheckErrorCode = 10 // DCGM_FR_POWER_UNREADABLE Cannot get a reading for power from NVML DCGM_FR_POWER_UNREADABLE HealthCheckErrorCode = 11 // DCGM_FR_CLOCK_THROTTLE_POWER Clock being throttled due to power restrictions DCGM_FR_CLOCK_THROTTLE_POWER HealthCheckErrorCode = 12 // DCGM_FR_NVLINK_ERROR_THRESHOLD Unacceptable rate of NVLink errors DCGM_FR_NVLINK_ERROR_THRESHOLD HealthCheckErrorCode = 13 // DCGM_FR_NVLINK_DOWN NVLink is down DCGM_FR_NVLINK_DOWN HealthCheckErrorCode = 14 // DCGM_FR_NVSWITCH_FATAL_ERROR Fatal errors on the NVSwitch DCGM_FR_NVSWITCH_FATAL_ERROR HealthCheckErrorCode 
= 15 // DCGM_FR_NVSWITCH_NON_FATAL_ERROR Non-fatal errors on the NVSwitch DCGM_FR_NVSWITCH_NON_FATAL_ERROR HealthCheckErrorCode = 16 // DCGM_FR_NVSWITCH_DOWN NVSwitch is down DCGM_FR_NVSWITCH_DOWN HealthCheckErrorCode = 17 // DCGM_FR_NO_ACCESS_TO_FILE Cannot access a file DCGM_FR_NO_ACCESS_TO_FILE HealthCheckErrorCode = 18 // DCGM_FR_NVML_API Error occurred on an NVML API - NOT USED: DEPRECATED DCGM_FR_NVML_API HealthCheckErrorCode = 19 // DCGM_FR_DEVICE_COUNT_MISMATCH Device count mismatch DCGM_FR_DEVICE_COUNT_MISMATCH HealthCheckErrorCode = 20 // DCGM_FR_BAD_PARAMETER Bad parameter passed to API DCGM_FR_BAD_PARAMETER HealthCheckErrorCode = 21 // DCGM_FR_CANNOT_OPEN_LIB Cannot open a library that must be accessed DCGM_FR_CANNOT_OPEN_LIB HealthCheckErrorCode = 22 // DCGM_FR_DENYLISTED_DRIVER A driver on the denylist (nouveau) is active DCGM_FR_DENYLISTED_DRIVER HealthCheckErrorCode = 23 // DCGM_FR_NVML_LIB_BAD NVML library is missing expected functions - NOT USED: DEPRECATED DCGM_FR_NVML_LIB_BAD HealthCheckErrorCode = 24 // DCGM_FR_GRAPHICS_PROCESSES HealthCheckErrorCode = 25 DCGM_FR_GRAPHICS_PROCESSES HealthCheckErrorCode = 25 // DCGM_FR_HOSTENGINE_CONN Bad connection to nv-hostengine - NOT USED: DEPRECATED DCGM_FR_HOSTENGINE_CONN HealthCheckErrorCode = 26 // DCGM_FR_FIELD_QUERY Field query failed DCGM_FR_FIELD_QUERY HealthCheckErrorCode = 27 // DCGM_FR_BAD_CUDA_ENV The environment has variables that hurt CUDA DCGM_FR_BAD_CUDA_ENV HealthCheckErrorCode = 28 // DCGM_FR_PERSISTENCE_MODE Persistence mode is disabled DCGM_FR_PERSISTENCE_MODE HealthCheckErrorCode = 29 // DCGM_FR_BAD_NVLINK_ENV The environment has variables that hurt NVLink DCGM_FR_BAD_NVLINK_ENV HealthCheckErrorCode = 29 // DCGM_FR_LOW_BANDWIDTH The bandwidth is unacceptably low DCGM_FR_LOW_BANDWIDTH HealthCheckErrorCode = 30 // DCGM_FR_HIGH_LATENCY Latency is too high DCGM_FR_HIGH_LATENCY HealthCheckErrorCode = 31 // DCGM_FR_CANNOT_GET_FIELD_TAG Cannot find a tag for a field 
DCGM_FR_CANNOT_GET_FIELD_TAG HealthCheckErrorCode = 32 // DCGM_FR_FIELD_VIOLATION The value for the specified error field is above 0 DCGM_FR_FIELD_VIOLATION HealthCheckErrorCode = 33 // DCGM_FR_FIELD_THRESHOLD The value for the specified field is above the threshold DCGM_FR_FIELD_THRESHOLD HealthCheckErrorCode = 34 // DCGM_FR_FIELD_VIOLATION_DBL The value for the specified error field is above 0 DCGM_FR_FIELD_VIOLATION_DBL HealthCheckErrorCode = 35 // DCGM_FR_FIELD_THRESHOLD_DBL The value for the specified field is above the threshold DCGM_FR_FIELD_THRESHOLD_DBL HealthCheckErrorCode = 36 // DCGM_FR_UNSUPPORTED_FIELD_TYPE Field type cannot be supported DCGM_FR_UNSUPPORTED_FIELD_TYPE HealthCheckErrorCode = 37 // DCGM_FR_FIELD_THRESHOLD_TS The value for the specified field is above the threshold DCGM_FR_FIELD_THRESHOLD_TS HealthCheckErrorCode = 38 // DCGM_FR_FIELD_THRESHOLD_TS_DBL The value for the specified field is above the threshold DCGM_FR_FIELD_THRESHOLD_TS_DBL HealthCheckErrorCode = 39 // DCGM_FR_THERMAL_VIOLATIONS Thermal violations detected DCGM_FR_THERMAL_VIOLATIONS HealthCheckErrorCode = 40 // DCGM_FR_THERMAL_VIOLATIONS_TS Thermal violations detected with a timestamp DCGM_FR_THERMAL_VIOLATIONS_TS HealthCheckErrorCode = 41 // DCGM_FR_TEMP_VIOLATION Non-benign clock throttling is occurring DCGM_FR_TEMP_VIOLATION HealthCheckErrorCode = 42 // DCGM_FR_THROTTLING_VIOLATION Non-benign clock throttling is occurring DCGM_FR_THROTTLING_VIOLATION HealthCheckErrorCode = 43 // DCGM_FR_INTERNAL An internal error was detected DCGM_FR_INTERNAL HealthCheckErrorCode = 44 // DCGM_FR_PCIE_GENERATION PCIe generation is too low DCGM_FR_PCIE_GENERATION HealthCheckErrorCode = 45 // DCGM_FR_PCIE_WIDTH PCIe width is too low DCGM_FR_PCIE_WIDTH HealthCheckErrorCode = 46 // DCGM_FR_ABORTED Test was aborted by a user signal DCGM_FR_ABORTED HealthCheckErrorCode = 47 // DCGM_FR_TEST_DISABLED Test was disabled by a user signal DCGM_FR_TEST_DISABLED HealthCheckErrorCode = 48 // 
DCGM_FR_CANNOT_GET_STAT Cannot get telemetry for a needed value DCGM_FR_CANNOT_GET_STAT HealthCheckErrorCode = 49 // DCGM_FR_STRESS_LEVEL Stress level is too low (bad performance) DCGM_FR_STRESS_LEVEL HealthCheckErrorCode = 50 // DCGM_FR_CUDA_API HealthCheckErrorCode = 51 DCGM_FR_CUDA_API HealthCheckErrorCode = 51 // DCGM_FR_FAULTY_MEMORY Faulty memory detected on this GPU DCGM_FR_FAULTY_MEMORY HealthCheckErrorCode = 52 // DCGM_FR_CANNOT_SET_WATCHES Unable to set field watches in DCGM - NOT USED: DEPRECATED DCGM_FR_CANNOT_SET_WATCHES HealthCheckErrorCode = 53 // DCGM_FR_CUDA_UNBOUND CUDA context is no longer bound DCGM_FR_CUDA_UNBOUND HealthCheckErrorCode = 54 // DCGM_FR_ECC_DISABLED ECC memory is disabled right now DCGM_FR_ECC_DISABLED HealthCheckErrorCode = 55 // DCGM_FR_MEMORY_ALLOC Cannot allocate memory on the GPU DCGM_FR_MEMORY_ALLOC HealthCheckErrorCode = 56 // DCGM_FR_CUDA_DBE CUDA detected unrecoverable double-bit error DCGM_FR_CUDA_DBE HealthCheckErrorCode = 57 // DCGM_FR_MEMORY_MISMATCH Memory error detected DCGM_FR_MEMORY_MISMATCH HealthCheckErrorCode = 58 // DCGM_FR_CUDA_DEVICE No CUDA device discoverable for existing GPU DCGM_FR_CUDA_DEVICE HealthCheckErrorCode = 59 // DCGM_FR_ECC_UNSUPPORTED ECC memory is unsupported by this SKU DCGM_FR_ECC_UNSUPPORTED HealthCheckErrorCode = 60 // DCGM_FR_ECC_PENDING ECC memory is in a pending state - NOT USED: DEPRECATED DCGM_FR_ECC_PENDING HealthCheckErrorCode = 61 // DCGM_FR_MEMORY_BANDWIDTH Memory bandwidth is too low DCGM_FR_MEMORY_BANDWIDTH HealthCheckErrorCode = 62 // DCGM_FR_TARGET_POWER The target power is too low DCGM_FR_TARGET_POWER HealthCheckErrorCode = 63 // DCGM_FR_API_FAIL The specified API call failed DCGM_FR_API_FAIL HealthCheckErrorCode = 64 // DCGM_FR_API_FAIL_GPU The specified API call failed for the specified GPU DCGM_FR_API_FAIL_GPU HealthCheckErrorCode = 65 // DCGM_FR_CUDA_CONTEXT Cannot create a CUDA context on this GPU DCGM_FR_CUDA_CONTEXT HealthCheckErrorCode = 66 // DCGM_FR_DCGM_API DCGM API 
failure DCGM_FR_DCGM_API HealthCheckErrorCode = 67 // DCGM_FR_CONCURRENT_GPUS Need multiple GPUs to run this test DCGM_FR_CONCURRENT_GPUS HealthCheckErrorCode = 68 // DCGM_FR_TOO_MANY_ERRORS More errors than fit in the return struct - NOT USED: DEPRECATED DCGM_FR_TOO_MANY_ERRORS HealthCheckErrorCode = 69 // DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD NVLink CRC error threshold violation DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD HealthCheckErrorCode = 70 // DCGM_FR_NVLINK_ERROR_CRITICAL NVLink error for a field that should always be 0 DCGM_FR_NVLINK_ERROR_CRITICAL HealthCheckErrorCode = 71 // DCGM_FR_ENFORCED_POWER_LIMIT The enforced power limit is too low to hit the target DCGM_FR_ENFORCED_POWER_LIMIT HealthCheckErrorCode = 72 // DCGM_FR_MEMORY_ALLOC_HOST Cannot allocate memory on the host DCGM_FR_MEMORY_ALLOC_HOST HealthCheckErrorCode = 73 // DCGM_FR_GPU_OP_MODE Bad GPU operating mode for running plugin - NOT USED: DEPRECATED DCGM_FR_GPU_OP_MODE HealthCheckErrorCode = 74 // DCGM_FR_NO_MEMORY_CLOCKS No memory clocks with the needed MHz found - NOT USED: DEPRECATED DCGM_FR_NO_MEMORY_CLOCKS HealthCheckErrorCode = 75 // DCGM_FR_NO_GRAPHICS_CLOCKS No graphics clocks with the needed MHz found - NOT USED: DEPRECATED DCGM_FR_NO_GRAPHICS_CLOCKS HealthCheckErrorCode = 76 // DCGM_FR_HAD_TO_RESTORE_STATE Note that we had to restore a GPU's state DCGM_FR_HAD_TO_RESTORE_STATE HealthCheckErrorCode = 77 // DCGM_FR_L1TAG_UNSUPPORTED L1TAG test is unsupported by this SKU DCGM_FR_L1TAG_UNSUPPORTED HealthCheckErrorCode = 78 // DCGM_FR_L1TAG_MISCOMPARE L1TAG test failed on a miscompare DCGM_FR_L1TAG_MISCOMPARE HealthCheckErrorCode = 79 // DCGM_FR_ROW_REMAP_FAILURE Row remapping failed (Ampere or newer GPUs) DCGM_FR_ROW_REMAP_FAILURE HealthCheckErrorCode = 80 // DCGM_FR_UNCONTAINED_ERROR Uncontained error - XID 95 DCGM_FR_UNCONTAINED_ERROR HealthCheckErrorCode = 81 // DCGM_FR_EMPTY_GPU_LIST No GPU information given to plugin DCGM_FR_EMPTY_GPU_LIST HealthCheckErrorCode = 82 // 
DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS Pending page retirements due to a DBE DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS HealthCheckErrorCode = 83 // DCGM_FR_UNCORRECTABLE_ROW_REMAP Uncorrectable row remapping DCGM_FR_UNCORRECTABLE_ROW_REMAP HealthCheckErrorCode = 84 // DCGM_FR_PENDING_ROW_REMAP Row remapping is pending DCGM_FR_PENDING_ROW_REMAP HealthCheckErrorCode = 85 // DCGM_FR_BROKEN_P2P_MEMORY_DEVICE P2P copy test detected an error writing to this GPU DCGM_FR_BROKEN_P2P_MEMORY_DEVICE HealthCheckErrorCode = 86 // DCGM_FR_BROKEN_P2P_WRITER_DEVICE P2P copy test detected an error writing from this GPU DCGM_FR_BROKEN_P2P_WRITER_DEVICE HealthCheckErrorCode = 87 // DCGM_FR_NVSWITCH_NVLINK_DOWN An NvLink is down for the specified NVSwitch DCGM_FR_NVSWITCH_NVLINK_DOWN HealthCheckErrorCode = 88 // DCGM_FR_EUD_BINARY_PERMISSIONS EUD binary permissions are incorrect DCGM_FR_EUD_BINARY_PERMISSIONS HealthCheckErrorCode = 89 // DCGM_FR_EUD_NON_ROOT_USER EUD plugin is not running as root DCGM_FR_EUD_NON_ROOT_USER HealthCheckErrorCode = 90 // DCGM_FR_EUD_SPAWN_FAILURE EUD plugin failed to spawn the EUD binary DCGM_FR_EUD_SPAWN_FAILURE HealthCheckErrorCode = 91 // DCGM_FR_EUD_TIMEOUT EUD plugin timed out DCGM_FR_EUD_TIMEOUT HealthCheckErrorCode = 92 // DCGM_FR_EUD_ZOMBIE EUD process remains running after the plugin considers it finished DCGM_FR_EUD_ZOMBIE HealthCheckErrorCode = 93 // DCGM_FR_EUD_NON_ZERO_EXIT_CODE EUD process exited with a non-zero exit code DCGM_FR_EUD_NON_ZERO_EXIT_CODE HealthCheckErrorCode = 94 // DCGM_FR_EUD_TEST_FAILED EUD test failed DCGM_FR_EUD_TEST_FAILED HealthCheckErrorCode = 95 // DCGM_FR_FILE_CREATE_PERMISSIONS We cannot create a file in this directory. 
DCGM_FR_FILE_CREATE_PERMISSIONS HealthCheckErrorCode = 96 // DCGM_FR_PAUSE_RESUME_FAILED Pause/Resume failed DCGM_FR_PAUSE_RESUME_FAILED HealthCheckErrorCode = 97 // DCGM_FR_PCIE_H_REPLAY_VIOLATION PCIe H replay violation DCGM_FR_PCIE_H_REPLAY_VIOLATION HealthCheckErrorCode = 98 // DCGM_FR_GPU_EXPECTED_NVLINKS_UP Expected nvlinks up per gpu DCGM_FR_GPU_EXPECTED_NVLINKS_UP HealthCheckErrorCode = 99 // DCGM_FR_NVSWITCH_EXPECTED_NVLINKS_UP Expected nvlinks up per nvswitch DCGM_FR_NVSWITCH_EXPECTED_NVLINKS_UP HealthCheckErrorCode = 100 // DCGM_FR_XID_ERROR XID error detected DCGM_FR_XID_ERROR HealthCheckErrorCode = 101 // DCGM_FR_SBE_VIOLATION Single bit error detected DCGM_FR_SBE_VIOLATION HealthCheckErrorCode = 102 // DCGM_FR_DBE_VIOLATION Double bit error detected DCGM_FR_DBE_VIOLATION HealthCheckErrorCode = 103 // DCGM_FR_PCIE_REPLAY_VIOLATION PCIe replay errors detected DCGM_FR_PCIE_REPLAY_VIOLATION HealthCheckErrorCode = 104 // DCGM_FR_SBE_THRESHOLD_VIOLATION SBE threshold violated DCGM_FR_SBE_THRESHOLD_VIOLATION HealthCheckErrorCode = 105 // DCGM_FR_DBE_THRESHOLD_VIOLATION DBE threshold violated DCGM_FR_DBE_THRESHOLD_VIOLATION HealthCheckErrorCode = 106 // DCGM_FR_PCIE_REPLAY_THRESHOLD_VIOLATION PCIe replay count violated DCGM_FR_PCIE_REPLAY_THRESHOLD_VIOLATION HealthCheckErrorCode = 107 // DCGM_FR_CUDA_FM_NOT_INITIALIZED The fabricmanager is not initialized DCGM_FR_CUDA_FM_NOT_INITIALIZED HealthCheckErrorCode = 108 // DCGM_FR_SXID_ERROR NvSwitch fatal error detected DCGM_FR_SXID_ERROR HealthCheckErrorCode = 109 // DCGM_FR_GFLOPS_THRESHOLD_VIOLATION GPU GFLOPs threshold violated DCGM_FR_GFLOPS_THRESHOLD_VIOLATION HealthCheckErrorCode = 110 // DCGM_FR_NAN_VALUE NaN value detected on this GPU DCGM_FR_NAN_VALUE HealthCheckErrorCode = 111 // DCGM_FR_FABRIC_MANAGER_TRAINING_ERROR Fabric Manager did not finish training DCGM_FR_FABRIC_MANAGER_TRAINING_ERROR HealthCheckErrorCode = 112 // DCGM_FR_BROKEN_P2P_PCIE_MEMORY_DEVICE P2P copy test detected an error writing to 
this GPU over PCIE DCGM_FR_BROKEN_P2P_PCIE_MEMORY_DEVICE HealthCheckErrorCode = 113 // DCGM_FR_BROKEN_P2P_PCIE_WRITER_DEVICE P2P copy test detected an error writing from this GPU over PCIE DCGM_FR_BROKEN_P2P_PCIE_WRITER_DEVICE HealthCheckErrorCode = 114 // DCGM_FR_BROKEN_P2P_NVLINK_MEMORY_DEVICE P2P copy test detected an error writing to this GPU over NVLink DCGM_FR_BROKEN_P2P_NVLINK_MEMORY_DEVICE HealthCheckErrorCode = 115 // DCGM_FR_BROKEN_P2P_NVLINK_WRITER_DEVICE P2P copy test detected an error writing from this GPU over NVLink DCGM_FR_BROKEN_P2P_NVLINK_WRITER_DEVICE HealthCheckErrorCode = 116 // DCGM_FR_TEST_SKIPPED Indicates that the test was skipped DCGM_FR_TEST_SKIPPED HealthCheckErrorCode = 117 // DCGM_FR_SRAM_THRESHOLD SRAM Threshold Count exceeded DCGM_FR_SRAM_THRESHOLD HealthCheckErrorCode = 118 // DCGM_FR_NVLINK_EFFECTIVE_BER_THRESHOLD Effective BER threshold exceeded DCGM_FR_NVLINK_EFFECTIVE_BER_THRESHOLD HealthCheckErrorCode = 119 // DCGM_FR_FALLEN_OFF_BUS GPU has fallen off the bus DCGM_FR_FALLEN_OFF_BUS HealthCheckErrorCode = 120 // DCGM_FR_NVLINK_SYMBOL_BER_THRESHOLD Symbol BER threshold exceeded DCGM_FR_NVLINK_SYMBOL_BER_THRESHOLD HealthCheckErrorCode = 121 // DCGM_FR_IMEX_UNHEALTHY IMEX domain or daemon status is unhealthy DCGM_FR_IMEX_UNHEALTHY HealthCheckErrorCode = 122 // DCGM_FR_FABRIC_PROBE_STATE Fabric probe state error DCGM_FR_FABRIC_PROBE_STATE HealthCheckErrorCode = 123 // DCGM_FR_BINARY_PERMISSIONS Binary permissions are incorrect DCGM_FR_BINARY_PERMISSIONS HealthCheckErrorCode = 124 // DCGM_FR_GPU_RECOVERY_RESET GPU requires reset to recover from a fault DCGM_FR_GPU_RECOVERY_RESET HealthCheckErrorCode = 125 // DCGM_FR_GPU_RECOVERY_REBOOT Node requires reboot due to GPU fault DCGM_FR_GPU_RECOVERY_REBOOT HealthCheckErrorCode = 126 // DCGM_FR_GPU_RECOVERY_DRAIN_P2P Peer-to-peer traffic must be drained DCGM_FR_GPU_RECOVERY_DRAIN_P2P HealthCheckErrorCode = 127 // DCGM_FR_GPU_RECOVERY_DRAIN_RESET GPU operating at reduced capacity, drain and 
reset required DCGM_FR_GPU_RECOVERY_DRAIN_RESET HealthCheckErrorCode = 128 // DCGM_FR_NCCL_ERROR Detected a NCCL error DCGM_FR_NCCL_ERROR HealthCheckErrorCode = 129 // DCGM_FR_RETEST_REQUESTED Retest requested before providing results DCGM_FR_RETEST_REQUESTED HealthCheckErrorCode = 130 // DCGM_FR_ERROR_SENTINEL MUST BE THE LAST ERROR CODE DCGM_FR_ERROR_SENTINEL HealthCheckErrorCode = 131 )
type HealthResponse ¶
type HealthResponse struct {
// OverallHealth indicates the aggregate health status across all watches
OverallHealth HealthResult
// Incidents contains details about any health issues detected
Incidents []Incident
}
HealthResponse contains the results of a health check operation
func HealthCheck ¶
func HealthCheck(groupID GroupHandle) (HealthResponse, error)
HealthCheck checks the configured watches for any errors/failures/warnings that have occurred since the last time this check was invoked. On the first call, stateful information about all of the enabled watches within a group is created but no error results are provided. On subsequent calls, any error information will be returned.
type HealthResult ¶
type HealthResult uint
HealthResult is the result of a health check.
const ( // DCGM_HEALTH_RESULT_PASS All results within this system are reporting normal DCGM_HEALTH_RESULT_PASS HealthResult = 0 // DCGM_HEALTH_RESULT_WARN A warning has been issued, refer to the response for more information DCGM_HEALTH_RESULT_WARN HealthResult = 10 // DCGM_HEALTH_RESULT_FAIL A failure has been issued, refer to the response for more information DCGM_HEALTH_RESULT_FAIL HealthResult = 20 )
type HealthSystem ¶
type HealthSystem uint
HealthSystem is the system to watch for health checks.
const ( // DCGM_HEALTH_WATCH_PCIE PCIe health check DCGM_HEALTH_WATCH_PCIE HealthSystem = 0x1 // DCGM_HEALTH_WATCH_NVLINK NVLink health check DCGM_HEALTH_WATCH_NVLINK HealthSystem = 0x2 // DCGM_HEALTH_WATCH_PMU PMU health check DCGM_HEALTH_WATCH_PMU HealthSystem = 0x4 // DCGM_HEALTH_WATCH_MCU MCU health check DCGM_HEALTH_WATCH_MCU HealthSystem = 0x8 // DCGM_HEALTH_WATCH_MEM Memory health check DCGM_HEALTH_WATCH_MEM HealthSystem = 0x10 // DCGM_HEALTH_WATCH_SM SM health check DCGM_HEALTH_WATCH_SM HealthSystem = 0x20 // DCGM_HEALTH_WATCH_INFOROM Inforom health check DCGM_HEALTH_WATCH_INFOROM HealthSystem = 0x40 // DCGM_HEALTH_WATCH_THERMAL Thermal health check DCGM_HEALTH_WATCH_THERMAL HealthSystem = 0x80 // DCGM_HEALTH_WATCH_POWER Power health check DCGM_HEALTH_WATCH_POWER HealthSystem = 0x100 // DCGM_HEALTH_WATCH_DRIVER Driver health check DCGM_HEALTH_WATCH_DRIVER HealthSystem = 0x200 // DCGM_HEALTH_WATCH_NVSWITCH_NONFATAL NVSwitch non-fatal health check DCGM_HEALTH_WATCH_NVSWITCH_NONFATAL HealthSystem = 0x400 // DCGM_HEALTH_WATCH_NVSWITCH_FATAL NVSwitch fatal health check DCGM_HEALTH_WATCH_NVSWITCH_FATAL HealthSystem = 0x800 // DCGM_HEALTH_WATCH_CONNECTX ConnectX device health DCGM_HEALTH_WATCH_CONNECTX HealthSystem = 0x1000 // DCGM_HEALTH_WATCH_ALL All health checks DCGM_HEALTH_WATCH_ALL HealthSystem = 0xFFFFFFFF )
func HealthGet ¶
func HealthGet(groupID GroupHandle) (HealthSystem, error)
HealthGet retrieves the current state of the DCGM health check system. It returns which health watch systems are currently enabled for the specified group.
type Incident ¶
type Incident struct {
// System identifies which health watch system detected the incident
System HealthSystem
// Health indicates the severity of the incident
Health HealthResult
// Error contains detailed information about the incident
Error DiagErrorDetail
// EntityInfo identifies the GPU or component where the incident occurred
EntityInfo GroupEntityPair
}
Incident represents a health check incident that occurred
type Link_State ¶
type Link_State uint
Link_State represents the state of an NVLINK connection
const ( // LS_NOT_SUPPORTED indicates the link is unsupported (Default for GPUs) LS_NOT_SUPPORTED Link_State = iota // LS_DISABLED indicates the link is supported but disabled (Default for NvSwitches) LS_DISABLED // LS_DOWN indicates the link is down (inactive) LS_DOWN // LS_UP indicates the link is up (active) LS_UP )
type MemoryInfo ¶
type MemoryInfo struct {
GlobalUsed int64
ECCErrors ECCErrorsInfo
}
MemoryInfo contains GPU memory usage and error information
type MetricGroup ¶
MetricGroup represents a group of metrics for a specific GPU
func GetSupportedMetricGroups ¶
func GetSupportedMetricGroups(gpuID uint) ([]MetricGroup, error)
GetSupportedMetricGroups returns all supported metric groups for the specified GPU
type MigEntityInfo ¶
type MigEntityInfo struct {
// GpuUuid is the UUID of the parent GPU
GpuUuid string
// NvmlGpuIndex is the NVML index of the parent GPU
NvmlGpuIndex uint
// NvmlInstanceId is the NVML GPU instance ID
NvmlInstanceId uint
// NvmlComputeInstanceId is the NVML compute instance ID
NvmlComputeInstanceId uint
// NvmlMigProfileId is the NVML MIG profile ID
NvmlMigProfileId uint
// NvmlProfileSlices is the number of slices in the MIG profile
NvmlProfileSlices uint
}
MigEntityInfo contains information about a MIG entity
type MigHierarchyInfo ¶
type MigHierarchyInfo struct {
// Entity represents the current GPU entity in the hierarchy
Entity GroupEntityPair
// Parent represents the parent GPU entity in the hierarchy
Parent GroupEntityPair
// SliceProfile defines the MIG profile configuration for this entity
SliceProfile MigProfile
}
MigHierarchyInfo represents the Multi-Instance GPU (MIG) hierarchy information for a GPU entity and its relationship to other entities
type MigHierarchyInfo_v2 ¶
type MigHierarchyInfo_v2 struct {
// Entity contains the entity information
Entity GroupEntityPair
// Parent contains the parent entity information
Parent GroupEntityPair
// Info contains detailed MIG entity information
Info MigEntityInfo
}
MigHierarchyInfo_v2 represents version 2 of MIG hierarchy information
type MigHierarchy_v2 ¶
type MigHierarchy_v2 struct {
// Version is the version number of the hierarchy structure
Version uint
// Count is the number of valid entries in EntityList
Count uint
// EntityList contains the MIG hierarchy information for each entity
EntityList [C.DCGM_MAX_HIERARCHY_INFO]MigHierarchyInfo_v2
}
MigHierarchy_v2 represents version 2 of the complete MIG hierarchy
func GetGPUInstanceHierarchy ¶
func GetGPUInstanceHierarchy() (hierarchy MigHierarchy_v2, err error)
GetGPUInstanceHierarchy retrieves the complete MIG hierarchy information
type MigProfile ¶
type MigProfile int
MigProfile represents the Multi-Instance GPU (MIG) profile type
const ( // MigProfileNone indicates no MIG profile is set (for GPUs) MigProfileNone MigProfile = 0 /*!< No profile (for GPUs) */ // MigProfileGPUInstanceSlice1 represents GPU instance slice 1 MigProfileGPUInstanceSlice1 MigProfile = 1 /*!< GPU instance slice 1 */ // MigProfileGPUInstanceSlice2 represents GPU instance slice 2 MigProfileGPUInstanceSlice2 MigProfile = 2 /*!< GPU instance slice 2 */ // MigProfileGPUInstanceSlice3 represents GPU instance slice 3 MigProfileGPUInstanceSlice3 MigProfile = 3 /*!< GPU instance slice 3 */ // MigProfileGPUInstanceSlice4 represents GPU instance slice 4 MigProfileGPUInstanceSlice4 MigProfile = 4 /*!< GPU instance slice 4 */ // MigProfileGPUInstanceSlice7 represents GPU instance slice 7 MigProfileGPUInstanceSlice7 MigProfile = 5 /*!< GPU instance slice 7 */ // MigProfileGPUInstanceSlice8 represents GPU instance slice 8 MigProfileGPUInstanceSlice8 MigProfile = 6 /*!< GPU instance slice 8 */ // MigProfileGPUInstanceSlice6 represents GPU instance slice 6 MigProfileGPUInstanceSlice6 MigProfile = 7 /*!< GPU instance slice 6 */ // MigProfileGPUInstanceSlice1Rev1 represents GPU instance slice 1 revision 1 MigProfileGPUInstanceSlice1Rev1 MigProfile = 8 /*!< GPU instance slice 1 revision 1 */ // MigProfileGPUInstanceSlice2Rev1 represents GPU instance slice 2 revision 1 MigProfileGPUInstanceSlice2Rev1 MigProfile = 9 /*!< GPU instance slice 2 revision 1 */ // MigProfileGPUInstanceSlice1Rev2 represents GPU instance slice 1 revision 2 MigProfileGPUInstanceSlice1Rev2 MigProfile = 10 /*!< GPU instance slice 1 revision 2 */ // MigProfileComputeInstanceSlice1 represents compute instance slice 1 MigProfileComputeInstanceSlice1 MigProfile = 30 /*!< compute instance slice 1 */ // MigProfileComputeInstanceSlice2 represents compute instance slice 2 MigProfileComputeInstanceSlice2 MigProfile = 31 /*!< compute instance slice 2 */ // MigProfileComputeInstanceSlice3 represents compute instance slice 3 MigProfileComputeInstanceSlice3 MigProfile = 32 /*!< 
compute instance slice 3 */ // MigProfileComputeInstanceSlice4 represents compute instance slice 4 MigProfileComputeInstanceSlice4 MigProfile = 33 /*!< compute instance slice 4*/ // MigProfileComputeInstanceSlice7 represents compute instance slice 7 MigProfileComputeInstanceSlice7 MigProfile = 34 /*!< compute instance slice 7 */ // MigProfileComputeInstanceSlice8 represents compute instance slice 8 MigProfileComputeInstanceSlice8 MigProfile = 35 /*!< compute instance slice 8 */ // MigProfileComputeInstanceSlice6 represents compute instance slice 6 MigProfileComputeInstanceSlice6 MigProfile = 36 /*!< compute instance slice 6 */ // MigProfileComputeInstanceSlice1Rev1 represents compute instance slice 1 revision 1 MigProfileComputeInstanceSlice1Rev1 MigProfile = 37 /*!< compute instance slice 1 revision 1 */ )
type NvLinkP2PStatus ¶
// NvLinkP2PStatus represents the state of NvLinks between the GPU pairs.
// It is returned by GetNvLinkP2PStatus.
type NvLinkP2PStatus struct {
// Gpus holds one Link_State per GPU pair.
// NOTE(review): presumably indexed as Gpus[gpu][peer] — confirm against
// the implementation of GetNvLinkP2PStatus.
Gpus [][]Link_State
// contains filtered or unexported fields
}
NvLinkP2PStatus represents the state of NvLinks between the GPU pairs
func GetNvLinkP2PStatus ¶
func GetNvLinkP2PStatus() (NvLinkP2PStatus, error)
GetNvLinkP2PStatus returns the status of NvLinks between GPU pairs
type NvLinkStatus ¶
// NvLinkStatus contains information about an NVLINK connection status.
// GetNvLinkLinkStatus returns a slice of these values.
type NvLinkStatus struct {
// ParentId is the ID of the parent entity (GPU or NVSwitch)
ParentId uint
// ParentType is the type of the parent entity
ParentType Field_Entity_Group
// State is the current state of the NVLINK
State Link_State
// Index is the link index number
Index uint
}
NvLinkStatus contains information about an NVLINK connection status
func GetNvLinkLinkStatus ¶
func GetNvLinkLinkStatus() ([]NvLinkStatus, error)
GetNvLinkLinkStatus returns the status of all NVLink connections
type NvlinkPolicyCondition ¶
// NvlinkPolicyCondition contains details about an NVLink error.
// NOTE(review): presumably carried in PolicyViolation.Data for NVLink
// violations — confirm against the policy listener.
type NvlinkPolicyCondition struct {
// FieldId identifies the specific NVLink field that had an error
FieldId uint16
// Counter indicates the number of errors detected
Counter uint
}
NvlinkPolicyCondition contains details about an NVLink error
type P2PLink ¶
// P2PLink contains information about a peer-to-peer connection.
// NOTE(review): presumably describes the peer GPU reported by
// GetDeviceTopology — confirm, since that signature is not shown here.
type P2PLink struct {
// GPU is the ID of the GPU
GPU uint
// BusID is the PCIe bus ID of the GPU
BusID string
// Link is the type of P2P connection (one of the P2PLinkType constants)
Link P2PLinkType
}
P2PLink contains information about a peer-to-peer connection
func GetDeviceTopology ¶
GetDeviceTopology returns the topology (connectivity) information for the specified GPU
type P2PLinkType ¶
// P2PLinkType represents the type of peer-to-peer connection between GPUs.
// It is a uint-backed enumeration; PCIPaths returns its string form.
type P2PLinkType uint
P2PLinkType represents the type of peer-to-peer connection between GPUs
const ( // P2PLinkUnknown represents an unknown link type P2PLinkUnknown P2PLinkType = iota // P2PLinkCrossCPU represents a connection across different CPUs P2PLinkCrossCPU // P2PLinkSameCPU represents a connection within the same CPU P2PLinkSameCPU // P2PLinkHostBridge represents a connection through the host bridge P2PLinkHostBridge // P2PLinkMultiSwitch represents a connection through multiple PCIe switches P2PLinkMultiSwitch // P2PLinkSingleSwitch represents a connection through a single PCIe switch P2PLinkSingleSwitch // P2PLinkSameBoard represents a connection on the same board P2PLinkSameBoard // SingleNVLINKLink represents a single NVLINK connection SingleNVLINKLink // TwoNVLINKLinks represents two NVLINK connections TwoNVLINKLinks // ThreeNVLINKLinks represents three NVLINK connections ThreeNVLINKLinks // FourNVLINKLinks represents four NVLINK connections FourNVLINKLinks )
func (P2PLinkType) PCIPaths ¶
func (l P2PLinkType) PCIPaths() string
PCIPaths returns a string representation of the P2P link type
type PCIStatusInfo ¶
// PCIStatusInfo contains PCI bus status information.
// It is the type of the PCI field of ProcessInfo.
type PCIStatusInfo struct {
// BAR1Used is the amount of BAR1 memory in use, in MB.
BAR1Used int64 // MB
// Throughput contains PCI bus transfer metrics.
Throughput PCIThroughputInfo
// NOTE(review): presumably framebuffer memory in use; the unit is not
// stated here — confirm against the code that fills this field.
FBUsed int64
}
PCIStatusInfo contains PCI bus status information
type PCIThroughputInfo ¶
PCIThroughputInfo contains PCI bus transfer metrics
type PciPolicyCondition ¶
// PciPolicyCondition contains details about a PCI error.
// NOTE(review): presumably carried in PolicyViolation.Data for PCI
// violations — confirm against the policy listener.
type PciPolicyCondition struct {
// ReplayCounter indicates the number of PCI replays
ReplayCounter uint
}
PciPolicyCondition contains details about a PCI error
type PolicyAction ¶
// PolicyAction specifies the action to take when a policy violation occurs.
// It is a uint32-backed enumeration; the valid values are the PolicyAction*
// constants.
type PolicyAction uint32
PolicyAction specifies the action to take when a policy violation occurs
const ( // PolicyActionNone indicates no action should be taken on violation (default) PolicyActionNone PolicyAction = 0 // PolicyActionGPUReset indicates the GPU should be reset on violation PolicyActionGPUReset PolicyAction = 1 )
type PolicyCondition ¶
// PolicyCondition represents a type of policy violation that can be monitored.
// It is string-backed and is used as the key of PolicyStatus.Conditions.
type PolicyCondition string
PolicyCondition represents a type of policy violation that can be monitored
type PolicyConfig ¶
// PolicyConfig configures a policy condition with optional custom thresholds
// and actions. SetPolicyForGroup accepts one or more of these. Every pointer
// field is optional; the comment on each field names the default it falls
// back to.
type PolicyConfig struct {
// Condition specifies the type of policy to monitor
Condition PolicyCondition
// Action specifies what action to take when this policy violation occurs (optional, defaults to PolicyActionNone)
Action *PolicyAction
// Validation specifies what validation to perform after the action (optional, defaults to PolicyValidationNone)
Validation *PolicyValidation
// MaxRetiredPages specifies the threshold for MaxRtPgPolicy (optional, defaults to DefaultMaxRetiredPages)
MaxRetiredPages *uint32
// MaxTemperature specifies the threshold for ThermalPolicy in Celsius (optional, defaults to DefaultMaxTemperature)
MaxTemperature *uint32
// MaxPower specifies the threshold for PowerPolicy in Watts (optional, defaults to DefaultMaxPower)
MaxPower *uint32
}
PolicyConfig configures a policy condition with optional custom thresholds and actions
type PolicyStatus ¶
// PolicyStatus represents the current policy configuration for a group, as
// returned by GetPolicyForGroup.
type PolicyStatus struct {
	// Mode indicates the operation mode (automatic or manual).
	Mode uint32
	// Action specifies what action is taken on violation.
	Action PolicyAction
	// Validation specifies what validation is performed after action.
	Validation PolicyValidation
	// Conditions is a map of enabled policy conditions with their thresholds.
	// Key is the PolicyCondition, value is the threshold (if applicable).
	// The value type is spelled "any" to match the rest of the package's API;
	// it is identical to interface{}, so existing callers are unaffected.
	Conditions map[PolicyCondition]any
}
PolicyStatus represents the current policy configuration for a group
func GetPolicyForGroup ¶
func GetPolicyForGroup(group GroupHandle) (*PolicyStatus, error)
GetPolicyForGroup retrieves the current policy configuration for a GPU group
type PolicyValidation ¶
// PolicyValidation specifies the validation to perform after a policy action.
// It is a uint32-backed enumeration; the valid values are the
// PolicyValidation* constants.
type PolicyValidation uint32
PolicyValidation specifies the validation to perform after a policy action
const ( // PolicyValidationNone indicates no validation after action (default) PolicyValidationNone PolicyValidation = 0 // PolicyValidationShort indicates a short system validation should be performed PolicyValidationShort PolicyValidation = 1 // PolicyValidationMedium indicates a medium system validation should be performed PolicyValidationMedium PolicyValidation = 2 // PolicyValidationLong indicates a long system validation should be performed PolicyValidationLong PolicyValidation = 3 )
type PolicyViolation ¶
// PolicyViolation represents a detected violation of a policy condition.
// ListenForPolicyViolations and ListenForPolicyViolationsForGroup deliver
// these values on their returned channel.
type PolicyViolation struct {
// Condition specifies the type of policy that was violated
Condition PolicyCondition
// Timestamp indicates when the violation occurred
Timestamp time.Time
// Data contains violation-specific details.
// NOTE(review): presumably one of the *PolicyCondition structs (for
// example PowerPolicyCondition), selected by Condition — confirm.
Data any
}
PolicyViolation represents a detected violation of a policy condition
type PowerPolicyCondition ¶
// PowerPolicyCondition contains details about a power violation.
// NOTE(review): presumably carried in PolicyViolation.Data for power
// violations — confirm against the policy listener.
type PowerPolicyCondition struct {
// PowerViolation indicates the severity of the power violation
PowerViolation uint
}
PowerPolicyCondition contains details about a power violation
type ProcessInfo ¶
// ProcessInfo contains comprehensive information about a GPU process.
// GetProcessInfo returns a slice of these values for a given process ID.
type ProcessInfo struct {
// GPU is the ID of the GPU being used
GPU uint
// PID is the process ID
PID uint
// Name is the name of the process
Name string
// ProcessUtilization contains process-specific utilization metrics
ProcessUtilization ProcessUtilInfo
// PCI contains PCI bus statistics
PCI PCIStatusInfo
// Memory contains memory usage statistics
Memory MemoryInfo
// GpuUtilization contains GPU utilization metrics
GpuUtilization UtilizationInfo
// Clocks contains GPU clock frequencies
Clocks ClockInfo
// Violations contains throttling statistics
Violations ViolationTime
// XIDErrors contains XID error information
XIDErrors XIDErrorInfo
}
ProcessInfo contains comprehensive information about a GPU process
func GetProcessInfo ¶
func GetProcessInfo(group GroupHandle, pid uint) ([]ProcessInfo, error)
GetProcessInfo returns detailed per-GPU statistics for the specified process
type ProcessUtilInfo ¶
// ProcessUtilInfo contains utilization metrics for a GPU process.
// It is the type of the ProcessUtilization field of ProcessInfo.
// NOTE(review): the pointer-typed metrics are presumably nil when the value
// is unavailable — confirm against the code that fills this struct.
type ProcessUtilInfo struct {
// StartTime is when the process started using the GPU
StartTime Time
// EndTime is when the process stopped using the GPU (0 if still running)
EndTime Time
// EnergyConsumed is the energy consumed by the process in Joules
EnergyConsumed *uint64
// SmUtil is the GPU SM (Streaming Multiprocessor) utilization percentage
SmUtil *float64
// MemUtil is the GPU memory utilization percentage
MemUtil *float64
}
ProcessUtilInfo contains utilization metrics for a GPU process
type RetiredPagesPolicyCondition ¶
// RetiredPagesPolicyCondition contains details about retired memory pages,
// split by the kind of error that caused the retirement.
type RetiredPagesPolicyCondition struct {
// SbePages indicates the number of pages retired due to single-bit errors
SbePages uint
// DbePages indicates the number of pages retired due to double-bit errors
DbePages uint
}
RetiredPagesPolicyCondition contains details about retired memory pages
type Short ¶
Short is an alias for the C.ushort type. It is primarily used for DCGM field identifiers and field collections in the DCGM API bindings. This type provides a direct mapping to the C unsigned short type used in the underlying DCGM C API.
const ( // DCGM_FI_UNKNOWN represents NULL field DCGM_FI_UNKNOWN Short = 0 // DCGM_FI_DRIVER_VERSION represents Driver Version DCGM_FI_DRIVER_VERSION Short = 1 // DCGM_FI_NVML_VERSION DCGM_FI_NVML_VERSION Short = 2 // DCGM_FI_PROCESS_NAME represents Process Name DCGM_FI_PROCESS_NAME Short = 3 // DCGM_FI_DEV_COUNT represents Number of Devices on the node DCGM_FI_DEV_COUNT Short = 4 // DCGM_FI_CUDA_DRIVER_VERSION represents CUDA 11.1 = 11100 DCGM_FI_CUDA_DRIVER_VERSION Short = 5 // DCGM_FI_BIND_UNBIND_EVENT represents @note Recommended watch frequency: 1 second DCGM_FI_BIND_UNBIND_EVENT Short = 6 // DCGM_FI_DEV_NAME represents Name of the GPU device DCGM_FI_DEV_NAME Short = 50 // DCGM_FI_DEV_BRAND represents Device Brand DCGM_FI_DEV_BRAND Short = 51 // DCGM_FI_DEV_NVML_INDEX represents NVML index of this GPU DCGM_FI_DEV_NVML_INDEX Short = 52 // DCGM_FI_DEV_SERIAL represents Device Serial Number DCGM_FI_DEV_SERIAL Short = 53 // DCGM_FI_DEV_UUID represents UUID corresponding to the device DCGM_FI_DEV_UUID Short = 54 // DCGM_FI_DEV_MINOR_NUMBER represents Device node minor number /dev/nvidia# DCGM_FI_DEV_MINOR_NUMBER Short = 55 // DCGM_FI_DEV_OEM_INFOROM_VER represents OEM inforom version DCGM_FI_DEV_OEM_INFOROM_VER Short = 56 // DCGM_FI_DEV_PCI_BUSID represents PCI attributes for the device DCGM_FI_DEV_PCI_BUSID Short = 57 // DCGM_FI_DEV_PCI_COMBINED_ID represents The combined 16-bit device id and 16-bit vendor id DCGM_FI_DEV_PCI_COMBINED_ID Short = 58 // DCGM_FI_DEV_PCI_SUBSYS_ID represents The 32-bit Sub System Device ID DCGM_FI_DEV_PCI_SUBSYS_ID Short = 59 // DCGM_FI_GPU_TOPOLOGY_PCI represents Topology of all GPUs on the system via PCI (static) DCGM_FI_GPU_TOPOLOGY_PCI Short = 60 // DCGM_FI_GPU_TOPOLOGY_NVLINK represents Topology of all GPUs on the system via NVLINK (static) DCGM_FI_GPU_TOPOLOGY_NVLINK Short = 61 // DCGM_FI_GPU_TOPOLOGY_AFFINITY represents Affinity of all GPUs on the system (static) DCGM_FI_GPU_TOPOLOGY_AFFINITY Short = 62 // 
DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY represents the minor version is the lower 32 bits. DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY Short = 63 // DCGM_FI_DEV_P2P_NVLINK_STATUS represents A bitmap of the P2P NVLINK status from this GPU to others on this host. DCGM_FI_DEV_P2P_NVLINK_STATUS Short = 64 // DCGM_FI_DEV_COMPUTE_MODE represents Compute mode for the device DCGM_FI_DEV_COMPUTE_MODE Short = 65 // DCGM_FI_DEV_PERSISTENCE_MODE represents Boolean: 0 is disabled, 1 is enabled DCGM_FI_DEV_PERSISTENCE_MODE Short = 66 // DCGM_FI_DEV_MIG_MODE represents Boolean: 0 is disabled, 1 is enabled DCGM_FI_DEV_MIG_MODE Short = 67 // DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR represents be set to for this entity (including MIG) DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR Short = 68 // DCGM_FI_DEV_MIG_MAX_SLICES represents The maximum number of MIG slices supported by this GPU DCGM_FI_DEV_MIG_MAX_SLICES Short = 69 // DCGM_FI_DEV_CPU_AFFINITY_0 represents Device CPU affinity. part 1/8 = cpus 0 - 63 DCGM_FI_DEV_CPU_AFFINITY_0 Short = 70 // DCGM_FI_DEV_CPU_AFFINITY_1 represents Device CPU affinity. part 1/8 = cpus 64 - 127 DCGM_FI_DEV_CPU_AFFINITY_1 Short = 71 // DCGM_FI_DEV_CPU_AFFINITY_2 represents Device CPU affinity. part 2/8 = cpus 128 - 191 DCGM_FI_DEV_CPU_AFFINITY_2 Short = 72 // DCGM_FI_DEV_CPU_AFFINITY_3 represents Device CPU affinity. 
part 3/8 = cpus 192 - 255 DCGM_FI_DEV_CPU_AFFINITY_3 Short = 73 // DCGM_FI_DEV_CC_MODE represents 1 = enabled DCGM_FI_DEV_CC_MODE Short = 74 // DCGM_FI_DEV_MIG_ATTRIBUTES represents Attributes for the given MIG device handles DCGM_FI_DEV_MIG_ATTRIBUTES Short = 75 // DCGM_FI_DEV_MIG_GI_INFO represents GPU instance profile information DCGM_FI_DEV_MIG_GI_INFO Short = 76 // DCGM_FI_DEV_MIG_CI_INFO represents Compute instance profile information DCGM_FI_DEV_MIG_CI_INFO Short = 77 // DCGM_FI_DEV_ECC_INFOROM_VER represents ECC inforom version DCGM_FI_DEV_ECC_INFOROM_VER Short = 80 // DCGM_FI_DEV_POWER_INFOROM_VER represents Power management object inforom version DCGM_FI_DEV_POWER_INFOROM_VER Short = 81 // DCGM_FI_DEV_INFOROM_IMAGE_VER represents Inforom image version DCGM_FI_DEV_INFOROM_IMAGE_VER Short = 82 // DCGM_FI_DEV_INFOROM_CONFIG_CHECK represents Inforom configuration checksum DCGM_FI_DEV_INFOROM_CONFIG_CHECK Short = 83 // DCGM_FI_DEV_INFOROM_CONFIG_VALID represents Reads the infoROM from the flash and verifies the checksums DCGM_FI_DEV_INFOROM_CONFIG_VALID Short = 84 // DCGM_FI_DEV_VBIOS_VERSION represents VBIOS version of the device DCGM_FI_DEV_VBIOS_VERSION Short = 85 // DCGM_FI_DEV_MEM_AFFINITY_0 represents Device Memory node affinity, 0-63 DCGM_FI_DEV_MEM_AFFINITY_0 Short = 86 // DCGM_FI_DEV_MEM_AFFINITY_1 represents Device Memory node affinity, 64-127 DCGM_FI_DEV_MEM_AFFINITY_1 Short = 87 // DCGM_FI_DEV_MEM_AFFINITY_2 represents Device Memory node affinity, 128-191 DCGM_FI_DEV_MEM_AFFINITY_2 Short = 88 // DCGM_FI_DEV_MEM_AFFINITY_3 represents Device Memory node affinity, 192-255 DCGM_FI_DEV_MEM_AFFINITY_3 Short = 89 // DCGM_FI_DEV_BAR1_TOTAL represents Total BAR1 of the GPU in MB DCGM_FI_DEV_BAR1_TOTAL Short = 90 // DCGM_FI_SYNC_BOOST represents Deprecated - Sync boost settings on the node DCGM_FI_SYNC_BOOST Short = 91 // DCGM_FI_DEV_BAR1_USED represents Used BAR1 of the GPU in MB DCGM_FI_DEV_BAR1_USED Short = 92 // DCGM_FI_DEV_BAR1_FREE represents Free BAR1 
of the GPU in MB DCGM_FI_DEV_BAR1_FREE Short = 93 // DCGM_FI_DEV_GPM_SUPPORT represents * GPM support for the device DCGM_FI_DEV_GPM_SUPPORT Short = 94 // DCGM_FI_DEV_SM_CLOCK represents SM clock for the device DCGM_FI_DEV_SM_CLOCK Short = 100 // DCGM_FI_DEV_MEM_CLOCK represents Memory clock for the device DCGM_FI_DEV_MEM_CLOCK Short = 101 // DCGM_FI_DEV_VIDEO_CLOCK represents Video encoder/decoder clock for the device DCGM_FI_DEV_VIDEO_CLOCK Short = 102 // DCGM_FI_DEV_APP_SM_CLOCK represents SM Application clocks DCGM_FI_DEV_APP_SM_CLOCK Short = 110 // DCGM_FI_DEV_APP_MEM_CLOCK represents Memory Application clocks DCGM_FI_DEV_APP_MEM_CLOCK Short = 111 // DCGM_FI_DEV_CLOCKS_EVENT_REASONS represents Current clock event reasons (bitmask of DCGM_CLOCKS_EVENT_REASON_*) DCGM_FI_DEV_CLOCKS_EVENT_REASONS Short = 112 // DCGM_FI_DEV_MAX_SM_CLOCK represents Maximum supported SM clock for the device DCGM_FI_DEV_MAX_SM_CLOCK Short = 113 // DCGM_FI_DEV_MAX_MEM_CLOCK represents Maximum supported Memory clock for the device DCGM_FI_DEV_MAX_MEM_CLOCK Short = 114 // DCGM_FI_DEV_MAX_VIDEO_CLOCK represents Maximum supported Video encoder/decoder clock for the device DCGM_FI_DEV_MAX_VIDEO_CLOCK Short = 115 // DCGM_FI_DEV_AUTOBOOST represents Auto-boost for the device (1 = enabled. 0 = disabled) DCGM_FI_DEV_AUTOBOOST Short = 120 // DCGM_FI_DEV_SUPPORTED_CLOCKS represents Supported clocks for the device DCGM_FI_DEV_SUPPORTED_CLOCKS Short = 130 // DCGM_FI_DEV_MEMORY_TEMP represents Memory temperature for the device DCGM_FI_DEV_MEMORY_TEMP Short = 140 // DCGM_FI_DEV_GPU_TEMP represents Current temperature readings for the device, in degrees C DCGM_FI_DEV_GPU_TEMP Short = 150 // DCGM_FI_DEV_MEM_MAX_OP_TEMP represents Maximum operating temperature for the memory of this GPU. Above this temperature slowdown will occur. 
DCGM_FI_DEV_MEM_MAX_OP_TEMP Short = 151 // DCGM_FI_DEV_GPU_MAX_OP_TEMP represents Maximum operating temperature for this GPU DCGM_FI_DEV_GPU_MAX_OP_TEMP Short = 152 // DCGM_FI_DEV_GPU_TEMP_LIMIT represents Thermal margin temperature (distance to nearest slowdown threshold) for this GPU DCGM_FI_DEV_GPU_TEMP_LIMIT Short = 153 // DCGM_FI_DEV_POWER_USAGE represents Power usage for the device in Watts DCGM_FI_DEV_POWER_USAGE Short = 155 // DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION represents Total energy consumption for the GPU in mJ since the driver was last reloaded DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION Short = 156 // DCGM_FI_DEV_POWER_USAGE_INSTANT represents Current instantaneous power usage of the device in Watts DCGM_FI_DEV_POWER_USAGE_INSTANT Short = 157 // DCGM_FI_DEV_SLOWDOWN_TEMP represents Slowdown temperature for the device DCGM_FI_DEV_SLOWDOWN_TEMP Short = 158 // DCGM_FI_DEV_SHUTDOWN_TEMP represents Shutdown temperature for the device DCGM_FI_DEV_SHUTDOWN_TEMP Short = 159 // DCGM_FI_DEV_POWER_MGMT_LIMIT represents Current Power limit for the device DCGM_FI_DEV_POWER_MGMT_LIMIT Short = 160 // DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN represents Minimum power management limit for the device DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN Short = 161 // DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX represents Maximum power management limit for the device DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX Short = 162 // DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF represents Default power management limit for the device DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF Short = 163 // DCGM_FI_DEV_ENFORCED_POWER_LIMIT represents Effective power limit that the driver enforces after taking into account all limiters DCGM_FI_DEV_ENFORCED_POWER_LIMIT Short = 164 // DCGM_FI_DEV_REQUESTED_POWER_PROFILE_MASK represents Requested workload power profile mask(Blackwell and newer) DCGM_FI_DEV_REQUESTED_POWER_PROFILE_MASK Short = 165 // DCGM_FI_DEV_ENFORCED_POWER_PROFILE_MASK represents Enforced workload power profile mask(Blackwell and newer) 
DCGM_FI_DEV_ENFORCED_POWER_PROFILE_MASK Short = 166 // DCGM_FI_DEV_VALID_POWER_PROFILE_MASK represents Requested workload power profile mask(Blackwell and newer) DCGM_FI_DEV_VALID_POWER_PROFILE_MASK Short = 167 // DCGM_FI_DEV_FABRIC_MANAGER_STATUS represents The status of the fabric manager - a value from dcgmFabricManagerStatus_t. DCGM_FI_DEV_FABRIC_MANAGER_STATUS Short = 170 // DCGM_FI_DEV_FABRIC_MANAGER_ERROR_CODE represents NOTE: this is not populated unless the fabric manager completed startup DCGM_FI_DEV_FABRIC_MANAGER_ERROR_CODE Short = 171 // DCGM_FI_DEV_FABRIC_CLUSTER_UUID represents The uuid of the cluster to which this GPU belongs DCGM_FI_DEV_FABRIC_CLUSTER_UUID Short = 172 // DCGM_FI_DEV_FABRIC_CLIQUE_ID represents The ID of the fabric clique to which this GPU belongs DCGM_FI_DEV_FABRIC_CLIQUE_ID Short = 173 // DCGM_FI_DEV_FABRIC_HEALTH_MASK represents Use DCGM_GPU_FABRIC_HEALTH_GET macro to get the different health statuses. DCGM_FI_DEV_FABRIC_HEALTH_MASK Short = 174 // DCGM_FI_DEV_FABRIC_HEALTH_SUMMARY represents - NVML_GPU_FABRIC_HEALTH_SUMMARY_LIMITED_CAPACITY (3) DCGM_FI_DEV_FABRIC_HEALTH_SUMMARY Short = 175 // DCGM_FI_DEV_PSTATE represents Performance state (P-State) 0-15. 0=highest DCGM_FI_DEV_PSTATE Short = 190 // DCGM_FI_DEV_FAN_SPEED represents Fan speed for the device in percent 0-100 DCGM_FI_DEV_FAN_SPEED Short = 191 // DCGM_FI_DEV_PCIE_TX_THROUGHPUT represents Deprecated: Use DCGM_FI_PROF_PCIE_TX_BYTES instead. DCGM_FI_DEV_PCIE_TX_THROUGHPUT Short = 200 // DCGM_FI_DEV_PCIE_RX_THROUGHPUT represents Deprecated: Use DCGM_FI_PROF_PCIE_RX_BYTES instead. 
DCGM_FI_DEV_PCIE_RX_THROUGHPUT Short = 201 // DCGM_FI_DEV_PCIE_REPLAY_COUNTER represents PCIe replay counter DCGM_FI_DEV_PCIE_REPLAY_COUNTER Short = 202 // DCGM_FI_DEV_GPU_UTIL represents GPU Utilization DCGM_FI_DEV_GPU_UTIL Short = 203 // DCGM_FI_DEV_MEM_COPY_UTIL represents Memory Utilization DCGM_FI_DEV_MEM_COPY_UTIL Short = 204 // DCGM_FI_DEV_ACCOUNTING_DATA represents running "nvidia-smi -am 1" as root on the same node the host engine is running on. DCGM_FI_DEV_ACCOUNTING_DATA Short = 205 // DCGM_FI_DEV_ENC_UTIL represents Encoder Utilization DCGM_FI_DEV_ENC_UTIL Short = 206 // DCGM_FI_DEV_DEC_UTIL represents Decoder Utilization DCGM_FI_DEV_DEC_UTIL Short = 207 // DCGM_FI_DEV_XID_ERRORS represents XID errors. The value is the specific XID error DCGM_FI_DEV_XID_ERRORS Short = 230 // DCGM_FI_DEV_PCIE_MAX_LINK_GEN represents PCIe Max Link Generation DCGM_FI_DEV_PCIE_MAX_LINK_GEN Short = 235 // DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH represents PCIe Max Link Width DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH Short = 236 // DCGM_FI_DEV_PCIE_LINK_GEN represents PCIe Current Link Generation DCGM_FI_DEV_PCIE_LINK_GEN Short = 237 // DCGM_FI_DEV_PCIE_LINK_WIDTH represents PCIe Current Link Width DCGM_FI_DEV_PCIE_LINK_WIDTH Short = 238 // DCGM_FI_DEV_POWER_VIOLATION represents Power Violation time in ns DCGM_FI_DEV_POWER_VIOLATION Short = 240 // DCGM_FI_DEV_THERMAL_VIOLATION represents Thermal Violation time in ns DCGM_FI_DEV_THERMAL_VIOLATION Short = 241 // DCGM_FI_DEV_SYNC_BOOST_VIOLATION represents Sync Boost Violation time in ns DCGM_FI_DEV_SYNC_BOOST_VIOLATION Short = 242 // DCGM_FI_DEV_BOARD_LIMIT_VIOLATION represents Board violation limit. DCGM_FI_DEV_BOARD_LIMIT_VIOLATION Short = 243 // DCGM_FI_DEV_LOW_UTIL_VIOLATION represents Low utilisation violation limit. DCGM_FI_DEV_LOW_UTIL_VIOLATION Short = 244 // DCGM_FI_DEV_RELIABILITY_VIOLATION represents Reliability violation limit. 
DCGM_FI_DEV_RELIABILITY_VIOLATION Short = 245 // DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION represents App clock violation limit. DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION Short = 246 // DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION represents Base clock violation limit. DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION Short = 247 // DCGM_FI_DEV_FB_TOTAL represents Total Frame Buffer of the GPU in MB DCGM_FI_DEV_FB_TOTAL Short = 250 // DCGM_FI_DEV_FB_FREE represents Free Frame Buffer in MB DCGM_FI_DEV_FB_FREE Short = 251 // DCGM_FI_DEV_FB_USED represents Used Frame Buffer in MB DCGM_FI_DEV_FB_USED Short = 252 // DCGM_FI_DEV_FB_RESERVED represents Reserved Frame Buffer in MB DCGM_FI_DEV_FB_RESERVED Short = 253 // DCGM_FI_DEV_FB_USED_PERCENT represents Percentage used of Frame Buffer: 'Used/(Total - Reserved)'. Range 0.0-1.0 DCGM_FI_DEV_FB_USED_PERCENT Short = 254 // DCGM_FI_DEV_C2C_LINK_COUNT represents C2C Link Count DCGM_FI_DEV_C2C_LINK_COUNT Short = 285 // DCGM_FI_DEV_C2C_LINK_STATUS represents The value of 1 the link is ACTIVE. DCGM_FI_DEV_C2C_LINK_STATUS Short = 286 // DCGM_FI_DEV_C2C_MAX_BANDWIDTH represents The value indicates the link speed in MB/s. 
DCGM_FI_DEV_C2C_MAX_BANDWIDTH Short = 287 // DCGM_FI_DEV_ECC_CURRENT represents Current ECC mode for the device DCGM_FI_DEV_ECC_CURRENT Short = 300 // DCGM_FI_DEV_ECC_PENDING represents Pending ECC mode for the device DCGM_FI_DEV_ECC_PENDING Short = 301 // DCGM_FI_DEV_ECC_SBE_VOL_TOTAL represents Total single bit volatile ECC errors DCGM_FI_DEV_ECC_SBE_VOL_TOTAL Short = 310 // DCGM_FI_DEV_ECC_DBE_VOL_TOTAL represents Total double bit volatile ECC errors DCGM_FI_DEV_ECC_DBE_VOL_TOTAL Short = 311 // DCGM_FI_DEV_ECC_SBE_AGG_TOTAL represents Note: monotonically increasing DCGM_FI_DEV_ECC_SBE_AGG_TOTAL Short = 312 // DCGM_FI_DEV_ECC_DBE_AGG_TOTAL represents Note: monotonically increasing DCGM_FI_DEV_ECC_DBE_AGG_TOTAL Short = 313 // DCGM_FI_DEV_ECC_SBE_VOL_L1 represents L1 cache single bit volatile ECC errors DCGM_FI_DEV_ECC_SBE_VOL_L1 Short = 314 // DCGM_FI_DEV_ECC_DBE_VOL_L1 represents L1 cache double bit volatile ECC errors DCGM_FI_DEV_ECC_DBE_VOL_L1 Short = 315 // DCGM_FI_DEV_ECC_SBE_VOL_L2 represents L2 cache single bit volatile ECC errors DCGM_FI_DEV_ECC_SBE_VOL_L2 Short = 316 // DCGM_FI_DEV_ECC_DBE_VOL_L2 represents L2 cache double bit volatile ECC errors DCGM_FI_DEV_ECC_DBE_VOL_L2 Short = 317 // DCGM_FI_DEV_ECC_SBE_VOL_DEV represents Device memory single bit volatile ECC errors DCGM_FI_DEV_ECC_SBE_VOL_DEV Short = 318 // DCGM_FI_DEV_ECC_DBE_VOL_DEV represents Device memory double bit volatile ECC errors DCGM_FI_DEV_ECC_DBE_VOL_DEV Short = 319 // DCGM_FI_DEV_ECC_SBE_VOL_REG represents Register file single bit volatile ECC errors DCGM_FI_DEV_ECC_SBE_VOL_REG Short = 320 // DCGM_FI_DEV_ECC_DBE_VOL_REG represents Register file double bit volatile ECC errors DCGM_FI_DEV_ECC_DBE_VOL_REG Short = 321 // DCGM_FI_DEV_ECC_SBE_VOL_TEX represents Texture memory single bit volatile ECC errors DCGM_FI_DEV_ECC_SBE_VOL_TEX Short = 322 // DCGM_FI_DEV_ECC_DBE_VOL_TEX represents Texture memory double bit volatile ECC errors DCGM_FI_DEV_ECC_DBE_VOL_TEX Short = 323 // 
DCGM_FI_DEV_ECC_SBE_AGG_L1 represents Note: monotonically increasing DCGM_FI_DEV_ECC_SBE_AGG_L1 Short = 324 // DCGM_FI_DEV_ECC_DBE_AGG_L1 represents Note: monotonically increasing DCGM_FI_DEV_ECC_DBE_AGG_L1 Short = 325 // DCGM_FI_DEV_ECC_SBE_AGG_L2 represents Note: monotonically increasing DCGM_FI_DEV_ECC_SBE_AGG_L2 Short = 326 // DCGM_FI_DEV_ECC_DBE_AGG_L2 represents Note: monotonically increasing DCGM_FI_DEV_ECC_DBE_AGG_L2 Short = 327 // DCGM_FI_DEV_ECC_SBE_AGG_DEV represents Note: monotonically increasing DCGM_FI_DEV_ECC_SBE_AGG_DEV Short = 328 // DCGM_FI_DEV_ECC_DBE_AGG_DEV represents Note: monotonically increasing DCGM_FI_DEV_ECC_DBE_AGG_DEV Short = 329 // DCGM_FI_DEV_ECC_SBE_AGG_REG represents Note: monotonically increasing DCGM_FI_DEV_ECC_SBE_AGG_REG Short = 330 // DCGM_FI_DEV_ECC_DBE_AGG_REG represents Note: monotonically increasing DCGM_FI_DEV_ECC_DBE_AGG_REG Short = 331 // DCGM_FI_DEV_ECC_SBE_AGG_TEX represents Note: monotonically increasing DCGM_FI_DEV_ECC_SBE_AGG_TEX Short = 332 // DCGM_FI_DEV_ECC_DBE_AGG_TEX represents Note: monotonically increasing DCGM_FI_DEV_ECC_DBE_AGG_TEX Short = 333 // DCGM_FI_DEV_ECC_SBE_VOL_SHM represents Texture SHM single bit volatile ECC errors DCGM_FI_DEV_ECC_SBE_VOL_SHM Short = 334 // DCGM_FI_DEV_ECC_DBE_VOL_SHM represents Texture SHM double bit volatile ECC errors DCGM_FI_DEV_ECC_DBE_VOL_SHM Short = 335 // DCGM_FI_DEV_ECC_SBE_VOL_CBU represents CBU single bit ECC volatile errors DCGM_FI_DEV_ECC_SBE_VOL_CBU Short = 336 // DCGM_FI_DEV_ECC_DBE_VOL_CBU represents CBU double bit ECC volatile errors DCGM_FI_DEV_ECC_DBE_VOL_CBU Short = 337 // DCGM_FI_DEV_ECC_SBE_AGG_SHM represents Texture SHM single bit aggregate ECC errors DCGM_FI_DEV_ECC_SBE_AGG_SHM Short = 338 // DCGM_FI_DEV_ECC_DBE_AGG_SHM represents Texture SHM double bit aggregate ECC errors DCGM_FI_DEV_ECC_DBE_AGG_SHM Short = 339 // DCGM_FI_DEV_ECC_SBE_AGG_CBU represents CBU single bit ECC aggregate errors DCGM_FI_DEV_ECC_SBE_AGG_CBU Short = 340 // 
DCGM_FI_DEV_ECC_DBE_AGG_CBU represents CBU double bit ECC aggregate errors DCGM_FI_DEV_ECC_DBE_AGG_CBU Short = 341 // DCGM_FI_DEV_ECC_SBE_VOL_SRM represents SRAM single bit ECC volatile errors DCGM_FI_DEV_ECC_SBE_VOL_SRM Short = 342 // DCGM_FI_DEV_ECC_DBE_VOL_SRM represents SRAM double bit ECC volatile errors DCGM_FI_DEV_ECC_DBE_VOL_SRM Short = 343 // DCGM_FI_DEV_ECC_SBE_AGG_SRM represents SRAM single bit ECC aggregate errors DCGM_FI_DEV_ECC_SBE_AGG_SRM Short = 344 // DCGM_FI_DEV_ECC_DBE_AGG_SRM represents SRAM double bit ECC aggregate errors DCGM_FI_DEV_ECC_DBE_AGG_SRM Short = 345 // DCGM_FI_DEV_THRESHOLD_SRM represents SRAM Threashhold Exceeded boolean (1=true) DCGM_FI_DEV_THRESHOLD_SRM Short = 346 // DCGM_FI_DEV_DIAG_MEMORY_RESULT represents Refers to a `int64_t` storing a value drawn from `dcgmError_t` enumeration DCGM_FI_DEV_DIAG_MEMORY_RESULT Short = 350 // DCGM_FI_DEV_DIAG_DIAGNOSTIC_RESULT represents Refers to a `int64_t` storing a value drawn from `dcgmError_t` enumeration DCGM_FI_DEV_DIAG_DIAGNOSTIC_RESULT Short = 351 // DCGM_FI_DEV_DIAG_PCIE_RESULT represents Refers to a `int64_t` storing a value drawn from `dcgmError_t` enumeration DCGM_FI_DEV_DIAG_PCIE_RESULT Short = 352 // DCGM_FI_DEV_DIAG_TARGETED_STRESS_RESULT represents Refers to a `int64_t` storing a value drawn from `dcgmError_t` enumeration DCGM_FI_DEV_DIAG_TARGETED_STRESS_RESULT Short = 353 // DCGM_FI_DEV_DIAG_TARGETED_POWER_RESULT represents Refers to a `int64_t` storing a value drawn from `dcgmError_t` enumeration DCGM_FI_DEV_DIAG_TARGETED_POWER_RESULT Short = 354 // DCGM_FI_DEV_DIAG_MEMORY_BANDWIDTH_RESULT represents Refers to a `int64_t` storing a value drawn from `dcgmError_t` enumeration DCGM_FI_DEV_DIAG_MEMORY_BANDWIDTH_RESULT Short = 355 // DCGM_FI_DEV_DIAG_MEMTEST_RESULT represents Refers to a `int64_t` storing a value drawn from `dcgmError_t` enumeration DCGM_FI_DEV_DIAG_MEMTEST_RESULT Short = 356 // DCGM_FI_DEV_DIAG_PULSE_TEST_RESULT represents Refers to a `int64_t` storing a value 
drawn from `dcgmError_t` enumeration DCGM_FI_DEV_DIAG_PULSE_TEST_RESULT Short = 357 // DCGM_FI_DEV_DIAG_EUD_RESULT represents Refers to a `int64_t` storing a value drawn from `dcgmError_t` enumeration DCGM_FI_DEV_DIAG_EUD_RESULT Short = 358 // DCGM_FI_DEV_DIAG_CPU_EUD_RESULT represents Refers to a `int64_t` storing a value drawn from `dcgmError_t` enumeration DCGM_FI_DEV_DIAG_CPU_EUD_RESULT Short = 359 // DCGM_FI_DEV_DIAG_SOFTWARE_RESULT represents Refers to a `int64_t` storing a value drawn from `dcgmError_t` enumeration DCGM_FI_DEV_DIAG_SOFTWARE_RESULT Short = 360 // DCGM_FI_DEV_DIAG_NVBANDWIDTH_RESULT represents Refers to a `int64_t` storing a value drawn from `dcgmError_t` enumeration DCGM_FI_DEV_DIAG_NVBANDWIDTH_RESULT Short = 361 // DCGM_FI_DEV_DIAG_STATUS represents Refers to a binary blob of a `dcgmDiagStatus_t` struct DCGM_FI_DEV_DIAG_STATUS Short = 362 // DCGM_FI_DEV_DIAG_NCCL_TESTS_RESULT represents Refers to a `int64_t` storing a value drawn from `dcgmError_t` enumeration DCGM_FI_DEV_DIAG_NCCL_TESTS_RESULT Short = 363 // DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_MAX represents Historical max available spare memory rows per memory bank DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_MAX Short = 385 // DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_HIGH represents Historical high mark of available spare memory rows per memory bank DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_HIGH Short = 386 // DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_PARTIAL represents Historical mark of partial available spare memory rows per memory bank DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_PARTIAL Short = 387 // DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_LOW represents Historical low mark of available spare memory rows per memory bank DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_LOW Short = 388 // DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_NONE represents Historical marker of memory banks with no available spare memory rows DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_NONE Short = 389 // DCGM_FI_DEV_RETIRED_SBE represents Note: monotonically increasing 
DCGM_FI_DEV_RETIRED_SBE Short = 390 // DCGM_FI_DEV_RETIRED_DBE represents Note: monotonically increasing DCGM_FI_DEV_RETIRED_DBE Short = 391 // DCGM_FI_DEV_RETIRED_PENDING represents Number of pages pending retirement DCGM_FI_DEV_RETIRED_PENDING Short = 392 // DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS represents Number of remapped rows for uncorrectable errors DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS Short = 393 // DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS represents Number of remapped rows for correctable errors DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS Short = 394 // DCGM_FI_DEV_ROW_REMAP_FAILURE represents Whether remapping of rows has failed DCGM_FI_DEV_ROW_REMAP_FAILURE Short = 395 // DCGM_FI_DEV_ROW_REMAP_PENDING represents Whether remapping of rows is pending DCGM_FI_DEV_ROW_REMAP_PENDING Short = 396 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 represents NV Link flow control CRC Error Counter for Lane 0 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 Short = 400 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 represents NV Link flow control CRC Error Counter for Lane 1 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 Short = 401 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 represents NV Link flow control CRC Error Counter for Lane 2 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 Short = 402 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 represents NV Link flow control CRC Error Counter for Lane 3 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 Short = 403 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 represents NV Link flow control CRC Error Counter for Lane 4 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 Short = 404 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 represents NV Link flow control CRC Error Counter for Lane 5 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 Short = 405 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L12 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L12 Short = 406 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L13 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L13 Short = 407 // 
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L14 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L14 Short = 408 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL represents NV Link flow control CRC Error Counter total for all Lanes DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL Short = 409 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 represents NV Link data CRC Error Counter for Lane 0 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 Short = 410 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 represents NV Link data CRC Error Counter for Lane 1 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 Short = 411 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 represents NV Link data CRC Error Counter for Lane 2 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 Short = 412 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 represents NV Link data CRC Error Counter for Lane 3 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 Short = 413 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 represents NV Link data CRC Error Counter for Lane 4 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 Short = 414 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 represents NV Link data CRC Error Counter for Lane 5 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 Short = 415 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L12 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L12 Short = 416 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L13 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L13 Short = 417 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L14 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L14 Short = 418 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL represents NV Link data CRC Error Counter total for all Lanes DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL Short = 419 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 represents NV Link Replay Error Counter for Lane 0 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 Short = 420 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 represents NV Link Replay Error Counter for Lane 1 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 Short = 421 // 
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 represents NV Link Replay Error Counter for Lane 2 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 Short = 422 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 represents NV Link Replay Error Counter for Lane 3 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 Short = 423 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 represents NV Link Replay Error Counter for Lane 4 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 Short = 424 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 represents NV Link Replay Error Counter for Lane 5 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 Short = 425 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L12 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L12 Short = 426 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L13 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L13 Short = 427 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L14 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L14 Short = 428 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL represents NV Link Replay Error Counter total for all Lanes DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL Short = 429 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 represents NV Link Recovery Error Counter for Lane 0 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 Short = 430 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 represents NV Link Recovery Error Counter for Lane 1 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 Short = 431 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 represents NV Link Recovery Error Counter for Lane 2 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 Short = 432 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 represents NV Link Recovery Error Counter for Lane 3 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 Short = 433 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 represents NV Link Recovery Error Counter for Lane 4 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 Short = 434 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 represents NV Link Recovery Error Counter for Lane 5 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 Short = 435 // 
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L12 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L12 Short = 436 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L13 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L13 Short = 437 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L14 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L14 Short = 438 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL represents NV Link Recovery Error Counter total for all Lanes DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL Short = 439 // DCGM_FI_DEV_NVLINK_THROUGHPUT_L0 represents NV Link Throughput for Lane 0 DCGM_FI_DEV_NVLINK_THROUGHPUT_L0 Short = 440 // DCGM_FI_DEV_NVLINK_THROUGHPUT_L1 represents NV Link Throughput for Lane 1 DCGM_FI_DEV_NVLINK_THROUGHPUT_L1 Short = 441 // DCGM_FI_DEV_NVLINK_THROUGHPUT_L2 represents NV Link Throughput for Lane 2 DCGM_FI_DEV_NVLINK_THROUGHPUT_L2 Short = 442 // DCGM_FI_DEV_NVLINK_THROUGHPUT_L3 represents NV Link Throughput for Lane 3 DCGM_FI_DEV_NVLINK_THROUGHPUT_L3 Short = 443 // DCGM_FI_DEV_NVLINK_THROUGHPUT_L4 represents NV Link Throughput for Lane 4 DCGM_FI_DEV_NVLINK_THROUGHPUT_L4 Short = 444 // DCGM_FI_DEV_NVLINK_THROUGHPUT_L5 represents NV Link Throughput for Lane 5 DCGM_FI_DEV_NVLINK_THROUGHPUT_L5 Short = 445 // DCGM_FI_DEV_NVLINK_THROUGHPUT_L12 DCGM_FI_DEV_NVLINK_THROUGHPUT_L12 Short = 446 // DCGM_FI_DEV_NVLINK_THROUGHPUT_L13 DCGM_FI_DEV_NVLINK_THROUGHPUT_L13 Short = 447 // DCGM_FI_DEV_NVLINK_THROUGHPUT_L14 DCGM_FI_DEV_NVLINK_THROUGHPUT_L14 Short = 448 // DCGM_FI_DEV_NVLINK_THROUGHPUT_TOTAL represents NV Link Throughput total for all Lanes DCGM_FI_DEV_NVLINK_THROUGHPUT_TOTAL Short = 449 // DCGM_FI_DEV_GPU_NVLINK_ERRORS represents GPU NVLink error information DCGM_FI_DEV_GPU_NVLINK_ERRORS Short = 450 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6 represents NV Link flow control CRC Error Counter for Lane 6 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6 Short = 451 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7 represents NV Link flow control CRC Error Counter for Lane 7 
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7 Short = 452 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8 represents NV Link flow control CRC Error Counter for Lane 8 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8 Short = 453 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9 represents NV Link flow control CRC Error Counter for Lane 9 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9 Short = 454 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10 represents NV Link flow control CRC Error Counter for Lane 10 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10 Short = 455 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11 represents NV Link flow control CRC Error Counter for Lane 11 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11 Short = 456 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6 represents NV Link data CRC Error Counter for Lane 6 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6 Short = 457 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7 represents NV Link data CRC Error Counter for Lane 7 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7 Short = 458 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8 represents NV Link data CRC Error Counter for Lane 8 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8 Short = 459 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9 represents NV Link data CRC Error Counter for Lane 9 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9 Short = 460 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10 represents NV Link data CRC Error Counter for Lane 10 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10 Short = 461 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11 represents NV Link data CRC Error Counter for Lane 11 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11 Short = 462 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6 represents NV Link Replay Error Counter for Lane 6 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6 Short = 463 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7 represents NV Link Replay Error Counter for Lane 7 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7 Short = 464 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8 represents NV Link 
Replay Error Counter for Lane 8 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8 Short = 465 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9 represents NV Link Replay Error Counter for Lane 9 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9 Short = 466 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10 represents NV Link Replay Error Counter for Lane 10 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10 Short = 467 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11 represents NV Link Replay Error Counter for Lane 11 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11 Short = 468 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6 represents NV Link Recovery Error Counter for Lane 6 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6 Short = 469 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7 represents NV Link Recovery Error Counter for Lane 7 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7 Short = 470 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8 represents NV Link Recovery Error Counter for Lane 8 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8 Short = 471 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9 represents NV Link Recovery Error Counter for Lane 9 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9 Short = 472 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10 represents NV Link Recovery Error Counter for Lane 10 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10 Short = 473 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11 represents NV Link Recovery Error Counter for Lane 11 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11 Short = 474 // DCGM_FI_DEV_NVLINK_THROUGHPUT_L6 represents NV Link Throughput for Lane 6 DCGM_FI_DEV_NVLINK_THROUGHPUT_L6 Short = 475 // DCGM_FI_DEV_NVLINK_THROUGHPUT_L7 represents NV Link Throughput for Lane 7 DCGM_FI_DEV_NVLINK_THROUGHPUT_L7 Short = 476 // DCGM_FI_DEV_NVLINK_THROUGHPUT_L8 represents NV Link Throughput for Lane 8 DCGM_FI_DEV_NVLINK_THROUGHPUT_L8 Short = 477 // DCGM_FI_DEV_NVLINK_THROUGHPUT_L9 represents NV Link Throughput for Lane 9 DCGM_FI_DEV_NVLINK_THROUGHPUT_L9 Short = 478 // DCGM_FI_DEV_NVLINK_THROUGHPUT_L10 
represents NV Link Throughput for Lane 10 DCGM_FI_DEV_NVLINK_THROUGHPUT_L10 Short = 479 // DCGM_FI_DEV_NVLINK_THROUGHPUT_L11 represents NV Link Throughput for Lane 11 DCGM_FI_DEV_NVLINK_THROUGHPUT_L11 Short = 480 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L15 represents NV Link flow control CRC Error Counter for Lane 15 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L15 Short = 481 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L16 represents NV Link flow control CRC Error Counter for Lane 16 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L16 Short = 482 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L17 represents NV Link flow control CRC Error Counter for Lane 17 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L17 Short = 483 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L15 represents NV Link data CRC Error Counter for Lane 15 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L15 Short = 484 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L16 represents NV Link data CRC Error Counter for Lane 16 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L16 Short = 485 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L17 represents NV Link data CRC Error Counter for Lane 17 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L17 Short = 486 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L15 represents NV Link Replay Error Counter for Lane 15 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L15 Short = 487 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L16 represents NV Link Replay Error Counter for Lane 16 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L16 Short = 488 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L17 represents NV Link Replay Error Counter for Lane 17 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L17 Short = 489 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L15 represents NV Link Recovery Error Counter for Lane 15 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L15 Short = 491 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L16 represents NV Link Recovery Error Counter for Lane 16 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L16 Short = 492 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L17 represents NV Link Recovery Error Counter for Lane 17 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L17 Short = 493 // DCGM_FI_DEV_NVLINK_THROUGHPUT_L15 represents NV Link Throughput for Lane 15 DCGM_FI_DEV_NVLINK_THROUGHPUT_L15 Short = 494 // DCGM_FI_DEV_NVLINK_THROUGHPUT_L16 represents NV Link Throughput for Lane 16 DCGM_FI_DEV_NVLINK_THROUGHPUT_L16 Short = 495 // DCGM_FI_DEV_NVLINK_THROUGHPUT_L17 represents NV Link Throughput for Lane 17 DCGM_FI_DEV_NVLINK_THROUGHPUT_L17 Short = 496 // DCGM_FI_DEV_NVLINK_ERROR_DL_CRC represents NVLink CRC Error Counter DCGM_FI_DEV_NVLINK_ERROR_DL_CRC Short = 497 // DCGM_FI_DEV_NVLINK_ERROR_DL_RECOVERY represents NVLink Recovery Error Counter DCGM_FI_DEV_NVLINK_ERROR_DL_RECOVERY Short = 498 // DCGM_FI_DEV_NVLINK_ERROR_DL_REPLAY represents NVLink Replay Error 
Counter DCGM_FI_DEV_NVLINK_ERROR_DL_REPLAY Short = 499 // DCGM_FI_DEV_VIRTUAL_MODE represents One of DCGM_GPU_VIRTUALIZATION_MODE_* constants. DCGM_FI_DEV_VIRTUAL_MODE Short = 500 // DCGM_FI_DEV_SUPPORTED_TYPE_INFO represents Includes Count and Static info of vGPU types supported on a device DCGM_FI_DEV_SUPPORTED_TYPE_INFO Short = 501 // DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS represents Includes Count and currently Creatable vGPU types on a device DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS Short = 502 // DCGM_FI_DEV_VGPU_INSTANCE_IDS represents Includes Count and currently Active vGPU Instances on a device DCGM_FI_DEV_VGPU_INSTANCE_IDS Short = 503 // DCGM_FI_DEV_VGPU_UTILIZATIONS represents Utilization values for vGPUs running on the device DCGM_FI_DEV_VGPU_UTILIZATIONS Short = 504 // DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION represents Utilization values for processes running within vGPU VMs using the device DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION Short = 505 // DCGM_FI_DEV_ENC_STATS represents Current encoder statistics for a given device DCGM_FI_DEV_ENC_STATS Short = 506 // DCGM_FI_DEV_FBC_STATS represents Statistics of current active frame buffer capture sessions on a given device DCGM_FI_DEV_FBC_STATS Short = 507 // DCGM_FI_DEV_FBC_SESSIONS_INFO represents Information about active frame buffer capture sessions on a target device DCGM_FI_DEV_FBC_SESSIONS_INFO Short = 508 // DCGM_FI_DEV_SUPPORTED_VGPU_TYPE_IDS represents Includes Count and currently Supported vGPU types on a device DCGM_FI_DEV_SUPPORTED_VGPU_TYPE_IDS Short = 509 // DCGM_FI_DEV_VGPU_TYPE_INFO represents Includes Static info of vGPU types supported on a device DCGM_FI_DEV_VGPU_TYPE_INFO Short = 510 // DCGM_FI_DEV_VGPU_TYPE_NAME represents Includes the name of a vGPU type supported on a device DCGM_FI_DEV_VGPU_TYPE_NAME Short = 511 // DCGM_FI_DEV_VGPU_TYPE_CLASS represents Includes the class of a vGPU type supported on a device DCGM_FI_DEV_VGPU_TYPE_CLASS Short = 512 // DCGM_FI_DEV_VGPU_TYPE_LICENSE 
represents Includes the license info for a vGPU type supported on a device DCGM_FI_DEV_VGPU_TYPE_LICENSE Short = 513 // DCGM_FI_FIRST_VGPU_FIELD_ID represents Starting field ID of the vGPU instance DCGM_FI_FIRST_VGPU_FIELD_ID Short = 520 // DCGM_FI_DEV_VGPU_VM_ID represents VM ID of the vGPU instance DCGM_FI_DEV_VGPU_VM_ID Short = 520 // DCGM_FI_DEV_VGPU_VM_NAME represents VM name of the vGPU instance DCGM_FI_DEV_VGPU_VM_NAME Short = 521 // DCGM_FI_DEV_VGPU_TYPE represents vGPU type of the vGPU instance DCGM_FI_DEV_VGPU_TYPE Short = 522 // DCGM_FI_DEV_VGPU_UUID represents UUID of the vGPU instance DCGM_FI_DEV_VGPU_UUID Short = 523 // DCGM_FI_DEV_VGPU_DRIVER_VERSION represents Driver version of the vGPU instance DCGM_FI_DEV_VGPU_DRIVER_VERSION Short = 524 // DCGM_FI_DEV_VGPU_MEMORY_USAGE represents Memory usage of the vGPU instance DCGM_FI_DEV_VGPU_MEMORY_USAGE Short = 525 // DCGM_FI_DEV_VGPU_LICENSE_STATUS represents 1 = vgpu is licensed DCGM_FI_DEV_VGPU_LICENSE_STATUS Short = 526 // DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT represents Frame rate limit of the vGPU instance DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT Short = 527 // DCGM_FI_DEV_VGPU_ENC_STATS represents Current encoder statistics of the vGPU instance DCGM_FI_DEV_VGPU_ENC_STATS Short = 528 // DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO represents Information about all active encoder sessions on the vGPU instance DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO Short = 529 // DCGM_FI_DEV_VGPU_FBC_STATS represents Statistics of current active frame buffer capture sessions on the vGPU instance DCGM_FI_DEV_VGPU_FBC_STATS Short = 530 // DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO represents Information about active frame buffer capture sessions on the vGPU instance DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO Short = 531 // DCGM_FI_DEV_VGPU_INSTANCE_LICENSE_STATE represents License state information of the vGPU instance DCGM_FI_DEV_VGPU_INSTANCE_LICENSE_STATE Short = 532 // DCGM_FI_DEV_VGPU_PCI_ID represents PCI Id of the vGPU instance DCGM_FI_DEV_VGPU_PCI_ID Short 
= 533 // DCGM_FI_DEV_VGPU_VM_GPU_INSTANCE_ID represents GPU Instance ID for the given vGPU Instance DCGM_FI_DEV_VGPU_VM_GPU_INSTANCE_ID Short = 534 // DCGM_FI_LAST_VGPU_FIELD_ID represents Last field ID of the vGPU instance DCGM_FI_LAST_VGPU_FIELD_ID Short = 570 // DCGM_FI_DEV_PLATFORM_INFINIBAND_GUID represents Infiniband GUID string with format 0xXXXXXXXXXXXXXXXX for the specified GPU. DCGM_FI_DEV_PLATFORM_INFINIBAND_GUID Short = 571 // DCGM_FI_DEV_PLATFORM_CHASSIS_SERIAL_NUMBER represents Serial number of the chassis containing this GPU DCGM_FI_DEV_PLATFORM_CHASSIS_SERIAL_NUMBER Short = 572 // DCGM_FI_DEV_PLATFORM_CHASSIS_SLOT_NUMBER represents Slot number in the rack containing the GPU (includes switches) DCGM_FI_DEV_PLATFORM_CHASSIS_SLOT_NUMBER Short = 573 // DCGM_FI_DEV_PLATFORM_TRAY_INDEX represents Tray index within the compute slots in the chassis containing this GPU (does not include switches) DCGM_FI_DEV_PLATFORM_TRAY_INDEX Short = 574 // DCGM_FI_DEV_PLATFORM_HOST_ID represents Index of the node within the slot containing the GPU DCGM_FI_DEV_PLATFORM_HOST_ID Short = 575 // DCGM_FI_DEV_PLATFORM_PEER_TYPE represents Platform indicated NVLink-peer type (e.g. 
switch present or not) DCGM_FI_DEV_PLATFORM_PEER_TYPE Short = 576 // DCGM_FI_DEV_PLATFORM_MODULE_ID represents ID of the GPU within the node DCGM_FI_DEV_PLATFORM_MODULE_ID Short = 577 // DCGM_FI_DEV_NVLINK_PPRM_OPER_RECOVERY represents PPRM recovery operation status DCGM_FI_DEV_NVLINK_PPRM_OPER_RECOVERY Short = 580 // DCGM_FI_DEV_NVLINK_PPCNT_RECOVERY_TIME_SINCE_LAST represents Time in seconds since last PRM recovery DCGM_FI_DEV_NVLINK_PPCNT_RECOVERY_TIME_SINCE_LAST Short = 581 // DCGM_FI_DEV_NVLINK_PPCNT_RECOVERY_TIME_BETWEEN_LAST_TWO represents Time in milliseconds between last two recoveries DCGM_FI_DEV_NVLINK_PPCNT_RECOVERY_TIME_BETWEEN_LAST_TWO Short = 582 // DCGM_FI_DEV_NVLINK_PPCNT_RECOVERY_TOTAL_SUCCESSFUL_EVENTS represents Total successful recovery events counter DCGM_FI_DEV_NVLINK_PPCNT_RECOVERY_TOTAL_SUCCESSFUL_EVENTS Short = 583 // DCGM_FI_DEV_NVLINK_PPCNT_PHYSICAL_SUCCESSFUL_RECOVERY_EVENTS represents Physical layer successful recovery events DCGM_FI_DEV_NVLINK_PPCNT_PHYSICAL_SUCCESSFUL_RECOVERY_EVENTS Short = 584 // DCGM_FI_DEV_NVLINK_PPCNT_PHYSICAL_LINK_DOWN_COUNTER represents Physical layer link down counter DCGM_FI_DEV_NVLINK_PPCNT_PHYSICAL_LINK_DOWN_COUNTER Short = 585 // DCGM_FI_DEV_NVLINK_PPCNT_PLR_RCV_CODES represents PLR received codewords counter DCGM_FI_DEV_NVLINK_PPCNT_PLR_RCV_CODES Short = 586 // DCGM_FI_DEV_NVLINK_PPCNT_PLR_RCV_CODE_ERR represents PLR received code error counter DCGM_FI_DEV_NVLINK_PPCNT_PLR_RCV_CODE_ERR Short = 587 // DCGM_FI_DEV_NVLINK_PPCNT_PLR_RCV_UNCORRECTABLE_CODE represents PLR received uncorrectable codes counter DCGM_FI_DEV_NVLINK_PPCNT_PLR_RCV_UNCORRECTABLE_CODE Short = 588 // DCGM_FI_DEV_NVLINK_PPCNT_PLR_XMIT_CODES represents PLR transmitted codewords counter DCGM_FI_DEV_NVLINK_PPCNT_PLR_XMIT_CODES Short = 589 // DCGM_FI_DEV_NVLINK_PPCNT_PLR_XMIT_RETRY_CODES represents PLR transmitted retry codes counter DCGM_FI_DEV_NVLINK_PPCNT_PLR_XMIT_RETRY_CODES Short = 590 // DCGM_FI_DEV_NVLINK_PPCNT_PLR_XMIT_RETRY_EVENTS 
represents PLR transmitted retry events counter DCGM_FI_DEV_NVLINK_PPCNT_PLR_XMIT_RETRY_EVENTS Short = 591 // DCGM_FI_DEV_NVLINK_PPCNT_PLR_SYNC_EVENTS represents PLR sync events counter DCGM_FI_DEV_NVLINK_PPCNT_PLR_SYNC_EVENTS Short = 592 // DCGM_FI_INTERNAL_FIELDS_0_START represents Starting ID for all the internal fields DCGM_FI_INTERNAL_FIELDS_0_START Short = 600 // DCGM_FI_INTERNAL_FIELDS_0_END represents Last ID for all the internal fields DCGM_FI_INTERNAL_FIELDS_0_END Short = 699 // DCGM_FI_FIRST_NVSWITCH_FIELD_ID represents Starting field ID of the NVSwitch instance DCGM_FI_FIRST_NVSWITCH_FIELD_ID Short = 700 // DCGM_FI_DEV_NVSWITCH_VOLTAGE_MVOLT represents NvSwitch voltage DCGM_FI_DEV_NVSWITCH_VOLTAGE_MVOLT Short = 701 // DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ represents NvSwitch Current IDDQ DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ Short = 702 // DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_REV represents NvSwitch Current IDDQ Rev DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_REV Short = 703 // DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_DVDD represents NvSwitch Current IDDQ Rev DVDD DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_DVDD Short = 704 // DCGM_FI_DEV_NVSWITCH_POWER_VDD represents NvSwitch Power VDD in watts DCGM_FI_DEV_NVSWITCH_POWER_VDD Short = 705 // DCGM_FI_DEV_NVSWITCH_POWER_DVDD represents NvSwitch Power DVDD in watts DCGM_FI_DEV_NVSWITCH_POWER_DVDD Short = 706 // DCGM_FI_DEV_NVSWITCH_POWER_HVDD represents NvSwitch Power HVDD in watts DCGM_FI_DEV_NVSWITCH_POWER_HVDD Short = 707 // DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_TX represents NVSwitch Tx Throughput Counter for ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_TX Short = 780 // DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_RX represents NVSwitch Rx Throughput Counter for ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_RX Short = 781 // DCGM_FI_DEV_NVSWITCH_LINK_FATAL_ERRORS represents NvSwitch fatal_errors for ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_FATAL_ERRORS Short = 782 // DCGM_FI_DEV_NVSWITCH_LINK_NON_FATAL_ERRORS represents NvSwitch 
non_fatal_errors for ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_NON_FATAL_ERRORS Short = 783 // DCGM_FI_DEV_NVSWITCH_LINK_REPLAY_ERRORS represents NvSwitch replay_count_errors for ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_REPLAY_ERRORS Short = 784 // DCGM_FI_DEV_NVSWITCH_LINK_RECOVERY_ERRORS represents NvSwitch recovery_count_errors for ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_RECOVERY_ERRORS Short = 785 // DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS represents NvSwitch flit_err_count_errors for ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS Short = 786 // DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS represents NvLink lane crc_err_count_aggregate_errors for ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS Short = 787 // DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS represents NvLink lane ecc_err_count_aggregate_errors for ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS Short = 788 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC0 represents Nvlink lane latency low lane0 counter. DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC0 Short = 789 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC1 represents Nvlink lane latency low lane1 counter. DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC1 Short = 790 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC2 represents Nvlink lane latency low lane2 counter. DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC2 Short = 791 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC3 represents Nvlink lane latency low lane3 counter. DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC3 Short = 792 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC0 represents Nvlink lane latency medium lane0 counter. DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC0 Short = 793 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC1 represents Nvlink lane latency medium lane1 counter. DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC1 Short = 794 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC2 represents Nvlink lane latency medium lane2 counter. 
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC2 Short = 795 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC3 represents Nvlink lane latency medium lane3 counter. DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC3 Short = 796 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC0 represents Nvlink lane latency high lane0 counter. DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC0 Short = 797 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC1 represents Nvlink lane latency high lane1 counter. DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC1 Short = 798 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC2 represents Nvlink lane latency high lane2 counter. DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC2 Short = 799 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC3 represents Nvlink lane latency high lane3 counter. DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC3 Short = 800 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC0 represents Nvlink lane latency panic lane0 counter. DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC0 Short = 801 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC1 represents Nvlink lane latency panic lane1 counter. DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC1 Short = 802 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC2 represents Nvlink lane latency panic lane2 counter. DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC2 Short = 803 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC3 represents Nvlink lane latency panic lane3 counter. DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC3 Short = 804 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC0 represents Nvlink lane latency count lane0 counter. DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC0 Short = 805 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC1 represents Nvlink lane latency count lane1 counter. DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC1 Short = 806 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC2 represents Nvlink lane latency count lane2 counter. 
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC2 Short = 807 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC3 represents Nvlink lane latency count lane3 counter. DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC3 Short = 808 // DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE0 represents NvLink lane crc_err_count for lane 0 on ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE0 Short = 809 // DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE1 represents NvLink lane crc_err_count for lane 1 on ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE1 Short = 810 // DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE2 represents NvLink lane crc_err_count for lane 2 on ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE2 Short = 811 // DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE3 represents NvLink lane crc_err_count for lane 3 on ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE3 Short = 812 // DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE0 represents NvLink lane ecc_err_count for lane 0 on ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE0 Short = 813 // DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE1 represents NvLink lane ecc_err_count for lane 1 on ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE1 Short = 814 // DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE2 represents NvLink lane ecc_err_count for lane 2 on ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE2 Short = 815 // DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE3 represents NvLink lane ecc_err_count for lane 3 on ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE3 Short = 816 // DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE4 represents NvLink lane crc_err_count for lane 4 on ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE4 Short = 817 // DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE5 represents NvLink lane crc_err_count for lane 5 on ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE5 Short = 818 // DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE6 represents NvLink lane crc_err_count for lane 6 on ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE6 Short = 
819 // DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE7 represents NvLink lane crc_err_count for lane 7 on ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE7 Short = 820 // DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE4 represents NvLink lane ecc_err_count for lane 4 on ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE4 Short = 821 // DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE5 represents NvLink lane ecc_err_count for lane 5 on ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE5 Short = 822 // DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE6 represents NvLink lane ecc_err_count for lane 6 on ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE6 Short = 823 // DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE7 represents NvLink lane ecc_err_count for lane 7 on ports 0-17 DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE7 Short = 824 // DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L0 represents NV Link TX Throughput for Lane 0 DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L0 Short = 825 // DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L1 represents NV Link TX Throughput for Lane 1 DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L1 Short = 826 // DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L2 represents NV Link TX Throughput for Lane 2 DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L2 Short = 827 // DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L3 represents NV Link TX Throughput for Lane 3 DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L3 Short = 828 // DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L4 represents NV Link TX Throughput for Lane 4 DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L4 Short = 829 // DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L5 represents NV Link TX Throughput for Lane 5 DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L5 Short = 830 // DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L6 represents NV Link TX Throughput for Lane 6 DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L6 Short = 831 // DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L7 represents NV Link TX Throughput for Lane 7 DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L7 Short = 832 // DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L8 represents NV Link TX Throughput for Lane 8 DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L8 Short = 833 
// DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L9 represents NV Link TX Throughput for Lane 9 DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L9 Short = 834 // DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L10 represents NV Link TX Throughput for Lane 10 DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L10 Short = 835 // DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L11 represents NV Link TX Throughput for Lane 11 DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L11 Short = 836 // DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L12 represents NV Link TX Throughput for Lane 12 DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L12 Short = 837 // DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L13 represents NV Link TX Throughput for Lane 13 DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L13 Short = 838 // DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L14 represents NV Link TX Throughput for Lane 14 DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L14 Short = 839 // DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L15 represents NV Link TX Throughput for Lane 15 DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L15 Short = 840 // DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L16 represents NV Link TX Throughput for Lane 16 DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L16 Short = 841 // DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L17 represents NV Link TX Throughput for Lane 17 DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_L17 Short = 842 // DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_TOTAL represents NV Link Throughput total for all TX Lanes DCGM_FI_DEV_NVLINK_TX_THROUGHPUT_TOTAL Short = 843 // DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS represents Note: value field indicates the specific SXid reported DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS Short = 856 // DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS represents Note: value field indicates the specific SXid reported DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS Short = 857 // DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT represents NVSwitch current temperature. DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT Short = 858 // DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SLOWDOWN represents NVSwitch limit slowdown temperature. 
DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SLOWDOWN Short = 859 // DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SHUTDOWN represents NVSwitch limit shutdown temperature. DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SHUTDOWN Short = 860 // DCGM_FI_DEV_NVSWITCH_THROUGHPUT_TX represents NVSwitch throughput Tx. DCGM_FI_DEV_NVSWITCH_THROUGHPUT_TX Short = 861 // DCGM_FI_DEV_NVSWITCH_THROUGHPUT_RX represents NVSwitch throughput Rx. DCGM_FI_DEV_NVSWITCH_THROUGHPUT_RX Short = 862 // DCGM_FI_DEV_NVSWITCH_PHYS_ID represents NVSwitch Physical ID. DCGM_FI_DEV_NVSWITCH_PHYS_ID Short = 863 // DCGM_FI_DEV_NVSWITCH_RESET_REQUIRED represents NVSwitch reset required. DCGM_FI_DEV_NVSWITCH_RESET_REQUIRED Short = 864 // DCGM_FI_DEV_NVSWITCH_LINK_ID represents NvSwitch NvLink ID DCGM_FI_DEV_NVSWITCH_LINK_ID Short = 865 // DCGM_FI_DEV_NVSWITCH_PCIE_DOMAIN represents NvSwitch PCIE domain DCGM_FI_DEV_NVSWITCH_PCIE_DOMAIN Short = 866 // DCGM_FI_DEV_NVSWITCH_PCIE_BUS represents NvSwitch PCIE bus DCGM_FI_DEV_NVSWITCH_PCIE_BUS Short = 867 // DCGM_FI_DEV_NVSWITCH_PCIE_DEVICE represents NvSwitch PCIE device DCGM_FI_DEV_NVSWITCH_PCIE_DEVICE Short = 868 // DCGM_FI_DEV_NVSWITCH_PCIE_FUNCTION represents NvSwitch PCIE function DCGM_FI_DEV_NVSWITCH_PCIE_FUNCTION Short = 869 // DCGM_FI_DEV_NVSWITCH_LINK_STATUS represents NvLink status. UNKNOWN:-1 OFF:0 SAFE:1 ACTIVE:2 ERROR:3 DCGM_FI_DEV_NVSWITCH_LINK_STATUS Short = 870 // DCGM_FI_DEV_NVSWITCH_LINK_TYPE represents NvLink device type (NSCQ: GPU=1, Switch=2; NVSDM: CA=1, Switch=2, GPU=5) DCGM_FI_DEV_NVSWITCH_LINK_TYPE Short = 871 // DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DOMAIN represents NvLink device pcie domain. DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DOMAIN Short = 872 // DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_BUS represents NvLink device pcie bus. DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_BUS Short = 873 // DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DEVICE represents NvLink device pcie device. 
DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DEVICE Short = 874 // DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_FUNCTION represents NvLink device pcie function. DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_FUNCTION Short = 875 // DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_ID represents NvLink device link ID DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_ID Short = 876 // DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_SID represents NvLink device SID. DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_SID Short = 877 // DCGM_FI_DEV_NVSWITCH_DEVICE_UUID represents NvLink device switch/link uid. DCGM_FI_DEV_NVSWITCH_DEVICE_UUID Short = 878 // DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L0 represents NV Link RX Throughput for Lane 0 DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L0 Short = 879 // DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L1 represents NV Link RX Throughput for Lane 1 DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L1 Short = 880 // DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L2 represents NV Link RX Throughput for Lane 2 DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L2 Short = 881 // DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L3 represents NV Link RX Throughput for Lane 3 DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L3 Short = 882 // DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L4 represents NV Link RX Throughput for Lane 4 DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L4 Short = 883 // DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L5 represents NV Link RX Throughput for Lane 5 DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L5 Short = 884 // DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L6 represents NV Link RX Throughput for Lane 6 DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L6 Short = 885 // DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L7 represents NV Link RX Throughput for Lane 7 DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L7 Short = 886 // DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L8 represents NV Link RX Throughput for Lane 8 DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L8 Short = 887 // DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L9 represents NV Link RX Throughput for Lane 9 DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L9 Short = 888 // DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L10 represents NV Link RX Throughput for Lane 10 
DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L10 Short = 889 // DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L11 represents NV Link RX Throughput for Lane 11 DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L11 Short = 890 // DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L12 represents NV Link RX Throughput for Lane 12 DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L12 Short = 891 // DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L13 represents NV Link RX Throughput for Lane 13 DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L13 Short = 892 // DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L14 represents NV Link RX Throughput for Lane 14 DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L14 Short = 893 // DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L15 represents NV Link RX Throughput for Lane 15 DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L15 Short = 894 // DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L16 represents NV Link RX Throughput for Lane 16 DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L16 Short = 895 // DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L17 represents NV Link RX Throughput for Lane 17 DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_L17 Short = 896 // DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_TOTAL represents NV Link Throughput total for all RX Lanes DCGM_FI_DEV_NVLINK_RX_THROUGHPUT_TOTAL Short = 897 // DCGM_FI_LAST_NVSWITCH_FIELD_ID represents Last field ID of the NVSwitch instance DCGM_FI_LAST_NVSWITCH_FIELD_ID Short = 899 // DCGM_FI_PROF_GR_ENGINE_ACTIVE represents compute pipe is busy. DCGM_FI_PROF_GR_ENGINE_ACTIVE Short = 1001 // DCGM_FI_PROF_SM_ACTIVE represents (computed from the number of cycles and elapsed cycles) DCGM_FI_PROF_SM_ACTIVE Short = 1002 // DCGM_FI_PROF_SM_OCCUPANCY represents maximum number of warps per elapsed cycle) DCGM_FI_PROF_SM_OCCUPANCY Short = 1003 // DCGM_FI_PROF_PIPE_TENSOR_ACTIVE represents (off the peak sustained elapsed cycles) DCGM_FI_PROF_PIPE_TENSOR_ACTIVE Short = 1004 // DCGM_FI_PROF_DRAM_ACTIVE represents active sending or receiving data. DCGM_FI_PROF_DRAM_ACTIVE Short = 1005 // DCGM_FI_PROF_PIPE_FP64_ACTIVE represents Ratio of cycles the fp64 pipe is active. 
DCGM_FI_PROF_PIPE_FP64_ACTIVE Short = 1006 // DCGM_FI_PROF_PIPE_FP32_ACTIVE represents Ratio of cycles the fp32 pipe is active. DCGM_FI_PROF_PIPE_FP32_ACTIVE Short = 1007 // DCGM_FI_PROF_PIPE_FP16_ACTIVE represents Ratio of cycles the fp16 pipe is active. This does not include HMMA. DCGM_FI_PROF_PIPE_FP16_ACTIVE Short = 1008 // DCGM_FI_PROF_PCIE_TX_BYTES represents would be reflected in this metric. DCGM_FI_PROF_PCIE_TX_BYTES Short = 1009 // DCGM_FI_PROF_PCIE_RX_BYTES represents would be reflected in this metric. DCGM_FI_PROF_PCIE_RX_BYTES Short = 1010 // DCGM_FI_PROF_NVLINK_TX_BYTES represents Per-link fields are available below DCGM_FI_PROF_NVLINK_TX_BYTES Short = 1011 // DCGM_FI_PROF_NVLINK_RX_BYTES represents Per-link fields are available below DCGM_FI_PROF_NVLINK_RX_BYTES Short = 1012 // DCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE represents The ratio of cycles the tensor (IMMA) pipe is active (off the peak sustained elapsed cycles) DCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE Short = 1013 // DCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE represents The ratio of cycles the tensor (HMMA) pipe is active (off the peak sustained elapsed cycles) DCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE Short = 1014 // DCGM_FI_PROF_PIPE_TENSOR_DFMA_ACTIVE represents The ratio of cycles the tensor (DFMA) pipe is active (off the peak sustained elapsed cycles) DCGM_FI_PROF_PIPE_TENSOR_DFMA_ACTIVE Short = 1015 // DCGM_FI_PROF_PIPE_INT_ACTIVE represents Ratio of cycles the integer pipe is active. DCGM_FI_PROF_PIPE_INT_ACTIVE Short = 1016 // DCGM_FI_PROF_NVDEC0_ACTIVE represents Ratio of cycles each of the NVDEC engines are active. 
DCGM_FI_PROF_NVDEC0_ACTIVE Short = 1017 // DCGM_FI_PROF_NVDEC1_ACTIVE DCGM_FI_PROF_NVDEC1_ACTIVE Short = 1018 // DCGM_FI_PROF_NVDEC2_ACTIVE DCGM_FI_PROF_NVDEC2_ACTIVE Short = 1019 // DCGM_FI_PROF_NVDEC3_ACTIVE DCGM_FI_PROF_NVDEC3_ACTIVE Short = 1020 // DCGM_FI_PROF_NVDEC4_ACTIVE DCGM_FI_PROF_NVDEC4_ACTIVE Short = 1021 // DCGM_FI_PROF_NVDEC5_ACTIVE DCGM_FI_PROF_NVDEC5_ACTIVE Short = 1022 // DCGM_FI_PROF_NVDEC6_ACTIVE DCGM_FI_PROF_NVDEC6_ACTIVE Short = 1023 // DCGM_FI_PROF_NVDEC7_ACTIVE DCGM_FI_PROF_NVDEC7_ACTIVE Short = 1024 // DCGM_FI_PROF_NVJPG0_ACTIVE represents Ratio of cycles each of the NVJPG engines are active. DCGM_FI_PROF_NVJPG0_ACTIVE Short = 1025 // DCGM_FI_PROF_NVJPG1_ACTIVE DCGM_FI_PROF_NVJPG1_ACTIVE Short = 1026 // DCGM_FI_PROF_NVJPG2_ACTIVE DCGM_FI_PROF_NVJPG2_ACTIVE Short = 1027 // DCGM_FI_PROF_NVJPG3_ACTIVE DCGM_FI_PROF_NVJPG3_ACTIVE Short = 1028 // DCGM_FI_PROF_NVJPG4_ACTIVE DCGM_FI_PROF_NVJPG4_ACTIVE Short = 1029 // DCGM_FI_PROF_NVJPG5_ACTIVE DCGM_FI_PROF_NVJPG5_ACTIVE Short = 1030 // DCGM_FI_PROF_NVJPG6_ACTIVE DCGM_FI_PROF_NVJPG6_ACTIVE Short = 1031 // DCGM_FI_PROF_NVJPG7_ACTIVE DCGM_FI_PROF_NVJPG7_ACTIVE Short = 1032 // DCGM_FI_PROF_NVOFA0_ACTIVE represents Ratio of cycles each of the NVOFA engines are active. 
DCGM_FI_PROF_NVOFA0_ACTIVE Short = 1033 // DCGM_FI_PROF_NVOFA1_ACTIVE DCGM_FI_PROF_NVOFA1_ACTIVE Short = 1034 // DCGM_FI_PROF_NVLINK_L0_TX_BYTES represents total = DCGM_FI_PROF_NVLINK_L0_TX_BYTES + DCGM_FI_PROF_NVLINK_L0_RX_BYTES DCGM_FI_PROF_NVLINK_L0_TX_BYTES Short = 1040 // DCGM_FI_PROF_NVLINK_L0_RX_BYTES DCGM_FI_PROF_NVLINK_L0_RX_BYTES Short = 1041 // DCGM_FI_PROF_NVLINK_L1_TX_BYTES DCGM_FI_PROF_NVLINK_L1_TX_BYTES Short = 1042 // DCGM_FI_PROF_NVLINK_L1_RX_BYTES DCGM_FI_PROF_NVLINK_L1_RX_BYTES Short = 1043 // DCGM_FI_PROF_NVLINK_L2_TX_BYTES DCGM_FI_PROF_NVLINK_L2_TX_BYTES Short = 1044 // DCGM_FI_PROF_NVLINK_L2_RX_BYTES DCGM_FI_PROF_NVLINK_L2_RX_BYTES Short = 1045 // DCGM_FI_PROF_NVLINK_L3_TX_BYTES DCGM_FI_PROF_NVLINK_L3_TX_BYTES Short = 1046 // DCGM_FI_PROF_NVLINK_L3_RX_BYTES DCGM_FI_PROF_NVLINK_L3_RX_BYTES Short = 1047 // DCGM_FI_PROF_NVLINK_L4_TX_BYTES DCGM_FI_PROF_NVLINK_L4_TX_BYTES Short = 1048 // DCGM_FI_PROF_NVLINK_L4_RX_BYTES DCGM_FI_PROF_NVLINK_L4_RX_BYTES Short = 1049 // DCGM_FI_PROF_NVLINK_L5_TX_BYTES DCGM_FI_PROF_NVLINK_L5_TX_BYTES Short = 1050 // DCGM_FI_PROF_NVLINK_L5_RX_BYTES DCGM_FI_PROF_NVLINK_L5_RX_BYTES Short = 1051 // DCGM_FI_PROF_NVLINK_L6_TX_BYTES DCGM_FI_PROF_NVLINK_L6_TX_BYTES Short = 1052 // DCGM_FI_PROF_NVLINK_L6_RX_BYTES DCGM_FI_PROF_NVLINK_L6_RX_BYTES Short = 1053 // DCGM_FI_PROF_NVLINK_L7_TX_BYTES DCGM_FI_PROF_NVLINK_L7_TX_BYTES Short = 1054 // DCGM_FI_PROF_NVLINK_L7_RX_BYTES DCGM_FI_PROF_NVLINK_L7_RX_BYTES Short = 1055 // DCGM_FI_PROF_NVLINK_L8_TX_BYTES DCGM_FI_PROF_NVLINK_L8_TX_BYTES Short = 1056 // DCGM_FI_PROF_NVLINK_L8_RX_BYTES DCGM_FI_PROF_NVLINK_L8_RX_BYTES Short = 1057 // DCGM_FI_PROF_NVLINK_L9_TX_BYTES DCGM_FI_PROF_NVLINK_L9_TX_BYTES Short = 1058 // DCGM_FI_PROF_NVLINK_L9_RX_BYTES DCGM_FI_PROF_NVLINK_L9_RX_BYTES Short = 1059 // DCGM_FI_PROF_NVLINK_L10_TX_BYTES DCGM_FI_PROF_NVLINK_L10_TX_BYTES Short = 1060 // DCGM_FI_PROF_NVLINK_L10_RX_BYTES DCGM_FI_PROF_NVLINK_L10_RX_BYTES Short = 1061 // DCGM_FI_PROF_NVLINK_L11_TX_BYTES 
DCGM_FI_PROF_NVLINK_L11_TX_BYTES Short = 1062 // DCGM_FI_PROF_NVLINK_L11_RX_BYTES DCGM_FI_PROF_NVLINK_L11_RX_BYTES Short = 1063 // DCGM_FI_PROF_NVLINK_L12_TX_BYTES DCGM_FI_PROF_NVLINK_L12_TX_BYTES Short = 1064 // DCGM_FI_PROF_NVLINK_L12_RX_BYTES DCGM_FI_PROF_NVLINK_L12_RX_BYTES Short = 1065 // DCGM_FI_PROF_NVLINK_L13_TX_BYTES DCGM_FI_PROF_NVLINK_L13_TX_BYTES Short = 1066 // DCGM_FI_PROF_NVLINK_L13_RX_BYTES DCGM_FI_PROF_NVLINK_L13_RX_BYTES Short = 1067 // DCGM_FI_PROF_NVLINK_L14_TX_BYTES DCGM_FI_PROF_NVLINK_L14_TX_BYTES Short = 1068 // DCGM_FI_PROF_NVLINK_L14_RX_BYTES DCGM_FI_PROF_NVLINK_L14_RX_BYTES Short = 1069 // DCGM_FI_PROF_NVLINK_L15_TX_BYTES DCGM_FI_PROF_NVLINK_L15_TX_BYTES Short = 1070 // DCGM_FI_PROF_NVLINK_L15_RX_BYTES DCGM_FI_PROF_NVLINK_L15_RX_BYTES Short = 1071 // DCGM_FI_PROF_NVLINK_L16_TX_BYTES DCGM_FI_PROF_NVLINK_L16_TX_BYTES Short = 1072 // DCGM_FI_PROF_NVLINK_L16_RX_BYTES DCGM_FI_PROF_NVLINK_L16_RX_BYTES Short = 1073 // DCGM_FI_PROF_NVLINK_L17_TX_BYTES DCGM_FI_PROF_NVLINK_L17_TX_BYTES Short = 1074 // DCGM_FI_PROF_NVLINK_L17_RX_BYTES DCGM_FI_PROF_NVLINK_L17_RX_BYTES Short = 1075 // DCGM_FI_PROF_C2C_TX_ALL_BYTES represents The total number of bytes transmitted over the C2C (Chip-to-Chip) interface, including both header and payload data DCGM_FI_PROF_C2C_TX_ALL_BYTES Short = 1076 // DCGM_FI_PROF_C2C_TX_DATA_BYTES represents The number of data-only bytes transmitted over the C2C (Chip-to-Chip) interface DCGM_FI_PROF_C2C_TX_DATA_BYTES Short = 1077 // DCGM_FI_PROF_C2C_RX_ALL_BYTES represents The total number of bytes received over the C2C (Chip-to-Chip) interface, including both header and payload data DCGM_FI_PROF_C2C_RX_ALL_BYTES Short = 1078 // DCGM_FI_PROF_C2C_RX_DATA_BYTES represents The number of data-only bytes received over the C2C (Chip-to-Chip) interface DCGM_FI_PROF_C2C_RX_DATA_BYTES Short = 1079 // DCGM_FI_PROF_HOSTMEM_CACHE_HIT represents Percentage of requests to Host Memory that were served from cache DCGM_FI_PROF_HOSTMEM_CACHE_HIT Short = 
1080 // DCGM_FI_PROF_HOSTMEM_CACHE_MISS represents Percentage of requests to Host Memory that were cache misses DCGM_FI_PROF_HOSTMEM_CACHE_MISS Short = 1081 // DCGM_FI_PROF_PEERMEM_CACHE_HIT represents Percentage of requests to Peer Memory that were served from cache DCGM_FI_PROF_PEERMEM_CACHE_HIT Short = 1082 // DCGM_FI_PROF_PEERMEM_CACHE_MISS represents Percentage of requests to Peer Memory that were cache misses DCGM_FI_PROF_PEERMEM_CACHE_MISS Short = 1083 // DCGM_FI_DEV_CPU_UTIL_TOTAL represents CPU Utilization, total DCGM_FI_DEV_CPU_UTIL_TOTAL Short = 1100 // DCGM_FI_DEV_CPU_UTIL_USER represents CPU Utilization, user DCGM_FI_DEV_CPU_UTIL_USER Short = 1101 // DCGM_FI_DEV_CPU_UTIL_NICE represents CPU Utilization, nice DCGM_FI_DEV_CPU_UTIL_NICE Short = 1102 // DCGM_FI_DEV_CPU_UTIL_SYS represents CPU Utilization, system time DCGM_FI_DEV_CPU_UTIL_SYS Short = 1103 // DCGM_FI_DEV_CPU_UTIL_IRQ represents CPU Utilization, interrupt servicing DCGM_FI_DEV_CPU_UTIL_IRQ Short = 1104 // DCGM_FI_DEV_CPU_TEMP_CURRENT represents CPU temperature DCGM_FI_DEV_CPU_TEMP_CURRENT Short = 1110 // DCGM_FI_DEV_CPU_TEMP_WARNING represents CPU Warning Temperature DCGM_FI_DEV_CPU_TEMP_WARNING Short = 1111 // DCGM_FI_DEV_CPU_TEMP_CRITICAL represents CPU Critical Temperature DCGM_FI_DEV_CPU_TEMP_CRITICAL Short = 1112 // DCGM_FI_DEV_CPU_CLOCK_CURRENT represents CPU instantaneous clock speed DCGM_FI_DEV_CPU_CLOCK_CURRENT Short = 1120 // DCGM_FI_DEV_CPU_POWER_UTIL_CURRENT represents CPU power utilization DCGM_FI_DEV_CPU_POWER_UTIL_CURRENT Short = 1130 // DCGM_FI_DEV_CPU_POWER_LIMIT represents CPU power limit DCGM_FI_DEV_CPU_POWER_LIMIT Short = 1131 // DCGM_FI_DEV_SYSIO_POWER_UTIL_CURRENT represents SoC power utilization DCGM_FI_DEV_SYSIO_POWER_UTIL_CURRENT Short = 1132 // DCGM_FI_DEV_MODULE_POWER_UTIL_CURRENT represents Module power utilization DCGM_FI_DEV_MODULE_POWER_UTIL_CURRENT Short = 1133 // DCGM_FI_DEV_CPU_VENDOR represents CPU vendor name DCGM_FI_DEV_CPU_VENDOR Short = 1140 // 
DCGM_FI_DEV_CPU_MODEL represents CPU model name DCGM_FI_DEV_CPU_MODEL Short = 1141 // DCGM_FI_DEV_NVLINK_COUNT_TX_PACKETS represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. DCGM_FI_DEV_NVLINK_COUNT_TX_PACKETS Short = 1200 // DCGM_FI_DEV_NVLINK_COUNT_TX_BYTES represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. DCGM_FI_DEV_NVLINK_COUNT_TX_BYTES Short = 1201 // DCGM_FI_DEV_NVLINK_COUNT_RX_PACKETS represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. DCGM_FI_DEV_NVLINK_COUNT_RX_PACKETS Short = 1202 // DCGM_FI_DEV_NVLINK_COUNT_RX_BYTES represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. DCGM_FI_DEV_NVLINK_COUNT_RX_BYTES Short = 1203 // DCGM_FI_DEV_NVLINK_COUNT_RX_MALFORMED_PACKET_ERRORS represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. DCGM_FI_DEV_NVLINK_COUNT_RX_MALFORMED_PACKET_ERRORS Short = 1204 // DCGM_FI_DEV_NVLINK_COUNT_RX_BUFFER_OVERRUN_ERRORS represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. DCGM_FI_DEV_NVLINK_COUNT_RX_BUFFER_OVERRUN_ERRORS Short = 1205 // DCGM_FI_DEV_NVLINK_COUNT_RX_ERRORS represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. DCGM_FI_DEV_NVLINK_COUNT_RX_ERRORS Short = 1206 // DCGM_FI_DEV_NVLINK_COUNT_RX_REMOTE_ERRORS represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. DCGM_FI_DEV_NVLINK_COUNT_RX_REMOTE_ERRORS Short = 1207 // DCGM_FI_DEV_NVLINK_COUNT_RX_GENERAL_ERRORS represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. 
DCGM_FI_DEV_NVLINK_COUNT_RX_GENERAL_ERRORS Short = 1208 // DCGM_FI_DEV_NVLINK_COUNT_LOCAL_LINK_INTEGRITY_ERRORS represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. DCGM_FI_DEV_NVLINK_COUNT_LOCAL_LINK_INTEGRITY_ERRORS Short = 1209 // DCGM_FI_DEV_NVLINK_COUNT_TX_DISCARDS represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. DCGM_FI_DEV_NVLINK_COUNT_TX_DISCARDS Short = 1210 // DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_SUCCESSFUL_EVENTS represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_SUCCESSFUL_EVENTS Short = 1211 // DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_FAILED_EVENTS represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_FAILED_EVENTS Short = 1212 // DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_EVENTS represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_EVENTS Short = 1213 // DCGM_FI_DEV_NVLINK_COUNT_RX_SYMBOL_ERRORS represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. DCGM_FI_DEV_NVLINK_COUNT_RX_SYMBOL_ERRORS Short = 1214 // DCGM_FI_DEV_NVLINK_COUNT_SYMBOL_BER represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. DCGM_FI_DEV_NVLINK_COUNT_SYMBOL_BER Short = 1215 // DCGM_FI_DEV_NVLINK_COUNT_SYMBOL_BER_FLOAT represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. DCGM_FI_DEV_NVLINK_COUNT_SYMBOL_BER_FLOAT Short = 1216 // DCGM_FI_DEV_NVLINK_COUNT_EFFECTIVE_BER represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. 
DCGM_FI_DEV_NVLINK_COUNT_EFFECTIVE_BER Short = 1217 // DCGM_FI_DEV_NVLINK_COUNT_EFFECTIVE_BER_FLOAT represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. DCGM_FI_DEV_NVLINK_COUNT_EFFECTIVE_BER_FLOAT Short = 1218 // DCGM_FI_DEV_NVLINK_COUNT_EFFECTIVE_ERRORS represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. DCGM_FI_DEV_NVLINK_COUNT_EFFECTIVE_ERRORS Short = 1219 // DCGM_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_TOTAL represents NVLink ECC Data Error Counter total for all Links DCGM_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_TOTAL Short = 1220 // DCGM_FI_DEV_FIRST_CONNECTX_FIELD_ID represents First field id of ConnectX DCGM_FI_DEV_FIRST_CONNECTX_FIELD_ID Short = 1300 // DCGM_FI_DEV_CONNECTX_HEALTH represents Health state of ConnectX DCGM_FI_DEV_CONNECTX_HEALTH Short = 1300 // DCGM_FI_DEV_CONNECTX_ACTIVE_PCIE_LINK_WIDTH represents Active PCIe link width DCGM_FI_DEV_CONNECTX_ACTIVE_PCIE_LINK_WIDTH Short = 1301 // DCGM_FI_DEV_CONNECTX_ACTIVE_PCIE_LINK_SPEED represents Active PCIe link speed DCGM_FI_DEV_CONNECTX_ACTIVE_PCIE_LINK_SPEED Short = 1302 // DCGM_FI_DEV_CONNECTX_EXPECT_PCIE_LINK_WIDTH represents Expect PCIe link width DCGM_FI_DEV_CONNECTX_EXPECT_PCIE_LINK_WIDTH Short = 1303 // DCGM_FI_DEV_CONNECTX_EXPECT_PCIE_LINK_SPEED represents Expect PCIe link speed DCGM_FI_DEV_CONNECTX_EXPECT_PCIE_LINK_SPEED Short = 1304 // DCGM_FI_DEV_CONNECTX_CORRECTABLE_ERR_STATUS represents Correctable error status DCGM_FI_DEV_CONNECTX_CORRECTABLE_ERR_STATUS Short = 1305 // DCGM_FI_DEV_CONNECTX_CORRECTABLE_ERR_MASK represents Correctable error mask DCGM_FI_DEV_CONNECTX_CORRECTABLE_ERR_MASK Short = 1306 // DCGM_FI_DEV_CONNECTX_UNCORRECTABLE_ERR_STATUS represents Uncorrectable error status DCGM_FI_DEV_CONNECTX_UNCORRECTABLE_ERR_STATUS Short = 1307 // DCGM_FI_DEV_CONNECTX_UNCORRECTABLE_ERR_MASK represents Uncorrectable error mask DCGM_FI_DEV_CONNECTX_UNCORRECTABLE_ERR_MASK Short = 1308 // 
DCGM_FI_DEV_CONNECTX_UNCORRECTABLE_ERR_SEVERITY represents Uncorrectable error severity DCGM_FI_DEV_CONNECTX_UNCORRECTABLE_ERR_SEVERITY Short = 1309 // DCGM_FI_DEV_CONNECTX_DEVICE_TEMPERATURE represents Device temperature DCGM_FI_DEV_CONNECTX_DEVICE_TEMPERATURE Short = 1310 // DCGM_FI_DEV_LAST_CONNECTX_FIELD_ID represents The last field id of ConnectX DCGM_FI_DEV_LAST_CONNECTX_FIELD_ID Short = 1399 // DCGM_FI_DEV_C2C_LINK_ERROR_INTR represents C2C Link CRC Error Counter DCGM_FI_DEV_C2C_LINK_ERROR_INTR Short = 1400 // DCGM_FI_DEV_C2C_LINK_ERROR_REPLAY represents C2C Link Replay Error Counter DCGM_FI_DEV_C2C_LINK_ERROR_REPLAY Short = 1401 // DCGM_FI_DEV_C2C_LINK_ERROR_REPLAY_B2B represents C2C Link Back to Back Replay Error Counter DCGM_FI_DEV_C2C_LINK_ERROR_REPLAY_B2B Short = 1402 // DCGM_FI_DEV_C2C_LINK_POWER_STATE represents C2C Link Power state. See NVML_C2C_POWER_STATE_* DCGM_FI_DEV_C2C_LINK_POWER_STATE Short = 1403 // DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_0 represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_0 Short = 1404 // DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_1 represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_1 Short = 1405 // DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_2 represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_2 Short = 1406 // DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_3 represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_3 Short = 1407 // DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_4 represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. 
DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_4 Short = 1408 // DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_5 represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_5 Short = 1409 // DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_6 represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_6 Short = 1410 // DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_7 represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_7 Short = 1411 // DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_8 represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_8 Short = 1412 // DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_9 represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_9 Short = 1413 // DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_10 represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_10 Short = 1414 // DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_11 represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_11 Short = 1415 // DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_12 represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_12 Short = 1416 // DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_13 represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_13 Short = 1417 // DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_14 represents Note: NVLink5+ only. 
Returns aggregate value across all links. Not supported on NVLink4 and earlier. DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_14 Short = 1418 // DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_15 represents Note: NVLink5+ only. Returns aggregate value across all links. Not supported on NVLink4 and earlier. DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_15 Short = 1419 // DCGM_FI_DEV_CLOCKS_EVENT_REASON_SW_POWER_CAP_NS represents Throttling to not exceed currently set power limits in ns DCGM_FI_DEV_CLOCKS_EVENT_REASON_SW_POWER_CAP_NS Short = 1420 // DCGM_FI_DEV_CLOCKS_EVENT_REASON_SYNC_BOOST_NS represents Boost Group in ns DCGM_FI_DEV_CLOCKS_EVENT_REASON_SYNC_BOOST_NS Short = 1421 // DCGM_FI_DEV_CLOCKS_EVENT_REASON_SW_THERM_SLOWDOWN_NS represents (Memory Temp < Memory Max Operating Temp)) in ns DCGM_FI_DEV_CLOCKS_EVENT_REASON_SW_THERM_SLOWDOWN_NS Short = 1422 // DCGM_FI_DEV_CLOCKS_EVENT_REASON_HW_THERM_SLOWDOWN_NS represents clocks by a factor of 2 or more) in ns DCGM_FI_DEV_CLOCKS_EVENT_REASON_HW_THERM_SLOWDOWN_NS Short = 1423 // DCGM_FI_DEV_CLOCKS_EVENT_REASON_HW_POWER_BRAKE_SLOWDOWN_NS represents (reducing core clocks by a factor of 2 or more) in ns DCGM_FI_DEV_CLOCKS_EVENT_REASON_HW_POWER_BRAKE_SLOWDOWN_NS Short = 1424 // DCGM_FI_DEV_PWR_SMOOTHING_ENABLED represents either level 1 or level 2 (e.g. via Redfish API) DCGM_FI_DEV_PWR_SMOOTHING_ENABLED Short = 1425 // DCGM_FI_DEV_PWR_SMOOTHING_PRIV_LVL represents either level 1 or level 2 (e.g. via Redfish API) DCGM_FI_DEV_PWR_SMOOTHING_PRIV_LVL Short = 1426 // DCGM_FI_DEV_PWR_SMOOTHING_IMM_RAMP_DOWN_ENABLED represents either level 1 or level 2 (e.g. via Redfish API) DCGM_FI_DEV_PWR_SMOOTHING_IMM_RAMP_DOWN_ENABLED Short = 1427 // DCGM_FI_DEV_PWR_SMOOTHING_APPLIED_TMP_CEIL represents either level 1 or level 2 (e.g. via Redfish API) DCGM_FI_DEV_PWR_SMOOTHING_APPLIED_TMP_CEIL Short = 1428 // DCGM_FI_DEV_PWR_SMOOTHING_APPLIED_TMP_FLOOR represents either level 1 or level 2 (e.g. 
via Redfish API) DCGM_FI_DEV_PWR_SMOOTHING_APPLIED_TMP_FLOOR Short = 1429 // DCGM_FI_DEV_PWR_SMOOTHING_MAX_PERCENT_TMP_FLOOR_SETTING represents either level 1 or level 2 (e.g. via Redfish API) DCGM_FI_DEV_PWR_SMOOTHING_MAX_PERCENT_TMP_FLOOR_SETTING Short = 1430 // DCGM_FI_DEV_PWR_SMOOTHING_MIN_PERCENT_TMP_FLOOR_SETTING represents either level 1 or level 2 (e.g. via Redfish API) DCGM_FI_DEV_PWR_SMOOTHING_MIN_PERCENT_TMP_FLOOR_SETTING Short = 1431 // DCGM_FI_DEV_PWR_SMOOTHING_HW_CIRCUITRY_PERCENT_LIFETIME_REMAINING represents either level 1 or level 2 (e.g. via Redfish API) DCGM_FI_DEV_PWR_SMOOTHING_HW_CIRCUITRY_PERCENT_LIFETIME_REMAINING Short = 1432 // DCGM_FI_DEV_PWR_SMOOTHING_MAX_NUM_PRESET_PROFILES represents either level 1 or level 2 (e.g. via Redfish API) DCGM_FI_DEV_PWR_SMOOTHING_MAX_NUM_PRESET_PROFILES Short = 1433 // DCGM_FI_DEV_PWR_SMOOTHING_PROFILE_PERCENT_TMP_FLOOR represents either level 1 or level 2 (e.g. via Redfish API) DCGM_FI_DEV_PWR_SMOOTHING_PROFILE_PERCENT_TMP_FLOOR Short = 1434 // DCGM_FI_DEV_PWR_SMOOTHING_PROFILE_RAMP_UP_RATE represents either level 1 or level 2 (e.g. via Redfish API) DCGM_FI_DEV_PWR_SMOOTHING_PROFILE_RAMP_UP_RATE Short = 1435 // DCGM_FI_DEV_PWR_SMOOTHING_PROFILE_RAMP_DOWN_RATE represents either level 1 or level 2 (e.g. via Redfish API) DCGM_FI_DEV_PWR_SMOOTHING_PROFILE_RAMP_DOWN_RATE Short = 1436 // DCGM_FI_DEV_PWR_SMOOTHING_PROFILE_RAMP_DOWN_HYST_VAL represents either level 1 or level 2 (e.g. via Redfish API) DCGM_FI_DEV_PWR_SMOOTHING_PROFILE_RAMP_DOWN_HYST_VAL Short = 1437 // DCGM_FI_DEV_PWR_SMOOTHING_ACTIVE_PRESET_PROFILE represents either level 1 or level 2 (e.g. via Redfish API) DCGM_FI_DEV_PWR_SMOOTHING_ACTIVE_PRESET_PROFILE Short = 1438 // DCGM_FI_DEV_PWR_SMOOTHING_ADMIN_OVERRIDE_PERCENT_TMP_FLOOR represents either level 1 or level 2 (e.g. 
via Redfish API) DCGM_FI_DEV_PWR_SMOOTHING_ADMIN_OVERRIDE_PERCENT_TMP_FLOOR Short = 1439 // DCGM_FI_DEV_PWR_SMOOTHING_ADMIN_OVERRIDE_RAMP_UP_RATE represents either level 1 or level 2 (e.g. via Redfish API) DCGM_FI_DEV_PWR_SMOOTHING_ADMIN_OVERRIDE_RAMP_UP_RATE Short = 1440 // DCGM_FI_DEV_PWR_SMOOTHING_ADMIN_OVERRIDE_RAMP_DOWN_RATE represents either level 1 or level 2 (e.g. via Redfish API) DCGM_FI_DEV_PWR_SMOOTHING_ADMIN_OVERRIDE_RAMP_DOWN_RATE Short = 1441 // DCGM_FI_DEV_PWR_SMOOTHING_ADMIN_OVERRIDE_RAMP_DOWN_HYST_VAL represents either level 1 or level 2 (e.g. via Redfish API) DCGM_FI_DEV_PWR_SMOOTHING_ADMIN_OVERRIDE_RAMP_DOWN_HYST_VAL Short = 1442 // DCGM_FI_DEV_PCIE_COUNT_CORRECTABLE_ERRORS represents PCIe Correctable Errors Counter DCGM_FI_DEV_PCIE_COUNT_CORRECTABLE_ERRORS Short = 1501 // DCGM_FI_IMEX_DOMAIN_STATUS represents Retrieved from nvidia-imex-ctl -N -j command DCGM_FI_IMEX_DOMAIN_STATUS Short = 1502 // DCGM_FI_IMEX_DAEMON_STATUS represents WAITING_FOR_RECOVERY=3, INIT_GPU=4, READY=5, SHUTTING_DOWN=6, UNAVAILABLE=7 DCGM_FI_IMEX_DAEMON_STATUS Short = 1503 // DCGM_FI_DEV_MEMORY_UNREPAIRABLE_FLAG represents 1=yes, 0=no DCGM_FI_DEV_MEMORY_UNREPAIRABLE_FLAG Short = 1507 // DCGM_FI_DEV_NVLINK_GET_STATE represents Use DCGM_FE_LINK entity group when accessing this field. DCGM_FI_DEV_NVLINK_GET_STATE Short = 1508 // DCGM_FI_DEV_NVLINK_PPCNT_IBPC_PORT_XMIT_WAIT represents Use DCGM_FE_LINK entity group when accessing this field. DCGM_FI_DEV_NVLINK_PPCNT_IBPC_PORT_XMIT_WAIT Short = 1509 // DCGM_FI_DEV_GET_GPU_RECOVERY_ACTION represents GPU Recovery Action (see nvmlDeviceGpuRecoveryAction_t for return values) DCGM_FI_DEV_GET_GPU_RECOVERY_ACTION Short = 1523 // DCGM_FI_DEV_NVSWITCH_FIRMWARE_VERSION represents NVSwitch firmware version string DCGM_FI_DEV_NVSWITCH_FIRMWARE_VERSION Short = 1524 )
func GetFieldID ¶
GetFieldID returns the DCGM field ID for a given field name and whether it was found. It first checks the current field IDs, then falls back to the legacy field IDs if the name is not found.
func GetFieldIDOrPanic ¶
GetFieldIDOrPanic returns the DCGM field ID for a given field name. It panics if the field name is not found in either the current or the legacy maps.
type Status ¶
// Status represents the current resource utilization of the DCGM hostengine process.
type Status struct {
// Memory represents the current memory usage of the DCGM hostengine in kilobytes
Memory int64
// CPU represents the current CPU utilization of the DCGM hostengine as a percentage (0-100)
CPU float64
}
Status represents the current resource utilization of the DCGM hostengine process
func Introspect ¶
Introspect returns memory and CPU usage statistics for the DCGM hostengine
type SystemWatch ¶
// SystemWatch represents a health watch system and its status.
type SystemWatch struct {
// Type identifies the type of health watch system
Type string
// Status indicates the current health status
Status string
// Error contains any error message if status is not healthy
Error string
}
SystemWatch represents a health watch system and its status
type ThermalPolicyCondition ¶
// ThermalPolicyCondition contains details about a thermal violation.
type ThermalPolicyCondition struct {
// ThermalViolation indicates the severity of the thermal violation
ThermalViolation uint
}
ThermalPolicyCondition contains details about a thermal violation
type UtilizationInfo ¶
// UtilizationInfo contains GPU utilization metrics. Every field is a
// percentage.
type UtilizationInfo struct {
GPU int64 // GPU utilization, in percent
Memory int64 // memory utilization, in percent
Encoder int64 // encoder utilization, in percent
Decoder int64 // decoder utilization, in percent
}
UtilizationInfo contains GPU utilization metrics
type VersionInfo ¶
// VersionInfo holds DCGM build environment information.
type VersionInfo struct {
// RawBuildInfoString contains key-value pairs (e.g. version, arch,
// buildid, commit) separated by semicolons; each pair is "key:value".
RawBuildInfoString string
}
VersionInfo holds DCGM build environment information. RawBuildInfoString contains key-value pairs (e.g. version, arch, buildid, commit) separated by semicolons; each pair is "key:value".
func GetHostengineVersionInfo ¶
func GetHostengineVersionInfo() (VersionInfo, error)
GetHostengineVersionInfo returns build environment information for the DCGM host engine. Requires an active connection (Init must have been called).
func GetVersionInfo ¶
func GetVersionInfo() (VersionInfo, error)
GetVersionInfo returns build environment information for the DCGM client library.
type ViolationTime ¶
// ViolationTime measures the amount of time (in ms) the GPU was at reduced
// clocks, broken down by the cause of the throttling.
//
// NOTE(review): every field is a pointer; presumably nil means the value was
// not available — confirm against the code that populates this struct.
type ViolationTime struct {
// Power is time spent throttling due to power constraints
Power *uint64
// Thermal is time spent throttling due to thermal constraints
Thermal *uint64
// Reliability is time spent throttling due to reliability constraints
Reliability *uint64
// BoardLimit is time spent throttling due to board limit constraints
BoardLimit *uint64
// LowUtilization is time spent throttling due to low utilization
LowUtilization *uint64
// SyncBoost is time spent throttling due to sync boost
SyncBoost *uint64
}
ViolationTime measures the amount of time (in ms) the GPU was at reduced clocks.
type XIDErrorInfo ¶
// XIDErrorInfo contains information about XID errors.
type XIDErrorInfo struct {
// NumErrors is the number of XID errors that occurred
NumErrors int
// Timestamp contains the timestamps of when XID errors occurred
// NOTE(review): the timestamp unit is not stated here — confirm against the DCGM field value docs.
Timestamp []uint64
}
XIDErrorInfo contains information about XID errors
type XidPolicyCondition ¶
// XidPolicyCondition contains details about an XID error.
type XidPolicyCondition struct {
// ErrNum is the XID error number
ErrNum uint
}
XidPolicyCondition contains details about an XID error
Source Files
¶
- admin.go
- api.go
- const.go
- const_fields.go
- cpu.go
- device_info.go
- device_status.go
- diag.go
- diag_test_helpers.go
- error.go
- field_values.go
- field_values_bench_helpers.go
- fields.go
- gpu_group.go
- health.go
- hostengine_status.go
- internal.go
- mig.go
- policy.go
- process_info.go
- profile.go
- structs.go
- test_utils.go
- topology.go
- utils.go
- version_info.go