dcgm

package
v0.0.0-...-ee85b5d Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Aug 2, 2021 License: Apache-2.0 Imports: 15 Imported by: 0

Documentation

Index

Constants

View Source
const (
	Embedded mode = iota
	Standalone
	StartHostengine
)

Constants for the DCGM hostengine running modes: Embedded, Standalone, or StartHostengine.

View Source
const (
	DCGM_FT_BINARY                 = uint('b')
	DCGM_FT_DOUBLE                 = uint('d')
	DCGM_FT_INT64                  = uint('i')
	DCGM_FT_STRING                 = uint('s')
	DCGM_FT_TIMESTAMP              = uint('t')
	DCGM_FT_INT32_BLANK            = int64(2147483632)
	DCGM_FT_INT32_NOT_FOUND        = int64(DCGM_FT_INT32_BLANK + 1)
	DCGM_FT_INT32_NOT_SUPPORTED    = int64(DCGM_FT_INT32_BLANK + 2)
	DCGM_FT_INT32_NOT_PERMISSIONED = int64(DCGM_FT_INT32_BLANK + 3)
	DCGM_FT_INT64_BLANK            = int64(9223372036854775792)
	DCGM_FT_INT64_NOT_FOUND        = int64(DCGM_FT_INT64_BLANK + 1)
	DCGM_FT_INT64_NOT_SUPPORTED    = int64(DCGM_FT_INT64_BLANK + 2)
	DCGM_FT_INT64_NOT_PERMISSIONED = int64(DCGM_FT_INT64_BLANK + 3)
	DCGM_FT_FP64_BLANK             = 140737488355328.0
	DCGM_FT_FP64_NOT_FOUND         = float64(DCGM_FT_FP64_BLANK + 1.0)
	DCGM_FT_FP64_NOT_SUPPORTED     = float64(DCGM_FT_FP64_BLANK + 2.0)
	DCGM_FT_FP64_NOT_PERMISSIONED  = float64(DCGM_FT_FP64_BLANK + 3.0)
	DCGM_FT_STR_BLANK              = "<<<NULL>>>"
	DCGM_FT_STR_NOT_FOUND          = "<<<NOT_FOUND>>>"
	DCGM_FT_STR_NOT_SUPPORTED      = "<<<NOT_SUPPORTED>>>"
	DCGM_FT_STR_NOT_PERMISSIONED   = "<<<NOT_PERM>>>"

	DCGM_FI_UNKNOWN                               = 0
	DCGM_FI_DRIVER_VERSION                        = 1
	DCGM_FI_NVML_VERSION                          = 2
	DCGM_FI_PROCESS_NAME                          = 3
	DCGM_FI_DEV_COUNT                             = 4
	DCGM_FI_DEV_NAME                              = 50
	DCGM_FI_DEV_BRAND                             = 51
	DCGM_FI_DEV_NVML_INDEX                        = 52
	DCGM_FI_DEV_SERIAL                            = 53
	DCGM_FI_DEV_UUID                              = 54
	DCGM_FI_DEV_MINOR_NUMBER                      = 55
	DCGM_FI_DEV_OEM_INFOROM_VER                   = 56
	DCGM_FI_DEV_PCI_BUSID                         = 57
	DCGM_FI_DEV_PCI_COMBINED_ID                   = 58
	DCGM_FI_DEV_PCI_SUBSYS_ID                     = 59
	DCGM_FI_GPU_TOPOLOGY_PCI                      = 60
	DCGM_FI_GPU_TOPOLOGY_NVLINK                   = 61
	DCGM_FI_GPU_TOPOLOGY_AFFINITY                 = 62
	DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY           = 63
	DCGM_FI_DEV_COMPUTE_MODE                      = 65
	DCGM_FI_DEV_CPU_AFFINITY_0                    = 70
	DCGM_FI_DEV_CPU_AFFINITY_1                    = 71
	DCGM_FI_DEV_CPU_AFFINITY_2                    = 72
	DCGM_FI_DEV_CPU_AFFINITY_3                    = 73
	DCGM_FI_DEV_ECC_INFOROM_VER                   = 80
	DCGM_FI_DEV_POWER_INFOROM_VER                 = 81
	DCGM_FI_DEV_INFOROM_IMAGE_VER                 = 82
	DCGM_FI_DEV_INFOROM_CONFIG_CHECK              = 83
	DCGM_FI_DEV_INFOROM_CONFIG_VALID              = 84
	DCGM_FI_DEV_VBIOS_VERSION                     = 85
	DCGM_FI_DEV_BAR1_TOTAL                        = 90
	DCGM_FI_SYNC_BOOST                            = 91
	DCGM_FI_DEV_BAR1_USED                         = 92
	DCGM_FI_DEV_BAR1_FREE                         = 93
	DCGM_FI_DEV_SM_CLOCK                          = 100
	DCGM_FI_DEV_MEM_CLOCK                         = 101
	DCGM_FI_DEV_VIDEO_CLOCK                       = 102
	DCGM_FI_DEV_APP_SM_CLOCK                      = 110
	DCGM_FI_DEV_APP_MEM_CLOCK                     = 111
	DCGM_FI_DEV_CLOCK_THROTTLE_REASONS            = 112
	DCGM_FI_DEV_MAX_SM_CLOCK                      = 113
	DCGM_FI_DEV_MAX_MEM_CLOCK                     = 114
	DCGM_FI_DEV_MAX_VIDEO_CLOCK                   = 115
	DCGM_FI_DEV_AUTOBOOST                         = 120
	DCGM_FI_DEV_SUPPORTED_CLOCKS                  = 130
	DCGM_FI_DEV_MEMORY_TEMP                       = 140
	DCGM_FI_DEV_GPU_TEMP                          = 150
	DCGM_FI_DEV_POWER_USAGE                       = 155
	DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION          = 156
	DCGM_FI_DEV_SLOWDOWN_TEMP                     = 158
	DCGM_FI_DEV_SHUTDOWN_TEMP                     = 159
	DCGM_FI_DEV_POWER_MGMT_LIMIT                  = 160
	DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN              = 161
	DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX              = 162
	DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF              = 163
	DCGM_FI_DEV_ENFORCED_POWER_LIMIT              = 164
	DCGM_FI_DEV_PSTATE                            = 190
	DCGM_FI_DEV_FAN_SPEED                         = 191
	DCGM_FI_DEV_PCIE_TX_THROUGHPUT                = 200
	DCGM_FI_DEV_PCIE_RX_THROUGHPUT                = 201
	DCGM_FI_DEV_PCIE_REPLAY_COUNTER               = 202
	DCGM_FI_DEV_GPU_UTIL                          = 203
	DCGM_FI_DEV_MEM_COPY_UTIL                     = 204
	DCGM_FI_DEV_ACCOUNTING_DATA                   = 205
	DCGM_FI_DEV_ENC_UTIL                          = 206
	DCGM_FI_DEV_DEC_UTIL                          = 207
	DCGM_FI_DEV_MEM_COPY_UTIL_SAMPLES             = 210
	DCGM_FI_DEV_GPU_UTIL_SAMPLES                  = 211
	DCGM_FI_DEV_GRAPHICS_PIDS                     = 220
	DCGM_FI_DEV_COMPUTE_PIDS                      = 221
	DCGM_FI_DEV_XID_ERRORS                        = 230
	DCGM_FI_DEV_PCIE_MAX_LINK_GEN                 = 235
	DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH               = 236
	DCGM_FI_DEV_PCIE_LINK_GEN                     = 237
	DCGM_FI_DEV_PCIE_LINK_WIDTH                   = 238
	DCGM_FI_DEV_POWER_VIOLATION                   = 240
	DCGM_FI_DEV_THERMAL_VIOLATION                 = 241
	DCGM_FI_DEV_SYNC_BOOST_VIOLATION              = 242
	DCGM_FI_DEV_BOARD_LIMIT_VIOLATION             = 243
	DCGM_FI_DEV_LOW_UTIL_VIOLATION                = 244
	DCGM_FI_DEV_RELIABILITY_VIOLATION             = 245
	DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION        = 246
	DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION       = 247
	DCGM_FI_DEV_FB_TOTAL                          = 250
	DCGM_FI_DEV_FB_FREE                           = 251
	DCGM_FI_DEV_FB_USED                           = 252
	DCGM_FI_DEV_ECC_CURRENT                       = 300
	DCGM_FI_DEV_ECC_PENDING                       = 301
	DCGM_FI_DEV_ECC_SBE_VOL_TOTAL                 = 310
	DCGM_FI_DEV_ECC_DBE_VOL_TOTAL                 = 311
	DCGM_FI_DEV_ECC_SBE_AGG_TOTAL                 = 312
	DCGM_FI_DEV_ECC_DBE_AGG_TOTAL                 = 313
	DCGM_FI_DEV_ECC_SBE_VOL_L1                    = 314
	DCGM_FI_DEV_ECC_DBE_VOL_L1                    = 315
	DCGM_FI_DEV_ECC_SBE_VOL_L2                    = 316
	DCGM_FI_DEV_ECC_DBE_VOL_L2                    = 317
	DCGM_FI_DEV_ECC_SBE_VOL_DEV                   = 318
	DCGM_FI_DEV_ECC_DBE_VOL_DEV                   = 319
	DCGM_FI_DEV_ECC_SBE_VOL_REG                   = 320
	DCGM_FI_DEV_ECC_DBE_VOL_REG                   = 321
	DCGM_FI_DEV_ECC_SBE_VOL_TEX                   = 322
	DCGM_FI_DEV_ECC_DBE_VOL_TEX                   = 323
	DCGM_FI_DEV_ECC_SBE_AGG_L1                    = 324
	DCGM_FI_DEV_ECC_DBE_AGG_L1                    = 325
	DCGM_FI_DEV_ECC_SBE_AGG_L2                    = 326
	DCGM_FI_DEV_ECC_DBE_AGG_L2                    = 327
	DCGM_FI_DEV_ECC_SBE_AGG_DEV                   = 328
	DCGM_FI_DEV_ECC_DBE_AGG_DEV                   = 329
	DCGM_FI_DEV_ECC_SBE_AGG_REG                   = 330
	DCGM_FI_DEV_ECC_DBE_AGG_REG                   = 331
	DCGM_FI_DEV_ECC_SBE_AGG_TEX                   = 332
	DCGM_FI_DEV_ECC_DBE_AGG_TEX                   = 333
	DCGM_FI_DEV_RETIRED_SBE                       = 390
	DCGM_FI_DEV_RETIRED_DBE                       = 391
	DCGM_FI_DEV_RETIRED_PENDING                   = 392
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0    = 400
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1    = 401
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2    = 402
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3    = 403
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4    = 404
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5    = 405
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL = 409
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0    = 410
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1    = 411
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2    = 412
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3    = 413
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4    = 414
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5    = 415
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL = 419
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0      = 420
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1      = 421
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2      = 422
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3      = 423
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4      = 424
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5      = 425
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL   = 429
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0    = 430
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1    = 431
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2    = 432
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3    = 433
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4    = 434
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5    = 435
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL = 439
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L0               = 440
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L1               = 441
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L2               = 442
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L3               = 443
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L4               = 444
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L5               = 445
	DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL            = 449
	DCGM_FI_DEV_GPU_NVLINK_ERRORS                 = 450
	DCGM_FI_DEV_VIRTUAL_MODE                      = 500
	DCGM_FI_DEV_SUPPORTED_TYPE_INFO               = 501
	DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS           = 502
	DCGM_FI_DEV_VGPU_INSTANCE_IDS                 = 503
	DCGM_FI_DEV_VGPU_UTILIZATIONS                 = 504
	DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION      = 505
	DCGM_FI_DEV_ENC_STATS                         = 506
	DCGM_FI_DEV_FBC_STATS                         = 507
	DCGM_FI_DEV_FBC_SESSIONS_INFO                 = 508
	DCGM_FI_DEV_VGPU_VM_ID                        = 520
	DCGM_FI_DEV_VGPU_VM_NAME                      = 521
	DCGM_FI_DEV_VGPU_TYPE                         = 522
	DCGM_FI_DEV_VGPU_UUID                         = 523
	DCGM_FI_DEV_VGPU_DRIVER_VERSION               = 524
	DCGM_FI_DEV_VGPU_MEMORY_USAGE                 = 525
	DCGM_FI_DEV_VGPU_LICENSE_STATUS               = 526
	DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT             = 527
	DCGM_FI_DEV_VGPU_ENC_STATS                    = 528
	DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO            = 529
	DCGM_FI_DEV_VGPU_FBC_STATS                    = 530
	DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO            = 531
	DCGM_FI_FIRST_VGPU_FIELD_ID                   = 520
	DCGM_FI_LAST_VGPU_FIELD_ID                    = 570
	DCGM_FI_INTERNAL_FIELDS_0_START               = 600
	DCGM_FI_INTERNAL_FIELDS_0_END                 = 699
	DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P00          = 700
	DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P00          = 701
	DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P00         = 702
	DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P00          = 703
	DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P01          = 704
	DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P01          = 705
	DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P01         = 706
	DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P01          = 707
	DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P02          = 708
	DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P02          = 709
	DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P02         = 710
	DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P02          = 711
	DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P03          = 712
	DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P03          = 713
	DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P03         = 714
	DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P03          = 715
	DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P04          = 716
	DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P04          = 717
	DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P04         = 718
	DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P04          = 719
	DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P05          = 720
	DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P05          = 721
	DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P05         = 722
	DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P05          = 723
	DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P06          = 724
	DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P06          = 725
	DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P06         = 726
	DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P06          = 727
	DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P07          = 728
	DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P07          = 729
	DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P07         = 730
	DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P07          = 731
	DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P08          = 732
	DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P08          = 733
	DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P08         = 734
	DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P08          = 735
	DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P09          = 736
	DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P09          = 737
	DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P09         = 738
	DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P09          = 739
	DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P10          = 740
	DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P10          = 741
	DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P10         = 742
	DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P10          = 743
	DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P11          = 744
	DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P11          = 745
	DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P11         = 746
	DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P11          = 747
	DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P12          = 748
	DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P12          = 749
	DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P12         = 750
	DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P12          = 751
	DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P13          = 752
	DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P13          = 753
	DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P13         = 754
	DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P13          = 755
	DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P14          = 756
	DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P14          = 757
	DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P14         = 758
	DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P14          = 759
	DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P15          = 760
	DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P15          = 761
	DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P15         = 762
	DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P15          = 763
	DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P16          = 764
	DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P16          = 765
	DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P16         = 766
	DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P16          = 767
	DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P17          = 768
	DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P17          = 769
	DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P17         = 770
	DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P17          = 771
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P00       = 780
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P00       = 781
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P01       = 782
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P01       = 783
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P02       = 784
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P02       = 785
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P03       = 786
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P03       = 787
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P04       = 788
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P04       = 789
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P05       = 790
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P05       = 791
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P06       = 792
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P06       = 793
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P07       = 794
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P07       = 795
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P08       = 796
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P08       = 797
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P09       = 798
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P09       = 799
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P10       = 800
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P10       = 801
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P11       = 802
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P11       = 803
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P12       = 804
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P12       = 805
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P13       = 806
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P13       = 807
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P14       = 808
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P14       = 809
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P15       = 810
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P15       = 811
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P16       = 812
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P16       = 813
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P17       = 814
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P17       = 815
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P00       = 820
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P00       = 821
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P01       = 822
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P01       = 823
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P02       = 824
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P02       = 825
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P03       = 826
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P03       = 827
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P04       = 828
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P04       = 829
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P05       = 830
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P05       = 831
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P06       = 832
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P06       = 833
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P07       = 834
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P07       = 835
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P08       = 836
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P08       = 837
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P09       = 838
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P09       = 839
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P10       = 840
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P10       = 841
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P11       = 842
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P11       = 843
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P12       = 844
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P12       = 845
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P13       = 846
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P13       = 847
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P14       = 848
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P14       = 849
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P15       = 850
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P15       = 851
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P16       = 852
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P16       = 853
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P17       = 854
	DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P17       = 855
	DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS             = 856
	DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS         = 857
	DCGM_FI_FIRST_NVSWITCH_FIELD_ID               = 700
	DCGM_FI_LAST_NVSWITCH_FIELD_ID                = 860
	DCGM_FI_PROF_GR_ENGINE_ACTIVE                 = 1001
	DCGM_FI_PROF_SM_ACTIVE                        = 1002
	DCGM_FI_PROF_SM_OCCUPANCY                     = 1003
	DCGM_FI_PROF_PIPE_TENSOR_ACTIVE               = 1004
	DCGM_FI_PROF_DRAM_ACTIVE                      = 1005
	DCGM_FI_PROF_PIPE_FP64_ACTIVE                 = 1006
	DCGM_FI_PROF_PIPE_FP32_ACTIVE                 = 1007
	DCGM_FI_PROF_PIPE_FP16_ACTIVE                 = 1008
	DCGM_FI_PROF_PCIE_TX_BYTES                    = 1009
	DCGM_FI_PROF_PCIE_RX_BYTES                    = 1010
	DCGM_FI_PROF_NVLINK_TX_BYTES                  = 1011
	DCGM_FI_PROF_NVLINK_RX_BYTES                  = 1012
	DCGM_FI_MAX_FIELDS                            = 1013
)
View Source
const (
	PerfStateMax     = 0
	PerfStateMin     = 15
	PerfStateUnknown = 32
)
View Source
const (
	DbePolicy     = policyCondition("Double-bit ECC error")
	PCIePolicy    = policyCondition("PCI error")
	MaxRtPgPolicy = policyCondition("Max Retired Pages Limit")
	ThermalPolicy = policyCondition("Thermal Limit")
	PowerPolicy   = policyCondition("Power Limit")
	NvlinkPolicy  = policyCondition("Nvlink Error")
	XidPolicy     = policyCondition("XID Error")
)

Variables

View Source
var (
	DCGM_FI = map[string]Short{}/* 342 elements not displayed */

)

Functions

func AddToGroup

func AddToGroup(groupId GroupHandle, gpuId uint) (err error)

func DestroyGroup

func DestroyGroup(groupId GroupHandle) (err error)

func FieldGroupDestroy

func FieldGroupDestroy(fieldsGroup FieldHandle) (err error)

func GetAllDeviceCount

func GetAllDeviceCount() (uint, error)

GetAllDeviceCount counts all GPUs on the system

func GetSupportedDevices

func GetSupportedDevices() ([]uint, error)

GetSupportedDevices returns only DCGM supported GPUs

func Init

func Init(m mode, args ...string) (cleanup func(), err error)

Init starts DCGM based on the user-selected mode. DCGM can be started in 3 different modes: 1. Embedded: start the hostengine within this process. 2. Standalone: connect to an already running nv-hostengine at the specified address. The connection address can be passed as command-line args: -connect "IP:PORT/Socket" -socket "isSocket". 3. StartHostengine: open a Unix socket to start and connect to the nv-hostengine, and terminate it before exiting.

func IsInt32Blank

func IsInt32Blank(value int) bool

func IsInt64Blank

func IsInt64Blank(value int64) bool

func Policy

func Policy(gpuId uint, typ ...policyCondition) (<-chan PolicyViolation, error)

Policy sets GPU usage and error policies and notifies in case of any violations via callback functions

func Shutdown

func Shutdown() (err error)

Shutdown stops DCGM and destroys all connections.

func UpdateAllFields

func UpdateAllFields() error

func ViolationRegistration

func ViolationRegistration(data unsafe.Pointer) int

ViolationRegistration is a Go callback function for dcgmPolicyRegister() wrapped in C.violationNotify().

func WatchFieldsWithGroup

func WatchFieldsWithGroup(fieldsGroup FieldHandle, group GroupHandle) error

Types

type ClockInfo

type ClockInfo struct {
	Cores  int64 // MHz
	Memory int64 // MHz
}

type DcgmStatus

type DcgmStatus struct {
	Memory int64
	CPU    float64
}

func Introspect

func Introspect() (DcgmStatus, error)

Introspect returns DCGM hostengine memory and CPU usage

type Device

type Device struct {
	GPU           uint
	DCGMSupported string
	UUID          string
	Power         uint // W
	PCI           PCIInfo
	Identifiers   DeviceIdentifiers
	Topology      []P2PLink
	CPUAffinity   string
}

func GetDeviceInfo

func GetDeviceInfo(gpuId uint) (Device, error)

GetDeviceInfo describes the given device

type DeviceHealth

type DeviceHealth struct {
	GPU     uint
	Status  string
	Watches []SystemWatch
}

func HealthCheckByGpuId

func HealthCheckByGpuId(gpuId uint) (DeviceHealth, error)

HealthCheckByGpuId monitors GPU health for any errors/failures/warnings

type DeviceIdentifiers

type DeviceIdentifiers struct {
	Brand               string
	Model               string
	Serial              string
	Vbios               string
	InforomImageVersion string
	DriverVersion       string
}

type DeviceStatus

type DeviceStatus struct {
	Power       float64 // W
	Temperature int64   // °C
	Utilization UtilizationInfo
	Memory      MemoryInfo
	Clocks      ClockInfo
	PCI         PCIStatusInfo
	Performance PerfState
	FanSpeed    int64 // %
}

func GetDeviceStatus

func GetDeviceStatus(gpuId uint) (DeviceStatus, error)

GetDeviceStatus monitors GPU status including its power, memory and GPU utilization

type ECCErrorsInfo

type ECCErrorsInfo struct {
	SingleBit int64
	DoubleBit int64
}

type FieldHandle

type FieldHandle struct {
	// contains filtered or unexported fields
}

func FieldGroupCreate

func FieldGroupCreate(fieldsGroupName string, fields []Short) (fieldsId FieldHandle, err error)

type FieldValue_v1

type FieldValue_v1 struct {
	Version   uint
	FieldId   uint
	FieldType uint
	Status    int
	Ts        int64
	Value     [4096]byte
}

func GetLatestValuesForFields

func GetLatestValuesForFields(gpu uint, fields []Short) ([]FieldValue_v1, error)

func (FieldValue_v1) Blob

func (fv FieldValue_v1) Blob() [4096]byte

func (FieldValue_v1) Float64

func (fv FieldValue_v1) Float64() float64

func (FieldValue_v1) Int64

func (fv FieldValue_v1) Int64() int64

func (FieldValue_v1) String

func (fv FieldValue_v1) String() string

type GroupHandle

type GroupHandle struct {
	// contains filtered or unexported fields
}

func CreateGroup

func CreateGroup(groupName string) (goGroupId GroupHandle, err error)

func NewDefaultGroup

func NewDefaultGroup(groupName string) (GroupHandle, error)

func WatchFields

func WatchFields(gpuId uint, fieldsGroup FieldHandle, groupName string) (groupId GroupHandle, err error)

func WatchPidFields

func WatchPidFields() (GroupHandle, error)

WatchPidFields lets DCGM start recording stats for GPU processes. It needs to be called before calling GetProcessInfo.

type MemoryInfo

type MemoryInfo struct {
	GlobalUsed int64
	ECCErrors  ECCErrorsInfo
}

type MetricGroup

type MetricGroup struct {
	// contains filtered or unexported fields
}

func GetSupportedMetricGroups

func GetSupportedMetricGroups(grpid uint) ([]MetricGroup, error)

GetSupportedMetricGroups gets all of the profiling metric groups for a given GPU group.

type P2PLink struct {
	GPU   uint
	BusID string
	Link  P2PLinkType
}

func GetDeviceTopology

func GetDeviceTopology(gpuId uint) ([]P2PLink, error)

GetDeviceTopology returns device topology corresponding to the gpuId

type P2PLinkType

type P2PLinkType uint
const (
	P2PLinkUnknown P2PLinkType = iota
	P2PLinkCrossCPU
	P2PLinkSameCPU
	P2PLinkHostBridge
	P2PLinkMultiSwitch
	P2PLinkSingleSwitch
	P2PLinkSameBoard
	SingleNVLINKLink
	TwoNVLINKLinks
	ThreeNVLINKLinks
	FourNVLINKLinks
)

func (P2PLinkType) PCIPaths

func (l P2PLinkType) PCIPaths() string

type PCIInfo

type PCIInfo struct {
	BusID     string
	BAR1      uint  // MB
	FBTotal   uint  // MB
	Bandwidth int64 // MB/s
}

type PCIStatusInfo

type PCIStatusInfo struct {
	BAR1Used   int64 // MB
	Throughput PCIThroughputInfo
	FBUsed     int64
}

type PCIThroughputInfo

type PCIThroughputInfo struct {
	Rx      int64 // MB
	Tx      int64 // MB
	Replays int64
}

type PerfState

type PerfState uint

func (PerfState) String

func (p PerfState) String() string

type PolicyViolation

type PolicyViolation struct {
	Condition policyCondition
	Timestamp time.Time
	Data      interface{}
}

type ProcessInfo

type ProcessInfo struct {
	GPU                uint
	PID                uint
	Name               string
	ProcessUtilization ProcessUtilInfo
	PCI                PCIStatusInfo
	Memory             MemoryInfo
	GpuUtilization     UtilizationInfo
	Clocks             ClockInfo
	Violations         ViolationTime
	XIDErrors          XIDErrorInfo
}

func GetProcessInfo

func GetProcessInfo(group GroupHandle, pid uint) ([]ProcessInfo, error)

GetProcessInfo provides detailed per GPU stats for this process

type ProcessUtilInfo

type ProcessUtilInfo struct {
	StartTime      Time
	EndTime        Time
	EnergyConsumed *uint64 // Joules
	SmUtil         *float64
	MemUtil        *float64
}

type Short

type Short C.ushort

type SystemWatch

type SystemWatch struct {
	Type   string
	Status string
	Error  string
}

type Time

type Time uint64

func (Time) String

func (t Time) String() string

type UtilizationInfo

type UtilizationInfo struct {
	GPU     int64 // %
	Memory  int64 // %
	Encoder int64 // %
	Decoder int64 // %
}

type ViolationTime

type ViolationTime struct {
	Power          *uint64
	Thermal        *uint64
	Reliability    *uint64
	BoardLimit     *uint64
	LowUtilization *uint64
	SyncBoost      *uint64
}

ViolationTime measures the amount of time (in ms) the GPU was at reduced clocks.

type XIDErrorInfo

type XIDErrorInfo struct {
	NumErrors int
	Timestamp []uint64
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL