Documentation
¶
Index ¶
Constants ¶
View Source
const ( GPUUID KubernetesGPUIDType = "uid" DeviceName KubernetesGPUIDType = "device-name" NvidiaResourceName = "nvidia.com/gpu" NvidiaMigResourcePrefix = "nvidia.com/mig-" MIG_UUID_PREFIX = "MIG-" )
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type Config ¶
// Config gathers the settings declared below into one value.
//
// NOTE(review): only some fields are commented in the original declaration;
// the meaning and units of the remaining fields (for example CollectInterval
// and the *WindowSize fields) are not visible here — confirm against the code
// that reads them.
type Config struct {
	CollectorsFile            string
	Address                   string
	CollectInterval           int
	Kubernetes                bool
	KubernetesEnablePodLabels bool
	KubernetesEnablePodUID    bool
	KubernetesGPUIdType       KubernetesGPUIDType

	// KubernetesPodLabelAllowlistRegex holds regex patterns for filtering pod
	// labels.
	KubernetesPodLabelAllowlistRegex []string

	// KubernetesPodLabelCacheSize is the maximum number of label keys to
	// cache; a value <= 0 means the default size.
	KubernetesPodLabelCacheSize int

	CollectDCP                 bool
	UseOldNamespace            bool
	UseRemoteHE                bool
	RemoteHEInfo               string
	GPUDeviceOptions           DeviceOptions
	SwitchDeviceOptions        DeviceOptions
	CPUDeviceOptions           DeviceOptions
	NoHostname                 bool
	UseFakeGPUs                bool
	ConfigMapData              string
	MetricGroups               []dcgm.MetricGroup
	WebSystemdSocket           bool
	WebConfigFile              string
	XIDCountWindowSize         int
	ReplaceBlanksInModelName   bool
	Debug                      bool
	ClockEventsCountWindowSize int
	EnableDCGMLog              bool
	DCGMLogLevel               string
	PodResourcesKubeletSocket  string
	HPCJobMappingDir           string
	NvidiaResourceNames        []string
	KubernetesVirtualGPUs      bool

	// DumpConfig is the configuration for file-based dumps.
	DumpConfig DumpConfig

	KubernetesEnableDRA    bool
	DisableStartupValidate bool

	// EnableGPUBindUnbindWatch enables GPU bind/unbind event monitoring.
	EnableGPUBindUnbindWatch bool

	// GPUBindUnbindPollInterval is the poll interval for GPU bind/unbind
	// events.
	GPUBindUnbindPollInterval time.Duration

	// EnablePprof enables the /debug/pprof/ HTTP endpoints.
	EnablePprof bool
}
type DeviceOptions ¶
// DeviceOptions selects which devices of one kind are monitored.
type DeviceOptions struct {
	// Flex, when true, monitors all GPUs if MIG mode is disabled, or all GPU
	// instances if MIG is enabled.
	Flex bool

	// MajorRange lists the indices of each GPU/NvSwitch to monitor; -1 means
	// monitor all.
	MajorRange []int

	// MinorRange lists the indices of each GPUInstance/NvLink to monitor; -1
	// means monitor all.
	MinorRange []int
}
type DumpConfig ¶
// DumpConfig controls file-based debugging dumps.
type DumpConfig struct {
	// Enabled turns file-based dumps on.
	Enabled bool `yaml:"enabled" json:"enabled"`

	// Directory is the directory in which dump files are stored.
	Directory string `yaml:"directory" json:"directory"`

	// Retention is the retention period in hours; 0 means no cleanup.
	Retention int `yaml:"retention" json:"retention"`

	// Compression selects gzip compression for dump files.
	Compression bool `yaml:"compression" json:"compression"`
}
DumpConfig controls file-based debugging dumps.
type KubernetesGPUIDType ¶
// KubernetesGPUIDType is a named string type.
// NOTE(review): the typed constants GPUUID ("uid") and DeviceName
// ("device-name") declared in this package look like its intended values —
// confirm whether any other value is accepted.
type KubernetesGPUIDType string
Click to show internal directories.
Click to hide internal directories.