Documentation
¶
Overview ¶
Package collector implements the different collectors of the exporter.
Index ¶
- Constants
- Variables
- func DisableDefaultCollectors()
- func IsNoDataError(err error) bool
- func KernelStringToNumeric(ver string) int64
- func KernelVersion() (int64, error)
- func NewCgroupCollector(logger *slog.Logger, cgManager *cgroupManager, opts cgroupOpts) (*cgroupCollector, error)
- func NewCgroupManager(name manager, logger *slog.Logger) (*cgroupManager, error)
- func NewEbpfCollector(logger *slog.Logger, cgManager *cgroupManager) (*ebpfCollector, error)
- func NewGokitLogger(lvl string, logger *slog.Logger) log.Logger
- func NewPerfCollector(logger *slog.Logger, cgManager *cgroupManager) (*perfCollector, error)
- func NewRDMACollector(logger *slog.Logger, cgManager *cgroupManager) (*rdmaCollector, error)
- func RegisterCollector(collector string, isDefaultEnabled bool, ...)
- func SanitizeMetricName(metricName string) string
- func TargetsHandlerFor(discoverer Discoverer, opts promhttp.HandlerOpts) http.Handler
- type AMDASIC
- type AMDBoard
- type AMDBus
- type AMDGPU
- type AMDNodeProperties
- type AMDPartition
- type Address
- type BusID
- type CEEMSCollector
- type CEEMSExporter
- type CEEMSExporterServer
- type CEEMSProfilerConfig
- type Collector
- func NewCPUCollector(logger *slog.Logger) (Collector, error)
- func NewCrayPMCCollector(logger *slog.Logger) (Collector, error)
- func NewEmissionsCollector(logger *slog.Logger) (Collector, error)
- func NewHwmonCollector(logger *slog.Logger) (Collector, error)
- func NewIPMICollector(logger *slog.Logger) (Collector, error)
- func NewInfiniBandCollector(logger *slog.Logger) (Collector, error)
- func NewK8sCollector(logger *slog.Logger) (Collector, error)
- func NewLibvirtCollector(logger *slog.Logger) (Collector, error)
- func NewMeminfoCollector(logger *slog.Logger) (Collector, error)
- func NewNetdevCollector(logger *slog.Logger) (Collector, error)
- func NewRaplCollector(logger *slog.Logger) (Collector, error)
- func NewRedfishCollector(logger *slog.Logger) (Collector, error)
- func NewSlurmCollector(logger *slog.Logger) (Collector, error)
- type ComputeUnit
- type Config
- type Device
- type DeviceAttrs
- type DeviceAttrsShared
- type Devices
- type Discoverer
- type DomStatus
- type Domain
- type GPUInstance
- type GPUSMI
- type HostDev
- type Ksyms
- type MIGDevice
- type MIGDevices
- type MIGMode
- type Memory
- type NVIDIASMILog
- type NvidiaGPU
- type PMCDomain
- type Profiler
- type ProfilerConfig
- type PyroscopeConfig
- type ROCMSMI
- type SessionConfig
- type Source
- type Target
- type VirtMode
- type WebConfig
Constants ¶
const CEEMSExporterAppName = "ceems_exporter"
CEEMSExporterAppName is kingpin app name.
const Namespace = "ceems"
Namespace defines the common namespace to be used by all metrics.
Variables ¶
var CEEMSExporterApp = *kingpin.New( CEEMSExporterAppName, "Prometheus Exporter and Pyroscope client to export compute (job, VM, pod) resource usage and ebpf based profiling metrics.", )
CEEMSExporterApp is kingpin CLI app.
var (
)
Custom errors.
var ErrNoData = errors.New("collector returned no data")
ErrNoData indicates the collector found no data to collect, but had no other error.
Functions ¶
func DisableDefaultCollectors ¶
func DisableDefaultCollectors()
DisableDefaultCollectors sets the collector state to false for all collectors which have not been explicitly enabled on the command line.
func IsNoDataError ¶
IsNoDataError returns true if error is ErrNoData.
func KernelStringToNumeric ¶
KernelStringToNumeric converts the kernel version string into a numerical value that can be used to make comparisons.
func KernelVersion ¶
KernelVersion returns kernel version of current host.
func NewCgroupCollector ¶
func NewCgroupCollector(logger *slog.Logger, cgManager *cgroupManager, opts cgroupOpts) (*cgroupCollector, error)
NewCgroupCollector returns a new cgroupCollector exposing a summary of cgroups.
func NewCgroupManager ¶
NewCgroupManager returns an instance of cgroupManager based on resource manager.
func NewEbpfCollector ¶
NewEbpfCollector returns a new instance of ebpf collector.
func NewGokitLogger ¶
NewGokitLogger creates a new Go-kit logger from slog.Logger.
func NewPerfCollector ¶
NewPerfCollector returns a new perf-based collector; it creates a profiler per compute unit.
func NewRDMACollector ¶
NewRDMACollector returns a new Collector exposing RDMA metrics.
func RegisterCollector ¶
func RegisterCollector( collector string, isDefaultEnabled bool, factory func(logger *slog.Logger) (Collector, error), )
RegisterCollector registers collector into collector factory.
func SanitizeMetricName ¶
SanitizeMetricName sanitizes the given metric name by replacing invalid characters with underscores.
OpenMetrics and the Prometheus exposition format require the metric name to consist only of alphanumericals and "_", ":" and they must not start with digits. Since colons in MetricFamily are reserved to signal that the MetricFamily is the result of a calculation or aggregation of a general purpose monitoring system, colons will be replaced as well.
Note: If not subsequently prepending a namespace and/or subsystem (e.g., with prometheus.BuildFQName), the caller must ensure that the supplied metricName does not begin with a digit.
func TargetsHandlerFor ¶
func TargetsHandlerFor(discoverer Discoverer, opts promhttp.HandlerOpts) http.Handler
TargetsHandlerFor returns http.Handler for Alloy targets.
Types ¶
type AMDGPU ¶
type AMDGPU struct { ID int64 `json:"gpu"` ASIC *AMDASIC `json:"asic"` Bus *AMDBus `json:"bus"` Board *AMDBoard `json:"board"` Partition *AMDPartition `json:"partition"` }
type AMDNodeProperties ¶
type AMDPartition ¶
type BusID ¶
type BusID struct {
// contains filtered or unexported fields
}
BusID is a struct that contains PCI bus address of GPU device.
type CEEMSCollector ¶
type CEEMSCollector struct { Collectors map[string]Collector // contains filtered or unexported fields }
CEEMSCollector implements the prometheus.Collector interface.
func NewCEEMSCollector ¶
func NewCEEMSCollector(logger *slog.Logger) (*CEEMSCollector, error)
NewCEEMSCollector creates a new CEEMSCollector.
func (CEEMSCollector) Close ¶
func (n CEEMSCollector) Close(ctx context.Context) error
Close stops all the collectors and releases system resources.
func (CEEMSCollector) Collect ¶
func (n CEEMSCollector) Collect(ch chan<- prometheus.Metric)
Collect implements the prometheus.Collector interface.
func (CEEMSCollector) Describe ¶
func (n CEEMSCollector) Describe(ch chan<- *prometheus.Desc)
Describe implements the prometheus.Collector interface.
type CEEMSExporter ¶
type CEEMSExporter struct { App kingpin.Application // contains filtered or unexported fields }
CEEMSExporter represents the `ceems_exporter` cli.
func NewCEEMSExporter ¶
func NewCEEMSExporter() (*CEEMSExporter, error)
NewCEEMSExporter returns a new CEEMSExporter instance.
func (*CEEMSExporter) Main ¶
func (b *CEEMSExporter) Main() error
Main is the entry point of the `ceems_exporter` command.
type CEEMSExporterServer ¶
type CEEMSExporterServer struct {
// contains filtered or unexported fields
}
CEEMSExporterServer struct implements HTTP server for exporter.
func NewCEEMSExporterServer ¶
func NewCEEMSExporterServer(c *Config) (*CEEMSExporterServer, error)
NewCEEMSExporterServer creates new CEEMSExporterServer struct instance.
func (*CEEMSExporterServer) Shutdown ¶
func (s *CEEMSExporterServer) Shutdown(ctx context.Context) error
Shutdown stops CEEMS exporter HTTP server.
func (*CEEMSExporterServer) Start ¶
func (s *CEEMSExporterServer) Start() error
Start launches CEEMS exporter HTTP server.
type CEEMSProfilerConfig ¶
type CEEMSProfilerConfig struct { Session SessionConfig `yaml:"ebpf"` Pyroscope PyroscopeConfig `yaml:"pyroscope"` }
type Collector ¶
type Collector interface { // Get new metrics and expose them via prometheus registry. Update(ch chan<- prometheus.Metric) error // Stops each collector and cleans up system resources Stop(ctx context.Context) error }
Collector is the interface a collector has to implement.
func NewCPUCollector ¶
NewCPUCollector returns a new Collector exposing kernel/system statistics.
func NewCrayPMCCollector ¶
NewCrayPMCCollector returns a new Collector exposing Cray's `pm_counters` metrics.
func NewEmissionsCollector ¶
NewEmissionsCollector returns a new Collector exposing emission factor metrics.
func NewHwmonCollector ¶
NewHwmonCollector returns a new Collector exposing /sys/class/hwmon stats (similar to lm-sensors).
func NewIPMICollector ¶
NewIPMICollector returns a new Collector exposing IPMI DCMI power metrics.
func NewInfiniBandCollector ¶
NewInfiniBandCollector returns a new Collector exposing InfiniBand stats.
func NewK8sCollector ¶
NewK8sCollector returns a new Collector exposing a summary of cgroups.
func NewLibvirtCollector ¶
NewLibvirtCollector returns a new libvirt collector exposing a summary of cgroups.
func NewMeminfoCollector ¶
NewMeminfoCollector returns a new Collector exposing memory stats.
func NewNetdevCollector ¶
NewNetdevCollector returns a new Collector exposing node network stats.
func NewRaplCollector ¶
NewRaplCollector returns a new Collector exposing RAPL metrics.
func NewRedfishCollector ¶
NewRedfishCollector returns a new Collector to fetch power usage from redfish API.
type ComputeUnit ¶
type ComputeUnit struct { UUID string Hostname string // Only applicable to SLURM when multiple daemons are enabled on same physical host }
ComputeUnit contains the unit details that will be associated with each GPU.
type Config ¶
type Config struct { Logger *slog.Logger Collector *CEEMSCollector Discoverer Discoverer Web WebConfig }
Config makes a server config.
type Device ¶
type Device struct { Minor string Index string Name string UUID string BusID BusID NumSMs uint64 ComputeUnits []ComputeUnit MdevUUIDs []string Instances []GPUInstance InstancesEnabled bool VGPUEnabled bool // contains filtered or unexported fields }
Device contains the details of physical GPU devices.
func (*Device) CompareBusID ¶
CompareBusID compares the provided bus ID with device bus ID and returns true if they match and false in all other cases.
func (*Device) ResetUnits ¶
func (d *Device) ResetUnits()
ResetUnits will remove existing compute unit UUIDs.
type DeviceAttrs ¶
type DeviceAttrsShared ¶
type DeviceAttrsShared struct {}
func (*DeviceAttrsShared) UnmarshalXML ¶ added in v0.10.1
func (p *DeviceAttrsShared) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error
UnmarshalXML implements the xml.Unmarshaler interface.
type Discoverer ¶
func NewTargetDiscoverer ¶
func NewTargetDiscoverer(c *discovererConfig) (Discoverer, error)
NewTargetDiscoverer returns a new profiling target discoverer.
type DomStatus ¶ added in v0.11.0
type DomStatus struct {
Domain Domain `xml:"domain"`
}
DomStatus is the top-level XML field for runtime XML files.
type Domain ¶
type Domain struct { Devices Devices `xml:"devices"` Name string `xml:"name"` UUID string `xml:"uuid"` }
Domain is the top level XML field for persistent XML files.
type GPUInstance ¶
type GPUInstance struct { InstanceIndex uint64 Index string UUID string ComputeInstID uint64 GPUInstID uint64 SMFraction float64 NumSMs uint64 ComputeUnits []ComputeUnit MdevUUIDs []string }
GPUInstance is an abstraction for an NVIDIA MIG instance or an AMD GPU partition.
func (GPUInstance) ID ¶
func (d GPUInstance) ID() string
ID returns the instance ID that will be used by k8s requests.
func (*GPUInstance) ResetUnits ¶
func (d *GPUInstance) ResetUnits()
ResetUnits will remove existing compute unit UUIDs.
func (GPUInstance) String ¶
func (d GPUInstance) String() string
String implements the Stringer interface for the GPUInstance struct.
type GPUSMI ¶
type GPUSMI struct { Devices []Device // contains filtered or unexported fields }
GPUSMI is a vendor neutral SMI interface for GPUs.
func (*GPUSMI) ReindexGPUs ¶
ReindexGPUs reindexes GPU globalIndex based on orderMap string.
func (*GPUSMI) UpdateGPUMdevs ¶
UpdateGPUMdevs updates GPU devices slice with mdev UUIDs.
type Ksyms ¶
type Ksyms struct {
// contains filtered or unexported fields
}
Ksyms is a structure for kernel symbols.
func (*Ksyms) GetArchSpecificName ¶
GetArchSpecificName returns architecture specific symbol (if exists) of a given kernel symbol.
func (*Ksyms) IsAvailable ¶
IsAvailable returns true if the given name is available on current kernel.
type MIGDevice ¶
type MIGDevice struct { XMLName xml.Name `xml:"mig_device"` Index uint64 `xml:"index"` GPUInstID uint64 `xml:"gpu_instance_id"` ComputeInstID uint64 `xml:"compute_instance_id"` DeviceAttrs DeviceAttrs `xml:"device_attributes"` FBMemory Memory `xml:"fb_memory_usage"` Bar1Memory Memory `xml:"bar1_memory_usage"` UUID string }
func (*MIGDevice) UnmarshalXML ¶ added in v0.10.1
UnmarshalXML implements the xml.Unmarshaler interface.
type MIGDevices ¶
type NVIDIASMILog ¶
type NvidiaGPU ¶
type NvidiaGPU struct { XMLName xml.Name `xml:"gpu"` ID string `xml:"id,attr"` ProductName string `xml:"product_name"` ProductBrand string `xml:"product_brand"` ProductArch string `xml:"product_architecture"` MIGMode MIGMode `xml:"mig_mode"` VirtMode VirtMode `xml:"gpu_virtualization_mode"` MIGDevices MIGDevices `xml:"mig_devices"` UUID string `xml:"uuid"` MinorNumber string `xml:"minor_number"` }
type PMCDomain ¶
type PMCDomain struct { Name string // name of PM counter domain zone from filename Path string // filesystem path of PM counters }
PMCDomain stores the information for one Cray PM counter domain.
func GetCrayPMCDomains ¶
GetCrayPMCDomains returns a slice of Cray's `pm_counters` domains. See https://cray-hpe.github.io/docs-csm/en-10/operations/power_management/user_access_to_compute_node_power_data/
func (PMCDomain) GetEnergyJoules ¶
GetEnergyJoules returns the current joule value from the domain counter.
func (PMCDomain) GetPowerLimitWatts ¶
GetPowerLimitWatts returns the current power limit watt value from the domain counter.
func (PMCDomain) GetPowerWatts ¶
GetPowerWatts returns the current watt value from the domain counter.
func (PMCDomain) GetTempCelsius ¶
GetTempCelsius returns the current node temperature in degrees Celsius from the domain counter.
type Profiler ¶
Profiler is the interface different profilers must implement.
func NewProfiler ¶
NewProfiler returns a new instance of continuous profiler based on eBPF.
type ProfilerConfig ¶
type ProfilerConfig struct {
Profiler CEEMSProfilerConfig `yaml:"ceems_profiler"`
}
type PyroscopeConfig ¶
type PyroscopeConfig struct { URL string `yaml:"url"` ExternalLabels map[string]string `yaml:"external_labels"` HTTPClientConfig config.HTTPClientConfig `yaml:",inline"` }
func (*PyroscopeConfig) UnmarshalYAML ¶
func (c *PyroscopeConfig) UnmarshalYAML(unmarshal func(any) error) error
UnmarshalYAML implements the yaml.Unmarshaler interface.
func (*PyroscopeConfig) Validate ¶
func (c *PyroscopeConfig) Validate() error
Validate validates the config.
type SessionConfig ¶
type SessionConfig struct { CollectInterval model.Duration `yaml:"collect_interval"` DiscoverInterval model.Duration `yaml:"discover_interval"` CollectUser bool `yaml:"collect_user_profile"` CollectKernel bool `yaml:"collect_kernel_profile"` PythonEnabled bool `yaml:"python_enabled"` SampleRate int `yaml:"sample_rate"` Demangle string `yaml:"demangle"` BuildIDCacheSize int `yaml:"build_id_cache_size"` PIDCacheSize int `yaml:"pid_cache_size"` PIDMapSize uint32 `yaml:"pid_map_size"` SameFileCacheSize int `yaml:"same_file_cache_size"` SymbolsMapSize uint32 `yaml:"symbols_map_size"` CacheRounds int `yaml:"cache_rounds"` }
func (*SessionConfig) UnmarshalYAML ¶
func (c *SessionConfig) UnmarshalYAML(unmarshal func(any) error) error
UnmarshalYAML implements the yaml.Unmarshaler interface.
func (*SessionConfig) Validate ¶
func (c *SessionConfig) Validate() error
Validate validates the config.
type Target ¶
type Target struct { Targets []string `json:"targets"` Labels sd.DiscoveryTarget `json:"labels"` }