Documentation
Index ¶
Constants ¶
const AlertCheckpointConsumer = "alert-exporter"
const AlertCheckpointKey = "active-alerts"
const CurrentAlertCheckpointVersion = 1
const SentinelAlertName = "__sentinel_alert__"
Variables ¶
var (
	// Processing metrics
	ProcessingCyclesTotal = promauto.NewCounter(prometheus.CounterOpts{
		Name: "flightctl_alert_exporter_processing_cycles_total",
		Help: "Total number of processing cycles completed",
	})
	ProcessingDurationSeconds = promauto.NewHistogram(prometheus.HistogramOpts{
		Name:    "flightctl_alert_exporter_processing_duration_seconds",
		Help:    "Time spent processing events in seconds",
		Buckets: prometheus.DefBuckets,
	})
	EventsProcessedTotal = promauto.NewCounter(prometheus.CounterOpts{
		Name: "flightctl_alert_exporter_events_processed_total",
		Help: "Total number of events processed",
	})

	// Alert metrics
	AlertsActiveTotal = promauto.NewGauge(prometheus.GaugeOpts{
		Name: "flightctl_alert_exporter_alerts_active_total",
		Help: "Current number of active alerts",
	})
	AlertsCreatedTotal = promauto.NewCounter(prometheus.CounterOpts{
		Name: "flightctl_alert_exporter_alerts_created_total",
		Help: "Total number of alerts created",
	})
	AlertsResolvedTotal = promauto.NewCounter(prometheus.CounterOpts{
		Name: "flightctl_alert_exporter_alerts_resolved_total",
		Help: "Total number of alerts resolved",
	})

	// Alertmanager interaction metrics
	AlertmanagerRequestsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
		Name: "flightctl_alert_exporter_alertmanager_requests_total",
		Help: "Total number of requests to Alertmanager",
	}, []string{"status"})
	AlertmanagerRequestDurationSeconds = promauto.NewHistogram(prometheus.HistogramOpts{
		Name:    "flightctl_alert_exporter_alertmanager_request_duration_seconds",
		Help:    "Time spent sending requests to Alertmanager in seconds",
		Buckets: prometheus.DefBuckets,
	})
	AlertmanagerRetriesTotal = promauto.NewCounter(prometheus.CounterOpts{
		Name: "flightctl_alert_exporter_alertmanager_retries_total",
		Help: "Total number of retries when sending to Alertmanager",
	})

	// Checkpoint metrics
	CheckpointOperationsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
		Name: "flightctl_alert_exporter_checkpoint_operations_total",
		Help: "Total number of checkpoint operations",
	}, []string{"operation", "status"})
	CheckpointSizeBytes = promauto.NewGauge(prometheus.GaugeOpts{
		Name: "flightctl_alert_exporter_checkpoint_size_bytes",
		Help: "Size of the checkpoint data in bytes",
	})

	// Health metrics
	UptimeSeconds = promauto.NewGauge(prometheus.GaugeOpts{
		Name: "flightctl_alert_exporter_uptime_seconds",
		Help: "Time since the alert exporter started in seconds",
	})
	LastSuccessfulProcessingTimestamp = promauto.NewGauge(prometheus.GaugeOpts{
		Name: "flightctl_alert_exporter_last_successful_processing_timestamp",
		Help: "Unix timestamp of the last successful processing cycle",
	})
	ErrorsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
		Name: "flightctl_alert_exporter_errors_total",
		Help: "Total number of errors encountered",
	}, []string{"component", "type"})
)
Prometheus metrics for the alert exporter
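The sketch below shows one way these collectors might be exercised around a single Alertmanager request: time the call with the request-duration histogram and count the outcome on the labeled counters. The helper name, the package placement, and the label values ("success", "failure", "alertmanager", "send") are assumptions for illustration, not the exporter's documented behavior.

package alert_exporter // package name and placement assumed for illustration

import (
	"github.com/prometheus/client_golang/prometheus"
)

// timeAlertmanagerRequest is a hypothetical helper: it observes the request
// duration on AlertmanagerRequestDurationSeconds and increments the labeled
// counters depending on the outcome of send().
func timeAlertmanagerRequest(send func() error) error {
	timer := prometheus.NewTimer(AlertmanagerRequestDurationSeconds)
	defer timer.ObserveDuration()

	if err := send(); err != nil {
		// Label values here are illustrative; the real values are not documented on this page.
		AlertmanagerRequestsTotal.WithLabelValues("failure").Inc()
		ErrorsTotal.WithLabelValues("alertmanager", "send").Inc()
		return err
	}
	AlertmanagerRequestsTotal.WithLabelValues("success").Inc()
	return nil
}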
Functions ¶
This section is empty.
Types ¶
type AlertCheckpoint ¶
type AlertExporter ¶
type AlertExporter struct {
// contains filtered or unexported fields
}
func NewAlertExporter ¶
type AlertSender ¶
type AlertSender struct {
// contains filtered or unexported fields
}
func NewAlertSender ¶
func (*AlertSender) SendAlerts ¶
func (a *AlertSender) SendAlerts(checkpoint *AlertCheckpoint) error
type AlertmanagerAlert ¶
type AlertmanagerClient ¶
type AlertmanagerClient struct {
// contains filtered or unexported fields
}
func NewAlertmanagerClient ¶
func NewAlertmanagerClient(hostname string, port uint, log logrus.FieldLogger, cfg *config.Config) *AlertmanagerClient
func (*AlertmanagerClient) SendAllAlerts ¶
func (a *AlertmanagerClient) SendAllAlerts(alerts map[AlertKey]map[string]*AlertInfo) error
SendAllAlerts sends all alerts from a nested map to Alertmanager in batches.
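The nested-map signature above is package-specific, but the batching pattern it describes can be sketched generically: flatten the alerts into a slice, split them into fixed-size chunks, and POST each chunk to Alertmanager's v2 API. The payload type, batch size, and error handling below are illustrative assumptions, not this client's implementation.

package example

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
	"time"
)

// postableAlert is a minimal stand-in for an Alertmanager v2 alert payload.
type postableAlert struct {
	Labels      map[string]string `json:"labels"`
	Annotations map[string]string `json:"annotations,omitempty"`
	StartsAt    time.Time         `json:"startsAt,omitempty"`
	EndsAt      time.Time         `json:"endsAt,omitempty"`
}

// postInBatches sends alerts to Alertmanager in fixed-size chunks so that a
// single oversized request body cannot stall or fail the whole send.
func postInBatches(baseURL string, alerts []postableAlert, batchSize int) error {
	for start := 0; start < len(alerts); start += batchSize {
		end := start + batchSize
		if end > len(alerts) {
			end = len(alerts)
		}
		body, err := json.Marshal(alerts[start:end])
		if err != nil {
			return err
		}
		resp, err := http.Post(baseURL+"/api/v2/alerts", "application/json", bytes.NewReader(body))
		if err != nil {
			return err
		}
		resp.Body.Close()
		if resp.StatusCode >= 300 {
			return fmt.Errorf("alertmanager returned status %d", resp.StatusCode)
		}
	}
	return nil
}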
type CheckpointContext ¶
type CheckpointContext struct {
// contains filtered or unexported fields
}
type CheckpointManager ¶
type CheckpointManager struct {
// contains filtered or unexported fields
}
func NewCheckpointManager ¶
func NewCheckpointManager(log *logrus.Logger, handler service.Service) *CheckpointManager
func (*CheckpointManager) LoadCheckpoint ¶
func (c *CheckpointManager) LoadCheckpoint(ctx context.Context) *AlertCheckpoint
LoadCheckpoint retrieves the last processed event and active alerts from the database. If no checkpoint exists, it initializes a fresh state. If the checkpoint cannot be retrieved or its contents cannot be unmarshaled, it logs an error and also starts from a fresh state; this is preferable to panicking because it lets the exporter keep running and report new alerts from the point of failure onward. A more robust recovery strategy, such as listing the system resources and reconstructing the set of active alerts from the current state of the system, could be adopted later. For now, we assume that if fetching the checkpoint fails, fetching the system resources would fail as well.
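The fallback described above can be illustrated with a minimal load-or-start-fresh sketch. The checkpoint type, store interface, and field names below are stand-ins rather than the package's real types; the consumer and key strings come from the package constants.

package example

import (
	"context"
	"encoding/json"

	"github.com/sirupsen/logrus"
)

// checkpoint is an illustrative stand-in for the package's AlertCheckpoint.
type checkpoint struct {
	Version      int    `json:"version"`
	LastEventKey string `json:"lastEventKey"`
}

// checkpointStore is an assumed interface over the database-backed store.
type checkpointStore interface {
	Get(ctx context.Context, consumer, key string) ([]byte, error)
}

// loadOrFresh fetches and decodes the stored checkpoint, falling back to a
// fresh state on any failure so the exporter can keep reporting new alerts.
func loadOrFresh(ctx context.Context, store checkpointStore, log logrus.FieldLogger) *checkpoint {
	fresh := &checkpoint{Version: 1}

	raw, err := store.Get(ctx, "alert-exporter", "active-alerts")
	if err != nil {
		log.WithError(err).Error("failed to fetch checkpoint, starting from a fresh state")
		return fresh
	}

	var cp checkpoint
	if err := json.Unmarshal(raw, &cp); err != nil {
		log.WithError(err).Error("failed to unmarshal checkpoint, starting from a fresh state")
		return fresh
	}
	return &cp
}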
func (*CheckpointManager) StoreCheckpoint ¶
func (c *CheckpointManager) StoreCheckpoint(ctx context.Context, checkpoint *AlertCheckpoint) error
type EventProcessor ¶
type EventProcessor struct {
// contains filtered or unexported fields
}
func NewEventProcessor ¶
func NewEventProcessor(log *logrus.Logger, handler service.Service) *EventProcessor
func (*EventProcessor) ProcessLatestEvents ¶
func (e *EventProcessor) ProcessLatestEvents(ctx context.Context, oldCheckpoint *AlertCheckpoint, metrics *ProcessingMetrics) (*AlertCheckpoint, error)
type ProcessingMetrics ¶
type ProcessingMetrics struct {
	CycleStartTime   time.Time
	EventsProcessed  int
	AlertsCreated    int
	AlertsResolved   int
	ProcessingTimeMs int64
	SendingTimeMs    int64
	CheckpointTimeMs int64
	TotalCycleTimeMs int64
	ActiveAlerts     int
}
ProcessingMetrics tracks operational metrics for monitoring and observability
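Since the struct mirrors several of the package-level Prometheus collectors, a small hypothetical helper can show how a finished cycle's fields might be published to them; the real exporter's wiring may differ.

package alert_exporter // package name and placement assumed for illustration

import "time"

// publishCycle maps a completed cycle's ProcessingMetrics onto the package's
// Prometheus collectors. The helper itself is illustrative.
func publishCycle(m *ProcessingMetrics) {
	ProcessingCyclesTotal.Inc()
	EventsProcessedTotal.Add(float64(m.EventsProcessed))
	AlertsCreatedTotal.Add(float64(m.AlertsCreated))
	AlertsResolvedTotal.Add(float64(m.AlertsResolved))
	AlertsActiveTotal.Set(float64(m.ActiveAlerts))
	ProcessingDurationSeconds.Observe(float64(m.ProcessingTimeMs) / 1000.0) // convert ms to seconds
	LastSuccessfulProcessingTimestamp.Set(float64(time.Now().Unix()))
}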