Documentation
¶
Overview ¶
Package remediators provides the base functionality for implementing problem remediators in Node Doctor.
The package defines the common infrastructure that all concrete remediators can use via embedding the BaseRemediator struct. This includes:
- Cooldown tracking per problem
- Attempt counting and limiting
- Thread-safe state management
- Optional logging
- Error handling and panic recovery
Usage Example:
type SystemdRemediator struct {
*remediators.BaseRemediator
serviceName string
}
func NewSystemdRemediator(name, service string, cooldown time.Duration) (*SystemdRemediator, error) {
base, err := remediators.NewBaseRemediator(name, cooldown)
if err != nil {
return nil, err
}
sr := &SystemdRemediator{
BaseRemediator: base,
serviceName: service,
}
err = base.SetRemediateFunc(sr.restartService)
if err != nil {
return nil, err
}
return sr, nil
}
func (sr *SystemdRemediator) restartService(ctx context.Context, problem types.Problem) error {
// Actual remediation logic
return exec.CommandContext(ctx, "systemctl", "restart", sr.serviceName).Run()
}
Thread Safety:
BaseRemediator is designed to be thread-safe and can handle concurrent remediation requests for different problems. State tracking is protected by a read-write mutex.
Cooldown Management:
Cooldown is tracked per unique problem (based on Type and Resource). This prevents rapid repeated remediation attempts for the same issue while allowing remediation of different problems concurrently.
Package remediators provides a pluggable remediator registry system for Node Doctor.
The registry manages remediator instances with global safety mechanisms including:
- Circuit breaker pattern to prevent cascading failures
- Rate limiting to prevent remediation storms
- Remediation history for audit and analysis
- Dry-run mode for testing
These global safety mechanisms work in conjunction with BaseRemediator's per-problem safety features (cooldown and max attempts) to provide defense in depth.
Usage Example:
// Create registry
registry := remediators.NewRegistry(10, 100) // maxPerHour, maxHistory
// Register remediators
registry.Register(remediators.RemediatorInfo{
Type: "kubelet-restart",
Factory: NewKubeletRemediator,
Description: "Restarts kubelet service",
})
// Execute remediation (with automatic safety checks)
err := registry.Remediate(ctx, "kubelet-restart", problem)
Index ¶
- Constants
- Variables
- func GenerateProblemKey(problem types.Problem) string
- type BaseRemediator
- func (b *BaseRemediator) CanRemediate(problem types.Problem) bool
- func (b *BaseRemediator) ClearCooldown(problem types.Problem)
- func (b *BaseRemediator) GetAttemptCount(problem types.Problem) int
- func (b *BaseRemediator) GetCooldown() time.Duration
- func (b *BaseRemediator) GetCooldownRemaining(problem types.Problem) time.Duration
- func (b *BaseRemediator) GetMaxAttempts() int
- func (b *BaseRemediator) GetName() string
- func (b *BaseRemediator) IsInCooldown(problem types.Problem) bool
- func (b *BaseRemediator) Remediate(ctx context.Context, problem types.Problem) error
- func (b *BaseRemediator) ResetAttempts(problem types.Problem)
- func (b *BaseRemediator) SetLogger(logger Logger) error
- func (b *BaseRemediator) SetMaxAttempts(max int) error
- func (b *BaseRemediator) SetRemediateFunc(fn RemediateFunc) error
- type CircuitBreakerConfig
- type CircuitBreakerState
- type CustomConfig
- type CustomRemediator
- type DiskConfig
- type DiskExecutor
- type DiskOperation
- type DiskRemediator
- type EventCreator
- type Logger
- type NetworkConfig
- type NetworkExecutor
- type NetworkOperation
- type NetworkRemediator
- type RegistryStats
- type RemediateFunc
- type RemediationRecord
- type RemediatorFactory
- type RemediatorInfo
- type RemediatorRegistry
- func (r *RemediatorRegistry) GetCircuitState() CircuitBreakerState
- func (r *RemediatorRegistry) GetHistory(limit int) []RemediationRecord
- func (r *RemediatorRegistry) GetRegisteredTypes() []string
- func (r *RemediatorRegistry) GetRemediatorInfo(remediatorType string) *RemediatorInfo
- func (r *RemediatorRegistry) GetStats() RegistryStats
- func (r *RemediatorRegistry) IsDryRun() bool
- func (r *RemediatorRegistry) IsRegistered(remediatorType string) bool
- func (r *RemediatorRegistry) Register(info RemediatorInfo)
- func (r *RemediatorRegistry) Remediate(ctx context.Context, remediatorType string, problem types.Problem) error
- func (r *RemediatorRegistry) ResetCircuitBreaker()
- func (r *RemediatorRegistry) SetCircuitBreakerConfig(config CircuitBreakerConfig) error
- func (r *RemediatorRegistry) SetDryRun(dryRun bool)
- func (r *RemediatorRegistry) SetEventCreator(eventCreator EventCreator, nodeName string)
- func (r *RemediatorRegistry) SetLogger(logger Logger)
- type RemediatorValidator
- type RuntimeConfig
- type RuntimeExecutor
- type RuntimeOperation
- type RuntimeRemediator
- type RuntimeType
- type ScriptExecutor
- type SystemdConfig
- type SystemdExecutor
- type SystemdOperation
- type SystemdRemediator
Constants ¶
const (
// CooldownFast is for quick, low-risk remediations (e.g., DNS cache flush)
CooldownFast = 3 * time.Minute
// CooldownMedium is for standard service restarts (e.g., systemd services)
CooldownMedium = 5 * time.Minute
// CooldownSlow is for slow-starting services (e.g., database restarts)
CooldownSlow = 10 * time.Minute
// CooldownDestructive is for high-impact actions (e.g., node reboot)
CooldownDestructive = 30 * time.Minute
)
Common cooldown duration presets for different remediation types. These provide sensible defaults based on the impact and risk level.
const (
// DefaultMaxAttempts is the default maximum number of remediation attempts
// before giving up on a problem
DefaultMaxAttempts = 3
)
Default configuration values for remediators
Variables ¶
var DefaultCircuitBreakerConfig = CircuitBreakerConfig{
Threshold: 5,
Timeout: 5 * time.Minute,
SuccessThreshold: 2,
}
DefaultCircuitBreakerConfig provides sensible defaults for the circuit breaker.
Functions ¶
func GenerateProblemKey ¶
func GenerateProblemKey(problem types.Problem) string
GenerateProblemKey generates a unique key for a problem based on its type and resource. This key is used for tracking cooldown periods and attempt counts per unique problem.
The key format is "type:resource". Examples:
- "kubelet-unhealthy:kubelet.service"
- "disk-pressure:/var/lib/docker"
- "memory-pressure:node"
Types ¶
type BaseRemediator ¶
type BaseRemediator struct {
// contains filtered or unexported fields
}
BaseRemediator provides common functionality for remediator implementations. Concrete remediators should embed this struct and set the remediateFunc to provide their specific remediation logic.
BaseRemediator handles:
- Cooldown tracking per unique problem
- Attempt counting with max attempt limits
- Thread-safe state management
- Optional logging
- Panic recovery
- Context cancellation support
Example usage:
type MyRemediator struct {
*remediators.BaseRemediator
config MyConfig
}
func NewMyRemediator() (*MyRemediator, error) {
base, err := remediators.NewBaseRemediator("my-remediator", remediators.CooldownMedium)
if err != nil {
return nil, err
}
mr := &MyRemediator{BaseRemediator: base}
err = base.SetRemediateFunc(mr.doRemediation)
return mr, err
}
func NewBaseRemediator ¶
func NewBaseRemediator(name string, cooldown time.Duration) (*BaseRemediator, error)
NewBaseRemediator creates a new BaseRemediator with the specified name and cooldown period.
Parameters:
- name: A descriptive name for the remediator (must not be empty)
- cooldown: The minimum time between remediation attempts for the same problem (must be > 0)
Returns an error if validation fails.
func (*BaseRemediator) CanRemediate ¶
func (b *BaseRemediator) CanRemediate(problem types.Problem) bool
CanRemediate checks if remediation is allowed for the given problem. It enforces both cooldown periods and max attempt limits.
Returns true only if both of the following hold:
- The problem is not in cooldown period
- The attempt count has not exceeded maxAttempts
This implements the types.Remediator interface.
func (*BaseRemediator) ClearCooldown ¶
func (b *BaseRemediator) ClearCooldown(problem types.Problem)
ClearCooldown clears the cooldown timer for the given problem. This allows immediate remediation regardless of when the last attempt was made.
This method is primarily useful for testing or manual intervention.
func (*BaseRemediator) GetAttemptCount ¶
func (b *BaseRemediator) GetAttemptCount(problem types.Problem) int
GetAttemptCount returns the current attempt count for the given problem. Returns 0 if no attempts have been made.
func (*BaseRemediator) GetCooldown ¶
func (b *BaseRemediator) GetCooldown() time.Duration
GetCooldown returns the cooldown period for this remediator. This implements the types.Remediator interface.
func (*BaseRemediator) GetCooldownRemaining ¶
func (b *BaseRemediator) GetCooldownRemaining(problem types.Problem) time.Duration
GetCooldownRemaining returns the time remaining in the cooldown period. Returns 0 if not in cooldown or no previous attempt has been made.
func (*BaseRemediator) GetMaxAttempts ¶
func (b *BaseRemediator) GetMaxAttempts() int
GetMaxAttempts returns the maximum number of remediation attempts per problem.
func (*BaseRemediator) GetName ¶
func (b *BaseRemediator) GetName() string
GetName returns the remediator's name.
func (*BaseRemediator) IsInCooldown ¶
func (b *BaseRemediator) IsInCooldown(problem types.Problem) bool
IsInCooldown checks if the problem is currently in its cooldown period. Returns false if no previous attempt has been made.
func (*BaseRemediator) Remediate ¶
func (b *BaseRemediator) Remediate(ctx context.Context, problem types.Problem) error
Remediate performs the remediation with safety checks and state tracking. This implements the types.Remediator interface.
The method:
- Checks if remediateFunc is set
- Records the attempt (updates timestamp and count)
- Calls the remediateFunc with panic recovery
- Respects context cancellation
- Logs all actions
Returns an error if remediation fails or if remediateFunc is not set.
func (*BaseRemediator) ResetAttempts ¶
func (b *BaseRemediator) ResetAttempts(problem types.Problem)
ResetAttempts resets the attempt counter for the given problem to zero. This does not affect the cooldown timer.
This method is primarily useful for testing or manual intervention.
func (*BaseRemediator) SetLogger ¶
func (b *BaseRemediator) SetLogger(logger Logger) error
SetLogger sets an optional logger for the remediator. If not set, logging calls will be silently ignored.
func (*BaseRemediator) SetMaxAttempts ¶
func (b *BaseRemediator) SetMaxAttempts(max int) error
SetMaxAttempts sets the maximum number of remediation attempts per problem. Must be greater than 0.
Returns an error if max is invalid.
func (*BaseRemediator) SetRemediateFunc ¶
func (b *BaseRemediator) SetRemediateFunc(fn RemediateFunc) error
SetRemediateFunc sets the function that performs the actual remediation logic. This must be called before the remediator can be used.
Returns an error if fn is nil.
type CircuitBreakerConfig ¶
type CircuitBreakerConfig struct {
// Threshold is the number of consecutive failures before opening the circuit
Threshold int
// Timeout is how long to keep the circuit open before trying again
Timeout time.Duration
// SuccessThreshold is the number of consecutive successes in half-open state
// before closing the circuit
SuccessThreshold int
}
CircuitBreakerConfig contains configuration for the circuit breaker.
type CircuitBreakerState ¶
type CircuitBreakerState int
CircuitBreakerState represents the state of the circuit breaker.
const (
// CircuitClosed means normal operation - remediations are allowed
CircuitClosed CircuitBreakerState = iota
// CircuitOpen means too many failures occurred - remediations are blocked
CircuitOpen
// CircuitHalfOpen means testing if the system has recovered - limited remediations allowed
CircuitHalfOpen
)
func (CircuitBreakerState) String ¶
func (s CircuitBreakerState) String() string
String returns the string representation of the circuit breaker state.
type CustomConfig ¶
type CustomConfig struct {
// ScriptPath is the absolute path to the remediation script
ScriptPath string
// ScriptArgs are optional arguments to pass to the script
ScriptArgs []string
// Timeout is the maximum execution time for the script (default: 5 minutes)
Timeout time.Duration
// Environment contains additional environment variables to pass to the script
// Problem metadata is automatically injected as environment variables
Environment map[string]string
// CaptureOutput when true, captures and logs stdout/stderr
CaptureOutput bool
// AllowNonZeroExit when true, doesn't treat non-zero exit codes as failures
// Useful for scripts that use exit codes to indicate severity levels
AllowNonZeroExit bool
// WorkingDir is the working directory for script execution (default: script's directory)
WorkingDir string
// DryRun when true, only simulates the action without executing it
DryRun bool
}
CustomConfig contains configuration for the custom script remediator.
type CustomRemediator ¶
type CustomRemediator struct {
*BaseRemediator
// contains filtered or unexported fields
}
CustomRemediator executes custom user-defined remediation scripts. It provides a flexible way to integrate custom remediation logic while maintaining safety checks and proper error handling.
func NewCustomRemediator ¶
func NewCustomRemediator(config CustomConfig) (*CustomRemediator, error)
NewCustomRemediator creates a new custom script remediator with the given configuration.
func (*CustomRemediator) GetScriptPath ¶
func (r *CustomRemediator) GetScriptPath() string
GetScriptPath returns the configured script path (useful for testing).
func (*CustomRemediator) SetScriptExecutor ¶
func (r *CustomRemediator) SetScriptExecutor(executor ScriptExecutor)
SetScriptExecutor sets a custom script executor (useful for testing).
type DiskConfig ¶
type DiskConfig struct {
// Operation specifies the disk cleanup action to perform
Operation DiskOperation
// JournalVacuumSize is the target size for journal logs (e.g., "500M", "1G")
// Only used for CleanJournalLogs operation
JournalVacuumSize string
// TmpFileAge is the age in days for files to be deleted from /tmp (default: 7)
// Only used for CleanTmp operation
TmpFileAge int
// MinFreeSpaceGB is the minimum free space in GB before cleanup is allowed
// Set to 0 to disable this check
MinFreeSpaceGB float64
// TargetPath is the path to check for free space (default: "/")
TargetPath string
// VerifyAfter when true, verifies disk space was reclaimed
VerifyAfter bool
// DryRun when true, only simulates the action without executing it
DryRun bool
}
DiskConfig contains configuration for the disk remediator.
type DiskExecutor ¶
type DiskExecutor interface {
// ExecuteCommand executes a command with the given arguments
ExecuteCommand(ctx context.Context, name string, args ...string) (string, error)
// GetDiskUsage returns disk usage information for a path (used GB, available GB, total GB)
GetDiskUsage(ctx context.Context, path string) (used, available, total float64, err error)
}
DiskExecutor defines the interface for executing disk cleanup commands. This allows for mocking in tests.
type DiskOperation ¶
type DiskOperation string
DiskOperation defines the type of disk cleanup operation to perform.
const (
// DiskCleanJournalLogs cleans old systemd journal logs
DiskCleanJournalLogs DiskOperation = "clean-journal-logs"
// DiskCleanDockerImages removes unused Docker images
DiskCleanDockerImages DiskOperation = "clean-docker-images"
// DiskCleanTmp removes old files from /tmp
DiskCleanTmp DiskOperation = "clean-tmp"
// DiskCleanContainerLayers removes unused container layers (docker system prune)
DiskCleanContainerLayers DiskOperation = "clean-container-layers"
)
type DiskRemediator ¶
type DiskRemediator struct {
*BaseRemediator
// contains filtered or unexported fields
}
DiskRemediator remediates disk space problems by performing cleanup operations. It supports cleaning journal logs, Docker images, /tmp files, and container layers.
func NewDiskRemediator ¶
func NewDiskRemediator(config DiskConfig) (*DiskRemediator, error)
NewDiskRemediator creates a new disk remediator with the given configuration.
func (*DiskRemediator) SetDiskExecutor ¶
func (r *DiskRemediator) SetDiskExecutor(executor DiskExecutor)
SetDiskExecutor sets a custom disk executor (useful for testing).
type EventCreator ¶
type EventCreator interface {
// CreateEvent creates a Kubernetes event
CreateEvent(ctx context.Context, event corev1.Event) error
}
EventCreator is an interface for creating Kubernetes events. This allows the registry to persist remediation attempts as Kubernetes events without creating a circular dependency with the kubernetes exporter package.
type Logger ¶
type Logger interface {
// Infof logs an informational message with formatting
Infof(format string, args ...interface{})
// Warnf logs a warning message with formatting
Warnf(format string, args ...interface{})
// Errorf logs an error message with formatting
Errorf(format string, args ...interface{})
}
Logger provides optional logging functionality for remediators. Remediators can use this interface to log their activities without requiring a specific logging implementation.
type NetworkConfig ¶
type NetworkConfig struct {
// Operation specifies the network action to perform
Operation NetworkOperation
// InterfaceName is the name of the network interface (required for RestartInterface)
// Examples: "eth0", "ens3", "enp0s3"
InterfaceName string
// BackupRouting when true, backs up routing table before reset (for ResetRouting)
BackupRouting bool
// VerifyAfter when true, verifies the operation succeeded
VerifyAfter bool
// VerifyTimeout is the maximum time to wait for verification
VerifyTimeout time.Duration
// DryRun when true, only simulates the action without executing it
DryRun bool
}
NetworkConfig contains configuration for the network remediator.
type NetworkExecutor ¶
type NetworkExecutor interface {
// ExecuteCommand executes a command with the given arguments
ExecuteCommand(ctx context.Context, name string, args ...string) (string, error)
// InterfaceExists checks if a network interface exists
InterfaceExists(ctx context.Context, interfaceName string) (bool, error)
// GetRoutingTable gets the current routing table
GetRoutingTable(ctx context.Context) (string, error)
// IsInterfaceUp checks if an interface is up
IsInterfaceUp(ctx context.Context, interfaceName string) (bool, error)
}
NetworkExecutor defines the interface for executing network commands. This allows for mocking in tests.
type NetworkOperation ¶
type NetworkOperation string
NetworkOperation defines the type of network operation to perform.
const (
// NetworkFlushDNS flushes the DNS resolver cache
NetworkFlushDNS NetworkOperation = "flush-dns"
// NetworkRestartInterface restarts a network interface (down/up)
NetworkRestartInterface NetworkOperation = "restart-interface"
// NetworkResetRouting resets the routing table to defaults
NetworkResetRouting NetworkOperation = "reset-routing"
)
type NetworkRemediator ¶
type NetworkRemediator struct {
*BaseRemediator
// contains filtered or unexported fields
}
NetworkRemediator remediates network problems by performing network operations. It supports DNS cache flushing, network interface restarts, and routing table resets.
func NewNetworkRemediator ¶
func NewNetworkRemediator(config NetworkConfig) (*NetworkRemediator, error)
NewNetworkRemediator creates a new network remediator with the given configuration.
func (*NetworkRemediator) SetNetworkExecutor ¶
func (r *NetworkRemediator) SetNetworkExecutor(executor NetworkExecutor)
SetNetworkExecutor sets a custom network executor (useful for testing).
type RegistryStats ¶
type RegistryStats struct {
// RegisteredTypes is the number of registered remediator types
RegisteredTypes int
// CircuitState is the current circuit breaker state
CircuitState CircuitBreakerState
// ConsecutiveFailures is the current count of consecutive failures
ConsecutiveFailures int
// ConsecutiveSuccesses is the current count of consecutive successes
ConsecutiveSuccesses int
// CircuitOpenedAt is when the circuit was last opened (if currently open)
CircuitOpenedAt time.Time
// RecentRemediations is the count of remediations in the current rate limit window
RecentRemediations int
// MaxPerHour is the rate limit
MaxPerHour int
// HistorySize is the current number of records in history
HistorySize int
// MaxHistory is the maximum history size
MaxHistory int
// DryRun indicates if the registry is in dry-run mode
DryRun bool
}
RegistryStats contains statistics about the registry state, as returned by GetStats.
type RemediateFunc ¶
type RemediateFunc func(ctx context.Context, problem types.Problem) error
RemediateFunc is a function type that performs the actual remediation logic. It receives a context for cancellation support and the problem to remediate. It should return nil on success or an error describing the failure.
type RemediationRecord ¶
type RemediationRecord struct {
// RemediatorType is the type of remediator used
RemediatorType string
// Problem is the problem that was remediated
Problem types.Problem
// StartTime is when the remediation started
StartTime time.Time
// EndTime is when the remediation completed
EndTime time.Time
// Duration is how long the remediation took
Duration time.Duration
// Success indicates whether the remediation succeeded
Success bool
// Error contains the error message if the remediation failed
Error string
}
RemediationRecord represents a single remediation attempt in the history.
type RemediatorFactory ¶
type RemediatorFactory func() (types.Remediator, error)
RemediatorFactory is a function that creates a new remediator instance. It returns a remediator that implements the types.Remediator interface.
type RemediatorInfo ¶
type RemediatorInfo struct {
// Type is the unique identifier for this remediator type.
Type string
// Factory is the function used to create new instances of this remediator.
// The factory function should be thread-safe and stateless.
Factory RemediatorFactory
// Validator is the function used to validate remediator instances.
// This is optional but recommended for early validation.
Validator RemediatorValidator
// Description provides human-readable documentation for this remediator type.
Description string
}
RemediatorInfo contains metadata and factory functions for a remediator type. This is used to register remediator implementations with the registry.
type RemediatorRegistry ¶
type RemediatorRegistry struct {
// contains filtered or unexported fields
}
RemediatorRegistry manages the registration and execution of remediators. It provides:
- Factory pattern for creating remediators
- Circuit breaker to prevent cascading failures
- Rate limiting to prevent remediation storms
- Remediation history for audit and analysis
- Dry-run mode for testing
The registry uses a read-write mutex to optimize for concurrent remediation operations while protecting internal state.
func NewRegistry ¶
func NewRegistry(maxPerHour, maxHistory int) *RemediatorRegistry
NewRegistry creates a new remediator registry with default configuration.
Parameters:
- maxPerHour: Maximum number of remediations allowed per hour (0 = unlimited)
- maxHistory: Maximum number of remediation records to keep (0 = unlimited, capped at 10000)
Returns a configured registry ready for use.
func (*RemediatorRegistry) GetCircuitState ¶
func (r *RemediatorRegistry) GetCircuitState() CircuitBreakerState
GetCircuitState returns the current circuit breaker state.
func (*RemediatorRegistry) GetHistory ¶
func (r *RemediatorRegistry) GetHistory(limit int) []RemediationRecord
GetHistory returns a copy of the remediation history. Results are ordered from oldest to newest.
func (*RemediatorRegistry) GetRegisteredTypes ¶
func (r *RemediatorRegistry) GetRegisteredTypes() []string
GetRegisteredTypes returns a sorted list of all registered remediator types.
func (*RemediatorRegistry) GetRemediatorInfo ¶
func (r *RemediatorRegistry) GetRemediatorInfo(remediatorType string) *RemediatorInfo
GetRemediatorInfo returns the registration information for a remediator type. Returns nil if the remediator type is not registered.
func (*RemediatorRegistry) GetStats ¶
func (r *RemediatorRegistry) GetStats() RegistryStats
GetStats returns current statistics about the registry.
func (*RemediatorRegistry) IsDryRun ¶
func (r *RemediatorRegistry) IsDryRun() bool
IsDryRun returns whether the registry is in dry-run mode.
func (*RemediatorRegistry) IsRegistered ¶
func (r *RemediatorRegistry) IsRegistered(remediatorType string) bool
IsRegistered checks whether a remediator type is registered.
func (*RemediatorRegistry) Register ¶
func (r *RemediatorRegistry) Register(info RemediatorInfo)
Register adds a new remediator type to the registry. This function is typically called during initialization to register available remediators.
Register panics if:
- info.Type is empty
- info.Factory is nil
- A remediator with the same type is already registered
Panicking is appropriate here because registration happens at init time, and registration conflicts indicate programming errors that should be caught during development.
func (*RemediatorRegistry) Remediate ¶
func (r *RemediatorRegistry) Remediate(ctx context.Context, remediatorType string, problem types.Problem) error
Remediate performs remediation using the specified remediator type. This is the main entry point for executing remediations with full safety checks:
- Circuit breaker check (fail fast if circuit is open)
- Rate limit check (prevent remediation storms)
- Get/create remediator instance
- Check remediator-specific CanRemediate (cooldown, max attempts)
- Execute remediation (or skip in dry-run mode)
- Record in history
- Update circuit breaker state
- Clean up old rate limit entries
Returns an error if any safety check fails or if remediation fails.
func (*RemediatorRegistry) ResetCircuitBreaker ¶
func (r *RemediatorRegistry) ResetCircuitBreaker()
ResetCircuitBreaker manually resets the circuit breaker to closed state. This is primarily useful for testing or manual intervention.
func (*RemediatorRegistry) SetCircuitBreakerConfig ¶
func (r *RemediatorRegistry) SetCircuitBreakerConfig(config CircuitBreakerConfig) error
SetCircuitBreakerConfig updates the circuit breaker configuration. This can be called at runtime to adjust circuit breaker behavior.
func (*RemediatorRegistry) SetDryRun ¶
func (r *RemediatorRegistry) SetDryRun(dryRun bool)
SetDryRun enables or disables dry-run mode. In dry-run mode, remediations are not actually executed but all other logic (rate limiting, circuit breaker, history) still runs.
func (*RemediatorRegistry) SetEventCreator ¶
func (r *RemediatorRegistry) SetEventCreator(eventCreator EventCreator, nodeName string)
SetEventCreator sets an optional event creator for persisting remediation attempts as Kubernetes events. The nodeName parameter specifies the node these events should be associated with.
func (*RemediatorRegistry) SetLogger ¶
func (r *RemediatorRegistry) SetLogger(logger Logger)
SetLogger sets an optional logger for the registry.
type RemediatorValidator ¶
type RemediatorValidator func(remediator types.Remediator) error
RemediatorValidator is a function that validates a remediator instance. This is optional but recommended for early validation.
type RuntimeConfig ¶
type RuntimeConfig struct {
// Operation specifies the runtime action to perform
Operation RuntimeOperation
// RuntimeType specifies which runtime to target (docker, containerd, crio, auto)
// If set to "auto", the remediator will detect the runtime automatically
RuntimeType RuntimeType
// VerifyAfter when true, verifies the operation succeeded
VerifyAfter bool
// DryRun when true, only simulates the action without executing it
DryRun bool
}
RuntimeConfig contains configuration for the runtime remediator.
type RuntimeExecutor ¶
type RuntimeExecutor interface {
// ExecuteCommand executes a command with the given arguments
ExecuteCommand(ctx context.Context, name string, args ...string) (string, error)
// IsRuntimeAvailable checks if a runtime is available on the system
IsRuntimeAvailable(ctx context.Context, runtime RuntimeType) (bool, error)
// GetSystemdServiceName returns the systemd service name for a runtime
GetSystemdServiceName(runtime RuntimeType) string
}
RuntimeExecutor defines the interface for executing runtime commands. This allows for mocking in tests.
type RuntimeOperation ¶
type RuntimeOperation string
RuntimeOperation defines the type of runtime operation to perform.
const (
// RuntimeRestartDaemon restarts the runtime daemon via systemd
RuntimeRestartDaemon RuntimeOperation = "restart-daemon"
// RuntimeCleanContainers cleans up stopped containers
RuntimeCleanContainers RuntimeOperation = "clean-containers"
// RuntimePruneVolumes prunes dangling volumes
RuntimePruneVolumes RuntimeOperation = "prune-volumes"
)
type RuntimeRemediator ¶
type RuntimeRemediator struct {
*BaseRemediator
// contains filtered or unexported fields
}
RuntimeRemediator remediates container runtime problems. It supports Docker, containerd, and CRI-O runtimes with auto-detection.
func NewRuntimeRemediator ¶
func NewRuntimeRemediator(config RuntimeConfig) (*RuntimeRemediator, error)
NewRuntimeRemediator creates a new runtime remediator with the given configuration.
func (*RuntimeRemediator) GetDetectedRuntime ¶
func (r *RuntimeRemediator) GetDetectedRuntime() RuntimeType
GetDetectedRuntime returns the detected runtime type (useful for testing).
func (*RuntimeRemediator) SetRuntimeExecutor ¶
func (r *RuntimeRemediator) SetRuntimeExecutor(executor RuntimeExecutor)
SetRuntimeExecutor sets a custom runtime executor (useful for testing).
type RuntimeType ¶
type RuntimeType string
RuntimeType defines the type of container runtime.
const (
// RuntimeDocker represents Docker container runtime
RuntimeDocker RuntimeType = "docker"
// RuntimeContainerd represents containerd container runtime
RuntimeContainerd RuntimeType = "containerd"
// RuntimeCRIO represents CRI-O container runtime
RuntimeCRIO RuntimeType = "crio"
// RuntimeAuto automatically detects the runtime type
RuntimeAuto RuntimeType = "auto"
)
type ScriptExecutor ¶
type ScriptExecutor interface {
// ExecuteScript executes a script with given arguments and environment
ExecuteScript(ctx context.Context, scriptPath string, args []string, env map[string]string, workingDir string) (stdout, stderr string, exitCode int, err error)
// CheckScriptSafety verifies the script exists and has proper permissions
CheckScriptSafety(scriptPath string) error
}
ScriptExecutor defines the interface for executing custom scripts. This allows for mocking in tests.
type SystemdConfig ¶
type SystemdConfig struct {
// Operation specifies the systemd action to perform (restart, stop, start, reload)
Operation SystemdOperation
// ServiceName is the name of the systemd service (e.g., "kubelet", "docker", "containerd")
ServiceName string
// VerifyStatus when true, verifies service is active after remediation
VerifyStatus bool
// VerifyTimeout is the maximum time to wait for service to become active after remediation
VerifyTimeout time.Duration
// DryRun when true, only simulates the action without executing it
DryRun bool
}
SystemdConfig contains configuration for the systemd remediator.
type SystemdExecutor ¶
type SystemdExecutor interface {
// ExecuteSystemctl executes a systemctl command with the given arguments
ExecuteSystemctl(ctx context.Context, args ...string) (string, error)
// IsActive checks if a service is currently active
IsActive(ctx context.Context, serviceName string) (bool, error)
}
SystemdExecutor defines the interface for executing systemd commands. This allows for mocking in tests.
type SystemdOperation ¶
type SystemdOperation string
SystemdOperation defines the type of systemd operation to perform.
const (
// SystemdRestart restarts the service
SystemdRestart SystemdOperation = "restart"
// SystemdStop stops the service
SystemdStop SystemdOperation = "stop"
// SystemdStart starts the service
SystemdStart SystemdOperation = "start"
// SystemdReload reloads the service configuration
SystemdReload SystemdOperation = "reload"
)
type SystemdRemediator ¶
type SystemdRemediator struct {
*BaseRemediator
// contains filtered or unexported fields
}
SystemdRemediator remediates problems by performing systemd service operations. It supports restarting, stopping, starting, and reloading systemd services like kubelet, docker, and containerd.
func NewSystemdRemediator ¶
func NewSystemdRemediator(config SystemdConfig) (*SystemdRemediator, error)
NewSystemdRemediator creates a new systemd remediator with the given configuration.
func (*SystemdRemediator) SetSystemdExecutor ¶
func (r *SystemdRemediator) SetSystemdExecutor(executor SystemdExecutor)
SetSystemdExecutor sets a custom systemd executor (useful for testing).