upgrade

package
v1.4.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Nov 17, 2022 License: Apache-2.0 Imports: 16 Imported by: 0

Documentation

Index

Constants

View Source
const (
	// UpgradeStateAnnotation is the key of the node annotation that holds the node's upgrade state
	// (one of the UpgradeState* values below).
	UpgradeStateAnnotation = "nvidia.com/ofed-upgrade-state"

	// OfedDriverLabel and OfedUpgradeSkipDrainLabel are label keys used by the upgrade flow.
	// NOTE(review): presumably the former identifies OFED driver pods and the latter marks
	// objects that must be skipped during node drain - confirm against the callers.
	OfedDriverLabel           = "nvidia.com/ofed-driver"
	OfedUpgradeSkipDrainLabel = "nvidia.com/ofed-upgrade.skip-drain"

	// UpgradeStateUnknown is the state of a node when the upgrade flow is disabled or the node
	// hasn't been processed yet
	UpgradeStateUnknown = ""
	// UpgradeStateDone is set when OFED POD is up to date and running on the node, the node is schedulable
	UpgradeStateDone = "upgrade-done"
	// UpgradeStateUpgradeRequired is set when OFED POD on the node is not up-to-date and requires an upgrade.
	// No actions are performed at this stage
	UpgradeStateUpgradeRequired = "upgrade-required"
	// UpgradeStateDrain is set when the node is scheduled for drain. After the drain the state is changed
	// either to UpgradeStatePodRestart or UpgradeStateDrainFailed
	UpgradeStateDrain = "drain"
	// UpgradeStatePodRestart is set when the OFED POD on the node is scheduled for restart.
	// After the restart the state is changed to UpgradeStateDone.
	// NOTE(review): the ProcessPodRestartNodes documentation says the node moves to
	// UpgradeStateUncordonRequired once the restarted pod is Ready - confirm which is accurate.
	UpgradeStatePodRestart = "pod-restart"
	// UpgradeStateDrainFailed is set when drain on the node has failed. Manual interaction is required at this stage.
	UpgradeStateDrainFailed = "drain-failed"
	// UpgradeStateUncordonRequired is set when OFED POD on the node is up-to-date and has "Ready" status
	UpgradeStateUncordonRequired = "uncordon-required"
)

Variables

This section is empty.

Functions

This section is empty.

Types

type ClusterUpgradeState

type ClusterUpgradeState struct {
	NodeStates map[string][]*NodeUpgradeState
}

ClusterUpgradeState contains a snapshot of the OFED upgrade state in the cluster. It contains the OFED upgrade policy and mappings between nodes and their upgrade state. Nodes are grouped together with the driver POD running on them and the daemon set controlling this pod. This state is then used as an input for the ClusterUpgradeStateManager.

func NewClusterUpgradeState

func NewClusterUpgradeState() ClusterUpgradeState

NewClusterUpgradeState creates an empty ClusterUpgradeState object

type ClusterUpgradeStateManager

type ClusterUpgradeStateManager struct {
	K8sClient                client.Client
	K8sInterface             kubernetes.Interface
	Log                      logr.Logger
	DrainManager             DrainManager
	PodDeleteManager         PodDeleteManager
	UncordonManager          UncordonManager
	NodeUpgradeStateProvider NodeUpgradeStateProvider
}

ClusterUpgradeStateManager serves as a state machine for the ClusterUpgradeState. It processes each node and, based on its state, schedules the required jobs to move the node to the next state.

func NewClusterUpdateStateManager

func NewClusterUpdateStateManager(
	drainManager DrainManager,
	podDeleteManager PodDeleteManager,
	uncordonManager UncordonManager,
	nodeUpgradeStateProvider NodeUpgradeStateProvider,
	log logr.Logger,
	k8sClient client.Client,
	k8sInterface kubernetes.Interface) *ClusterUpgradeStateManager

NewClusterUpdateStateManager creates a new instance of ClusterUpgradeStateManager

func (*ClusterUpgradeStateManager) ApplyState

func (m *ClusterUpgradeStateManager) ApplyState(ctx context.Context,
	currentState *ClusterUpgradeState, upgradePolicy *v1alpha1.OfedUpgradePolicySpec) error

ApplyState receives a complete cluster upgrade state and, based on upgrade policy, processes each node's state. Based on the current state of the node, it is calculated if the node can be moved to the next state right now or whether any actions need to be scheduled for the node to move to the next state. The function is stateless and idempotent. If the error was returned before all nodes' states were processed, ApplyState would be called again and complete the processing - all the decisions are based on the input data.

func (*ClusterUpgradeStateManager) ProcessDoneOrUnknownNodes

func (m *ClusterUpgradeStateManager) ProcessDoneOrUnknownNodes(
	ctx context.Context, currentClusterState *ClusterUpgradeState, nodeStateName string) error

ProcessDoneOrUnknownNodes iterates over UpgradeStateDone or UpgradeStateUnknown nodes and determines whether each specific node should be in UpgradeStateUpgradeRequired or UpgradeStateDone state.

func (*ClusterUpgradeStateManager) ProcessDrainFailedNodes

func (m *ClusterUpgradeStateManager) ProcessDrainFailedNodes(
	ctx context.Context, currentClusterState *ClusterUpgradeState) error

ProcessDrainFailedNodes processes UpgradeStateDrainFailed nodes and checks whether the driver pod on the node has been successfully restarted. If the pod is in Ready state - moves the node to UpgradeStateUncordonRequired state.

func (*ClusterUpgradeStateManager) ProcessDrainNodes

func (m *ClusterUpgradeStateManager) ProcessDrainNodes(
	ctx context.Context, currentClusterState *ClusterUpgradeState, drainSpec *v1alpha1.DrainSpec) error

ProcessDrainNodes schedules UpgradeStateDrain nodes for drain. If drain is disabled by upgrade policy, moves the nodes straight to UpgradeStatePodRestart state.

func (*ClusterUpgradeStateManager) ProcessPodRestartNodes

func (m *ClusterUpgradeStateManager) ProcessPodRestartNodes(
	ctx context.Context, currentClusterState *ClusterUpgradeState) error

ProcessPodRestartNodes processes UpgradeStatePodRestart nodes and schedules driver pod restart for them. If the pod has already been restarted and is in Ready state - moves the node to UpgradeStateUncordonRequired state.

func (*ClusterUpgradeStateManager) ProcessUncordonRequiredNodes

func (m *ClusterUpgradeStateManager) ProcessUncordonRequiredNodes(
	ctx context.Context, currentClusterState *ClusterUpgradeState) error

ProcessUncordonRequiredNodes processes UpgradeStateUncordonRequired nodes, uncordons them, and moves them to UpgradeStateDone state.

func (*ClusterUpgradeStateManager) ProcessUpgradeRequiredNodes

func (m *ClusterUpgradeStateManager) ProcessUpgradeRequiredNodes(
	ctx context.Context, currentClusterState *ClusterUpgradeState, limit int) error

ProcessUpgradeRequiredNodes processes UpgradeStateUpgradeRequired nodes and moves them to UpgradeStateDrain until the limit on max parallel upgrades is reached.

type DrainConfiguration

type DrainConfiguration struct {
	Spec  *v1alpha1.DrainSpec
	Nodes []*corev1.Node
}

DrainConfiguration contains the drain specification and the list of nodes to schedule drain on

type DrainManager

type DrainManager interface {
	ScheduleNodesDrain(ctx context.Context, drainConfig *DrainConfiguration) error
}

DrainManager is an interface that allows scheduling node drains based on a DrainSpec.

type DrainManagerImpl

type DrainManagerImpl struct {
	// contains filtered or unexported fields
}

DrainManagerImpl implements the DrainManager interface and can drain nodes based on the received DrainConfiguration.

func NewDrainManager

func NewDrainManager(
	k8sInterface kubernetes.Interface,
	nodeUpgradeStateProvider NodeUpgradeStateProvider,
	log logr.Logger) *DrainManagerImpl

func (*DrainManagerImpl) ScheduleNodesDrain

func (m *DrainManagerImpl) ScheduleNodesDrain(ctx context.Context, drainConfig *DrainConfiguration) error

ScheduleNodesDrain receives DrainConfiguration and schedules drain for each node in the list. When the node gets scheduled, it's marked as being drained and therefore will not be scheduled for drain twice if the initial drain didn't complete yet. During the drain the node is cordoned first, and then pods on the node are evicted. If the drain is successful, the node moves to UpgradeStatePodRestart state, otherwise it moves to UpgradeStateDrainFailed state.

type KeyedMutex

type KeyedMutex struct {
	// contains filtered or unexported fields
}

KeyedMutex is a struct that provides per-key synchronized access.

func (*KeyedMutex) Lock

func (m *KeyedMutex) Lock(key string) UnlockFunc

Lock locks the mutex associated with the given key and returns an unlock function.

type NodeUpgradeState

type NodeUpgradeState struct {
	Node            *v1.Node
	DriverPod       *v1.Pod
	DriverDaemonSet *appsv1.DaemonSet
}

NodeUpgradeState contains a mapping between a node, the driver POD running on it, and the daemon set controlling this pod.

type NodeUpgradeStateProvider

type NodeUpgradeStateProvider interface {
	GetNode(ctx context.Context, nodeName string) (*v1.Node, error)
	ChangeNodeUpgradeState(ctx context.Context, node *v1.Node, newNodeState string) error
}

NodeUpgradeStateProvider allows for synchronized operations on node objects and ensures that a node obtained from the provider always has the up-to-date upgrade state.

func NewNodeUpgradeStateProvider

func NewNodeUpgradeStateProvider(k8sClient client.Client, log logr.Logger) NodeUpgradeStateProvider

type NodeUpgradeStateProviderImpl

type NodeUpgradeStateProviderImpl struct {
	K8sClient client.Client
	Log       logr.Logger
	// contains filtered or unexported fields
}

func (*NodeUpgradeStateProviderImpl) ChangeNodeUpgradeState

func (p *NodeUpgradeStateProviderImpl) ChangeNodeUpgradeState(
	ctx context.Context, node *v1.Node, newNodeState string) error

ChangeNodeUpgradeState patches a given v1.Node object and updates its UpgradeStateAnnotation with a given value. The function then waits for the operator cache to get updated.

func (*NodeUpgradeStateProviderImpl) GetNode

func (p *NodeUpgradeStateProviderImpl) GetNode(ctx context.Context, nodeName string) (*v1.Node, error)

type PodDeleteManager

type PodDeleteManager interface {
	SchedulePodsRestart(context.Context, []*corev1.Pod) error
}

PodDeleteManager is an interface that allows scheduling driver pod restarts.

type PodDeleteManagerImpl

type PodDeleteManagerImpl struct {
	K8sClient client.Client

	Log logr.Logger
}

PodDeleteManagerImpl implements PodDeleteManager interface and can restart pods by deleting them

func NewPodDeleteManager

func NewPodDeleteManager(k8sClient client.Client, log logr.Logger) *PodDeleteManagerImpl

func (*PodDeleteManagerImpl) SchedulePodsRestart

func (m *PodDeleteManagerImpl) SchedulePodsRestart(ctx context.Context, pods []*corev1.Pod) error

SchedulePodsRestart receives a list of pods and schedules them for deletion.

type StringSet

type StringSet struct {
	// contains filtered or unexported fields
}

func NewStringSet

func NewStringSet() *StringSet

func (*StringSet) Add

func (s *StringSet) Add(item string)

Add item to set

func (*StringSet) Clear

func (s *StringSet) Clear()

Clear removes all items from the set

func (*StringSet) Has

func (s *StringSet) Has(item string) bool

Has reports whether the item exists in the set.

func (*StringSet) Remove

func (s *StringSet) Remove(item string)

Remove deletes the specified item from the set

type UncordonManager

type UncordonManager interface {
	CordonOrUncordonNode(ctx context.Context, node *corev1.Node, desired bool) error
}

UncordonManager is an interface that allows uncordoning nodes.

type UncordonManagerImpl

type UncordonManagerImpl struct {
	// contains filtered or unexported fields
}

func NewUncordonManager

func NewUncordonManager(k8sInterface kubernetes.Interface, log logr.Logger) *UncordonManagerImpl

func (*UncordonManagerImpl) CordonOrUncordonNode

func (m *UncordonManagerImpl) CordonOrUncordonNode(ctx context.Context, node *corev1.Node, desired bool) error

type UnlockFunc

type UnlockFunc = func()

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL