controller

package
v1.44.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Sep 17, 2025 License: Apache-2.0 Imports: 57 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
var (
	// Kill switch to avoid creating too many cloud vendor nodes
	// Controlled by /api/provision?enable=true/false
	ProvisioningToggle = true

	// Nodes currently being created; the next round of capacity checks should take their assumed resources into account.
	// The first-level map key is the pool name, the second-level key is the GPUClaim name.
	PendingGPUNodeClaim map[string]map[string]tfv1.Resource

	// Nodes currently being deleted; deletions must be serialized and processed one round at a time.
	// The map key is the pool name, the value is the list of GPUNode names.
	PendingDeletionGPUNodes map[string][]string
)

Functions

func Node added in v1.40.0

func Node(options NodeOptions) *corev1.Node

Node creates a mock node from the given options.

func SetTestModeCompactionPeriod added in v1.41.0

func SetTestModeCompactionPeriod()

func UpdateK8SNodeSelectorHash added in v1.44.0

func UpdateK8SNodeSelectorHash(ctx context.Context, k8sClient client.Client, node *corev1.Node, hash string) error

Types

type FakeNodeClaimReconciler added in v1.40.0

type FakeNodeClaimReconciler struct {
	Scheme *runtime.Scheme
	// contains filtered or unexported fields
}

func (*FakeNodeClaimReconciler) Reconcile added in v1.40.0

func (r *FakeNodeClaimReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)

func (*FakeNodeClaimReconciler) SetupWithManager added in v1.40.0

func (r *FakeNodeClaimReconciler) SetupWithManager(mgr ctrl.Manager) error

type GPUNodeClaimReconciler added in v1.41.0

type GPUNodeClaimReconciler struct {
	client.Client
	Scheme   *runtime.Scheme
	Recorder record.EventRecorder
}

GPUNodeClaimReconciler reconciles a GPUNodeClaim object

func (*GPUNodeClaimReconciler) Reconcile added in v1.41.0

func (r *GPUNodeClaimReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)

GPUNodeClaim is responsible for creating cloud vendor GPU nodes

func (*GPUNodeClaimReconciler) SetupWithManager added in v1.41.0

func (r *GPUNodeClaimReconciler) SetupWithManager(mgr ctrl.Manager) error

SetupWithManager sets up the controller with the Manager.

type GPUNodeClassReconciler

type GPUNodeClassReconciler struct {
	client.Client
	Scheme *runtime.Scheme
}

GPUNodeClassReconciler reconciles a GPUNodeClass object

func (*GPUNodeClassReconciler) Reconcile

func (r *GPUNodeClassReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)

Reconcile GPU node classes

func (*GPUNodeClassReconciler) SetupWithManager

func (r *GPUNodeClassReconciler) SetupWithManager(mgr ctrl.Manager) error

SetupWithManager sets up the controller with the Manager.

type GPUNodeReconciler

type GPUNodeReconciler struct {
	client.Client
	Scheme    *runtime.Scheme
	Recorder  record.EventRecorder
	Allocator *gpuallocator.GpuAllocator
}

GPUNodeReconciler reconciles a GPUNode object

func (*GPUNodeReconciler) Reconcile

func (r *GPUNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)

Reconcile GPU nodes

func (*GPUNodeReconciler) SetupWithManager

func (r *GPUNodeReconciler) SetupWithManager(mgr ctrl.Manager) error

SetupWithManager sets up the controller with the Manager.

type GPUPoolCompactionReconciler

type GPUPoolCompactionReconciler struct {
	client.Client
	Scheme   *runtime.Scheme
	Recorder record.EventRecorder

	Allocator *gpuallocator.GpuAllocator
	// contains filtered or unexported fields
}

GPUPoolCompactionReconciler reconciles a GPUPool object

func (*GPUPoolCompactionReconciler) Reconcile

func (*GPUPoolCompactionReconciler) SetupWithManager

func (r *GPUPoolCompactionReconciler) SetupWithManager(mgr ctrl.Manager) error

SetupWithManager sets up the controller with the Manager.

type GPUPoolReconciler

type GPUPoolReconciler struct {
	client.Client

	LastProcessedItems sync.Map

	Scheme   *runtime.Scheme
	Recorder record.EventRecorder
}

GPUPoolReconciler reconciles a GPUPool object

func (*GPUPoolReconciler) Reconcile

func (r *GPUPoolReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)

Reconcile GPU pools

func (*GPUPoolReconciler) SetupWithManager

func (r *GPUPoolReconciler) SetupWithManager(mgr ctrl.Manager, addLimiter bool) error

SetupWithManager sets up the controller with the Manager.

type GPUReconciler

type GPUReconciler struct {
	client.Client
	Scheme *runtime.Scheme
}

GPUReconciler reconciles a GPU object

func (*GPUReconciler) Reconcile

func (r *GPUReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)

Reconcile is part of the main kubernetes reconciliation loop which aims to move the current state of the cluster closer to the desired state.

func (*GPUReconciler) SetupWithManager

func (r *GPUReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error

SetupWithManager sets up the controller with the Manager.

type GPUResourceQuotaReconciler added in v1.34.6

type GPUResourceQuotaReconciler struct {
	client.Client
	Scheme     *runtime.Scheme
	Recorder   record.EventRecorder
	QuotaStore *quota.QuotaStore
}

GPUResourceQuotaReconciler reconciles a GPUResourceQuota object

func (*GPUResourceQuotaReconciler) Reconcile added in v1.34.6

Reconcile is part of the main kubernetes reconciliation loop which aims to move the current state of the cluster closer to the desired state.

func (*GPUResourceQuotaReconciler) SetupWithManager added in v1.34.6

func (r *GPUResourceQuotaReconciler) SetupWithManager(mgr ctrl.Manager) error

SetupWithManager sets up the controller with the Manager.

type NodeOptions added in v1.40.0

type NodeOptions struct {
	metav1.ObjectMeta
	ReadyStatus    corev1.ConditionStatus
	ReadyReason    string
	Conditions     []corev1.NodeCondition
	Unschedulable  bool
	ProviderID     string
	Taints         []corev1.Taint
	Allocatable    corev1.ResourceList
	Capacity       corev1.ResourceList
	OwnerReference []metav1.OwnerReference
}

type NodeReconciler

type NodeReconciler struct {
	client.Client
	Scheme   *runtime.Scheme
	Recorder record.EventRecorder
}

NodeReconciler reconciles a Node object

func (*NodeReconciler) Reconcile

func (r *NodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)

Reconcile k8s nodes to create and update GPUNode

func (*NodeReconciler) SetupWithManager

func (r *NodeReconciler) SetupWithManager(mgr ctrl.Manager) error

SetupWithManager sets up the controller with the Manager.

type PodReconciler

type PodReconciler struct {
	client.Client
	Scheme        *runtime.Scheme
	Allocator     *gpuallocator.GpuAllocator
	PortAllocator *portallocator.PortAllocator
}

PodReconciler reconciles a Pod object

func (*PodReconciler) Reconcile

func (r *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)

Add a GPU connection for Pods using GPUs. The TensorFusion connection has to be created here because the Pod UID is not available in the MutatingWebhook.

func (*PodReconciler) SetupWithManager

func (r *PodReconciler) SetupWithManager(mgr ctrl.Manager) error

SetupWithManager sets up the controller with the Manager.

type SchedulingConfigTemplateReconciler

type SchedulingConfigTemplateReconciler struct {
	client.Client
	Scheme *runtime.Scheme
}

SchedulingConfigTemplateReconciler reconciles a SchedulingConfigTemplate object

func (*SchedulingConfigTemplateReconciler) Reconcile

When the template is deleted, check whether any GPU pool is still using it; if so, add a warning event and requeue. When it is updated, trigger re-scheduling.

func (*SchedulingConfigTemplateReconciler) SetupWithManager

func (r *SchedulingConfigTemplateReconciler) SetupWithManager(mgr ctrl.Manager) error

SetupWithManager sets up the controller with the Manager.

type TensorFusionClusterReconciler

type TensorFusionClusterReconciler struct {
	client.Client
	Scheme          *runtime.Scheme
	Recorder        record.EventRecorder
	MetricsRecorder *metrics.MetricsRecorder

	LastProcessedItems sync.Map
}

TensorFusionClusterReconciler reconciles a TensorFusionCluster object

func (*TensorFusionClusterReconciler) Reconcile

Reconcile a TensorFusionCluster object, create and monitor GPU Pool, managing cluster level component versions

func (*TensorFusionClusterReconciler) SetupWithManager

func (r *TensorFusionClusterReconciler) SetupWithManager(mgr ctrl.Manager, addLimiter bool) error

SetupWithManager sets up the controller with the Manager.

type TensorFusionConnectionReconciler

type TensorFusionConnectionReconciler struct {
	client.Client
	Scheme   *runtime.Scheme
	Recorder record.EventRecorder
}

TensorFusionConnectionReconciler reconciles a TensorFusionConnection object

func (*TensorFusionConnectionReconciler) Reconcile

Add and monitor GPU worker Pod for a TensorFusionConnection

func (*TensorFusionConnectionReconciler) SetupWithManager

func (r *TensorFusionConnectionReconciler) SetupWithManager(mgr ctrl.Manager) error

SetupWithManager sets up the controller with the Manager.

type TensorFusionWorkloadReconciler

type TensorFusionWorkloadReconciler struct {
	client.Client
	Scheme        *runtime.Scheme
	Recorder      record.EventRecorder
	PortAllocator *portallocator.PortAllocator
}

TensorFusionWorkloadReconciler reconciles a TensorFusionWorkload object

func (*TensorFusionWorkloadReconciler) Reconcile

TensorFusionWorkload Reconciler

func (*TensorFusionWorkloadReconciler) SetupWithManager

func (r *TensorFusionWorkloadReconciler) SetupWithManager(mgr ctrl.Manager) error

SetupWithManager sets up the controller with the Manager.

type WorkloadProfileReconciler added in v1.23.7

type WorkloadProfileReconciler struct {
	client.Client
	Scheme *runtime.Scheme
}

WorkloadProfileReconciler reconciles a WorkloadProfile object

func (*WorkloadProfileReconciler) Reconcile added in v1.23.7

WorkloadProfile is a template to be referred by TensorFusionWorkload, no logic for reconcile

func (*WorkloadProfileReconciler) SetupWithManager added in v1.23.7

func (r *WorkloadProfileReconciler) SetupWithManager(mgr ctrl.Manager) error

SetupWithManager sets up the controller with the Manager.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL