Documentation
¶
Index ¶
- Variables
- func Node(options NodeOptions) *corev1.Node
- func SetTestModeCompactionPeriod()
- func UpdateK8SNodeSelectorHash(ctx context.Context, k8sClient client.Client, node *corev1.Node, hash string) error
- type FakeNodeClaimReconciler
- type GPUNodeClaimReconciler
- type GPUNodeClassReconciler
- type GPUNodeReconciler
- type GPUPoolCompactionReconciler
- type GPUPoolReconciler
- type GPUReconciler
- type GPUResourceQuotaReconciler
- type NodeOptions
- type NodeReconciler
- type PodReconciler
- type SchedulingConfigTemplateReconciler
- type TensorFusionClusterReconciler
- type TensorFusionConnectionReconciler
- type TensorFusionWorkloadReconciler
- type WorkloadProfileReconciler
Constants ¶
This section is empty.
Variables ¶
var ( // Kill switch to avoid creating too many cloud vendor nodes // Controlled by /api/provision?enable=true/false ProvisioningToggle = true // Nodes being created; the next round of capacity checks should consider the assumed resources // map key is pool name, second level is GPUClaim name PendingGPUNodeClaim map[string]map[string]tfv1.Resource // Nodes being deleted; deletion must be serialized, one round at a time // map key is pool name, value is GPUNode name list PendingDeletionGPUNodes map[string][]string )
Functions ¶
func SetTestModeCompactionPeriod ¶ added in v1.41.0
func SetTestModeCompactionPeriod()
Types ¶
type FakeNodeClaimReconciler ¶ added in v1.40.0
type FakeNodeClaimReconciler struct { Scheme *runtime.Scheme // contains filtered or unexported fields }
func (*FakeNodeClaimReconciler) SetupWithManager ¶ added in v1.40.0
func (r *FakeNodeClaimReconciler) SetupWithManager(mgr ctrl.Manager) error
type GPUNodeClaimReconciler ¶ added in v1.41.0
type GPUNodeClaimReconciler struct { client.Client Scheme *runtime.Scheme Recorder record.EventRecorder }
GPUNodeClaimReconciler reconciles a GPUNodeClaim object
func (*GPUNodeClaimReconciler) Reconcile ¶ added in v1.41.0
func (r *GPUNodeClaimReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)
GPUNodeClaim is responsible for creating cloud vendor GPU nodes
func (*GPUNodeClaimReconciler) SetupWithManager ¶ added in v1.41.0
func (r *GPUNodeClaimReconciler) SetupWithManager(mgr ctrl.Manager) error
SetupWithManager sets up the controller with the Manager.
type GPUNodeClassReconciler ¶
GPUNodeClassReconciler reconciles a GPUNodeClass object
func (*GPUNodeClassReconciler) Reconcile ¶
func (r *GPUNodeClassReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)
Reconcile GPU node classes
func (*GPUNodeClassReconciler) SetupWithManager ¶
func (r *GPUNodeClassReconciler) SetupWithManager(mgr ctrl.Manager) error
SetupWithManager sets up the controller with the Manager.
type GPUNodeReconciler ¶
type GPUNodeReconciler struct { client.Client Scheme *runtime.Scheme Recorder record.EventRecorder Allocator *gpuallocator.GpuAllocator }
GPUNodeReconciler reconciles a GPUNode object
func (*GPUNodeReconciler) SetupWithManager ¶
func (r *GPUNodeReconciler) SetupWithManager(mgr ctrl.Manager) error
SetupWithManager sets up the controller with the Manager.
type GPUPoolCompactionReconciler ¶
type GPUPoolCompactionReconciler struct { client.Client Scheme *runtime.Scheme Recorder record.EventRecorder Allocator *gpuallocator.GpuAllocator // contains filtered or unexported fields }
GPUPoolCompactionReconciler reconciles a GPUPool object
func (*GPUPoolCompactionReconciler) SetupWithManager ¶
func (r *GPUPoolCompactionReconciler) SetupWithManager(mgr ctrl.Manager) error
SetupWithManager sets up the controller with the Manager.
type GPUPoolReconciler ¶
type GPUPoolReconciler struct { client.Client LastProcessedItems sync.Map Scheme *runtime.Scheme Recorder record.EventRecorder }
GPUPoolReconciler reconciles a GPUPool object
func (*GPUPoolReconciler) SetupWithManager ¶
func (r *GPUPoolReconciler) SetupWithManager(mgr ctrl.Manager, addLimiter bool) error
SetupWithManager sets up the controller with the Manager.
type GPUReconciler ¶
GPUReconciler reconciles a GPU object
func (*GPUReconciler) Reconcile ¶
Reconcile is part of the main kubernetes reconciliation loop which aims to move the current state of the cluster closer to the desired state.
func (*GPUReconciler) SetupWithManager ¶
SetupWithManager sets up the controller with the Manager.
type GPUResourceQuotaReconciler ¶ added in v1.34.6
type GPUResourceQuotaReconciler struct { client.Client Scheme *runtime.Scheme Recorder record.EventRecorder QuotaStore *quota.QuotaStore }
GPUResourceQuotaReconciler reconciles a GPUResourceQuota object
func (*GPUResourceQuotaReconciler) Reconcile ¶ added in v1.34.6
func (r *GPUResourceQuotaReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)
Reconcile is part of the main kubernetes reconciliation loop which aims to move the current state of the cluster closer to the desired state.
func (*GPUResourceQuotaReconciler) SetupWithManager ¶ added in v1.34.6
func (r *GPUResourceQuotaReconciler) SetupWithManager(mgr ctrl.Manager) error
SetupWithManager sets up the controller with the Manager.
type NodeOptions ¶ added in v1.40.0
type NodeOptions struct { metav1.ObjectMeta ReadyStatus corev1.ConditionStatus ReadyReason string Conditions []corev1.NodeCondition Unschedulable bool ProviderID string Taints []corev1.Taint Allocatable corev1.ResourceList Capacity corev1.ResourceList OwnerReference []metav1.OwnerReference }
type NodeReconciler ¶
NodeReconciler reconciles a Node object
func (*NodeReconciler) SetupWithManager ¶
func (r *NodeReconciler) SetupWithManager(mgr ctrl.Manager) error
SetupWithManager sets up the controller with the Manager.
type PodReconciler ¶
type PodReconciler struct { client.Client Scheme *runtime.Scheme Allocator *gpuallocator.GpuAllocator PortAllocator *portallocator.PortAllocator }
PodReconciler reconciles a Pod object
func (*PodReconciler) Reconcile ¶
Add GPU connection for Pods using GPU. The TensorFusion connection has to be created here because the pod UID is not available in the MutatingWebhook.
func (*PodReconciler) SetupWithManager ¶
func (r *PodReconciler) SetupWithManager(mgr ctrl.Manager) error
SetupWithManager sets up the controller with the Manager.
type SchedulingConfigTemplateReconciler ¶
SchedulingConfigTemplateReconciler reconciles a SchedulingConfigTemplate object
func (*SchedulingConfigTemplateReconciler) Reconcile ¶
func (r *SchedulingConfigTemplateReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)
When deleted, need to check if any GPU pool is using this template; if so, add a warning event and requeue. When updated, trigger the re-scheduling.
func (*SchedulingConfigTemplateReconciler) SetupWithManager ¶
func (r *SchedulingConfigTemplateReconciler) SetupWithManager(mgr ctrl.Manager) error
SetupWithManager sets up the controller with the Manager.
type TensorFusionClusterReconciler ¶
type TensorFusionClusterReconciler struct { client.Client Scheme *runtime.Scheme Recorder record.EventRecorder MetricsRecorder *metrics.MetricsRecorder LastProcessedItems sync.Map }
TensorFusionClusterReconciler reconciles a TensorFusionCluster object
func (*TensorFusionClusterReconciler) Reconcile ¶
func (r *TensorFusionClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)
Reconcile a TensorFusionCluster object, create and monitor GPU Pool, managing cluster level component versions
func (*TensorFusionClusterReconciler) SetupWithManager ¶
func (r *TensorFusionClusterReconciler) SetupWithManager(mgr ctrl.Manager, addLimiter bool) error
SetupWithManager sets up the controller with the Manager.
type TensorFusionConnectionReconciler ¶
type TensorFusionConnectionReconciler struct { client.Client Scheme *runtime.Scheme Recorder record.EventRecorder }
TensorFusionConnectionReconciler reconciles a TensorFusionConnection object
func (*TensorFusionConnectionReconciler) Reconcile ¶
func (r *TensorFusionConnectionReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)
Add and monitor GPU worker Pod for a TensorFusionConnection
func (*TensorFusionConnectionReconciler) SetupWithManager ¶
func (r *TensorFusionConnectionReconciler) SetupWithManager(mgr ctrl.Manager) error
SetupWithManager sets up the controller with the Manager.
type TensorFusionWorkloadReconciler ¶
type TensorFusionWorkloadReconciler struct { client.Client Scheme *runtime.Scheme Recorder record.EventRecorder PortAllocator *portallocator.PortAllocator }
TensorFusionWorkloadReconciler reconciles a TensorFusionWorkload object
func (*TensorFusionWorkloadReconciler) Reconcile ¶
func (r *TensorFusionWorkloadReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)
TensorFusionWorkload Reconciler
func (*TensorFusionWorkloadReconciler) SetupWithManager ¶
func (r *TensorFusionWorkloadReconciler) SetupWithManager(mgr ctrl.Manager) error
SetupWithManager sets up the controller with the Manager.
type WorkloadProfileReconciler ¶ added in v1.23.7
WorkloadProfileReconciler reconciles a WorkloadProfile object
func (*WorkloadProfileReconciler) Reconcile ¶ added in v1.23.7
func (r *WorkloadProfileReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)
WorkloadProfile is a template to be referred by TensorFusionWorkload, no logic for reconcile
func (*WorkloadProfileReconciler) SetupWithManager ¶ added in v1.23.7
func (r *WorkloadProfileReconciler) SetupWithManager(mgr ctrl.Manager) error
SetupWithManager sets up the controller with the Manager.
Source Files
¶
- fake_node_claim_contoller.go
- gpu_controller.go
- gpunode_controller.go
- gpunodeclaim_controller.go
- gpunodeclass_controller.go
- gpupool_compaction_controller.go
- gpupool_controller.go
- gpupool_node_provision.go
- gpuresourcequota_controller.go
- node_controller.go
- pod_controller.go
- schedulingconfigtemplate_controller.go
- tensorfusioncluster_controller.go
- tensorfusionconnection_controller.go
- tensorfusionworkload_controller.go
- workloadprofile_controller.go