utils

package
v1.44.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Sep 17, 2025 License: Apache-2.0 Imports: 30 Imported by: 0

Documentation

Index

Constants

View Source
const (
	WatchConfigFileChangesInterval = 15 * time.Second

	ServiceAccountTokenPath = "/var/run/secrets/kubernetes.io/serviceaccount/token"
)

Variables

View Source
var ErrNextLoop = errors.New("stop this loop and return the associated Result object")

ErrNextLoop is not a real error. It forces the current reconciliation loop to stop and return the associated Result object.

View Source
var ErrTerminateLoop = errors.New("stop this loop and do not requeue")

ErrTerminateLoop is not a real error. It forces the current reconciliation loop to stop without requeueing.

View Source
var GPUResourceNames = []corev1.ResourceName{
	"nvidia.com/gpu",
	"amd.com/gpu",
}
View Source
var IsTestMode = false

Functions

func AddOrOverrideTFClientMissingAnnotationsBeforePatch added in v1.37.0

func AddOrOverrideTFClientMissingAnnotationsBeforePatch(pod *v1.Pod, tfInfo TensorFusionInfo)

func AddTFDefaultClientConfBeforePatch added in v1.37.0

func AddTFDefaultClientConfBeforePatch(
	ctx context.Context,
	pod *v1.Pod,
	pool *tfv1.GPUPool,
	tfInfo TensorFusionInfo,
	injectContainerIndices []int,
)

func AddTFHypervisorConfAfterTemplate added in v1.37.0

func AddTFHypervisorConfAfterTemplate(ctx context.Context, spec *v1.PodSpec, pool *tfv1.GPUPool)

func AddTFNodeDiscoveryConfAfterTemplate added in v1.37.0

func AddTFNodeDiscoveryConfAfterTemplate(ctx context.Context, tmpl *v1.PodTemplateSpec, pool *tfv1.GPUPool, gpuNodeName string)

func AddWorkerConfAfterTemplate added in v1.37.0

func AddWorkerConfAfterTemplate(ctx context.Context, spec *v1.PodSpec, workerConfig *tfv1.WorkerConfig, hypervisorConfig *tfv1.HypervisorConfig, workload *tfv1.TensorFusionWorkload) string

func AppendTFWorkerLabelsAndAnnotationsAfterTemplate added in v1.37.0

func AppendTFWorkerLabelsAndAnnotationsAfterTemplate(
	podTmpl *v1.PodTemplate,
	workload *tfv1.TensorFusionWorkload,
	containerName string,
) (map[string]string, map[string]string)

func CalculateExponentialBackoffWithJitter

func CalculateExponentialBackoffWithJitter(retryCount int64) time.Duration

func CompareAndGetObjectHash added in v1.28.0

func CompareAndGetObjectHash(hash string, obj ...any) (bool, string)

func CurrentIP added in v1.33.4

func CurrentIP() string

func CurrentNamespace

func CurrentNamespace() string

func EqualConditionsDisregardTransitionTime added in v1.35.0

func EqualConditionsDisregardTransitionTime(a, b []metav1.Condition) bool

func EscapeJSONPointer added in v1.26.3

func EscapeJSONPointer(s string) string

EscapeJSONPointer escapes a string according to the JSON Pointer spec (RFC 6901). It escapes '~' as '~0' and '/' as '~1'.

func ExtractPoolNameFromNodeLabel added in v1.33.1

func ExtractPoolNameFromNodeLabel(node *tfv1.GPUNode) string

func FindFirstLevelOwnerReference added in v1.35.0

func FindFirstLevelOwnerReference(obj metav1.Object) *metav1.OwnerReference

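FindFirstLevelOwnerReference finds the first-level (direct) owner reference for a given object (e.g. Pod). Unlike FindRootOwnerReference, it takes no client and therefore does not walk up the owner chain.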

func FindRootOwnerReference added in v1.26.9

func FindRootOwnerReference(ctx context.Context, c client.Client, namespace string, obj metav1.Object) (*metav1.OwnerReference, error)

FindRootOwnerReference recursively finds the root owner reference for a given object (e.g. Pod).

func GetEnvOrDefault added in v1.34.0

func GetEnvOrDefault(key, defaultValue string) string

func GetGPUResource added in v1.35.0

func GetGPUResource(pod *corev1.Pod, isRequest bool) (tfv1.Resource, error)

func GetInitialGPUNodeSelector added in v1.44.0

func GetInitialGPUNodeSelector() []string

func GetObjectHash

func GetObjectHash(objs ...any) string

GetObjectHash generates a shorter FNV-1a hash for one or more objects

func GetSelfServiceAccountNameFull added in v1.37.0

func GetSelfServiceAccountNameFull() string

func GetSelfServiceAccountNameShort added in v1.37.0

func GetSelfServiceAccountNameShort() string

func HandleFinalizer

func HandleFinalizer[T client.Object](
	ctx context.Context,
	obj T,
	r client.Client,
	deleteHook func(context.Context, T) (bool, error),
) (shouldReturn bool, err error)

HandleFinalizer ensures proper finalizer management for Kubernetes resources. It automatically adds the finalizer when needed, and removes it after successful cleanup. Returns (shouldReturn, err):

  • shouldReturn: true if the caller should immediately return and wait for the next reconcile.
  • err: any error encountered during update or deleteHook.

func HasGPUResourceRequest added in v1.39.0

func HasGPUResourceRequest(pod *corev1.Pod) bool

func InitServiceAccountConfig added in v1.36.1

func InitServiceAccountConfig()

func IsPodConditionTrue

func IsPodConditionTrue(conditions []corev1.PodCondition, conditionType corev1.PodConditionType) bool

func IsPodStopped added in v1.37.0

func IsPodStopped(pod *corev1.Pod) bool

func IsProgressiveMigration added in v1.39.0

func IsProgressiveMigration() bool

func IsTensorFusionPod added in v1.39.0

func IsTensorFusionPod(pod *corev1.Pod) bool

func IsTensorFusionWorker added in v1.39.1

func IsTensorFusionWorker(pod *corev1.Pod) bool

func LoadConfigFromFile added in v1.34.0

func LoadConfigFromFile[T any](filename string, target *T) error

func NewShortID added in v1.35.0

func NewShortID(length int) string

func ReadServiceAccountToken added in v1.36.1

func ReadServiceAccountToken() string

func SetProgressiveMigration added in v1.39.0

func SetProgressiveMigration(isProgressiveMigration bool)

For testing purposes only.

func WatchConfigFileChanges added in v1.34.0

func WatchConfigFileChanges(ctx context.Context, filename string) (<-chan []byte, error)

WatchConfigFileChanges watches a file for changes and sends the file content through a channel when changes are detected. The channel receives the raw file content as []byte whenever the file is modified. The watch interval is WatchConfigFileChangesInterval (15 seconds).

Types

type TensorFusionInfo added in v1.37.0

type TensorFusionInfo struct {
	Profile         *tfv1.WorkloadProfileSpec
	DynamicReplicas bool
	EnabledReplicas *int32
	WorkloadName    string
	ContainerNames  []string
	GenWorkload     bool

	// The Pod mutating webhook sometimes cannot get the Pod UID,
	// so the pod controller needs to set the owner reference instead
	PendingSetPodAsOwner bool
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL