Documentation ¶
Overview ¶
Package v1 is the v1 version of the API.
Index ¶
- Constants
- Variables
- func CRDName() string
- func Int32(v int32) *int32
- func Kind(kind string) schema.GroupKind
- func RegisterDefaults(scheme *runtime.Scheme) error
- func Resource(resource string) schema.GroupResource
- func SetDefaults_AITrainingJob(job *AITrainingJob)
- func SetObjectDefaults_AITrainingJob(in *AITrainingJob)
- func SetObjectDefaults_AITrainingJobList(in *AITrainingJobList)
- type AITrainingJob
- type AITrainingJobList
- type CleanPodPolicy
- type EdlPolicy
- type EndingPolicy
- type FrameworkType
- type ReplicaName
- type ReplicaSpec
- type ReplicaStatus
- type RestartPolicy
- type RestartScope
- type RestartingExitCode
- type TrainingJobCondition
- type TrainingJobPhase
- type TrainingJobSpec
- type TrainingJobStatus
Constants ¶
// Controller name and string keys associated with training jobs.
// NOTE(review): the *Label names are presumably Kubernetes label keys set on
// the job's pods/services — confirm against the controller code.
const (
	ControllerName            = "TrainingJobOperator"
	TrainingJobReplicaName    = "TrainingJobReplicaName"
	TrainingJobReplicaIndex   = "TrainingJobReplicaIndex"
	TrainingJobNameLabel      = "TrainingJobName"
	TrainingJobFrameworkLabel = "FrameworkType"
	GroupNameLabel            = "GroupName"
	TrainingJobPriorityLabel  = "priority"
)
// TRAININGJOB_* string constants.
// NOTE(review): presumably the names of environment variables injected into
// replica containers — confirm where they are consumed.
const (
	TrainingJobReplicaNameEnv         = "TRAININGJOB_REPLICA_NAME"
	TrainingJobReplicaIndexEnv        = "TRAININGJOB_REPLICA_INDEX"
	TrainingJobReplicaRestartCountEnv = "TRAININGJOB_REPLICA_RESTARTCOUNT"
	TrainingJobNameEnv                = "TRAININGJOB_NAME"
	TrainingJobNamespaceEnv           = "TRAININGJOB_NAMESPACE"
	TrainingJobServiceEnv             = "TRAININGJOB_SERVICE"
	TrainingJobPortEnv                = "TRAININGJOB_PORTS"
)
// Reason strings.
// NOTE(review): presumably used as event/condition reasons by the controller;
// the string values are emitted at runtime and must not be edited casually.
const (
	PodTemplateRestartPolicyReason = "SettedPodTemplateRestartPolicy"
	ExitedWithCodeReason           = "ExitedWithCode"
)
// Reason strings, one per job phase. The TrainingJobReason map associates each
// TrainingJobPhase with the matching constant from this block.
const (
	TrainingJobPendingReason     = "TrainingJobPending"
	TrainingJobCreatingReason    = "TrainingJobCreating"
	TrainingJobRunningReason     = "TrainingJobRunning"
	TrainingJobSucceededReason   = "TrainingJobSucceed"
	TrainingJobFailedReason      = "TrainingJobFailed"
	TrainingJobTimeoutReason     = "TrainingJobTimeout"
	TrainingJobRestartingReason  = "TrainingJobRestarting"
	TrainingJobTerminatingReason = "TrainingJobTerminating"
	TrainingJobPreemptedReason   = "TrainingJobPreempted"
	TrainingJobNodeFailReason    = "TrainingJobNodeFail"
)
// Default name prefixes; both share the value "aitj-".
// NOTE(review): presumably prepended to generated container and port names — confirm.
const (
	DefaultContainerPrefix = "aitj-"
	DefaultPortPrefix      = "aitj-"
)
// Identity of the custom resource: API group, version, kind, plural and short
// name. CRDGroupName and CRDGroupVersion feed SchemeGroupVersion; CRDKind feeds
// SchemeGroupVersionKind.
const (
	CRDGroupName    = "elasticdeeplearning.ai"
	CRDGroupVersion = "v1"
	CRDKind         = "AITrainingJob"
	CRDKindPlural   = "aitrainingjobs"
	CRDShortName    = "aitj"
)
// Allowed values for RestartPolicy (when a replica is restarted) and
// RestartScope (how much is restarted).
// NOTE(review): the precise behavior of each value is implemented by the
// controller and is not visible here.
const (
	RestartPolicyAlways                 RestartPolicy = "Always"
	RestartPolicyOnFailure              RestartPolicy = "OnFailure"
	RestartPolicyOnNodeFail             RestartPolicy = "OnNodeFail"
	RestartPolicyNever                  RestartPolicy = "Never"
	RestartPolicyExitCode               RestartPolicy = "ExitCode"
	RestartPolicyOnNodeFailWithExitCode RestartPolicy = "OnNodeFailWithExitCode"

	RestartScopeAll     RestartScope = "All"
	RestartScopeReplica RestartScope = "Replica"
	RestartScopePod     RestartScope = "Pod"
)
// Lifecycle phases of a training job.
// NOTE(review): only TrainingJobPhaseNone carries the explicit TrainingJobPhase
// type; every other name below is an untyped string constant because it has its
// own initializer without a type. Confirm whether that is intended.
const (
	// None means the job has been accepted by the system.
	TrainingJobPhaseNone TrainingJobPhase = ""
	// Pending means one or more of the pods/services has not been scheduled.
	TrainingJobPhasePending = "Pending"
	// Creating means all pods/services of this job have been successfully scheduled,
	// but one or more of the pods/services has not been launched.
	TrainingJobPhaseCreating = "Creating"
	// Running means all pods/services have been launched.
	TrainingJobPhaseRunning = "Running"
	// Succeed means all pods of this job reached phase success.
	TrainingJobPhaseSucceeded = "Succeed"
	// Failed means one or more pods of this job reached phase failed.
	TrainingJobPhaseFailed = "Failed"
	// Timeout means the job ran longer than the preset maximum run time.
	TrainingJobPhaseTimeout = "Timeout"
	// TODO: Restarting means the job is restarting.
	TrainingJobPhaseRestarting = "Restarting"
	// Terminating means the job has been terminated, but resources are not yet fully released.
	TrainingJobPhaseTerminating = "Terminating"
	// Preempted means the job has been preempted, and resources were fully released.
	TrainingJobPhasePreempted = "Preempted"
	// NodeFail means the node has failed.
	TrainingJobPhaseNodeFail = "NodeFail"
)
Variables ¶
var (
	// ErrorContainerStatus lists container status strings.
	// NOTE(review): presumably container waiting reasons that the controller
	// treats as errors — confirm at the call site.
	ErrorContainerStatus = []string{"CreateContainerConfigError", "CreateContainerError", "ImagePullBackOff", "ImageInspectError", "ErrImagePull", "ErrImageNeverPull", "RegistryUnavailable", "InvalidImageName"}
	// EndingPhases lists the Succeeded, Failed, Timeout, Preempted and NodeFail
	// phases (presumably the terminal phases of a job — confirm).
	EndingPhases = []TrainingJobPhase{
		TrainingJobPhaseSucceeded,
		TrainingJobPhaseFailed,
		TrainingJobPhaseTimeout,
		TrainingJobPhasePreempted,
		TrainingJobPhaseNodeFail,
	}
	// TrainingJobReason maps every phase to its reason string; the empty phase
	// maps to the empty string.
	TrainingJobReason = map[TrainingJobPhase]string{
		TrainingJobPhaseNone:        "",
		TrainingJobPhasePending:     TrainingJobPendingReason,
		TrainingJobPhaseCreating:    TrainingJobCreatingReason,
		TrainingJobPhaseRunning:     TrainingJobRunningReason,
		TrainingJobPhaseSucceeded:   TrainingJobSucceededReason,
		TrainingJobPhaseFailed:      TrainingJobFailedReason,
		TrainingJobPhaseTimeout:     TrainingJobTimeoutReason,
		TrainingJobPhaseRestarting:  TrainingJobRestartingReason,
		TrainingJobPhaseTerminating: TrainingJobTerminatingReason,
		TrainingJobPhasePreempted:   TrainingJobPreemptedReason,
		TrainingJobPhaseNodeFail:    TrainingJobNodeFailReason,
	}
)
var (
	// SchemeBuilder is the package's runtime.SchemeBuilder.
	SchemeBuilder runtime.SchemeBuilder
	// AddToScheme is the AddToScheme method value of localSchemeBuilder.
	// NOTE(review): localSchemeBuilder is unexported and not shown on this
	// page; presumably it points at SchemeBuilder — confirm in the source.
	AddToScheme = localSchemeBuilder.AddToScheme
)
// SchemeGroupVersion is the group/version built from CRDGroupName and CRDGroupVersion.
var SchemeGroupVersion = schema.GroupVersion{Group: CRDGroupName, Version: CRDGroupVersion}
SchemeGroupVersion is the group version used to register these objects.
// SchemeGroupVersionKind is SchemeGroupVersion qualified with the CRDKind kind.
var SchemeGroupVersionKind = SchemeGroupVersion.WithKind(CRDKind)
Functions ¶
func RegisterDefaults ¶
RegisterDefaults adds defaulter functions to the given scheme. Public to allow building arbitrary schemes. All generated defaulters are covering - they call all nested defaulters.
func Resource ¶
func Resource(resource string) schema.GroupResource
Resource takes an unqualified resource and returns a Group qualified GroupResource
func SetDefaults_AITrainingJob ¶
func SetDefaults_AITrainingJob(job *AITrainingJob)
SetDefaults_AITrainingJob sets any unspecified values to defaults.
func SetObjectDefaults_AITrainingJob ¶
func SetObjectDefaults_AITrainingJob(in *AITrainingJob)
func SetObjectDefaults_AITrainingJobList ¶
func SetObjectDefaults_AITrainingJobList(in *AITrainingJobList)
Types ¶
type AITrainingJob ¶
// AITrainingJob is the top-level resource object: type and object metadata
// plus the desired Spec and the observed Status.
type AITrainingJob struct {
	metav1.TypeMeta `json:",inline"`
	// ObjectMeta is metadata that all persisted resources must have, which includes all objects
	// users must create.
	metav1.ObjectMeta `json:"metadata,omitempty"`
	// Spec defines the specification of the desired behavior of the TrainingJob.
	Spec TrainingJobSpec `json:"spec,omitempty"`
	// Status is the most recently observed status of the TrainingJob.
	Status TrainingJobStatus `json:"status,omitempty"`
}
AITrainingJob is a specification for an AITrainingJob resource.
func (*AITrainingJob) DeepCopy ¶
func (in *AITrainingJob) DeepCopy() *AITrainingJob
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AITrainingJob.
func (*AITrainingJob) DeepCopyInto ¶
func (in *AITrainingJob) DeepCopyInto(out *AITrainingJob)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (*AITrainingJob) DeepCopyObject ¶
func (in *AITrainingJob) DeepCopyObject() runtime.Object
DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
type AITrainingJobList ¶
// AITrainingJobList is a list object: type metadata, list metadata and the
// contained AITrainingJob items.
type AITrainingJobList struct {
	metav1.TypeMeta `json:",inline"`
	metav1.ListMeta `json:"metadata"`
	// Items holds the AITrainingJob objects in the list.
	Items []AITrainingJob `json:"items"`
}
AITrainingJobList is a list of AITrainingJob resources
func (*AITrainingJobList) DeepCopy ¶
func (in *AITrainingJobList) DeepCopy() *AITrainingJobList
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AITrainingJobList.
func (*AITrainingJobList) DeepCopyInto ¶
func (in *AITrainingJobList) DeepCopyInto(out *AITrainingJobList)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (*AITrainingJobList) DeepCopyObject ¶
func (in *AITrainingJobList) DeepCopyObject() runtime.Object
DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
type CleanPodPolicy ¶
// CleanPodPolicy is a string-typed policy for cleaning up a job's pods/services
// once the job has finished.
type CleanPodPolicy string

const (
	// Delete all pods/services when the job is finished.
	CleanPodPolicyAll CleanPodPolicy = "All"
	// Delete nothing.
	CleanPodPolicyNone CleanPodPolicy = "None"
)
type EndingPolicy ¶
// EndingPolicy is the string type of the FailPolicy and CompletePolicy fields
// on TrainingJobSpec and ReplicaSpec.
type EndingPolicy string

// Allowed EndingPolicy values.
// NOTE(review): presumably they select which replicas (all, rank 0, any, none)
// must end for the policy to trigger — confirm against the controller.
const (
	EndingPolicyAll   EndingPolicy = "All"
	EndingPolicyRank0 EndingPolicy = "Rank0"
	EndingPolicyAny   EndingPolicy = "Any"
	EndingPolicyNone  EndingPolicy = "None"
)
type FrameworkType ¶
// FrameworkType is the string type of TrainingJobSpec.FrameworkType, which
// names the training framework (e.g. tensorflow / paddlepaddle).
type FrameworkType string
type ReplicaName ¶
// ReplicaName is the string key type of the per-replica maps in
// TrainingJobSpec and TrainingJobStatus.
type ReplicaName string
type ReplicaSpec ¶
// ReplicaSpec describes one replica group of a job: replica counts, restart
// settings, the pod template and the ending policies.
// NOTE(review): how MinReplicas/MaxReplicas interact with Replicas, and how the
// policies are evaluated, is defined by the controller and not visible here.
type ReplicaSpec struct {
	MinReplicas  *int32 `json:"minReplicas,omitempty"`
	MaxReplicas  *int32 `json:"maxReplicas,omitempty"`
	Replicas     *int32 `json:"replicas,omitempty"`
	RestartLimit *int32 `json:"restartLimit,omitempty"`
	// Template is the pod template for this replica group.
	Template       corev1.PodTemplateSpec `json:"template,omitempty"`
	RestartPolicy  RestartPolicy          `json:"restartPolicy,omitempty"`
	RestartScope   RestartScope           `json:"restartScope,omitempty"`
	FailPolicy     EndingPolicy           `json:"failPolicy,omitempty"`
	CompletePolicy EndingPolicy           `json:"completePolicy,omitempty"`
	EdlPolicy      EdlPolicy              `json:"edlPolicy,omitempty"`
}
+k8s:deepcopy-gen=true ReplicaSpec is a description of the job replica.
func (*ReplicaSpec) DeepCopy ¶
func (in *ReplicaSpec) DeepCopy() *ReplicaSpec
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ReplicaSpec.
func (*ReplicaSpec) DeepCopyInto ¶
func (in *ReplicaSpec) DeepCopyInto(out *ReplicaSpec)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type ReplicaStatus ¶
// ReplicaStatus holds per-state pod counters for one replica group.
type ReplicaStatus struct {
	// The number of pending pods.
	Pending int32 `json:"pending,omitempty"`
	// The number of scheduled pods.
	Scheduled int32 `json:"scheduled,omitempty"`
	// The number of actively running pods.
	Active int32 `json:"active,omitempty"`
	// The number of pods which reached phase Succeeded.
	Succeeded int32 `json:"succeeded,omitempty"`
	// The number of restarting pods.
	Restarting int32 `json:"restarting,omitempty"`
	// The number of pods which reached phase Failed.
	Failed int32 `json:"failed,omitempty"`
}
func (*ReplicaStatus) DeepCopy ¶
func (in *ReplicaStatus) DeepCopy() *ReplicaStatus
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ReplicaStatus.
func (*ReplicaStatus) DeepCopyInto ¶
func (in *ReplicaStatus) DeepCopyInto(out *ReplicaStatus)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type RestartPolicy ¶
// RestartPolicy is the string type of ReplicaSpec.RestartPolicy; its allowed
// values are the RestartPolicy* constants.
type RestartPolicy string
type RestartScope ¶
// RestartScope is the string type of ReplicaSpec.RestartScope; its allowed
// values are the RestartScope* constants.
type RestartScope string
type RestartingExitCode ¶
// RestartingExitCode is the string type of TrainingJobSpec.RestartingExitCode.
// NOTE(review): the expected format (single code, list, range) is not visible here — confirm.
type RestartingExitCode string
type TrainingJobCondition ¶
// TrainingJobCondition records one observed condition of a job: its type,
// status, reason/message and the associated timestamps.
type TrainingJobCondition struct {
	// Type is the type of the condition.
	Type TrainingJobPhase `json:"type"`
	// Status is the status of the condition.
	// Can be True, False, Unknown.
	Status corev1.ConditionStatus `json:"status"`
	// Unique, one-word, CamelCase reason for the condition's last transition.
	Reason string `json:"reason,omitempty"`
	// Human-readable message indicating details about last transition.
	Message string `json:"message,omitempty"`
	// Last time we probed the condition.
	LastProbeTime metav1.Time `json:"lastProbeTime,omitempty"`
	// Last time the condition transitioned from one status to another.
	LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty"`
}
+k8s:deepcopy-gen=true TrainingJobCondition describes the state of the job at a certain point.
func (*TrainingJobCondition) DeepCopy ¶
func (in *TrainingJobCondition) DeepCopy() *TrainingJobCondition
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingJobCondition.
func (*TrainingJobCondition) DeepCopyInto ¶
func (in *TrainingJobCondition) DeepCopyInto(out *TrainingJobCondition)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type TrainingJobSpec ¶
// TrainingJobSpec is the desired state of an AITrainingJob: job-wide settings
// plus one ReplicaSpec per replica name.
type TrainingJobSpec struct {
	// Define the restarting exit code.
	RestartingExitCode RestartingExitCode `json:"restartingExitCode,omitempty"`
	// Specify the framework, e.g. tensorflow / paddlepaddle.
	FrameworkType FrameworkType `json:"frameworkType,omitempty"`
	// Identify whether fault tolerance is required.
	FaultTolerant bool `json:"faultTolerant,omitempty"`
	// Specify the job priority.
	Priority string `json:"priority,omitempty"`
	// Specify the Kubernetes scheduler name.
	SchedulerName string `json:"schedulerName,omitempty"`
	// Time limit for the job (in seconds).
	TimeLimit *int64 `json:"timeLimit,omitempty"`
	// Define the policy for cleaning up pods after the trainingjob completes.
	CleanPodPolicy *CleanPodPolicy `json:"cleanPodPolicy,omitempty"`
	// Define the policy for fail.
	FailPolicy EndingPolicy `json:"failPolicy,omitempty"`
	// Define the policy for complete.
	CompletePolicy EndingPolicy `json:"completePolicy,omitempty"`
	// Specify the TrainingJob configuration, keyed by replica name.
	ReplicaSpecs map[ReplicaName]*ReplicaSpec `json:"replicaSpecs"`
}
TrainingJobSpec is the spec for an AITrainingJob resource.
func (*TrainingJobSpec) DeepCopy ¶
func (in *TrainingJobSpec) DeepCopy() *TrainingJobSpec
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingJobSpec.
func (*TrainingJobSpec) DeepCopyInto ¶
func (in *TrainingJobSpec) DeepCopyInto(out *TrainingJobSpec)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type TrainingJobStatus ¶
type TrainingJobStatus struct { // The phase of a job is a simple, high-level summary of where the Job is in its lifecycle. Phase TrainingJobPhase `json:"phase"` // An array of current job conditions Conditions []TrainingJobCondition `json:"conditions"` // detail status of echo replica resource ReplicaStatuses map[ReplicaName]*ReplicaStatus `json:"replicaStatuses"` // The times of pods restart. RestartCountes map[ReplicaName]int32 `json:"RestartCount,,omitempty"` // ReplicaName need to restart RestartReplicaName ReplicaName // Represents the time when the job was acknowledged by the job controller StartTime *metav1.Time `json:"startTime,omitempty"` // Represents the time when the job start running. StartRunningTime *metav1.Time `json:"startRunningTime,omitempty"` // Represents the time when the job was completed EndTime *metav1.Time `json:"endTime,omitempty"` // Represents the last time when the job was reconciled. LastReconcileTime *metav1.Time `json:"lastReconcileTime,omitempty"` }
+k8s:deepcopy-gen=true TrainingJobStatus is the status for an AITrainingJob resource.
func (*TrainingJobStatus) DeepCopy ¶
func (in *TrainingJobStatus) DeepCopy() *TrainingJobStatus
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingJobStatus.
func (*TrainingJobStatus) DeepCopyInto ¶
func (in *TrainingJobStatus) DeepCopyInto(out *TrainingJobStatus)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.