v1

package
v0.0.0-...-148db3b Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Sep 4, 2020 License: Apache-2.0 Imports: 5 Imported by: 0

Documentation

Overview

Package v1 is the v1 version of the API.

Index

Constants

View Source
// String identifiers exported by this package.
// NOTE(review): the names ending in "Label" (and the two TrainingJobReplica*
// names) look like Kubernetes label keys attached to job resources — confirm
// against the controller code.
const (
	// ControllerName is the name string of the controller.
	ControllerName            = "TrainingJobOperator"
	TrainingJobReplicaName    = "TrainingJobReplicaName"
	TrainingJobReplicaIndex   = "TrainingJobReplicaIndex"
	TrainingJobNameLabel      = "TrainingJobName"
	TrainingJobFrameworkLabel = "FrameworkType"
	GroupNameLabel            = "GroupName"
	// Unlike the other values in this block, this one is lower-case.
	TrainingJobPriorityLabel  = "priority"
)
View Source
// Upper-case string names, each exported with an "Env" suffix.
// NOTE(review): presumably the names of environment variables injected into
// replica containers — confirm against the controller code.
const (
	TrainingJobReplicaNameEnv         = "TRAININGJOB_REPLICA_NAME"
	TrainingJobReplicaIndexEnv        = "TRAININGJOB_REPLICA_INDEX"
	TrainingJobReplicaRestartCountEnv = "TRAININGJOB_REPLICA_RESTARTCOUNT"
	TrainingJobNameEnv                = "TRAININGJOB_NAME"
	TrainingJobNamespaceEnv           = "TRAININGJOB_NAMESPACE"
	TrainingJobServiceEnv             = "TRAININGJOB_SERVICE"
	// The constant name is singular (Port) but the value is plural (PORTS).
	TrainingJobPortEnv                = "TRAININGJOB_PORTS"
)
View Source
// Reason strings.
// NOTE(review): presumably used as event or condition reasons — confirm
// where they are emitted.
const (
	PodTemplateRestartPolicyReason = "SettedPodTemplateRestartPolicy"
	ExitedWithCodeReason           = "ExitedWithCode"
)
View Source
// Reason strings, one for each non-empty TrainingJobPhase. The TrainingJobReason
// map associates every phase with the matching constant from this block.
const (
	TrainingJobPendingReason     = "TrainingJobPending"
	TrainingJobCreatingReason    = "TrainingJobCreating"
	TrainingJobRunningReason     = "TrainingJobRunning"
	// The value is "TrainingJobSucceed", not "TrainingJobSucceeded".
	TrainingJobSucceededReason   = "TrainingJobSucceed"
	TrainingJobFailedReason      = "TrainingJobFailed"
	TrainingJobTimeoutReason     = "TrainingJobTimeout"
	TrainingJobRestartingReason  = "TrainingJobRestarting"
	TrainingJobTerminatingReason = "TrainingJobTerminating"
	TrainingJobPreemptedReason   = "TrainingJobPreempted"
	TrainingJobNodeFailReason    = "TrainingJobNodeFail"
)
View Source
// Default name prefixes. Both constants currently hold the same value, "aitj-".
// NOTE(review): presumably prepended to generated container and port names —
// confirm at the call sites.
const (
	DefaultContainerPrefix = "aitj-"
	DefaultPortPrefix      = "aitj-"
)
View Source
// Identity of the custom resource. CRDGroupName and CRDGroupVersion are
// combined into SchemeGroupVersion, and CRDKind is used to build
// SchemeGroupVersionKind.
const (
	CRDGroupName    = "elasticdeeplearning.ai"
	CRDGroupVersion = "v1"
	CRDKind         = "AITrainingJob"
	CRDKindPlural   = "aitrainingjobs"
	CRDShortName    = "aitj"
)
View Source
// Values of the RestartPolicy and RestartScope string types, which are the
// types of the ReplicaSpec.RestartPolicy and ReplicaSpec.RestartScope fields.
// NOTE(review): the exact restart behavior selected by each value is not
// visible in this package listing — confirm against the controller code.
const (
	// RestartPolicy values.
	RestartPolicyAlways                 RestartPolicy = "Always"
	RestartPolicyOnFailure              RestartPolicy = "OnFailure"
	RestartPolicyOnNodeFail             RestartPolicy = "OnNodeFail"
	RestartPolicyNever                  RestartPolicy = "Never"
	RestartPolicyExitCode               RestartPolicy = "ExitCode"
	RestartPolicyOnNodeFailWithExitCode RestartPolicy = "OnNodeFailWithExitCode"
	// RestartScope values.
	RestartScopeAll                     RestartScope  = "All"
	RestartScopeReplica                 RestartScope  = "Replica"
	RestartScopePod                     RestartScope  = "Pod"
)
View Source
// Values of TrainingJobPhase.
// NOTE(review): only TrainingJobPhaseNone carries the explicit TrainingJobPhase
// type. Every other constant below has its own "= ..." expression and is
// therefore an untyped string constant; it still converts implicitly wherever
// a TrainingJobPhase is expected. Consider typing them explicitly once it is
// confirmed that no caller relies on using them as plain strings.
const (
	// None means the job has been accepted by the system.
	TrainingJobPhaseNone TrainingJobPhase = ""
	// Pending means one or more of the pods/services have not been scheduled.
	TrainingJobPhasePending = "Pending"
	// Creating means all pods/services of this job have been successfully scheduled,
	// but one or more of the pods/services have not been launched.
	TrainingJobPhaseCreating = "Creating"
	// Running means all pods/services have been launched.
	TrainingJobPhaseRunning = "Running"
	// Succeeded means all pods of this job reached phase success.
	// The string value is "Succeed", not "Succeeded".
	TrainingJobPhaseSucceeded = "Succeed"
	// Failed means one or more pods of this job reached phase failed.
	TrainingJobPhaseFailed = "Failed"
	// Timeout means the job ran longer than the preset maximum run time.
	TrainingJobPhaseTimeout = "Timeout"
	// TODO: Restarting means the job is restarting.
	TrainingJobPhaseRestarting = "Restarting"
	// Terminating means the job has been terminated, but resources are not yet fully released.
	TrainingJobPhaseTerminating = "Terminating"
	// Preempted means the job has been preempted, and resources were fully released.
	TrainingJobPhasePreempted = "Preempted"
	// NodeFail means the node has failed.
	TrainingJobPhaseNodeFail = "NodeFail"
)

Variables

View Source
var (
	// ErrorContainerStatus is a list of container status strings covering
	// container-creation and image-related errors.
	// NOTE(review): presumably matched against a container's waiting reason to
	// detect pods that cannot start — confirm at the call sites.
	ErrorContainerStatus = []string{"CreateContainerConfigError",
		"CreateContainerError",

		"ImagePullBackOff",
		"ImageInspectError",
		"ErrImagePull",
		"ErrImageNeverPull",
		"RegistryUnavailable",
		"InvalidImageName"}

	// EndingPhases lists the phases Succeeded, Failed, Timeout, Preempted and
	// NodeFail.
	// NOTE(review): the name suggests these are the terminal phases of a job;
	// Terminating and Restarting are not included — confirm that is intended.
	EndingPhases = []TrainingJobPhase{
		TrainingJobPhaseSucceeded,
		TrainingJobPhaseFailed,
		TrainingJobPhaseTimeout,
		TrainingJobPhasePreempted,
		TrainingJobPhaseNodeFail,
	}
	// TrainingJobReason maps every TrainingJobPhase constant to its reason
	// string. TrainingJobPhaseNone maps to the empty string.
	TrainingJobReason = map[TrainingJobPhase]string{
		TrainingJobPhaseNone:        "",
		TrainingJobPhasePending:     TrainingJobPendingReason,
		TrainingJobPhaseCreating:    TrainingJobCreatingReason,
		TrainingJobPhaseRunning:     TrainingJobRunningReason,
		TrainingJobPhaseSucceeded:   TrainingJobSucceededReason,
		TrainingJobPhaseFailed:      TrainingJobFailedReason,
		TrainingJobPhaseTimeout:     TrainingJobTimeoutReason,
		TrainingJobPhaseRestarting:  TrainingJobRestartingReason,
		TrainingJobPhaseTerminating: TrainingJobTerminatingReason,
		TrainingJobPhasePreempted:   TrainingJobPreemptedReason,
		TrainingJobPhaseNodeFail:    TrainingJobNodeFailReason,
	}
)
View Source
var (
	// SchemeBuilder is declared here without an initializer.
	// NOTE(review): presumably populated elsewhere in the package (e.g. in an
	// init function) — not visible in this listing.
	SchemeBuilder runtime.SchemeBuilder

	// AddToScheme is the AddToScheme method value of the package-private
	// localSchemeBuilder, which is defined outside this listing.
	AddToScheme = localSchemeBuilder.AddToScheme
)
View Source
var SchemeGroupVersion = schema.GroupVersion{Group: CRDGroupName, Version: CRDGroupVersion}

SchemeGroupVersion is group version used to register these objects

View Source
// SchemeGroupVersionKind is SchemeGroupVersion qualified with the kind CRDKind.
var SchemeGroupVersionKind = SchemeGroupVersion.WithKind(CRDKind)

Functions

func CRDName

func CRDName() string

CRDName returns name of crd

func Int32

func Int32(v int32) *int32

func Kind

func Kind(kind string) schema.GroupKind

Kind takes an unqualified kind and returns back a Group qualified GroupKind

func RegisterDefaults

func RegisterDefaults(scheme *runtime.Scheme) error

RegisterDefaults adds defaulter functions to the given scheme. Public to allow building arbitrary schemes. All generated defaulters are covering - they call all nested defaulters.

func Resource

func Resource(resource string) schema.GroupResource

Resource takes an unqualified resource and returns a Group qualified GroupResource

func SetDefaults_AITrainingJob

func SetDefaults_AITrainingJob(job *AITrainingJob)

SetDefaults_AITrainingJob sets any unspecified values to defaults.

func SetObjectDefaults_AITrainingJob

func SetObjectDefaults_AITrainingJob(in *AITrainingJob)

func SetObjectDefaults_AITrainingJobList

func SetObjectDefaults_AITrainingJobList(in *AITrainingJobList)

Types

type AITrainingJob

type AITrainingJob struct {
	// TypeMeta is embedded and tagged ",inline".
	metav1.TypeMeta `json:",inline"`
	// ObjectMeta is metadata that all persisted resources must have, which includes all objects
	// users must create. It is serialized under the "metadata" key.
	metav1.ObjectMeta `json:"metadata,omitempty"`
	// Spec defines the specification of the desired behavior of the TrainingJob.
	Spec TrainingJobSpec `json:"spec,omitempty"`
	// Status is the most recently observed status of the TrainingJob.
	Status TrainingJobStatus `json:"status,omitempty"`
}

AITrainingJob is a specification for an AITrainingJob resource

func (*AITrainingJob) DeepCopy

func (in *AITrainingJob) DeepCopy() *AITrainingJob

DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AITrainingJob.

func (*AITrainingJob) DeepCopyInto

func (in *AITrainingJob) DeepCopyInto(out *AITrainingJob)

DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.

func (*AITrainingJob) DeepCopyObject

func (in *AITrainingJob) DeepCopyObject() runtime.Object

DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.

type AITrainingJobList

type AITrainingJobList struct {
	// TypeMeta is embedded and tagged ",inline".
	metav1.TypeMeta `json:",inline"`
	// ListMeta is the list metadata, serialized under the "metadata" key.
	metav1.ListMeta `json:"metadata"`
	// Items holds the AITrainingJob objects of the list.
	Items           []AITrainingJob `json:"items"`
}

AITrainingJobList is a list of AITrainingJob resources

func (*AITrainingJobList) DeepCopy

func (in *AITrainingJobList) DeepCopy() *AITrainingJobList

DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AITrainingJobList.

func (*AITrainingJobList) DeepCopyInto

func (in *AITrainingJobList) DeepCopyInto(out *AITrainingJobList)

DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.

func (*AITrainingJobList) DeepCopyObject

func (in *AITrainingJobList) DeepCopyObject() runtime.Object

DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.

type CleanPodPolicy

// CleanPodPolicy names what is deleted when a job is finished. It is the
// (pointer) type of TrainingJobSpec.CleanPodPolicy.
type CleanPodPolicy string
const (
	// Delete all pods/services when the job is finished.
	CleanPodPolicyAll CleanPodPolicy = "All"
	// Delete nothing.
	CleanPodPolicyNone CleanPodPolicy = "None"
)

type EdlPolicy

// EdlPolicy is the string type of the ReplicaSpec.EdlPolicy field.
// NOTE(review): "Edl" presumably stands for elastic deep learning (compare
// CRDGroupName); what Auto, Manual and Never select is not visible in this
// listing — confirm against the controller code.
type EdlPolicy string
const (
	EdlPolicyAuto   EdlPolicy = "Auto"
	EdlPolicyManual EdlPolicy = "Manual"
	EdlPolicyNever  EdlPolicy = "Never"
)

type EndingPolicy

// EndingPolicy is the string type of the FailPolicy and CompletePolicy fields
// on both TrainingJobSpec and ReplicaSpec.
// NOTE(review): All/Rank0/Any/None presumably select which pods must end for
// the job or replica to count as failed or complete — confirm against the
// controller code.
type EndingPolicy string
const (
	EndingPolicyAll   EndingPolicy = "All"
	EndingPolicyRank0 EndingPolicy = "Rank0"
	EndingPolicyAny   EndingPolicy = "Any"
	EndingPolicyNone  EndingPolicy = "None"
)

type FrameworkType

// FrameworkType is the string type of TrainingJobSpec.FrameworkType, which
// names the training framework (e.g. tensorflow or paddlepaddle).
type FrameworkType string

type ReplicaName

// ReplicaName is the string key type of the per-replica maps
// TrainingJobSpec.ReplicaSpecs, TrainingJobStatus.ReplicaStatuses and
// TrainingJobStatus.RestartCountes.
type ReplicaName string

type ReplicaSpec

type ReplicaSpec struct {
	// Replica counts. All three are optional pointers that are omitted from
	// JSON when nil.
	// NOTE(review): MinReplicas/MaxReplicas presumably bound the elastic scaling
	// of Replicas — confirm against the controller code.
	MinReplicas    *int32                 `json:"minReplicas,omitempty"`
	MaxReplicas    *int32                 `json:"maxReplicas,omitempty"`
	Replicas       *int32                 `json:"replicas,omitempty"`
	// NOTE(review): presumably the maximum number of restarts allowed for this
	// replica — confirm.
	RestartLimit   *int32                 `json:"restartLimit,omitempty"`
	// Template is a pod template.
	// NOTE(review): presumably used to create this replica's pods — confirm.
	Template       corev1.PodTemplateSpec `json:"template,omitempty"`
	// RestartPolicy and RestartScope take the RestartPolicy* and RestartScope*
	// constants respectively.
	RestartPolicy  RestartPolicy          `json:"restartPolicy,omitempty"`
	RestartScope   RestartScope           `json:"restartScope,omitempty"`
	// FailPolicy and CompletePolicy take the EndingPolicy* constants.
	FailPolicy     EndingPolicy           `json:"failPolicy,omitempty"`
	CompletePolicy EndingPolicy           `json:"completePolicy,omitempty"`
	// EdlPolicy takes the EdlPolicy* constants.
	EdlPolicy      EdlPolicy              `json:"edlPolicy,omitempty"`
}

+k8s:deepcopy-gen=true ReplicaSpec is a description of the job replica.

func (*ReplicaSpec) DeepCopy

func (in *ReplicaSpec) DeepCopy() *ReplicaSpec

DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ReplicaSpec.

func (*ReplicaSpec) DeepCopyInto

func (in *ReplicaSpec) DeepCopyInto(out *ReplicaSpec)

DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.

type ReplicaStatus

// ReplicaStatus holds pod counters, one per pod state. TrainingJobStatus keeps
// one ReplicaStatus per ReplicaName. Every counter is omitted from JSON when it
// is zero.
type ReplicaStatus struct {
	// The number of pending pods.
	Pending int32 `json:"pending,omitempty"`
	// The number of scheduled pods.
	Scheduled int32 `json:"scheduled,omitempty"`
	// The number of actively running pods.
	Active int32 `json:"active,omitempty"`
	// The number of pods which reached phase Succeeded.
	Succeeded int32 `json:"succeeded,omitempty"`
	// The number of restarting pods.
	Restarting int32 `json:"restarting,omitempty"`
	// The number of pods which reached phase Failed.
	Failed int32 `json:"failed,omitempty"`
}

func (*ReplicaStatus) DeepCopy

func (in *ReplicaStatus) DeepCopy() *ReplicaStatus

DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ReplicaStatus.

func (*ReplicaStatus) DeepCopyInto

func (in *ReplicaStatus) DeepCopyInto(out *ReplicaStatus)

DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.

type RestartPolicy

// RestartPolicy is the string type of ReplicaSpec.RestartPolicy; its values
// are the RestartPolicy* constants.
type RestartPolicy string

type RestartScope

// RestartScope is the string type of ReplicaSpec.RestartScope; its values are
// the RestartScope* constants (All, Replica, Pod).
type RestartScope string

type RestartingExitCode

// RestartingExitCode is the string type of TrainingJobSpec.RestartingExitCode.
// NOTE(review): the expected string format (single code, list, range, ...) is
// not visible in this listing — confirm against the parsing code.
type RestartingExitCode string

type TrainingJobCondition

type TrainingJobCondition struct {
	// Type is the type of the condition; it is expressed as a TrainingJobPhase.
	Type TrainingJobPhase `json:"type"`
	// Status is the status of the condition.
	// Can be True, False, Unknown.
	Status corev1.ConditionStatus `json:"status"`
	// Unique, one-word, CamelCase reason for the condition's last transition.
	Reason string `json:"reason,omitempty"`
	// Human-readable message indicating details about the last transition.
	Message string `json:"message,omitempty"`
	// Last time we probed the condition.
	LastProbeTime metav1.Time `json:"lastProbeTime,omitempty"`
	// Last time the condition transitioned from one status to another.
	LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty"`
}

+k8s:deepcopy-gen=true TrainingJobCondition describes the state of the job at a certain point.

func (*TrainingJobCondition) DeepCopy

DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingJobCondition.

func (*TrainingJobCondition) DeepCopyInto

func (in *TrainingJobCondition) DeepCopyInto(out *TrainingJobCondition)

DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.

type TrainingJobPhase

type TrainingJobPhase string

TrainingJobPhase is the phase of AITrainingJob

type TrainingJobSpec

type TrainingJobSpec struct {
	// Defines the restarting exit code.
	// NOTE(review): the string format is not visible in this listing — confirm.
	RestartingExitCode RestartingExitCode `json:"restartingExitCode,omitempty"`
	// Specifies the framework, e.g. tensorflow or paddlepaddle.
	FrameworkType FrameworkType `json:"frameworkType,omitempty"`
	// Identifies whether fault tolerance is required.
	FaultTolerant bool `json:"faultTolerant,omitempty"`
	// Specifies the job priority.
	Priority string `json:"priority,omitempty"`
	// Specifies the Kubernetes scheduler name.
	SchedulerName string `json:"schedulerName,omitempty"`
	// Time limit for the job (in seconds). Optional; omitted from JSON when nil.
	TimeLimit *int64 `json:"timeLimit,omitempty"`
	// Defines the policy for cleaning up pods after the training job completes.
	CleanPodPolicy *CleanPodPolicy `json:"cleanPodPolicy,omitempty"`
	// Defines the policy for deciding that the job has failed.
	FailPolicy EndingPolicy `json:"failPolicy,omitempty"`
	// Defines the policy for deciding that the job has completed.
	CompletePolicy EndingPolicy `json:"completePolicy,omitempty"`
	// Specifies the TrainingJob configuration: one ReplicaSpec per replica name.
	// The tag has no omitempty, so the key is always emitted.
	ReplicaSpecs map[ReplicaName]*ReplicaSpec `json:"replicaSpecs"`
}

TrainingJobSpec is the spec for an AITrainingJob resource

func (*TrainingJobSpec) DeepCopy

func (in *TrainingJobSpec) DeepCopy() *TrainingJobSpec

DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingJobSpec.

func (*TrainingJobSpec) DeepCopyInto

func (in *TrainingJobSpec) DeepCopyInto(out *TrainingJobSpec)

DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.

type TrainingJobStatus

type TrainingJobStatus struct {
	// The phase of a job is a simple, high-level summary of where the Job is in its lifecycle.
	Phase TrainingJobPhase `json:"phase"`
	// An array of current job conditions.
	Conditions []TrainingJobCondition `json:"conditions"`
	// Detailed status of each replica resource, keyed by replica name.
	ReplicaStatuses map[ReplicaName]*ReplicaStatus `json:"replicaStatuses"`
	// The number of pod restarts, keyed by replica name.
	// The JSON key stays the capitalized "RestartCount" so that already
	// serialized objects keep decoding; only the stray doubled comma in the
	// tag options was removed (key and omitempty behavior are unchanged).
	RestartCountes map[ReplicaName]int32 `json:"RestartCount,omitempty"`
	// ReplicaName that needs to restart.
	// NOTE(review): this field has no json tag, so encoding/json serializes it
	// under the Go field name "RestartReplicaName" and never omits it. Adding a
	// tag would change the wire format, so it is left as-is — confirm whether
	// that is intended.
	RestartReplicaName ReplicaName
	// Represents the time when the job was acknowledged by the job controller.
	StartTime *metav1.Time `json:"startTime,omitempty"`
	// Represents the time when the job started running.
	StartRunningTime *metav1.Time `json:"startRunningTime,omitempty"`
	// Represents the time when the job was completed.
	EndTime *metav1.Time `json:"endTime,omitempty"`
	// Represents the last time when the job was reconciled.
	LastReconcileTime *metav1.Time `json:"lastReconcileTime,omitempty"`
}

+k8s:deepcopy-gen=true TrainingJobStatus is the status for an AITrainingJob resource

func (*TrainingJobStatus) DeepCopy

func (in *TrainingJobStatus) DeepCopy() *TrainingJobStatus

DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingJobStatus.

func (*TrainingJobStatus) DeepCopyInto

func (in *TrainingJobStatus) DeepCopyInto(out *TrainingJobStatus)

DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL