training

package
v0.0.0-...-1dfa148 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 12, 2024 License: Apache-2.0 Imports: 42 Imported by: 0

Documentation

Index

Constants

View Source
const (
	// NVIDIAGPUResourceName is the extended name of the GPU resource since v1.8
	// this uses the device plugin mechanism
	NVIDIAGPUResourceName = "nvidia.com/gpu"

	DeprecatedNVIDIAGPUResourceName = "alpha.kubernetes.io/nvidia-gpu"

	// TrainingReplicaTypeLabel training-operator replica type label
	TrainingReplicaTypeLabel = "training.kubeflow.org/replica-type"
	// TrainingReplicaIndexLabel training-operator replica index label
	TrainingReplicaIndexLabel = "training.kubeflow.org/replica-index"
)
View Source
const (
	ETJOB_MAXWORKERS = 1000
	ETJOB_MINWORKERS = 1
)
View Source
const ResourceTypeJob = ResourceType("Job")
View Source
const ResourceTypePod = ResourceType("Pod")
View Source
const ResourceTypeStatefulSet = ResourceType("StatefulSet")

Variables

This section is empty.

Functions

func AcceptJobLog

func AcceptJobLog(jobName string, trainingType types.TrainingJobType, args *types.LogArgs) error

AcceptJobLog is used for arena-go-sdk

func BuildJobInfo

func BuildJobInfo(job TrainingJob, showGPUs bool, services []*v1.Service, nodes []*v1.Node) *types.TrainingJobInfo

BuildJobInfo returns the types.TrainingJobInfo built from the given training job.

func CheckJobIsOwnedByTrainer

func CheckJobIsOwnedByTrainer(labels map[string]string) error

func CheckOperatorIsInstalled

func CheckOperatorIsInstalled(crdName string) bool

func CheckPrintFormat

func CheckPrintFormat(format string) error

func CompatibleJobCRD

func CompatibleJobCRD(crdName, fieldToCheck string) bool

CompatibleJobCRD Compatible with training-operator CRD.

func DeleteTrainingJob

func DeleteTrainingJob(jobName, namespace string, jobType types.TrainingJobType) error

func DisplayTrainingJobList

func DisplayTrainingJobList(jobInfoList []TrainingJob, format string, allNamespaces bool)

func GetAllTrainers

func GetAllTrainers() map[types.TrainingJobType]Trainer

func GetJobDashboards

func GetJobDashboards(dashboard string, job *v1.Job, pods []corev1.Pod) []string

func GetJobGpuMetric

func GetJobGpuMetric(client *kubernetes.Clientset, job TrainingJob) (jobMetric prometheus.JobGpuMetric, err error)

func GetJobRealStatus

func GetJobRealStatus(job TrainingJob) string

GetJobRealStatus returns the real status of the job. When some pods are still pending, the tfJob still shows the Running state, but it should be reported as Pending.

func GetResourcesEvents

func GetResourcesEvents(client *kubernetes.Clientset, namespace string, resources []Resource) (map[string][]v1.Event, error)

Get Event of the Job

func GetTrainingJobLabels

func GetTrainingJobLabels(jobType types.TrainingJobType) string

func PrepareServicesAndNodesForTensorboard

func PrepareServicesAndNodesForTensorboard(jobs []TrainingJob, allNamespaces bool) ([]*v1.Service, []*v1.Node)

func PrintLine

func PrintLine(w io.Writer, fields ...string)

func PrintTrainingJob

func PrintTrainingJob(job TrainingJob, format string, showEvents bool, showGPUs bool)

func PruneTrainingJobs

func PruneTrainingJobs(namespace string, allNamespaces bool, since time.Duration) error

func SortMapKeys

func SortMapKeys(podMetric map[string]types.GpuMetric) []string

func SubmitDeepSpeedJob

func SubmitDeepSpeedJob(namespace string, submitArgs *types.SubmitDeepSpeedJobArgs) (err error)

func SubmitETJob

func SubmitETJob(namespace string, submitArgs *types.SubmitETJobArgs) (err error)

func SubmitHorovodJob

func SubmitHorovodJob(namespace string, submitArgs *types.SubmitHorovodJobArgs) (err error)

func SubmitMPIJob

func SubmitMPIJob(namespace string, submitArgs *types.SubmitMPIJobArgs) (err error)

func SubmitPytorchJob

func SubmitPytorchJob(namespace string, submitArgs *types.SubmitPyTorchJobArgs) (err error)

func SubmitScaleInETJob

func SubmitScaleInETJob(namespace string, submitArgs *types.ScaleInETJobArgs) error

func SubmitScaleOutETJob

func SubmitScaleOutETJob(namespace string, submitArgs *types.ScaleOutETJobArgs) error

func SubmitSparkJob

func SubmitSparkJob(namespace string, submitArgs *types.SubmitSparkJobArgs) (err error)

func SubmitTFJob

func SubmitTFJob(namespace string, submitArgs *types.SubmitTFJobArgs) (err error)

func SubmitVolcanoJob

func SubmitVolcanoJob(namespace string, submitArgs *types.SubmitVolcanoJobArgs) error

func TopTrainingJobs

func TopTrainingJobs(args []string, namespace string, allNamespaces bool, jobType types.TrainingJobType, instanceName string, notStop bool, format types.FormatStyle) error

Types

type BasicJobInfo

type BasicJobInfo struct {
	// contains filtered or unexported fields
}

func (*BasicJobInfo) Resources

func (j *BasicJobInfo) Resources() []Resource

type DeepSpeedJob

type DeepSpeedJob struct {
	*BasicJobInfo
	// contains filtered or unexported fields
}

DeepSpeedJob Information

func (*DeepSpeedJob) Age

func (dsj *DeepSpeedJob) Age() time.Duration

Get the Job Age

func (*DeepSpeedJob) AllPods

func (dsj *DeepSpeedJob) AllPods() []*v1.Pod

Get all the pods of the Training Job

func (*DeepSpeedJob) AllocatedGPU

func (dsj *DeepSpeedJob) AllocatedGPU() int64

Allocated GPU count of the Job

func (*DeepSpeedJob) ChiefPod

func (dsj *DeepSpeedJob) ChiefPod() *v1.Pod

Get the chief Pod of the Job.

func (*DeepSpeedJob) Duration

func (dsj *DeepSpeedJob) Duration() time.Duration

Get the Job Training Duration

func (*DeepSpeedJob) GetJobDashboards

func (dsj *DeepSpeedJob) GetJobDashboards(client *kubernetes.Clientset, namespace, arenaNamespace string) ([]string, error)

func (*DeepSpeedJob) GetPriorityClass

func (dsj *DeepSpeedJob) GetPriorityClass() string

GetPriorityClass Get PriorityClass

func (*DeepSpeedJob) GetStatus

func (dsj *DeepSpeedJob) GetStatus() (status string)

Get the Status of the Job: RUNNING, PENDING, SUCCEEDED, FAILED

func (*DeepSpeedJob) GetTrainJob

func (dsj *DeepSpeedJob) GetTrainJob() interface{}

func (*DeepSpeedJob) GetWorkerMaxReplicas

func (dsj *DeepSpeedJob) GetWorkerMaxReplicas(maxWorkers int) interface{}

func (*DeepSpeedJob) GetWorkerMinReplicas

func (dsj *DeepSpeedJob) GetWorkerMinReplicas(minWorkers int) interface{}

func (*DeepSpeedJob) HostIPOfChief

func (dsj *DeepSpeedJob) HostIPOfChief() (hostIP string)

Get the hostIP of the chief Pod

func (*DeepSpeedJob) Name

func (dsj *DeepSpeedJob) Name() string

func (*DeepSpeedJob) Namespace

func (dsj *DeepSpeedJob) Namespace() string

func (*DeepSpeedJob) RequestedGPU

func (dsj *DeepSpeedJob) RequestedGPU() int64

Requested GPU count of the Job

func (*DeepSpeedJob) StartTime

func (dsj *DeepSpeedJob) StartTime() *metav1.Time

Get the start time

func (*DeepSpeedJob) Trainer

func (dsj *DeepSpeedJob) Trainer() types.TrainingJobType

func (*DeepSpeedJob) Uid

func (dsj *DeepSpeedJob) Uid() string

type DeepSpeedJobTrainer

type DeepSpeedJobTrainer struct {
	// contains filtered or unexported fields
}

DeepSpeedJobTrainer DeepSpeed Job trainer

func (*DeepSpeedJobTrainer) GetTrainingJob

func (dst *DeepSpeedJobTrainer) GetTrainingJob(name, namespace string) (TrainingJob, error)

func (*DeepSpeedJobTrainer) IsEnabled

func (dst *DeepSpeedJobTrainer) IsEnabled() bool

func (*DeepSpeedJobTrainer) IsSupported

func (dst *DeepSpeedJobTrainer) IsSupported(name, ns string) bool

Check if it's a DeepSpeed job

func (*DeepSpeedJobTrainer) ListTrainingJobs

func (dst *DeepSpeedJobTrainer) ListTrainingJobs(namespace string, allNamespace bool) ([]TrainingJob, error)

func (*DeepSpeedJobTrainer) Type

Get the type

type ETJob

type ETJob struct {
	*BasicJobInfo
	// contains filtered or unexported fields
}

ET Job Information

func (*ETJob) Age

func (ej *ETJob) Age() time.Duration

Get the Job Age

func (*ETJob) AllPods

func (ej *ETJob) AllPods() []*v1.Pod

Get all the pods of the Training Job

func (*ETJob) AllocatedGPU

func (ej *ETJob) AllocatedGPU() int64

Allocated GPU count of the Job

func (*ETJob) ChiefPod

func (ej *ETJob) ChiefPod() *v1.Pod

Get the chief Pod of the Job.

func (*ETJob) Duration

func (ej *ETJob) Duration() time.Duration

Get the Job Training Duration

func (*ETJob) GetJobDashboards

func (ej *ETJob) GetJobDashboards(client *kubernetes.Clientset, namespace, arenaNamespace string) ([]string, error)

func (*ETJob) GetPriorityClass

func (ej *ETJob) GetPriorityClass() string

Get PriorityClass

func (*ETJob) GetStatus

func (ej *ETJob) GetStatus() (status string)

Get the Status of the Job: RUNNING, PENDING, SUCCEEDED, FAILED

func (*ETJob) GetTrainJob

func (ej *ETJob) GetTrainJob() interface{}

func (*ETJob) GetWorkerMaxReplicas

func (ej *ETJob) GetWorkerMaxReplicas(maxWorkers int) interface{}

func (*ETJob) GetWorkerMinReplicas

func (ej *ETJob) GetWorkerMinReplicas(minWorkers int) interface{}

func (*ETJob) HostIPOfChief

func (ej *ETJob) HostIPOfChief() (hostIP string)

Get the hostIP of the chief Pod

func (*ETJob) Name

func (ej *ETJob) Name() string

func (*ETJob) Namespace

func (ej *ETJob) Namespace() string

func (*ETJob) RequestedGPU

func (ej *ETJob) RequestedGPU() int64

Requested GPU count of the Job

func (*ETJob) StartTime

func (ej *ETJob) StartTime() *metav1.Time

Get the start time

func (*ETJob) Trainer

func (ej *ETJob) Trainer() types.TrainingJobType

func (*ETJob) Uid

func (ej *ETJob) Uid() string

type ETJobTrainer

type ETJobTrainer struct {
	// contains filtered or unexported fields
}

ET Job trainer

func (*ETJobTrainer) GetTrainingJob

func (ejt *ETJobTrainer) GetTrainingJob(name, namespace string) (TrainingJob, error)

func (*ETJobTrainer) IsEnabled

func (ejt *ETJobTrainer) IsEnabled() bool

func (*ETJobTrainer) IsSupported

func (ejt *ETJobTrainer) IsSupported(name, ns string) bool

Check if it's an ET job

func (*ETJobTrainer) ListTrainingJobs

func (ejt *ETJobTrainer) ListTrainingJobs(namespace string, allNamespace bool) ([]TrainingJob, error)

func (*ETJobTrainer) Type

func (ejt *ETJobTrainer) Type() types.TrainingJobType

Get the type

type MPIJob

type MPIJob struct {
	*BasicJobInfo
	// contains filtered or unexported fields
}

MPI Job Information

func (*MPIJob) Age

func (mj *MPIJob) Age() time.Duration

Get the Job Age

func (*MPIJob) AllPods

func (mj *MPIJob) AllPods() []*v1.Pod

Get all the pods of the Training Job

func (*MPIJob) AllocatedGPU

func (mj *MPIJob) AllocatedGPU() int64

Allocated GPU count of the Job

func (*MPIJob) ChiefPod

func (mj *MPIJob) ChiefPod() *v1.Pod

Get the chief Pod of the Job.

func (*MPIJob) Duration

func (mj *MPIJob) Duration() time.Duration

Get the Job Training Duration

func (*MPIJob) GetJobDashboards

func (mj *MPIJob) GetJobDashboards(client *kubernetes.Clientset, namespace, arenaNamespace string) ([]string, error)

Get Dashboard url of the job

func (*MPIJob) GetPriorityClass

func (m *MPIJob) GetPriorityClass() string

Get PriorityClass

func (*MPIJob) GetStatus

func (mj *MPIJob) GetStatus() (status string)

Get the Status of the Job: RUNNING, PENDING, SUCCEEDED, FAILED

func (*MPIJob) GetTrainJob

func (mj *MPIJob) GetTrainJob() interface{}

func (*MPIJob) HostIPOfChief

func (mj *MPIJob) HostIPOfChief() (hostIP string)

Get the hostIP of the chief Pod

func (*MPIJob) Name

func (mj *MPIJob) Name() string

func (*MPIJob) Namespace

func (mj *MPIJob) Namespace() string

func (*MPIJob) RequestedGPU

func (mj *MPIJob) RequestedGPU() int64

Requested GPU count of the Job

func (*MPIJob) StartTime

func (mj *MPIJob) StartTime() *metav1.Time

Get the start time

func (*MPIJob) Trainer

func (mj *MPIJob) Trainer() types.TrainingJobType

func (*MPIJob) Uid

func (mj *MPIJob) Uid() string

type MPIJobTrainer

type MPIJobTrainer struct {
	// contains filtered or unexported fields
}

MPI Job trainer

func (*MPIJobTrainer) GetTrainingJob

func (tt *MPIJobTrainer) GetTrainingJob(name, namespace string) (TrainingJob, error)

func (*MPIJobTrainer) IsEnabled

func (tt *MPIJobTrainer) IsEnabled() bool

IsEnabled reports whether the trainer is enabled.

func (*MPIJobTrainer) IsSupported

func (tt *MPIJobTrainer) IsSupported(name, ns string) bool

Check if it's an MPI job

func (*MPIJobTrainer) ListTrainingJobs

func (tt *MPIJobTrainer) ListTrainingJobs(namespace string, allNamespace bool) ([]TrainingJob, error)

func (*MPIJobTrainer) Type

func (tt *MPIJobTrainer) Type() types.TrainingJobType

Get the type

type PyTorchJob

type PyTorchJob struct {
	*BasicJobInfo
	// contains filtered or unexported fields
}

PyTorch Job Information

func (*PyTorchJob) Age

func (pj *PyTorchJob) Age() time.Duration

Get the Job Age

func (*PyTorchJob) AllPods

func (pj *PyTorchJob) AllPods() []*v1.Pod

Get all the pods of the Training Job

func (*PyTorchJob) AllocatedGPU

func (pj *PyTorchJob) AllocatedGPU() int64

Allocated GPU count of the Job

func (*PyTorchJob) ChiefPod

func (pj *PyTorchJob) ChiefPod() *v1.Pod

Get the master Pod of the Job.

func (*PyTorchJob) Duration

func (pj *PyTorchJob) Duration() time.Duration

Get the Job Training Duration

func (*PyTorchJob) GetJobDashboards

func (pj *PyTorchJob) GetJobDashboards(client *kubernetes.Clientset, namespace, arenaNamespace string) ([]string, error)

Get Dashboard url of the job

func (*PyTorchJob) GetPriorityClass

func (p *PyTorchJob) GetPriorityClass() string

Get PriorityClass

func (*PyTorchJob) GetStatus

func (pj *PyTorchJob) GetStatus() (status string)

Get the Status of the Job: RUNNING, PENDING, SUCCEEDED, FAILED

func (*PyTorchJob) GetTrainJob

func (pj *PyTorchJob) GetTrainJob() interface{}

func (*PyTorchJob) HostIPOfChief

func (pj *PyTorchJob) HostIPOfChief() (hostIP string)

Get the hostIP of the master Pod

func (*PyTorchJob) Name

func (pj *PyTorchJob) Name() string

func (*PyTorchJob) Namespace

func (pj *PyTorchJob) Namespace() string

func (*PyTorchJob) RequestedGPU

func (pj *PyTorchJob) RequestedGPU() int64

Requested GPU count of the Job

func (*PyTorchJob) StartTime

func (pj *PyTorchJob) StartTime() *metav1.Time

Get the start time

func (*PyTorchJob) Trainer

func (pj *PyTorchJob) Trainer() types.TrainingJobType

func (*PyTorchJob) Uid

func (pj *PyTorchJob) Uid() string

type PyTorchJobTrainer

type PyTorchJobTrainer struct {
	// contains filtered or unexported fields
}

PyTorch Job trainer

func (*PyTorchJobTrainer) GetTrainingJob

func (tt *PyTorchJobTrainer) GetTrainingJob(name, namespace string) (TrainingJob, error)

func (*PyTorchJobTrainer) IsEnabled

func (tt *PyTorchJobTrainer) IsEnabled() bool

IsEnabled reports whether the trainer is enabled.

func (*PyTorchJobTrainer) IsSupported

func (tt *PyTorchJobTrainer) IsSupported(name, ns string) bool

Check if it's a PyTorch job

func (*PyTorchJobTrainer) ListTrainingJobs

func (tt *PyTorchJobTrainer) ListTrainingJobs(namespace string, allNamespace bool) ([]TrainingJob, error)

func (*PyTorchJobTrainer) Type

Get the type

type Resource

type Resource struct {
	Name         string
	Uid          string
	ResourceType ResourceType
}

type ResourceType

type ResourceType string

type SortPodConditionByLastTransitionTime

type SortPodConditionByLastTransitionTime []v1.PodCondition

Sort the pod condition by time.

func (SortPodConditionByLastTransitionTime) Len

func (SortPodConditionByLastTransitionTime) Less

func (SortPodConditionByLastTransitionTime) Swap

type SparkJob

type SparkJob struct {
	*BasicJobInfo
	// contains filtered or unexported fields
}

spark application wrapper

func (*SparkJob) Age

func (sj *SparkJob) Age() time.Duration

func (*SparkJob) AllPods

func (sj *SparkJob) AllPods() []*v1.Pod

return pods from cache

func (*SparkJob) AllocatedGPU

func (sj *SparkJob) AllocatedGPU() int64

spark job without gpu supported

func (*SparkJob) ChiefPod

func (sj *SparkJob) ChiefPod() *v1.Pod

return driver pod

func (*SparkJob) Duration

func (sj *SparkJob) Duration() time.Duration

Get the Job Training Duration

func (*SparkJob) GetJobDashboards

func (sj *SparkJob) GetJobDashboards(client *kubernetes.Clientset, namespace, arenaNamespace string) ([]string, error)

func (*SparkJob) GetPriorityClass

func (sj *SparkJob) GetPriorityClass() string

Get PriorityClass TODO: @moyuan

func (*SparkJob) GetStatus

func (sj *SparkJob) GetStatus() (status string)

Spark job driver states:

	NewState              ApplicationStateType = ""
	SubmittedState        ApplicationStateType = "SUBMITTED"
	RunningState          ApplicationStateType = "RUNNING"
	CompletedState        ApplicationStateType = "COMPLETED"
	FailedState           ApplicationStateType = "FAILED"
	FailedSubmissionState ApplicationStateType = "SUBMISSION_FAILED"
	PendingRerunState     ApplicationStateType = "PENDING_RERUN"
	InvalidatingState     ApplicationStateType = "INVALIDATING"
	SucceedingState       ApplicationStateType = "SUCCEEDING"
	FailingState          ApplicationStateType = "FAILING"
	UnknownState          ApplicationStateType = "UNKNOWN"

Spark job executor states:

	ExecutorPendingState   ExecutorState = "PENDING"
	ExecutorRunningState   ExecutorState = "RUNNING"
	ExecutorCompletedState ExecutorState = "COMPLETED"
	ExecutorFailedState    ExecutorState = "FAILED"
	ExecutorUnknownState   ExecutorState = "UNKNOWN"

func (*SparkJob) GetTrainJob

func (sj *SparkJob) GetTrainJob() interface{}

func (*SparkJob) HostIPOfChief

func (sj *SparkJob) HostIPOfChief() (hostIP string)

Get the hostIP of the driver Pod

func (*SparkJob) Name

func (sj *SparkJob) Name() string

func (*SparkJob) Namespace

func (sj *SparkJob) Namespace() string

func (*SparkJob) RequestedGPU

func (sj *SparkJob) RequestedGPU() int64

spark job without gpu supported

func (*SparkJob) StartTime

func (sj *SparkJob) StartTime() *metav1.Time

func (*SparkJob) Trainer

func (sj *SparkJob) Trainer() types.TrainingJobType

return trainerType: sparkjob

func (*SparkJob) Uid

func (sj *SparkJob) Uid() string

type SparkJobTrainer

type SparkJobTrainer struct {
	// contains filtered or unexported fields
}

spark job trainer

func (*SparkJobTrainer) GetTrainingJob

func (st *SparkJobTrainer) GetTrainingJob(name, namespace string) (TrainingJob, error)

func (*SparkJobTrainer) IsEnabled

func (st *SparkJobTrainer) IsEnabled() bool

func (*SparkJobTrainer) IsSupported

func (st *SparkJobTrainer) IsSupported(name, ns string) bool

func (*SparkJobTrainer) ListTrainingJobs

func (st *SparkJobTrainer) ListTrainingJobs(namespace string, allNamespace bool) ([]TrainingJob, error)

func (*SparkJobTrainer) Type

type TensorFlowJob

type TensorFlowJob struct {
	*BasicJobInfo
	// contains filtered or unexported fields
}

TensorFlowJob implements the TrainingJob interface and holds the TensorFlow Job Information.

func (*TensorFlowJob) Age

func (tj *TensorFlowJob) Age() time.Duration

Age returns the age of tfjob

func (*TensorFlowJob) AllPods

func (tj *TensorFlowJob) AllPods() []*v1.Pod

AllPods Get all the pods of the Training Job

func (*TensorFlowJob) AllocatedGPU

func (tj *TensorFlowJob) AllocatedGPU() int64

Allocated GPU count of the Job

func (*TensorFlowJob) ChiefPod

func (tj *TensorFlowJob) ChiefPod() *v1.Pod

ChiefPod gets the chief Pod of the Job.

func (*TensorFlowJob) Duration

func (tj *TensorFlowJob) Duration() time.Duration

Duration returns the duration of tfjob

func (*TensorFlowJob) GetJobDashboards

func (tj *TensorFlowJob) GetJobDashboards(client *kubernetes.Clientset, namespace, arenaNamespace string) ([]string, error)

Get Dashboard url of the job

func (*TensorFlowJob) GetPriorityClass

func (t *TensorFlowJob) GetPriorityClass() string

Get PriorityClass

func (*TensorFlowJob) GetStatus

func (tj *TensorFlowJob) GetStatus() (status string)

GetStatus returns the status of the Job: RUNNING, PENDING, SUCCEEDED, FAILED

func (*TensorFlowJob) GetTrainJob

func (tj *TensorFlowJob) GetTrainJob() interface{}

GetTrainJob returns the training job

func (*TensorFlowJob) HostIPOfChief

func (tj *TensorFlowJob) HostIPOfChief() (hostIP string)

Get the hostIP of the chief Pod

func (*TensorFlowJob) Name

func (tj *TensorFlowJob) Name() string

Name returns the TensorflowJob name

func (*TensorFlowJob) Namespace

func (tj *TensorFlowJob) Namespace() string

Namespace returns the namespace of tfjob

func (*TensorFlowJob) RequestedGPU

func (tj *TensorFlowJob) RequestedGPU() int64

Requested GPU count of the Job

func (*TensorFlowJob) StartTime

func (tj *TensorFlowJob) StartTime() *metav1.Time

StartTime returns the start time

func (*TensorFlowJob) Trainer

func (tj *TensorFlowJob) Trainer() types.TrainingJobType

Trainer returns the trainer

func (*TensorFlowJob) Uid

func (tj *TensorFlowJob) Uid() string

Uid returns the TensorflowJob uid

type TensorFlowJobTrainer

type TensorFlowJobTrainer struct {
	// contains filtered or unexported fields
}

TensorFlow Job trainer

func (*TensorFlowJobTrainer) GetTrainingJob

func (tt *TensorFlowJobTrainer) GetTrainingJob(name, namespace string) (TrainingJob, error)

func (*TensorFlowJobTrainer) IsEnabled

func (tt *TensorFlowJobTrainer) IsEnabled() bool

IsEnabled reports whether the trainer is enabled.

func (*TensorFlowJobTrainer) IsSupported

func (tt *TensorFlowJobTrainer) IsSupported(name, namespace string) bool

check if it's TensorFlow job

func (*TensorFlowJobTrainer) ListTrainingJobs

func (tt *TensorFlowJobTrainer) ListTrainingJobs(namespace string, allNamespace bool) ([]TrainingJob, error)

func (*TensorFlowJobTrainer) Type

type Trainer

type Trainer interface {
	// IsEnabled is used to check the trainer is enabled or not
	IsEnabled() bool
	// Check if the training job is supported
	IsSupported(name, ns string) bool

	// Get TrainingJob object directly. this method is called when `arena get`
	GetTrainingJob(name, namespace string) (TrainingJob, error)

	// Get the type of trainer
	Type() types.TrainingJobType

	// List all tf training jobs
	ListTrainingJobs(namespace string, allNamespace bool) ([]TrainingJob, error)
}

func NewDeepSpeedJobTrainer

func NewDeepSpeedJobTrainer() Trainer

NewDeepSpeedJobTrainer new deepspeed job trainer

func NewETJobTrainer

func NewETJobTrainer() Trainer

NewETJobTrainer

func NewMPIJobTrainer

func NewMPIJobTrainer() Trainer

NewMPIJobTrainer

func NewPyTorchJobTrainer

func NewPyTorchJobTrainer() Trainer

NewPyTorchJobTrainer

func NewSparkJobTrainer

func NewSparkJobTrainer() Trainer

func NewTensorFlowJobTrainer

func NewTensorFlowJobTrainer() Trainer

func NewVolcanoJobTrainer

func NewVolcanoJobTrainer() Trainer

type TrainingJob

type TrainingJob interface {
	// Get the chief Pod of the Job.
	ChiefPod() *v1.Pod

	// Get the name of the Training Job
	Name() string

	// Get the unique identity of the Training Job
	Uid() string

	// Get the namespace of the Training Job
	Namespace() string

	// Get all the pods of the Training Job
	AllPods() []*v1.Pod

	// Get all the kubernetes resource of the Training Job
	Resources() []Resource

	// Get the Status of the Job: RUNNING, PENDING, SUCCEEDED, FAILED
	GetStatus() string

	// Return trainer Type, match the training job type
	Trainer() types.TrainingJobType

	// Get the Job Age
	Age() time.Duration

	// Get the Job Duration
	Duration() time.Duration

	// Get start time
	StartTime() *metav1.Time

	// Get Dashboard
	GetJobDashboards(client *kubernetes.Clientset, namespace, arenaNamespace string) ([]string, error)

	// Requested GPU count of the Job
	RequestedGPU() int64

	// Allocated GPU count of the Job
	AllocatedGPU() int64

	// the host ip of the chief pod
	HostIPOfChief() string

	// The priority class name of the training job
	GetPriorityClass() string

	GetTrainJob() interface{}
}

The Training Job can be TensorFlow, MPI and Caffe

func ListTrainingJobs

func ListTrainingJobs(namespace string, allNamespaces bool, jobType types.TrainingJobType) ([]TrainingJob, error)

func SearchTrainingJob

func SearchTrainingJob(jobName, namespace string, jobType types.TrainingJobType) (TrainingJob, error)

SearchTrainingJob searches for the training job with the given name and training type.

type VolcanoJob

type VolcanoJob struct {
	*BasicJobInfo
	// contains filtered or unexported fields
}

volcano Job wrapper

func (*VolcanoJob) Age

func (vj *VolcanoJob) Age() time.Duration

func (*VolcanoJob) AllPods

func (vj *VolcanoJob) AllPods() []*v1.Pod

return pods from cache

func (*VolcanoJob) AllocatedGPU

func (vj *VolcanoJob) AllocatedGPU() int64

volcano job without gpu supported

func (*VolcanoJob) ChiefPod

func (vj *VolcanoJob) ChiefPod() *v1.Pod

return driver pod

func (*VolcanoJob) Duration

func (vj *VolcanoJob) Duration() time.Duration

Get the Job Training Duration

func (*VolcanoJob) GetJobDashboards

func (vj *VolcanoJob) GetJobDashboards(client *kubernetes.Clientset, namespace, arenaNamespace string) ([]string, error)

func (*VolcanoJob) GetPriorityClass

func (vj *VolcanoJob) GetPriorityClass() string

Get PriorityClass

func (*VolcanoJob) GetStatus

func (vj *VolcanoJob) GetStatus() (status string)

func (*VolcanoJob) GetTrainJob

func (vj *VolcanoJob) GetTrainJob() interface{}

func (*VolcanoJob) HostIPOfChief

func (vj *VolcanoJob) HostIPOfChief() (hostIP string)

Get the hostIP of the driver Pod

func (*VolcanoJob) Name

func (vj *VolcanoJob) Name() string

func (*VolcanoJob) Namespace

func (vj *VolcanoJob) Namespace() string

func (*VolcanoJob) RequestedGPU

func (vj *VolcanoJob) RequestedGPU() int64

volcano job without gpu supported

func (*VolcanoJob) StartTime

func (vj *VolcanoJob) StartTime() *metav1.Time

func (*VolcanoJob) Trainer

func (vj *VolcanoJob) Trainer() types.TrainingJobType

return trainerType: volcano job

func (*VolcanoJob) Uid

func (vj *VolcanoJob) Uid() string

type VolcanoJobTrainer

type VolcanoJobTrainer struct {
	// contains filtered or unexported fields
}

volcano job trainer

func (*VolcanoJobTrainer) GetTrainingJob

func (st *VolcanoJobTrainer) GetTrainingJob(name, namespace string) (TrainingJob, error)

func (*VolcanoJobTrainer) IsEnabled

func (st *VolcanoJobTrainer) IsEnabled() bool

IsEnabled reports whether the trainer is enabled.

func (*VolcanoJobTrainer) IsSupported

func (st *VolcanoJobTrainer) IsSupported(name, ns string) bool

func (*VolcanoJobTrainer) ListTrainingJobs

func (st *VolcanoJobTrainer) ListTrainingJobs(namespace string, allNamespace bool) ([]TrainingJob, error)

func (*VolcanoJobTrainer) Type

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL