commands

package
v0.5.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jul 9, 2020 License: Apache-2.0, Apache-2.0 Imports: 57 Imported by: 0

Documentation

Overview

Copyright 2018 The Kubeflow Authors

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

Index

Constants

View Source
const (
	RecommendedConfigPathEnvVar = "ARENA_CONFIG"
	DefaultArenaConfigPath      = "~/.arena/config"
)
View Source
const (
	CHART_PKG_LOC = "CHARTREPO"
	// NVIDIAGPUResourceName is the extended name of the GPU resource since v1.8
	// this uses the device plugin mechanism
	NVIDIAGPUResourceName = "nvidia.com/gpu"
	ALIYUNGPUResourceName = "aliyun.com/gpu-mem"

	DeprecatedNVIDIAGPUResourceName = "alpha.kubernetes.io/nvidia-gpu"
)
View Source
const (
	// CLIName is the name of the CLI
	CLIName = "arena"
)
View Source
const KUBEFLOW_NAMESPACE = "kubeflow"
View Source
const KUBE_SYSTEM_NAMESPACE = "kube-system"
View Source
const POD_METRIC_TMP = `{__name__=~"%s", pod_name=~"%s"}`
View Source
const PROMETHEUS_INSTALL_DOC_URL = "https://github.com/kubeflow/arena/blob/master/docs/userguide/9-top-job-gpu-metric.md"
View Source
const PROMETHEUS_SCHEME = "http"
View Source
const PROMETHEUS_SVC_LABEL = "kubernetes.io/name=Prometheus"
View Source
const ResourceTypeJob = ResourceType("Job")
View Source
const ResourceTypePod = ResourceType("Pod")
View Source
const ResourceTypeStatefulSet = ResourceType("StatefulSet")

Variables

View Source
var GPU_METRIC_LIST = []string{"nvidia_gpu_duty_cycle", "nvidia_gpu_memory_used_bytes", "nvidia_gpu_memory_total_bytes"}

Functions

func BuildJobInfo added in v0.2.0

func BuildJobInfo(job TrainingJob) *types.JobInfo

BuildJobInfo returns the types.JobInfo built from the given training job.

func GetJobDashboards

func GetJobDashboards(dashboard string, job *v1.Job, pods []corev1.Pod) []string

func GetJobRealStatus

func GetJobRealStatus(job TrainingJob) string

Get the real job status. When some pods are still pending, the tfJob is still shown in the Running state, but it should be reported as Pending.

func GetNamespace added in v0.2.0

func GetNamespace() string

func GetPrometheusServiceName

func GetPrometheusServiceName(client *kubernetes.Clientset) (name string, ns string)

Get the Prometheus service from different namespaces

func GetResourcesEvents added in v0.3.0

func GetResourcesEvents(client *kubernetes.Clientset, namespace string, resources []Resource) (map[string][]v1.Event, error)

Get the events of the given resources

func GpuMonitoringInstalled

func GpuMonitoringInstalled(client *kubernetes.Clientset) bool

func ListServing added in v0.2.0

func ListServing(client *kubernetes.Clientset) ([]servejob.Serving, error)

ListServing returns a list of serving

func ListServingJobsByHelm added in v0.2.0

func ListServingJobsByHelm() ([]servejob.Serving, error)

func ListServingsByName added in v0.3.0

func ListServingsByName(client *kubernetes.Clientset, name string) (servings []servejob.Serving, err error)

List Servings by name

func NewCommand

func NewCommand() *cobra.Command

NewCommand returns a new instance of an Arena command

func NewCompletionCommand

func NewCompletionCommand() *cobra.Command

func NewDataCommand

func NewDataCommand() *cobra.Command

manage data volume

func NewDataListCommand

func NewDataListCommand() *cobra.Command

List Data Command

func NewDeleteCommand

func NewDeleteCommand() *cobra.Command

NewDeleteCommand

func NewGetCommand

func NewGetCommand() *cobra.Command

NewGetCommand

func NewListCommand

func NewListCommand() *cobra.Command

func NewLogViewerCommand

func NewLogViewerCommand() *cobra.Command

func NewLogsCommand

func NewLogsCommand() *cobra.Command

func NewPruneCommand

func NewPruneCommand() *cobra.Command

func NewServeCommand

func NewServeCommand() *cobra.Command

func NewServingCustomCommand added in v0.3.0

func NewServingCustomCommand() *cobra.Command

func NewServingDeleteCommand

func NewServingDeleteCommand() *cobra.Command

NewServingDeleteCommand

func NewServingGetCommand added in v0.3.0

func NewServingGetCommand() *cobra.Command

NewServingGetCommand starts the command

func NewServingListCommand

func NewServingListCommand() *cobra.Command

func NewServingLogCommand added in v0.3.1

func NewServingLogCommand() *cobra.Command

func NewServingTensorFlowCommand

func NewServingTensorFlowCommand() *cobra.Command

func NewServingTensorRTCommand added in v0.2.0

func NewServingTensorRTCommand() *cobra.Command

func NewSparkApplicationCommand added in v0.2.0

func NewSparkApplicationCommand() *cobra.Command

See https://github.com/GoogleCloudPlatform/spark-on-k8s-operator

sparkApplication is supported by default; scheduledSparkApplication is not supported.

func NewSubmitCommand

func NewSubmitCommand() *cobra.Command

func NewSubmitHorovodJobCommand

func NewSubmitHorovodJobCommand() *cobra.Command

NewSubmitHorovodJobCommand

func NewSubmitMPIJobCommand

func NewSubmitMPIJobCommand() *cobra.Command

func NewSubmitPyTorchJobCommand added in v0.5.0

func NewSubmitPyTorchJobCommand() *cobra.Command

func NewSubmitSparkJobArgs added in v0.2.0

func NewSubmitSparkJobArgs() *submitSparkJobArgs

func NewSubmitStandaloneJobCommand

func NewSubmitStandaloneJobCommand() *cobra.Command

func NewSubmitTFJobCommand

func NewSubmitTFJobCommand() *cobra.Command

func NewSubmitVolcanoJobArgs added in v0.2.0

func NewSubmitVolcanoJobArgs() *submitVolcanoJobArgs

func NewTopCommand

func NewTopCommand() *cobra.Command

func NewTopJobCommand

func NewTopJobCommand() *cobra.Command

func NewTopNodeCommand

func NewTopNodeCommand() *cobra.Command

func NewTrafficRouterSplitCommand

func NewTrafficRouterSplitCommand() *cobra.Command

func NewVersionCmd

func NewVersionCmd(cliName string) *cobra.Command

func NewVolcanoJobCommand added in v0.2.0

func NewVolcanoJobCommand() *cobra.Command

func ParseMountPath

func ParseMountPath(dataset []string) (err error)

func PrintLine

func PrintLine(w io.Writer, fields ...string)

func SortMapKeys

func SortMapKeys(podMetric PodGpuMetric) []string

Types

type BasicJobInfo added in v0.3.0

type BasicJobInfo struct {
	// contains filtered or unexported fields
}

func (*BasicJobInfo) Resources added in v0.3.0

func (j *BasicJobInfo) Resources() []Resource

type Destination

type Destination struct {
	*istiov1alpha3.Destination
	Port *PortSelector `protobuf:"bytes,3,opt,name=port" json:"port,omitempty"`
}

type DestinationRuleCRD

type DestinationRuleCRD struct {
	// Kind is a string value representing the REST resource this object represents.
	// Servers may infer this from the endpoint the client submits requests to.
	// Cannot be updated.
	// In CamelCase.
	// More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds
	// +optional
	Kind string `json:"kind,omitempty" protobuf:"bytes,1,opt,name=kind"`

	// APIVersion defines the versioned schema of this representation of an object.
	// Servers should convert recognized schemas to the latest internal value, and
	// may reject unrecognized values.
	// More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources
	// +optional
	APIVersion        string `json:"apiVersion,omitempty" protobuf:"bytes,2,opt,name=apiVersion"`
	metav1.ObjectMeta `json:"metadata,omitempty" yaml:"metadata,omitempty" protobuf:"bytes,1,opt,name=metadata"`
	Spec              istiov1alpha3.DestinationRule `json:"spec,omitempty" yaml:"spec,omitempty" protobuf:"bytes,2,opt,name=spec"`
}

type DestinationWeight

type DestinationWeight struct {
	Destination *Destination `protobuf:"bytes,1,opt,name=destination" json:"destination,omitempty"`
	Weight      int32        `protobuf:"varint,2,opt,name=weight,proto3" json:"weight"`
}

type Driver added in v0.2.0

type Driver struct {
	CPURequest    int    `yaml:"CPURequest"`
	MemoryRequest string `yaml:"MemoryRequest"`
}

type Executor added in v0.2.0

type Executor struct {
	Replicas      int    `yaml:"Replicas"`
	CPURequest    int    `yaml:"CPURequest"`
	MemoryRequest string `yaml:"MemoryRequest"`
}

type GpuMetric

type GpuMetric struct {
	GpuDutyCycle   float64
	GpuMemoryUsed  float64
	GpuMemoryTotal float64
}

type GpuMetricInfo

type GpuMetricInfo struct {
	MetricName    string
	Value         string
	Time          float64
	PodName       string
	PodNamespace  string
	ContainerName string
	NodeName      string
	GPUUID        string
	Id            string
}

func QueryMetricByPrometheus

func QueryMetricByPrometheus(client *kubernetes.Clientset, prometheusServiceName string, namespace string, query string) ([]GpuMetricInfo, error)

type HTTPMatchRequest

type HTTPMatchRequest struct {
	*istiov1alpha3.HTTPMatchRequest
	Uri *StringMatchPrefix `protobuf:"bytes,1,opt,name=uri" json:"uri,omitempty"`
}

type HTTPRoute

type HTTPRoute struct {
	*istiov1alpha3.HTTPRoute
	Match []*HTTPMatchRequest  `protobuf:"bytes,1,rep,name=match" json:"match,omitempty"`
	Route []*DestinationWeight `protobuf:"bytes,2,rep,name=route" json:"route,omitempty"`
}

type HorovodJob

type HorovodJob struct {
	*JobInfo
}

Horovod Job Information

func (*HorovodJob) AllPods

func (hj *HorovodJob) AllPods() []v1.Pod

Get all the pods of the Training Job

func (*HorovodJob) ChiefPod

func (hj *HorovodJob) ChiefPod() v1.Pod

Get the chief Pod of the Job.

func (*HorovodJob) GetJobDashboards

func (hj *HorovodJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)

Get Dashboard url of the job

func (*HorovodJob) GetPriorityClass added in v0.3.0

func (hj *HorovodJob) GetPriorityClass() string

Get PriorityClass

func (*HorovodJob) HostIPOfChief

func (hj *HorovodJob) HostIPOfChief() (hostIP string)

Get the hostIP of the chief Pod

type HorovodJobTrainer

type HorovodJobTrainer struct {
	// contains filtered or unexported fields
}

Horovod Job trainer

func (*HorovodJobTrainer) GetTrainingJob

func (m *HorovodJobTrainer) GetTrainingJob(name, namespace string) (tj TrainingJob, err error)

func (*HorovodJobTrainer) IsSupported

func (m *HorovodJobTrainer) IsSupported(name, ns string) bool

check if it's Horovod job

func (*HorovodJobTrainer) ListTrainingJobs added in v0.2.0

func (hj *HorovodJobTrainer) ListTrainingJobs() (jobs []TrainingJob, err error)

List Training jobs

func (*HorovodJobTrainer) Type

func (m *HorovodJobTrainer) Type() string

type JobGpuMetric

type JobGpuMetric map[string]PodGpuMetric

func GetJobGpuMetric

func GetJobGpuMetric(client *kubernetes.Clientset, job TrainingJob) (jobMetric JobGpuMetric, err error)

func GetPodsGpuInfo

func GetPodsGpuInfo(client *kubernetes.Clientset, prometheusServiceName string, namespace string, podNames []string) (JobGpuMetric, error)

func (JobGpuMetric) GetPodMetrics

func (m JobGpuMetric) GetPodMetrics(podName string) PodGpuMetric

func (*JobGpuMetric) SetPodMetric

func (m *JobGpuMetric) SetPodMetric(metric GpuMetricInfo)

type JobInfo

type JobInfo struct {
	*BasicJobInfo
	// contains filtered or unexported fields
}

func (*JobInfo) Age

func (ji *JobInfo) Age() time.Duration

func (*JobInfo) AllPods

func (ji *JobInfo) AllPods() []v1.Pod

Get all the pods of the Training Job

func (*JobInfo) AllocatedGPU

func (ji *JobInfo) AllocatedGPU() int64

Allocated GPU count of the Job

func (*JobInfo) ChiefPod

func (ji *JobInfo) ChiefPod() v1.Pod

Get the chief Pod of the Job.

func (*JobInfo) Duration added in v0.2.0

func (ji *JobInfo) Duration() time.Duration

Get the Job Training Duration

func (*JobInfo) GetStatus

func (ji *JobInfo) GetStatus() (status string)

Get the Status of the Job: RUNNING, PENDING, SUCCEEDED, FAILED

func (*JobInfo) HostIPOfChief

func (ji *JobInfo) HostIPOfChief() (hostIP string)

Get the hostIP of the chief Pod

func (*JobInfo) Name

func (ji *JobInfo) Name() string

func (*JobInfo) Namespace added in v0.2.0

func (ji *JobInfo) Namespace() string

func (*JobInfo) RequestedGPU

func (ji *JobInfo) RequestedGPU() int64

Requested GPU count of the Job

func (*JobInfo) StartTime

func (ji *JobInfo) StartTime() *metav1.Time

func (*JobInfo) Trainer

func (ji *JobInfo) Trainer() string

func (*JobInfo) Uid added in v0.3.0

func (ji *JobInfo) Uid() string

type MPIJob

type MPIJob struct {
	*BasicJobInfo
	// contains filtered or unexported fields
}

MPI Job Information

func (*MPIJob) Age

func (mj *MPIJob) Age() time.Duration

Get the Job Age

func (*MPIJob) AllPods

func (mj *MPIJob) AllPods() []v1.Pod

Get all the pods of the Training Job

func (*MPIJob) AllocatedGPU

func (mj *MPIJob) AllocatedGPU() int64

Allocated GPU count of the Job

func (*MPIJob) ChiefPod

func (mj *MPIJob) ChiefPod() v1.Pod

Get the chief Pod of the Job.

func (*MPIJob) Duration added in v0.2.0

func (mj *MPIJob) Duration() time.Duration

Get the Job Training Duration

func (*MPIJob) GetJobDashboards

func (mj *MPIJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)

Get Dashboard url of the job

func (*MPIJob) GetPriorityClass added in v0.3.0

func (m *MPIJob) GetPriorityClass() string

Get PriorityClass

func (*MPIJob) GetStatus

func (mj *MPIJob) GetStatus() (status string)

Get the Status of the Job: RUNNING, PENDING, SUCCEEDED, FAILED

func (*MPIJob) HostIPOfChief

func (mj *MPIJob) HostIPOfChief() (hostIP string)

Get the hostIP of the chief Pod

func (*MPIJob) Name

func (mj *MPIJob) Name() string

func (*MPIJob) Namespace added in v0.2.0

func (mj *MPIJob) Namespace() string

func (*MPIJob) RequestedGPU

func (mj *MPIJob) RequestedGPU() int64

Requested GPU count of the Job

func (*MPIJob) StartTime

func (mj *MPIJob) StartTime() *metav1.Time

Get the start time

func (*MPIJob) Trainer

func (mj *MPIJob) Trainer() string

func (*MPIJob) Uid added in v0.3.0

func (mj *MPIJob) Uid() string

type MPIJobTrainer

type MPIJobTrainer struct {
	// contains filtered or unexported fields
}

MPI Job trainer

func (*MPIJobTrainer) GetTrainingJob

func (tt *MPIJobTrainer) GetTrainingJob(name, namespace string) (tj TrainingJob, err error)

Get the training job from cache or directly

func (*MPIJobTrainer) IsSupported

func (tt *MPIJobTrainer) IsSupported(name, ns string) bool

check if it's MPI job

func (*MPIJobTrainer) ListTrainingJobs added in v0.2.0

func (tt *MPIJobTrainer) ListTrainingJobs() (jobs []TrainingJob, err error)

List Training jobs

func (*MPIJobTrainer) Type

func (tt *MPIJobTrainer) Type() string

Get the type

type NodeDescriber

type NodeDescriber struct {
	// contains filtered or unexported fields
}

type NodeInfo

type NodeInfo struct {
	// contains filtered or unexported fields
}

type PodGpuMetric

type PodGpuMetric map[string]*GpuMetric

type PortSelector

type PortSelector struct {
	*istiov1alpha3.PortSelector
	Number uint32 `protobuf:"varint,1,opt,name=number,proto3,oneof" json:"number,omitempty"`
}

type PreprocesObject

type PreprocesObject struct {
	ServiceName     string
	Namespace       string
	DestinationRule DestinationRuleCRD
	VirtualService  VirtualServiceCRD
}

type PrintArgs

type PrintArgs struct {
	ShowEvents bool
	Output     string
}

type PrometheusMetric

type PrometheusMetric struct {
	Status string               `json:"status,inline"`
	Data   PrometheusMetricData `json:"data,omitempty"`
}

type PrometheusMetricData

type PrometheusMetricData struct {
	Result     []PrometheusMetricResult `json:"result"`
	ResultType string                   `json:"resultType"`
}

type PrometheusMetricResult

type PrometheusMetricResult struct {
	Metric map[string]string       `json:"metric"`
	Value  []PrometheusMetricValue `json:"value"`
}

type PrometheusMetricValue

type PrometheusMetricValue interface{}

type PruneArgs

type PruneArgs struct {
	// contains filtered or unexported fields
}

type PyTorchJob added in v0.5.0

type PyTorchJob struct {
	*BasicJobInfo
	// contains filtered or unexported fields
}

PyTorch Job Information

func (*PyTorchJob) Age added in v0.5.0

func (pj *PyTorchJob) Age() time.Duration

Get the Job Age

func (*PyTorchJob) AllPods added in v0.5.0

func (pj *PyTorchJob) AllPods() []v1.Pod

Get all the pods of the Training Job

func (*PyTorchJob) AllocatedGPU added in v0.5.0

func (pj *PyTorchJob) AllocatedGPU() int64

Allocated GPU count of the Job

func (*PyTorchJob) ChiefPod added in v0.5.0

func (pj *PyTorchJob) ChiefPod() v1.Pod

Get the master Pod of the Job.

func (*PyTorchJob) Duration added in v0.5.0

func (pj *PyTorchJob) Duration() time.Duration

Get the Job Training Duration

func (*PyTorchJob) GetJobDashboards added in v0.5.0

func (pj *PyTorchJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)

Get Dashboard url of the job

func (*PyTorchJob) GetPriorityClass added in v0.5.0

func (p *PyTorchJob) GetPriorityClass() string

Get PriorityClass

func (*PyTorchJob) GetStatus added in v0.5.0

func (pj *PyTorchJob) GetStatus() (status string)

Get the Status of the Job: RUNNING, PENDING, SUCCEEDED, FAILED

func (*PyTorchJob) HostIPOfChief added in v0.5.0

func (pj *PyTorchJob) HostIPOfChief() (hostIP string)

Get the hostIP of the master Pod

func (*PyTorchJob) Name added in v0.5.0

func (pj *PyTorchJob) Name() string

func (*PyTorchJob) Namespace added in v0.5.0

func (pj *PyTorchJob) Namespace() string

func (*PyTorchJob) RequestedGPU added in v0.5.0

func (pj *PyTorchJob) RequestedGPU() int64

Requested GPU count of the Job

func (*PyTorchJob) StartTime added in v0.5.0

func (pj *PyTorchJob) StartTime() *metav1.Time

Get the start time

func (*PyTorchJob) Trainer added in v0.5.0

func (pj *PyTorchJob) Trainer() string

func (*PyTorchJob) Uid added in v0.5.0

func (pj *PyTorchJob) Uid() string

type PyTorchJobTrainer added in v0.5.0

type PyTorchJobTrainer struct {
	// contains filtered or unexported fields
}

PyTorch Job trainer

func (*PyTorchJobTrainer) GetTrainingJob added in v0.5.0

func (tt *PyTorchJobTrainer) GetTrainingJob(name, namespace string) (tj TrainingJob, err error)

Get the training job from cache or directly

func (*PyTorchJobTrainer) IsSupported added in v0.5.0

func (tt *PyTorchJobTrainer) IsSupported(name, ns string) bool

check if it's PyTorch job

func (*PyTorchJobTrainer) ListTrainingJobs added in v0.5.0

func (tt *PyTorchJobTrainer) ListTrainingJobs() (jobs []TrainingJob, err error)

List Training jobs

func (*PyTorchJobTrainer) Type added in v0.5.0

func (tt *PyTorchJobTrainer) Type() string

Get the type

type Resource added in v0.3.0

type Resource struct {
	Name         string
	Uid          string
	ResourceType ResourceType
}

type ResourceType added in v0.3.0

type ResourceType string

type Runtime added in v0.3.3

type Runtime interface {
	// contains filtered or unexported methods
}

type ServeArgs

type ServeArgs struct {
	ImagePullPolicy string            `yaml:"imagePullPolicy"` // --imagePullPolicy
	GPUCount        int               `yaml:"gpuCount"`        // --gpus
	GPUMemory       int               `yaml:"gpuMemory"`       // --gpumemory
	Cpu             string            `yaml:"cpu"`             // --cpu
	Memory          string            `yaml:"memory"`          // --memory
	Envs            map[string]string `yaml:"envs"`            // --envs
	Command         string            `yaml:"command"`         // --command
	Replicas        int               `yaml:"replicas"`        // --replicas
	Port            int               `yaml:"port"`            // --port
	RestfulPort     int               `yaml:"restApiPort"`     // --restfulPort
	EnableIstio     bool              `yaml:"enableIstio"`     // --enableIstio
	ExposeService   bool              `yaml:"exposeService"`   // --exposeService
	ServingName     string            `yaml:"servingName"`     // --servingName
	ServingVersion  string            `yaml:"servingVersion"`  // --servingVersion
	ModelDirs       map[string]string `yaml:"modelDirs"`
	NodeSelectors   map[string]string `yaml:"nodeSelectors"` // --selector
	Tolerations     []string          `yaml:"tolerations"`   // --toleration
	Annotations     map[string]string `yaml:"annotations"`

	ModelServiceExists bool `yaml:"modelServiceExists"` // --modelServiceExists
}

func (ServeArgs) PreCheck added in v0.3.0

func (s ServeArgs) PreCheck() error

PreCheck gives some checking for args.

type ServeCustomArgs added in v0.3.0

type ServeCustomArgs struct {
	// Version string `yaml:"version"` // --version
	Image     string `yaml:"image"` // --image
	ServeArgs `yaml:",inline"`
}

type ServeTensorFlowArgs

type ServeTensorFlowArgs struct {
	VersionPolicy          string `yaml:"versionPolicy"`   // --versionPolicy
	ModelConfigFile        string `yaml:"modelConfigFile"` // --modelConfigFile
	ModelConfigFileContent string `yaml:"modelConfigFileContent"`
	Image                  string `yaml:"image"`     // --image
	ModelName              string `yaml:"modelName"` // --modelName
	ModelPath              string `yaml:"modelPath"` // --modelPath

	ServeArgs `yaml:",inline"`
}

type ServeTensorRTArgs added in v0.2.0

type ServeTensorRTArgs struct {
	Image        string `yaml:"image"`        // --image
	ModelStore   string `yaml:"modelStore"`   // --modelStore
	MetricsPort  int    `yaml:"metricsPort"`  // --metricsPort
	HttpPort     int    `yaml:"httpPort"`     // --httpPort
	GrpcPort     int    `yaml:"grpcPort"`     // --grpcPort
	AllowMetrics bool   `yaml:"allowMetrics"` // --allowMetrics

	ServeArgs `yaml:",inline"`
}

type SortPodConditionByLastTransitionTime added in v0.2.0

type SortPodConditionByLastTransitionTime []v1.PodCondition

Sort the pod condition by time.

func (SortPodConditionByLastTransitionTime) Len added in v0.2.0

func (SortPodConditionByLastTransitionTime) Less added in v0.2.0

func (SortPodConditionByLastTransitionTime) Swap added in v0.2.0

type SparkJob added in v0.2.0

type SparkJob struct {
	*BasicJobInfo
	// contains filtered or unexported fields
}

spark application wrapper

func (*SparkJob) Age added in v0.2.0

func (sj *SparkJob) Age() time.Duration

func (*SparkJob) AllPods added in v0.2.0

func (sj *SparkJob) AllPods() []v1.Pod

return pods from cache

func (*SparkJob) AllocatedGPU added in v0.2.0

func (sj *SparkJob) AllocatedGPU() int64

spark job without gpu supported

func (*SparkJob) ChiefPod added in v0.2.0

func (sj *SparkJob) ChiefPod() v1.Pod

return driver pod

func (*SparkJob) Duration added in v0.2.0

func (sj *SparkJob) Duration() time.Duration

Get the Job Training Duration

func (*SparkJob) GetJobDashboards added in v0.2.0

func (sj *SparkJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)

func (*SparkJob) GetPriorityClass added in v0.3.0

func (sj *SparkJob) GetPriorityClass() string

Get PriorityClass TODO: @moyuan

func (*SparkJob) GetStatus added in v0.2.0

func (sj *SparkJob) GetStatus() (status string)
spark job driver state

	NewState              ApplicationStateType = ""
	SubmittedState        ApplicationStateType = "SUBMITTED"
	RunningState          ApplicationStateType = "RUNNING"
	CompletedState        ApplicationStateType = "COMPLETED"
	FailedState           ApplicationStateType = "FAILED"
	FailedSubmissionState ApplicationStateType = "SUBMISSION_FAILED"
	PendingRerunState     ApplicationStateType = "PENDING_RERUN"
	InvalidatingState     ApplicationStateType = "INVALIDATING"
	SucceedingState       ApplicationStateType = "SUCCEEDING"
	FailingState          ApplicationStateType = "FAILING"
	UnknownState          ApplicationStateType = "UNKNOWN"

spark job executor state

	ExecutorPendingState   ExecutorState = "PENDING"
	ExecutorRunningState   ExecutorState = "RUNNING"
	ExecutorCompletedState ExecutorState = "COMPLETED"
	ExecutorFailedState    ExecutorState = "FAILED"
	ExecutorUnknownState   ExecutorState = "UNKNOWN"

func (*SparkJob) HostIPOfChief added in v0.2.0

func (sj *SparkJob) HostIPOfChief() (hostIP string)

Get the hostIP of the driver Pod

func (*SparkJob) Name added in v0.2.0

func (sj *SparkJob) Name() string

func (*SparkJob) Namespace added in v0.2.0

func (sj *SparkJob) Namespace() string

func (*SparkJob) RequestedGPU added in v0.2.0

func (sj *SparkJob) RequestedGPU() int64

spark job without gpu supported

func (*SparkJob) StartTime added in v0.2.0

func (sj *SparkJob) StartTime() *metav1.Time

func (*SparkJob) Trainer added in v0.2.0

func (sj *SparkJob) Trainer() string

return trainerType: sparkjob

func (*SparkJob) Uid added in v0.3.0

func (sj *SparkJob) Uid() string

type SparkJobTrainer added in v0.2.0

type SparkJobTrainer struct {
	// contains filtered or unexported fields
}

spark job trainer

func (*SparkJobTrainer) GetTrainingJob added in v0.2.0

func (st *SparkJobTrainer) GetTrainingJob(name, namespace string) (job TrainingJob, err error)

func (*SparkJobTrainer) IsSupported added in v0.2.0

func (st *SparkJobTrainer) IsSupported(name, ns string) bool

func (*SparkJobTrainer) ListTrainingJobs added in v0.2.0

func (st *SparkJobTrainer) ListTrainingJobs() (jobs []TrainingJob, err error)

func (*SparkJobTrainer) Type added in v0.2.0

func (st *SparkJobTrainer) Type() string

type StandaloneJob

type StandaloneJob struct {
	*JobInfo
}

Standalone Job Information

func (*StandaloneJob) GetJobDashboards

func (sj *StandaloneJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)

Get Dashboard url of the job

func (*StandaloneJob) GetPriorityClass added in v0.3.0

func (sj *StandaloneJob) GetPriorityClass() string

Get PriorityClass

type StandaloneJobTrainer

type StandaloneJobTrainer struct {
	// contains filtered or unexported fields
}

Standalone Job trainer

func (*StandaloneJobTrainer) GetTrainingJob

func (s *StandaloneJobTrainer) GetTrainingJob(name, namespace string) (tj TrainingJob, err error)

func (*StandaloneJobTrainer) IsSupported

func (s *StandaloneJobTrainer) IsSupported(name, ns string) bool

check if it's Standalone job

func (*StandaloneJobTrainer) ListTrainingJobs added in v0.2.0

func (s *StandaloneJobTrainer) ListTrainingJobs() (jobs []TrainingJob, err error)

List Training jobs

func (*StandaloneJobTrainer) Type

func (s *StandaloneJobTrainer) Type() string

type StringMatchPrefix

type StringMatchPrefix struct {
	Prefix string `protobuf:"bytes,2,opt,name=prefix,proto3,oneof" json:"prefix,omitempty"`
}

type TensorFlowJob

type TensorFlowJob struct {
	*BasicJobInfo
	// contains filtered or unexported fields
}

TensorFlow Job Information

func (*TensorFlowJob) Age

func (tj *TensorFlowJob) Age() time.Duration

Get the Job Age

func (*TensorFlowJob) AllPods

func (tj *TensorFlowJob) AllPods() []v1.Pod

Get all the pods of the Training Job

func (*TensorFlowJob) AllocatedGPU

func (tj *TensorFlowJob) AllocatedGPU() int64

Allocated GPU count of the Job

func (*TensorFlowJob) ChiefPod

func (tj *TensorFlowJob) ChiefPod() v1.Pod

Get the chief Pod of the Job.

func (*TensorFlowJob) Duration added in v0.2.0

func (tj *TensorFlowJob) Duration() time.Duration

Get the Job Training Duration

func (*TensorFlowJob) GetJobDashboards

func (tj *TensorFlowJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)

Get Dashboard url of the job

func (*TensorFlowJob) GetPriorityClass added in v0.3.0

func (t *TensorFlowJob) GetPriorityClass() string

Get PriorityClass

func (*TensorFlowJob) GetStatus

func (tj *TensorFlowJob) GetStatus() (status string)

Get the Status of the Job: RUNNING, PENDING, SUCCEEDED, FAILED

func (*TensorFlowJob) HostIPOfChief

func (tj *TensorFlowJob) HostIPOfChief() (hostIP string)

Get the hostIP of the chief Pod

func (*TensorFlowJob) Name

func (tj *TensorFlowJob) Name() string

func (*TensorFlowJob) Namespace added in v0.2.0

func (tj *TensorFlowJob) Namespace() string

func (*TensorFlowJob) RequestedGPU

func (tj *TensorFlowJob) RequestedGPU() int64

Requested GPU count of the Job

func (*TensorFlowJob) StartTime

func (tj *TensorFlowJob) StartTime() *metav1.Time

func (*TensorFlowJob) Trainer

func (tj *TensorFlowJob) Trainer() string

func (*TensorFlowJob) Uid added in v0.3.0

func (tj *TensorFlowJob) Uid() string

type TensorFlowJobTrainer

type TensorFlowJobTrainer struct {
	// contains filtered or unexported fields
}

TensorFlow Job trainer

func (*TensorFlowJobTrainer) GetTrainingJob

func (tt *TensorFlowJobTrainer) GetTrainingJob(name, namespace string) (tj TrainingJob, err error)

func (*TensorFlowJobTrainer) IsSupported

func (tt *TensorFlowJobTrainer) IsSupported(name, ns string) bool

check if it's TensorFlow job

func (*TensorFlowJobTrainer) ListTrainingJobs added in v0.2.0

func (tt *TensorFlowJobTrainer) ListTrainingJobs() (jobs []TrainingJob, err error)

List Training jobs

func (*TensorFlowJobTrainer) Type

func (tt *TensorFlowJobTrainer) Type() string

type Trainer

type Trainer interface {
	// Check if the training job is supported
	IsSupported(name, ns string) bool

	// Get TrainingJob object directly. this method is called when `arena get`
	GetTrainingJob(name, namespace string) (TrainingJob, error)

	// Get the type of trainer
	Type() string

	ListTrainingJobs() ([]TrainingJob, error)
}

func NewHorovodJobTrainer

func NewHorovodJobTrainer(client *kubernetes.Clientset) Trainer

Create HorovodJob Trainer

func NewMPIJobTrainer

func NewMPIJobTrainer(client *kubernetes.Clientset) Trainer

NewMPIJobTrainer

func NewPyTorchJobTrainer added in v0.5.0

func NewPyTorchJobTrainer(client *kubernetes.Clientset) Trainer

NewPyTorchJobTrainer

func NewSparkJobTrainer added in v0.2.0

func NewSparkJobTrainer(client *kubernetes.Clientset) Trainer

func NewStandaloneJobTrainer

func NewStandaloneJobTrainer(client *kubernetes.Clientset) Trainer

func NewTensorFlowJobTrainer

func NewTensorFlowJobTrainer(client *kubernetes.Clientset) Trainer

func NewTrainers

func NewTrainers(client *kubernetes.Clientset) []Trainer

construct the trainer list

func NewVolcanoJobTrainer added in v0.2.0

func NewVolcanoJobTrainer(client *kubernetes.Clientset) Trainer

type TrainingJob

type TrainingJob interface {
	// Get the chief Pod of the Job.
	ChiefPod() v1.Pod

	// Get the name of the Training Job
	Name() string

	// Get the namespace of the Training Job
	Namespace() string

	// Get all the pods of the Training Job
	AllPods() []v1.Pod

	// Get all the kubernetes resource of the Training Job
	Resources() []Resource

	// Get the Status of the Job: RUNNING, PENDING, SUCCEEDED, FAILED
	GetStatus() string

	// Return trainer Type, support MPI, standalone, tensorflow
	Trainer() string

	// Get the Job Age
	Age() time.Duration

	// Get the Job Duration
	Duration() time.Duration

	// Get start time
	StartTime() *metav1.Time

	// Get Dashboard
	GetJobDashboards(client *kubernetes.Clientset) ([]string, error)

	// Requested GPU count of the Job
	RequestedGPU() int64

	// Allocated GPU count of the Job
	AllocatedGPU() int64

	// the host ip of the chief pod
	HostIPOfChief() string

	// The priority class name of the training job
	GetPriorityClass() string
}

The Training Job can be TensorFlow, MPI and Caffe

type VirtualService

type VirtualService struct {
	*istiov1alpha3.VirtualService
	Http []*HTTPRoute `protobuf:"bytes,3,rep,name=http" json:"http,omitempty"`
}

type VirtualServiceCRD

type VirtualServiceCRD struct {
	// Kind is a string value representing the REST resource this object represents.
	// Servers may infer this from the endpoint the client submits requests to.
	// Cannot be updated.
	// In CamelCase.
	// More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds
	// +optional
	Kind string `json:"kind,omitempty" protobuf:"bytes,1,opt,name=kind"`

	// APIVersion defines the versioned schema of this representation of an object.
	// Servers should convert recognized schemas to the latest internal value, and
	// may reject unrecognized values.
	// More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources
	// +optional
	APIVersion        string `json:"apiVersion,omitempty" protobuf:"bytes,2,opt,name=apiVersion"`
	metav1.ObjectMeta `json:"metadata,omitempty" yaml:"metadata,omitempty" protobuf:"bytes,1,opt,name=metadata"`
	Spec              VirtualService `json:"spec,omitempty" yaml:"spec,omitempty" protobuf:"bytes,2,opt,name=spec"`
}

type VolcanoJob added in v0.2.0

type VolcanoJob struct {
	*BasicJobInfo
	// contains filtered or unexported fields
}

volcano Job wrapper

func (*VolcanoJob) Age added in v0.2.0

func (vj *VolcanoJob) Age() time.Duration

func (*VolcanoJob) AllPods added in v0.2.0

func (vj *VolcanoJob) AllPods() []v1.Pod

return pods from cache

func (*VolcanoJob) AllocatedGPU added in v0.2.0

func (vj *VolcanoJob) AllocatedGPU() int64

volcano job without gpu supported

func (*VolcanoJob) ChiefPod added in v0.2.0

func (vj *VolcanoJob) ChiefPod() v1.Pod

return driver pod

func (*VolcanoJob) Duration added in v0.2.0

func (vj *VolcanoJob) Duration() time.Duration

Get the Job Training Duration

func (*VolcanoJob) GetJobDashboards added in v0.2.0

func (vj *VolcanoJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)

func (*VolcanoJob) GetPriorityClass added in v0.3.0

func (vj *VolcanoJob) GetPriorityClass() string

Get PriorityClass

func (*VolcanoJob) GetStatus added in v0.2.0

func (vj *VolcanoJob) GetStatus() (status string)

func (*VolcanoJob) HostIPOfChief added in v0.2.0

func (vj *VolcanoJob) HostIPOfChief() (hostIP string)

Get the hostIP of the driver Pod

func (*VolcanoJob) Name added in v0.2.0

func (vj *VolcanoJob) Name() string

func (*VolcanoJob) Namespace added in v0.2.0

func (vj *VolcanoJob) Namespace() string

func (*VolcanoJob) RequestedGPU added in v0.2.0

func (vj *VolcanoJob) RequestedGPU() int64

volcano job without gpu supported

func (*VolcanoJob) StartTime added in v0.2.0

func (vj *VolcanoJob) StartTime() *metav1.Time

func (*VolcanoJob) Trainer added in v0.2.0

func (vj *VolcanoJob) Trainer() string

return trainerType: volcano job

func (*VolcanoJob) Uid added in v0.3.0

func (vj *VolcanoJob) Uid() string

type VolcanoJobTrainer added in v0.2.0

type VolcanoJobTrainer struct {
	// contains filtered or unexported fields
}

volcano job trainer

func NewVolcanoJobTrainerSubmit added in v0.2.0

func NewVolcanoJobTrainerSubmit(client *kubernetes.Clientset) *VolcanoJobTrainer

func (*VolcanoJobTrainer) GetTrainingJob added in v0.2.0

func (st *VolcanoJobTrainer) GetTrainingJob(name, namespace string) (job TrainingJob, err error)

func (*VolcanoJobTrainer) GetTrainingJobAtSubmit added in v0.2.0

func (st *VolcanoJobTrainer) GetTrainingJobAtSubmit(name, namespace string) (job TrainingJob, err error)

func (*VolcanoJobTrainer) IsSupported added in v0.2.0

func (st *VolcanoJobTrainer) IsSupported(name, ns string) bool

func (*VolcanoJobTrainer) ListTrainingJobs added in v0.2.0

func (st *VolcanoJobTrainer) ListTrainingJobs() (jobs []TrainingJob, err error)

func (*VolcanoJobTrainer) Type added in v0.2.0

func (st *VolcanoJobTrainer) Type() string

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL