commands

package
v1.1.5 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 6, 2020 License: Apache-2.0, Apache-2.0 Imports: 50 Imported by: 0

Documentation

Overview

Copyright 2018 The Kubeflow Authors

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

Index

Constants

View Source
const (
	RecommendedConfigPathEnvVar = "ARENA_CONFIG"
	DefaultArenaConfigPath      = "~/.arena/config"
)
View Source
const (
	CHART_PKG_LOC = "CHARTREPO"
	// GPUResourceName is the extended name of the GPU resource since v1.8
	// this uses the device plugin mechanism
	NVIDIAGPUResourceName = "nvidia.com/gpu"
	ALIYUNGPUResourceName = "aliyun.com/gpu-mem"

	DeprecatedNVIDIAGPUResourceName = "alpha.kubernetes.io/nvidia-gpu"
)
View Source
const KUBEFLOW_NAMESPACE = "kubeflow"
View Source
const KUBE_SYSTEM_NAMESPACE = "kube-system"
View Source
const POD_METRIC_TMP = `{__name__=~"%s", pod_name=~"%s"}`
View Source
const PROMETHEUS_INSTALL_DOC_URL = "https://github.com/kubeflow/arena/blob/master/docs/userguide/9-top-job-gpu-metric.md"
View Source
const PROMETHEUS_SCHEME = "http"
View Source
const PROMETHEUS_SVC_LABEL = "kubernetes.io/name=Prometheus"
View Source
const ResourceTypeJob = ResourceType("Job")
View Source
const ResourceTypePod = ResourceType("Pod")
View Source
const ResourceTypeReplicaset = ResourceType("ReplicaSet")
View Source
const ResourceTypeStatefulSet = ResourceType("StatefulSet")

Variables

View Source
var GPU_METRIC_LIST = []string{"nvidia_gpu_duty_cycle", "nvidia_gpu_memory_used_bytes", "nvidia_gpu_memory_total_bytes"}

Functions

func BuildJobInfo added in v0.2.0

func BuildJobInfo(job TrainingJob) *types.JobInfo

BuildJobInfo returns the types.JobInfo built from the given TrainingJob.

func GetJobDashboards

func GetJobDashboards(dashboard string, job *v1.Job, pods []corev1.Pod) []string

func GetJobRealStatus

func GetJobRealStatus(job TrainingJob) string

Get the real job status. When some pods are still pending, the tfJob still shows the Running state, but it should be reported as Pending.

func GetNamespace added in v0.2.0

func GetNamespace() string

func GetPrometheusServiceName

func GetPrometheusServiceName(client *kubernetes.Clientset) (name string, ns string)

Get the Prometheus service name and its namespace, looking in different namespaces.

func GetResourcesEvents added in v0.3.0

func GetResourcesEvents(client *kubernetes.Clientset, namespace string, resources []Resource) (map[string][]v1.Event, error)

Get the events of the given resources of the Job.

func GpuMonitoringInstalled

func GpuMonitoringInstalled(client *kubernetes.Clientset) bool

func ListServing added in v0.2.0

func ListServing(client *kubernetes.Clientset) ([]servejob.Serving, error)

ListServing returns a list of serving

func ListServingJobsByHelm added in v0.2.0

func ListServingJobsByHelm() ([]servejob.Serving, error)

func ListServingsByName added in v0.3.0

func ListServingsByName(client *kubernetes.Clientset, name string) (servings []servejob.Serving, err error)

List Servings by name

func NewBashCommand added in v1.1.0

func NewBashCommand() *cobra.Command

func NewCommand

func NewCommand() *cobra.Command

NewCommand returns a new instance of an Arena command

func NewCompletionCommand

func NewCompletionCommand() *cobra.Command

func NewDataCommand

func NewDataCommand() *cobra.Command

manage data volume

func NewDataListCommand

func NewDataListCommand() *cobra.Command

List Data Command

func NewDeleteCommand

func NewDeleteCommand() *cobra.Command

NewDeleteCommand

func NewExecCommand added in v1.1.0

func NewExecCommand() *cobra.Command

func NewGetCommand

func NewGetCommand() *cobra.Command

NewGetCommand

func NewListCommand

func NewListCommand() *cobra.Command

func NewLogViewerCommand

func NewLogViewerCommand() *cobra.Command

func NewLogsCommand

func NewLogsCommand() *cobra.Command

func NewPruneCommand

func NewPruneCommand() *cobra.Command

func NewRunaiJobCommand added in v1.0.0

func NewRunaiJobCommand() *cobra.Command

func NewServingDeleteCommand

func NewServingDeleteCommand() *cobra.Command

NewServingDeleteCommand

func NewServingGetCommand added in v0.3.0

func NewServingGetCommand() *cobra.Command

NewServingGetCommand starts the command

func NewServingListCommand

func NewServingListCommand() *cobra.Command

func NewServingLogCommand added in v1.0.0

func NewServingLogCommand() *cobra.Command

func NewSubmitCommand

func NewSubmitCommand() *cobra.Command

func NewSubmitRunaiJobArgs added in v1.0.0

func NewSubmitRunaiJobArgs() *submitRunaiJobArgs

func NewTemplateCommand added in v1.1.4

func NewTemplateCommand() *cobra.Command

func NewTemplateGetCommand added in v1.1.4

func NewTemplateGetCommand() *cobra.Command

func NewTemplateListCommand added in v1.1.4

func NewTemplateListCommand() *cobra.Command

func NewTopCommand

func NewTopCommand() *cobra.Command

func NewTopJobCommand

func NewTopJobCommand() *cobra.Command

func NewTopNodeCommand

func NewTopNodeCommand() *cobra.Command

func NewUpdateCommand added in v1.1.0

func NewUpdateCommand() *cobra.Command

func NewVersionCmd

func NewVersionCmd() *cobra.Command

func PrintLine

func PrintLine(w io.Writer, fields ...string)

func PrintTemplates added in v1.1.4

func PrintTemplates(configs []clusterConfig.ClusterConfig)

func SortMapKeys

func SortMapKeys(podMetric PodGpuMetric) []string

Types

type Asset added in v1.1.0

type Asset struct {
	Name        string `json:"name"`
	DownloadUrl string `json:"browser_download_url"`
}

type BasicJobInfo added in v0.3.0

type BasicJobInfo struct {
	// contains filtered or unexported fields
}

func (*BasicJobInfo) CreatedByCLI added in v1.1.0

func (*BasicJobInfo) CreatedByCLI() bool

func (*BasicJobInfo) Image added in v1.1.0

func (j *BasicJobInfo) Image() string

func (*BasicJobInfo) Interactive added in v1.1.0

func (j *BasicJobInfo) Interactive() string

func (*BasicJobInfo) Project added in v1.1.0

func (j *BasicJobInfo) Project() string

func (*BasicJobInfo) Resources added in v0.3.0

func (j *BasicJobInfo) Resources() []Resource

func (*BasicJobInfo) ServiceURLs added in v1.1.0

func (*BasicJobInfo) ServiceURLs() []string

func (*BasicJobInfo) User added in v1.1.0

func (j *BasicJobInfo) User() string

type GithubResponse added in v1.1.0

type GithubResponse struct {
	AssetsUrl string  `json:"assets_url"`
	Assets    []Asset `json:"assets"`
}

type GpuMetric

type GpuMetric struct {
	GpuDutyCycle   float64
	GpuMemoryUsed  float64
	GpuMemoryTotal float64
}

type GpuMetricInfo

type GpuMetricInfo struct {
	MetricName    string
	Value         string
	Time          float64
	PodName       string
	PodNamespace  string
	ContainerName string
	NodeName      string
	GPUUID        string
	Id            string
}

func QueryMetricByPrometheus

func QueryMetricByPrometheus(client *kubernetes.Clientset, prometheusServiceName string, namespace string, query string) ([]GpuMetricInfo, error)

type JobGpuMetric

type JobGpuMetric map[string]PodGpuMetric

func GetJobGpuMetric

func GetJobGpuMetric(client *kubernetes.Clientset, job TrainingJob) (jobMetric JobGpuMetric, err error)

func GetPodsGpuInfo

func GetPodsGpuInfo(client *kubernetes.Clientset, prometheusServiceName string, namespace string, podNames []string) (JobGpuMetric, error)

func (JobGpuMetric) GetPodMetrics

func (m JobGpuMetric) GetPodMetrics(podName string) PodGpuMetric

func (*JobGpuMetric) SetPodMetric

func (m *JobGpuMetric) SetPodMetric(metric GpuMetricInfo)

type JobInfo

type JobInfo struct {
	*BasicJobInfo
	// contains filtered or unexported fields
}

func (*JobInfo) Age

func (ji *JobInfo) Age() time.Duration

func (*JobInfo) AllPods

func (ji *JobInfo) AllPods() []v1.Pod

Get all the pods of the Training Job

func (*JobInfo) AllocatedGPU

func (ji *JobInfo) AllocatedGPU() int64

Allocated GPU count of the Job

func (*JobInfo) ChiefPod

func (ji *JobInfo) ChiefPod() v1.Pod

Get the chief Pod of the Job.

func (*JobInfo) Duration added in v0.2.0

func (ji *JobInfo) Duration() time.Duration

Get the Job Training Duration

func (*JobInfo) GetStatus

func (ji *JobInfo) GetStatus() (status string)

Get the Status of the Job: RUNNING, PENDING, SUCCEEDED, FAILED

func (*JobInfo) HostIPOfChief

func (ji *JobInfo) HostIPOfChief() (hostIP string)

Get the hostIP of the chief Pod

func (*JobInfo) Name

func (ji *JobInfo) Name() string

func (*JobInfo) Namespace added in v0.2.0

func (ji *JobInfo) Namespace() string

func (*JobInfo) RequestedGPU

func (ji *JobInfo) RequestedGPU() int64

Requested GPU count of the Job

func (*JobInfo) StartTime

func (ji *JobInfo) StartTime() *metav1.Time

func (*JobInfo) Trainer

func (ji *JobInfo) Trainer() string

func (*JobInfo) Uid added in v0.3.0

func (ji *JobInfo) Uid() string

type NodeDescriber

type NodeDescriber struct {
	// contains filtered or unexported fields
}

type NodeInfo

type NodeInfo struct {
	// contains filtered or unexported fields
}

type PodGpuMetric

type PodGpuMetric map[string]*GpuMetric

type PrintArgs

type PrintArgs struct {
	ShowEvents bool
	Output     string
}

type PrometheusMetric

type PrometheusMetric struct {
	Status string               `json:"status,inline"`
	Data   PrometheusMetricData `json:"data,omitempty"`
}

type PrometheusMetricData

type PrometheusMetricData struct {
	Result     []PrometheusMetricResult `json:"result"`
	ResultType string                   `json:"resultType"`
}

type PrometheusMetricResult

type PrometheusMetricResult struct {
	Metric map[string]string       `json:"metric"`
	Value  []PrometheusMetricValue `json:"value"`
}

type PrometheusMetricValue

type PrometheusMetricValue interface{}

type PruneArgs

type PruneArgs struct {
	// contains filtered or unexported fields
}

type Resource added in v0.3.0

type Resource struct {
	Name         string
	Uid          string
	ResourceType ResourceType
}

type ResourceType added in v0.3.0

type ResourceType string

type RunaiJob added in v1.0.0

type RunaiJob struct {
	*BasicJobInfo
	// contains filtered or unexported fields
}

func NewRunaiJob added in v1.1.0

func NewRunaiJob(pods []v1.Pod, lastCreatedPod *v1.Pod, creationTimestamp metav1.Time, trainingType string, jobName string, interactive bool, createdByCLI bool, serviceUrls []string, deleted bool, podSpec v1.PodSpec, podMetadata metav1.ObjectMeta, namespace string, ownerResource Resource) *RunaiJob

func (*RunaiJob) Age added in v1.0.0

func (rj *RunaiJob) Age() time.Duration

Get the Job Age

func (*RunaiJob) AllPods added in v1.0.0

func (rj *RunaiJob) AllPods() []v1.Pod

Get all the pods of the Training Job

func (*RunaiJob) AllocatedGPU added in v1.0.0

func (rj *RunaiJob) AllocatedGPU() int64

Allocated GPU count of the Job

func (*RunaiJob) ChiefPod added in v1.0.0

func (rj *RunaiJob) ChiefPod() *v1.Pod

Get the chief Pod of the Job.

func (*RunaiJob) CreatedByCLI added in v1.1.0

func (rj *RunaiJob) CreatedByCLI() bool

func (*RunaiJob) Duration added in v1.0.0

func (rj *RunaiJob) Duration() time.Duration

TODO Get the Job Duration

func (*RunaiJob) GetJobDashboards added in v1.0.0

func (rj *RunaiJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)

Get Dashboard

func (*RunaiJob) GetPriorityClass added in v1.0.0

func (rj *RunaiJob) GetPriorityClass() string

The priority class name of the training job

func (*RunaiJob) GetStatus added in v1.0.0

func (rj *RunaiJob) GetStatus() string

Get the Status of the Job, e.g. RUNNING or PENDING.

func (*RunaiJob) HostIPOfChief added in v1.0.0

func (rj *RunaiJob) HostIPOfChief() string

The host IP of the chief pod.

func (*RunaiJob) Image added in v1.1.0

func (rj *RunaiJob) Image() string

func (*RunaiJob) Interactive added in v1.1.0

func (rj *RunaiJob) Interactive() string

func (*RunaiJob) Name added in v1.0.0

func (rj *RunaiJob) Name() string

Get the name of the Training Job

func (*RunaiJob) Namespace added in v1.0.0

func (rj *RunaiJob) Namespace() string

Get the namespace of the Training Job

func (*RunaiJob) Project added in v1.1.0

func (rj *RunaiJob) Project() string

func (*RunaiJob) RequestedGPU added in v1.0.0

func (rj *RunaiJob) RequestedGPU() int64

Requested GPU count of the Job

func (*RunaiJob) Resources added in v1.0.0

func (rj *RunaiJob) Resources() []Resource

Get all the kubernetes resource of the Training Job

func (*RunaiJob) ServiceURLs added in v1.1.0

func (rj *RunaiJob) ServiceURLs() []string

func (*RunaiJob) StartTime added in v1.0.0

func (rj *RunaiJob) StartTime() *metav1.Time

Get start time

func (*RunaiJob) Trainer added in v1.0.0

func (rj *RunaiJob) Trainer() string

Return the trainer type; the supported types are MPI, standalone and tensorflow.

func (*RunaiJob) User added in v1.1.0

func (rj *RunaiJob) User() string

type RunaiJobInfo added in v1.1.0

type RunaiJobInfo struct {
	// contains filtered or unexported fields
}

type RunaiOwnerInfo added in v1.1.5

type RunaiOwnerInfo struct {
	Name string
	Type string
	Uid  string
}

type RunaiTrainer added in v1.0.0

type RunaiTrainer struct {
	// contains filtered or unexported fields
}

func (*RunaiTrainer) GetTrainingJob added in v1.0.0

func (rt *RunaiTrainer) GetTrainingJob(name, namespace string) (TrainingJob, error)

func (*RunaiTrainer) IsSupported added in v1.0.0

func (rt *RunaiTrainer) IsSupported(name, ns string) bool

func (*RunaiTrainer) ListTrainingJobs added in v1.0.0

func (rt *RunaiTrainer) ListTrainingJobs(namespace string) ([]TrainingJob, error)

func (*RunaiTrainer) Type added in v1.0.0

func (rt *RunaiTrainer) Type() string

type SortPodConditionByLastTransitionTime added in v0.2.0

type SortPodConditionByLastTransitionTime []v1.PodCondition

Sort the pod condition by time.

func (SortPodConditionByLastTransitionTime) Len added in v0.2.0

func (SortPodConditionByLastTransitionTime) Less added in v0.2.0

func (SortPodConditionByLastTransitionTime) Swap added in v0.2.0

type Trainer

type Trainer interface {
	// Check if the training job is supported
	IsSupported(name, ns string) bool

	// Get TrainingJob object directly. this method is called when `arena get`
	GetTrainingJob(name, namespace string) (TrainingJob, error)

	// Get the type of trainer
	Type() string

	ListTrainingJobs(namespace string) ([]TrainingJob, error)
}

func NewRunaiTrainer added in v1.0.0

func NewRunaiTrainer(client kubernetes.Interface) Trainer

func NewTrainers

func NewTrainers(client *kubernetes.Clientset) []Trainer

construct the trainer list

type TrainingJob

type TrainingJob interface {
	// Get the chief Pod of the Job.
	ChiefPod() *v1.Pod

	// Get the name of the Training Job
	Name() string

	// Get the namespace of the Training Job
	Namespace() string

	// Get all the pods of the Training Job
	AllPods() []v1.Pod

	// Get all the kubernetes resource of the Training Job
	Resources() []Resource

	// Get the Status of the Job: RUNNING, PENDING,
	GetStatus() string

	// Return trainer Type, support MPI, standalone, tensorflow
	Trainer() string

	// Get the Job Age
	Age() time.Duration

	// Get the Job Duration
	Duration() time.Duration

	// Get start time
	StartTime() *metav1.Time

	// Get Dashboard
	GetJobDashboards(client *kubernetes.Clientset) ([]string, error)

	// Requested GPU count of the Job
	RequestedGPU() int64

	// Requested GPU count of the Job
	AllocatedGPU() int64

	// the host ip of the chief pod
	HostIPOfChief() string

	// The priority class name of the training job
	GetPriorityClass() string

	Project() string

	User() string

	Interactive() string

	Image() string

	CreatedByCLI() bool

	ServiceURLs() []string
}

The Training Job can be TensorFlow, MPI or Caffe.

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL