types

package
v0.8.1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 18, 2021 License: Apache-2.0, Apache-2.0 Imports: 6 Imported by: 36

Documentation

Overview

Copyright 2018 The Kubeflow Authors

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License

Index

Constants

View Source
// GPU-share related identifiers: two "aliyun.com/..." resource names, the
// name of an environment variable, an allocation label key, and a
// comma-separated list of key=value node labels.
// NOTE(review): presumably GPUShareNodeLabels lists alternative labels that
// each mark a node as a GPU-share node — confirm against the code that parses it.
const (
	GPUShareResourceName    = "aliyun.com/gpu-mem"
	GPUShareCountName       = "aliyun.com/gpu-count"
	GPUShareEnvGPUID        = "ALIYUN_COM_GPU_MEM_IDX"
	GPUShareAllocationLabel = "scheduler.framework.gpushare.allocation"
	GPUShareNodeLabels      = "gpushare=true,cgpu=true,ack.node.gpu.schedule=share,ack.node.gpu.schedule=cgpu"
)
View Source
// GPU-topology related identifiers: the "aliyun.com/gpu" resource name, two
// "topology.kubernetes.io/..." keys (gpu-group and gpu-visible), and the
// key=value node label associated with topology scheduling.
const (
	AliyunGPUResourceName      = "aliyun.com/gpu"
	GPUTopologyAllocationLabel = "topology.kubernetes.io/gpu-group"
	GPUTopologyVisibleGPULabel = "topology.kubernetes.io/gpu-visible"
	GPUTopologyNodeLabels      = "ack.node.gpu.schedule=topology"
)
View Source
// KUBEFLOW_NAMESPACE is the namespace name "kubeflow".
const KUBEFLOW_NAMESPACE = "kubeflow"
View Source
// KUBE_SYSTEM_NAMESPACE is the namespace name "kube-system".
const KUBE_SYSTEM_NAMESPACE = "kube-system"
View Source
// NODE_METRIC_TMP is a format template with two %s verbs: the first fills the
// __name__ matcher and the second fills the node_name matcher. Both matchers
// use the =~ operator.
// NOTE(review): presumably a Prometheus selector whose arguments are regular
// expressions — confirm against the code that formats it.
const NODE_METRIC_TMP = `{__name__=~"%s", node_name=~"%s"}`
View Source
const (
	// NvidiaGPUResourceName defines the nvidia resource name.
	NvidiaGPUResourceName = "nvidia.com/gpu"
)
View Source
// POD_METRIC_TMP is a format template with two %s verbs: the first fills the
// __name__ matcher and the second fills the pod_name matcher. Both matchers
// use the =~ operator.
// NOTE(review): presumably a Prometheus selector whose arguments are regular
// expressions — confirm against the code that formats it.
const POD_METRIC_TMP = `{__name__=~"%s", pod_name=~"%s"}`
View Source
// PROMETHEUS_INSTALL_DOC_URL is the URL of the arena user-guide page
// "9-top-job-gpu-metric.md" in the kubeflow/arena repository.
const PROMETHEUS_INSTALL_DOC_URL = "https://github.com/kubeflow/arena/blob/master/docs/userguide/9-top-job-gpu-metric.md"
View Source
// PROMETHEUS_SCHEME is the URL scheme "http".
const PROMETHEUS_SCHEME = "http"
View Source
// PROMETHEUS_SVC_LABEL is the key=value label "kubernetes.io/name=Prometheus".
// The same label is used by the "default-old" entry of SUPPORT_PROMETHEUS_SERVERS.
const PROMETHEUS_SVC_LABEL = "kubernetes.io/name=Prometheus"
View Source
const (
	// RequestGPUsOfJobAnnoKey is the key "requestGPUsOfJobOwner".
	// NOTE(review): the name suggests an annotation key — confirm where it is read.
	RequestGPUsOfJobAnnoKey = "requestGPUsOfJobOwner"
)

Variables

View Source
// ErrTrainingJobNotFound is the sentinel error for a training job that cannot
// be found; its message points the user at 'arena list'.
var ErrTrainingJobNotFound = errors.New("training job not found,please use 'arena list' to make sure job is existed.")
View Source
// GPU_METRIC_LIST holds three GPU metric names: duty cycle, memory used
// (bytes) and memory total (bytes). Every entry of SUPPORT_PROMETHEUS_SERVERS
// uses the same three names in its MetricList.
var GPU_METRIC_LIST = []string{"nvidia_gpu_duty_cycle", "nvidia_gpu_memory_used_bytes", "nvidia_gpu_memory_total_bytes"}
View Source
// NodeTypeSlice lists the alias and one-letter shorthand of the four concrete
// node types, in this order: NormalNode, GPUExclusiveNode, GPUTopologyNode and
// GPUShareNode. UnknownNode and AllKnownNode have no entry.
var NodeTypeSlice = []NodeTypeInfo{
	{
		Name:      NormalNode,
		Alias:     "none",
		Shorthand: "n",
	},
	{
		Name:      GPUExclusiveNode,
		Alias:     "exclusive",
		Shorthand: "e",
	},
	{
		Name:      GPUTopologyNode,
		Alias:     "topology",
		Shorthand: "t",
	},
	{
		Name:      GPUShareNode,
		Alias:     "share",
		Shorthand: "s",
	},
}
View Source
// SUPPORT_PROMETHEUS_SERVERS lists three Prometheus server descriptions:
// "arms-prometheus-admin" (port 9335), "default" (port 9090) and
// "default-old" (port 9090). They differ in Name, ServiceLabels and Port;
// Protocol, Path and MetricList are the same for all three. The Service
// field is left nil in every entry.
// NOTE(review): slice order may matter if callers try the entries in
// sequence — confirm before reordering.
var SUPPORT_PROMETHEUS_SERVERS = []*PrometheusServer{

	{
		Name:          "arms-prometheus-admin",
		ServiceLabels: "kubernetes.io/service-name=prometheus-admin",
		Protocol:      "http",
		Port:          "9335",
		Path:          "api/v1/query",
		MetricList: []string{
			"nvidia_gpu_duty_cycle",
			"nvidia_gpu_memory_used_bytes",
			"nvidia_gpu_memory_total_bytes",
		},
	},
	{
		Name:          "default",
		ServiceLabels: "kubernetes.io/service-name=prometheus-server",
		Protocol:      "http",
		Port:          "9090",
		Path:          "api/v1/query",
		MetricList: []string{
			"nvidia_gpu_duty_cycle",
			"nvidia_gpu_memory_used_bytes",
			"nvidia_gpu_memory_total_bytes",
		},
	},

	{
		Name:          "default-old",
		ServiceLabels: "kubernetes.io/name=Prometheus",
		Protocol:      "http",
		Port:          "9090",
		Path:          "api/v1/query",
		MetricList: []string{
			"nvidia_gpu_duty_cycle",
			"nvidia_gpu_memory_used_bytes",
			"nvidia_gpu_memory_total_bytes",
		},
	},
}
View Source
// ServingTypeMap is keyed by serving job type. Each entry repeats the key as
// Name and adds a display alias and a shorthand. It covers the custom,
// KFServing, Tensorflow, Tensorrt and Seldon serving types; AllServingJob and
// UnknownServingJob have no entry.
var ServingTypeMap = map[ServingJobType]ServingTypeInfo{
	CustomServingJob: {
		Name:      CustomServingJob,
		Alias:     "Custom",
		Shorthand: "custom",
	},
	KFServingJob: {
		Name:      KFServingJob,
		Alias:     "KFServing",
		Shorthand: "kf",
	},
	TFServingJob: {
		Name:      TFServingJob,
		Alias:     "Tensorflow",
		Shorthand: "tf",
	},
	TRTServingJob: {
		Name:      TRTServingJob,
		Alias:     "Tensorrt",
		Shorthand: "trt",
	},
	SeldonServingJob: {
		Name:      SeldonServingJob,
		Alias:     "Seldon",
		Shorthand: "seldon",
	},
}

ServingTypeMap collects the serving job types together with their aliases and shorthands.

View Source
// TrainingTypeMap is keyed by training job type. Each entry repeats the key
// as Name and adds a display alias and a shorthand. It covers the Tensorflow,
// MPI, Pytorch, Horovod, Volcano, ElasticTraining and Spark training types.
var TrainingTypeMap = map[TrainingJobType]TrainingJobTypeInfo{
	TFTrainingJob: {
		Name:      TFTrainingJob,
		Alias:     "Tensorflow",
		Shorthand: "tf",
	},
	MPITrainingJob: {
		Name:      MPITrainingJob,
		Alias:     "MPI",
		Shorthand: "mpi",
	},
	PytorchTrainingJob: {
		Name:      PytorchTrainingJob,
		Alias:     "Pytorch",
		Shorthand: "py",
	},
	HorovodTrainingJob: {
		Name:      HorovodTrainingJob,
		Alias:     "Horovod",
		Shorthand: "horovod",
	},
	VolcanoTrainingJob: {
		Name:      VolcanoTrainingJob,
		Alias:     "Volcano",
		Shorthand: "volcano",
	},
	ETTrainingJob: {
		Name:      ETTrainingJob,
		Alias:     "ElasticTraining",
		Shorthand: "et",
	},
	SparkTrainingJob: {
		Name:      SparkTrainingJob,
		Alias:     "Spark",
		Shorthand: "spark",
	},
}

TrainingTypeMap collects the training job types together with their aliases and shorthands.

Functions

This section is empty.

Types

type AdvancedGpuMetric

// AdvancedGpuMetric carries the metric values of one GPU device (duty cycle,
// used memory, total memory) together with the device id, its UUID and the
// names of the associated pods. It serializes to JSON and YAML under the same
// keys. Units are not established by this declaration.
type AdvancedGpuMetric struct {
	Id             string  `json:"id" yaml:"id"`
	UUID           string  `json:"uuid" yaml:"uuid"`
	GpuDutyCycle   float64 `json:"gpuDutyCycle" yaml:"gpuDutyCycle"`
	GpuMemoryUsed  float64 `json:"usedGPUMemory" yaml:"usedGPUMemory"`
	GpuMemoryTotal float64 `json:"totalGPUMemory" yaml:"totalGPUMemory"`
	// PodNames holds one entry per pod; each entry combines the namespace and
	// the pod name, like 'namespace/pod_name'.
	PodNames []string `json:"podNames" yaml:"podNames"`
}

type AllNodeInfo

// AllNodeInfo maps a string key to a list of values of arbitrary type.
// NOTE(review): presumably keyed by node type, with the per-type node info
// structs as values — confirm against callers.
type AllNodeInfo map[string][]interface{}

type ArenaClientArgs

// ArenaClientArgs groups the settings of an arena client: a kubeconfig value,
// a namespace, an arena namespace, a daemon-mode switch and a log level.
// NOTE(review): Kubeconfig is presumably a file path and LogLevel one of the
// LogLevel string values — confirm against the client constructor.
type ArenaClientArgs struct {
	Kubeconfig     string
	Namespace      string
	ArenaNamespace string
	IsDaemonMode   bool
	LogLevel       string
}

type CommonGPUNodeInfo

// CommonGPUNodeInfo holds the GPU counters (total, allocated, unhealthy) and
// the per-device metrics of a node. It is embedded inline by the GPU node
// info types (exclusive, share, topology).
type CommonGPUNodeInfo struct {
	TotalGPUs     int                  `json:"totalGPUs" yaml:"totalGPUs"`
	AllocatedGPUs int                  `json:"allocatedGPUs" yaml:"allocatedGPUs"`
	UnhealthyGPUs int                  `json:"unhealthyGPUs" yaml:"unhealthyGPUs"`
	GPUMetrics    []*AdvancedGpuMetric `json:"gpuMetrics" yaml:"gpuMetrics"`
}

type CommonNodeInfo

// CommonNodeInfo holds the attributes shared by every node info type: name,
// description, IP, status, role and node type. It is embedded inline by
// NormalNodeInfo and by the GPU node info types.
type CommonNodeInfo struct {
	Name        string   `json:"name" yaml:"name"`
	Description string   `json:"description" yaml:"description"`
	IP          string   `json:"ip" yaml:"ip"`
	Status      string   `json:"status" yaml:"status"`
	Role        string   `json:"role" yaml:"role"`
	Type        NodeType `json:"type" yaml:"type"`
}

type CommonServingArgs

// CommonServingArgs holds the arguments shared by the serving job argument
// types (it is embedded inline by CustomServingArgs, KFServingArgs and
// SeldonServingArgs). Trailing comments name the matching command-line
// option where one is recorded. Namespace and Type carry the yaml tag "-"
// and are therefore excluded from YAML output.
type CommonServingArgs struct {
	Name            string            `yaml:"servingName"`
	Version         string            `yaml:"servingVersion"`
	Namespace       string            `yaml:"-"`
	Type            ServingJobType    `yaml:"-"`
	Image           string            `yaml:"image"`
	ImagePullPolicy string            `yaml:"imagePullPolicy"` // --imagePullPolicy
	GPUCount        int               `yaml:"gpuCount"`        // --gpus
	GPUMemory       int               `yaml:"gpuMemory"`       // --gpumemory
	Cpu             string            `yaml:"cpu"`             // --cpu
	Memory          string            `yaml:"memory"`          // --memory
	Envs            map[string]string `yaml:"envs"`            // --envs
	Command         string            `yaml:"command"`         // --command
	Replicas        int               `yaml:"replicas"`        // --replicas
	EnableIstio     bool              `yaml:"enableIstio"`     // --enableIstio
	ExposeService   bool              `yaml:"exposeService"`   // --exposeService
	ModelDirs       map[string]string `yaml:"modelDirs"`
	HostVolumes     []DataDirVolume   `yaml:"hostVolumes"`   // --data-dir
	NodeSelectors   map[string]string `yaml:"nodeSelectors"` // --selector
	Tolerations     []string          `yaml:"tolerations"`   // --toleration
	Annotations     map[string]string `yaml:"annotations"`

	ModelServiceExists bool `yaml:"modelServiceExists"` // --modelServiceExists
}

type CommonSubmitArgs

// CommonSubmitArgs holds the arguments shared by the training job submit
// argument types (it is embedded inline by the TF, MPI, PyTorch, Horovod and
// ET submit argument structs). Fields tagged yaml:"-" are excluded from YAML
// output.
type CommonSubmitArgs struct {

	// Name stores the job name, matches option --name
	Name string `yaml:"-"`

	// Namespace stores the namespace of the job, matches option --namespace
	Namespace string `yaml:"-"`

	// TrainingType stores the training job type
	TrainingType TrainingJobType `yaml:"-"`

	// NodeSelectors defines the node selectors, matches option --selector
	NodeSelectors map[string]string `yaml:"nodeSelectors"`

	// ConfigFiles stores the config files which exist on the client host
	// and maps them into the container, matches option --config-file
	ConfigFiles map[string]map[string]ConfigFileInfo `yaml:"configFiles"`

	// Tolerations defines the tolerations which tolerate node taints,
	// matches option --toleration
	Tolerations []string `yaml:"tolerations"`

	// Image stores the docker image of the job, matches option --image
	Image string `yaml:"image"`

	// GPUCount stores the number of GPUs the job needs, matches option --gpus
	GPUCount int `yaml:"gpuCount"`

	// Envs stores the environment variables of the job's container, matches option --env
	Envs map[string]string `yaml:"envs"`

	// WorkingDir stores the working directory of the job's container, matches option --working-dir
	WorkingDir string `yaml:"workingDir"`

	// Command stores the command of the job
	Command string `yaml:"command"`

	// Mode is used for horovod, matches option --sync-mode
	Mode string `yaml:"mode"`

	// WorkerCount stores the number of job workers, matches option --workers
	WorkerCount int `yaml:"workers"`

	// Retry defines the retry times
	Retry int `yaml:"retry"`

	// DataSet stores the kubernetes pvc names
	DataSet map[string]string `yaml:"dataset"`

	// DataDirs stores the files (or directories) on the k8s node which will be
	// mapped into containers, matches option --data-dir
	DataDirs []DataDirVolume `yaml:"dataDirs"`

	// EnableRDMA enables rdma or not, matches option --rdma
	EnableRDMA bool `yaml:"enableRDMA"`

	// UseENI defines using eni or not
	UseENI bool `yaml:"useENI"`

	// Annotations defines the pod annotations of the job, matches option --annotation
	Annotations map[string]string `yaml:"annotations"`

	// IsNonRoot presumably reports that the job does not run as the root
	// user — TODO confirm (the previous comment read "is root user or not").
	IsNonRoot bool `yaml:"isNonRoot"`

	// PodSecurityContext defines the pod security context
	PodSecurityContext LimitedPodSecurityContext `yaml:"podSecurityContext"`

	// PriorityClassName defines the priority class
	PriorityClassName string `yaml:"priorityClassName"`

	// Conscheduling defines using Conscheduling.
	// NOTE(review): unlike its neighbours this field carries no yaml tag.
	Conscheduling bool

	// PodGroupName stores the pod group name
	PodGroupName string `yaml:"podGroupName"`

	// PodGroupMinAvailable stores the pod group min available
	PodGroupMinAvailable string `yaml:"podGroupMinAvailable"`

	// ImagePullSecrets stores the image pull secrets, matches option --image-pull-secrets
	ImagePullSecrets []string `yaml:"imagePullSecrets"`

	// HelmOptions stores the helm options
	HelmOptions []string `yaml:"-"`
}

CommonSubmitArgs defines the arguments that are common to all training job submit commands.

type ConfigFileInfo

// ConfigFileInfo describes one config file: the file on the host, the file
// name and path inside the container, and a key.
// NOTE(review): Key is presumably the key under which the file content is
// stored (e.g. in a ConfigMap) — confirm against the submit code.
type ConfigFileInfo struct {
	ContainerFileName string `yaml:"containerFileName"`
	HostFile          string `yaml:"hostFile"`
	Key               string `yaml:"key"`
	ContainerFilePath string `yaml:"containerFilePath"`
}

ConfigFileInfo defines the config files which will be mounted to containers

type CustomServingArgs

// CustomServingArgs holds the arguments of a custom serving job: two port
// numbers plus the inlined CommonServingArgs.
type CustomServingArgs struct {
	Port              int `yaml:"port"`        // --port
	RestfulPort       int `yaml:"restApiPort"` // --restfulPort
	CommonServingArgs `yaml:",inline"`
}

type DataDirVolume

// DataDirVolume pairs a path on the host with a path inside the container
// and gives the volume a name.
type DataDirVolume struct {
	// HostPath defines the host path
	HostPath string `yaml:"hostPath"`
	// ContainerPath defines the container path
	ContainerPath string `yaml:"containerPath"`
	// Name defines the volume name
	Name string `yaml:"name"`
}

DataDirVolume defines the volume of kubernetes

type Destination

// Destination embeds the Istio v1alpha3 Destination and declares its own
// Port field, serialized under the JSON key "port" using the local
// PortSelector type.
// NOTE(review): if the embedded type also has a Port field, this outer field
// takes precedence by Go's embedding rules — confirm that is the intent.
type Destination struct {
	*istiov1alpha3.Destination
	Port *PortSelector `protobuf:"bytes,3,opt,name=port" json:"port,omitempty"`
}

type DestinationRuleCRD

// DestinationRuleCRD is the custom-resource form of an Istio v1alpha3
// DestinationRule: Kind and APIVersion, embedded object metadata, and the
// DestinationRule itself as Spec.
type DestinationRuleCRD struct {
	// Kind is a string value representing the REST resource this object represents.
	// Servers may infer this from the endpoint the client submits requests to.
	// Cannot be updated.
	// In CamelCase.
	// More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds
	// +optional
	Kind string `json:"kind,omitempty" protobuf:"bytes,1,opt,name=kind"`

	// APIVersion defines the versioned schema of this representation of an object.
	// Servers should convert recognized schemas to the latest internal value, and
	// may reject unrecognized values.
	// More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources
	// +optional
	APIVersion        string `json:"apiVersion,omitempty" protobuf:"bytes,2,opt,name=apiVersion"`
	metav1.ObjectMeta `json:"metadata,omitempty" yaml:"metadata,omitempty" protobuf:"bytes,1,opt,name=metadata"`
	Spec              istiov1alpha3.DestinationRule `json:"spec,omitempty" yaml:"spec,omitempty" protobuf:"bytes,2,opt,name=spec"`
}

type DestinationWeight

// DestinationWeight pairs a Destination with an int32 weight. The weight's
// JSON tag has no omitempty, so a zero weight is still emitted.
type DestinationWeight struct {
	Destination *Destination `protobuf:"bytes,1,opt,name=destination" json:"destination,omitempty"`
	Weight      int32        `protobuf:"varint,2,opt,name=weight,proto3" json:"weight"`
}

type Driver

// Driver holds the CPU and memory requests of a driver; it is referenced by
// SubmitSparkJobArgs. Note the capitalized YAML keys.
type Driver struct {
	CPURequest    int    `yaml:"CPURequest"`
	MemoryRequest string `yaml:"MemoryRequest"`
}

type Endpoint

// Endpoint describes one endpoint by name, port and node port; it is used in
// ServingJobInfo.Endpoints.
type Endpoint struct {
	// Name is the endpoint name
	Name string `json:"name" yaml:"name"`
	// Port specifies the endpoint port
	Port int `json:"port" yaml:"port"`
	// NodePort specifies the node port
	NodePort int `json:"nodePort" yaml:"nodePort"`
}

type Executor

// Executor holds the replica count and the CPU and memory requests of an
// executor; it is referenced by SubmitSparkJobArgs. Note the capitalized
// YAML keys.
type Executor struct {
	Replicas      int    `yaml:"Replicas"`
	CPURequest    int    `yaml:"CPURequest"`
	MemoryRequest string `yaml:"MemoryRequest"`
}

type FormatStyle

// FormatStyle names an output format; its values are the *Format constants
// ("wide", "json", "yaml", "unknown").
type FormatStyle string

FormatStyle defines the format of the output; it is only used in cmd.

const (
	// WideFormat defines the wide format
	WideFormat FormatStyle = "wide"
	// JsonFormat defines the json format
	JsonFormat FormatStyle = "json"
	// YamlFormat defines the yaml format
	YamlFormat FormatStyle = "yaml"
	// UnknownFormat defines the unknown format
	UnknownFormat FormatStyle = "unknown"
)

type GPUDeviceInfo

// GPUDeviceInfo holds the id of a GPU device together with its total,
// allocated and used GPU memory and its duty cycle. Units are not
// established by this declaration.
type GPUDeviceInfo struct {
	ID                 string  `json:"id" yaml:"id"`
	TotalGPUMemory     float64 `json:"totalGPUMemory" yaml:"totalGPUMemory"`
	AllocatedGPUMemory float64 `json:"allocatedGPUMemory" yaml:"allocatedGPUMemory"`
	UsedGPUMemory      float64 `json:"usedGPUMemory" yaml:"usedGPUMemory"`
	DutyCycle          float64 `json:"dutyCycle" yaml:"dutyCycle"`
}

type GPUExclusiveNodeInfo

// GPUExclusiveNodeInfo describes a GPU-exclusive node: the pods on it
// (serialized as "instances") plus the inlined common node fields and common
// GPU node fields.
type GPUExclusiveNodeInfo struct {
	PodInfos          []GPUExclusivePodInfo `json:"instances" yaml:"instances"`
	CommonNodeInfo    `yaml:",inline" json:",inline"`
	CommonGPUNodeInfo `yaml:",inline" json:",inline"`
}

type GPUExclusivePodInfo

// GPUExclusivePodInfo describes one pod listed in GPUExclusiveNodeInfo: name,
// namespace, status and the number of GPUs it requests.
type GPUExclusivePodInfo struct {
	Name       string `json:"name" yaml:"name"`
	Namespace  string `json:"namespace" yaml:"namespace"`
	Status     string `json:"status" yaml:"status"`
	RequestGPU int    `json:"requestGPUs" yaml:"requestGPUs"`
}

type GPUShareNodeDevice

// GPUShareNodeDevice describes one device listed in GPUShareNodeInfo: its id
// and its total and allocated GPU memory. Units are not established by this
// declaration.
type GPUShareNodeDevice struct {
	Id                 string  `json:"id" yaml:"id"`
	TotalGPUMemory     float64 `json:"totalGPUMemory" yaml:"totalGPUMemory"`
	AllocatedGPUMemory float64 `json:"allocatedGPUMemory" yaml:"allocatedGPUMemory"`
}

type GPUShareNodeInfo

// GPUShareNodeInfo describes a GPU-share node: the pods on it (serialized as
// "instances"), the node's total and allocated GPU memory, its devices, plus
// the inlined common GPU node fields and common node fields.
type GPUShareNodeInfo struct {
	PodInfos           []GPUSharePodInfo    `json:"instances" yaml:"instances"`
	TotalGPUMemory     float64              `json:"totalGPUMemory" yaml:"totalGPUMemory"`
	AllocatedGPUMemory float64              `json:"allocatedGPUMemory" yaml:"allocatedGPUMemory"`
	Devices            []GPUShareNodeDevice `json:"devices" yaml:"devices"`
	CommonGPUNodeInfo  `yaml:",inline" json:",inline"`
	CommonNodeInfo     `yaml:",inline" json:",inline"`
}

type GPUSharePodInfo

// GPUSharePodInfo describes one pod listed in GPUShareNodeInfo: name,
// namespace, status, the GPU memory it requests and its allocation.
// NOTE(review): Allocation presumably maps a device id to the amount of GPU
// memory allocated on that device — confirm against the code that fills it.
type GPUSharePodInfo struct {
	Name          string         `json:"name" yaml:"name"`
	Namespace     string         `json:"namespace" yaml:"namespace"`
	Status        string         `json:"status" yaml:"status"`
	RequestMemory int            `json:"requestGPUMemory" yaml:"requestGPUMemory"`
	Allocation    map[string]int `json:"allocation" yaml:"allocation"`
}

type GPUTopology

// GPUTopology holds two two-dimensional matrices: a link matrix of strings
// and a bandwidth matrix of float32 values.
// NOTE(review): presumably both are indexed [gpu][gpu] — confirm against the
// code that fills them.
type GPUTopology struct {
	LinkMatrix      [][]string  `json:"linkMatrix" yaml:"linkMatrix"`
	BandwidthMatrix [][]float32 `json:"bandwidthMatrix" yaml:"bandwidthMatrix"`
}

type GPUTopologyNodeDevice

// GPUTopologyNodeDevice describes one device listed in GPUTopologyNodeInfo:
// its id, a health flag and a status string.
type GPUTopologyNodeDevice struct {
	Id      string `json:"id" yaml:"id"`
	Healthy bool   `json:"healthy" yaml:"healthy"`
	Status  string `json:"status" yaml:"status"`
}

type GPUTopologyNodeInfo

// GPUTopologyNodeInfo describes a GPU-topology node: the pods on it
// (serialized as "instances"), the GPU topology matrices, the node's devices,
// plus the inlined common GPU node fields and common node fields.
type GPUTopologyNodeInfo struct {
	PodInfos          []GPUTopologyPodInfo `json:"instances" yaml:"instances"`
	GPUTopology       GPUTopology          `json:"gpuTopology" yaml:"gpuTopology"`
	CommonGPUNodeInfo `yaml:",inline" json:",inline"`
	CommonNodeInfo    `yaml:",inline" json:",inline"`
	// Devices is serialized as "devices" in both JSON and YAML. The tag used
	// to repeat the yaml key and omit the json key, so JSON output fell back
	// to the Go field name "Devices", unlike GPUShareNodeInfo.Devices.
	Devices           []GPUTopologyNodeDevice `json:"devices" yaml:"devices"`
}

type GPUTopologyPodInfo

// GPUTopologyPodInfo describes one pod listed in GPUTopologyNodeInfo: name,
// namespace, status, the number of GPUs it requests, its allocation and the
// GPUs visible to it.
// NOTE(review): Allocation and VisibleGPUs presumably hold GPU ids — confirm
// against the code that fills them.
type GPUTopologyPodInfo struct {
	Name        string   `json:"name" yaml:"name"`
	Namespace   string   `json:"namespace" yaml:"namespace"`
	Status      string   `json:"status" yaml:"status"`
	RequestGPU  int      `json:"requestGPUs" yaml:"requestGPUs"`
	Allocation  []string `json:"allocation" yaml:"allocation"`
	VisibleGPUs []string `json:"visibleGPUs" yaml:"visibleGPUs"`
}

type GpuMetric

// GpuMetric holds three GPU metric values: duty cycle, used memory and total
// memory. It uses the same serialization keys as the matching fields of
// AdvancedGpuMetric. Units are not established by this declaration.
type GpuMetric struct {
	GpuDutyCycle   float64 `json:"gpuDutyCycle" yaml:"gpuDutyCycle"`
	GpuMemoryUsed  float64 `json:"usedGPUMemory" yaml:"usedGPUMemory"`
	GpuMemoryTotal float64 `json:"totalGPUMemory" yaml:"totalGPUMemory"`
}

type GpuMetricInfo

// GpuMetricInfo holds one metric sample: the metric name, its value as a
// string, a time, and the pod, namespace, container, node and GPU (UUID and
// id) it refers to. The fields carry no serialization tags.
// NOTE(review): Time is presumably a Unix timestamp in seconds — confirm
// against the code that parses the query result.
type GpuMetricInfo struct {
	MetricName    string
	Value         string
	Time          float64
	PodName       string
	PodNamespace  string
	ContainerName string
	NodeName      string
	GPUUID        string
	Id            string
}

type HTTPMatchRequest

// HTTPMatchRequest embeds the Istio v1alpha3 HTTPMatchRequest and declares
// its own Uri field of the local StringMatchPrefix type, serialized under the
// JSON key "uri".
type HTTPMatchRequest struct {
	*istiov1alpha3.HTTPMatchRequest
	Uri *StringMatchPrefix `protobuf:"bytes,1,opt,name=uri" json:"uri,omitempty"`
}

type HTTPRoute

// HTTPRoute embeds the Istio v1alpha3 HTTPRoute and declares its own Match
// and Route fields using the local HTTPMatchRequest and DestinationWeight
// types, serialized under the JSON keys "match" and "route".
type HTTPRoute struct {
	*istiov1alpha3.HTTPRoute
	Match []*HTTPMatchRequest  `protobuf:"bytes,1,rep,name=match" json:"match,omitempty"`
	Route []*DestinationWeight `protobuf:"bytes,2,rep,name=route" json:"route,omitempty"`
}

type JobGpuMetric

// JobGpuMetric maps a string key to a PodGpuMetric.
// NOTE(review): the key is presumably the pod name — confirm against callers.
type JobGpuMetric map[string]PodGpuMetric

type KFServingArgs

// KFServingArgs holds the arguments of a KFServing job: port, model type,
// canary percentage and storage URI, plus the inlined CommonServingArgs.
type KFServingArgs struct {
	Port              int    `yaml:"port"`          // --port
	ModelType         string `yaml:"modelType"`     // --modelType
	CanaryPercent     int    `yaml:"canaryPercent"` // --canaryTrafficPercent
	StorageUri        string `yaml:"storageUri"`    // --storageUri
	CommonServingArgs `yaml:",inline"`
}

type LimitedPodSecurityContext

// LimitedPodSecurityContext holds four pod security context settings:
// RunAsUser, RunAsNonRoot, RunAsGroup and SupplementalGroups. All are plain
// values rather than pointers, so an unset field is indistinguishable from
// its zero value.
type LimitedPodSecurityContext struct {
	RunAsUser          int64   `yaml:"runAsUser"`
	RunAsNonRoot       bool    `yaml:"runAsNonRoot"`
	RunAsGroup         int64   `yaml:"runAsGroup"`
	SupplementalGroups []int64 `yaml:"supplementalGroups"`
}

LimitedPodSecurityContext defines a limited subset of the Kubernetes pod security context.

type LogArgs

// LogArgs groups the options for fetching logs: which namespace, job,
// instance and container to read; follow/since/tail/timestamp settings;
// retry count and timeout; and the io.WriteCloser carried in WriterCloser.
// SinceSeconds, SinceTime and Tail are pointers, so nil can express "not set".
// NOTE(review): WriterCloser is presumably where log output is written —
// confirm against the log reader.
type LogArgs struct {
	Namespace     string
	JobName       string
	InstanceName  string
	ContainerName string
	Follow        bool
	SinceSeconds  *int64
	SinceTime     *metav1.Time
	Tail          *int64
	Timestamps    bool
	RetryCnt      int
	RetryTimeout  time.Duration
	WriterCloser  io.WriteCloser
}

type LogLevel

// LogLevel names a logging level.
type LogLevel string

// Known log levels. Note that LogWarning has the value "warn".
const (
	LogDebug   LogLevel = "debug"
	LogInfo    LogLevel = "info"
	LogWarning LogLevel = "warn"
	LogError   LogLevel = "error"
	LogUnknown LogLevel = "unknown"
)

type NodeGpuMetric

// NodeGpuMetric maps a device id to the metrics of that device.
type NodeGpuMetric map[string]*AdvancedGpuMetric

The key of the map is the device id.

type NodeType

// NodeType names a kind of node.
type NodeType string

// Known node types. AllKnownNode is the empty string.
// NOTE(review): AllKnownNode presumably acts as a "no filter" value — confirm
// against callers.
const (
	GPUShareNode     NodeType = "GPUShare"
	GPUExclusiveNode NodeType = "GPUExclusive"
	GPUTopologyNode  NodeType = "GPUTopology"
	NormalNode       NodeType = "Normal"
	UnknownNode      NodeType = "unknown"
	AllKnownNode     NodeType = ""
)

type NodeTypeInfo

// NodeTypeInfo associates a node type with an alias and a shorthand; it is
// the element type of NodeTypeSlice.
type NodeTypeInfo struct {
	Name      NodeType
	Alias     string
	Shorthand string
}

type NormalNodeInfo

// NormalNodeInfo describes a normal node; it consists solely of the inlined
// CommonNodeInfo fields.
type NormalNodeInfo struct {
	CommonNodeInfo `yaml:",inline" json:",inline"`
}

type PodGpuMetric

// PodGpuMetric maps a string key to a GpuMetric.
// NOTE(review): the key is presumably the GPU device id — confirm against callers.
type PodGpuMetric map[string]*GpuMetric

type PortSelector

// PortSelector embeds the Istio v1alpha3 PortSelector and declares its own
// Number field, serialized under the JSON key "number".
type PortSelector struct {
	*istiov1alpha3.PortSelector
	Number uint32 `protobuf:"varint,1,opt,name=number,proto3,oneof" json:"number,omitempty"`
}

type PreprocesObject

// PreprocesObject bundles a service name and namespace with a
// DestinationRuleCRD and a VirtualServiceCRD.
type PreprocesObject struct {
	ServiceName     string
	Namespace       string
	DestinationRule DestinationRuleCRD
	VirtualService  VirtualServiceCRD
}

type PrometheusMetric

// PrometheusMetric is the top-level shape of a decoded Prometheus query
// response: a status string plus the data payload.
type PrometheusMetric struct {
	// Status is decoded from the "status" key. The tag used to carry an
	// ",inline" option, which encoding/json does not define and ignores.
	Status string               `json:"status"`
	Data   PrometheusMetricData `json:"data,omitempty"`
}

type PrometheusMetricData

// PrometheusMetricData is the "data" part of a PrometheusMetric: the list of
// results and the result type string.
type PrometheusMetricData struct {
	Result     []PrometheusMetricResult `json:"result"`
	ResultType string                   `json:"resultType"`
}

type PrometheusMetricResult

// PrometheusMetricResult is one entry of PrometheusMetricData.Result: the
// metric's label map and a list of untyped values.
// NOTE(review): Value is presumably the [timestamp, value] pair of an instant
// query result — confirm against the code that reads it.
type PrometheusMetricResult struct {
	Metric map[string]string       `json:"metric"`
	Value  []PrometheusMetricValue `json:"value"`
}

type PrometheusMetricValue

// PrometheusMetricValue is an untyped value; it is the element type of
// PrometheusMetricResult.Value.
type PrometheusMetricValue interface{}

type PrometheusServer

// PrometheusServer describes a Prometheus server: a name, the key=value
// service label, protocol, port and query path, the list of metric names,
// and a Kubernetes Service pointer. The entries of SUPPORT_PROMETHEUS_SERVERS
// leave Service unset.
// NOTE(review): Service is presumably filled once a matching service is
// located — confirm against the lookup code.
type PrometheusServer struct {
	Name          string
	ServiceLabels string
	Protocol      string
	Port          string
	Path          string
	MetricList    []string
	Service       *v1.Service
}

PrometheusServer is used to define prometheus server

type Runtime

// Runtime is the base interface of a job runtime; TFRuntime embeds it.
type Runtime interface {
	// GetChartName returns the chart name
	GetChartName() string
	// IsDefault reports whether the runtime is the default one
	IsDefault() bool
}

type ScaleETJobArgs

// ScaleETJobArgs holds the arguments shared by the ET job scale-in and
// scale-out commands (it is embedded inline by ScaleInETJobArgs and
// ScaleOutETJobArgs).
type ScaleETJobArgs struct {
	// Name is the et job name, required, matches option --name
	Name string `yaml:"etName"`
	// JobType stores the training job type
	JobType TrainingJobType `yaml:"-"`
	// Namespace stores the namespace of the job, matches option --namespace
	Namespace string `yaml:"-"`
	// Timeout is the timeout of the callback scaler script, matches option --timeout
	Timeout int `yaml:"timeout"`
	// Retry is the retry times, matches option --retry
	Retry int `yaml:"retry"`
	// Count is the number of workers to add or delete, matches option --count
	Count int `yaml:"count"`
	// Script is the script of scaling, matches option --script
	Script string `yaml:"script"`
	// Envs holds the environment variables, matches option -e, --env
	Envs map[string]string `yaml:"envs"`
}

type ScaleInETJobArgs

// ScaleInETJobArgs holds the arguments for scaling in an ET job; it currently
// adds nothing beyond the inlined ScaleETJobArgs.
type ScaleInETJobArgs struct {
	// common args
	ScaleETJobArgs `yaml:",inline"`
}

type ScaleOutETJobArgs

// ScaleOutETJobArgs holds the arguments for scaling out an ET job; it
// currently adds nothing beyond the inlined ScaleETJobArgs.
type ScaleOutETJobArgs struct {
	// common args
	ScaleETJobArgs `yaml:",inline"`
}

type SeldonServingArgs added in v0.8.0

// SeldonServingArgs holds the arguments of a Seldon serving job: the
// implementation and the model URI, plus the inlined CommonServingArgs.
type SeldonServingArgs struct {
	Implementation    string `yaml:"implementation"` // --implementation
	ModelUri          string `yaml:"modelUri"`       // --modelUri
	CommonServingArgs `yaml:",inline"`
}

type ServingInstance

// ServingInstance describes one instance of a serving job; it is the element
// type of ServingJobInfo.Instances.
type ServingInstance struct {
	// Name gives the instance name
	Name string `json:"name" yaml:"name"`
	// Status gives the instance status
	Status string `json:"status" yaml:"status"`
	// Age gives the instance age
	Age string `json:"age" yaml:"age"`
	// ReadyContainer represents the count of ready containers
	ReadyContainer int `json:"readyContainers" yaml:"readyContainers"`
	// TotalContainer represents the count of total containers
	TotalContainer int `json:"totalContainers" yaml:"totalContainers"`
	// RestartCount represents the count of instance restarts
	RestartCount int `json:"restartCount" yaml:"restartCount"`
	// NodeIP specifies the host ip of the instance
	NodeIP string `json:"nodeIP" yaml:"nodeIP"`
	// NodeName returns the node name
	NodeName string `json:"nodeName" yaml:"nodeName"`
	// IP returns the instance ip
	IP string `json:"ip" yaml:"ip"`
	// RequestGPU returns the request gpus
	RequestGPU int `json:"requestGPUs" yaml:"requestGPUs"`
	// RequestGPUMemory returns the request gpu memory
	RequestGPUMemory int `json:"requestGPUMemory" yaml:"requestGPUMemory"`
}

type ServingJobInfo

// ServingJobInfo is the serializable (JSON and YAML) description of a serving
// job, including its endpoints and instances.
type ServingJobInfo struct {
	// Name specifies the serving job name
	Name string `json:"name" yaml:"name"`
	// Namespace specifies the serving job namespace
	Namespace string `json:"namespace" yaml:"namespace"`
	// Type specifies the serving job type
	Type string `json:"type" yaml:"type"`
	// Version specifies the serving job version
	Version string `json:"version" yaml:"version"`
	// Age specifies the serving job age
	Age string `json:"age" yaml:"age"`
	// Desired specifies the desired instances
	Desired int `json:"desiredInstances" yaml:"desiredInstances"`
	// Available specifies the available instances
	Available int `json:"availableInstances" yaml:"availableInstances"`
	// Endpoints specifies the endpoints
	Endpoints []Endpoint `json:"endpoints" yaml:"endpoints"`
	// IPAddress specifies the ip address
	IPAddress string `json:"ip" yaml:"ip"`
	// Instances gives the information about each instance
	Instances []ServingInstance `json:"instances" yaml:"instances"`
	// RequestGPU specifies the request gpus
	RequestGPU int `json:"requestGPUs" yaml:"requestGPUs"`
	// RequestGPUMemory specifies the request gpu memory, only for gpushare
	RequestGPUMemory int `json:"requestGPUMemory" yaml:"requestGPUMemory"`
	// CreationTimestamp stores the creation timestamp of the job
	CreationTimestamp int64 `json:"creationTimestamp" yaml:"creationTimestamp"`
}

ServingJobInfo displays serving job information.

type ServingJobType

// ServingJobType names a kind of serving job. The concrete values have the
// form shorthand + "-serving" (for example "tf-serving").
type ServingJobType string

ServingJobType defines the serving job type; its name must have the form shorthand + "-serving".

const (
	// TFServingJob defines the tensorflow serving job
	TFServingJob ServingJobType = "tf-serving"
	// TRTServingJob defines the tensorrt serving job
	TRTServingJob ServingJobType = "trt-serving"
	// KFServingJob defines the kfserving job
	KFServingJob ServingJobType = "kf-serving"
	// SeldonServingJob defines the seldon core job
	SeldonServingJob ServingJobType = "seldon-serving"
	// CustomServingJob defines the custom serving job
	CustomServingJob ServingJobType = "custom-serving"
	// AllServingJob represents all serving job types (the empty string)
	AllServingJob ServingJobType = ""
	// UnknownServingJob defines the unknown serving job
	UnknownServingJob ServingJobType = "unknown"
)

type ServingTypeInfo

// ServingTypeInfo associates a serving job type with an alias and a
// shorthand; it is the value type of ServingTypeMap.
type ServingTypeInfo struct {
	Name      ServingJobType
	Alias     string
	Shorthand string
}

type ServingVersionWeight

// ServingVersionWeight pairs a version string with an integer weight.
// NOTE(review): presumably the share of traffic routed to that serving
// version — confirm against the traffic-split code.
type ServingVersionWeight struct {
	Version string
	Weight  int
}

type StringMatchPrefix

// StringMatchPrefix holds a single prefix string, serialized under the JSON
// key "prefix"; it is the type of HTTPMatchRequest.Uri.
type StringMatchPrefix struct {
	Prefix string `protobuf:"bytes,2,opt,name=prefix,proto3,oneof" json:"prefix,omitempty"`
}

type SubmitETJobArgs

// SubmitETJobArgs holds the arguments for submitting an ET (elastic training)
// job: cpu and memory, the worker bounds, and the inlined common,
// tensorboard and sync-code argument groups.
type SubmitETJobArgs struct {
	Cpu    string `yaml:"cpu"`    // --cpu
	Memory string `yaml:"memory"` // --memory
	// CommonSubmitArgs stores the common args
	CommonSubmitArgs `yaml:",inline"`
	// SubmitTensorboardArgs stores tensorboard information
	SubmitTensorboardArgs `yaml:",inline"`
	// SubmitSyncCodeArgs stores syncing code information
	SubmitSyncCodeArgs `yaml:",inline"`
	MaxWorkers         int `yaml:"maxWorkers"`
	MinWorkers         int `yaml:"minWorkers"`
}

type SubmitHorovodJobArgs

// SubmitHorovodJobArgs holds the arguments for submitting a Horovod job: the
// SSH port, cpu and memory, and the inlined common, tensorboard and
// sync-code argument groups.
type SubmitHorovodJobArgs struct {
	SSHPort int    `yaml:"sshPort"`
	Cpu     string `yaml:"cpu"`    // --cpu
	Memory  string `yaml:"memory"` // --memory
	// CommonSubmitArgs stores the common args
	CommonSubmitArgs `yaml:",inline"`

	// SubmitTensorboardArgs stores tensorboard information
	SubmitTensorboardArgs `yaml:",inline"`

	// SubmitSyncCodeArgs stores the settings for syncing up source code
	SubmitSyncCodeArgs `yaml:",inline"`
}

type SubmitMPIJobArgs

// SubmitMPIJobArgs holds the arguments for submitting an MPI job: cpu and
// memory, the inlined common, tensorboard and sync-code argument groups, and
// the GPU topology scheduling settings.
type SubmitMPIJobArgs struct {
	Cpu    string `yaml:"cpu"`    // --cpu
	Memory string `yaml:"memory"` // --memory
	// CommonSubmitArgs stores the common args
	CommonSubmitArgs `yaml:",inline"`

	// SubmitTensorboardArgs stores tensorboard information
	SubmitTensorboardArgs `yaml:",inline"`

	// SubmitSyncCodeArgs stores the settings for syncing up source code
	SubmitSyncCodeArgs `yaml:",inline"`

	// GPUTopology enables gpu topology scheduling
	GPUTopology        bool   `yaml:"gputopology"`
	GPUTopologyReplica string `yaml:"gputopologyreplica"`
}

type SubmitPyTorchJobArgs

// SubmitPyTorchJobArgs holds the arguments for submitting a PyTorch job: cpu
// and memory, the inlined common, tensorboard and sync-code argument groups,
// and the clean pod policy.
type SubmitPyTorchJobArgs struct {
	Cpu    string `yaml:"cpu"`    // --cpu
	Memory string `yaml:"memory"` // --memory
	// CommonSubmitArgs stores the common args
	CommonSubmitArgs `yaml:",inline"`

	// SubmitTensorboardArgs stores tensorboard information
	SubmitTensorboardArgs `yaml:",inline"`

	// SubmitSyncCodeArgs stores the settings for syncing up source code
	SubmitSyncCodeArgs `yaml:",inline"`

	// CleanPodPolicy stores the clean-task-policy
	CleanPodPolicy string `yaml:"cleanPodPolicy"`
}

type SubmitSparkJobArgs

// SubmitSparkJobArgs holds the arguments for submitting a Spark job: image,
// main class and jar, plus the executor and driver settings. Unlike the other
// submit argument types it does not embed CommonSubmitArgs, and its YAML keys
// are capitalized. Name, Namespace and TrainingType are excluded from YAML
// output.
type SubmitSparkJobArgs struct {
	Name         string          `yaml:"-"`
	Namespace    string          `yaml:"-"`
	TrainingType TrainingJobType `yaml:"-"`
	Image        string          `yaml:"Image"`
	MainClass    string          `yaml:"MainClass"`
	Jar          string          `yaml:"Jar"`
	Executor     *Executor       `yaml:"Executor"`
	Driver       *Driver         `yaml:"Driver"`
}

type SubmitSyncCodeArgs

// SubmitSyncCodeArgs holds the settings for syncing source code into a job:
// the sync mode, the source, the sync image and the git project name. It is
// embedded inline by the training submit argument types.
type SubmitSyncCodeArgs struct {
	SyncMode   string `yaml:"syncMode"`            // --syncMode: rsync, hdfs, git
	SyncSource string `yaml:"syncSource"`          // --syncSource
	SyncImage  string `yaml:"syncImage,omitempty"` // --syncImage
	// syncGitProjectName
	SyncGitProjectName string `yaml:"syncGitProjectName,omitempty"` // NOTE(review): previously annotated "--syncImage", same as the field above; the real option is not visible here
}

type SubmitTFJobArgs

// SubmitTFJobArgs holds the arguments for submitting a TensorFlow job:
// per-role settings for workers, parameter servers (PS), chief and evaluator,
// the inlined common, tensorboard and sync-code argument groups, and the
// embedded TFRuntime (excluded from YAML output).
type SubmitTFJobArgs struct {
	// TFNodeSelectors assigns tfjob node selectors
	TFNodeSelectors map[string]map[string]string `yaml:"tfNodeSelectors"`
	// Port defines the default port if workerPort and PSPort are not set.
	// NOTE(review): unlike its neighbours this field carries no yaml tag.
	Port int
	// WorkerImage assigns the worker image, matches option --worker-image
	WorkerImage string `yaml:"workerImage"`
	// WorkerPort stores the worker port, matches option --work-port
	WorkerPort int `yaml:"workerPort"`
	// PSPort stores the ps port, matches option --ps-port
	PSPort int `yaml:"psPort"`
	// PSCount stores the ps count, matches option --ps-count
	PSCount int `yaml:"ps"`
	// PSImage stores the ps image, matches option --ps-image
	PSImage string `yaml:"psImage"`
	// WorkerCpu stores the cpu of the job worker, matches option --worker-cpu
	WorkerCpu string `yaml:"workerCPU"`
	//WorkerNodeSelectors map[string]string `yaml:"workerNodeSelectors"` // --worker-selector
	// WorkerMemory stores the worker memory, matches option --worker-memory
	WorkerMemory string `yaml:"workerMemory"`
	// PSCpu stores the ps cpu, matches option --ps-cpu
	PSCpu string `yaml:"psCPU"`
	// PSGpu stores the ps gpu, matches option --ps-gpus
	PSGpu int `yaml:"psGPU"` // --ps-gpus
	// PSMemory stores the ps memory, matches option --ps-memory
	PSMemory string `yaml:"psMemory"`
	// CleanPodPolicy stores the cleaning pod policy, matches option --clean-task-policy
	CleanPodPolicy string `yaml:"cleanPodPolicy"`
	// UseChief stores whether a chief is used, matches option --chief
	UseChief bool `yaml:",omitempty"` // --chief
	// ChiefCount stores the chief count of the job, matches option --chief-count
	ChiefCount int `yaml:"chief"`
	// UseEvaluator is used to enable the evaluator or not, matches option --evaluator
	UseEvaluator bool `yaml:",omitempty"`
	// ChiefPort stores the chief port, matches option --chief-port
	ChiefPort int `yaml:"chiefPort"`
	//ChiefNodeSelectors map[string]string `yaml:"chiefNodeSelectors"` // --chief-selector
	// ChiefCpu stores the chief pod cpu, matches option --chief-cpu
	ChiefCpu string `yaml:"chiefCPU"`
	// ChiefMemory stores the chief pod memory, matches option --chief-memory
	ChiefMemory string `yaml:"chiefMemory"`
	// EvaluatorCpu stores the evaluator pod cpu, matches option --evaluator-cpu
	EvaluatorCpu string `yaml:"evaluatorCPU"`
	//EvaluatorNodeSelectors map[string]string `yaml:"evaluatorNodeSelectors"` // --evaluator-selector
	// EvaluatorMemory stores the evaluator pod memory, matches option --evaluator-memory
	EvaluatorMemory string `yaml:"evaluatorMemory"` // --evaluatorMemory
	// EvaluatorCount stores the evaluator pod count, matches option --evaluator-count
	EvaluatorCount int `yaml:"evaluator"`
	// HasGangScheduler determines if it has a gang scheduler
	HasGangScheduler bool `yaml:"hasGangScheduler"`
	// CommonSubmitArgs stores the common args
	CommonSubmitArgs `yaml:",inline"`

	// SubmitTensorboardArgs stores tensorboard information
	SubmitTensorboardArgs `yaml:",inline"`

	// SubmitSyncCodeArgs stores syncing code information
	SubmitSyncCodeArgs `yaml:",inline"`

	// TFRuntime stores the runtime
	TFRuntime `yaml:"-"`
}

type SubmitTensorboardArgs

// SubmitTensorboardArgs groups the tensorboard-related settings of a job
// submission. Each field is serialized to YAML under the key given in its
// struct tag.
type SubmitTensorboardArgs struct {
	// UseTensorboard matches option --tensorboard.
	UseTensorboard   bool   `yaml:"useTensorboard"`   // --tensorboard
	// TensorboardImage matches option --tensorboardImage.
	TensorboardImage string `yaml:"tensorboardImage"` // --tensorboardImage
	// TrainingLogdir matches option --logdir.
	TrainingLogdir   string `yaml:"trainingLogdir"`   // --logdir
	// HostLogPath has no command-line option listed here.
	// NOTE(review): presumably a log path on the host node — confirm.
	HostLogPath      string `yaml:"hostLogPath"`
	// IsLocalLogging has no command-line option listed here.
	// NOTE(review): presumably set when logs are kept on the host — confirm.
	IsLocalLogging   bool   `yaml:"isLocalLogging"`
}

SubmitTensorboardArgs is used to store tensorboard information

type SubmitVolcanoJobArgs

// SubmitVolcanoJobArgs holds the arguments used to submit a volcano job.
// Name, Namespace, TrainingType and Command carry no yaml tag; the remaining
// fields are serialized under the key given in their struct tag.
type SubmitVolcanoJobArgs struct {
	// Name stores the job name.
	Name string
	// Namespace stores the namespace of the job.
	Namespace string
	// TrainingType stores the job type.
	TrainingType TrainingJobType
	// Command defines the job command.
	Command string
	// MinAvailable is the number of available pods required to run this Job.
	MinAvailable int `yaml:"minAvailable"`
	// Queue specifies the queue that will be used by the scheduler; the
	// "default" queue is used when this is left empty.
	Queue string `yaml:"queue"`
	// SchedulerName is the default value of `tasks.template.spec.schedulerName`.
	SchedulerName string `yaml:"schedulerName"`
	// TaskName specifies the name of the task.
	TaskName string `yaml:"taskName"`
	// TaskImages specifies the task images.
	TaskImages []string `yaml:"taskImages"`
	// TaskReplicas specifies the number of replicas of this Task in the Job.
	TaskReplicas int `yaml:"taskReplicas"`
	// TaskCPU specifies the cpu resource required for each replica of the
	// Task in the Job. The documented default is 250m (not set in this struct).
	TaskCPU string `yaml:"taskCPU"`
	// TaskMemory specifies the memory resource required for each replica of
	// the Task in the Job. The documented default is 128Mi (not set in this struct).
	TaskMemory string `yaml:"taskMemory"`
	// TaskPort specifies the task port.
	TaskPort int `yaml:"taskPort"`
}

type TFRuntime

// TFRuntime is the customized runtime interface for tf training jobs. It
// extends the embedded Runtime interface (declared elsewhere in this package)
// with two hooks that both receive the tfjob submit arguments.
type TFRuntime interface {
	// Check checks the tfjob args and returns a non-nil error to report a problem.
	Check(tf *SubmitTFJobArgs) (err error)
	// Transform transforms the tfjob args; it receives a pointer, so it may
	// modify them in place. A non-nil error reports a problem.
	Transform(tf *SubmitTFJobArgs) (err error)
	// Runtime contributes the methods common to all runtimes.
	Runtime
}

TFRuntime is the customized runtime for tf training

type TensorFlowServingArgs

// TensorFlowServingArgs holds the arguments specific to a TensorFlow serving
// job, plus the shared CommonServingArgs, which are inlined into the same
// YAML mapping.
type TensorFlowServingArgs struct {
	// VersionPolicy matches option --versionPolicy.
	VersionPolicy          string `yaml:"versionPolicy"`   // --versionPolicy
	// ModelConfigFile matches option --modelConfigFile.
	ModelConfigFile        string `yaml:"modelConfigFile"` // --modelConfigFile
	// ModelConfigFileContent has no command-line option listed here.
	// NOTE(review): presumably the content read from ModelConfigFile — confirm.
	ModelConfigFileContent string `yaml:"modelConfigFileContent"`
	// ModelName matches option --modelName.
	ModelName              string `yaml:"modelName"`   // --modelName
	// ModelPath matches option --modelPath.
	ModelPath              string `yaml:"modelPath"`   // --modelPath
	// Port matches option --port.
	Port                   int    `yaml:"port"`        // --port
	// RestfulPort matches option --restfulPort; note that its YAML key is
	// "restApiPort", not "restfulPort".
	RestfulPort            int    `yaml:"restApiPort"` // --restfulPort
	// CommonServingArgs carries the arguments shared by all serving jobs.
	CommonServingArgs      `yaml:",inline"`
}

type TensorRTServingArgs

// TensorRTServingArgs holds the arguments specific to a TensorRT serving job,
// plus the shared CommonServingArgs, which are inlined into the same YAML
// mapping.
type TensorRTServingArgs struct {
	// ModelStore matches option --modelStore.
	ModelStore        string `yaml:"modelStore"`   // --modelStore
	// MetricsPort matches option --metricsPort.
	MetricsPort       int    `yaml:"metricsPort"`  // --metricsPort
	// HttpPort matches option --httpPort.
	HttpPort          int    `yaml:"httpPort"`     // --httpPort
	// GrpcPort matches option --grpcPort.
	GrpcPort          int    `yaml:"grpcPort"`     // --grpcPort
	// AllowMetrics matches option --allowMetrics.
	AllowMetrics      bool   `yaml:"allowMetrics"` // --allowMetrics
	// CommonServingArgs carries the arguments shared by all serving jobs.
	CommonServingArgs `yaml:",inline"`
}

type TrafficRouterSplitArgs

// TrafficRouterSplitArgs holds the arguments of a traffic-router split
// request. The four string fields are omitted from YAML output when empty.
type TrafficRouterSplitArgs struct {
	// ServingName matches option --name.
	ServingName    string `yaml:"servingName,omitempty"` //--name
	// Namespace matches option --namespace.
	Namespace      string `yaml:"namespace,omitempty"`   //--namespace
	// Versions matches option --versions; it is kept as a single raw string.
	Versions       string `yaml:"versions,omitempty"`    //--versions
	// Weights matches option --weights; it is kept as a single raw string.
	Weights        string `yaml:"weights,omitempty"`     //--weights
	// VersionWeights carries no yaml tag and no command-line option.
	// NOTE(review): presumably built by pairing Versions with Weights — confirm.
	VersionWeights []ServingVersionWeight
}

type TrainingJobInfo

// TrainingJobInfo stores training job information. Every field is exposed
// under the same key in both JSON and YAML.
type TrainingJobInfo struct {
	// Name is the name of the training job.
	Name string `json:"name" yaml:"name"`
	// Namespace is the namespace of the training job.
	Namespace string `json:"namespace" yaml:"namespace"`
	// Duration is the time of the training job, kept as a string.
	Duration string `json:"duration" yaml:"duration"`
	// Status is the status of the training job.
	Status TrainingJobStatus `json:"status" yaml:"status"`

	// Trainer is the training type of the training job.
	Trainer TrainingJobType `json:"trainer" yaml:"trainer"`
	// Tensorboard is the tensorboard of the training job.
	// NOTE(review): presumably its access address — confirm.
	Tensorboard string `json:"tensorboard" yaml:"tensorboard"`

	// ChiefName is the name of the chief instance.
	ChiefName string `json:"chiefName" yaml:"chiefName"`

	// Instances lists the instances under the training job.
	Instances []TrainingJobInstance `json:"instances" yaml:"instances"`

	// Priority is the priority of the training job.
	Priority string `json:"priority" yaml:"priority"`

	// RequestGPU stores the number of requested gpus.
	RequestGPU int64 `json:"requestGPUs" yaml:"requestGPUs"`

	// AllocatedGPU stores the number of allocated gpus.
	AllocatedGPU int64 `json:"allocatedGPUs" yaml:"allocatedGPUs"`

	// CreationTimestamp stores the creation timestamp of the job.
	// NOTE(review): the unit (seconds vs. milliseconds) is not stated here — confirm.
	CreationTimestamp int64 `json:"creationTimestamp" yaml:"creationTimestamp"`
}

TrainingJobInfo stores training job information

type TrainingJobInstance

// TrainingJobInstance defines one instance of a training job.
// Status, Name, Age and Node carry only a json tag; the other fields carry
// matching json and yaml tags.
type TrainingJobInstance struct {
	// IP defines the instance ip.
	IP string `json:"ip" yaml:"ip"`
	// Status is the status of the instance.
	Status string `json:"status"`
	// Name is the name of the instance.
	Name string `json:"name"`
	// Age is the age of the instance, kept as a string.
	Age string `json:"age"`
	// Node is the node the instance runs on.
	Node string `json:"node"`
	// NodeIP stores the ip of that node.
	NodeIP string `json:"nodeIP" yaml:"nodeIP"`
	// IsChief reports whether the instance is the chief; it is exposed under
	// the key "chief".
	IsChief bool `json:"chief" yaml:"chief"`
	// RequestGPUs stores the requested gpu count.
	RequestGPUs int `json:"requestGPUs" yaml:"requestGPUs"`
	// GPUMetrics stores the gpu metrics, keyed by string.
	// NOTE(review): the key is presumably a gpu identifier — confirm.
	GPUMetrics map[string]GpuMetric `json:"gpuMetrics" yaml:"gpuMetrics"`
}

TrainingJobInstance defines the instance of training job

type TrainingJobStatus

// TrainingJobStatus is the string-typed status of a training job; the
// package declares the values PENDING, RUNNING, SUCCEEDED and FAILED.
type TrainingJobStatus string

TrainingJobStatus defines all the kinds of JobStatus

// Values of TrainingJobStatus.
const (
	// TrainingJobPending means the job is pending.
	TrainingJobPending TrainingJobStatus = "PENDING"
	// TrainingJobRunning means the job is running.
	TrainingJobRunning TrainingJobStatus = "RUNNING"
	// TrainingJobSucceeded means the job has succeeded.
	TrainingJobSucceeded TrainingJobStatus = "SUCCEEDED"
	// TrainingJobFailed means the job has failed.
	TrainingJobFailed TrainingJobStatus = "FAILED"
)

type TrainingJobType

// TrainingJobType is the string-typed identifier of a supported training
// job type (for example "tfjob" or "mpijob").
type TrainingJobType string

TrainingJobType defines the supported training job types

// Values of TrainingJobType.
const (
	// TFTrainingJob defines the tfjob.
	TFTrainingJob TrainingJobType = "tfjob"
	// MPITrainingJob defines the mpijob.
	MPITrainingJob TrainingJobType = "mpijob"
	// PytorchTrainingJob defines the pytorchjob.
	PytorchTrainingJob TrainingJobType = "pytorchjob"
	// HorovodTrainingJob defines the horovod job.
	HorovodTrainingJob TrainingJobType = "horovodjob"
	// VolcanoTrainingJob defines the volcano job.
	VolcanoTrainingJob TrainingJobType = "volcanojob"
	// ETTrainingJob defines the etjob.
	ETTrainingJob TrainingJobType = "etjob"
	// SparkTrainingJob defines the spark job.
	SparkTrainingJob TrainingJobType = "sparkjob"
	// AllTrainingJob represents all job types; its value is the empty string.
	AllTrainingJob TrainingJobType = ""
	// UnknownTrainingJob defines the unknown training job type.
	UnknownTrainingJob TrainingJobType = "unknown"
)

type TrainingJobTypeInfo

// TrainingJobTypeInfo bundles a training job type with two alternative
// string labels for it.
type TrainingJobTypeInfo struct {
	// Name is the training job type itself.
	Name      TrainingJobType
	// Alias is an alternative label for the type.
	// NOTE(review): presumably a human-readable name — confirm.
	Alias     string
	// Shorthand is a second label for the type.
	// NOTE(review): presumably an abbreviated form — confirm.
	Shorthand string
}

type VirtualService

// VirtualService wraps the istio v1alpha3 VirtualService by embedding a
// pointer to it and declaring its own Http field.
type VirtualService struct {
	// Embedded istio type; its fields and methods are promoted onto this struct.
	*istiov1alpha3.VirtualService
	// Http holds this package's HTTPRoute values and is serialized as "http"
	// (omitted when empty).
	// NOTE(review): presumably shadows a same-named field promoted from the
	// embedded istio type — confirm.
	Http []*HTTPRoute `protobuf:"bytes,3,rep,name=http" json:"http,omitempty"`
}

type VirtualServiceCRD

// VirtualServiceCRD is the object form of a VirtualService: kind and
// apiVersion, the embedded object metadata, and the VirtualService spec.
//
// NOTE(review): the protobuf tags reuse field numbers — Kind and ObjectMeta
// both use 1, APIVersion and Spec both use 2. Confirm whether these tags are
// ever used for protobuf encoding.
type VirtualServiceCRD struct {
	// Kind is a string value representing the REST resource this object represents.
	// Servers may infer this from the endpoint the client submits requests to.
	// Cannot be updated.
	// In CamelCase.
	// More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds
	// +optional
	Kind string `json:"kind,omitempty" protobuf:"bytes,1,opt,name=kind"`

	// APIVersion defines the versioned schema of this representation of an object.
	// Servers should convert recognized schemas to the latest internal value, and
	// may reject unrecognized values.
	// More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources
	// +optional
	APIVersion        string `json:"apiVersion,omitempty" protobuf:"bytes,2,opt,name=apiVersion"`
	// ObjectMeta is the embedded object metadata, serialized under "metadata".
	metav1.ObjectMeta `json:"metadata,omitempty" yaml:"metadata,omitempty" protobuf:"bytes,1,opt,name=metadata"`
	// Spec is the VirtualService payload, serialized under "spec".
	Spec              VirtualService `json:"spec,omitempty" yaml:"spec,omitempty" protobuf:"bytes,2,opt,name=spec"`
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL