Documentation ¶
Index ¶
- func AlertCount(t *testing.T, client monitoring.AlertClient, expectedCount int, ...)
- func AlertDuration(t *testing.T, client monitoring.AlertClient, name string, ...)
- func AlertFiring(t *testing.T, client monitoring.AlertClient, name string, ...)
- func AlertNotExists(t *testing.T, client monitoring.AlertClient, name string, ...)
- func AlertResolved(t *testing.T, client monitoring.AlertClient, name string, ...)
- func AlertSeverity(t *testing.T, client monitoring.AlertClient, name, expectedSeverity string, ...)
- func AllNodesSynced(t *testing.T, nodes []*k8s.Node, expectedStep int)
- func EventOccurred(t *testing.T, k8sClient k8s.K8sClient, objectRef *k8s.ObjectReference, ...)
- func GPUAllocated(t *testing.T, gpuManager resource.GPUManager, gpuID, jobID string)
- func GPUAvailable(t *testing.T, gpuManager resource.GPUManager, nodeName string, ...)
- func GPUReleased(t *testing.T, gpuManager resource.GPUManager, gpuID string)
- func JobCompleted(t *testing.T, job *k8s.PyTorchJob)
- func JobCreated(t *testing.T, k8sClient k8s.K8sClient, jobName string)
- func JobDeleted(t *testing.T, k8sClient k8s.K8sClient, jobName string)
- func JobDuration(t *testing.T, job *k8s.PyTorchJob, minDuration, maxDuration time.Duration)
- func JobFailed(t *testing.T, job *k8s.PyTorchJob, expectedReason string)
- func JobRestarted(t *testing.T, job *k8s.PyTorchJob)
- func JobStatusTransition(t *testing.T, job *k8s.PyTorchJob, expectedPhases []k8s.JobPhase)
- func LogContainsMetric(t *testing.T, logContent, metricName string, expectedValue, tolerance float64)
- func MemoryUsage(t *testing.T, gpu *resource.GPUInfo, maxUsagePercent float64)
- func MetricCount(t *testing.T, client monitoring.MetricClient, name string, expectedCount int, ...)
- func MetricExists(t *testing.T, client monitoring.MetricClient, name string, ...)
- func MetricNotExists(t *testing.T, client monitoring.MetricClient, name string, ...)
- func MetricRecent(t *testing.T, client monitoring.MetricClient, name string, ...)
- func MetricRecorded(t *testing.T, client monitoring.MetricClient, name string, ...)
- func MetricValueInRange(t *testing.T, client monitoring.MetricClient, name string, ...)
- func ModelFileValid(t *testing.T, storageClient storage.StorageClient, bucket, key string, ...)
- func NodeReady(t *testing.T, k8sClient k8s.K8sClient, nodeName string)
- func PodScheduled(t *testing.T, k8sClient k8s.K8sClient, podName, expectedNode string)
- func ResourceUtilization(t *testing.T, gpu *resource.GPUInfo, minUtilization, maxUtilization int)
- func ResourcesAllocated(t *testing.T, job *k8s.PyTorchJob, ...)
- func TemperatureSafe(t *testing.T, gpu *resource.GPUInfo, maxTemp int)
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func AlertCount ¶
func AlertCount(t *testing.T, client monitoring.AlertClient, expectedCount int, labels map[string]string)
AlertCount checks if the number of alerts matches the expected count
func AlertDuration ¶
func AlertDuration(t *testing.T, client monitoring.AlertClient, name string, minDuration, maxDuration time.Duration, labels map[string]string)
AlertDuration checks if an alert fired within the expected duration
func AlertFiring ¶
func AlertFiring(t *testing.T, client monitoring.AlertClient, name string, labels map[string]string)
AlertFiring checks if an alert is in the firing state
func AlertNotExists ¶
func AlertNotExists(t *testing.T, client monitoring.AlertClient, name string, labels map[string]string)
AlertNotExists checks if an alert does not exist
func AlertResolved ¶
func AlertResolved(t *testing.T, client monitoring.AlertClient, name string, labels map[string]string)
AlertResolved checks if an alert has been resolved
func AlertSeverity ¶
func AlertSeverity(t *testing.T, client monitoring.AlertClient, name, expectedSeverity string, labels map[string]string)
AlertSeverity checks if an alert has the expected severity
func AllNodesSynced ¶
func AllNodesSynced(t *testing.T, nodes []*k8s.Node, expectedStep int)
AllNodesSynced checks if all nodes in a distributed training setup are synchronized
func EventOccurred ¶
func EventOccurred(t *testing.T, k8sClient k8s.K8sClient, objectRef *k8s.ObjectReference, eventType, reason string)
EventOccurred checks if a specific event occurred
func GPUAllocated ¶
func GPUAllocated(t *testing.T, gpuManager resource.GPUManager, gpuID, jobID string)
GPUAllocated checks if a GPU is allocated to a specific job
func GPUAvailable ¶
GPUAvailable checks if the number of available GPUs on a node matches the expected count
func GPUReleased ¶
func GPUReleased(t *testing.T, gpuManager resource.GPUManager, gpuID string)
GPUReleased checks if a GPU has been released (is unallocated)
func JobCompleted ¶
func JobCompleted(t *testing.T, job *k8s.PyTorchJob)
JobCompleted checks if a job completed successfully
func JobCreated ¶
func JobCreated(t *testing.T, k8sClient k8s.K8sClient, jobName string)
JobCreated checks if a job was created successfully
func JobDeleted ¶
func JobDeleted(t *testing.T, k8sClient k8s.K8sClient, jobName string)
JobDeleted checks if a job was deleted successfully
func JobDuration ¶
func JobDuration(t *testing.T, job *k8s.PyTorchJob, minDuration, maxDuration time.Duration)
JobDuration checks if job duration is within expected range
func JobFailed ¶
func JobFailed(t *testing.T, job *k8s.PyTorchJob, expectedReason string)
JobFailed checks if a job failed as expected
func JobRestarted ¶
func JobRestarted(t *testing.T, job *k8s.PyTorchJob)
JobRestarted checks if a job was restarted
func JobStatusTransition ¶
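func JobStatusTransition(t *testing.T, job *k8s.PyTorchJob, expectedPhases []k8s.JobPhase)
JobStatusTransition checks if the job's status transitions match the expected phases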
func LogContainsMetric ¶
func LogContainsMetric(t *testing.T, logContent, metricName string, expectedValue, tolerance float64)
LogContainsMetric checks if log content contains a specific metric with expected value
func MemoryUsage ¶
func MemoryUsage(t *testing.T, gpu *resource.GPUInfo, maxUsagePercent float64)
MemoryUsage checks if GPU memory usage is within expected range
func MetricCount ¶
func MetricCount(t *testing.T, client monitoring.MetricClient, name string, expectedCount int, labels map[string]string)
MetricCount checks if the number of metrics matches the expected count
func MetricExists ¶
func MetricExists(t *testing.T, client monitoring.MetricClient, name string, labels map[string]string)
MetricExists checks if a metric exists
func MetricNotExists ¶
func MetricNotExists(t *testing.T, client monitoring.MetricClient, name string, labels map[string]string)
MetricNotExists checks if a metric does not exist
func MetricRecent ¶
func MetricRecent(t *testing.T, client monitoring.MetricClient, name string, within time.Duration, labels map[string]string)
MetricRecent checks if a metric was recorded recently (within the specified time window)
func MetricRecorded ¶
func MetricRecorded(t *testing.T, client monitoring.MetricClient, name string, expectedValue float64, labels map[string]string)
MetricRecorded checks if a metric was recorded with the expected value
func MetricValueInRange ¶
func MetricValueInRange(t *testing.T, client monitoring.MetricClient, name string, minValue, maxValue float64, labels map[string]string)
MetricValueInRange checks if a metric's value is within the expected range
func ModelFileValid ¶
func ModelFileValid(t *testing.T, storageClient storage.StorageClient, bucket, key string, expectedSize int64, expectedHash string)
ModelFileValid checks if a model file is valid (size and hash)
func PodScheduled ¶
func PodScheduled(t *testing.T, k8sClient k8s.K8sClient, podName, expectedNode string)
PodScheduled checks if a pod was scheduled on the expected node
func ResourceUtilization ¶
func ResourceUtilization(t *testing.T, gpu *resource.GPUInfo, minUtilization, maxUtilization int)
ResourceUtilization checks if GPU utilization is within expected range
func ResourcesAllocated ¶
func ResourcesAllocated(t *testing.T, job *k8s.PyTorchJob, expectedCPU, expectedMemory, expectedGPU string)
ResourcesAllocated checks if resources are allocated as expected
Types ¶
This section is empty.