Documentation
¶
Overview ¶
Package operator provides a Kubernetes operator for managing ZerfooInferenceService custom resources. It reconciles desired inference service state into Kubernetes Deployments, Services, and HorizontalPodAutoscalers via a pluggable KubeClient interface.
Index ¶
Constants ¶
This section is empty.
Variables ¶
var (
ErrInvalidSpec = errors.New("operator: invalid spec")
ErrNotFound = errors.New("operator: resource not found")
ErrAlreadyExists = errors.New("operator: resource already exists")
)
Standard errors returned by validation and reconciliation.
Functions ¶
This section is empty.
Types ¶
type CanarySpec ¶
type CanarySpec struct {
// ModelRef is the canary model reference.
ModelRef string `json:"modelRef"`
// Weight is the percentage of traffic routed to the canary (0-100).
Weight int `json:"weight"`
}
CanarySpec configures a canary deployment alongside the primary.
type Deployment ¶
type Deployment struct {
Name string `json:"name"`
Namespace string `json:"namespace"`
Replicas int `json:"replicas"`
ModelRef string `json:"modelRef"`
Resources ResourceSpec `json:"resources"`
Health HealthCheckSpec `json:"healthCheck"`
}
Deployment represents a Kubernetes Deployment managed by the operator.
type HPA ¶
type HPA struct {
Name string `json:"name"`
Namespace string `json:"namespace"`
TargetRef string `json:"targetRef"`
MinReplicas int `json:"minReplicas"`
MaxReplicas int `json:"maxReplicas"`
}
HPA represents a Kubernetes HorizontalPodAutoscaler.
type HealthCheckSpec ¶
type HealthCheckSpec struct {
// Path is the HTTP health check endpoint (e.g. "/healthz").
Path string `json:"path"`
// Interval is the time between consecutive health checks.
Interval time.Duration `json:"interval"`
// Timeout is the time limit for a single health check.
Timeout time.Duration `json:"timeout"`
}
HealthCheckSpec configures health check probes.
type KubeClient ¶
type KubeClient interface {
GetDeployment(ctx context.Context, namespace, name string) (*Deployment, error)
CreateDeployment(ctx context.Context, d *Deployment) error
UpdateDeployment(ctx context.Context, d *Deployment) error
DeleteDeployment(ctx context.Context, namespace, name string) error
GetService(ctx context.Context, namespace, name string) (*Service, error)
CreateService(ctx context.Context, s *Service) error
UpdateService(ctx context.Context, s *Service) error
GetHPA(ctx context.Context, namespace, name string) (*HPA, error)
CreateHPA(ctx context.Context, h *HPA) error
UpdateHPA(ctx context.Context, h *HPA) error
}
KubeClient abstracts Kubernetes API operations needed by the reconciler. Implementations may wrap a real Kubernetes client or a mock for testing.
type Reconciler ¶
type Reconciler struct {
// contains filtered or unexported fields
}
Reconciler compares the desired ZerfooInferenceService spec against the current cluster state and applies the necessary changes.
func NewReconciler ¶
func NewReconciler(client KubeClient) *Reconciler
NewReconciler creates a Reconciler backed by the given KubeClient.
func (*Reconciler) Delete ¶
func (r *Reconciler) Delete(ctx context.Context, svc *ZerfooInferenceService) error
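Delete removes all resources associated with the given service. NOTE(review): KubeClient declares a delete method only for Deployments (DeleteDeployment); confirm how the associated Service and HPA are cleaned up.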
func (*Reconciler) Reconcile ¶
func (r *Reconciler) Reconcile(ctx context.Context, svc *ZerfooInferenceService) error
Reconcile drives the cluster toward the desired state described by svc. It creates, updates, or deletes Deployments, Services, and HPAs as needed.
type ResourceSpec ¶
type ResourceSpec struct {
CPU string `json:"cpu"` // e.g. "4"
Memory string `json:"memory"` // e.g. "16Gi"
GPUMemory string `json:"gpuMemory"` // e.g. "24Gi"
}
ResourceSpec declares CPU, memory, and GPU memory resource limits.
type Service ¶
type Service struct {
Name string `json:"name"`
Namespace string `json:"namespace"`
Selector map[string]string `json:"selector"`
Weights []WeightedTarget `json:"weights,omitempty"`
}
Service represents a Kubernetes Service managed by the operator.
type WeightedTarget ¶
type WeightedTarget struct {
DeploymentName string `json:"deploymentName"`
Weight int `json:"weight"`
}
WeightedTarget maps a deployment name to a traffic weight for canary routing.
type ZerfooInferenceService ¶
type ZerfooInferenceService struct {
Name string `json:"name"`
Namespace string `json:"namespace"`
Spec ZerfooInferenceServiceSpec `json:"spec"`
Status ZerfooInferenceServiceStatus `json:"status"`
}
ZerfooInferenceService is the top-level custom resource that declares a desired inference service deployment.
type ZerfooInferenceServiceSpec ¶
type ZerfooInferenceServiceSpec struct {
// ModelRef is the model repository reference (e.g. "llama3-8b-q4").
ModelRef string `json:"modelRef"`
// Replicas is the desired number of inference pods.
Replicas int `json:"replicas"`
// MinReplicas is the minimum replica count for autoscaling (0 means no autoscaling).
MinReplicas int `json:"minReplicas,omitempty"`
// MaxReplicas is the maximum replica count for autoscaling.
MaxReplicas int `json:"maxReplicas,omitempty"`
// Resources specifies compute resource limits.
Resources ResourceSpec `json:"resources"`
// Canary optionally configures a canary deployment with traffic splitting.
Canary *CanarySpec `json:"canary,omitempty"`
// HealthCheck configures liveness/readiness probes.
HealthCheck HealthCheckSpec `json:"healthCheck"`
}
ZerfooInferenceServiceSpec describes the desired state of an inference service.
func (*ZerfooInferenceServiceSpec) Validate ¶
func (s *ZerfooInferenceServiceSpec) Validate() error
Validate checks the spec for required fields and constraints.
type ZerfooInferenceServiceStatus ¶
type ZerfooInferenceServiceStatus struct {
// Ready indicates whether the service is fully available.
Ready bool `json:"ready"`
// Replicas is the current number of running replicas.
Replicas int `json:"replicas"`
// Message provides a human-readable status message.
Message string `json:"message,omitempty"`
}
ZerfooInferenceServiceStatus represents the observed state of an inference service.