Documentation
¶
Index ¶
- Constants
- type AssumeRoleConfig
- type AutoscalerConfig
- type Config
- type DebugConfig
- type LeaderElectionConfig
- type ModelConfig
- type ModelConfigItem
- type ObjectStoreConfig
- type OllamaConfig
- type PersistentVolume
- type ProcessedModelConfig
- type Resources
- type RuntimeConfig
- type S3Config
- type ScalingConfig
- type TolerationConfig
- type WorkerConfig
- type WorkerTLSConfig
Constants ¶
const ( // RuntimeNameOllama is the Ollama runtime name. RuntimeNameOllama string = "ollama" // RuntimeNameVLLM is the VLLM runtime name. RuntimeNameVLLM string = "vllm" )
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type AssumeRoleConfig ¶ added in v0.350.0
type AssumeRoleConfig struct {
RoleARN string `yaml:"roleArn"`
ExternalID string `yaml:"externalId"`
}
AssumeRoleConfig is the assume role configuration.
type AutoscalerConfig ¶ added in v0.280.0
type AutoscalerConfig struct {
Enable bool `yaml:"enable"`
// InitialDelay is the initial delay before starting the autoscaler.
InitialDelay time.Duration `yaml:"initialDelay"`
// SyncPeriod is the period for calculating the scaling.
SyncPeriod time.Duration `yaml:"syncPeriod"`
// ScaleToZeroGracePeriod is the grace period before scaling to zero.
ScaleToZeroGracePeriod time.Duration `yaml:"scaleToZeroGracePeriod"`
// MetricsWindow is the window size for metrics.
// e.g., if it's 5 minutes, we'll use the 5-minute average as the metric.
MetricsWindow time.Duration `yaml:"metricsWindow"`
RuntimeScalers map[string]ScalingConfig `yaml:"runtimeScalers"`
DefaultScaler ScalingConfig `yaml:"defaultScaler"`
}
AutoscalerConfig is the autoscaler configuration.
type Config ¶
type Config struct {
Runtime RuntimeConfig `yaml:"runtime"`
Ollama OllamaConfig `yaml:"ollama"`
Model ModelConfig `yaml:"model"`
HealthPort int `yaml:"healthPort"`
// GracefulShutdownTimeout is the duration given to runnable to stop
// before the manager actually returns on stop. Default is 30 seconds.
GracefulShutdownTimeout time.Duration `yaml:"gracefulShutdownTimeout"`
LeaderElection LeaderElectionConfig `yaml:"leaderElection"`
Autoscaler AutoscalerConfig `yaml:"autoscaler"`
ObjectStore ObjectStoreConfig `yaml:"objectStore"`
// PreloadedModelIDs is a list of model IDs to preload. These models are downloaded locally
// at the startup time.
// TODO(kenji): Remove once every env uses ModelConfig.
PreloadedModelIDs []string `yaml:"preloadedModelIds"`
// ModelContextLengths is a map of model ID to context length. If not specified, the default
// context length is used.
// TODO(kenji): Remove once every env uses ModelConfig.
ModelContextLengths map[string]int `yaml:"modelContextLengths"`
Debug DebugConfig `yaml:"debug"`
InferenceManagerServerWorkerServiceAddr string `yaml:"inferenceManagerServerWorkerServiceAddr"`
ModelManagerServerWorkerServiceAddr string `yaml:"modelManagerServerWorkerServiceAddr"`
Worker WorkerConfig `yaml:"worker"`
}
Config is the configuration.
type DebugConfig ¶ added in v0.2.0
type DebugConfig struct {
// Standalone is true if the service is running in standalone mode (except the
// dependency to inference-manager-server).
Standalone bool `yaml:"standalone"`
}
DebugConfig is the debug configuration.
type LeaderElectionConfig ¶ added in v0.310.0
type LeaderElectionConfig struct {
ID string `yaml:"id"`
// LeaseDuration is the duration that non-leader candidates will
// wait to force acquire leadership. This is measured against time of
// last observed ack. Default is 15 seconds.
LeaseDuration *time.Duration `yaml:"leaseDuration"`
// RenewDeadline is the duration that the acting controlplane will retry
// refreshing leadership before giving up. Default is 10 seconds.
RenewDeadline *time.Duration `yaml:"renewDeadline"`
// RetryPeriod is the duration the LeaderElector clients should wait
// between tries of actions. Default is 2 seconds.
RetryPeriod *time.Duration `yaml:"retryPeriod"`
}
LeaderElectionConfig is the leader election configuration.
type ModelConfig ¶ added in v0.332.0
type ModelConfig struct {
Default ModelConfigItem `yaml:"default"`
// Overrides is a map of model ID to the model configuration item to be overridden. Only
// fields that are set in the overrides are applied.
Overrides map[string]ModelConfigItem `yaml:"overrides"`
}
ModelConfig is the model configuration.
type ModelConfigItem ¶ added in v0.332.0
type ModelConfigItem struct {
RuntimeName string `yaml:"runtimeName"`
Resources Resources `yaml:"resources"`
Replicas int `yaml:"replicas"`
// Preloaded is true if the model is preloaded.
// If this is set to true in the default model item, all models that are specified in override items
// are preloaded.
Preloaded bool `yaml:"preloaded"`
// ContextLength is the context length for the model. If the value is 0,
// the default context length is used.
ContextLength int `yaml:"contextLength"`
}
ModelConfigItem is the model configuration item.
type ObjectStoreConfig ¶ added in v0.2.0
type ObjectStoreConfig struct {
S3 S3Config `yaml:"s3"`
}
ObjectStoreConfig is the object store configuration.
func (*ObjectStoreConfig) Validate ¶ added in v0.2.0
func (c *ObjectStoreConfig) Validate() error
Validate validates the object store configuration.
type OllamaConfig ¶ added in v0.180.0
type OllamaConfig struct {
// KeepAlive is the keep-alive duration for Ollama.
// This controls how long Ollama keeps models in GPU memory.
KeepAlive time.Duration `yaml:"keepAlive"`
// NumParallel is the maximum number of requests processed in parallel.
NumParallel int `yaml:"numParallel"`
// ForceSpreading is true if the models should be spread across all GPUs.
ForceSpreading bool `yaml:"forceSpreading"`
Debug bool `yaml:"debug"`
RunnersDir string `yaml:"runnersDir"`
}
OllamaConfig is the Ollama configuration.
type PersistentVolume ¶ added in v0.240.0
type PersistentVolume struct {
StorageClassName string `yaml:"storageClassName"`
Size string `yaml:"size"`
AccessMode string `yaml:"accessMode"`
}
PersistentVolume is the persistent volume configuration.
type ProcessedModelConfig ¶ added in v0.332.0
type ProcessedModelConfig struct {
// contains filtered or unexported fields
}
ProcessedModelConfig is the processed model configuration.
func NewProcessedModelConfig ¶ added in v0.332.0
func NewProcessedModelConfig(c *Config) *ProcessedModelConfig
NewProcessedModelConfig returns a new ProcessedModelConfig.
func (*ProcessedModelConfig) ModelConfigItem ¶ added in v0.332.0
func (c *ProcessedModelConfig) ModelConfigItem(modelID string) ModelConfigItem
ModelConfigItem returns the model configuration item for the given model ID.
func (*ProcessedModelConfig) PreloadedModelIDs ¶ added in v0.332.0
func (c *ProcessedModelConfig) PreloadedModelIDs() []string
PreloadedModelIDs returns the IDs of the models to be preloaded.
type Resources ¶ added in v0.240.0
type Resources struct {
Requests map[string]string `yaml:"requests"`
Limits map[string]string `yaml:"limits"`
Volume *PersistentVolume `yaml:"volume"`
}
Resources is the resources configuration.
type RuntimeConfig ¶ added in v0.240.0
type RuntimeConfig struct {
PullerImage string `yaml:"pullerImage"`
RuntimeImages map[string]string `yaml:"runtimeImages"`
PullerImagePullPolicy string `yaml:"pullerImagePullPolicy"`
RuntimeImagePullPolicy string `yaml:"runtimeImagePullPolicy"`
ConfigMapName string `yaml:"configMapName"`
AWSSecretName string `yaml:"awsSecretName"`
AWSKeyIDEnvKey string `yaml:"awsKeyIdEnvKey"`
AWSAccessKeyEnvKey string `yaml:"awsAccessKeyEnvKey"`
LLMOWorkerSecretName string `yaml:"llmoWorkerSecretName"`
LLMOKeyEnvKey string `yaml:"llmoKeyEnvKey"`
ServiceAccountName string `yaml:"serviceAccountName"`
NodeSelector map[string]string `yaml:"nodeSelector"`
// TODO(kenji): Support affinity
Tolerations []TolerationConfig `yaml:"tolerations"`
// TODO(kenji): Remove the following fields once every env uses ModelConfig.
Name string `yaml:"name"`
ModelResources map[string]Resources `yaml:"modelResources"`
DefaultResources Resources `yaml:"defaultResources"`
// DefaultReplicas specifies the number of replicas of the runtime (per model).
// TODO(kenji): Revisit this once we support autoscaling.
DefaultReplicas int `yaml:"defaultReplicas"`
}
RuntimeConfig is the runtime configuration.
type S3Config ¶ added in v0.2.0
type S3Config struct {
EndpointURL string `yaml:"endpointUrl"`
Region string `yaml:"region"`
Bucket string `yaml:"bucket"`
AssumeRole *AssumeRoleConfig `yaml:"assumeRole"`
}
S3Config is the S3 configuration.
type ScalingConfig ¶ added in v0.280.0
type ScalingConfig struct {
// TargetValue is the per-pod metric value that we target to maintain.
// Currently, this is the concurrent requests per model runtime.
TargetValue float64 `yaml:"targetValue"`
// MaxReplicas is the maximum number of replicas.
// e.g., if this is 10, the pod can be scaled up to 10.
MaxReplicas int32 `yaml:"maxReplicas"`
// MinReplicas is the minimum number of replicas.
// e.g., if this is 0, the pod can be scaled down to 0.
MinReplicas int32 `yaml:"minReplicas"`
// MaxScaleUpRate is the maximum rate of scaling up.
// e.g., current replicas is 2 and this rate is 3.0,
// the pod can be scaled up to 6. (ceil(2 * 3.0) = 6)
MaxScaleUpRate float64 `yaml:"maxScaleUpRate"`
// MaxScaleDownRate is the maximum rate of scaling down.
// e.g., current replicas is 6 and this rate is 0.5,
// the pod can be scaled down to 3. (floor(6 * 0.5) = 3)
MaxScaleDownRate float64 `yaml:"maxScaleDownRate"`
}
ScalingConfig is the scaling configuration.
type TolerationConfig ¶ added in v0.312.0
type TolerationConfig struct {
Key string `yaml:"key"`
Operator string `yaml:"operator"`
Value string `yaml:"value"`
Effect string `yaml:"effect"`
TolerationSeconds int64 `yaml:"tolerationSeconds"`
}
TolerationConfig is the toleration configuration.
type WorkerConfig ¶ added in v0.103.0
type WorkerConfig struct {
TLS WorkerTLSConfig `yaml:"tls"`
}
WorkerConfig is the worker configuration.
type WorkerTLSConfig ¶ added in v0.103.0
type WorkerTLSConfig struct {
Enable bool `yaml:"enable"`
}
WorkerTLSConfig is the worker TLS configuration.