Documentation
¶
Index ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type AutoscalerConfig ¶ added in v0.280.0
type AutoscalerConfig struct { Enable bool `yaml:"enable"` // InitialDelay is the initial delay before starting the autoscaler. InitialDelay time.Duration `yaml:"initialDelay"` // SyncPeriod is the period for calculating the scaling. SyncPeriod time.Duration `yaml:"syncPeriod"` // ScaleToZeroGracePeriod is the grace period before scaling to zero. ScaleToZeroGracePeriod time.Duration `yaml:"scaleToZeroGracePeriod"` // MetricsWindow is the window size for metrics. // e.g., if it's 5 minutes, we'll use the 5-minute average as the metric. MetricsWindow time.Duration `yaml:"metricsWindow"` RuntimeScalers map[string]ScalingConfig `yaml:"runtimeScalers"` DefaultScaler ScalingConfig `yaml:"defaultScaler"` }
AutoscalerConfig is the autoscaler configuration.
type Config ¶
type Config struct { Runtime RuntimeConfig `yaml:"runtime"` Ollama OllamaConfig `yaml:"ollama"` VLLM VLLMConfig `yaml:"vllm"` LLMEngine llmkind.K `yaml:"llmEngine"` // LLMPort is the port llm listens on. LLMPort int `yaml:"llmPort"` HealthPort int `yaml:"healthPort"` Autoscaler AutoscalerConfig `yaml:"autoscaler"` ObjectStore ObjectStoreConfig `yaml:"objectStore"` // PreloadedModelIDs is a list of model IDs to preload. These models are downloaded locally // at the startup time. PreloadedModelIDs []string `yaml:"preloadedModelIds"` // ModelContextLengths is a map of model ID to context length. If not specified, the default // context length is used. ModelContextLengths map[string]int `yaml:"modelContextLengths"` Debug DebugConfig `yaml:"debug"` InferenceManagerServerWorkerServiceAddr string `yaml:"inferenceManagerServerWorkerServiceAddr"` ModelManagerServerWorkerServiceAddr string `yaml:"modelManagerServerWorkerServiceAddr"` Worker WorkerConfig `yaml:"worker"` }
Config is the configuration.
func (*Config) FormattedModelContextLengths ¶ added in v0.258.0
FormattedModelContextLengths returns the model context lengths keyed by formatted model IDs.
func (*Config) FormattedPreloadedModelIDs ¶ added in v0.258.0
FormattedPreloadedModelIDs returns the formatted IDs of the models to be preloaded.
type DebugConfig ¶ added in v0.2.0
// DebugConfig is the debug configuration.
type DebugConfig struct {
	// Standalone is true if the service is running in standalone mode (except the
	// dependency on inference-manager-server).
	Standalone bool `yaml:"standalone"`
}
DebugConfig is the debug configuration.
type ObjectStoreConfig ¶ added in v0.2.0
// ObjectStoreConfig is the object store configuration.
type ObjectStoreConfig struct {
// S3 is the S3 configuration of the object store.
S3 S3Config `yaml:"s3"`
}
ObjectStoreConfig is the object store configuration.
func (*ObjectStoreConfig) Validate ¶ added in v0.2.0
func (c *ObjectStoreConfig) Validate() error
Validate validates the object store configuration.
type OllamaConfig ¶ added in v0.180.0
// OllamaConfig is the Ollama configuration.
type OllamaConfig struct {
	// KeepAlive is the keep-alive duration for Ollama.
	// This controls how long Ollama keeps models in GPU memory.
	KeepAlive time.Duration `yaml:"keepAlive"`

	// NumParallel is the maximum number of requests processed in parallel.
	NumParallel int `yaml:"numParallel"`

	// ForceSpreading is true if the models should be spread across all GPUs.
	ForceSpreading bool `yaml:"forceSpreading"`

	// Debug is the Ollama debug flag.
	// NOTE(review): its exact effect is not visible here — confirm against the consumer.
	Debug bool `yaml:"debug"`

	// RunnersDir is the runners directory setting.
	// NOTE(review): presumably a filesystem path used by Ollama runners — confirm.
	RunnersDir string `yaml:"runnersDir"`
}
OllamaConfig is the Ollama configuration.
type PersistentVolume ¶ added in v0.240.0
// PersistentVolume is the persistent volume configuration.
type PersistentVolume struct {
	// StorageClassName is the storage class name of the volume.
	StorageClassName string `yaml:"storageClassName"`
	// Size is the volume size, kept as a string.
	// NOTE(review): presumably a Kubernetes quantity (e.g. "100Gi") — confirm.
	Size string `yaml:"size"`
	// AccessMode is the volume access mode, kept as a string.
	// NOTE(review): presumably a Kubernetes access mode (e.g. "ReadWriteOnce") — confirm.
	AccessMode string `yaml:"accessMode"`
}
PersistentVolume is the persistent volume configuration.
type Resources ¶ added in v0.240.0
type Resources struct { Requests map[string]string `yaml:"requests"` Limits map[string]string `yaml:"limits"` Volume *PersistentVolume `yaml:"volume"` }
Resources is the resources configuration.
type RuntimeConfig ¶ added in v0.240.0
type RuntimeConfig struct { Name string `yaml:"name"` PullerImage string `yaml:"pullerImage"` RuntimeImages map[string]string `yaml:"runtimeImages"` PullerImagePullPolicy string `yaml:"pullerImagePullPolicy"` RuntimeImagePullPolicy string `yaml:"runtimeImagePullPolicy"` ConfigMapName string `yaml:"configMapName"` AWSSecretName string `yaml:"awsSecretName"` AWSKeyIDEnvKey string `yaml:"awsKeyIdEnvKey"` AWSAccessKeyEnvKey string `yaml:"awsAccessKeyEnvKey"` LLMOWorkerSecretName string `yaml:"llmoWorkerSecretName"` LLMOKeyEnvKey string `yaml:"llmoKeyEnvKey"` ModelResources map[string]Resources `yaml:"modelResources"` DefaultResources Resources `yaml:"defaultResources"` // DefaultReplicas specifies the number of replicas of the runtime (per model). // TODO(kenji): Revisit this once we support autoscaling. DefaultReplicas int `yaml:"defaultReplicas"` }
RuntimeConfig is the runtime configuration.
func (*RuntimeConfig) FormattedModelResources ¶ added in v0.258.0
func (c *RuntimeConfig) FormattedModelResources() map[string]Resources
FormattedModelResources returns the resources keyed by formatted model IDs.
type S3Config ¶ added in v0.2.0
// S3Config is the S3 configuration.
type S3Config struct {
	// EndpointURL is the S3 endpoint URL.
	EndpointURL string `yaml:"endpointUrl"`
	// Region is the S3 region.
	Region string `yaml:"region"`
	// Bucket is the S3 bucket.
	Bucket string `yaml:"bucket"`
}
S3Config is the S3 configuration.
type ScalingConfig ¶ added in v0.280.0
// ScalingConfig is the scaling configuration.
//
// It bounds the replica count and limits how fast the replica count may
// change in either direction.
type ScalingConfig struct {
	// TargetValue is the per-pod metric value that we target to maintain.
	// Currently, this is the concurrent requests per model runtime.
	TargetValue float64 `yaml:"targetValue"`

	// MaxReplicas is the maximum number of replicas.
	// e.g., if this is 10, the pod can be scaled up to 10.
	MaxReplicas int32 `yaml:"maxReplicas"`
	// MinReplicas is the minimum number of replicas.
	// e.g., if this is 0, the pod can be scaled down to 0.
	MinReplicas int32 `yaml:"minReplicas"`

	// MaxScaleUpRate is the maximum rate of scaling up.
	// e.g., current replicas is 2 and this rate is 3.0,
	// the pod can be scaled up to 6. (ceil(2 * 3.0) = 6)
	MaxScaleUpRate float64 `yaml:"maxScaleUpRate"`
	// MaxScaleDownRate is the maximum rate of scaling down.
	// e.g., current replicas is 6 and this rate is 0.5,
	// the pod can be scaled down to 3. (floor(6 * 0.5) = 3)
	MaxScaleDownRate float64 `yaml:"maxScaleDownRate"`
}
ScalingConfig is the scaling configuration.
type VLLMConfig ¶ added in v0.190.0
VLLMConfig is the configuration for vLLM.
type WorkerConfig ¶ added in v0.103.0
// WorkerConfig is the worker configuration.
type WorkerConfig struct {
// TLS is the worker TLS configuration.
TLS WorkerTLSConfig `yaml:"tls"`
}
WorkerConfig is the worker configuration.
type WorkerTLSConfig ¶ added in v0.103.0
// WorkerTLSConfig is the worker TLS configuration.
type WorkerTLSConfig struct {
// Enable is the worker TLS on/off flag.
// NOTE(review): presumably TLS is used for worker connections when true — confirm against the consumer.
Enable bool `yaml:"enable"`
}
WorkerTLSConfig is the worker TLS configuration.