Documentation
¶
Index ¶
Constants ¶
const ( // RuntimeNameOllama is the Ollama runtime name. RuntimeNameOllama string = "ollama" // RuntimeNameVLLM is the VLLM runtime name. RuntimeNameVLLM string = "vllm" )
const ( // DefaultRunMode is the default run mode. DefaultRunMode runMode = "default" // MonolithicRunMode is the monolithic run mode. MonolithicRunMode runMode = "monolithic" )
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type AutoscalerConfig ¶ added in v0.280.0
type AutoscalerConfig struct {
Enable bool `yaml:"enable"`
// InitialDelay is the initial delay before starting the autoscaler.
InitialDelay time.Duration `yaml:"initialDelay"`
// SyncPeriod is the period for calculating the scaling.
SyncPeriod time.Duration `yaml:"syncPeriod"`
// ScaleToZeroGracePeriod is the grace period before scaling to zero.
ScaleToZeroGracePeriod time.Duration `yaml:"scaleToZeroGracePeriod"`
// MetricsWindow is the window size for metrics.
// e.g., if it's 5 minutes, we'll use the 5-minute average as the metric.
MetricsWindow time.Duration `yaml:"metricsWindow"`
RuntimeScalers map[string]ScalingConfig `yaml:"runtimeScalers"`
DefaultScaler ScalingConfig `yaml:"defaultScaler"`
}
AutoscalerConfig is the autoscaler configuration.
type Config ¶
type Config struct {
Runtime RuntimeConfig `yaml:"runtime"`
Ollama OllamaConfig `yaml:"ollama"`
// LLMPort is the port llm listens on.
LLMPort int `yaml:"llmPort"`
HealthPort int `yaml:"healthPort"`
LeaderElection LeaderElectionConfig `yaml:"leaderElection"`
Autoscaler AutoscalerConfig `yaml:"autoscaler"`
ObjectStore ObjectStoreConfig `yaml:"objectStore"`
// PreloadedModelIDs is a list of model IDs to preload. These models are downloaded locally
// at the startup time.
PreloadedModelIDs []string `yaml:"preloadedModelIds"`
// ModelContextLengths is a map of model ID to context length. If not specified, the default
// context length is used.
ModelContextLengths map[string]int `yaml:"modelContextLengths"`
Debug DebugConfig `yaml:"debug"`
InferenceManagerServerWorkerServiceAddr string `yaml:"inferenceManagerServerWorkerServiceAddr"`
ModelManagerServerWorkerServiceAddr string `yaml:"modelManagerServerWorkerServiceAddr"`
Worker WorkerConfig `yaml:"worker"`
}
Config is the configuration.
func (*Config) FormattedModelContextLengths ¶ added in v0.258.0
FormattedModelContextLengths returns the model context lengths keyed by formatted model IDs.
func (*Config) FormattedPreloadedModelIDs ¶ added in v0.258.0
FormattedPreloadedModelIDs returns the formatted IDs of the models to be preloaded.
type DebugConfig ¶ added in v0.2.0
type DebugConfig struct {
// Standalone is true if the service is running in standalone mode (except for the
// dependency on inference-manager-server).
Standalone bool `yaml:"standalone"`
}
DebugConfig is the debug configuration.
type LeaderElectionConfig ¶ added in v0.310.0
type LeaderElectionConfig struct {
Enable bool `yaml:"enable"`
ID string `yaml:"id"`
// LeaseDuration is the duration that non-leader candidates will
// wait to force acquire leadership. This is measured against time of
// last observed ack. Default is 15 seconds.
LeaseDuration *time.Duration `yaml:"leaseDuration"`
// RenewDeadline is the duration that the acting controlplane will retry
// refreshing leadership before giving up. Default is 10 seconds.
RenewDeadline *time.Duration `yaml:"renewDeadline"`
// RetryPeriod is the duration the LeaderElector clients should wait
// between tries of actions. Default is 2 seconds.
RetryPeriod *time.Duration `yaml:"retryPeriod"`
}
LeaderElectionConfig is the leader election configuration.
type ObjectStoreConfig ¶ added in v0.2.0
type ObjectStoreConfig struct {
S3 S3Config `yaml:"s3"`
}
ObjectStoreConfig is the object store configuration.
func (*ObjectStoreConfig) Validate ¶ added in v0.2.0
func (c *ObjectStoreConfig) Validate() error
Validate validates the object store configuration.
type OllamaConfig ¶ added in v0.180.0
type OllamaConfig struct {
// KeepAlive is the keep-alive duration for Ollama.
// This controls how long Ollama keeps models in GPU memory.
KeepAlive time.Duration `yaml:"keepAlive"`
// NumParallel is the maximum number of requests processed in parallel.
NumParallel int `yaml:"numParallel"`
// ForceSpreading is true if the models should be spread across all GPUs.
ForceSpreading bool `yaml:"forceSpreading"`
Debug bool `yaml:"debug"`
RunnersDir string `yaml:"runnersDir"`
}
OllamaConfig is the Ollama configuration.
type PersistentVolume ¶ added in v0.240.0
type PersistentVolume struct {
StorageClassName string `yaml:"storageClassName"`
Size string `yaml:"size"`
AccessMode string `yaml:"accessMode"`
}
PersistentVolume is the persistent volume configuration.
type Resources ¶ added in v0.240.0
type Resources struct {
Requests map[string]string `yaml:"requests"`
Limits map[string]string `yaml:"limits"`
Volume *PersistentVolume `yaml:"volume"`
}
Resources is the resources configuration.
type RuntimeConfig ¶ added in v0.240.0
type RuntimeConfig struct {
Name string `yaml:"name"`
PullerImage string `yaml:"pullerImage"`
RuntimeImages map[string]string `yaml:"runtimeImages"`
PullerImagePullPolicy string `yaml:"pullerImagePullPolicy"`
RuntimeImagePullPolicy string `yaml:"runtimeImagePullPolicy"`
ConfigMapName string `yaml:"configMapName"`
AWSSecretName string `yaml:"awsSecretName"`
AWSKeyIDEnvKey string `yaml:"awsKeyIdEnvKey"`
AWSAccessKeyEnvKey string `yaml:"awsAccessKeyEnvKey"`
LLMOWorkerSecretName string `yaml:"llmoWorkerSecretName"`
LLMOKeyEnvKey string `yaml:"llmoKeyEnvKey"`
ModelResources map[string]Resources `yaml:"modelResources"`
DefaultResources Resources `yaml:"defaultResources"`
// DefaultReplicas specifies the number of replicas of the runtime (per model).
// TODO(kenji): Revisit this once we support autoscaling.
DefaultReplicas int `yaml:"defaultReplicas"`
ServiceAccountName string `yaml:"serviceAccountName"`
NodeSelector map[string]string `yaml:"nodeSelector"`
// TODO(kenji): Support affinity
Tolerations []TolerationConfig `yaml:"tolerations"`
}
RuntimeConfig is the runtime configuration.
func (*RuntimeConfig) FormattedModelResources ¶ added in v0.258.0
func (c *RuntimeConfig) FormattedModelResources() map[string]Resources
FormattedModelResources returns the resources keyed by formatted model IDs.
type S3Config ¶ added in v0.2.0
type S3Config struct {
EndpointURL string `yaml:"endpointUrl"`
Region string `yaml:"region"`
Bucket string `yaml:"bucket"`
}
S3Config is the S3 configuration.
type ScalingConfig ¶ added in v0.280.0
type ScalingConfig struct {
// TargetValue is the per-pod metric value that we target to maintain.
// Currently, this is the concurrent requests per model runtime.
TargetValue float64 `yaml:"targetValue"`
// MaxReplicas is the maximum number of replicas.
// e.g., if this is 10, the pod can be scaled up to 10.
MaxReplicas int32 `yaml:"maxReplicas"`
// MinReplicas is the minimum number of replicas.
// e.g., if this is 0, the pod can be scaled down to 0.
MinReplicas int32 `yaml:"minReplicas"`
// MaxScaleUpRate is the maximum rate of scaling up.
// e.g., current replicas is 2 and this rate is 3.0,
// the pod can be scaled up to 6. (ceil(2 * 3.0) = 6)
MaxScaleUpRate float64 `yaml:"maxScaleUpRate"`
// MaxScaleDownRate is the maximum rate of scaling down.
// e.g., current replicas is 6 and this rate is 0.5,
// the pod can be scaled down to 3. (floor(6 * 0.5) = 3)
MaxScaleDownRate float64 `yaml:"maxScaleDownRate"`
}
ScalingConfig is the scaling configuration.
type TolerationConfig ¶ added in v0.312.0
type TolerationConfig struct {
Key string `yaml:"key"`
Operator string `yaml:"operator"`
Value string `yaml:"value"`
Effect string `yaml:"effect"`
TolerationSeconds int64 `yaml:"tolerationSeconds"`
}
TolerationConfig is the toleration configuration.
type WorkerConfig ¶ added in v0.103.0
type WorkerConfig struct {
TLS WorkerTLSConfig `yaml:"tls"`
}
WorkerConfig is the worker configuration.
type WorkerTLSConfig ¶ added in v0.103.0
type WorkerTLSConfig struct {
Enable bool `yaml:"enable"`
}
WorkerTLSConfig is the worker TLS configuration.