modelagent

package

Version: v0.1.3 (latest) | Published: Jul 16, 2025 | License: MIT | Imports: 43 | Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/sgl-project/ome

Links

Open Source Insights

Documentation ¶

Overview ¶

Package modelagent implements the model agent components for managing models in OME.

Index ¶

Constants
func GetModelTypeNamespaceAndName(task *GopherTask) (string, string, string)
func RegisterMetricsHandler(mux *http.ServeMux)
type CacheEntry
type ConfigMapMetadataOp
type ConfigMapReconciler
- func NewConfigMapReconciler(nodeName string, namespace string, kubeClient kubernetes.Interface, ...) *ConfigMapReconciler
- func (c *ConfigMapReconciler) DeleteModelFromConfigMap(ctx context.Context, baseModel *v1beta1.BaseModel, ...) error
- func (c *ConfigMapReconciler) ReconcileModelMetadata(ctx context.Context, op *ConfigMapMetadataOp) error
- func (c *ConfigMapReconciler) ReconcileModelStatus(ctx context.Context, statusOp *ConfigMapStatusOp) error
- func (c *ConfigMapReconciler) StartReconciliation()
- func (c *ConfigMapReconciler) StopReconciliation()
type ConfigMapStatusOp
type Gopher
- func NewGopher(modelConfigParser *ModelConfigParser, configMapReconciler *ConfigMapReconciler, ...) (*Gopher, error)
- func (s *Gopher) Run(stopCh <-chan struct{}, numWorker int)
type GopherTask
type GopherTaskType
type Metrics
- func NewMetrics(registerer prometheus.Registerer) *Metrics
- func (m *Metrics) ObserveDownloadDuration(modelType, namespace, name string, duration time.Duration)
- func (m *Metrics) ObserveVerificationDuration(duration time.Duration)
- func (m *Metrics) RecordBytesTransferred(modelType, namespace, name string, bytes int64)
- func (m *Metrics) RecordFailedDownload(modelType, namespace, name, errorType string)
- func (m *Metrics) RecordGCDuration(duration time.Duration)
- func (m *Metrics) RecordRateLimit(modelType, namespace, name string, waitDuration time.Duration)
- func (m *Metrics) RecordSuccessfulDownload(modelType, namespace, name string)
- func (m *Metrics) RecordVerification(modelType, namespace, name string, success bool)
type ModelAgentHealthCheck
- func NewModelAgentHealthCheck(modelsRootDir string) ModelAgentHealthCheck
- func (h ModelAgentHealthCheck) Check(_ *http.Request) error
- func (h ModelAgentHealthCheck) Name() string
type ModelConfig
- func ConvertMetadataToModelConfig(metadata ModelMetadata) *ModelConfig
type ModelConfigParser
- func NewModelConfigParser(omeClient versioned.Interface, logger *zap.SugaredLogger) *ModelConfigParser
- func (p *ModelConfigParser) ParseModelConfig(modelDir string, baseModel *v1beta1.BaseModel, ...) (*ModelMetadata, error)
type ModelEntry
type ModelMetadata
type ModelStateOnNode
type ModelStatus
type NodeLabelOp
type NodeLabelReconciler
- func NewNodeLabelReconciler(nodeName string, kubeClient kubernetes.Interface, opRetry int, ...) *NodeLabelReconciler
- func (n *NodeLabelReconciler) ReconcileNodeLabels(op *NodeLabelOp) error
type Scout
- func NewScout(ctx context.Context, nodeName string, ...) (*Scout, error)
- func (w *Scout) Run(stopCh <-chan struct{}) error
type TensorRTLLMShapeFilter

Constants ¶

View Source

const (
	BigFileSizeInMB = 200
)

View Source

const ConfigParsingAnnotation = "ome.oracle.com/skip-config-parsing"

ConfigParsingAnnotation is the annotation key to skip config parsing

View Source

const (
	// DefaultConfigFileName Default config file name used by Hugging Face models
	DefaultConfigFileName = "config.json"
)

Variables ¶

This section is empty.

Functions ¶

func GetModelTypeNamespaceAndName ¶

func GetModelTypeNamespaceAndName(task *GopherTask) (string, string, string)

GetModelTypeNamespaceAndName extracts the model type, namespace, and name from a gopher task

func RegisterMetricsHandler ¶

func RegisterMetricsHandler(mux *http.ServeMux)

RegisterMetricsHandler registers the metrics HTTP handler

Types ¶

type CacheEntry ¶

type CacheEntry struct {
	ModelName     string         // Name of the model
	ModelStatus   ModelStatus    // Current status of the model
	ModelMetadata *ModelMetadata // Model metadata if available
}

CacheEntry represents an entry in the model cache for ConfigMap reconciliation.

type ConfigMapMetadataOp ¶

type ConfigMapMetadataOp struct {
	ModelMetadata    ModelMetadata             // The metadata to be stored for the model
	BaseModel        *v1beta1.BaseModel        // Reference to a namespace-scoped BaseModel (nil if using ClusterBaseModel)
	ClusterBaseModel *v1beta1.ClusterBaseModel // Reference to a cluster-scoped BaseModel (nil if using BaseModel)
}

ConfigMapMetadataOp represents an operation to update model metadata in ConfigMap. It contains the necessary information to identify the model and its metadata.

type ConfigMapReconciler ¶

type ConfigMapReconciler struct {
	// contains filtered or unexported fields
}

ConfigMapReconciler handles all ConfigMap operations for storing model state and metadata. It provides self-healing capabilities through periodic reconciliation to recover from manual ConfigMap deletions or modifications without requiring agent restarts.

func NewConfigMapReconciler ¶

func NewConfigMapReconciler(nodeName string, namespace string, kubeClient kubernetes.Interface, logger *zap.SugaredLogger) *ConfigMapReconciler

NewConfigMapReconciler creates a new ConfigMapReconciler with the given parameters. It initializes the in-memory model cache and sets up the reconciliation interval.

Parameters:

nodeName: Name of the node, used as the ConfigMap name
namespace: Kubernetes namespace where the ConfigMap will be stored
kubeClient: Interface to the Kubernetes API
logger: Structured logger for operation recording

Returns:

A configured ConfigMapReconciler ready to use

func (*ConfigMapReconciler) DeleteModelFromConfigMap ¶

func (c *ConfigMapReconciler) DeleteModelFromConfigMap(ctx context.Context, baseModel *v1beta1.BaseModel, clusterBaseModel *v1beta1.ClusterBaseModel) error

DeleteModelFromConfigMap removes a model entry from the ConfigMap

Parameters:

ctx: Context for cancellation and timeouts
baseModel: The BaseModel reference
clusterBaseModel: The ClusterBaseModel reference

Returns:

error: nil if deletion succeeds or model doesn't exist, error otherwise

func (*ConfigMapReconciler) ReconcileModelMetadata ¶

func (c *ConfigMapReconciler) ReconcileModelMetadata(ctx context.Context, op *ConfigMapMetadataOp) error

ReconcileModelMetadata updates the ConfigMap with model metadata

func (*ConfigMapReconciler) ReconcileModelStatus ¶

func (c *ConfigMapReconciler) ReconcileModelStatus(ctx context.Context, statusOp *ConfigMapStatusOp) error

ReconcileModelStatus updates the ConfigMap with model status information and synchronizes the in-memory cache.

This method performs two key operations:

1. Updates the model status in the Kubernetes ConfigMap, creating it if necessary.
2. Synchronizes the in-memory model cache with the updated status information.

The cache updates are atomic and protected by a mutex, ensuring thread safety even with concurrent reconciliation. Both operations must succeed for the method to return nil; otherwise an error is returned.

Parameters:

ctx: Context for cancellation and timeouts
statusOp: ConfigMapStatusOp containing model references and new status

Returns:

error: nil if both ConfigMap and cache updates succeed, error otherwise

func (*ConfigMapReconciler) StartReconciliation ¶

func (c *ConfigMapReconciler) StartReconciliation()

StartReconciliation begins the periodic reconciliation of ConfigMaps. This launches a background goroutine that checks for ConfigMap consistency at regular intervals and repairs any detected issues without requiring agent restarts. The interval is configurable through the reconcileInterval field (default: 5 minutes).

This method should be called once during component initialization, typically from the model agent's main startup sequence.

func (*ConfigMapReconciler) StopReconciliation ¶

func (c *ConfigMapReconciler) StopReconciliation()

StopReconciliation safely stops the periodic reconciliation process. This should be called during graceful shutdown of the component to ensure that background goroutines are properly terminated. This method is idempotent - calling it multiple times has no additional effect.

type ConfigMapStatusOp ¶

type ConfigMapStatusOp struct {
	ModelStatus      ModelStatus               // The updated status of the model
	BaseModel        *v1beta1.BaseModel        // Reference to a namespace-scoped BaseModel (nil if using ClusterBaseModel)
	ClusterBaseModel *v1beta1.ClusterBaseModel // Reference to a cluster-scoped BaseModel (nil if using BaseModel)
}

ConfigMapStatusOp represents an operation to update model status in ConfigMap. It contains the necessary information to identify the model and its new status.

type Gopher ¶

type Gopher struct {
	// contains filtered or unexported fields
}

func NewGopher ¶

func NewGopher(
	modelConfigParser *ModelConfigParser,
	configMapReconciler *ConfigMapReconciler,
	hubClient *hub.HubClient,
	kubeClient kubernetes.Interface,
	concurrency int,
	multipartConcurrency int,
	downloadRetry int,
	modelRootDir string,
	gopherChan <-chan *GopherTask,
	nodeLabelReconciler *NodeLabelReconciler,
	metrics *Metrics,
	logger *zap.SugaredLogger) (*Gopher, error)

func (*Gopher) Run ¶

func (s *Gopher) Run(stopCh <-chan struct{}, numWorker int)

type GopherTask ¶

type GopherTask struct {
	TaskType               GopherTaskType
	BaseModel              *v1beta1.BaseModel
	ClusterBaseModel       *v1beta1.ClusterBaseModel
	TensorRTLLMShapeFilter *TensorRTLLMShapeFilter
}

type GopherTaskType ¶

type GopherTaskType string

const (
	Download         GopherTaskType = "Download"
	DownloadOverride GopherTaskType = "DownloadOverride"
	Delete           GopherTaskType = "Delete"
)

type Metrics ¶

type Metrics struct {
	// contains filtered or unexported fields
}

Metrics is a struct that contains all metrics for the model-agent

func NewMetrics ¶

func NewMetrics(registerer prometheus.Registerer) *Metrics

NewMetrics creates a new Metrics struct with initialized Prometheus metrics

func (*Metrics) ObserveDownloadDuration ¶

func (m *Metrics) ObserveDownloadDuration(modelType, namespace, name string, duration time.Duration)

ObserveDownloadDuration records the duration of a model download

func (*Metrics) ObserveVerificationDuration ¶

func (m *Metrics) ObserveVerificationDuration(duration time.Duration)

ObserveVerificationDuration records the duration of a model verification

func (*Metrics) RecordBytesTransferred ¶

func (m *Metrics) RecordBytesTransferred(modelType, namespace, name string, bytes int64)

RecordBytesTransferred records the number of bytes transferred during a download

func (*Metrics) RecordFailedDownload ¶

func (m *Metrics) RecordFailedDownload(modelType, namespace, name, errorType string)

RecordFailedDownload records a failed model download

func (*Metrics) RecordGCDuration ¶

func (m *Metrics) RecordGCDuration(duration time.Duration)

RecordGCDuration records the duration of a garbage collection cycle

func (*Metrics) RecordRateLimit ¶ added in v0.1.1

func (m *Metrics) RecordRateLimit(modelType, namespace, name string, waitDuration time.Duration)

RecordRateLimit records a rate limit event

func (*Metrics) RecordSuccessfulDownload ¶

func (m *Metrics) RecordSuccessfulDownload(modelType, namespace, name string)

RecordSuccessfulDownload records a successful model download

func (*Metrics) RecordVerification ¶

func (m *Metrics) RecordVerification(modelType, namespace, name string, success bool)

RecordVerification records a model verification

type ModelAgentHealthCheck ¶

type ModelAgentHealthCheck struct {
	// contains filtered or unexported fields
}

func NewModelAgentHealthCheck ¶

func NewModelAgentHealthCheck(modelsRootDir string) ModelAgentHealthCheck

func (ModelAgentHealthCheck) Check ¶

func (h ModelAgentHealthCheck) Check(_ *http.Request) error

func (ModelAgentHealthCheck) Name ¶

func (h ModelAgentHealthCheck) Name() string

type ModelConfig ¶

type ModelConfig struct {
	// Core model identification
	ModelType         string `json:"modelType,omitempty"`         // e.g., "mistral", "llama", "phi"
	ModelArchitecture string `json:"modelArchitecture,omitempty"` // e.g., "MistralModel", "LlamaModel"

	// Framework and format information
	ModelFramework map[string]string `json:"modelFramework,omitempty"` // e.g., {"name":"transformers","version":"4.34.0"}
	ModelFormat    map[string]string `json:"modelFormat,omitempty"`    // e.g., {"name":"safetensors","version":"1.0.0"}

	// Model capabilities and size
	ModelParameterSize string   `json:"modelParameterSize,omitempty"` // Human-readable size, e.g., "7.11B"
	MaxTokens          int32    `json:"maxTokens,omitempty"`          // Maximum context length, e.g., 32768
	ModelCapabilities  []string `json:"modelCapabilities,omitempty"`  // e.g., ["TEXT_GENERATION", "TEXT_EMBEDDINGS"]

	// Advanced information
	DecodedModelConfiguration map[string]interface{} `json:"decodedModelConfiguration,omitempty"` // Detailed configuration
	Quantization              string                 `json:"quantization,omitempty"`              // Quantization type if applicable
}

ModelConfig represents the configuration of a model. This is a structured version of the model metadata that is stored in the ConfigMap.

func ConvertMetadataToModelConfig ¶

func ConvertMetadataToModelConfig(metadata ModelMetadata) *ModelConfig

ConvertMetadataToModelConfig converts internal ModelMetadata to a client-facing ModelConfig. This transforms the internal representation to the structured format stored in ConfigMaps.

type ModelConfigParser ¶

type ModelConfigParser struct {
	// contains filtered or unexported fields
}

ModelConfigParser is responsible for parsing model config files and updating the corresponding Model CRD

func NewModelConfigParser ¶

func NewModelConfigParser(omeClient versioned.Interface, logger *zap.SugaredLogger) *ModelConfigParser

NewModelConfigParser creates a new model config parser

func (*ModelConfigParser) ParseModelConfig ¶

func (p *ModelConfigParser) ParseModelConfig(modelDir string, baseModel *v1beta1.BaseModel, clusterBaseModel *v1beta1.ClusterBaseModel) (*ModelMetadata, error)

ParseModelConfig reads the config.json file from the model directory and extracts metadata without updating any resources. This allows the caller to control when and how updates happen.

type ModelEntry ¶

type ModelEntry struct {
	Name   string       `json:"name"`             // Name of the model
	Status ModelStatus  `json:"status"`           // Current status of the model on this node
	Config *ModelConfig `json:"config,omitempty"` // Model configuration, may be nil if just tracking status
}

ModelEntry represents an entry in the node model ConfigMap. This is the top-level structure stored for each model in the ConfigMap.

type ModelMetadata ¶

type ModelMetadata struct {
	ModelType                 string
	ModelArchitecture         string
	ModelFramework            *v1beta1.ModelFrameworkSpec
	ModelFormat               v1beta1.ModelFormat
	ModelParameterSize        string
	MaxTokens                 int32
	ModelCapabilities         []string
	ModelConfiguration        []byte
	DecodedModelConfiguration map[string]interface{} `json:"DecodedModelConfiguration,omitempty"`
	Quantization              v1beta1.ModelQuantization
}

ModelMetadata contains the extracted metadata about a model

type ModelStateOnNode ¶

type ModelStateOnNode string

ModelStateOnNode represents the model state in legacy format. It is maintained for backward compatibility with existing code paths.

const (
	// Ready indicates the model is ready to use
	Ready ModelStateOnNode = "Ready"
	// Updating indicates the model is being downloaded or updated
	Updating ModelStateOnNode = "Updating"
	// Failed indicates the model failed to download or initialize
	Failed ModelStateOnNode = "Failed"
	// Deleted indicates the model was marked for deletion
	Deleted ModelStateOnNode = "Deleted"
)

Model state constants (legacy format)

type ModelStatus ¶

type ModelStatus string

ModelStatus represents the status of a model on a node

const (
	// ModelStatusReady indicates the model is ready for use
	ModelStatusReady ModelStatus = "Ready"
	// ModelStatusUpdating indicates the model is currently being downloaded or updated
	ModelStatusUpdating ModelStatus = "Updating"
	// ModelStatusFailed indicates the model failed to download or initialize
	ModelStatusFailed ModelStatus = "Failed"
	// ModelStatusDeleted indicates the model was deleted
	ModelStatusDeleted ModelStatus = "Deleted"
)

Model status constants

type NodeLabelOp ¶

type NodeLabelOp struct {
	ModelStateOnNode ModelStateOnNode
	BaseModel        *v1beta1.BaseModel
	ClusterBaseModel *v1beta1.ClusterBaseModel
}

NodeLabelOp represents an operation on node labels. This is used to pass model references to the NodeLabelReconciler.

type NodeLabelReconciler ¶

type NodeLabelReconciler struct {
	// contains filtered or unexported fields
}

NodeLabelReconciler handles updating node labels with model status information. It provides a clean separation from ConfigMap operations.

func NewNodeLabelReconciler ¶

func NewNodeLabelReconciler(nodeName string, kubeClient kubernetes.Interface, opRetry int, logger *zap.SugaredLogger) *NodeLabelReconciler

NewNodeLabelReconciler creates a new NodeLabelReconciler instance

func (*NodeLabelReconciler) ReconcileNodeLabels ¶

func (n *NodeLabelReconciler) ReconcileNodeLabels(op *NodeLabelOp) error

ReconcileNodeLabels applies model state changes to node labels with retries

type Scout ¶ added in v0.1.3

type Scout struct {
	// contains filtered or unexported fields
}

func NewScout ¶

func NewScout(ctx context.Context, nodeName string,
	baseModelInformer omev1beta1.BaseModelInformer,
	clusterBaseModelInformer omev1beta1.ClusterBaseModelInformer,
	informerFactory omev1beta1informers.SharedInformerFactory,
	gopherChan chan<- *GopherTask,
	kubeClient *kubernetes.Clientset,
	logger *zap.SugaredLogger) (*Scout, error)

func (*Scout) Run ¶ added in v0.1.3

func (w *Scout) Run(stopCh <-chan struct{}) error

type TensorRTLLMShapeFilter ¶

type TensorRTLLMShapeFilter struct {
	IsTensorrtLLMModel bool
	ShapeAlias         string
	ModelType          string
}

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL