metrics

package
v0.4.1 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Aug 19, 2025 License: Apache-2.0 Imports: 19 Imported by: 0

Documentation

Index

Constants

View Source
// Metric name constants used as keys into the Metrics registry and as the
// canonical names under which values are stored/looked up.
const (
	// Raw request-state and throughput metrics scraped directly from engine pods.
	NumRequestsRunning                   = "num_requests_running"
	NumRequestsWaiting                   = "num_requests_waiting"
	NumRequestsSwapped                   = "num_requests_swapped"
	AvgPromptThroughputToksPerS          = "avg_prompt_throughput_toks_per_s"
	AvgGenerationThroughputToksPerS      = "avg_generation_throughput_toks_per_s"
	IterationTokensTotal                 = "iteration_tokens_total"
	// Raw latency histograms scraped directly from engine pods.
	TimeToFirstTokenSeconds              = "time_to_first_token_seconds"
	TimePerOutputTokenSeconds            = "time_per_output_token_seconds"
	E2ERequestLatencySeconds             = "e2e_request_latency_seconds"
	RequestQueueTimeSeconds              = "request_queue_time_seconds"
	RequestInferenceTimeSeconds          = "request_inference_time_seconds"
	RequestDecodeTimeSeconds             = "request_decode_time_seconds"
	RequestPrefillTimeSeconds            = "request_prefill_time_seconds"
	// Derived (PromQL) latency aggregates over a 5-minute window.
	P95TTFT5m                            = "p95_ttft_5m"
	P95TTFT5mPod                         = "p95_ttft_5m_pod"
	AvgTTFT5mPod                         = "avg_ttft_5m_pod"
	P95TPOT5mPod                         = "p95_tpot_5m_pod"
	// NOTE(review): value is "avg_tpot_pod_5m" ("pod" before "5m"), inconsistent
	// with P95TPOT5mPod's "p95_tpot_5m_pod". Renaming the value would break any
	// external consumer of the stored metric name — confirm intent before changing.
	AvgTPOT5mPod                         = "avg_tpot_pod_5m"
	AvgPromptToksPerReq                  = "avg_prompt_toks_per_req"
	AvgGenerationToksPerReq              = "avg_generation_toks_per_req"
	// Resource-utilization metrics.
	GPUCacheUsagePerc                    = "gpu_cache_usage_perc"
	// NOTE(review): GPUBusyTimeRatio has no corresponding entry in the Metrics
	// registry below — confirm whether one is missing or the constant is unused.
	GPUBusyTimeRatio                     = "gpu_busy_time_ratio"
	CPUCacheUsagePerc                    = "cpu_cache_usage_perc"
	EngineUtilization                    = "engine_utilization"
	// Per-pod derived aggregates.
	AvgE2ELatencyPod                     = "avg_e2e_latency_pod"
	AvgRequestsPerMinPod                 = "avg_requests_per_min_pod"
	AvgPromptThroughputToksPerMinPod     = "avg_prompt_throughput_toks_per_min_pod"
	AvgGenerationThroughputToksPerMinPod = "avg_generation_throughput_toks_per_min_pod"
	// LoRA adapter metrics read from labels of the raw lora_requests_info metric.
	MaxLora                              = "max_lora"
	WaitingLoraAdapters                  = "waiting_lora_adapters"
	RunningLoraAdapters                  = "running_lora_adapters"
	VTCBucketSizeActive                  = "vtc_bucket_size_active"
	// Realtime metrics (no entries in the Metrics registry below).
	RealtimeNumRequestsRunning = "realtime_num_requests_running"
	RealtimeNormalizedPendings = "realtime_normalized_pendings"
)

Variables

View Source
var (

	// Function variables that can be overridden for testing.
	// Production code calls through these indirections so tests can swap in
	// fakes; the defaults delegate to the real Prometheus-backed implementations
	// (defined elsewhere in this package).
	SetGaugeMetricFnForTest         = defaultSetGaugeMetric
	IncrementCounterMetricFnForTest = defaultIncrementCounterMetric
)
View Source
var (
	// Metrics defines all available metrics, including raw and query-based metrics.
	//
	// Raw metrics (MetricSource == PodRawMetrics) are scraped directly from a
	// pod's metrics endpoint; EngineMetricsNameMapping translates the canonical
	// name to each engine's exported metric name. Query metrics
	// (MetricSource == PrometheusEndpoint) are evaluated via the PromQL template,
	// with ${instance}/${model_name} placeholders injected by BuildQuery.
	Metrics = map[string]Metric{

		NumRequestsRunning: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			// NOTE(review): vLLM exposes num_requests_running as a Gauge; declared
			// Counter here — confirm downstream parsing treats both identically.
			MetricType: MetricType{
				Raw: Counter,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm":   "vllm:num_requests_running",
				"sglang": "sglang:num_running_reqs",
			},
			Description: "Number of running requests",
		},
		NumRequestsWaiting: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Counter,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:num_requests_waiting",
			},
			Description: "Number of waiting requests",
		},
		NumRequestsSwapped: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Counter,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:num_requests_swapped",
			},
			Description: "Number of swapped requests",
		},

		AvgPromptThroughputToksPerS: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Gauge,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:avg_prompt_throughput_toks_per_s",
			},
			Description: "Average prompt throughput in tokens per second",
		},
		AvgGenerationThroughputToksPerS: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Gauge,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm":   "vllm:avg_generation_throughput_toks_per_s",
				"sglang": "sglang:gen_throughput",
			},
			Description: "Average generation throughput in tokens per second",
		},

		IterationTokensTotal: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Histogram,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:iteration_tokens_total",
			},
			Description: "Total iteration tokens",
		},
		TimeToFirstTokenSeconds: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Histogram,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm":   "vllm:time_to_first_token_seconds",
				"sglang": "sglang:time_to_first_token_seconds",
			},
			Description: "Time to first token in seconds",
		},
		TimePerOutputTokenSeconds: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Histogram,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm":   "vllm:time_per_output_token_seconds",
				"sglang": "sglang:inter_token_latency_seconds",
			},
			Description: "Time per output token in seconds",
		},
		E2ERequestLatencySeconds: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Histogram,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm":   "vllm:e2e_request_latency_seconds",
				"sglang": "sglang:e2e_request_latency_seconds",
			},
			Description: "End-to-end request latency in seconds",
		},
		RequestQueueTimeSeconds: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Histogram,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:request_queue_time_seconds",
			},
			Description: "Request queue time in seconds",
		},
		RequestInferenceTimeSeconds: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Histogram,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:request_inference_time_seconds",
			},
			Description: "Request inference time in seconds",
		},
		RequestDecodeTimeSeconds: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Histogram,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:request_decode_time_seconds",
			},
			Description: "Request decode time in seconds",
		},
		RequestPrefillTimeSeconds: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Histogram,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:request_prefill_time_seconds",
			},
			Description: "Request prefill time in seconds",
		},

		P95TTFT5m: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PrometheusEndpoint,
			MetricType: MetricType{
				Query: PromQL,
			},
			PromQL:      `histogram_quantile(0.95, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{instance="${instance}", model_name="${model_name}", job="pods"}[5m])))`,
			Description: "95th ttft in last 5 mins",
		},
		P95TTFT5mPod: {
			MetricScope:  PodMetricScope,
			MetricSource: PrometheusEndpoint,
			MetricType: MetricType{
				Query: PromQL,
			},
			PromQL:      `histogram_quantile(0.95, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{instance="${instance}", job="pods"}[5m])))`,
			Description: "95th ttft in last 5 mins",
		},
		AvgTTFT5mPod: {
			MetricScope:  PodMetricScope,
			MetricSource: PrometheusEndpoint,
			MetricType: MetricType{
				Query: PromQL,
			},
			PromQL:      `increase(vllm:time_to_first_token_seconds_sum{instance="${instance}", job="pods"}[5m]) / increase(vllm:time_to_first_token_seconds_count{instance="${instance}", job="pods"}[5m])`,
			Description: "Average ttft in last 5 mins",
		},
		P95TPOT5mPod: {
			MetricScope:  PodMetricScope,
			MetricSource: PrometheusEndpoint,
			MetricType: MetricType{
				Query: PromQL,
			},
			PromQL:      `histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{instance="${instance}", job="pods"}[5m])))`,
			Description: "95th tpot in last 5 mins",
		},
		AvgTPOT5mPod: {
			MetricScope:  PodMetricScope,
			MetricSource: PrometheusEndpoint,
			MetricType: MetricType{
				Query: PromQL,
			},
			// BUG FIX: the denominator previously repeated
			// time_per_output_token_seconds_sum, so the expression evaluated to 1
			// whenever any tokens were produced. The average per-token latency is
			// increase(sum) / increase(count), matching AvgTTFT5mPod above.
			PromQL:      `increase(vllm:time_per_output_token_seconds_sum{instance="${instance}", job="pods"}[5m]) / increase(vllm:time_per_output_token_seconds_count{instance="${instance}", job="pods"}[5m])`,
			Description: "Average tpot in last 5 mins",
		},
		AvgPromptToksPerReq: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PrometheusEndpoint,
			MetricType: MetricType{
				Query: PromQL,
			},
			PromQL:      `increase(vllm:request_prompt_tokens_sum{instance="${instance}", model_name="${model_name}", job="pods"}[1d]) / increase(vllm:request_prompt_tokens_count{instance="${instance}", model_name="${model_name}", job="pods"}[1d])`,
			Description: "Average prompt tokens per request in last day",
		},
		AvgGenerationToksPerReq: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PrometheusEndpoint,
			MetricType: MetricType{
				Query: PromQL,
			},
			PromQL:      `increase(vllm:request_generation_tokens_sum{instance="${instance}", model_name="${model_name}", job="pods"}[1d]) / increase(vllm:request_generation_tokens_count{instance="${instance}", model_name="${model_name}", job="pods"}[1d])`,
			Description: "Average generation tokens per request in last day",
		},
		GPUCacheUsagePerc: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			// NOTE(review): a usage percentage is a snapshot value (Gauge in vLLM),
			// but it is declared Counter here — confirm downstream parsing treats
			// both identically before changing.
			MetricType: MetricType{
				Raw: Counter,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm":   "vllm:gpu_cache_usage_perc",
				"sglang": "sglang:token_usage",
				"xllm":   "kv_cache_utilization",
			},
			Description: "GPU cache usage percentage",
		},
		EngineUtilization: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Gauge,
			},
			EngineMetricsNameMapping: map[string]string{
				"xllm": "engine_utilization",
			},
			// NOTE(review): description reads "GPU busy time ratio", which matches
			// the separate GPUBusyTimeRatio constant — possibly copy-pasted; confirm.
			Description: "GPU busy time ratio",
		},
		CPUCacheUsagePerc: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Counter,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:cpu_cache_usage_perc",
			},
			Description: "CPU cache usage percentage",
		},
		AvgE2ELatencyPod: {
			MetricScope:  PodMetricScope,
			MetricSource: PrometheusEndpoint,
			MetricType: MetricType{
				Query: PromQL,
			},
			PromQL:      `increase(vllm:e2e_request_latency_seconds_sum{instance="${instance}", job="pods"}[5m]) / increase(vllm:e2e_request_latency_seconds_count{instance="${instance}", job="pods"}[5m])`,
			Description: "Average End-to-end latency in last 5 mins",
		},
		AvgRequestsPerMinPod: {
			MetricScope:  PodMetricScope,
			MetricSource: PrometheusEndpoint,
			MetricType: MetricType{
				Query: PromQL,
			},
			// 5-minute increase divided by 5 yields a per-minute rate.
			PromQL:      `increase(vllm:request_success_total{instance="${instance}", job="pods"}[5m]) / 5`,
			Description: "Average requests throughput per minute in last 5 mins",
		},
		AvgPromptThroughputToksPerMinPod: {
			MetricScope:  PodMetricScope,
			MetricSource: PrometheusEndpoint,
			MetricType: MetricType{
				Query: PromQL,
			},
			PromQL:      `increase(vllm:prompt_tokens_total{instance="${instance}", job="pods"}[5m]) / 5`,
			Description: "Average prompt throughput in tokens per minute in last 5 mins",
		},
		AvgGenerationThroughputToksPerMinPod: {
			MetricScope:  PodMetricScope,
			MetricSource: PrometheusEndpoint,
			MetricType: MetricType{
				Query: PromQL,
			},
			PromQL:      `increase(vllm:generation_tokens_total{instance="${instance}", job="pods"}[5m]) / 5`,
			Description: "Average generation throughput in tokens per minute in last 5 mins",
		},
		MaxLora: {
			MetricScope:  PodMetricScope,
			MetricSource: PodRawMetrics,
			// QueryLabel metrics read a label value off the raw metric named by
			// RawMetricName rather than the sample value itself.
			MetricType: MetricType{
				Query: QueryLabel,
			},
			RawMetricName: "lora_requests_info",
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:max_lora",
			},
			Description: "Max count of Lora Adapters",
		},
		RunningLoraAdapters: {
			MetricScope:  PodMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Query: QueryLabel,
			},
			RawMetricName: "lora_requests_info",
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:running_lora_adapters",
			},
			Description: "Count of running Lora Adapters",
		},
		WaitingLoraAdapters: {
			MetricScope:  PodMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Query: QueryLabel,
			},
			RawMetricName: "lora_requests_info",
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:waiting_lora_adapters",
			},
			Description: "Count of waiting Lora Adapters",
		},
		VTCBucketSizeActive: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Gauge,
			},
			// No EngineMetricsNameMapping: this gauge is produced by this system's
			// VTC scheduler, not scraped from an inference engine.
			Description: "Current adaptive bucket size used by VTC algorithm for token normalization",
		},
	}
)

Functions

func BuildQuery

func BuildQuery(queryTemplate string, queryLabels map[string]string) string

BuildQuery dynamically injects labels into a PromQL query template.

func GetCounterGaugeValue

func GetCounterGaugeValue(metric *dto.Metric, metricType dto.MetricType) (float64, error)

func GetGaugeValueForTest

func GetGaugeValueForTest(name string, labelValues ...string) float64

func GetLabelValueForKey

func GetLabelValueForKey(metric *dto.Metric, key string) (string, error)

func GetMetricHelp

func GetMetricHelp(metricName string) string

func IncrementCounterMetric

func IncrementCounterMetric(name string, help string, value float64, labelNames []string, labelValues ...string)

func InitializePrometheusAPI

func InitializePrometheusAPI(endpoint, username, password string) (prometheusv1.API, error)

InitializePrometheusAPI initializes the Prometheus API client.

func ParseMetricFromBody

func ParseMetricFromBody(body []byte, metricName string) (float64, error)

ParseMetricFromBody parses a simple metric from the Prometheus response body.

func ParseMetricsURLWithContext added in v0.4.0

func ParseMetricsURLWithContext(ctx context.Context, url string) (map[string]*dto.MetricFamily, error)

func SetGaugeMetric

func SetGaugeMetric(name string, help string, value float64, labelNames []string, labelValues ...string)

func SetupCounterMetricsForTest

func SetupCounterMetricsForTest(metricName string, labelNames []string) (*prometheus.CounterVec, func())

func SetupMetricsForTest

func SetupMetricsForTest(metricName string, labelNames []string) (*prometheus.GaugeVec, func())

Types

type HistogramMetricValue

// HistogramMetricValue represents a detailed histogram metric: the running
// sum and count plus the per-bucket cumulative counts keyed by upper bound.
type HistogramMetricValue struct {
	Sum     float64
	Count   float64
	Buckets map[string]float64 // e.g., {"0.1": 5, "0.5": 3, "1.0": 2}
}

HistogramMetricValue represents a detailed histogram metric.

func GetHistogramValue

func GetHistogramValue(metric *dto.Metric) (*HistogramMetricValue, error)

func ParseHistogramFromBody

func ParseHistogramFromBody(body []byte, metricName string) (*HistogramMetricValue, error)

ParseHistogramFromBody parses a histogram metric from the Prometheus response body.

func (*HistogramMetricValue) GetBucketValue

func (h *HistogramMetricValue) GetBucketValue(bucket string) (float64, bool)

GetBucketValue returns the count for a specific bucket.

func (*HistogramMetricValue) GetCount

func (h *HistogramMetricValue) GetCount() float64

GetCount returns the total count of values in the histogram.

func (*HistogramMetricValue) GetHistogramValue

func (h *HistogramMetricValue) GetHistogramValue() *HistogramMetricValue

func (*HistogramMetricValue) GetLabelValue

func (s *HistogramMetricValue) GetLabelValue() string

func (*HistogramMetricValue) GetMean

func (h *HistogramMetricValue) GetMean() float64

GetMean returns the mean value of the histogram (Sum / Count).

func (*HistogramMetricValue) GetPercentile

func (h *HistogramMetricValue) GetPercentile(percentile float64) (float64, error)

func (*HistogramMetricValue) GetPrometheusResult

func (h *HistogramMetricValue) GetPrometheusResult() *model.Value

func (*HistogramMetricValue) GetSimpleValue

func (h *HistogramMetricValue) GetSimpleValue() float64

func (*HistogramMetricValue) GetSum

func (h *HistogramMetricValue) GetSum() float64

GetSum returns the sum of the histogram values.

func (*HistogramMetricValue) GetValue

func (h *HistogramMetricValue) GetValue() interface{}

type LabelValueMetricValue

// LabelValueMetricValue represents a metric value carried as a label string
// (used by QueryLabel-type metrics such as the LoRA adapter counts).
type LabelValueMetricValue struct {
	Value string
}

LabelValueMetricValue represents a metric value read from a label of a raw metric.

func (*LabelValueMetricValue) GetHistogramValue

func (l *LabelValueMetricValue) GetHistogramValue() *HistogramMetricValue

func (*LabelValueMetricValue) GetLabelValue

func (l *LabelValueMetricValue) GetLabelValue() string

func (*LabelValueMetricValue) GetPrometheusResult

func (l *LabelValueMetricValue) GetPrometheusResult() *model.Value

func (*LabelValueMetricValue) GetSimpleValue

func (l *LabelValueMetricValue) GetSimpleValue() float64

type Metric

// Metric defines a unique metric with metadata: where it comes from, what
// type of value it carries, and how to query or map it per engine.
type Metric struct {
	MetricSource             MetricSource
	MetricType               MetricType
	PromQL                   string            // Optional: Only applicable for PromQL-based metrics
	RawMetricName            string            // Optional: Only applicable for QueryLabel-based metrics
	EngineMetricsNameMapping map[string]string // Optional: Mapping from engine type to raw metric name.
	Description              string
	MetricScope              MetricScope
}

Metric defines a unique metric with metadata.

type MetricScope

type MetricScope string

MetricScope defines the scope of a metric (e.g., model or pod or podmodel).

// Supported metric scopes.
const (
	ModelMetricScope    MetricScope = "Model"    // aggregated per model
	PodMetricScope      MetricScope = "Pod"      // aggregated per pod
	PodModelMetricScope MetricScope = "PodModel" // model in pod
)

type MetricSource

type MetricSource string

MetricSource defines the metric source

// Supported metric sources.
const (
	// PrometheusEndpoint indicates metrics are queried from a remote Prometheus server.
	// This source allows querying both raw and aggregated metrics, leveraging PromQL for advanced analytics.
	PrometheusEndpoint MetricSource = "PrometheusEndpoint"
	// PodRawMetrics indicates metrics are collected directly from the metricPort of a Pod.
	PodRawMetrics MetricSource = "PodRawMetrics"
)

type MetricSubscriber

// MetricSubscriber is implemented by components that want to declare which
// metric names (keys of the Metrics registry) they consume.
type MetricSubscriber interface {
	SubscribedMetrics() []string
}

type MetricType

// MetricType defines the type of a metric, including raw metrics and queries.
// Exactly one of the two fields is expected to be set for a given metric.
type MetricType struct {
	Raw   RawMetricType // Optional: Represents the type of raw metric.
	Query QueryType     // Optional: Represents the query type for derived metrics.
}

MetricType defines the type of a metric, including raw metrics and queries.

func (MetricType) IsQuery

func (m MetricType) IsQuery() bool

func (MetricType) IsRawMetric

func (m MetricType) IsRawMetric() bool

type MetricValue

// MetricValue is the interface for all metric values. Each implementation
// (simple, histogram, Prometheus result, label value) returns its own
// representation from the matching getter and a zero/empty value from the rest.
type MetricValue interface {
	GetSimpleValue() float64
	GetHistogramValue() *HistogramMetricValue
	GetPrometheusResult() *model.Value
	GetLabelValue() string
}

MetricValue is the interface for all metric values.

type PrometheusMetricValue

// PrometheusMetricValue represents Prometheus query results.
type PrometheusMetricValue struct {
	Result *model.Value
}

PrometheusMetricValue represents Prometheus query results.

func (*PrometheusMetricValue) GetHistogramValue

func (p *PrometheusMetricValue) GetHistogramValue() *HistogramMetricValue

func (*PrometheusMetricValue) GetLabelValue

func (s *PrometheusMetricValue) GetLabelValue() string

func (*PrometheusMetricValue) GetPrometheusResult

func (p *PrometheusMetricValue) GetPrometheusResult() *model.Value

func (*PrometheusMetricValue) GetSimpleValue

func (p *PrometheusMetricValue) GetSimpleValue() float64

type QueryType

type QueryType string

QueryType defines the type of metric query, such as PromQL.

// Supported query types.
const (
	PromQL     QueryType = "PromQL"     // PromQL represents a Prometheus query language expression.
	QueryLabel QueryType = "QueryLabel" // Query Label value from raw metrics.
)

type RawMetricType

type RawMetricType string

RawMetricType defines the type of raw metrics (e.g., collected directly from a source).

// Supported raw metric types, mirroring the Prometheus exposition types.
const (
	Gauge     RawMetricType = "Gauge"     // Gauge represents a snapshot value.
	Counter   RawMetricType = "Counter"   // Counter represents a cumulative value.
	Histogram RawMetricType = "Histogram" // Histogram represents a distribution of values.
)

type Server added in v0.4.0

// Server serves this package's metrics over HTTP at the address given to
// NewServer; start and stop it via the Start/Stop methods.
// NOTE(review): fields are unexported and not visible here — behavior inferred
// from the NewServer/Start/Stop signatures; confirm against the implementation.
type Server struct {
	// contains filtered or unexported fields
}

func NewServer added in v0.4.0

func NewServer(addr string) *Server

func (*Server) Start added in v0.4.0

func (s *Server) Start() error

func (*Server) Stop added in v0.4.0

func (s *Server) Stop() error

type SimpleMetricValue

// SimpleMetricValue represents simple metrics (e.g., gauge or counter) as a
// single float64 sample.
type SimpleMetricValue struct {
	Value float64
}

SimpleMetricValue represents simple metrics (e.g., gauge or counter).

func (*SimpleMetricValue) GetHistogramValue

func (s *SimpleMetricValue) GetHistogramValue() *HistogramMetricValue

func (*SimpleMetricValue) GetLabelValue

func (s *SimpleMetricValue) GetLabelValue() string

func (*SimpleMetricValue) GetPrometheusResult

func (s *SimpleMetricValue) GetPrometheusResult() *model.Value

func (*SimpleMetricValue) GetSimpleValue

func (s *SimpleMetricValue) GetSimpleValue() float64

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL