Documentation ¶
Index ¶
- Constants
- Variables
- func BuildQuery(queryTemplate string, queryLabels map[string]string) string
- func GetCounterGaugeValue(metric *dto.Metric, metricType dto.MetricType) (float64, error)
- func GetGaugeValueForTest(name string, labelValues ...string) float64
- func GetLabelValueForKey(metric *dto.Metric, key string) (string, error)
- func GetMetricHelp(metricName string) string
- func IncrementCounterMetric(name string, help string, value float64, labelNames []string, ...)
- func InitializePrometheusAPI(endpoint, username, password string) (prometheusv1.API, error)
- func ParseMetricFromBody(body []byte, metricName string) (float64, error)
- func ParseMetricsURLWithContext(ctx context.Context, url string) (map[string]*dto.MetricFamily, error)
- func SetGaugeMetric(name string, help string, value float64, labelNames []string, ...)
- func SetupCounterMetricsForTest(metricName string, labelNames []string) (*prometheus.CounterVec, func())
- func SetupMetricsForTest(metricName string, labelNames []string) (*prometheus.GaugeVec, func())
- type HistogramMetricValue
- func (h *HistogramMetricValue) GetBucketValue(bucket string) (float64, bool)
- func (h *HistogramMetricValue) GetCount() float64
- func (h *HistogramMetricValue) GetHistogramValue() *HistogramMetricValue
- func (s *HistogramMetricValue) GetLabelValue() string
- func (h *HistogramMetricValue) GetMean() float64
- func (h *HistogramMetricValue) GetPercentile(percentile float64) (float64, error)
- func (h *HistogramMetricValue) GetPrometheusResult() *model.Value
- func (h *HistogramMetricValue) GetSimpleValue() float64
- func (h *HistogramMetricValue) GetSum() float64
- func (h *HistogramMetricValue) GetValue() interface{}
- type LabelValueMetricValue
- type Metric
- type MetricScope
- type MetricSource
- type MetricSubscriber
- type MetricType
- type MetricValue
- type PrometheusMetricValue
- type QueryType
- type RawMetricType
- type Server
- type SimpleMetricValue
Constants ¶
const (
	NumRequestsRunning                   = "num_requests_running"
	NumRequestsWaiting                   = "num_requests_waiting"
	NumRequestsSwapped                   = "num_requests_swapped"
	AvgPromptThroughputToksPerS          = "avg_prompt_throughput_toks_per_s"
	AvgGenerationThroughputToksPerS      = "avg_generation_throughput_toks_per_s"
	IterationTokensTotal                 = "iteration_tokens_total"
	TimeToFirstTokenSeconds              = "time_to_first_token_seconds"
	TimePerOutputTokenSeconds            = "time_per_output_token_seconds"
	E2ERequestLatencySeconds             = "e2e_request_latency_seconds"
	RequestQueueTimeSeconds              = "request_queue_time_seconds"
	RequestInferenceTimeSeconds          = "request_inference_time_seconds"
	RequestDecodeTimeSeconds             = "request_decode_time_seconds"
	RequestPrefillTimeSeconds            = "request_prefill_time_seconds"
	P95TTFT5m                            = "p95_ttft_5m"
	P95TTFT5mPod                         = "p95_ttft_5m_pod"
	AvgTTFT5mPod                         = "avg_ttft_5m_pod"
	P95TPOT5mPod                         = "p95_tpot_5m_pod"
	AvgTPOT5mPod                         = "avg_tpot_pod_5m"
	AvgPromptToksPerReq                  = "avg_prompt_toks_per_req"
	AvgGenerationToksPerReq              = "avg_generation_toks_per_req"
	GPUCacheUsagePerc                    = "gpu_cache_usage_perc"
	GPUBusyTimeRatio                     = "gpu_busy_time_ratio"
	CPUCacheUsagePerc                    = "cpu_cache_usage_perc"
	EngineUtilization                    = "engine_utilization"
	AvgE2ELatencyPod                     = "avg_e2e_latency_pod"
	AvgRequestsPerMinPod                 = "avg_requests_per_min_pod"
	AvgPromptThroughputToksPerMinPod     = "avg_prompt_throughput_toks_per_min_pod"
	AvgGenerationThroughputToksPerMinPod = "avg_generation_throughput_toks_per_min_pod"
	MaxLora                              = "max_lora"
	WaitingLoraAdapters                  = "waiting_lora_adapters"
	RunningLoraAdapters                  = "running_lora_adapters"
	VTCBucketSizeActive                  = "vtc_bucket_size_active"

	// Realtime metrics
	RealtimeNumRequestsRunning = "realtime_num_requests_running"
	RealtimeNormalizedPendings = "realtime_normalized_pendings"
)
Variables ¶
var (
	// Function variables that can be overridden for testing
	SetGaugeMetricFnForTest         = defaultSetGaugeMetric
	IncrementCounterMetricFnForTest = defaultIncrementCounterMetric
)
var (
	// Metrics defines all available metrics, including raw and query-based metrics.
	Metrics = map[string]Metric{
		NumRequestsRunning: {
			MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{Raw: Counter},
			EngineMetricsNameMapping: map[string]string{"vllm": "vllm:num_requests_running", "sglang": "sglang:num_running_reqs"},
			Description: "Number of running requests",
		},
		NumRequestsWaiting: {
			MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{Raw: Counter},
			EngineMetricsNameMapping: map[string]string{"vllm": "vllm:num_requests_waiting"},
			Description: "Number of waiting requests",
		},
		NumRequestsSwapped: {
			MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{Raw: Counter},
			EngineMetricsNameMapping: map[string]string{"vllm": "vllm:num_requests_swapped"},
			Description: "Number of swapped requests",
		},
		AvgPromptThroughputToksPerS: {
			MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{Raw: Gauge},
			EngineMetricsNameMapping: map[string]string{"vllm": "vllm:avg_prompt_throughput_toks_per_s"},
			Description: "Average prompt throughput in tokens per second",
		},
		AvgGenerationThroughputToksPerS: {
			MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{Raw: Gauge},
			EngineMetricsNameMapping: map[string]string{"vllm": "vllm:avg_generation_throughput_toks_per_s", "sglang": "sglang:gen_throughput"},
			Description: "Average generation throughput in tokens per second",
		},
		IterationTokensTotal: {
			MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{Raw: Histogram},
			EngineMetricsNameMapping: map[string]string{"vllm": "vllm:iteration_tokens_total"},
			Description: "Total iteration tokens",
		},
		TimeToFirstTokenSeconds: {
			MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{Raw: Histogram},
			EngineMetricsNameMapping: map[string]string{"vllm": "vllm:time_to_first_token_seconds", "sglang": "sglang:time_to_first_token_seconds"},
			Description: "Time to first token in seconds",
		},
		TimePerOutputTokenSeconds: {
			MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{Raw: Histogram},
			EngineMetricsNameMapping: map[string]string{"vllm": "vllm:time_per_output_token_seconds", "sglang": "sglang:inter_token_latency_seconds"},
			Description: "Time per output token in seconds",
		},
		E2ERequestLatencySeconds: {
			MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{Raw: Histogram},
			EngineMetricsNameMapping: map[string]string{"vllm": "vllm:e2e_request_latency_seconds", "sglang": "sglang:e2e_request_latency_seconds"},
			Description: "End-to-end request latency in seconds",
		},
		RequestQueueTimeSeconds: {
			MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{Raw: Histogram},
			EngineMetricsNameMapping: map[string]string{"vllm": "vllm:request_queue_time_seconds"},
			Description: "Request queue time in seconds",
		},
		RequestInferenceTimeSeconds: {
			MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{Raw: Histogram},
			EngineMetricsNameMapping: map[string]string{"vllm": "vllm:request_inference_time_seconds"},
			Description: "Request inference time in seconds",
		},
		RequestDecodeTimeSeconds: {
			MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{Raw: Histogram},
			EngineMetricsNameMapping: map[string]string{"vllm": "vllm:request_decode_time_seconds"},
			Description: "Request decode time in seconds",
		},
		RequestPrefillTimeSeconds: {
			MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{Raw: Histogram},
			EngineMetricsNameMapping: map[string]string{"vllm": "vllm:request_prefill_time_seconds"},
			Description: "Request prefill time in seconds",
		},
		P95TTFT5m: {
			MetricScope: PodModelMetricScope, MetricSource: PrometheusEndpoint, MetricType: MetricType{Query: PromQL},
			PromQL:      `histogram_quantile(0.95, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{instance="${instance}", model_name="${model_name}", job="pods"}[5m])))`,
			Description: "95th ttft in last 5 mins",
		},
		P95TTFT5mPod: {
			MetricScope: PodMetricScope, MetricSource: PrometheusEndpoint, MetricType: MetricType{Query: PromQL},
			PromQL:      `histogram_quantile(0.95, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{instance="${instance}", job="pods"}[5m])))`,
			Description: "95th ttft in last 5 mins",
		},
		AvgTTFT5mPod: {
			MetricScope: PodMetricScope, MetricSource: PrometheusEndpoint, MetricType: MetricType{Query: PromQL},
			PromQL:      `increase(vllm:time_to_first_token_seconds_sum{instance="${instance}", job="pods"}[5m]) / increase(vllm:time_to_first_token_seconds_count{instance="${instance}", job="pods"}[5m])`,
			Description: "Average ttft in last 5 mins",
		},
		P95TPOT5mPod: {
			MetricScope: PodMetricScope, MetricSource: PrometheusEndpoint, MetricType: MetricType{Query: PromQL},
			PromQL:      `histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{instance="${instance}", job="pods"}[5m])))`,
			Description: "95th tpot in last 5 mins",
		},
		AvgTPOT5mPod: {
			MetricScope: PodMetricScope, MetricSource: PrometheusEndpoint, MetricType: MetricType{Query: PromQL},
			PromQL:      `increase(vllm:time_per_output_token_seconds_sum{instance="${instance}", job="pods"}[5m]) / increase(vllm:time_per_output_token_seconds_count{instance="${instance}", job="pods"}[5m])`,
			Description: "Average tpot in last 5 mins",
		},
		AvgPromptToksPerReq: {
			MetricScope: PodModelMetricScope, MetricSource: PrometheusEndpoint, MetricType: MetricType{Query: PromQL},
			PromQL:      `increase(vllm:request_prompt_tokens_sum{instance="${instance}", model_name="${model_name}", job="pods"}[1d]) / increase(vllm:request_prompt_tokens_count{instance="${instance}", model_name="${model_name}", job="pods"}[1d])`,
			Description: "Average prompt tokens per request in last day",
		},
		AvgGenerationToksPerReq: {
			MetricScope: PodModelMetricScope, MetricSource: PrometheusEndpoint, MetricType: MetricType{Query: PromQL},
			PromQL:      `increase(vllm:request_generation_tokens_sum{instance="${instance}", model_name="${model_name}", job="pods"}[1d]) / increase(vllm:request_generation_tokens_count{instance="${instance}", model_name="${model_name}", job="pods"}[1d])`,
			Description: "Average generation tokens per request in last day",
		},
		GPUCacheUsagePerc: {
			MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{Raw: Counter},
			EngineMetricsNameMapping: map[string]string{"vllm": "vllm:gpu_cache_usage_perc", "sglang": "sglang:token_usage", "xllm": "kv_cache_utilization"},
			Description: "GPU cache usage percentage",
		},
		EngineUtilization: {
			MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{Raw: Gauge},
			EngineMetricsNameMapping: map[string]string{"xllm": "engine_utilization"},
			Description: "GPU busy time ratio",
		},
		CPUCacheUsagePerc: {
			MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{Raw: Counter},
			EngineMetricsNameMapping: map[string]string{"vllm": "vllm:cpu_cache_usage_perc"},
			Description: "CPU cache usage percentage",
		},
		AvgE2ELatencyPod: {
			MetricScope: PodMetricScope, MetricSource: PrometheusEndpoint, MetricType: MetricType{Query: PromQL},
			PromQL:      `increase(vllm:e2e_request_latency_seconds_sum{instance="${instance}", job="pods"}[5m]) / increase(vllm:e2e_request_latency_seconds_count{instance="${instance}", job="pods"}[5m])`,
			Description: "Average End-to-end latency in last 5 mins",
		},
		AvgRequestsPerMinPod: {
			MetricScope: PodMetricScope, MetricSource: PrometheusEndpoint, MetricType: MetricType{Query: PromQL},
			PromQL:      `increase(vllm:request_success_total{instance="${instance}", job="pods"}[5m]) / 5`,
			Description: "Average requests throughput per minute in last 5 mins",
		},
		AvgPromptThroughputToksPerMinPod: {
			MetricScope: PodMetricScope, MetricSource: PrometheusEndpoint, MetricType: MetricType{Query: PromQL},
			PromQL:      `increase(vllm:prompt_tokens_total{instance="${instance}", job="pods"}[5m]) / 5`,
			Description: "Average prompt throughput in tokens per minute in last 5 mins",
		},
		AvgGenerationThroughputToksPerMinPod: {
			MetricScope: PodMetricScope, MetricSource: PrometheusEndpoint, MetricType: MetricType{Query: PromQL},
			PromQL:      `increase(vllm:generation_tokens_total{instance="${instance}", job="pods"}[5m]) / 5`,
			Description: "Average generation throughput in tokens per minute in last 5 mins",
		},
		MaxLora: {
			MetricScope: PodMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{Query: QueryLabel},
			RawMetricName:            "lora_requests_info",
			EngineMetricsNameMapping: map[string]string{"vllm": "vllm:max_lora"},
			Description:              "Max count of Lora Adapters",
		},
		RunningLoraAdapters: {
			MetricScope: PodMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{Query: QueryLabel},
			RawMetricName:            "lora_requests_info",
			EngineMetricsNameMapping: map[string]string{"vllm": "vllm:running_lora_adapters"},
			Description:              "Count of running Lora Adapters",
		},
		WaitingLoraAdapters: {
			MetricScope: PodMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{Query: QueryLabel},
			RawMetricName:            "lora_requests_info",
			EngineMetricsNameMapping: map[string]string{"vllm": "vllm:waiting_lora_adapters"},
			Description:              "Count of waiting Lora Adapters",
		},
		VTCBucketSizeActive: {
			MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{Raw: Gauge},
			Description: "Current adaptive bucket size used by VTC algorithm for token normalization",
		},
	}
)
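As a hedged sketch of how this registry can be consumed (the import path and the describeMetric helper below are illustrative, not part of the package), a caller can look up a definition by its constant name and branch on how the metric is collected:

package main

import (
	"fmt"

	"github.com/vllm-project/aibrix/pkg/metrics" // import path assumed for illustration
)

func describeMetric(name string) {
	m, ok := metrics.Metrics[name]
	if !ok {
		fmt.Printf("unknown metric %q\n", name)
		return
	}
	switch {
	case m.MetricType.IsRawMetric():
		// Raw metrics are scraped from the engine's metrics port; the
		// engine-specific name comes from EngineMetricsNameMapping.
		fmt.Printf("%s (raw %v): vllm name %q\n", name, m.MetricType.Raw, m.EngineMetricsNameMapping["vllm"])
	case m.MetricType.IsQuery():
		// Query metrics carry a PromQL template (or a RawMetricName for label-based queries).
		fmt.Printf("%s (query %v): %s\n", name, m.MetricType.Query, m.PromQL)
	}
}

func main() {
	describeMetric(metrics.NumRequestsRunning)
	describeMetric(metrics.P95TTFT5mPod)
}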
Functions ¶
func BuildQuery ¶
func BuildQuery(queryTemplate string, queryLabels map[string]string) string
BuildQuery dynamically injects labels into a PromQL query template.
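A minimal usage sketch, assuming the ${label} placeholder syntax visible in the PromQL templates above (the import path and instance value are illustrative):

package main

import (
	"fmt"

	"github.com/vllm-project/aibrix/pkg/metrics" // import path assumed for illustration
)

func main() {
	// Take the PromQL template registered for P95TTFT5mPod and fill in its labels.
	tmpl := metrics.Metrics[metrics.P95TTFT5mPod].PromQL
	query := metrics.BuildQuery(tmpl, map[string]string{
		"instance": "10.0.0.12:8000",
	})
	fmt.Println(query) // placeholders such as ${instance} are replaced with the supplied values
}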
func GetCounterGaugeValue ¶
func GetCounterGaugeValue(metric *dto.Metric, metricType dto.MetricType) (float64, error)
func GetGaugeValueForTest ¶
func GetGaugeValueForTest(name string, labelValues ...string) float64
func GetMetricHelp ¶
func GetMetricHelp(metricName string) string
func IncrementCounterMetric ¶
func InitializePrometheusAPI ¶
func InitializePrometheusAPI(endpoint, username, password string) (prometheusv1.API, error)
InitializePrometheusAPI initializes the Prometheus API client.
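A hedged sketch of wiring the returned client into a query; the endpoint, credentials, and import path are placeholders, and prometheusv1 refers to the Prometheus api/prometheus/v1 client package:

package main

import (
	"context"
	"fmt"
	"log"
	"time"

	"github.com/vllm-project/aibrix/pkg/metrics" // import path assumed for illustration
)

func main() {
	// Username and password may be empty for an unauthenticated endpoint.
	api, err := metrics.InitializePrometheusAPI("http://prometheus.monitoring:9090", "", "")
	if err != nil {
		log.Fatalf("init prometheus client: %v", err)
	}

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// Run one of the registered PromQL templates after label substitution.
	query := metrics.BuildQuery(metrics.Metrics[metrics.AvgTTFT5mPod].PromQL,
		map[string]string{"instance": "10.0.0.12:8000"})
	result, warnings, err := api.Query(ctx, query, time.Now())
	if err != nil {
		log.Fatalf("query failed: %v", err)
	}
	if len(warnings) > 0 {
		log.Printf("warnings: %v", warnings)
	}
	fmt.Println(result)
}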
func ParseMetricFromBody ¶
func ParseMetricFromBody(body []byte, metricName string) (float64, error)
ParseMetricFromBody parses a simple metric from the Prometheus response body.
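For example, scraping a pod's metrics endpoint and extracting one gauge or counter might look like the following sketch (the URL, engine key, and import path are placeholders):

package main

import (
	"fmt"
	"io"
	"log"
	"net/http"

	"github.com/vllm-project/aibrix/pkg/metrics" // import path assumed for illustration
)

func main() {
	// Scrape the engine's metrics port directly (the PodRawMetrics source).
	resp, err := http.Get("http://10.0.0.12:8000/metrics")
	if err != nil {
		log.Fatalf("scrape failed: %v", err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		log.Fatalf("read body: %v", err)
	}

	// Look up the engine-specific name for a registered metric, then parse its value.
	raw := metrics.Metrics[metrics.NumRequestsRunning].EngineMetricsNameMapping["vllm"]
	value, err := metrics.ParseMetricFromBody(body, raw)
	if err != nil {
		log.Fatalf("parse metric: %v", err)
	}
	fmt.Printf("%s = %f\n", raw, value)
}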
func ParseMetricsURLWithContext ¶ added in v0.4.0
func ParseMetricsURLWithContext(ctx context.Context, url string) (map[string]*dto.MetricFamily, error)
func SetGaugeMetric ¶
func SetupCounterMetricsForTest ¶
func SetupCounterMetricsForTest(metricName string, labelNames []string) (*prometheus.CounterVec, func())
func SetupMetricsForTest ¶
func SetupMetricsForTest(metricName string, labelNames []string) (*prometheus.GaugeVec, func())
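A hedged test sketch combining the helpers above; it assumes the returned cleanup function unregisters the temporary collector and that GetGaugeValueForTest can read back the gauge registered by SetupMetricsForTest, neither of which is spelled out here:

package metrics // illustrative in-package test

import "testing"

func TestGaugeHelper(t *testing.T) {
	// Register a temporary GaugeVec for this test and defer its cleanup.
	gauge, cleanup := SetupMetricsForTest("test_gauge_metric", []string{"model"})
	defer cleanup()

	gauge.WithLabelValues("llama-3").Set(42)

	// Read the value back through the test accessor (assumed to use the same registry).
	if got := GetGaugeValueForTest("test_gauge_metric", "llama-3"); got != 42 {
		t.Fatalf("expected 42, got %f", got)
	}
}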
Types ¶
type HistogramMetricValue ¶
type HistogramMetricValue struct {
	Sum     float64
	Count   float64
	Buckets map[string]float64 // e.g., {"0.1": 5, "0.5": 3, "1.0": 2}
}
HistogramMetricValue represents a detailed histogram metric.
func GetHistogramValue ¶
func GetHistogramValue(metric *dto.Metric) (*HistogramMetricValue, error)
func ParseHistogramFromBody ¶
func ParseHistogramFromBody(body []byte, metricName string) (*HistogramMetricValue, error)
ParseHistogramFromBody parses a histogram metric from the Prometheus response body.
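A sketch of parsing a histogram and reading its summary statistics; the URL, metric name, and import path are placeholders, bucket keys follow the string form shown in the Buckets field above, and whether GetPercentile expects 95 or 0.95 is an assumption:

package main

import (
	"fmt"
	"io"
	"log"
	"net/http"

	"github.com/vllm-project/aibrix/pkg/metrics" // import path assumed for illustration
)

func main() {
	resp, err := http.Get("http://10.0.0.12:8000/metrics")
	if err != nil {
		log.Fatalf("scrape failed: %v", err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		log.Fatalf("read body: %v", err)
	}

	hist, err := metrics.ParseHistogramFromBody(body, "vllm:time_to_first_token_seconds")
	if err != nil {
		log.Fatalf("parse histogram: %v", err)
	}

	fmt.Printf("mean TTFT: %fs over %f samples\n", hist.GetMean(), hist.GetCount())
	if p95, err := hist.GetPercentile(95); err == nil { // argument scale (95 vs 0.95) is assumed
		fmt.Printf("approx p95 TTFT: %fs\n", p95)
	}
	if c, ok := hist.GetBucketValue("0.5"); ok {
		fmt.Printf("samples with TTFT <= 0.5s: %f\n", c)
	}
}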
func (*HistogramMetricValue) GetBucketValue ¶
func (h *HistogramMetricValue) GetBucketValue(bucket string) (float64, bool)
GetBucketValue returns the count for a specific bucket.
func (*HistogramMetricValue) GetCount ¶
func (h *HistogramMetricValue) GetCount() float64
GetCount returns the total count of values in the histogram.
func (*HistogramMetricValue) GetHistogramValue ¶
func (h *HistogramMetricValue) GetHistogramValue() *HistogramMetricValue
func (*HistogramMetricValue) GetLabelValue ¶
func (s *HistogramMetricValue) GetLabelValue() string
func (*HistogramMetricValue) GetMean ¶
func (h *HistogramMetricValue) GetMean() float64
GetMean returns the mean value of the histogram (Sum / Count).
func (*HistogramMetricValue) GetPercentile ¶
func (h *HistogramMetricValue) GetPercentile(percentile float64) (float64, error)
func (*HistogramMetricValue) GetPrometheusResult ¶
func (h *HistogramMetricValue) GetPrometheusResult() *model.Value
func (*HistogramMetricValue) GetSimpleValue ¶
func (h *HistogramMetricValue) GetSimpleValue() float64
func (*HistogramMetricValue) GetSum ¶
func (h *HistogramMetricValue) GetSum() float64
GetSum returns the sum of the histogram values.
func (*HistogramMetricValue) GetValue ¶
func (h *HistogramMetricValue) GetValue() interface{}
type LabelValueMetricValue ¶
type LabelValueMetricValue struct {
Value string
}
LabelValueMetricValue represents a metric whose value is carried as a label string (e.g., label-based queries such as lora_requests_info).
func (*LabelValueMetricValue) GetHistogramValue ¶
func (l *LabelValueMetricValue) GetHistogramValue() *HistogramMetricValue
func (*LabelValueMetricValue) GetLabelValue ¶
func (l *LabelValueMetricValue) GetLabelValue() string
func (*LabelValueMetricValue) GetPrometheusResult ¶
func (l *LabelValueMetricValue) GetPrometheusResult() *model.Value
func (*LabelValueMetricValue) GetSimpleValue ¶
func (l *LabelValueMetricValue) GetSimpleValue() float64
type Metric ¶
type Metric struct {
	MetricSource             MetricSource
	MetricType               MetricType
	PromQL                   string            // Optional: Only applicable for PromQL-based metrics
	RawMetricName            string            // Optional: Only applicable for QueryLabel-based metrics
	EngineMetricsNameMapping map[string]string // Optional: Mapping from engine type to raw metric name.
	Description              string
	MetricScope              MetricScope
}
Metric defines a unique metric with metadata.
type MetricScope ¶
type MetricScope string
MetricScope defines the scope of a metric (e.g., Model, Pod, or PodModel).
const (
	ModelMetricScope    MetricScope = "Model"
	PodMetricScope      MetricScope = "Pod"
	PodModelMetricScope MetricScope = "PodModel" // model in pod
)
type MetricSource ¶
type MetricSource string
MetricSource defines the source a metric is collected from.
const (
	// PrometheusEndpoint indicates metrics are queried from a remote Prometheus server.
	// This source allows querying both raw and aggregated metrics, leveraging PromQL for advanced analytics.
	PrometheusEndpoint MetricSource = "PrometheusEndpoint"

	// PodRawMetrics indicates metrics are collected directly from the metricPort of a Pod.
	PodRawMetrics MetricSource = "PodRawMetrics"
)
type MetricSubscriber ¶
type MetricSubscriber interface {
SubscribedMetrics() []string
}
type MetricType ¶
type MetricType struct {
	Raw   RawMetricType // Optional: Represents the type of raw metric.
	Query QueryType     // Optional: Represents the query type for derived metrics.
}
MetricType defines the type of a metric, including raw metrics and queries.
func (MetricType) IsQuery ¶
func (m MetricType) IsQuery() bool
func (MetricType) IsRawMetric ¶
func (m MetricType) IsRawMetric() bool
type MetricValue ¶
type MetricValue interface {
	GetSimpleValue() float64
	GetHistogramValue() *HistogramMetricValue
	GetPrometheusResult() *model.Value
	GetLabelValue() string
}
MetricValue is the interface for all metric values.
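A hedged consumer sketch: every concrete value type above implements all four accessors, so callers can branch on which accessor returns useful data. The assumption that GetHistogramValue returns nil for non-histogram implementations, and the import path, are illustrative:

package main

import (
	"fmt"

	"github.com/vllm-project/aibrix/pkg/metrics" // import path assumed for illustration
)

// printValue prints whichever representation the value carries.
func printValue(v metrics.MetricValue) {
	if h := v.GetHistogramValue(); h != nil { // assumed nil for non-histogram values
		fmt.Printf("histogram mean: %f\n", h.GetMean())
		return
	}
	fmt.Printf("simple value: %f\n", v.GetSimpleValue())
}

func main() {
	printValue(&metrics.SimpleMetricValue{Value: 3.14})
	printValue(&metrics.HistogramMetricValue{Sum: 10, Count: 4})
}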
type PrometheusMetricValue ¶
PrometheusMetricValue represents Prometheus query results.
func (*PrometheusMetricValue) GetHistogramValue ¶
func (p *PrometheusMetricValue) GetHistogramValue() *HistogramMetricValue
func (*PrometheusMetricValue) GetLabelValue ¶
func (s *PrometheusMetricValue) GetLabelValue() string
func (*PrometheusMetricValue) GetPrometheusResult ¶
func (p *PrometheusMetricValue) GetPrometheusResult() *model.Value
func (*PrometheusMetricValue) GetSimpleValue ¶
func (p *PrometheusMetricValue) GetSimpleValue() float64
type RawMetricType ¶
type RawMetricType string
RawMetricType defines the type of raw metrics (e.g., collected directly from a source).
const (
	Gauge     RawMetricType = "Gauge"     // Gauge represents a snapshot value.
	Counter   RawMetricType = "Counter"   // Counter represents a cumulative value.
	Histogram RawMetricType = "Histogram" // Histogram represents a distribution of values.
)
type SimpleMetricValue ¶
type SimpleMetricValue struct {
Value float64
}
SimpleMetricValue represents simple metrics (e.g., gauge or counter).
func (*SimpleMetricValue) GetHistogramValue ¶
func (s *SimpleMetricValue) GetHistogramValue() *HistogramMetricValue
func (*SimpleMetricValue) GetLabelValue ¶
func (s *SimpleMetricValue) GetLabelValue() string
func (*SimpleMetricValue) GetPrometheusResult ¶
func (s *SimpleMetricValue) GetPrometheusResult() *model.Value
func (*SimpleMetricValue) GetSimpleValue ¶
func (s *SimpleMetricValue) GetSimpleValue() float64