Documentation
¶
Index ¶
- Constants
- Variables
- func BuildStubEmbedding(tokens []uint32, dim int) []float32
- func CreateSub(ctx context.Context, topic string) (zmq4.Socket, string)
- func FinishReason(maxTokens *int64, respLen int) string
- func GenerateChatLogprobs(tokens []string, topLogprobsCount int) *openaiserverapi.ChatLogprobs
- func GenerateSingleTokenChatLogprobs(token string, tokenPosition int, topLogprobsCount int) *openaiserverapi.LogprobsContent
- func GenerateSingleTokenTextLogprobs(token string, tokenPosition int, logprobsCount int) *openaiserverapi.TextLogprobs
- func GenerateTextLogprobs(tokens []string, logprobsCount int) *openaiserverapi.TextLogprobs
- func MaxIntSlice(numbers []int) (int, error)
- func NewSub(ctx context.Context) zmq4.Socket
- func StartSub(sub zmq4.Socket, endpoint string, topic string) string
- func ValidateContextWindow(promptTokens int, maxCompletionTokens *int64, maxModelLen int) (bool, int64, int64)
- func WriteToChannel[T any](channel Channel[T], object T, logger logr.Logger)
- type Channel
- type Configuration
- type Duration
- type FakeMetricWithFunction
- type FakeMetrics
- type FunctionInfo
- type LoraModule
- type LorasMetrics
- type MetricInfo
- type Publisher
- type Random
- func (r *Random) FlipCoin() bool
- func (r *Random) GenerateUUIDString() string
- func (r *Random) RandomBool(probability int) bool
- func (r *Random) RandomFloat(min float64, max float64) float64
- func (r *Random) RandomInt(min int, max int) int
- func (r *Random) RandomNorm(mean int, stddev int) float64
- func (r *Random) RandomNormDuration(mean, stddev time.Duration) time.Duration
- func (r *Random) RandomNormTruncated(mean int, stddev int) int
- func (r *Random) RandomNumericString(length int) string
Constants ¶
const ( ModeRandom = "random" ModeEcho = "echo" // Failure type constants FailureTypeRateLimit = "rate_limit" FailureTypeInvalidAPIKey = "invalid_api_key" FailureTypeContextLength = "context_length" FailureTypeServerError = "server_error" FailureTypeInvalidRequest = "invalid_request" FailureTypeModelNotFound = "model_not_found" StopFinishReason = "stop" LengthFinishReason = "length" ToolsFinishReason = "tool_calls" RemoteDecodeFinishReason = "remote_decode" CacheThresholdFinishReason = "cache_threshold" DefaultLatencyCalculator = "" ConstantLatencyCalculator = "constant" PerPromptTokenLatencyCalculator = "per-token" DefaultDSTableName = "llmd" )
const ( OscillateFuncName = "oscillate" RampFuncName = "ramp" RampWithResetFuncName = "rampreset" SquarewaveFuncName = "squarewave" )
const ( PodNameEnv = "POD_NAME" PodNsEnv = "POD_NAMESPACE" )
const ( QwenModelName = "Qwen/Qwen2-0.5B" TestModelName = "testmodel" )
constants
const InvalidMaxTokensErrMsg = "Max completion tokens and max tokens should be positive"
Variables ¶
var RequestLatencyBucketsBoundaries = []float64{0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0}
var TPOTBucketsBoundaries = []float64{0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0}
var TTFTBucketsBoundaries = []float64{0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0,
2560.0}
Definition of buckets for the time-to-first-token, time-per-output-token, and end-to-end request latency metrics; each value is the upper boundary of a bucket
Functions ¶
func BuildStubEmbedding ¶ added in v0.8.0
BuildStubEmbedding returns a deterministic embedding of length dim from token ids (simulator stub).
func CreateSub ¶ added in v0.7.0
CreateSub creates a ZMQ sub, subscribes to the provided topic, and returns the sub and the endpoint to publish events on
func FinishReason ¶ added in v0.7.0
FinishReason returns finish reason based on request's max tokens parameter and the length of the generated response
func GenerateChatLogprobs ¶ added in v0.6.1
func GenerateChatLogprobs(tokens []string, topLogprobsCount int) *openaiserverapi.ChatLogprobs
GenerateChatLogprobs generates synthetic log probabilities for chat completion responses
func GenerateSingleTokenChatLogprobs ¶ added in v0.6.1
func GenerateSingleTokenChatLogprobs(token string, tokenPosition int, topLogprobsCount int) *openaiserverapi.LogprobsContent
GenerateSingleTokenChatLogprobs generates logprobs for a single token in chat completion streaming
func GenerateSingleTokenTextLogprobs ¶ added in v0.6.1
func GenerateSingleTokenTextLogprobs(token string, tokenPosition int, logprobsCount int) *openaiserverapi.TextLogprobs
GenerateSingleTokenTextLogprobs generates logprobs for a single token in text completion streaming
func GenerateTextLogprobs ¶ added in v0.6.1
func GenerateTextLogprobs(tokens []string, logprobsCount int) *openaiserverapi.TextLogprobs
GenerateTextLogprobs generates synthetic log probabilities for text completion responses
func MaxIntSlice ¶ added in v0.7.0
MaxIntSlice receives a slice of ints, returns the maximum value in the slice if it is not empty, and an error if the slice is empty
func StartSub ¶ added in v0.8.0
StartSub starts the given sub on a random port and subscribes it to the given topic. Returns the real endpoint to publish events on.
Types ¶
type Configuration ¶
type Configuration struct {
// IP defines on which IP the simulator runs, loaded from env
IP string
// Port defines on which port the simulator runs
Port int `yaml:"port" json:"port"`
// Model defines the current base model name
Model string `yaml:"model" json:"model"`
// ServedModelNames is one or many model names exposed by the API
ServedModelNames []string `yaml:"served-model-name" json:"served-model-name"`
// MaxLoras defines maximum number of loaded LoRAs
MaxLoras int `yaml:"max-loras" json:"max-loras"`
// MaxCPULoras defines maximum number of LoRAs to store in CPU memory
MaxCPULoras int `yaml:"max-cpu-loras" json:"max-cpu-loras"`
// MaxNumSeqs is maximum number of sequences per iteration (the maximum
// number of inference requests that could be processed at the same time)
MaxNumSeqs int `yaml:"max-num-seqs" json:"max-num-seqs"`
// MaxWaitingQueueLength defines maximum size of waiting requests queue
MaxWaitingQueueLength int `yaml:"max-waiting-queue-length" json:"max-waiting-queue-length"`
// MaxModelLen is the model's context window, the maximum number of tokens
// in a single request including input and output. Default value is 1024.
MaxModelLen int `yaml:"max-model-len" json:"max-model-len"`
// LoraModulesString is a list of LoRA adapters as strings
LoraModulesString []string `yaml:"lora-modules" json:"lora-modules"`
// LoraModules is a list of LoRA adapters
LoraModules []LoraModule
// PodNameSpace specifies the Kubernetes namespace in which the simulator pod is running.
// Useful for multi-namespace deployments and resource scoping.
// Set by env variable POD_NAMESPACE
PodNameSpace string
// PodName specifies the name of the pod running the simulator instance.
// Used for identification in Kubernetes environments.
// Set by env variable POD_NAME
PodName string
// VllmDevMode enables development mode for the vLLM simulator
// Allowing for additional debugging features during local development and testing.
// Set by env variable VLLM_SERVER_DEV_MODE
VllmDevMode bool
// TimeToFirstToken time before the first token will be returned
TimeToFirstToken Duration `yaml:"time-to-first-token" json:"time-to-first-token"`
// TimeToFirstTokenStdDev standard deviation for time before the first token will be returned
// optional, default is 0, can't be more than 30% of TimeToFirstToken, will not
// cause the actual time to first token to differ by more than 70% from TimeToFirstToken
TimeToFirstTokenStdDev Duration `yaml:"time-to-first-token-std-dev" json:"time-to-first-token-std-dev"`
// InterTokenLatency time between generated tokens
InterTokenLatency Duration `yaml:"inter-token-latency" json:"inter-token-latency"`
// InterTokenLatencyStdDev standard deviation for time between generated tokens
// optional, default is 0, can't be more than 30% of InterTokenLatency, will not cause the actual
// inter token latency to differ by more than 70% from InterTokenLatency
InterTokenLatencyStdDev Duration `yaml:"inter-token-latency-std-dev" json:"inter-token-latency-std-dev"`
// KVCacheTransferLatency time to "transfer" kv-cache from another vLLM instance in case P/D is activated,
KVCacheTransferLatency Duration `yaml:"kv-cache-transfer-latency" json:"kv-cache-transfer-latency"`
// KVCacheTransferLatencyStdDev standard deviation for time to "transfer" kv-cache from another
// vLLM instance in case P/D is activated, can't be more than 30% of KVCacheTransferLatency, will not
// cause the actual latency to differ by more than 70% from KVCacheTransferLatency
KVCacheTransferLatencyStdDev Duration `yaml:"kv-cache-transfer-latency-std-dev" json:"kv-cache-transfer-latency-std-dev"`
// $Total Prefill Time = PrefillOverhead + n * PrefillTimePerToken$
// the assumption is that n is less than k, where k is the number of parallelism units of GPU
// PrefillOverhead time taken to prefill the context
PrefillOverhead Duration `yaml:"prefill-overhead" json:"prefill-overhead"`
PrefillTimePerToken Duration `yaml:"prefill-time-per-token" json:"prefill-time-per-token"`
// PrefillTimeStdDev is the standard deviation for the prefill time, similar to TimeToFirstTokenStdDev
PrefillTimeStdDev Duration `yaml:"prefill-time-std-dev" json:"prefill-time-std-dev"`
// $Total KV Cache Transfer Time = n * KVCacheTransferTimePerToken$
// the assumption is that the cache blocks are all missed at the remote pod
// KVCacheTransfer overhead time taken to transfer kv-cache from another vLLM instance in case P/D is activated
KVCacheTransferTimePerToken Duration `yaml:"kv-cache-transfer-time-per-token" json:"kv-cache-transfer-time-per-token"`
// KVCacheTransferTimeStdDev is the standard deviation for the kv-cache transfer time, similar to TimeToFirstTokenStdDev
KVCacheTransferTimeStdDev Duration `yaml:"kv-cache-transfer-time-std-dev" json:"kv-cache-transfer-time-std-dev"`
// TimeFactorUnderLoad is a multiplicative factor that affects the overall time taken for requests when parallel
// requests are being processed.
// The value of this factor must be >= 1.0, with a default of 1.0.
// - If this factor is 1.0, no extra time is added.
// - When the factor is x (where x > 1.0) and there are MaxNumSeqs requests, the total time will be multiplied by x.
// - The extra time then decreases multiplicatively to 1.0 when the number of requests is less than MaxNumSeqs.
TimeFactorUnderLoad float64 `yaml:"time-factor-under-load" json:"time-factor-under-load"`
// Mode defines the simulator response generation mode, valid values: echo, random
Mode string `yaml:"mode" json:"mode"`
// Seed defines random seed for operations
Seed int64 `yaml:"seed" json:"seed"`
// MaxToolCallIntegerParam defines the maximum possible value of integer parameters in a tool call,
// optional, defaults to 100
MaxToolCallIntegerParam int `yaml:"max-tool-call-integer-param" json:"max-tool-call-integer-param"`
// MinToolCallIntegerParam defines the minimum possible value of integer parameters in a tool call,
// optional, defaults to 0
MinToolCallIntegerParam int `yaml:"min-tool-call-integer-param" json:"min-tool-call-integer-param"`
// MaxToolCallNumberParam defines the maximum possible value of number (float) parameters in a tool call,
// optional, defaults to 100
MaxToolCallNumberParam float64 `yaml:"max-tool-call-number-param" json:"max-tool-call-number-param"`
// MinToolCallNumberParam defines the minimum possible value of number (float) parameters in a tool call,
// optional, defaults to 0
MinToolCallNumberParam float64 `yaml:"min-tool-call-number-param" json:"min-tool-call-number-param"`
// MaxToolCallArrayParamLength defines the maximum possible length of array parameters in a tool call,
// optional, defaults to 5
MaxToolCallArrayParamLength int `yaml:"max-tool-call-array-param-length" json:"max-tool-call-array-param-length"`
// MinToolCallArrayParamLength defines the minimum possible length of array parameters in a tool call,
// optional, defaults to 1
MinToolCallArrayParamLength int `yaml:"min-tool-call-array-param-length" json:"min-tool-call-array-param-length"`
// ToolCallNotRequiredParamProbability is the probability to add a parameter, that is not required,
// in a tool call, optional, defaults to 50
ToolCallNotRequiredParamProbability int `yaml:"tool-call-not-required-param-probability" json:"tool-call-not-required-param-probability"`
// ObjectToolCallNotRequiredParamProbability is the probability to add a field, that is not required,
// in an object in a tool call, optional, defaults to 50
ObjectToolCallNotRequiredParamProbability int `yaml:"object-tool-call-not-required-field-probability" json:"object-tool-call-not-required-field-probability"`
// EnableKVCache defines if kv cache feature will be enabled
EnableKVCache bool `yaml:"enable-kvcache" json:"enable-kvcache"`
// KVCacheSize is the maximum number of token blocks in kv cache, the default value is 1024
KVCacheSize int `yaml:"kv-cache-size" json:"kv-cache-size"`
// GlobalCacheHitThreshold is the default cache hit threshold (0-1] for all requests.
// If a request specifies cache_hit_threshold, it takes precedence over this global value.
GlobalCacheHitThreshold float64 `yaml:"global-cache-hit-threshold" json:"global-cache-hit-threshold"`
// TokenBlockSize is token block size for contiguous chunks of tokens, possible values: 8,16,32,64,128, defaults to 16
TokenBlockSize int `yaml:"block-size" json:"block-size"`
// HashSeed is the seed for hash generation (if not set, is read from PYTHONHASHSEED environment variable)
HashSeed string `yaml:"hash-seed" json:"hash-seed"`
// ZMQEndpoint is the ZMQ address to publish events, the default value is tcp://localhost:5557
ZMQEndpoint string `yaml:"zmq-endpoint" json:"zmq-endpoint"`
// EventBatchSize is the maximum number of kv-cache events to be sent together, defaults to 16
EventBatchSize int `yaml:"event-batch-size" json:"event-batch-size"`
// FakeMetrics is a set of metrics to send to Prometheus instead of the real data
FakeMetrics *FakeMetrics `yaml:"fake-metrics" json:"fake-metrics"`
// FakeMetricsRefreshInterval defines how often function-based fake metrics are recalculated, defaults to 100ms
FakeMetricsRefreshInterval time.Duration `yaml:"fake-metrics-refresh-interval" json:"fake-metrics-refresh-interval"`
// FailureInjectionRate is the probability (0-100) of injecting failures
FailureInjectionRate int `yaml:"failure-injection-rate" json:"failure-injection-rate"`
// FailureTypes is a list of specific failure types to inject (empty means all types)
FailureTypes []string `yaml:"failure-types" json:"failure-types"`
// DPSize is data parallel size - a number of ranks to run, minimum is 1, maximum is 8, default is 1
DPSize int `yaml:"data-parallel-size" json:"data-parallel-size"`
// Rank is the vLLM parameter used to specify the rank of this instance. Here only
// used when running Data Parallel ranks as separate processes. If set, data-parallel-size is ignored
Rank int `yaml:"data-parallel-rank" json:"data-parallel-rank"`
// SSLCertFile is the path to the SSL certificate file for HTTPS
SSLCertFile string `yaml:"ssl-certfile" json:"ssl-certfile"`
// SSLKeyFile is the path to the SSL private key file for HTTPS
SSLKeyFile string `yaml:"ssl-keyfile" json:"ssl-keyfile"`
// SelfSignedCerts enables automatic generation of self-signed certificates for HTTPS
SelfSignedCerts bool `yaml:"self-signed-certs" json:"self-signed-certs"`
// DatasetPath Optional local file path to the SQLite database file used for generating responses from a dataset.
// - If not set, hardcoded preset responses will be used.
// - If set but the file does not exist the `dataset-url` will be used to download the database to the path specified by `dataset-path`.
// - If the file exists but is currently occupied by another process, responses will be randomly generated from preset text (the same behavior as if the path were not set).
// - Responses are retrieved from the dataset by the hash of the conversation history, with a fallback to a random dataset response, constrained by the maximum output tokens and EoS token handling, if no matching history is found.
// - Refer to [llm-d converted ShareGPT](https://huggingface.co/datasets/hf07397/inference-sim-datasets/blob/0b7ac1a4daf0aace1556326964bd75633372299e/README.md) for detailed information on the expected format of the SQLite database file.
DatasetPath string `yaml:"dataset-path" json:"dataset-path"`
// DatasetURL Optional URL for downloading the SQLite database file used for response generation.
// - This parameter is only used if the `dataset-path` is also set and the file does not exist at that path.
// - If the file needs to be downloaded, it will be saved to the location specified by `dataset-path`.
// - If the file already exists at the `dataset-path`, it will not be downloaded again
// - Example URL `https://huggingface.co/datasets/hf07397/inference-sim-datasets/resolve/91ffa7aafdfd6b3b1af228a517edc1e8f22cd274/huggingface/ShareGPT_Vicuna_unfiltered/conversations.sqlite3`
DatasetURL string `yaml:"dataset-url" json:"dataset-url"`
// DatasetInMemory defines whether to load the entire dataset into memory for faster access.
DatasetInMemory bool `yaml:"dataset-in-memory" json:"dataset-in-memory"`
// DatasetTableName defines custom SQLite dataset table name
DatasetTableName string `yaml:"dataset-table-name" json:"dataset-table-name"`
// UDSSocketPath is the tokenizer UDS socket path
UDSSocketPath string `yaml:"uds-socket-path" json:"uds-socket-path"`
// EnableSleepMode enables sleep mode
EnableSleepMode bool `yaml:"enable-sleep-mode" json:"enable-sleep-mode"`
// EnableRequestIDHeaders enables including X-Request-Id header in responses
EnableRequestIDHeaders bool `yaml:"enable-request-id-headers" json:"enable-request-id-headers"`
// LatencyCalculator is the name of the latency calculator to use in the simulation of the response latencies.
// The default calculation is based on the current load of the simulator and on the configured latency
// parameters, e.g., time-to-first-token and prefill-time-per-token.
LatencyCalculator string `yaml:"latency-calculator" json:"latency-calculator"`
// DefaultEmbeddingDimensions is the default size of embedding vectors when the request does not specify dimensions.
// Used by the /v1/embeddings endpoint. Default is 384.
DefaultEmbeddingDimensions int `yaml:"default-embedding-dimensions" json:"default-embedding-dimensions"`
// MMEncoderOnly defines whether to skip the language component of the model.
MMEncoderOnly bool `yaml:"mm-encoder-only" json:"mm-encoder-only"`
// Ignored parameters:
// MMProcessorKWArgs defines arguments to be forwarded to the model's processor for multi-modal data.
// Ignored in the simulator.
MMProcessorKWArgs string `yaml:"mm-processor-kwargs" json:"mm-processor-kwargs"`
// ECTransferConfig defines the configurations for distributed EC cache transfer.
// Ignored in the simulator.
ECTransferConfig string `yaml:"ec-transfer-config" json:"ec-transfer-config"`
// EnforceEager defines whether to always use eager-mode PyTorch.
// Ignored in the simulator.
EnforceEager bool `yaml:"enforce-eager" json:"enforce-eager"`
// EnablePrefixCaching defines whether to enable prefix caching.
// Ignored in the simulator.
EnablePrefixCaching bool `yaml:"enable-prefix-caching" json:"enable-prefix-caching"`
}
func ParseCommandParamsAndLoadConfig ¶
func ParseCommandParamsAndLoadConfig() (*Configuration, error)
ParseCommandParamsAndLoadConfig loads configuration, parses command line parameters, merges the values (command line values overwrite the config file ones), and validates the configuration
func (*Configuration) Copy ¶ added in v0.5.0
func (c *Configuration) Copy() (*Configuration, error)
func (*Configuration) SSLEnabled ¶ added in v0.5.1
func (c *Configuration) SSLEnabled() bool
SSLEnabled returns true if SSL is enabled either via certificate files or self-signed certificates
type Duration ¶ added in v0.7.0
Duration wraps time.Duration. It is used to parse the custom duration format from YAML.
func (*Duration) Milliseconds ¶ added in v0.7.0
func (*Duration) ToDuration ¶ added in v0.7.0
type FakeMetricWithFunction ¶ added in v0.8.0
type FakeMetricWithFunction struct {
FixedValue float64
Function *FunctionInfo
IsFunction bool
}
func (*FakeMetricWithFunction) UnmarshalJSON ¶ added in v0.8.0
func (f *FakeMetricWithFunction) UnmarshalJSON(data []byte) error
func (*FakeMetricWithFunction) UnmarshalYAML ¶ added in v0.8.0
func (f *FakeMetricWithFunction) UnmarshalYAML(value *yaml.Node) error
type FakeMetrics ¶ added in v0.8.0
type FakeMetrics struct {
// LoraMetrics
LoraMetrics []LorasMetrics `json:"loras"`
LorasString []string `yaml:"loras"`
// RunningRequests is the number of inference requests that are currently being processed
RunningRequests FakeMetricWithFunction `yaml:"running-requests" json:"running-requests"`
// WaitingRequests is the number of inference requests that are waiting to be processed
WaitingRequests FakeMetricWithFunction `yaml:"waiting-requests" json:"waiting-requests"`
// KVCacheUsagePercentage is the fraction of KV-cache blocks currently in use (from 0 to 1)
KVCacheUsagePercentage FakeMetricWithFunction `yaml:"kv-cache-usage" json:"kv-cache-usage"`
// TTFTBucketValues is an array of values for time-to-first-token buckets.
// Buckets upper boundaries in seconds are:
// 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
// 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0, 2560.0, +Inf
TTFTBucketValues []int `yaml:"ttft-buckets-values" json:"ttft-buckets-values"`
// TPOTBucketValues is an array of values for time-per-output-token buckets.
// Buckets upper boundaries in seconds are:
// 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
// 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, +Inf
TPOTBucketValues []int `yaml:"tpot-buckets-values" json:"tpot-buckets-values"`
// RequestPromptTokens, RequestGenerationTokens, and RequestParamsMaxTokens are histogram fake-observation arrays used at initialization.
// Each value in these arrays is passed to Observe() exactly once at startup.
// By default:
// - The sum of RequestPromptTokens initializes the metric vllm:prompt_tokens_total.
// - The sum of RequestGenerationTokens initializes the metric vllm:generation_tokens_total.
//
// If TotalPromptTokens or TotalGenerationTokens are explicitly provided,
// they override the above sums and are used directly as the initial total token counts.
RequestPromptTokens []int `yaml:"request-prompt-tokens" json:"request-prompt-tokens"` // prompt-length samples
RequestGenerationTokens []int `yaml:"request-generation-tokens" json:"request-generation-tokens"` // generation-length samples
RequestParamsMaxTokens []int `yaml:"request-params-max-tokens" json:"request-params-max-tokens"` // max_tokens parameter samples
RequestMaxGenerationTokens []int `yaml:"request-max-generation-tokens" json:"request-max-generation-tokens"` // request_max_num_generation_tokens samples
// RequestSuccessTotal is the number of successful requests, key: finish-reason (stop, length, etc.).
RequestSuccessTotal map[string]int64 `yaml:"request-success-total" json:"request-success-total"`
// TotalPromptTokens is the total number of prompt tokens processed
TotalPromptTokens *int64 `json:"total-prompt-tokens,omitempty"`
// TotalGenerationTokens is the total number of generated tokens
TotalGenerationTokens *int64 `json:"total-generation-tokens,omitempty"`
// E2ERequestLatencyBucketValues is an array of values for e2e request latency buckets.
E2ERequestLatencyBucketValues []int `yaml:"e2erl-buckets-values" json:"e2erl-buckets-values"`
// ReqQueueTimeBucketValues is an array of values for request queue time buckets.
ReqQueueTimeBucketValues []int `yaml:"queue-time-buckets-values" json:"queue-time-buckets-values"`
// ReqInfTimeBucketValues is an array of values for request inference time buckets.
ReqInfTimeBucketValues []int `yaml:"inf-time-buckets-values" json:"inf-time-buckets-values"`
// ReqPrefillTimeBucketValues is an array of values for request prefill time buckets.
ReqPrefillTimeBucketValues []int `yaml:"prefill-time-buckets-values" json:"prefill-time-buckets-values"`
// ReqDecodeTimeBucketValues is an array of values for request decode time buckets.
ReqDecodeTimeBucketValues []int `yaml:"decode-time-buckets-values" json:"decode-time-buckets-values"`
// PrefixCacheHits is the initial value for the prefix cache hits counter (in tokens)
PrefixCacheHits *int64 `yaml:"prefix-cache-hits" json:"prefix-cache-hits,omitempty"`
// PrefixCacheQueries is the initial value for the prefix cache queries counter (in tokens)
PrefixCacheQueries *int64 `yaml:"prefix-cache-queries" json:"prefix-cache-queries,omitempty"`
}
type FunctionInfo ¶ added in v0.8.0
type LoraModule ¶
type LorasMetrics ¶ added in v0.4.0
type LorasMetrics struct {
// RunningLoras is a comma separated list of running LoRAs
RunningLoras string `json:"running"`
// WaitingLoras is a comma separated list of waiting LoRAs
WaitingLoras string `json:"waiting"`
// Timestamp is the timestamp of the metric
Timestamp float64 `json:"timestamp"`
}
type MetricInfo ¶ added in v0.8.0
type MetricInfo struct {
// Value is the value for metric's update
Value float64
// IsFake is true if this is a fake metric, and false if not
IsFake bool
}
MetricInfo contains metrics update value to pass through the corresponding channel
type Publisher ¶
type Publisher struct {
// contains filtered or unexported fields
}
Publisher sends events to a ZMQ endpoint.
func NewPublisher ¶
NewPublisher creates a new ZMQ publisher. endpoint is the ZMQ address to bind to (e.g., "tcp://*:5557"). retries is the maximum number of connection attempts.
type Random ¶ added in v0.6.0
type Random struct {
// contains filtered or unexported fields
}
func (*Random) GenerateUUIDString ¶ added in v0.6.0
GenerateUUIDString generates a UUID string under a lock
func (*Random) RandomBool ¶ added in v0.6.0
probability is an integer between 0 and 100
func (*Random) RandomFloat ¶ added in v0.6.0
Returns a random float64 in the range [min, max)
func (*Random) RandomNorm ¶ added in v0.6.0
Returns a normally distributed float64
func (*Random) RandomNormDuration ¶ added in v0.7.0
func (*Random) RandomNormTruncated ¶ added in v0.6.0
Returns a normally distributed int If the generated value differs by more than 70% from mean, the returned value will be 70% of mean