common

package
v0.8.2 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 31, 2026 License: Apache-2.0 Imports: 25 Imported by: 0

Documentation

Index

Constants

View Source
// String constants shared across the simulator.
const (
	// Response generation modes (the valid values of Configuration.Mode)
	ModeRandom = "random"
	ModeEcho   = "echo"

	// Failure type constants
	// NOTE(review): presumably the accepted values of Configuration.FailureTypes — confirm
	FailureTypeRateLimit      = "rate_limit"
	FailureTypeInvalidAPIKey  = "invalid_api_key"
	FailureTypeContextLength  = "context_length"
	FailureTypeServerError    = "server_error"
	FailureTypeInvalidRequest = "invalid_request"
	FailureTypeModelNotFound  = "model_not_found"

	// Finish reason strings
	StopFinishReason           = "stop"
	LengthFinishReason         = "length"
	ToolsFinishReason          = "tool_calls"
	RemoteDecodeFinishReason   = "remote_decode"
	CacheThresholdFinishReason = "cache_threshold"

	// Latency calculator names (see Configuration.LatencyCalculator);
	// the default calculator is named by the empty string
	DefaultLatencyCalculator        = ""
	ConstantLatencyCalculator       = "constant"
	PerPromptTokenLatencyCalculator = "per-token"

	// DefaultDSTableName is presumably the default for Configuration.DatasetTableName — TODO confirm
	DefaultDSTableName = "llmd"
)
View Source
// Function name constants.
// NOTE(review): presumably the values accepted in FunctionInfo.Name for
// function-based fake metrics — confirm against the fake metrics parser.
const (
	OscillateFuncName     = "oscillate"
	RampFuncName          = "ramp"
	RampWithResetFuncName = "rampreset"
	SquarewaveFuncName    = "squarewave"
)
View Source
// Names of the environment variables that supply the pod's name and namespace
// (see Configuration.PodName and Configuration.PodNameSpace).
const (
	PodNameEnv = "POD_NAME"
	PodNsEnv   = "POD_NAMESPACE"
)
View Source
// Model name constants.
// NOTE(review): these look test-oriented — confirm where they are used.
const (
	QwenModelName = "Qwen/Qwen2-0.5B"
	TestModelName = "testmodel"
)

constants

View Source
// InvalidMaxTokensErrMsg is the error message for a non-positive
// max completion tokens or max tokens value.
const InvalidMaxTokensErrMsg = "Max completion tokens and max tokens should be positive"

Variables

View Source
// RequestLatencyBucketsBoundaries lists the bucket boundaries for request latency metrics.
// NOTE(review): presumably upper boundaries in seconds, like the TTFT/TPOT buckets — confirm.
var RequestLatencyBucketsBoundaries = []float64{0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
	20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0}
View Source
// TPOTBucketsBoundaries lists the upper boundaries, in seconds, of the
// time-per-output-token buckets (the final +Inf bucket is not listed).
var TPOTBucketsBoundaries = []float64{0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
	1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0}
View Source
// TTFTBucketsBoundaries lists the upper boundaries, in seconds, of the
// time-to-first-token buckets (the final +Inf bucket is not listed).
var TTFTBucketsBoundaries = []float64{0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
	0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0,
	2560.0}

Definition of buckets for the time-to-first-token and time-per-output-token metrics; each value is the upper boundary of a bucket.

Functions

func BuildStubEmbedding added in v0.8.0

func BuildStubEmbedding(tokens []uint32, dim int) []float32

BuildStubEmbedding returns a deterministic embedding of length dim from token ids (simulator stub).

func CreateSub added in v0.7.0

func CreateSub(ctx context.Context, topic string) (zmq4.Socket, string)

CreateSub creates a ZMQ sub, subscribes to the provided topic, and returns the sub and the endpoint to publish events on

func FinishReason added in v0.7.0

func FinishReason(maxTokens *int64, respLen int) string

FinishReason returns finish reason based on request's max tokens parameter and the length of the generated response

func GenerateChatLogprobs added in v0.6.1

func GenerateChatLogprobs(tokens []string, topLogprobsCount int) *openaiserverapi.ChatLogprobs

GenerateChatLogprobs generates synthetic log probabilities for chat completion responses

func GenerateSingleTokenChatLogprobs added in v0.6.1

func GenerateSingleTokenChatLogprobs(token string, tokenPosition int, topLogprobsCount int) *openaiserverapi.LogprobsContent

GenerateSingleTokenChatLogprobs generates logprobs for a single token in chat completion streaming

func GenerateSingleTokenTextLogprobs added in v0.6.1

func GenerateSingleTokenTextLogprobs(token string, tokenPosition int, logprobsCount int) *openaiserverapi.TextLogprobs

GenerateSingleTokenTextLogprobs generates logprobs for a single token in text completion streaming

func GenerateTextLogprobs added in v0.6.1

func GenerateTextLogprobs(tokens []string, logprobsCount int) *openaiserverapi.TextLogprobs

GenerateTextLogprobs generates synthetic log probabilities for text completion responses

func MaxIntSlice added in v0.7.0

func MaxIntSlice(numbers []int) (int, error)

MaxIntSlice receives a slice of ints, returns the maximum value in the slice if not empty, and error if the slice is empty

func NewSub added in v0.8.0

func NewSub(ctx context.Context) zmq4.Socket

func StartSub added in v0.8.0

func StartSub(sub zmq4.Socket, endpoint string, topic string) string

StartSub starts the given sub on a random port and subscribes to the given topic. Returns the real endpoint to publish events on.

func ValidateContextWindow

func ValidateContextWindow(promptTokens int, maxCompletionTokens *int64, maxModelLen int) (bool, int64, int64)

ValidateContextWindow checks if the request fits within the model's context window. Returns validation result, actual completion tokens, and total tokens.

func WriteToChannel added in v0.6.0

func WriteToChannel[T any](channel Channel[T], object T, logger logr.Logger)

Types

type Channel added in v0.8.0

// Channel pairs a typed Go channel with a name.
type Channel[T any] struct {
	// Channel is the underlying channel carrying values of type T
	Channel chan T
	// Name identifies the channel
	// NOTE(review): presumably used in log messages by WriteToChannel — confirm
	Name    string
}

type Configuration

// Configuration holds the simulator's settings. Fields carrying yaml/json tags
// are read under the tagged key; untagged fields (IP, LoraModules, PodNameSpace,
// PodName, VllmDevMode) are filled in separately, e.g. from environment
// variables as noted on each field.
type Configuration struct {
	// IP defines on which IP the simulator runs, loaded from env
	IP string
	// Port defines on which port the simulator runs
	Port int `yaml:"port" json:"port"`
	// Model defines the current base model name
	Model string `yaml:"model" json:"model"`
	// ServedModelNames is one or many model names exposed by the API
	ServedModelNames []string `yaml:"served-model-name" json:"served-model-name"`
	// MaxLoras defines maximum number of loaded LoRAs
	MaxLoras int `yaml:"max-loras" json:"max-loras"`
	// MaxCPULoras defines maximum number of LoRAs to store in CPU memory
	MaxCPULoras int `yaml:"max-cpu-loras" json:"max-cpu-loras"`
	// MaxNumSeqs is maximum number of sequences per iteration (the maximum
	// number of inference requests that could be processed at the same time)
	MaxNumSeqs int `yaml:"max-num-seqs" json:"max-num-seqs"`
	// MaxWaitingQueueLength defines maximum size of waiting requests queue
	MaxWaitingQueueLength int `yaml:"max-waiting-queue-length" json:"max-waiting-queue-length"`
	// MaxModelLen is the model's context window, the maximum number of tokens
	// in a single request including input and output. Default value is 1024.
	MaxModelLen int `yaml:"max-model-len" json:"max-model-len"`
	// LoraModulesString is a list of LoRA adapters as strings
	LoraModulesString []string `yaml:"lora-modules" json:"lora-modules"`
	// LoraModules is a list of LoRA adapters
	// NOTE(review): presumably parsed from LoraModulesString — confirm
	LoraModules []LoraModule

	// PodNameSpace specifies the Kubernetes namespace in which the simulator pod is running.
	// Useful for multi-namespace deployments and resource scoping.
	// Set by env variable POD_NAMESPACE
	PodNameSpace string
	// PodName specifies the name of the pod running the simulator instance.
	// Used for identification in Kubernetes environments.
	// Set by env variable POD_NAME
	PodName string
	// VllmDevMode enables development mode for the vLLM simulator,
	// allowing for additional debugging features during local development and testing.
	// Set by env variable VLLM_SERVER_DEV_MODE
	VllmDevMode bool

	// TimeToFirstToken time before the first token will be returned
	TimeToFirstToken Duration `yaml:"time-to-first-token" json:"time-to-first-token"`
	// TimeToFirstTokenStdDev standard deviation for time before the first token will be returned
	// optional, default is 0, can't be more than 30% of TimeToFirstToken, will not
	// cause the actual time to first token to differ by more than 70% from TimeToFirstToken
	TimeToFirstTokenStdDev Duration `yaml:"time-to-first-token-std-dev" json:"time-to-first-token-std-dev"`

	// InterTokenLatency time between generated tokens
	InterTokenLatency Duration `yaml:"inter-token-latency" json:"inter-token-latency"`
	// InterTokenLatencyStdDev standard deviation for time between generated tokens
	// optional, default is 0, can't be more than 30% of InterTokenLatency, will not cause the actual
	// inter token latency to differ by more than 70% from InterTokenLatency
	InterTokenLatencyStdDev Duration `yaml:"inter-token-latency-std-dev" json:"inter-token-latency-std-dev"`
	// KVCacheTransferLatency time to "transfer" kv-cache from another vLLM instance in case P/D is activated
	KVCacheTransferLatency Duration `yaml:"kv-cache-transfer-latency" json:"kv-cache-transfer-latency"`
	// KVCacheTransferLatencyStdDev standard deviation for time to "transfer" kv-cache from another
	// vLLM instance in case P/D is activated, can't be more than 30% of KVCacheTransferLatency, will not
	// cause the actual latency to differ by more than 70% from KVCacheTransferLatency
	KVCacheTransferLatencyStdDev Duration `yaml:"kv-cache-transfer-latency-std-dev" json:"kv-cache-transfer-latency-std-dev"`

	// $Total Prefill Time = PrefillOverhead + n * PrefillTimePerToken$
	// the assumption is that n is less than k, where k is the number of parallelism units of GPU
	// (n is presumably the number of prompt tokens — TODO confirm)
	// PrefillOverhead time taken to prefill the context
	PrefillOverhead     Duration `yaml:"prefill-overhead" json:"prefill-overhead"`
	// PrefillTimePerToken is the per-token term of the total prefill time formula above
	PrefillTimePerToken Duration `yaml:"prefill-time-per-token" json:"prefill-time-per-token"`
	// PrefillTimeStdDev standard deviation of the prefill time, similar to TimeToFirstTokenStdDev
	PrefillTimeStdDev Duration `yaml:"prefill-time-std-dev" json:"prefill-time-std-dev"`
	// $Total KV Cache Transfer Time = n * KVCacheTransferTimePerToken$
	// the assumption is that the cache blocks are all missed at the remote pod
	// KVCacheTransferTimePerToken is the per-token time taken to transfer kv-cache from another
	// vLLM instance in case P/D is activated
	KVCacheTransferTimePerToken Duration `yaml:"kv-cache-transfer-time-per-token" json:"kv-cache-transfer-time-per-token"`
	// KVCacheTransferTimeStdDev standard deviation of the kv-cache transfer time, similar to TimeToFirstTokenStdDev
	KVCacheTransferTimeStdDev Duration `yaml:"kv-cache-transfer-time-std-dev" json:"kv-cache-transfer-time-std-dev"`

	// TimeFactorUnderLoad is a multiplicative factor that affects the overall time taken for requests when parallel
	// requests are being processed.
	// The value of this factor must be >= 1.0, with a default of 1.0.
	// - If this factor is 1.0, no extra time is added.
	// - When the factor is x (where x > 1.0) and there are MaxNumSeqs requests, the total time will be multiplied by x.
	// - The extra time then decreases multiplicatively to 1.0 when the number of requests is less than MaxNumSeqs.
	TimeFactorUnderLoad float64 `yaml:"time-factor-under-load" json:"time-factor-under-load"`

	// Mode defines the simulator response generation mode, valid values: echo, random
	Mode string `yaml:"mode" json:"mode"`
	// Seed defines random seed for operations
	Seed int64 `yaml:"seed" json:"seed"`

	// MaxToolCallIntegerParam defines the maximum possible value of integer parameters in a tool call,
	// optional, defaults to 100
	MaxToolCallIntegerParam int `yaml:"max-tool-call-integer-param" json:"max-tool-call-integer-param"`
	// MinToolCallIntegerParam defines the minimum possible value of integer parameters in a tool call,
	// optional, defaults to 0
	MinToolCallIntegerParam int `yaml:"min-tool-call-integer-param" json:"min-tool-call-integer-param"`
	// MaxToolCallNumberParam defines the maximum possible value of number (float) parameters in a tool call,
	// optional, defaults to 100
	MaxToolCallNumberParam float64 `yaml:"max-tool-call-number-param" json:"max-tool-call-number-param"`
	// MinToolCallNumberParam defines the minimum possible value of number (float) parameters in a tool call,
	// optional, defaults to 0
	MinToolCallNumberParam float64 `yaml:"min-tool-call-number-param" json:"min-tool-call-number-param"`

	// MaxToolCallArrayParamLength defines the maximum possible length of array parameters in a tool call,
	// optional, defaults to 5
	MaxToolCallArrayParamLength int `yaml:"max-tool-call-array-param-length" json:"max-tool-call-array-param-length"`
	// MinToolCallArrayParamLength defines the minimum possible length of array parameters in a tool call,
	// optional, defaults to 1
	MinToolCallArrayParamLength int `yaml:"min-tool-call-array-param-length" json:"min-tool-call-array-param-length"`

	// ToolCallNotRequiredParamProbability is the probability to add a parameter, that is not required,
	// in a tool call, optional, defaults to 50
	ToolCallNotRequiredParamProbability int `yaml:"tool-call-not-required-param-probability" json:"tool-call-not-required-param-probability"`
	// ObjectToolCallNotRequiredParamProbability is the probability to add a field, that is not required,
	// in an object in a tool call, optional, defaults to 50
	ObjectToolCallNotRequiredParamProbability int `yaml:"object-tool-call-not-required-field-probability" json:"object-tool-call-not-required-field-probability"`

	// EnableKVCache defines if kv cache feature will be enabled
	EnableKVCache bool `yaml:"enable-kvcache" json:"enable-kvcache"`
	// KVCacheSize is the maximum number of token blocks in kv cache, the default value is 1024
	KVCacheSize int `yaml:"kv-cache-size" json:"kv-cache-size"`
	// GlobalCacheHitThreshold is the default cache hit threshold (0-1] for all requests.
	// If a request specifies cache_hit_threshold, it takes precedence over this global value.
	GlobalCacheHitThreshold float64 `yaml:"global-cache-hit-threshold" json:"global-cache-hit-threshold"`

	// TokenBlockSize is token block size for contiguous chunks of tokens, possible values: 8,16,32,64,128, defaults to 16
	TokenBlockSize int `yaml:"block-size" json:"block-size"`
	// HashSeed is the seed for hash generation (if not set, is read from PYTHONHASHSEED environment variable)
	HashSeed string `yaml:"hash-seed" json:"hash-seed"`

	// ZMQEndpoint is the ZMQ address to publish events, the default value is tcp://localhost:5557
	ZMQEndpoint string `yaml:"zmq-endpoint" json:"zmq-endpoint"`

	// EventBatchSize is the maximum number of kv-cache events to be sent together, defaults to 16
	EventBatchSize int `yaml:"event-batch-size" json:"event-batch-size"`

	// FakeMetrics is a set of metrics to send to Prometheus instead of the real data
	FakeMetrics *FakeMetrics `yaml:"fake-metrics" json:"fake-metrics"`

	// FakeMetricsRefreshInterval defines how often function-based fake metrics are recalculated, defaults to 100ms
	// NOTE(review): typed as time.Duration rather than the package's Duration wrapper used by the
	// latency fields above — confirm that YAML/JSON parsing accepts the intended format
	FakeMetricsRefreshInterval time.Duration `yaml:"fake-metrics-refresh-interval" json:"fake-metrics-refresh-interval"`

	// FailureInjectionRate is the probability (0-100) of injecting failures
	FailureInjectionRate int `yaml:"failure-injection-rate" json:"failure-injection-rate"`
	// FailureTypes is a list of specific failure types to inject (empty means all types)
	FailureTypes []string `yaml:"failure-types" json:"failure-types"`

	// DPSize is data parallel size - a number of ranks to run, minimum is 1, maximum is 8, default is 1
	DPSize int `yaml:"data-parallel-size" json:"data-parallel-size"`

	// Rank is the vLLM parameter used to specify the rank of this instance. Here only
	// used when running Data Parallel ranks as separate processes. If set, data-parallel-size is ignored
	Rank int `yaml:"data-parallel-rank" json:"data-parallel-rank"`

	// SSLCertFile is the path to the SSL certificate file for HTTPS
	SSLCertFile string `yaml:"ssl-certfile" json:"ssl-certfile"`
	// SSLKeyFile is the path to the SSL private key file for HTTPS
	SSLKeyFile string `yaml:"ssl-keyfile" json:"ssl-keyfile"`
	// SelfSignedCerts enables automatic generation of self-signed certificates for HTTPS
	SelfSignedCerts bool `yaml:"self-signed-certs" json:"self-signed-certs"`

	// DatasetPath Optional local file path to the SQLite database file used for generating responses from a dataset.
	//   - If not set, hardcoded preset responses will be used.
	//   - If set but the file does not exist the `dataset-url` will be used to download the database to the path specified by `dataset-path`.
	//   - If the file exists but is currently occupied by another process, responses will be randomly generated from preset text (the same behavior as if the path were not set).
	//   - Responses are retrieved from the dataset by the hash of the conversation history, with a fallback to a random dataset response, constrained by the maximum output tokens and EoS token handling, if no matching history is found.
	//   - Refer to [llm-d converted ShareGPT](https://huggingface.co/datasets/hf07397/inference-sim-datasets/blob/0b7ac1a4daf0aace1556326964bd75633372299e/README.md) for detailed information on the expected format of the SQLite database file.
	DatasetPath string `yaml:"dataset-path" json:"dataset-path"`
	// DatasetURL Optional URL for downloading the SQLite database file used for response generation.
	//   - This parameter is only used if the `dataset-path` is also set and the file does not exist at that path.
	//   - If the file needs to be downloaded, it will be saved to the location specified by `dataset-path`.
	//   - If the file already exists at the `dataset-path`, it will not be downloaded again
	//   - Example URL `https://huggingface.co/datasets/hf07397/inference-sim-datasets/resolve/91ffa7aafdfd6b3b1af228a517edc1e8f22cd274/huggingface/ShareGPT_Vicuna_unfiltered/conversations.sqlite3`
	DatasetURL string `yaml:"dataset-url" json:"dataset-url"`
	// DatasetInMemory defines whether to load the entire dataset into memory for faster access.
	DatasetInMemory bool `yaml:"dataset-in-memory" json:"dataset-in-memory"`
	// DatasetTableName defines custom SQLite dataset table name
	DatasetTableName string `yaml:"dataset-table-name" json:"dataset-table-name"`

	// UDSSocketPath is the tokenizer UDS socket path
	UDSSocketPath string `yaml:"uds-socket-path" json:"uds-socket-path"`

	// EnableSleepMode enables sleep mode
	EnableSleepMode bool `yaml:"enable-sleep-mode" json:"enable-sleep-mode"`

	// EnableRequestIDHeaders enables including X-Request-Id header in responses
	EnableRequestIDHeaders bool `yaml:"enable-request-id-headers" json:"enable-request-id-headers"`

	// LatencyCalculator is the name of the latency calculator to use in the simulation of the response latencies.
	// The default calculation is based on the current load of the simulator and on the configured latency
	// parameters, e.g., time-to-first-token and prefill-time-per-token.
	LatencyCalculator string `yaml:"latency-calculator" json:"latency-calculator"`

	// DefaultEmbeddingDimensions is the default size of embedding vectors when the request does not specify dimensions.
	// Used by the /v1/embeddings endpoint. Default is 384.
	DefaultEmbeddingDimensions int `yaml:"default-embedding-dimensions" json:"default-embedding-dimensions"`

	// MMEncoderOnly defines whether to skip the language component of the model.
	MMEncoderOnly bool `yaml:"mm-encoder-only" json:"mm-encoder-only"`

	// Ignored parameters:
	// MMProcessorKWArgs defines arguments to be forwarded to the model's processor for multi-modal data.
	// Ignored in the simulator.
	MMProcessorKWArgs string `yaml:"mm-processor-kwargs" json:"mm-processor-kwargs"`
	// ECTransferConfig defines the configurations for distributed EC cache transfer.
	// Ignored in the simulator.
	ECTransferConfig string `yaml:"ec-transfer-config" json:"ec-transfer-config"`
	// EnforceEager defines whether to always use eager-mode PyTorch.
	// Ignored in the simulator.
	EnforceEager bool `yaml:"enforce-eager" json:"enforce-eager"`
	// EnablePrefixCaching defines whether to enable prefix caching.
	// Ignored in the simulator.
	EnablePrefixCaching bool `yaml:"enable-prefix-caching" json:"enable-prefix-caching"`
}

func ParseCommandParamsAndLoadConfig

func ParseCommandParamsAndLoadConfig() (*Configuration, error)

ParseCommandParamsAndLoadConfig loads configuration, parses command line parameters, merges the values (command line values overwrite the config file ones), and validates the configuration

func (*Configuration) Copy added in v0.5.0

func (c *Configuration) Copy() (*Configuration, error)

func (*Configuration) SSLEnabled added in v0.5.1

func (c *Configuration) SSLEnabled() bool

SSLEnabled returns true if SSL is enabled either via certificate files or self-signed certificates

func (*Configuration) Show added in v0.8.0

func (c *Configuration) Show(logger logr.Logger) error

type Duration added in v0.7.0

// Duration wraps time.Duration so it can be parsed from the custom YAML
// duration format (UnmarshalYAML) and used as a pflag.Value (Set, String, Type).
type Duration time.Duration

Duration wraps time.Duration. It is used to parse the custom duration format from YAML.

func (*Duration) Milliseconds added in v0.7.0

func (d *Duration) Milliseconds() int64

func (*Duration) Set added in v0.7.0

func (d *Duration) Set(s string) error

Set implements pflag/flag.Value.

func (*Duration) String added in v0.7.0

func (d *Duration) String() string

String implements pflag.Value.

func (*Duration) ToDuration added in v0.7.0

func (d *Duration) ToDuration() time.Duration

func (*Duration) Type added in v0.7.0

func (*Duration) Type() string

Type implements pflag.Value.

func (*Duration) UnmarshalYAML added in v0.7.0

func (d *Duration) UnmarshalYAML(value *yaml.Node) error

UnmarshalYAML implements the yaml.Unmarshaler interface.

type FakeMetricWithFunction added in v0.8.0

// FakeMetricWithFunction holds a fake metric that is given either as a fixed
// value or as a function description.
// NOTE(review): presumably IsFunction selects which of the two fields is
// meaningful — confirm against UnmarshalJSON/UnmarshalYAML.
type FakeMetricWithFunction struct {
	// FixedValue is the constant value of the metric
	FixedValue float64
	// Function describes the function generating the metric's values
	Function   *FunctionInfo
	// IsFunction tells whether the metric is function-based
	IsFunction bool
}

func (*FakeMetricWithFunction) UnmarshalJSON added in v0.8.0

func (f *FakeMetricWithFunction) UnmarshalJSON(data []byte) error

func (*FakeMetricWithFunction) UnmarshalYAML added in v0.8.0

func (f *FakeMetricWithFunction) UnmarshalYAML(value *yaml.Node) error

type FakeMetrics added in v0.8.0

// FakeMetrics is a set of metric values to report instead of the real data
// (see Configuration.FakeMetrics).
type FakeMetrics struct {
	// LoraMetrics is the list of LoRA metrics, read from the JSON key "loras"
	LoraMetrics []LorasMetrics `json:"loras"`
	// LorasString holds the YAML key "loras" as raw strings
	// NOTE(review): presumably parsed into LoraMetrics after loading — confirm
	LorasString []string       `yaml:"loras"`

	// RunningRequests is the number of inference requests that are currently being processed
	RunningRequests FakeMetricWithFunction `yaml:"running-requests" json:"running-requests"`
	// WaitingRequests is the number of inference requests that are waiting to be processed
	WaitingRequests FakeMetricWithFunction `yaml:"waiting-requests" json:"waiting-requests"`
	// KVCacheUsagePercentage is the fraction of KV-cache blocks currently in use (from 0 to 1)
	KVCacheUsagePercentage FakeMetricWithFunction `yaml:"kv-cache-usage" json:"kv-cache-usage"`

	// TTFTBucketValues is an array of values for time-to-first-token buckets.
	// Buckets upper boundaries in seconds are:
	// 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
	// 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0, 2560.0, +Inf
	TTFTBucketValues []int `yaml:"ttft-buckets-values" json:"ttft-buckets-values"`
	// TPOTBucketValues is an array of values for time-per-output-token buckets.
	// Buckets upper boundaries in seconds are:
	// 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
	// 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, +Inf
	TPOTBucketValues []int `yaml:"tpot-buckets-values" json:"tpot-buckets-values"`
	// RequestPromptTokens, RequestGenerationTokens, RequestParamsMaxTokens and RequestMaxGenerationTokens
	// are histogram fake-observation arrays for init.
	// Each value in these arrays is passed to Observe() exactly once at startup.
	// By default:
	//   - The sum of RequestPromptTokens initializes the metric vllm:prompt_tokens_total.
	//   - The sum of RequestGenerationTokens initializes the metric vllm:generation_tokens_total.
	//
	// If TotalPromptTokens or TotalGenerationTokens are explicitly provided,
	// they override the above sums and are used directly as the initial total token counts.
	RequestPromptTokens        []int `yaml:"request-prompt-tokens" json:"request-prompt-tokens"`                 // prompt-length samples
	RequestGenerationTokens    []int `yaml:"request-generation-tokens" json:"request-generation-tokens"`         // generation-length samples
	RequestParamsMaxTokens     []int `yaml:"request-params-max-tokens" json:"request-params-max-tokens"`         // max_tokens parameter samples
	RequestMaxGenerationTokens []int `yaml:"request-max-generation-tokens" json:"request-max-generation-tokens"` // request_max_num_generation_tokens samples
	// RequestSuccessTotal is the number of successful requests, key: finish-reason (stop, length, etc.).
	RequestSuccessTotal map[string]int64 `yaml:"request-success-total" json:"request-success-total"`

	// TotalPromptTokens is the total number of prompt tokens processed
	// NOTE(review): no yaml tag here, unlike the neighbouring fields — confirm the YAML key is as intended
	TotalPromptTokens *int64 `json:"total-prompt-tokens,omitempty"`
	// TotalGenerationTokens is the total number of generated tokens
	// NOTE(review): no yaml tag here, unlike the neighbouring fields — confirm the YAML key is as intended
	TotalGenerationTokens *int64 `json:"total-generation-tokens,omitempty"`

	// E2ERequestLatencyBucketValues is an array of values for e2e request latency buckets.
	E2ERequestLatencyBucketValues []int `yaml:"e2erl-buckets-values" json:"e2erl-buckets-values"`
	// ReqQueueTimeBucketValues is an array of values for request queue time buckets.
	ReqQueueTimeBucketValues []int `yaml:"queue-time-buckets-values" json:"queue-time-buckets-values"`
	// ReqInfTimeBucketValues is an array of values for request inference time buckets.
	ReqInfTimeBucketValues []int `yaml:"inf-time-buckets-values" json:"inf-time-buckets-values"`
	// ReqPrefillTimeBucketValues is an array of values for request prefill time buckets.
	ReqPrefillTimeBucketValues []int `yaml:"prefill-time-buckets-values" json:"prefill-time-buckets-values"`
	// ReqDecodeTimeBucketValues is an array of values for request decode time buckets.
	ReqDecodeTimeBucketValues []int `yaml:"decode-time-buckets-values" json:"decode-time-buckets-values"`

	// PrefixCacheHits is the initial value for the prefix cache hits counter (in tokens)
	PrefixCacheHits *int64 `yaml:"prefix-cache-hits" json:"prefix-cache-hits,omitempty"`
	// PrefixCacheQueries is the initial value for the prefix cache queries counter (in tokens)
	PrefixCacheQueries *int64 `yaml:"prefix-cache-queries" json:"prefix-cache-queries,omitempty"`
}

type FunctionInfo added in v0.8.0

// FunctionInfo describes a function used by a function-based fake metric
// (see FakeMetricWithFunction.Function).
type FunctionInfo struct {
	// Name is the function's name
	// NOTE(review): presumably one of OscillateFuncName, RampFuncName,
	// RampWithResetFuncName, SquarewaveFuncName — confirm
	Name   string
	// Start and End presumably bound the generated values — TODO confirm
	Start  float64
	End    float64
	// Period is the function's period
	Period time.Duration
}

type LoraModule

// LoraModule describes a single LoRA adapter (see Configuration.LoraModules).
type LoraModule struct {
	// Name is the LoRA's name
	Name string `json:"name"`
	// Path is the LoRA's path
	Path string `json:"path"`
	// BaseModelName is the LoRA's base model
	BaseModelName string `json:"base_model_name"`
}

type LorasMetrics added in v0.4.0

// LorasMetrics describes the running and waiting LoRAs at a given timestamp
// (see FakeMetrics.LoraMetrics).
type LorasMetrics struct {
	// RunningLoras is a comma separated list of running LoRAs
	RunningLoras string `json:"running"`
	// WaitingLoras is a comma separated list of waiting LoRAs
	WaitingLoras string `json:"waiting"`
	// Timestamp is the timestamp of the metric
	Timestamp float64 `json:"timestamp"`
}

type MetricInfo added in v0.8.0

// MetricInfo contains a metric's update value to pass through the corresponding channel.
type MetricInfo struct {
	// Value is the value for the metric's update
	Value float64
	// IsFake is true if this is a fake metric, and false if not
	IsFake bool
}

MetricInfo contains metrics update value to pass through the corresponding channel

type Publisher

// Publisher sends events to a ZMQ endpoint. It is created by NewPublisher
// and released with Close.
type Publisher struct {
	// contains filtered or unexported fields
}

Publisher sends events to a ZMQ endpoint.

func NewPublisher

func NewPublisher(ctx context.Context, endpoint string) (*Publisher, error)

NewPublisher creates a new ZMQ publisher. endpoint is the ZMQ address to bind to (e.g., "tcp://*:5557").

func (*Publisher) Close

func (p *Publisher) Close() error

Close closes the publisher and cleans up resources.

func (*Publisher) PublishEvent

func (p *Publisher) PublishEvent(ctx context.Context, topic string, batch interface{}) error

PublishEvent publishes a KV cache event batch to the ZMQ topic. topic should include the pod identifier (e.g., "kv.pod1").

type Random added in v0.6.0

// Random is a generator of random values (booleans, numbers, strings, UUIDs).
// It is created by NewRandom from a seed and a port.
type Random struct {
	// contains filtered or unexported fields
}

func NewRandom added in v0.6.0

func NewRandom(seed int64, port int) *Random

func (*Random) FlipCoin added in v0.6.0

func (r *Random) FlipCoin() bool

FlipCoin returns true or false randomly.

func (*Random) GenerateUUIDString added in v0.6.0

func (r *Random) GenerateUUIDString() string

GenerateUUIDString generates a UUID string under a lock

func (*Random) RandomBool added in v0.6.0

func (r *Random) RandomBool(probability int) bool

probability is an integer between 0 and 100

func (*Random) RandomFloat added in v0.6.0

func (r *Random) RandomFloat(min float64, max float64) float64

Returns a random float64 in the range [min, max)

func (*Random) RandomInt added in v0.6.0

func (r *Random) RandomInt(min int, max int) int

Returns an integer between min and max (included)

func (*Random) RandomNorm added in v0.6.0

func (r *Random) RandomNorm(mean int, stddev int) float64

Returns a normally distributed float64

func (*Random) RandomNormDuration added in v0.7.0

func (r *Random) RandomNormDuration(mean, stddev time.Duration) time.Duration

func (*Random) RandomNormTruncated added in v0.6.0

func (r *Random) RandomNormTruncated(mean int, stddev int) int

RandomNormTruncated returns a normally distributed int. If the generated value differs by more than 70% from mean, the returned value will be 70% of mean.

func (*Random) RandomNumericString added in v0.6.0

func (r *Random) RandomNumericString(length int) string

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL