Documentation
¶
Index ¶
- Constants
- Variables
- func BuildStubEmbedding(tokens []uint32, dim int) []float32
- func CreateSub(ctx context.Context, topic string) (zmq4.Socket, string)
- func FinishReason(maxTokens *int64, respLen int) string
- func GenerateChatLogprobs(tokens []string, topLogprobsCount int) *openaiserverapi.ChatLogprobs
- func GenerateSingleTokenChatLogprobs(token string, tokenPosition int, topLogprobsCount int) *openaiserverapi.LogprobsContent
- func GenerateSingleTokenTextLogprobs(token string, tokenPosition int, logprobsCount int) *openaiserverapi.TextLogprobs
- func GenerateTextLogprobs(tokens []string, logprobsCount int) *openaiserverapi.TextLogprobs
- func MaxIntSlice(numbers []int) (int, error)
- func NewSub(ctx context.Context) zmq4.Socket
- func StartSub(sub zmq4.Socket, endpoint string, topic string) string
- func ValidateContextWindow(promptTokens int, maxCompletionTokens *int64, maxModelLen int) (bool, int64, int64)
- func WriteToChannel[T any](channel Channel[T], object T, logger logr.Logger)
- type Channel
- type Configuration
- type Duration
- type FakeMetricWithFunction
- type FakeMetrics
- type FunctionInfo
- type LoraModule
- type LorasMetrics
- type MetricInfo
- type Publisher
- type Random
- func (r *Random) FlipCoin() bool
- func (r *Random) GenerateUUIDString() string
- func (r *Random) RandomBool(probability int) bool
- func (r *Random) RandomFloat(min float64, max float64) float64
- func (r *Random) RandomInt(min int, max int) int
- func (r *Random) RandomNorm(mean int, stddev int) float64
- func (r *Random) RandomNormDuration(mean, stddev time.Duration) time.Duration
- func (r *Random) RandomNormTruncated(mean int, stddev int) int
- func (r *Random) RandomNumericString(length int) string
Constants ¶
const ( ModeRandom = "random" ModeEcho = "echo" // Failure type constants FailureTypeRateLimit = "rate_limit" FailureTypeInvalidAPIKey = "invalid_api_key" FailureTypeContextLength = "context_length" FailureTypeServerError = "server_error" FailureTypeInvalidRequest = "invalid_request" FailureTypeModelNotFound = "model_not_found" StopFinishReason = "stop" LengthFinishReason = "length" ToolsFinishReason = "tool_calls" RemoteDecodeFinishReason = "remote_decode" CacheThresholdFinishReason = "cache_threshold" DefaultLatencyCalculator = "" ConstantLatencyCalculator = "constant" PerPromptTokenLatencyCalculator = "per-token" DefaultDSTableName = "llmd" )
const ( OscillateFuncName = "oscillate" RampFuncName = "ramp" RampWithResetFuncName = "rampreset" SquarewaveFuncName = "squarewave" )
const ( PodNameEnv = "POD_NAME" PodNsEnv = "POD_NAMESPACE" )
const ( QwenModelName = "Qwen/Qwen2-0.5B" TestModelName = "testmodel" )
constants
const InvalidMaxTokensErrMsg = "Max completion tokens and max tokens should be positive"
Variables ¶
var RequestLatencyBucketsBoundaries = []float64{0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0}
var TPOTBucketsBoundaries = []float64{0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0}
var TTFTBucketsBoundaries = []float64{0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0,
2560.0}
Definition of buckets for the time-to-first-token, time-per-output-token, and end-to-end request latency metrics; each value is the upper boundary of a bucket
Functions ¶
func BuildStubEmbedding ¶ added in v0.8.0
BuildStubEmbedding returns a deterministic embedding of length dim from token ids (simulator stub).
func CreateSub ¶ added in v0.7.0
CreateSub creates a ZMQ sub, subscribes to the provided topic, and returns the sub and the endpoint to publish events on
func FinishReason ¶ added in v0.7.0
FinishReason returns finish reason based on request's max tokens parameter and the length of the generated response
func GenerateChatLogprobs ¶ added in v0.6.1
func GenerateChatLogprobs(tokens []string, topLogprobsCount int) *openaiserverapi.ChatLogprobs
GenerateChatLogprobs generates synthetic log probabilities for chat completion responses
func GenerateSingleTokenChatLogprobs ¶ added in v0.6.1
func GenerateSingleTokenChatLogprobs(token string, tokenPosition int, topLogprobsCount int) *openaiserverapi.LogprobsContent
GenerateSingleTokenChatLogprobs generates logprobs for a single token in chat completion streaming
func GenerateSingleTokenTextLogprobs ¶ added in v0.6.1
func GenerateSingleTokenTextLogprobs(token string, tokenPosition int, logprobsCount int) *openaiserverapi.TextLogprobs
GenerateSingleTokenTextLogprobs generates logprobs for a single token in text completion streaming
func GenerateTextLogprobs ¶ added in v0.6.1
func GenerateTextLogprobs(tokens []string, logprobsCount int) *openaiserverapi.TextLogprobs
GenerateTextLogprobs generates synthetic log probabilities for text completion responses
func MaxIntSlice ¶ added in v0.7.0
MaxIntSlice receives a slice of ints, returns the maximum value in the slice if it is not empty, and an error if the slice is empty
func StartSub ¶ added in v0.8.0
StartSub starts the given sub on a random port and subscribes it to the given topic. Returns the real endpoint to publish events on.
Types ¶
type Configuration ¶
type Configuration struct {
// IP defines on which IP the simulator runs, loaded from env
IP string
// Port defines on which port the simulator runs
Port int `yaml:"port" json:"port"`
// Model defines the current base model name
Model string `yaml:"model" json:"model"`
// ServedModelNames is one or many model names exposed by the API
ServedModelNames []string `yaml:"served-model-name" json:"served-model-name"`
// MaxLoras defines maximum number of loaded LoRAs
MaxLoras int `yaml:"max-loras" json:"max-loras"`
// MaxCPULoras defines maximum number of LoRAs to store in CPU memory
MaxCPULoras int `yaml:"max-cpu-loras" json:"max-cpu-loras"`
// MaxNumSeqs is maximum number of sequences per iteration (the maximum
// number of inference requests that could be processed at the same time)
MaxNumSeqs int `yaml:"max-num-seqs" json:"max-num-seqs"`
// MaxWaitingQueueLength defines maximum size of waiting requests queue
MaxWaitingQueueLength int `yaml:"max-waiting-queue-length" json:"max-waiting-queue-length"`
// MaxModelLen is the model's context window, the maximum number of tokens
// in a single request including input and output. Default value is 1024.
MaxModelLen int `yaml:"max-model-len" json:"max-model-len"`
// LoraModulesString is a list of LoRA adapters as strings
LoraModulesString []string `yaml:"lora-modules" json:"lora-modules"`
// LoraModules is a list of LoRA adapters
LoraModules []LoraModule
// PodNameSpace specifies the Kubernetes namespace in which the simulator pod is running.
// Useful for multi-namespace deployments and resource scoping.
// Set by env variable POD_NAMESPACE
PodNameSpace string
// PodName specifies the name of the pod running the simulator instance.
// Used for identification in Kubernetes environments.
// Set by env variable POD_NAME
PodName string
// VllmDevMode enables development mode for the vLLM simulator
// Allowing for additional debugging features during local development and testing.
// Set by env variable VLLM_SERVER_DEV_MODE
VllmDevMode bool
// TimeToFirstToken time before the first token will be returned
TimeToFirstToken Duration `yaml:"time-to-first-token" json:"time-to-first-token"`
// TimeToFirstTokenStdDev standard deviation for time before the first token will be returned
// optional, default is 0, can't be more than 30% of TimeToFirstToken, will not
// cause the actual time to first token to differ by more than 70% from TimeToFirstToken
TimeToFirstTokenStdDev Duration `yaml:"time-to-first-token-std-dev" json:"time-to-first-token-std-dev"`
// InterTokenLatency time between generated tokens
InterTokenLatency Duration `yaml:"inter-token-latency" json:"inter-token-latency"`
// InterTokenLatencyStdDev standard deviation for time between generated tokens
// optional, default is 0, can't be more than 30% of InterTokenLatency, will not cause the actual
// inter token latency to differ by more than 70% from InterTokenLatency
InterTokenLatencyStdDev Duration `yaml:"inter-token-latency-std-dev" json:"inter-token-latency-std-dev"`
// KVCacheTransferLatency time to "transfer" kv-cache from another vLLM instance in case P/D is activated,
KVCacheTransferLatency Duration `yaml:"kv-cache-transfer-latency" json:"kv-cache-transfer-latency"`
// KVCacheTransferLatencyStdDev standard deviation for time to "transfer" kv-cache from another
// vLLM instance in case P/D is activated, can't be more than 30% of KVCacheTransferLatency, will not
// cause the actual latency to differ by more than 70% from KVCacheTransferLatency
KVCacheTransferLatencyStdDev Duration `yaml:"kv-cache-transfer-latency-std-dev" json:"kv-cache-transfer-latency-std-dev"`
// $Total Prefill Time = PrefillOverhead + n * PrefillTimePerToken$
// the assumption is that n is less than k, where k is the number of parallelism units of GPU
// PrefillOverhead time taken to prefill the context
PrefillOverhead Duration `yaml:"prefill-overhead" json:"prefill-overhead"`
PrefillTimePerToken Duration `yaml:"prefill-time-per-token" json:"prefill-time-per-token"`
// PrefillTimeStdDev is the standard deviation for the prefill time, similar to TimeToFirstTokenStdDev
PrefillTimeStdDev Duration `yaml:"prefill-time-std-dev" json:"prefill-time-std-dev"`
// $Total KV Cache Transfer Time = n * KVCacheTransferTimePerToken$
// the assumption is that the cache blocks are all missed at the remote pod
// KVCacheTransfer overhead time taken to transfer kv-cache from another vLLM instance in case P/D is activated
KVCacheTransferTimePerToken Duration `yaml:"kv-cache-transfer-time-per-token" json:"kv-cache-transfer-time-per-token"`
// KVCacheTransferTimeStdDev is the standard deviation for the kv-cache transfer time, similar to TimeToFirstTokenStdDev
KVCacheTransferTimeStdDev Duration `yaml:"kv-cache-transfer-time-std-dev" json:"kv-cache-transfer-time-std-dev"`
// TimeFactorUnderLoad is a multiplicative factor that affects the overall time taken for requests when parallel
// requests are being processed.
// The value of this factor must be >= 1.0, with a default of 1.0.
// - If this factor is 1.0, no extra time is added.
// - When the factor is x (where x > 1.0) and there are MaxNumSeqs requests, the total time will be multiplied by x.
// - The extra time then decreases multiplicatively to 1.0 when the number of requests is less than MaxNumSeqs.
TimeFactorUnderLoad float64 `yaml:"time-factor-under-load" json:"time-factor-under-load"`
// Mode defines the simulator response generation mode, valid values: echo, random
Mode string `yaml:"mode" json:"mode"`
// Seed defines random seed for operations
Seed int64 `yaml:"seed" json:"seed"`
// MaxToolCallIntegerParam defines the maximum possible value of integer parameters in a tool call,
// optional, defaults to 100
MaxToolCallIntegerParam int `yaml:"max-tool-call-integer-param" json:"max-tool-call-integer-param"`
// MinToolCallIntegerParam defines the minimum possible value of integer parameters in a tool call,
// optional, defaults to 0
MinToolCallIntegerParam int `yaml:"min-tool-call-integer-param" json:"min-tool-call-integer-param"`
// MaxToolCallNumberParam defines the maximum possible value of number (float) parameters in a tool call,
// optional, defaults to 100
MaxToolCallNumberParam float64 `yaml:"max-tool-call-number-param" json:"max-tool-call-number-param"`
// MinToolCallNumberParam defines the minimum possible value of number (float) parameters in a tool call,
// optional, defaults to 0
MinToolCallNumberParam float64 `yaml:"min-tool-call-number-param" json:"min-tool-call-number-param"`
// MaxToolCallArrayParamLength defines the maximum possible length of array parameters in a tool call,
// optional, defaults to 5
MaxToolCallArrayParamLength int `yaml:"max-tool-call-array-param-length" json:"max-tool-call-array-param-length"`
// MinToolCallArrayParamLength defines the minimum possible length of array parameters in a tool call,
// optional, defaults to 1
MinToolCallArrayParamLength int `yaml:"min-tool-call-array-param-length" json:"min-tool-call-array-param-length"`
// ToolCallNotRequiredParamProbability is the probability to add a parameter, that is not required,
// in a tool call, optional, defaults to 50
ToolCallNotRequiredParamProbability int `yaml:"tool-call-not-required-param-probability" json:"tool-call-not-required-param-probability"`
// ObjectToolCallNotRequiredParamProbability is the probability to add a field, that is not required,
// in an object in a tool call, optional, defaults to 50
ObjectToolCallNotRequiredParamProbability int `yaml:"object-tool-call-not-required-field-probability" json:"object-tool-call-not-required-field-probability"`
// EnableKVCache defines if kv cache feature will be enabled
EnableKVCache bool `yaml:"enable-kvcache" json:"enable-kvcache"`
// KVCacheSize is the maximum number of token blocks in kv cache, the default value is 1024
KVCacheSize int `yaml:"kv-cache-size" json:"kv-cache-size"`
// GlobalCacheHitThreshold is the default cache hit threshold (0-1] for all requests.
// If a request specifies cache_hit_threshold, it takes precedence over this global value.
GlobalCacheHitThreshold float64 `yaml:"global-cache-hit-threshold" json:"global-cache-hit-threshold"`
// TokenBlockSize is token block size for contiguous chunks of tokens, possible values: 8,16,32,64,128, defaults to 16
TokenBlockSize int `yaml:"block-size" json:"block-size"`
// HashSeed is the seed for hash generation (if not set, is read from PYTHONHASHSEED environment variable)
HashSeed string `yaml:"hash-seed" json:"hash-seed"`
// ZMQEndpoint is the ZMQ address to publish events, the default value is tcp://localhost:5557
ZMQEndpoint string `yaml:"zmq-endpoint" json:"zmq-endpoint"`
// EventBatchSize is the maximum number of kv-cache events to be sent together, defaults to 16
EventBatchSize int `yaml:"event-batch-size" json:"event-batch-size"`
// FakeMetrics is a set of metrics to send to Prometheus instead of the real data
FakeMetrics *FakeMetrics `yaml:"fake-metrics" json:"fake-metrics"`
// FakeMetricsRefreshInterval defines how often function-based fake metrics are recalculated, defaults to 100ms
FakeMetricsRefreshInterval time.Duration `yaml:"fake-metrics-refresh-interval" json:"fake-metrics-refresh-interval"`
// FailureInjectionRate is the probability (0-100) of injecting failures
FailureInjectionRate int `yaml:"failure-injection-rate" json:"failure-injection-rate"`
// FailureTypes is a list of specific failure types to inject (empty means all types)
FailureTypes []string `yaml:"failure-types" json:"failure-types"`
// DPSize is data parallel size - a number of ranks to run, minimum is 1, maximum is 8, default is 1
DPSize int `yaml:"data-parallel-size" json:"data-parallel-size"`
// Rank is the vLLM parameter used to specify the rank of this instance. Here only
// used when running Data Parallel ranks as separate processes. If set, data-parallel-size is ignored
Rank int `yaml:"data-parallel-rank" json:"data-parallel-rank"`
// SSLCertFile is the path to the SSL certificate file for HTTPS
SSLCertFile string `yaml:"ssl-certfile" json:"ssl-certfile"`
// SSLKeyFile is the path to the SSL private key file for HTTPS
SSLKeyFile string `yaml:"ssl-keyfile" json:"ssl-keyfile"`
// SelfSignedCerts enables automatic generation of self-signed certificates for HTTPS
SelfSignedCerts bool `yaml:"self-signed-certs" json:"self-signed-certs"`
// DatasetPath Optional local file path to the SQLite database file used for generating responses from a dataset.
// - If not set, hardcoded preset responses will be used.
// - If set but the file does not exist the `dataset-url` will be used to download the database to the path specified by `dataset-path`.
// - If the file exists but is currently occupied by another process, responses will be randomly generated from preset text (the same behavior as if the path were not set).
// - Responses are retrieved from the dataset by the hash of the conversation history, with a fallback to a random dataset response, constrained by the maximum output tokens and EoS token handling, if no matching history is found.
// - Refer to [llm-d converted ShareGPT](https://huggingface.co/datasets/hf07397/inference-sim-datasets/blob/0b7ac1a4daf0aace1556326964bd75633372299e/README.md) for detailed information on the expected format of the SQLite database file.
DatasetPath string `yaml:"dataset-path" json:"dataset-path"`
// DatasetURL Optional URL for downloading the SQLite database file used for response generation.
// - This parameter is only used if the `dataset-path` is also set and the file does not exist at that path.
// - If the file needs to be downloaded, it will be saved to the location specified by `dataset-path`.
// - If the file already exists at the `dataset-path`, it will not be downloaded again
// - Example URL `https://huggingface.co/datasets/hf07397/inference-sim-datasets/resolve/91ffa7aafdfd6b3b1af228a517edc1e8f22cd274/huggingface/ShareGPT_Vicuna_unfiltered/conversations.sqlite3`
DatasetURL string `yaml:"dataset-url" json:"dataset-url"`
// DatasetInMemory defines whether to load the entire dataset into memory for faster access.
DatasetInMemory bool `yaml:"dataset-in-memory" json:"dataset-in-memory"`
// DatasetTableName defines custom SQLite dataset table name
DatasetTableName string `yaml:"dataset-table-name" json:"dataset-table-name"`
// UDSSocketPath is the tokenizer UDS socket path
UDSSocketPath string `yaml:"uds-socket-path" json:"uds-socket-path"`
// EnableSleepMode enables sleep mode
EnableSleepMode bool `yaml:"enable-sleep-mode" json:"enable-sleep-mode"`
// EnableRequestIDHeaders enables including X-Request-Id header in responses
EnableRequestIDHeaders bool `yaml:"enable-request-id-headers" json:"enable-request-id-headers"`
// LatencyCalculator is the name of the latency calculator to use in the simulation of the response latencies.
// The default calculation is based on the current load of the simulator and on the configured latency
// parameters, e.g., time-to-first-token and prefill-time-per-token.
LatencyCalculator string `yaml:"latency-calculator" json:"latency-calculator"`
// DefaultEmbeddingDimensions is the default size of embedding vectors when the request does not specify dimensions.
// Used by the /v1/embeddings endpoint. Default is 384.
DefaultEmbeddingDimensions int `yaml:"default-embedding-dimensions" json:"default-embedding-dimensions"`
// MMEncoderOnly defines whether to skip the language component of the model.
MMEncoderOnly bool `yaml:"mm-encoder-only" json:"mm-encoder-only"`
// Ignored parameters:
// MMProcessorKWArgs defines arguments to be forwarded to the model's processor for multi-modal data.
// Ignored in the simulator.
MMProcessorKWArgs string `yaml:"mm-processor-kwargs" json:"mm-processor-kwargs"`
// ECTransferConfig defines the configurations for distributed EC cache transfer.
// Ignored in the simulator.
ECTransferConfig string `yaml:"ec-transfer-config" json:"ec-transfer-config"`
// EnforceEager defines whether to always use eager-mode PyTorch.
// Ignored in the simulator.
EnforceEager bool `yaml:"enforce-eager" json:"enforce-eager"`
// EnablePrefixCaching defines whether to enable prefix caching.
// Ignored in the simulator.
EnablePrefixCaching bool `yaml:"enable-prefix-caching" json:"enable-prefix-caching"`
}
func ParseCommandParamsAndLoadConfig ¶
func ParseCommandParamsAndLoadConfig() (*Configuration, error)
ParseCommandParamsAndLoadConfig loads configuration, parses command line parameters, merges the values (command line values overwrite the config file ones), and validates the configuration
func (*Configuration) Copy ¶ added in v0.5.0
func (c *Configuration) Copy() (*Configuration, error)
func (*Configuration) SSLEnabled ¶ added in v0.5.1
func (c *Configuration) SSLEnabled() bool
SSLEnabled returns true if SSL is enabled either via certificate files or self-signed certificates
type Duration ¶ added in v0.7.0
Duration wraps time.Duration. It is used to parse the custom duration format from YAML.
func (*Duration) Milliseconds ¶ added in v0.7.0
func (*Duration) ToDuration ¶ added in v0.7.0
type FakeMetricWithFunction ¶ added in v0.8.0
type FakeMetricWithFunction struct {
FixedValue float64
Function *FunctionInfo
IsFunction bool
}
func (*FakeMetricWithFunction) UnmarshalJSON ¶ added in v0.8.0
func (f *FakeMetricWithFunction) UnmarshalJSON(data []byte) error
func (*FakeMetricWithFunction) UnmarshalYAML ¶ added in v0.8.0
func (f *FakeMetricWithFunction) UnmarshalYAML(value *yaml.Node) error
type FakeMetrics ¶ added in v0.8.0
type FakeMetrics struct {
// LoraMetrics
LoraMetrics []LorasMetrics `json:"loras"`
LorasString []string `yaml:"loras"`
// RunningRequests is the number of inference requests that are currently being processed
RunningRequests FakeMetricWithFunction `yaml:"running-requests" json:"running-requests"`
// WaitingRequests is the number of inference requests that are waiting to be processed
WaitingRequests FakeMetricWithFunction `yaml:"waiting-requests" json:"waiting-requests"`
// KVCacheUsagePercentage is the fraction of KV-cache blocks currently in use (from 0 to 1)
KVCacheUsagePercentage FakeMetricWithFunction `yaml:"kv-cache-usage" json:"kv-cache-usage"`
// TTFTBucketValues is an array of values for time-to-first-token buckets.
// Buckets upper boundaries in seconds are:
// 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
// 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0, 2560.0, +Inf
TTFTBucketValues []int `yaml:"ttft-buckets-values" json:"ttft-buckets-values"`
// TPOTBucketValues is an array of values for time-per-output-token buckets.
// Buckets upper boundaries in seconds are:
// 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
// 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, +Inf
TPOTBucketValues []int `yaml:"tpot-buckets-values" json:"tpot-buckets-values"`
// RequestPromptTokens, RequestGenerationTokens, and RequestParamsMaxTokens are histogram fake-observation arrays used at initialization.
// Each value in these arrays is passed to Observe() exactly once at startup.
// By default:
// - The sum of RequestPromptTokens initializes the metric vllm:prompt_tokens_total.
// - The sum of RequestGenerationTokens initializes the metric vllm:generation_tokens_total.
//
// If TotalPromptTokens or TotalGenerationTokens are explicitly provided,
// they override the above sums and are used directly as the initial total token counts.
RequestPromptTokens []int `yaml:"request-prompt-tokens" json:"request-prompt-tokens"` // prompt-length samples
RequestGenerationTokens []int `yaml:"request-generation-tokens" json:"request-generation-tokens"` // generation-length samples
RequestParamsMaxTokens []int `yaml:"request-params-max-tokens" json:"request-params-max-tokens"` // max_tokens parameter samples
RequestMaxGenerationTokens []int `yaml:"request-max-generation-tokens" json:"request-max-generation-tokens"` // request_max_num_generation_tokens samples
// RequestSuccessTotal is the number of successful requests, key: finish-reason (stop, length, etc.).
RequestSuccessTotal map[string]int64 `yaml:"request-success-total" json:"request-success-total"`
// TotalPromptTokens is the total number of prompt tokens processed
TotalPromptTokens *int64 `json:"total-prompt-tokens,omitempty"`
// TotalGenerationTokens is the total number of generated tokens
TotalGenerationTokens *int64 `json:"total-generation-tokens,omitempty"`
// E2ERequestLatencyBucketValues is an array of values for e2e request latency buckets.
E2ERequestLatencyBucketValues []int `yaml:"e2erl-buckets-values" json:"e2erl-buckets-values"`
// ReqQueueTimeBucketValues is an array of values for request queue time buckets.
ReqQueueTimeBucketValues []int `yaml:"queue-time-buckets-values" json:"queue-time-buckets-values"`
// ReqInfTimeBucketValues is an array of values for request inference time buckets.
ReqInfTimeBucketValues []int `yaml:"inf-time-buckets-values" json:"inf-time-buckets-values"`
// ReqPrefillTimeBucketValues is an array of values for request prefill time buckets.
ReqPrefillTimeBucketValues []int `yaml:"prefill-time-buckets-values" json:"prefill-time-buckets-values"`
// ReqDecodeTimeBucketValues is an array of values for request decode time buckets.
ReqDecodeTimeBucketValues []int `yaml:"decode-time-buckets-values" json:"decode-time-buckets-values"`
// PrefixCacheHits is the initial value for the prefix cache hits counter (in tokens)
PrefixCacheHits *int64 `yaml:"prefix-cache-hits" json:"prefix-cache-hits,omitempty"`
// PrefixCacheQueries is the initial value for the prefix cache queries counter (in tokens)
PrefixCacheQueries *int64 `yaml:"prefix-cache-queries" json:"prefix-cache-queries,omitempty"`
}
type FunctionInfo ¶ added in v0.8.0
type LoraModule ¶
type LorasMetrics ¶ added in v0.4.0
type LorasMetrics struct {
// RunningLoras is a comma separated list of running LoRAs
RunningLoras string `json:"running"`
// WaitingLoras is a comma separated list of waiting LoRAs
WaitingLoras string `json:"waiting"`
// Timestamp is the timestamp of the metric
Timestamp float64 `json:"timestamp"`
}
type MetricInfo ¶ added in v0.8.0
type MetricInfo struct {
// Value is the value for metric's update
Value float64
// IsFake is true if this is a fake metric, and false if not
IsFake bool
}
MetricInfo contains metrics update value to pass through the corresponding channel
type Publisher ¶
type Publisher struct {
// contains filtered or unexported fields
}
Publisher sends events to a ZMQ endpoint.
func NewPublisher ¶
NewPublisher creates a new ZMQ publisher. endpoint is the ZMQ address to bind to (e.g., "tcp://*:5557"). retries is the maximum number of connection attempts.
type Random ¶ added in v0.6.0
type Random struct {
// contains filtered or unexported fields
}
func (*Random) GenerateUUIDString ¶ added in v0.6.0
GenerateUUIDString generates a UUID string under a lock
func (*Random) RandomBool ¶ added in v0.6.0
probability is an integer between 0 and 100
func (*Random) RandomFloat ¶ added in v0.6.0
Returns a random float64 in the range [min, max)
func (*Random) RandomNorm ¶ added in v0.6.0
Returns a normally distributed float64
func (*Random) RandomNormDuration ¶ added in v0.7.0
func (*Random) RandomNormTruncated ¶ added in v0.6.0
Returns a normally distributed int If the generated value differs by more than 70% from mean, the returned value will be 70% of mean