Documentation ¶
Overview ¶
Package evaluation provides an automated evaluation framework for AI agents. Validates: Requirements 9.2, 9.4, 9.5, 9.6
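A minimal end-to-end sketch (the module import path and the echo agent are illustrative assumptions, not part of this package):

package main

import (
	"context"
	"fmt"

	"go.uber.org/zap"

	"example.com/yourmodule/evaluation" // hypothetical import path
)

// echoAgent is a stub AgentExecutor that returns its input unchanged.
type echoAgent struct{}

func (echoAgent) Execute(ctx context.Context, input string) (string, int, error) {
	return input, len(input), nil
}

func main() {
	suite := &evaluation.EvalSuite{
		ID:   "smoke",
		Name: "Smoke tests",
		Tasks: []evaluation.EvalTask{
			{ID: "t1", Input: "hello", Expected: "hello"},
		},
	}

	e := evaluation.NewEvaluator(evaluation.DefaultEvaluatorConfig(), zap.NewNop())
	report, err := e.Evaluate(context.Background(), suite, echoAgent{})
	if err != nil {
		panic(err)
	}
	fmt.Printf("pass rate: %.2f\n", report.Summary.PassRate)
}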
Index ¶
- Constants
- Variables
- func RegisterBuiltinMetrics(registry *MetricRegistry)
- type ABTester
- func (t *ABTester) Analyze(ctx context.Context, experimentID string) (*ExperimentResult, error)
- func (t *ABTester) Assign(experimentID, userID string) (*Variant, error)
- func (t *ABTester) AutoSelectWinner(ctx context.Context, experimentID string, minConfidence float64) (*Variant, error)
- func (t *ABTester) CompleteExperiment(experimentID string) error
- func (t *ABTester) CreateExperiment(exp *Experiment) error
- func (t *ABTester) DeleteExperiment(experimentID string) error
- func (t *ABTester) GenerateReport(ctx context.Context, experimentID string) (*StatisticalReport, error)
- func (t *ABTester) GetExperiment(experimentID string) (*Experiment, error)
- func (t *ABTester) ListExperiments() []*Experiment
- func (t *ABTester) PauseExperiment(experimentID string) error
- func (t *ABTester) RecordResult(experimentID, variantID string, result *EvalResult) error
- func (t *ABTester) StartExperiment(experimentID string) error
- type AccuracyMetric
- type AgentExecutor
- type AggregatedJudgeResult
- type Alert
- type AlertHandler
- type AlertLevel
- type AlertThreshold
- type BatchEvalReport
- type ContainsScorer
- type CostMetric
- type DimensionScore
- type EvalInput
- type EvalOutput
- type EvalReport
- type EvalResult
- type EvalSuite
- type EvalSummary
- type EvalTask
- type Evaluator
- func (e *Evaluator) AddAlertHandler(handler AlertHandler)
- func (e *Evaluator) ClearAlerts()
- func (e *Evaluator) Evaluate(ctx context.Context, suite *EvalSuite, agent AgentExecutor) (*EvalReport, error)
- func (e *Evaluator) EvaluateBatch(ctx context.Context, suites []*EvalSuite, agent AgentExecutor) ([]*EvalReport, error)
- func (e *Evaluator) GenerateReport(reports []*EvalReport) *BatchEvalReport
- func (e *Evaluator) GetAlerts() []Alert
- func (e *Evaluator) RegisterScorer(taskType string, scorer Scorer)
- func (e *Evaluator) SetMetricRegistry(registry *MetricRegistry)
- type EvaluatorConfig
- type ExactMatchScorer
- type Experiment
- type ExperimentResult
- type ExperimentStatus
- type ExperimentStore
- type InputOutputPair
- type JSONScorer
- type JudgeDimension
- type JudgeResult
- type LLMJudge
- func (j *LLMJudge) AggregateResults(results []*JudgeResult) *AggregatedJudgeResult
- func (j *LLMJudge) GetConfig() LLMJudgeConfig
- func (j *LLMJudge) Judge(ctx context.Context, input *EvalInput, output *EvalOutput) (*JudgeResult, error)
- func (j *LLMJudge) JudgeBatch(ctx context.Context, pairs []InputOutputPair) ([]*JudgeResult, error)
- type LLMJudgeConfig
- type LatencyMetric
- type MemoryExperimentStore
- func (s *MemoryExperimentStore) DeleteExperiment(ctx context.Context, id string) error
- func (s *MemoryExperimentStore) GetAssignment(ctx context.Context, experimentID, userID string) (string, error)
- func (s *MemoryExperimentStore) GetAssignmentCount(experimentID string) map[string]int
- func (s *MemoryExperimentStore) GetResultCount(experimentID string) map[string]int
- func (s *MemoryExperimentStore) GetResults(ctx context.Context, experimentID string) (map[string][]*EvalResult, error)
- func (s *MemoryExperimentStore) ListExperiments(ctx context.Context) ([]*Experiment, error)
- func (s *MemoryExperimentStore) LoadExperiment(ctx context.Context, id string) (*Experiment, error)
- func (s *MemoryExperimentStore) RecordAssignment(ctx context.Context, experimentID, userID, variantID string) error
- func (s *MemoryExperimentStore) RecordResult(ctx context.Context, experimentID, variantID string, result *EvalResult) error
- func (s *MemoryExperimentStore) SaveExperiment(ctx context.Context, exp *Experiment) error
- type Metric
- type MetricEvalResult
- type MetricRegistry
- type Scorer
- type StatisticalReport
- type TokenUsageMetric
- type Variant
- type VariantComparison
- type VariantReport
- type VariantResult
Constants ¶
const DefaultPromptTemplate = `` /* 1118-byte string literal not displayed */
DefaultPromptTemplate is the default evaluation prompt template. Validates: Requirements 10.2
Variables ¶
var (
	ErrExperimentNotFound  = errors.New("experiment not found")
	ErrExperimentNotActive = errors.New("experiment not active")
	ErrNoVariants          = errors.New("no variants defined")
	ErrInvalidWeights      = errors.New("invalid variant weights")
	ErrVariantNotFound     = errors.New("variant not found")
)
Errors related to A/B testing.
Functions ¶
func RegisterBuiltinMetrics ¶
func RegisterBuiltinMetrics(registry *MetricRegistry)
RegisterBuiltinMetrics registers all built-in metrics with the registry.
Types ¶
type ABTester ¶
type ABTester struct {
// contains filtered or unexported fields
}
ABTester runs A/B tests. Validates: Requirements 11.1, 11.2, 11.3, 11.5
func NewABTester ¶
func NewABTester(store ExperimentStore, logger *zap.Logger) *ABTester
NewABTester creates an A/B tester.
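A minimal sketch of the experiment lifecycle using the in-memory store; the metric name "accuracy" and sticky per-user assignment are assumptions:

tester := NewABTester(NewMemoryExperimentStore(), zap.NewNop())

exp := &Experiment{
	ID:   "prompt-v2",
	Name: "Prompt rewrite",
	Variants: []Variant{
		{ID: "control", Name: "Current prompt", Weight: 0.5, IsControl: true},
		{ID: "treatment", Name: "New prompt", Weight: 0.5},
	},
	Metrics: []string{"accuracy"}, // assumed metric name
}

if err := tester.CreateExperiment(exp); err != nil {
	// handle error
}
if err := tester.StartExperiment(exp.ID); err != nil {
	// handle error
}

v, _ := tester.Assign(exp.ID, "user-123") // presumably sticky per user, given the store's GetAssignment
_ = tester.RecordResult(exp.ID, v.ID, &EvalResult{TaskID: "t1", Success: true, Score: 0.9})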
func (*ABTester) AutoSelectWinner ¶
func (t *ABTester) AutoSelectWinner(ctx context.Context, experimentID string, minConfidence float64) (*Variant, error)
AutoSelectWinner automatically selects the winning variant configuration when statistical significance is detected. Validates: Requirements 11.6
func (*ABTester) CompleteExperiment ¶
func (t *ABTester) CompleteExperiment(experimentID string) error
CompleteExperiment marks an experiment as completed.
func (*ABTester) CreateExperiment ¶
func (t *ABTester) CreateExperiment(exp *Experiment) error
CreateExperiment creates an experiment. Validates: Requirements 11.1
func (*ABTester) DeleteExperiment ¶
func (t *ABTester) DeleteExperiment(experimentID string) error
DeleteExperiment deletes an experiment.
func (*ABTester) GenerateReport ¶
func (t *ABTester) GenerateReport(ctx context.Context, experimentID string) (*StatisticalReport, error)
GenerateReport generates a comprehensive statistical-significance analysis report. Validates: Requirements 11.4
func (*ABTester) GetExperiment ¶
func (t *ABTester) GetExperiment(experimentID string) (*Experiment, error)
GetExperiment retrieves an experiment.
func (*ABTester) ListExperiments ¶
func (t *ABTester) ListExperiments() []*Experiment
ListExperiments lists all experiments.
func (*ABTester) PauseExperiment ¶
func (t *ABTester) PauseExperiment(experimentID string) error
PauseExperiment pauses an experiment.
func (*ABTester) RecordResult ¶
func (t *ABTester) RecordResult(experimentID, variantID string, result *EvalResult) error
RecordResult records a result. Validates: Requirements 11.3
func (*ABTester) StartExperiment ¶
func (t *ABTester) StartExperiment(experimentID string) error
StartExperiment starts an experiment.
type AccuracyMetric ¶
type AccuracyMetric struct {
// CaseSensitive controls whether matching is case-sensitive
CaseSensitive bool
// TrimWhitespace controls whether leading and trailing whitespace is trimmed
TrimWhitespace bool
// UseContains uses containment matching instead of exact matching
UseContains bool
}
AccuracyMetric computes an accuracy score by comparing the actual output with the expected output. Validates: Requirements 9.3
func (*AccuracyMetric) Compute ¶
func (m *AccuracyMetric) Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)
Compute calculates the accuracy score. Return range: 0.0 - 1.0
- 1.0: exact match
- 0.0 - 1.0: partial match (based on character similarity)
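A minimal sketch of containment matching (inside a function body; the expected score of 1.0 on a hit is inferred from the documentation above):

m := &AccuracyMetric{TrimWhitespace: true, UseContains: true}
score, err := m.Compute(context.Background(),
	&EvalInput{Expected: "42"},
	&EvalOutput{Response: "The answer is 42."},
)
fmt.Println(score, err) // presumably 1 <nil>, since the response contains "42"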
type AgentExecutor ¶
type AgentExecutor interface {
Execute(ctx context.Context, input string) (output string, tokens int, err error)
}
AgentExecutor defines the interface for executing agent tasks.
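Because the interface has a single method, a function adapter is convenient. A sketch (this adapter type is illustrative, not part of the package):

// AgentExecutorFunc adapts an ordinary function to the AgentExecutor interface.
type AgentExecutorFunc func(ctx context.Context, input string) (string, int, error)

func (f AgentExecutorFunc) Execute(ctx context.Context, input string) (string, int, error) {
	return f(ctx, input)
}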
type AggregatedJudgeResult ¶
type AggregatedJudgeResult struct {
Results []*JudgeResult `json:"results"`
AverageScore float64 `json:"average_score"`
ScoreStdDev float64 `json:"score_std_dev"`
NeedsReview bool `json:"needs_review"`
ReviewReason string `json:"review_reason,omitempty"`
DimensionAvgs map[string]float64 `json:"dimension_averages"`
}
AggregatedJudgeResult aggregates multiple judge results. Validates: Requirements 10.5
type Alert ¶
type Alert struct {
Level AlertLevel `json:"level"`
MetricName string `json:"metric_name"`
Threshold float64 `json:"threshold"`
Actual float64 `json:"actual"`
Message string `json:"message"`
TaskID string `json:"task_id,omitempty"`
Timestamp time.Time `json:"timestamp"`
}
Alert represents an evaluation alert triggered when metrics exceed thresholds. Validates: Requirements 9.6
type AlertHandler ¶
type AlertHandler func(alert *Alert)
AlertHandler is called when an alert is triggered.
type AlertLevel ¶
type AlertLevel string
AlertLevel defines the severity of an alert.
const (
	AlertLevelInfo     AlertLevel = "info"
	AlertLevelWarning  AlertLevel = "warning"
	AlertLevelCritical AlertLevel = "critical"
)
type AlertThreshold ¶
type AlertThreshold struct {
MetricName string `json:"metric_name"`
Operator string `json:"operator"` // "gt", "lt", "gte", "lte", "eq"
Value float64 `json:"value"`
Level AlertLevel `json:"level"`
Message string `json:"message,omitempty"`
}
AlertThreshold defines a threshold for triggering alerts.
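A sketch of wiring thresholds into the evaluator config; the metric names "latency" and "accuracy" are assumptions and must match what the registered metrics report via Name():

cfg := DefaultEvaluatorConfig()
cfg.EnableAlerts = true
cfg.AlertThresholds = []AlertThreshold{
	{MetricName: "latency", Operator: "gt", Value: 2000, Level: AlertLevelWarning, Message: "slow responses"},
	{MetricName: "accuracy", Operator: "lt", Value: 0.8, Level: AlertLevelCritical},
}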
type BatchEvalReport ¶
type BatchEvalReport struct {
Reports []*EvalReport `json:"reports"`
AggregatedSummary EvalSummary `json:"aggregated_summary"`
Alerts []Alert `json:"alerts,omitempty"`
Timestamp time.Time `json:"timestamp"`
}
BatchEvalReport represents a batch evaluation report. Validates: Requirements 9.5
type ContainsScorer ¶
type ContainsScorer struct{}
ContainsScorer scores based on whether output contains expected.
type CostMetric ¶
type CostMetric struct {
// MaxCost is the maximum cost, used for normalization.
// If set, the returned value is max(0, 1 - cost/maxCost).
// If unset (0), the raw cost is returned.
MaxCost float64
}
CostMetric reports API call cost. Validates: Requirements 9.3
func NewCostMetricWithMax ¶
func NewCostMetricWithMax(maxCost float64) *CostMetric
NewCostMetricWithMax creates a cost metric with a maximum value.
func (*CostMetric) Compute ¶
func (m *CostMetric) Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)
Compute calculates the cost score. If a maximum is set, it returns a normalized score (0.0 - 1.0); otherwise it returns the raw cost.
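A sketch of the documented normalization (inside a function body):

m := NewCostMetricWithMax(0.10) // treat $0.10 as the cost ceiling
score, _ := m.Compute(context.Background(), &EvalInput{}, &EvalOutput{Cost: 0.02})
// score == max(0, 1 - 0.02/0.10) == 0.8 per the documented formula.
// LatencyMetric and TokenUsageMetric document the same normalization scheme.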
type DimensionScore ¶
DimensionScore is a per-dimension score.
type EvalInput ¶
type EvalInput struct {
Prompt string `json:"prompt"`
Context map[string]any `json:"context,omitempty"`
Expected string `json:"expected,omitempty"`
Reference string `json:"reference,omitempty"`
}
EvalInput is the input to an evaluation.
func (*EvalInput) WithContext ¶
WithContext sets the context.
func (*EvalInput) WithExpected ¶
WithExpected sets the expected output.
func (*EvalInput) WithReference ¶
WithReference sets the reference content.
type EvalOutput ¶
type EvalOutput struct {
Response string `json:"response"`
TokensUsed int `json:"tokens_used"`
Latency time.Duration `json:"latency"`
Cost float64 `json:"cost"`
Metadata map[string]any `json:"metadata,omitempty"`
}
EvalOutput is the output of an evaluation.
func (*EvalOutput) WithLatency ¶
func (e *EvalOutput) WithLatency(latency time.Duration) *EvalOutput
WithLatency sets the latency.
func (*EvalOutput) WithMetadata ¶
func (e *EvalOutput) WithMetadata(metadata map[string]any) *EvalOutput
WithMetadata sets the metadata.
func (*EvalOutput) WithTokensUsed ¶
func (e *EvalOutput) WithTokensUsed(tokens int) *EvalOutput
WithTokensUsed sets the token usage.
type EvalReport ¶
type EvalReport struct {
SuiteID string `json:"suite_id"`
SuiteName string `json:"suite_name"`
AgentID string `json:"agent_id"`
Results []EvalResult `json:"results"`
Summary EvalSummary `json:"summary"`
StartTime time.Time `json:"start_time"`
EndTime time.Time `json:"end_time"`
Duration time.Duration `json:"duration"`
Metadata map[string]string `json:"metadata,omitempty"`
}
EvalReport represents the complete evaluation report.
type EvalResult ¶
type EvalResult struct {
TaskID string `json:"task_id"`
Success bool `json:"success"`
Output string `json:"output"`
Expected string `json:"expected,omitempty"`
Score float64 `json:"score"` // 0.0 - 1.0
Metrics map[string]float64 `json:"metrics,omitempty"`
Error string `json:"error,omitempty"`
Duration time.Duration `json:"duration"`
TokensUsed int `json:"tokens_used,omitempty"`
Cost float64 `json:"cost,omitempty"`
}
EvalResult represents the result of evaluating a single task.
type EvalSuite ¶
type EvalSuite struct {
ID string `json:"id"`
Name string `json:"name"`
Description string `json:"description"`
Tasks []EvalTask `json:"tasks"`
Version string `json:"version"`
}
EvalSuite represents a collection of evaluation tasks.
type EvalSummary ¶
type EvalSummary struct {
TotalTasks int `json:"total_tasks"`
PassedTasks int `json:"passed_tasks"`
FailedTasks int `json:"failed_tasks"`
PassRate float64 `json:"pass_rate"`
AverageScore float64 `json:"average_score"`
TotalTokens int `json:"total_tokens"`
TotalCost float64 `json:"total_cost"`
TotalDuration time.Duration `json:"total_duration"`
MetricAverages map[string]float64 `json:"metric_averages,omitempty"`
// Statistical metrics
ScoreStdDev float64 `json:"score_std_dev"`
ScoreMin float64 `json:"score_min"`
ScoreMax float64 `json:"score_max"`
ScoreMedian float64 `json:"score_median"`
Percentiles map[string]float64 `json:"percentiles,omitempty"` // p50, p90, p95, p99
}
EvalSummary contains aggregated evaluation metrics. Validates: Requirements 9.5
type EvalTask ¶
type EvalTask struct {
ID string `json:"id"`
Name string `json:"name"`
Description string `json:"description"`
Input string `json:"input"`
Expected string `json:"expected,omitempty"`
Metadata map[string]string `json:"metadata,omitempty"`
Tags []string `json:"tags,omitempty"`
Timeout time.Duration `json:"timeout,omitempty"`
}
EvalTask represents an evaluation task.
type Evaluator ¶
type Evaluator struct {
// contains filtered or unexported fields
}
Evaluator runs evaluation suites against agents. Validates: Requirements 9.2, 9.4, 9.5, 9.6
func NewEvaluator ¶
func NewEvaluator(config EvaluatorConfig, logger *zap.Logger) *Evaluator
NewEvaluator creates a new evaluator.
func (*Evaluator) AddAlertHandler ¶
func (e *Evaluator) AddAlertHandler(handler AlertHandler)
AddAlertHandler adds a handler for alerts. Validates: Requirements 9.6
func (*Evaluator) ClearAlerts ¶
func (e *Evaluator) ClearAlerts()
ClearAlerts clears all triggered alerts.
func (*Evaluator) Evaluate ¶
func (e *Evaluator) Evaluate(ctx context.Context, suite *EvalSuite, agent AgentExecutor) (*EvalReport, error)
Evaluate runs an evaluation suite against an agent. Validates: Requirements 9.2, 9.5
func (*Evaluator) EvaluateBatch ¶
func (e *Evaluator) EvaluateBatch(ctx context.Context, suites []*EvalSuite, agent AgentExecutor) ([]*EvalReport, error)
EvaluateBatch runs batch evaluation on multiple suites. Validates: Requirements 9.4
func (*Evaluator) GenerateReport ¶
func (e *Evaluator) GenerateReport(reports []*EvalReport) *BatchEvalReport
GenerateReport generates a comprehensive evaluation report. Validates: Requirements 9.5
func (*Evaluator) RegisterScorer ¶
func (e *Evaluator) RegisterScorer(taskType string, scorer Scorer)
RegisterScorer registers a scorer for a specific task type.
func (*Evaluator) SetMetricRegistry ¶
func (e *Evaluator) SetMetricRegistry(registry *MetricRegistry)
SetMetricRegistry sets a custom metric registry.
type EvaluatorConfig ¶
type EvaluatorConfig struct {
Concurrency int `json:"concurrency"`
DefaultTimeout time.Duration `json:"default_timeout"`
StopOnFailure bool `json:"stop_on_failure"`
RetryOnError bool `json:"retry_on_error"`
MaxRetries int `json:"max_retries"`
PassThreshold float64 `json:"pass_threshold"` // Score threshold to pass
AlertThresholds []AlertThreshold `json:"alert_thresholds,omitempty"`
// Batch evaluation settings
BatchSize int `json:"batch_size"` // Number of tasks per batch
CollectMetrics bool `json:"collect_metrics"` // Auto-collect metrics after execution
EnableAlerts bool `json:"enable_alerts"` // Enable alert triggering
}
EvaluatorConfig configures the evaluator. Validates: Requirements 9.4, 9.6
func DefaultEvaluatorConfig ¶
func DefaultEvaluatorConfig() EvaluatorConfig
DefaultEvaluatorConfig returns sensible defaults.
type ExactMatchScorer ¶
type ExactMatchScorer struct{}
ExactMatchScorer scores based on exact string match.
type Experiment ¶
type Experiment struct {
ID string `json:"id"`
Name string `json:"name"`
Description string `json:"description"`
Variants []Variant `json:"variants"`
Metrics []string `json:"metrics"`
StartTime time.Time `json:"start_time"`
EndTime *time.Time `json:"end_time,omitempty"`
Status ExperimentStatus `json:"status"`
}
Experiment defines an experiment. Validates: Requirements 11.1
type ExperimentResult ¶
type ExperimentResult struct {
ExperimentID string `json:"experiment_id"`
VariantResults map[string]*VariantResult `json:"variant_results"`
Winner string `json:"winner,omitempty"`
Confidence float64 `json:"confidence"`
SampleSize int `json:"sample_size"`
Duration time.Duration `json:"duration"`
}
ExperimentResult holds the results of an experiment. Validates: Requirements 11.3, 11.4
type ExperimentStatus ¶
type ExperimentStatus string
ExperimentStatus is the status of an experiment.
const (
	ExperimentStatusDraft    ExperimentStatus = "draft"
	ExperimentStatusRunning  ExperimentStatus = "running"
	ExperimentStatusPaused   ExperimentStatus = "paused"
	ExperimentStatusComplete ExperimentStatus = "completed"
)
type ExperimentStore ¶
type ExperimentStore interface {
// SaveExperiment saves an experiment.
SaveExperiment(ctx context.Context, exp *Experiment) error
// LoadExperiment loads an experiment.
LoadExperiment(ctx context.Context, id string) (*Experiment, error)
// ListExperiments lists all experiments.
ListExperiments(ctx context.Context) ([]*Experiment, error)
// DeleteExperiment deletes an experiment.
DeleteExperiment(ctx context.Context, id string) error
// RecordAssignment records a variant assignment.
RecordAssignment(ctx context.Context, experimentID, userID, variantID string) error
// GetAssignment returns the variant assigned to a user.
GetAssignment(ctx context.Context, experimentID, userID string) (string, error)
// RecordResult records a result.
RecordResult(ctx context.Context, experimentID, variantID string, result *EvalResult) error
// GetResults returns the experiment results.
GetResults(ctx context.Context, experimentID string) (map[string][]*EvalResult, error)
}
ExperimentStore is the experiment storage interface.
type InputOutputPair ¶
type InputOutputPair struct {
Input *EvalInput
Output *EvalOutput
}
InputOutputPair pairs an input with an output for batch judging.
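A sketch of batch judging, given an already constructed judge j *LLMJudge (how the judge is constructed is not fully shown on this page):

pairs := []InputOutputPair{
	{Input: &EvalInput{Prompt: "Summarize A"}, Output: &EvalOutput{Response: "A is ..."}},
	{Input: &EvalInput{Prompt: "Summarize B"}, Output: &EvalOutput{Response: "B is ..."}},
}
results, err := j.JudgeBatch(context.Background(), pairs)
if err != nil {
	// handle error
}
agg := j.AggregateResults(results)
fmt.Println(agg.AverageScore, agg.NeedsReview)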
type JudgeDimension ¶
type JudgeDimension struct {
Name string `json:"name"`
Description string `json:"description"`
Weight float64 `json:"weight"`
}
JudgeDimension defines a judging dimension. Validates: Requirements 10.3
type JudgeResult ¶
type JudgeResult struct {
OverallScore float64 `json:"overall_score"`
Dimensions map[string]DimensionScore `json:"dimensions"`
Reasoning string `json:"reasoning"`
Confidence float64 `json:"confidence"`
// Additional metadata
Model string `json:"model,omitempty"`
Timestamp time.Time `json:"timestamp"`
}
JudgeResult is the result of a judging pass. Validates: Requirements 10.4
type LLMJudge ¶
type LLMJudge struct {
// contains filtered or unexported fields
}
LLMJudge uses an LLM as the evaluator to assess the quality of agent output. Validates: Requirements 10.1, 10.2, 10.3, 10.4, 10.5
func NewLLMJudge ¶
NewLLMJudge creates an LLM judge. Validates: Requirements 10.1
func (*LLMJudge) AggregateResults ¶
func (j *LLMJudge) AggregateResults(results []*JudgeResult) *AggregatedJudgeResult
AggregateResults aggregates multiple judge results. Validates: Requirements 10.5
func (*LLMJudge) Judge ¶
func (j *LLMJudge) Judge(ctx context.Context, input *EvalInput, output *EvalOutput) (*JudgeResult, error)
Judge performs a judging pass. Validates: Requirements 10.2, 10.4
func (*LLMJudge) JudgeBatch ¶
func (j *LLMJudge) JudgeBatch(ctx context.Context, pairs []InputOutputPair) ([]*JudgeResult, error)
JudgeBatch judges a batch of input/output pairs. Validates: Requirements 10.4, 10.5
type LLMJudgeConfig ¶
type LLMJudgeConfig struct {
Model string `json:"model"`
Dimensions []JudgeDimension `json:"dimensions"`
PromptTemplate string `json:"prompt_template"`
ScoreRange [2]float64 `json:"score_range"` // [min, max]
RequireReasoning bool `json:"require_reasoning"`
// Timeout for each judge call
Timeout time.Duration `json:"timeout,omitempty"`
// MaxConcurrency for batch judging
MaxConcurrency int `json:"max_concurrency,omitempty"`
}
LLMJudgeConfig configures the LLM judge. Validates: Requirements 10.1, 10.3
func DefaultLLMJudgeConfig ¶
func DefaultLLMJudgeConfig() LLMJudgeConfig
DefaultLLMJudgeConfig returns the default configuration.
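A sketch of customizing the default configuration; the dimension names and weights are illustrative:

cfg := DefaultLLMJudgeConfig()
cfg.Dimensions = []JudgeDimension{
	{Name: "accuracy", Description: "Factual correctness", Weight: 0.6},
	{Name: "clarity", Description: "Readability of the answer", Weight: 0.4},
}
cfg.ScoreRange = [2]float64{0, 10}
cfg.RequireReasoning = true
cfg.MaxConcurrency = 4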
type LatencyMetric ¶
type LatencyMetric struct {
// ThresholdMs is the latency threshold in milliseconds, used for normalization.
// If set, the returned value is max(0, 1 - latency/threshold).
// If unset (0), the raw latency in milliseconds is returned.
ThresholdMs float64
}
LatencyMetric reports response latency in milliseconds. Validates: Requirements 9.3
func NewLatencyMetricWithThreshold ¶
func NewLatencyMetricWithThreshold(thresholdMs float64) *LatencyMetric
NewLatencyMetricWithThreshold creates a latency metric with a threshold.
func (*LatencyMetric) Compute ¶
func (m *LatencyMetric) Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)
Compute calculates the latency score. If a threshold is set, it returns a normalized score (0.0 - 1.0); otherwise it returns the raw latency in milliseconds.
type MemoryExperimentStore ¶
type MemoryExperimentStore struct {
// contains filtered or unexported fields
}
MemoryExperimentStore is an in-memory experiment store (for testing and simple scenarios).
func NewMemoryExperimentStore ¶
func NewMemoryExperimentStore() *MemoryExperimentStore
NewMemoryExperimentStore creates an in-memory experiment store.
func (*MemoryExperimentStore) DeleteExperiment ¶
func (s *MemoryExperimentStore) DeleteExperiment(ctx context.Context, id string) error
DeleteExperiment deletes an experiment.
func (*MemoryExperimentStore) GetAssignment ¶
func (s *MemoryExperimentStore) GetAssignment(ctx context.Context, experimentID, userID string) (string, error)
GetAssignment returns the variant assigned to a user.
func (*MemoryExperimentStore) GetAssignmentCount ¶
func (s *MemoryExperimentStore) GetAssignmentCount(experimentID string) map[string]int
GetAssignmentCount returns per-variant assignment counts (for testing).
func (*MemoryExperimentStore) GetResultCount ¶
func (s *MemoryExperimentStore) GetResultCount(experimentID string) map[string]int
GetResultCount returns per-variant result counts (for testing).
func (*MemoryExperimentStore) GetResults ¶
func (s *MemoryExperimentStore) GetResults(ctx context.Context, experimentID string) (map[string][]*EvalResult, error)
GetResults returns the experiment results.
func (*MemoryExperimentStore) ListExperiments ¶
func (s *MemoryExperimentStore) ListExperiments(ctx context.Context) ([]*Experiment, error)
ListExperiments lists all experiments.
func (*MemoryExperimentStore) LoadExperiment ¶
func (s *MemoryExperimentStore) LoadExperiment(ctx context.Context, id string) (*Experiment, error)
LoadExperiment loads an experiment.
func (*MemoryExperimentStore) RecordAssignment ¶
func (s *MemoryExperimentStore) RecordAssignment(ctx context.Context, experimentID, userID, variantID string) error
RecordAssignment records a variant assignment.
func (*MemoryExperimentStore) RecordResult ¶
func (s *MemoryExperimentStore) RecordResult(ctx context.Context, experimentID, variantID string, result *EvalResult) error
RecordResult records a result.
func (*MemoryExperimentStore) SaveExperiment ¶
func (s *MemoryExperimentStore) SaveExperiment(ctx context.Context, exp *Experiment) error
SaveExperiment saves an experiment.
type Metric ¶
type Metric interface {
// Name returns the metric name.
Name() string
// Compute computes the metric value.
Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)
}
Metric is the evaluation metric interface. Validates: Requirements 9.1
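A sketch of a custom metric that scores response length against a budget, mirroring the built-in metrics' normalization convention. The LengthMetric type is illustrative, not part of the package:

type LengthMetric struct{ MaxChars int }

func (m *LengthMetric) Name() string { return "length" }

func (m *LengthMetric) Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error) {
	// With no budget set, return the raw length, like the built-in metrics do.
	if m.MaxChars <= 0 {
		return float64(len(output.Response)), nil
	}
	score := 1 - float64(len(output.Response))/float64(m.MaxChars)
	if score < 0 {
		score = 0
	}
	return score, nil
}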
type MetricEvalResult ¶
type MetricEvalResult struct {
InputID string `json:"input_id"`
Metrics map[string]float64 `json:"metrics"`
Passed bool `json:"passed"`
Errors []string `json:"errors,omitempty"`
Timestamp time.Time `json:"timestamp"`
}
MetricEvalResult is an evaluation result (per the design document spec). Note: distinct from the existing EvalResult; this type is dedicated to the Metric interface.
func NewMetricEvalResult ¶
func NewMetricEvalResult(inputID string) *MetricEvalResult
NewMetricEvalResult creates an evaluation result.
func (*MetricEvalResult) AddError ¶
func (r *MetricEvalResult) AddError(err string) *MetricEvalResult
AddError appends an error.
func (*MetricEvalResult) AddMetric ¶
func (r *MetricEvalResult) AddMetric(name string, value float64) *MetricEvalResult
AddMetric adds a metric value.
func (*MetricEvalResult) SetPassed ¶
func (r *MetricEvalResult) SetPassed(passed bool) *MetricEvalResult
SetPassed sets whether the evaluation passed.
type MetricRegistry ¶
type MetricRegistry struct {
// contains filtered or unexported fields
}
MetricRegistry is a registry of metrics.
func NewRegistryWithBuiltinMetrics ¶
func NewRegistryWithBuiltinMetrics() *MetricRegistry
NewRegistryWithBuiltinMetrics creates a registry containing all built-in metrics.
func (*MetricRegistry) ComputeAll ¶
func (r *MetricRegistry) ComputeAll(ctx context.Context, input *EvalInput, output *EvalOutput) (*MetricEvalResult, error)
ComputeAll computes all registered metrics.
type Scorer ¶
type Scorer interface {
Score(ctx context.Context, task *EvalTask, output string) (float64, map[string]float64, error)
}
Scorer defines the interface for scoring evaluation results.
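A sketch of a custom keyword-coverage scorer (requires the strings package; the KeywordScorer type is illustrative, not part of the package):

type KeywordScorer struct{ Keywords []string }

func (s *KeywordScorer) Score(ctx context.Context, task *EvalTask, output string) (float64, map[string]float64, error) {
	if len(s.Keywords) == 0 {
		return 1, nil, nil
	}
	hits := 0
	for _, kw := range s.Keywords {
		if strings.Contains(output, kw) {
			hits++
		}
	}
	score := float64(hits) / float64(len(s.Keywords))
	return score, map[string]float64{"keyword_coverage": score}, nil
}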
type StatisticalReport ¶
type StatisticalReport struct {
ExperimentID string `json:"experiment_id"`
ExperimentName string `json:"experiment_name"`
Status ExperimentStatus `json:"status"`
Duration time.Duration `json:"duration"`
TotalSamples int `json:"total_samples"`
VariantReports map[string]*VariantReport `json:"variant_reports"`
Comparisons []*VariantComparison `json:"comparisons"`
Winner string `json:"winner,omitempty"`
WinnerConfidence float64 `json:"winner_confidence,omitempty"`
Recommendation string `json:"recommendation"`
GeneratedAt time.Time `json:"generated_at"`
}
StatisticalReport represents a detailed statistical analysis report. Validates: Requirements 11.4
type TokenUsageMetric ¶
type TokenUsageMetric struct {
// MaxTokens is the maximum token count, used for normalization.
// If set, the returned value is max(0, 1 - tokens/maxTokens).
// If unset (0), the raw token count is returned.
MaxTokens int
}
TokenUsageMetric reports token usage. Validates: Requirements 9.3
func NewTokenUsageMetric ¶
func NewTokenUsageMetric() *TokenUsageMetric
NewTokenUsageMetric creates a token usage metric.
func NewTokenUsageMetricWithMax ¶
func NewTokenUsageMetricWithMax(maxTokens int) *TokenUsageMetric
NewTokenUsageMetricWithMax creates a token usage metric with a maximum value.
func (*TokenUsageMetric) Compute ¶
func (m *TokenUsageMetric) Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)
Compute calculates the token usage score. If a maximum is set, it returns a normalized score (0.0 - 1.0); otherwise it returns the raw token count.
type Variant ¶
type Variant struct {
ID string `json:"id"`
Name string `json:"name"`
Config map[string]any `json:"config"`
Weight float64 `json:"weight"` // traffic weight
IsControl bool `json:"is_control"`
}
Variant is an experiment variant. Validates: Requirements 11.1, 11.5
type VariantComparison ¶
type VariantComparison struct {
ControlID string `json:"control_id"`
TreatmentID string `json:"treatment_id"`
MetricDeltas map[string]float64 `json:"metric_deltas"` // treatment - control
RelativeChange map[string]float64 `json:"relative_change"` // percentage change
PValues map[string]float64 `json:"p_values"`
Confidence map[string]float64 `json:"confidence"`
Significant map[string]bool `json:"significant"` // at 95% level
}
VariantComparison contains comparison results between two variants.
type VariantReport ¶
type VariantReport struct {
VariantID string `json:"variant_id"`
VariantName string `json:"variant_name"`
IsControl bool `json:"is_control"`
SampleCount int `json:"sample_count"`
Metrics map[string]float64 `json:"metrics"`
StdDev map[string]float64 `json:"std_dev"`
ConfInterval map[string][2]float64 `json:"confidence_interval"` // 95% CI
}
VariantReport contains detailed statistics for a single variant.
type VariantResult ¶
type VariantResult struct {
VariantID string `json:"variant_id"`
SampleCount int `json:"sample_count"`
Metrics map[string]float64 `json:"metrics"`
StdDev map[string]float64 `json:"std_dev"`
// contains filtered or unexported fields
}
VariantResult holds per-variant results. Validates: Requirements 11.3