Documentation
¶
Index ¶
- type AgentRunFunc
- type BatchOption
- type CompositeConfig
- type CompositeEvaluator
- type ContainsConfig
- type ContainsEval
- type CostConfig
- type CostEval
- type EvalCase
- type EvalDetail
- type EvalFunc
- type EvalReport
- type EvalResult
- type Evaluator
- type ExactMatchEval
- type LLMJudgeEval
- type LatencyEval
- type ToolCallConfig
- type ToolCallEval
- type WeightedEvaluator
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type AgentRunFunc ¶
type AgentRunFunc func(ctx context.Context, req *schema.RunRequest) (*schema.RunResponse, error)
AgentRunFunc is the function signature for running an agent. It matches agent.Agent.Run, allowing eval to work without importing the agent package.
type BatchOption ¶
type BatchOption func(*batchConfig)
BatchOption configures batch evaluation behavior.
func WithConcurrency ¶
func WithConcurrency(n int) BatchOption
WithConcurrency sets the number of concurrent evaluations. If n <= 1, evaluations are run sequentially.
type CompositeConfig ¶
type CompositeConfig struct {
// FailFast controls whether the evaluator stops on the first sub-evaluator error.
// If false (default), all sub-evaluators are run and errors are collected.
FailFast bool
}
CompositeConfig configures the CompositeEvaluator.
type CompositeEvaluator ¶
type CompositeEvaluator struct {
// contains filtered or unexported fields
}
CompositeEvaluator runs multiple evaluators on a single case and aggregates results.
func NewCompositeEvaluator ¶
func NewCompositeEvaluator(cfg *CompositeConfig, evaluators ...WeightedEvaluator) (*CompositeEvaluator, error)
NewCompositeEvaluator creates a CompositeEvaluator from the given weighted evaluators. If cfg is nil, default configuration (non-fail-fast) is used.
func (*CompositeEvaluator) Evaluate ¶
func (e *CompositeEvaluator) Evaluate(ctx context.Context, c *EvalCase) (*EvalResult, error)
Evaluate implements Evaluator. It runs all sub-evaluators sequentially. If FailFast is true, any sub-evaluator error is propagated immediately. If FailFast is false, errors are collected and reported in the result's Details.
type ContainsConfig ¶
type ContainsConfig struct {
// Keywords is the list of keywords to check for in the output.
Keywords []string
// PassThreshold is the minimum score required to pass.
// Default is 1.0 (all keywords must be found).
PassThreshold float64
}
ContainsConfig configures the ContainsEval evaluator.
type ContainsEval ¶
type ContainsEval struct {
// contains filtered or unexported fields
}
ContainsEval checks whether the actual output contains the specified keywords. Keywords and the pass threshold are provided via ContainsConfig; by default all keywords must be found. Matching is case-insensitive.
func NewContainsEval ¶
func NewContainsEval(cfg *ContainsConfig) (*ContainsEval, error)
NewContainsEval creates a new ContainsEval with the given configuration. Returns an error if cfg is nil.
func (*ContainsEval) Evaluate ¶
func (e *ContainsEval) Evaluate(_ context.Context, c *EvalCase) (*EvalResult, error)
Evaluate implements Evaluator.
type CostConfig ¶
type CostConfig struct {
// Budget is the maximum number of tokens allowed.
Budget int
// FailOnMissingUsage controls behavior when Usage data is nil.
// If true, evaluation fails when usage data is missing.
// If false (default), missing usage is treated as passing.
FailOnMissingUsage bool
}
CostConfig configures the CostEval evaluator.
type CostEval ¶
type CostEval struct {
// contains filtered or unexported fields
}
CostEval checks whether token usage is within a specified budget.
func NewCostEval ¶
func NewCostEval(cfg *CostConfig) (*CostEval, error)
NewCostEval creates a new CostEval with the given configuration. Returns an error if cfg is nil or cfg.Budget is zero or negative.
type EvalCase ¶
type EvalCase struct {
// ID is the unique case identifier.
ID string
// Input is the request sent to the agent.
Input *schema.RunRequest
// Expected is the expected output (optional, used for comparison).
Expected *schema.RunResponse
// Actual is the actual agent output (populated by caller before evaluation).
Actual *schema.RunResponse
// Criteria contains evaluation criteria or dimension names (used by LLMJudgeEval).
Criteria []string
// Tags are used for grouping and filtering.
Tags []string
}
EvalCase represents a single evaluation test case.
type EvalDetail ¶
type EvalDetail struct {
// Name is the dimension or criterion name.
Name string
// Score is the score for this dimension in [0, 1].
Score float64
// Passed indicates whether this dimension passed.
Passed bool
// Message is a human-readable explanation or reason.
Message string
}
EvalDetail represents per-dimension evaluation detail.
type EvalFunc ¶
type EvalFunc func(ctx context.Context, c *EvalCase) (*EvalResult, error)
EvalFunc is a function adapter for Evaluator, similar to http.HandlerFunc.
type EvalReport ¶
type EvalReport struct {
// Results contains individual results for each case.
Results []*EvalResult
// TotalCases is the total number of cases evaluated.
TotalCases int
// PassedCases is the number of cases that passed.
PassedCases int
// FailedCases is the number of cases that failed.
FailedCases int
// ErrorCases is the number of cases that encountered errors.
ErrorCases int
// AvgScore is the average score across all non-error cases.
AvgScore float64
// TotalDuration is the total wall-clock evaluation duration in milliseconds.
TotalDuration int64
}
EvalReport summarizes the results of a batch evaluation.
func BatchEval ¶
func BatchEval(ctx context.Context, evaluator Evaluator, cases []*EvalCase, opts ...BatchOption) (*EvalReport, error)
BatchEval evaluates multiple cases with a single evaluator. All cases are evaluated; a failure in one case does not stop the batch. Context cancellation stops further evaluation and returns a partial report with an error. Use WithConcurrency to enable parallel evaluation.
func RunAndEvaluate ¶
func RunAndEvaluate(ctx context.Context, runFn AgentRunFunc, evaluator Evaluator, cases []*EvalCase, opts ...BatchOption) (*EvalReport, error)
RunAndEvaluate runs the agent for each case, fills in Actual, then evaluates. It combines agent execution and evaluation into a single workflow. Use BatchOption to control concurrency.
type EvalResult ¶
type EvalResult struct {
// CaseID is the corresponding EvalCase.ID.
CaseID string
// Score is the overall score in range [0, 1].
Score float64
// Passed indicates whether the evaluation passed.
Passed bool
// Details contains per-dimension scoring details.
Details []EvalDetail
// Duration is the evaluation duration in milliseconds.
Duration int64
// Usage is the token usage (from actual response or judge call).
Usage *aimodel.Usage
// Error is the error message if evaluation encountered an error.
Error string
}
EvalResult represents the outcome of evaluating a single case.
type Evaluator ¶
type Evaluator interface {
// Evaluate scores a single evaluation case.
// Returns an error for infrastructure failures, not for evaluation failures
// (those are captured in EvalResult.Passed).
Evaluate(ctx context.Context, c *EvalCase) (*EvalResult, error)
}
Evaluator defines the contract for evaluating agent outputs.
type ExactMatchEval ¶
type ExactMatchEval struct{}
ExactMatchEval compares actual output text against expected output text for exact string equality.
func NewExactMatchEval ¶
func NewExactMatchEval() (*ExactMatchEval, error)
NewExactMatchEval creates a new ExactMatchEval.
func (*ExactMatchEval) Evaluate ¶
func (e *ExactMatchEval) Evaluate(_ context.Context, c *EvalCase) (*EvalResult, error)
Evaluate implements Evaluator. It returns an error if Expected or Actual is nil.
type LLMJudgeEval ¶
type LLMJudgeEval struct {
// contains filtered or unexported fields
}
LLMJudgeEval uses an LLM as a judge to evaluate agent output quality.
func NewLLMJudgeEval ¶
func NewLLMJudgeEval(completer aimodel.ChatCompleter, model string) (*LLMJudgeEval, error)
NewLLMJudgeEval creates a new LLMJudgeEval with the given ChatCompleter and model. Returns an error if completer is nil or model is empty.
func (*LLMJudgeEval) Evaluate ¶
func (e *LLMJudgeEval) Evaluate(ctx context.Context, c *EvalCase) (*EvalResult, error)
Evaluate implements Evaluator.
type LatencyEval ¶
type LatencyEval struct {
// contains filtered or unexported fields
}
LatencyEval checks whether the agent response time is within an acceptable threshold.
func NewLatencyEval ¶
func NewLatencyEval(thresholdMs int64) (*LatencyEval, error)
NewLatencyEval creates a new LatencyEval with the given threshold in milliseconds. Returns an error if thresholdMs is zero or negative.
func (*LatencyEval) Evaluate ¶
func (e *LatencyEval) Evaluate(_ context.Context, c *EvalCase) (*EvalResult, error)
Evaluate implements Evaluator. It reads the agent's response duration from c.Actual.Duration and scores accordingly.
type ToolCallConfig ¶
type ToolCallConfig struct {
// StrictArgs controls whether tool call arguments are compared.
// If false, only tool names are compared.
StrictArgs bool
}
ToolCallConfig configures the ToolCallEval evaluator.
type ToolCallEval ¶
type ToolCallEval struct {
// contains filtered or unexported fields
}
ToolCallEval verifies that the agent made the expected tool calls in order.
func NewToolCallEval ¶
func NewToolCallEval(cfg *ToolCallConfig) (*ToolCallEval, error)
NewToolCallEval creates a new ToolCallEval. If cfg is nil, default configuration (name-only matching) is used.
func (*ToolCallEval) Evaluate ¶
func (e *ToolCallEval) Evaluate(_ context.Context, c *EvalCase) (*EvalResult, error)
Evaluate implements Evaluator.
type WeightedEvaluator ¶
type WeightedEvaluator struct {
// Evaluator is the evaluator to run.
Evaluator Evaluator
// Weight is the weight for this evaluator's score in the final average.
// If all weights are zero, equal weighting is used.
Weight float64
}
WeightedEvaluator pairs an evaluator with a weight for composite scoring.