Documentation ¶
Overview ¶
Package eval provides deterministic task harness primitives, harness connectors, trajectory scoring helpers, and repeated-run reliability metrics for agent evaluation.
Index ¶
- func GDS(results []EvalResult, key string) (float64, error)
- func MOP(results []EvalResult, key string) (float64, error)
- func MarkdownReport(results []EvalResult) string
- func VAF(results []EvalResult, key string) (float64, error)
- type AgentFactory
- type ApprovalCase
- type ApprovalCaseResult
- type CostEfficiency
- type Environment
- type EvalResult
- type HarborConnector
- type HarnessEnvironment
- type HarnessTaskSpec
- type Judge
- type MemoryAssertion
- type MemoryCase
- type MemoryCaseResult
- type MemoryExpectation
- type Options
- type ParallelRunner
- type PlanAdherence
- type RunEvaluator
- type RunResult
- type RunSample
- type RunSeries
- type Runner
- type SWEBenchConnector
- type StaticEnvironment
- type StepEfficiency
- type Task
- type TaskSpec
- type ToolCorrectness
- type TurnEvaluator
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func GDS ¶
func GDS(results []EvalResult, key string) (float64, error)
GDS computes weighted partial credit across repeated runs for the named score.
func MOP ¶
func MOP(results []EvalResult, key string) (float64, error)
MOP computes the normalized collapse onset point across repeated runs for the named score.
func MarkdownReport ¶
func MarkdownReport(results []EvalResult) string
MarkdownReport generates a Markdown summary of evaluation results.
Types ¶
type AgentFactory ¶
AgentFactory constructs a fresh agent for a task/run pair.
Runner may call the factory concurrently, so implementations should return independent agents or otherwise guarantee concurrency safety.
type ApprovalCase ¶
type ApprovalCaseResult ¶
func EvaluateApprovalCases ¶
func EvaluateApprovalCases( ctx context.Context, manager *approval.Gate, sess *session.Session, cases []ApprovalCase, ) []ApprovalCaseResult
type CostEfficiency ¶
type CostEfficiency struct{}
CostEfficiency rewards low-cost turns.
func (*CostEfficiency) Name ¶
func (s *CostEfficiency) Name() string
Name returns the scorer identifier.
type Environment ¶
type Environment interface {
ID() string
Bootstrap(ctx context.Context, sess *session.Session) error
}
Environment prepares a task-specific session before the agent runs.
type EvalResult ¶
type EvalResult struct {
RunID string `json:"run_id"`
AgentID string `json:"agent_id"`
ScoredAt time.Time `json:"scored_at"`
TurnCount int `json:"turn_count"`
TotalCost float64 `json:"total_cost"`
Scores map[string]float64 `json:"scores"` // evaluator name → mean score across turns
Metadata map[string]any `json:"metadata,omitzero"`
}
EvalResult is the scored output for a single run transcript.
type HarborConnector ¶
type HarborConnector struct{}
HarborConnector adapts Harbor-style harness specs into Canto tasks.
func (HarborConnector) Connect ¶
func (HarborConnector) Connect(spec HarnessTaskSpec) (TaskSpec, error)
Connect builds a task for Harbor-oriented eval harnesses.
type HarnessEnvironment ¶
type HarnessEnvironment struct {
EnvironmentID string
HarnessName string
TaskID string
WorkspacePath string
Repository string
BaseCommit string
ContainerImage string
SetupCommands []string
TestCommands []string
Notes []string
Metadata map[string]any
}
HarnessEnvironment seeds the session with harness-specific context.
func (HarnessEnvironment) Bootstrap ¶
Bootstrap appends a deterministic harness summary to the session.
func (HarnessEnvironment) ID ¶
func (e HarnessEnvironment) ID() string
ID returns the stable environment identifier.
type HarnessTaskSpec ¶
type HarnessTaskSpec struct {
TaskID string
InstructionText string
EnvironmentID string
WorkspacePath string
Repository string
BaseCommit string
ContainerImage string
SetupCommands []string
TestCommands []string
Notes []string
Metadata map[string]any
}
HarnessTaskSpec captures the common inputs used by external eval harnesses.
type MemoryAssertion ¶
func ExcludeIDs ¶
func ExcludeIDs(ids ...string) MemoryAssertion
func ExcludeNamespaces ¶
func ExcludeNamespaces(namespaces ...memory.Namespace) MemoryAssertion
func RequireNoForgotten ¶
func RequireNoForgotten() MemoryAssertion
func RequireNoSuperseded ¶
func RequireNoSuperseded() MemoryAssertion
func RequireRoles ¶
func RequireRoles(roles ...memory.Role) MemoryAssertion
type MemoryCase ¶
type MemoryCase struct {
Name string
Query memory.Query
Expect MemoryExpectation
Assert MemoryAssertion
}
type MemoryCaseResult ¶
type MemoryCaseResult struct {
Name string
Passed bool
Missing []string
Unexpected []string
AssertionError string
Hits []memory.Memory
}
func EvaluateMemoryCases ¶
func EvaluateMemoryCases( ctx context.Context, retriever memory.Retriever, cases []MemoryCase, ) ([]MemoryCaseResult, error)
type MemoryExpectation ¶
type Options ¶
type Options struct {
TurnEvals []TurnEvaluator
RunEvals []RunEvaluator
OutputPath string // Path to write JSONL results
Concurrency int // Number of parallel workers
}
Options defines a collection of evaluators and configuration for an evaluation run.
type ParallelRunner ¶
type ParallelRunner struct {
Workers int
AgentFn AgentFactory
}
ParallelRunner executes repeated task runs in parallel with a bounded worker pool.
func NewParallelRunner ¶
func NewParallelRunner(workers int, agentFn AgentFactory) *ParallelRunner
NewParallelRunner constructs a ParallelRunner.
type PlanAdherence ¶
PlanAdherence scores whether the full trajectory stayed aligned with the intended plan and task objective.
func (*PlanAdherence) Name ¶
func (p *PlanAdherence) Name() string
Name returns the scorer identifier.
type RunEvaluator ¶
type RunEvaluator interface {
Name() string
ScoreRun(ctx context.Context, log *session.RunLog) (float64, error)
}
RunEvaluator evaluates an entire RunLog and returns a score in [0, 1].
type RunResult ¶
type RunResult struct {
RunID string
TaskID string
EnvironmentID string
RunIndex int
AgentID string
Session *session.Session
StepResult agent.StepResult
Err error
}
RunResult captures a single execution among the repeated runs of a task.
type RunSeries ¶
RunSeries is a reusable sequence of scored runs for one score key.
func NewRunSeries ¶
func NewRunSeries(results []EvalResult, key string) (RunSeries, error)
NewRunSeries extracts one named score series from the provided results.
func (RunSeries) GDS ¶
GDS computes a weighted partial-credit summary over repeated runs.
The default weighting uses turn count as a cheap proxy for how much work each run represented. Longer runs should contribute more to the aggregate than one-turn stubs.
type SWEBenchConnector ¶
type SWEBenchConnector struct{}
SWEBenchConnector adapts SWE-bench-style harness specs into Canto tasks.
func (SWEBenchConnector) Connect ¶
func (SWEBenchConnector) Connect(spec HarnessTaskSpec) (TaskSpec, error)
Connect builds a task for SWE-bench-style eval harnesses.
type StaticEnvironment ¶
type StaticEnvironment struct {
EnvironmentID string
Context []session.ContextEntry
Transcript []llm.Message
}
StaticEnvironment is a simple Environment that appends fixed context and optional transcript seed messages.
func (StaticEnvironment) Bootstrap ¶
Bootstrap appends the environment's context and transcript seeds to the session.
func (StaticEnvironment) ID ¶
func (e StaticEnvironment) ID() string
ID returns the environment identifier.
type StepEfficiency ¶
type StepEfficiency struct{}
StepEfficiency rewards turns that reach completion with fewer tool calls.
func (*StepEfficiency) Name ¶
func (s *StepEfficiency) Name() string
Name returns the scorer identifier.
type Task ¶
type Task interface {
ID() string
Instruction() string
Environment() Environment
}
Task defines a repeatable evaluation scenario.
type TaskSpec ¶
type TaskSpec struct {
TaskID string
InstructionText string
Env Environment
}
TaskSpec is the default concrete Task implementation.
func (TaskSpec) Environment ¶
func (t TaskSpec) Environment() Environment
Environment returns the task environment.
func (TaskSpec) Instruction ¶
Instruction returns the task instruction.
type ToolCorrectness ¶
type ToolCorrectness struct {
// Expected is the set of tool names that should appear in the turn.
Expected []string
}
ToolCorrectness scores how accurately a turn used the expected tools. If Expected is empty, every turn scores 1.0.
func (*ToolCorrectness) Name ¶
func (s *ToolCorrectness) Name() string
Name returns the scorer identifier.