eval

package
v0.0.0-...-3613e4e Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: May 14, 2026 License: Apache-2.0 Imports: 19 Imported by: 0

Documentation

Overview

Package eval provides deterministic task harness primitives, harness connectors, trajectory scoring helpers, and repeated-run reliability metrics for agent evaluation.

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func GDS

func GDS(results []EvalResult, key string) (float64, error)

GDS computes weighted partial credit across repeated runs for the named score.

func MOP

func MOP(results []EvalResult, key string) (float64, error)

MOP computes the normalized collapse onset point across repeated runs for the named score.

func MarkdownReport

func MarkdownReport(results []EvalResult) string

MarkdownReport generates a Markdown summary of the given evaluation results.

func VAF

func VAF(results []EvalResult, key string) (float64, error)

VAF computes normalized score variance across repeated runs for the named score.

Types

type AgentFactory

type AgentFactory func(task Task, runIndex int) agent.Agent

AgentFactory constructs a fresh agent for a task/run pair.

Runner may call the factory concurrently, so implementations should return independent agents or otherwise guarantee concurrency safety.

type ApprovalCase

type ApprovalCase struct {
	Name        string
	Run         func(context.Context, *approval.Gate, *session.Session) error
	ExpectError string
}

type ApprovalCaseResult

type ApprovalCaseResult struct {
	Name   string
	Passed bool
	Error  string
}

func EvaluateApprovalCases

func EvaluateApprovalCases(
	ctx context.Context,
	manager *approval.Gate,
	sess *session.Session,
	cases []ApprovalCase,
) []ApprovalCaseResult

type CostEfficiency

type CostEfficiency struct{}

CostEfficiency rewards low-cost turns.

func (*CostEfficiency) Name

func (s *CostEfficiency) Name() string

Name returns the scorer identifier.

func (*CostEfficiency) ScoreTurn

func (s *CostEfficiency) ScoreTurn(
	_ context.Context,
	turn session.RunTurn,
) (float64, error)

ScoreTurn rewards lower-cost turns.

type Environment

type Environment interface {
	ID() string
	Bootstrap(ctx context.Context, sess *session.Session) error
}

Environment prepares a task-specific session before the agent runs.

type EvalResult

type EvalResult struct {
	RunID     string             `json:"run_id"`
	AgentID   string             `json:"agent_id"`
	ScoredAt  time.Time          `json:"scored_at"`
	TurnCount int                `json:"turn_count"`
	TotalCost float64            `json:"total_cost"`
	Scores    map[string]float64 `json:"scores"` // evaluator name → mean score across turns
	Metadata  map[string]any     `json:"metadata,omitzero"`
}

EvalResult is the scored output for a single run transcript.

func Run

func Run(
	ctx context.Context,
	sessions []*session.Session,
	opts Options,
) ([]EvalResult, error)

Run exports a RunLog from each session, scores every turn with each TurnEvaluator, and scores each log with each RunEvaluator.

type HarborConnector

type HarborConnector struct{}

HarborConnector adapts Harbor-style harness specs into Canto tasks.

func (HarborConnector) Connect

func (HarborConnector) Connect(spec HarnessTaskSpec) (TaskSpec, error)

Connect builds a task for Harbor-oriented eval harnesses.

type HarnessEnvironment

type HarnessEnvironment struct {
	EnvironmentID  string
	HarnessName    string
	TaskID         string
	WorkspacePath  string
	Repository     string
	BaseCommit     string
	ContainerImage string
	SetupCommands  []string
	TestCommands   []string
	Notes          []string
	Metadata       map[string]any
}

HarnessEnvironment seeds the session with harness-specific context.

func (HarnessEnvironment) Bootstrap

func (e HarnessEnvironment) Bootstrap(ctx context.Context, sess *session.Session) error

Bootstrap appends a deterministic harness summary to the session.

func (HarnessEnvironment) ID

func (e HarnessEnvironment) ID() string

ID returns the stable environment identifier.

type HarnessTaskSpec

type HarnessTaskSpec struct {
	TaskID          string
	InstructionText string
	EnvironmentID   string
	WorkspacePath   string
	Repository      string
	BaseCommit      string
	ContainerImage  string
	SetupCommands   []string
	TestCommands    []string
	Notes           []string
	Metadata        map[string]any
}

HarnessTaskSpec captures the common inputs used by external eval harnesses.

type Judge

type Judge struct {
	NameText string
	Criteria string
	Model    string
	Provider llm.Provider
}

Judge uses an LLM to score a transcript turn based on a rubric or criteria.

func (*Judge) Name

func (j *Judge) Name() string

Name returns the identifier for this judge.

func (*Judge) ScoreTurn

func (j *Judge) ScoreTurn(ctx context.Context, turn session.RunTurn) (float64, error)

ScoreTurn executes an LLM call to evaluate the turn.

type MemoryAssertion

type MemoryAssertion func([]memory.Memory) error

func ExcludeIDs

func ExcludeIDs(ids ...string) MemoryAssertion

func ExcludeNamespaces

func ExcludeNamespaces(namespaces ...memory.Namespace) MemoryAssertion

func RequireNoForgotten

func RequireNoForgotten() MemoryAssertion

func RequireNoSuperseded

func RequireNoSuperseded() MemoryAssertion

func RequireRoles

func RequireRoles(roles ...memory.Role) MemoryAssertion

type MemoryCase

type MemoryCase struct {
	Name   string
	Query  memory.Query
	Expect MemoryExpectation
	Assert MemoryAssertion
}

type MemoryCaseResult

type MemoryCaseResult struct {
	Name           string
	Passed         bool
	Missing        []string
	Unexpected     []string
	AssertionError string
	Hits           []memory.Memory
}

func EvaluateMemoryCases

func EvaluateMemoryCases(
	ctx context.Context,
	retriever memory.Retriever,
	cases []MemoryCase,
) ([]MemoryCaseResult, error)

type MemoryExpectation

type MemoryExpectation struct {
	Contains []string
	Excludes []string
}

type Options

type Options struct {
	TurnEvals   []TurnEvaluator
	RunEvals    []RunEvaluator
	OutputPath  string // Path to write JSONL results
	Concurrency int    // Number of parallel workers
}

Options defines a collection of evaluators and configuration for an evaluation run.

type ParallelRunner

type ParallelRunner struct {
	Workers int
	AgentFn AgentFactory
}

ParallelRunner executes repeated task runs in parallel with a bounded worker pool.

func NewParallelRunner

func NewParallelRunner(workers int, agentFn AgentFactory) *ParallelRunner

NewParallelRunner constructs a ParallelRunner.

func (*ParallelRunner) Run

func (r *ParallelRunner) Run(ctx context.Context, task Task, runs int) ([]RunResult, error)

Run executes the task the number of times given by runs and returns the results in order.

type PlanAdherence

type PlanAdherence struct {
	NameText string
	Criteria string
	Model    string
	Provider llm.Provider
}

PlanAdherence scores whether the full trajectory stayed aligned with the intended plan and task objective.

func (*PlanAdherence) Name

func (p *PlanAdherence) Name() string

Name returns the scorer identifier.

func (*PlanAdherence) ScoreRun

func (p *PlanAdherence) ScoreRun(ctx context.Context, log *session.RunLog) (float64, error)

ScoreRun executes an LLM judge over the full trajectory.

type RunEvaluator

type RunEvaluator interface {
	Name() string
	ScoreRun(ctx context.Context, log *session.RunLog) (float64, error)
}

RunEvaluator evaluates an entire RunLog and returns a score in [0, 1].

type RunResult

type RunResult struct {
	RunID         string
	TaskID        string
	EnvironmentID string
	RunIndex      int
	AgentID       string
	Session       *session.Session
	StepResult    agent.StepResult
	Err           error
}

RunResult captures one repeated execution of a task.

type RunSample

type RunSample struct {
	RunID     string
	Score     float64
	TurnCount int
	TotalCost float64
}

RunSample captures the per-run signal for a named score series.

type RunSeries

type RunSeries struct {
	Key     string
	Samples []RunSample
}

RunSeries is a reusable sequence of scored runs for one score key.

func NewRunSeries

func NewRunSeries(results []EvalResult, key string) (RunSeries, error)

NewRunSeries extracts one named score series from the provided results.

func (RunSeries) GDS

func (s RunSeries) GDS() float64

GDS computes a weighted partial-credit summary over repeated runs.

The default weighting uses turn count as a cheap proxy for how much work each run represented. Longer runs should contribute more to the aggregate than one-turn stubs.

func (RunSeries) MOP

func (s RunSeries) MOP() float64

MOP estimates the first sustained collapse point in the series.

The result is normalized to [0, 1], where 1 means no collapse was observed and lower values mean the collapse arrived earlier in the run series.

func (RunSeries) VAF

func (s RunSeries) VAF() float64

VAF computes the variance of the score series normalized by the mean.

type Runner

type Runner interface {
	Run(ctx context.Context, task Task, runs int) ([]RunResult, error)
}

Runner executes a task one or more times.

type SWEBenchConnector

type SWEBenchConnector struct{}

SWEBenchConnector adapts SWE-bench-style harness specs into Canto tasks.

func (SWEBenchConnector) Connect

Connect builds a task for SWE-bench-style eval harnesses.

type StaticEnvironment

type StaticEnvironment struct {
	EnvironmentID string
	Context       []session.ContextEntry
	Transcript    []llm.Message
}

StaticEnvironment is a simple Environment that appends fixed context and optional transcript seed messages.

func (StaticEnvironment) Bootstrap

func (e StaticEnvironment) Bootstrap(ctx context.Context, sess *session.Session) error

Bootstrap appends the environment's context and transcript seeds to the session.

func (StaticEnvironment) ID

func (e StaticEnvironment) ID() string

ID returns the environment identifier.

type StepEfficiency

type StepEfficiency struct{}

StepEfficiency rewards turns that reach completion with fewer tool calls.

func (*StepEfficiency) Name

func (s *StepEfficiency) Name() string

Name returns the scorer identifier.

func (*StepEfficiency) ScoreTurn

func (s *StepEfficiency) ScoreTurn(_ context.Context, turn session.RunTurn) (float64, error)

ScoreTurn penalises turns that require more tool calls.

type Task

type Task interface {
	ID() string
	Instruction() string
	Environment() Environment
}

Task defines a repeatable evaluation scenario.

type TaskSpec

type TaskSpec struct {
	TaskID          string
	InstructionText string
	Env             Environment
}

TaskSpec is the default concrete Task implementation.

func (TaskSpec) Environment

func (t TaskSpec) Environment() Environment

Environment returns the task environment.

func (TaskSpec) ID

func (t TaskSpec) ID() string

ID returns the task identifier.

func (TaskSpec) Instruction

func (t TaskSpec) Instruction() string

Instruction returns the task instruction.

type ToolCorrectness

type ToolCorrectness struct {
	// Expected is the set of tool names that should appear in the turn.
	Expected []string
}

ToolCorrectness scores how accurately a turn used the expected tools. If Expected is empty, every turn scores 1.0.

func (*ToolCorrectness) Name

func (s *ToolCorrectness) Name() string

Name returns the scorer identifier.

func (*ToolCorrectness) ScoreTurn

func (s *ToolCorrectness) ScoreTurn(
	_ context.Context,
	turn session.RunTurn,
) (float64, error)

ScoreTurn returns an F1-style score over expected vs actual tool names.

type TurnEvaluator

type TurnEvaluator interface {
	Name() string
	ScoreTurn(ctx context.Context, turn session.RunTurn) (float64, error)
}

TurnEvaluator evaluates a single transcript turn and returns a score in [0, 1].

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL