eval

package

Version: v0.0.0-...-3613e4e (latest) | Published: May 14, 2026 | License: Apache-2.0 | Imports: 19 | Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/nijaru/canto

Links

Open Source Insights

Documentation ¶

Overview ¶

Package eval provides deterministic task harness primitives, harness connectors, trajectory scoring helpers, and repeated-run reliability metrics for agent evaluation.

Index ¶

func GDS(results []EvalResult, key string) (float64, error)
func MOP(results []EvalResult, key string) (float64, error)
func MarkdownReport(results []EvalResult) string
func VAF(results []EvalResult, key string) (float64, error)
type AgentFactory
type ApprovalCase
type ApprovalCaseResult
- func EvaluateApprovalCases(ctx context.Context, manager *approval.Gate, sess *session.Session, ...) []ApprovalCaseResult
type CostEfficiency
- func (s *CostEfficiency) Name() string
- func (s *CostEfficiency) ScoreTurn(_ context.Context, turn session.RunTurn) (float64, error)
type Environment
type EvalResult
- func Run(ctx context.Context, sessions []*session.Session, opts Options) ([]EvalResult, error)
type HarborConnector
- func (HarborConnector) Connect(spec HarnessTaskSpec) (TaskSpec, error)
type HarnessEnvironment
- func (e HarnessEnvironment) Bootstrap(ctx context.Context, sess *session.Session) error
- func (e HarnessEnvironment) ID() string
type HarnessTaskSpec
type Judge
- func (j *Judge) Name() string
- func (j *Judge) ScoreTurn(ctx context.Context, turn session.RunTurn) (float64, error)
type MemoryAssertion
- func ExcludeIDs(ids ...string) MemoryAssertion
- func ExcludeNamespaces(namespaces ...memory.Namespace) MemoryAssertion
- func RequireNoForgotten() MemoryAssertion
- func RequireNoSuperseded() MemoryAssertion
- func RequireRoles(roles ...memory.Role) MemoryAssertion
type MemoryCase
type MemoryCaseResult
- func EvaluateMemoryCases(ctx context.Context, retriever memory.Retriever, cases []MemoryCase) ([]MemoryCaseResult, error)
type MemoryExpectation
type Options
type ParallelRunner
- func NewParallelRunner(workers int, agentFn AgentFactory) *ParallelRunner
- func (r *ParallelRunner) Run(ctx context.Context, task Task, runs int) ([]RunResult, error)
type PlanAdherence
- func (p *PlanAdherence) Name() string
- func (p *PlanAdherence) ScoreRun(ctx context.Context, log *session.RunLog) (float64, error)
type RunEvaluator
type RunResult
type RunSample
type RunSeries
- func NewRunSeries(results []EvalResult, key string) (RunSeries, error)
- func (s RunSeries) GDS() float64
- func (s RunSeries) MOP() float64
- func (s RunSeries) VAF() float64
type Runner
type SWEBenchConnector
- func (SWEBenchConnector) Connect(spec HarnessTaskSpec) (TaskSpec, error)
type StaticEnvironment
- func (e StaticEnvironment) Bootstrap(ctx context.Context, sess *session.Session) error
- func (e StaticEnvironment) ID() string
type StepEfficiency
- func (s *StepEfficiency) Name() string
- func (s *StepEfficiency) ScoreTurn(_ context.Context, turn session.RunTurn) (float64, error)
type Task
type TaskSpec
- func (t TaskSpec) Environment() Environment
- func (t TaskSpec) ID() string
- func (t TaskSpec) Instruction() string
type ToolCorrectness
- func (s *ToolCorrectness) Name() string
- func (s *ToolCorrectness) ScoreTurn(_ context.Context, turn session.RunTurn) (float64, error)
type TurnEvaluator

Constants ¶

This section is empty.

Variables ¶

This section is empty.

Functions ¶

func GDS ¶

func GDS(results []EvalResult, key string) (float64, error)

GDS computes weighted partial credit across repeated runs for the named score.

func MOP ¶

func MOP(results []EvalResult, key string) (float64, error)

MOP computes the normalized collapse onset point across repeated runs for the named score.

func MarkdownReport ¶

func MarkdownReport(results []EvalResult) string

MarkdownReport generates a Markdown summary of the evaluation results.

func VAF ¶

func VAF(results []EvalResult, key string) (float64, error)

VAF computes normalized score variance across repeated runs for the named score.

Types ¶

type AgentFactory ¶

type AgentFactory func(task Task, runIndex int) agent.Agent

AgentFactory constructs a fresh agent for a task/run pair.

Runner may call the factory concurrently, so implementations should return independent agents or otherwise guarantee concurrency safety.

type ApprovalCase ¶

type ApprovalCase struct {
	Name        string
	Run         func(context.Context, *approval.Gate, *session.Session) error
	ExpectError string
}

type ApprovalCaseResult ¶

type ApprovalCaseResult struct {
	Name   string
	Passed bool
	Error  string
}

func EvaluateApprovalCases ¶

func EvaluateApprovalCases(
	ctx context.Context,
	manager *approval.Gate,
	sess *session.Session,
	cases []ApprovalCase,
) []ApprovalCaseResult

type CostEfficiency ¶

type CostEfficiency struct{}

CostEfficiency rewards low-cost turns.

func (*CostEfficiency) Name ¶

func (s *CostEfficiency) Name() string

Name returns the scorer identifier.

func (*CostEfficiency) ScoreTurn ¶

func (s *CostEfficiency) ScoreTurn(
	_ context.Context,
	turn session.RunTurn,
) (float64, error)

ScoreTurn rewards lower-cost turns.

type Environment ¶

type Environment interface {
	ID() string
	Bootstrap(ctx context.Context, sess *session.Session) error
}

Environment prepares a task-specific session before the agent runs.

type EvalResult ¶

type EvalResult struct {
	RunID     string             `json:"run_id"`
	AgentID   string             `json:"agent_id"`
	ScoredAt  time.Time          `json:"scored_at"`
	TurnCount int                `json:"turn_count"`
	TotalCost float64            `json:"total_cost"`
	Scores    map[string]float64 `json:"scores"` // evaluator name → mean score across turns
	Metadata  map[string]any     `json:"metadata,omitzero"`
}

EvalResult is the scored output for a single run transcript.

func Run ¶

func Run(
	ctx context.Context,
	sessions []*session.Session,
	opts Options,
) ([]EvalResult, error)

Run exports a RunLog from each session, scores every turn with each TurnEvaluator, and scores each log with each RunEvaluator.

type HarborConnector ¶

type HarborConnector struct{}

HarborConnector adapts Harbor-style harness specs into Canto tasks.

func (HarborConnector) Connect ¶

func (HarborConnector) Connect(spec HarnessTaskSpec) (TaskSpec, error)

Connect builds a task for Harbor-oriented eval harnesses.

type HarnessEnvironment ¶

type HarnessEnvironment struct {
	EnvironmentID  string
	HarnessName    string
	TaskID         string
	WorkspacePath  string
	Repository     string
	BaseCommit     string
	ContainerImage string
	SetupCommands  []string
	TestCommands   []string
	Notes          []string
	Metadata       map[string]any
}

HarnessEnvironment seeds the session with harness-specific context.

func (HarnessEnvironment) Bootstrap ¶

func (e HarnessEnvironment) Bootstrap(ctx context.Context, sess *session.Session) error

Bootstrap appends a deterministic harness summary to the session.

func (HarnessEnvironment) ID ¶

func (e HarnessEnvironment) ID() string

ID returns the stable environment identifier.

type HarnessTaskSpec ¶

type HarnessTaskSpec struct {
	TaskID          string
	InstructionText string
	EnvironmentID   string
	WorkspacePath   string
	Repository      string
	BaseCommit      string
	ContainerImage  string
	SetupCommands   []string
	TestCommands    []string
	Notes           []string
	Metadata        map[string]any
}

HarnessTaskSpec captures the common inputs used by external eval harnesses.

type Judge ¶

type Judge struct {
	NameText string
	Criteria string
	Model    string
	Provider llm.Provider
}

Judge uses an LLM to score a transcript turn based on a rubric or criteria.

func (*Judge) Name ¶

func (j *Judge) Name() string

Name returns the identifier for this judge.

func (*Judge) ScoreTurn ¶

func (j *Judge) ScoreTurn(ctx context.Context, turn session.RunTurn) (float64, error)

ScoreTurn executes an LLM call to evaluate the turn.

type MemoryAssertion ¶

type MemoryAssertion func([]memory.Memory) error

func ExcludeIDs ¶

func ExcludeIDs(ids ...string) MemoryAssertion

func ExcludeNamespaces ¶

func ExcludeNamespaces(namespaces ...memory.Namespace) MemoryAssertion

func RequireNoForgotten ¶

func RequireNoForgotten() MemoryAssertion

func RequireNoSuperseded ¶

func RequireNoSuperseded() MemoryAssertion

func RequireRoles ¶

func RequireRoles(roles ...memory.Role) MemoryAssertion

type MemoryCase ¶

type MemoryCase struct {
	Name   string
	Query  memory.Query
	Expect MemoryExpectation
	Assert MemoryAssertion
}

type MemoryCaseResult ¶

type MemoryCaseResult struct {
	Name           string
	Passed         bool
	Missing        []string
	Unexpected     []string
	AssertionError string
	Hits           []memory.Memory
}

func EvaluateMemoryCases ¶

func EvaluateMemoryCases(
	ctx context.Context,
	retriever memory.Retriever,
	cases []MemoryCase,
) ([]MemoryCaseResult, error)

type MemoryExpectation ¶

type MemoryExpectation struct {
	Contains []string
	Excludes []string
}

type Options ¶

type Options struct {
	TurnEvals   []TurnEvaluator
	RunEvals    []RunEvaluator
	OutputPath  string // Path to write JSONL results
	Concurrency int    // Number of parallel workers
}

Options defines a collection of evaluators and configuration for an evaluation run.

type ParallelRunner ¶

type ParallelRunner struct {
	Workers int
	AgentFn AgentFactory
}

ParallelRunner executes repeated task runs in parallel with a bounded worker pool.

func NewParallelRunner ¶

func NewParallelRunner(workers int, agentFn AgentFactory) *ParallelRunner

NewParallelRunner constructs a ParallelRunner.

func (*ParallelRunner) Run ¶

func (r *ParallelRunner) Run(ctx context.Context, task Task, runs int) ([]RunResult, error)

Run executes the task the given number of times (runs) and returns ordered results.

type PlanAdherence ¶

type PlanAdherence struct {
	NameText string
	Criteria string
	Model    string
	Provider llm.Provider
}

PlanAdherence scores whether the full trajectory stayed aligned with the intended plan and task objective.

func (*PlanAdherence) Name ¶

func (p *PlanAdherence) Name() string

Name returns the scorer identifier.

func (*PlanAdherence) ScoreRun ¶

func (p *PlanAdherence) ScoreRun(ctx context.Context, log *session.RunLog) (float64, error)

ScoreRun executes an LLM judge over the full trajectory.

type RunEvaluator ¶

type RunEvaluator interface {
	Name() string
	ScoreRun(ctx context.Context, log *session.RunLog) (float64, error)
}

RunEvaluator evaluates an entire RunLog and returns a score in [0, 1].

type RunResult ¶

type RunResult struct {
	RunID         string
	TaskID        string
	EnvironmentID string
	RunIndex      int
	AgentID       string
	Session       *session.Session
	StepResult    agent.StepResult
	Err           error
}

RunResult captures one repeated execution of a task.

type RunSample ¶

type RunSample struct {
	RunID     string
	Score     float64
	TurnCount int
	TotalCost float64
}

RunSample captures the per-run signal for a named score series.

type RunSeries ¶

type RunSeries struct {
	Key     string
	Samples []RunSample
}

RunSeries is a reusable sequence of scored runs for one score key.

func NewRunSeries ¶

func NewRunSeries(results []EvalResult, key string) (RunSeries, error)

NewRunSeries extracts one named score series from the provided results.

func (RunSeries) GDS ¶

func (s RunSeries) GDS() float64

GDS computes a weighted partial-credit summary over repeated runs.

The default weighting uses turn count as a cheap proxy for how much work each run represented. Longer runs should contribute more to the aggregate than one-turn stubs.

func (RunSeries) MOP ¶

func (s RunSeries) MOP() float64

MOP estimates the first sustained collapse point in the series.

The result is normalized to [0, 1], where 1 means no collapse was observed and lower values mean the collapse arrived earlier in the run series.

func (RunSeries) VAF ¶

func (s RunSeries) VAF() float64

VAF computes the variance of the score series, normalized by the mean.

type Runner ¶

type Runner interface {
	Run(ctx context.Context, task Task, runs int) ([]RunResult, error)
}

Runner executes a task one or more times.

type SWEBenchConnector ¶

type SWEBenchConnector struct{}

SWEBenchConnector adapts SWE-bench-style harness specs into Canto tasks.

func (SWEBenchConnector) Connect ¶

func (SWEBenchConnector) Connect(spec HarnessTaskSpec) (TaskSpec, error)

Connect builds a task for SWE-bench-style eval harnesses.

type StaticEnvironment ¶

type StaticEnvironment struct {
	EnvironmentID string
	Context       []session.ContextEntry
	Transcript    []llm.Message
}

StaticEnvironment is a simple Environment that appends fixed context and optional transcript seed messages.

func (StaticEnvironment) Bootstrap ¶

func (e StaticEnvironment) Bootstrap(ctx context.Context, sess *session.Session) error

Bootstrap appends the environment's context and transcript seeds to the session.

func (StaticEnvironment) ID ¶

func (e StaticEnvironment) ID() string

ID returns the environment identifier.

type StepEfficiency ¶

type StepEfficiency struct{}

StepEfficiency rewards turns that reach completion with fewer tool calls.

func (*StepEfficiency) Name ¶

func (s *StepEfficiency) Name() string

Name returns the scorer identifier.

func (*StepEfficiency) ScoreTurn ¶

func (s *StepEfficiency) ScoreTurn(_ context.Context, turn session.RunTurn) (float64, error)

ScoreTurn penalizes turns that require more tool calls.

type Task ¶

type Task interface {
	ID() string
	Instruction() string
	Environment() Environment
}

Task defines a repeatable evaluation scenario.

type TaskSpec ¶

type TaskSpec struct {
	TaskID          string
	InstructionText string
	Env             Environment
}

TaskSpec is the default concrete Task implementation.

func (TaskSpec) Environment ¶

func (t TaskSpec) Environment() Environment

Environment returns the task environment.

func (TaskSpec) ID ¶

func (t TaskSpec) ID() string

ID returns the task identifier.

func (TaskSpec) Instruction ¶

func (t TaskSpec) Instruction() string

Instruction returns the task instruction.

type ToolCorrectness ¶

type ToolCorrectness struct {
	// Expected is the set of tool names that should appear in the turn.
	Expected []string
}

ToolCorrectness scores how accurately a turn used the expected tools. If Expected is empty, every turn scores 1.0.

func (*ToolCorrectness) Name ¶

func (s *ToolCorrectness) Name() string

Name returns the scorer identifier.

func (*ToolCorrectness) ScoreTurn ¶

func (s *ToolCorrectness) ScoreTurn(
	_ context.Context,
	turn session.RunTurn,
) (float64, error)

ScoreTurn returns an F1-style score over expected vs actual tool names.

type TurnEvaluator ¶

type TurnEvaluator interface {
	Name() string
	ScoreTurn(ctx context.Context, turn session.RunTurn) (float64, error)
}

TurnEvaluator evaluates a single transcript turn and returns a score in [0, 1].

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL