experiment

package
v1.1.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 6, 2026 | License: MIT | Imports: 29 | Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
var ErrStoreUnavailable = errors.New("experiment store unavailable")

ErrStoreUnavailable indicates the experiment store is not configured.

Functions

This section is empty.

Types

type Comparator

type Comparator struct {
	// contains filtered or unexported fields
}

Comparator analyzes experiment results and computes rankings.

func NewComparator

func NewComparator(store *Store) *Comparator

NewComparator constructs a comparator for experiment results.

func (*Comparator) Compare

func (c *Comparator) Compare(exp *Experiment) (*ComparisonReport, error)

Compare loads runs and evaluations and produces a comparison report.

type ComparisonReport

type ComparisonReport struct {
	ExperimentID string
	Variants     []VariantReport
	Rankings     []Ranking
	Summary      string
}

ComparisonReport summarizes results for an experiment.

type CoordinatedRunner

type CoordinatedRunner struct {
	*Runner
	// contains filtered or unexported fields
}

CoordinatedRunner wraps Runner with conflict-aware scheduling.

func NewCoordinatedRunner

func NewCoordinatedRunner(cfg RunnerConfig, deps Dependencies, repoPath string) (*CoordinatedRunner, error)

NewCoordinatedRunner creates a runner with conflict-aware coordination.

func (*CoordinatedRunner) PreviewExecution

func (r *CoordinatedRunner) PreviewExecution(exp *Experiment) *parallel.ExecutionPreview

PreviewExecution returns a preview of how tasks would be scheduled.

func (*CoordinatedRunner) RunCoordinated

func (r *CoordinatedRunner) RunCoordinated(ctx context.Context, exp *Experiment, targetBranch string) (*parallel.ExecutionReport, error)

RunCoordinated executes the experiment with conflict-aware scheduling. Tasks with overlapping file scopes are automatically serialized into waves.

func (*CoordinatedRunner) SetConflictHandler

func (r *CoordinatedRunner) SetConflictHandler(fn func(parallel.ConflictEvent))

SetConflictHandler sets a callback for conflict events.

func (*CoordinatedRunner) SetMergeHandler

func (r *CoordinatedRunner) SetMergeHandler(fn func(parallel.MergeEvent))

SetMergeHandler sets a callback for merge events.

func (*CoordinatedRunner) SetPartitionHandler

func (r *CoordinatedRunner) SetPartitionHandler(fn func(parallel.PartitionEvent))

SetPartitionHandler sets a callback for partition events.

type CriterionEvaluation

type CriterionEvaluation struct {
	ID          int64
	RunID       string
	CriterionID int64
	Passed      bool
	Score       float64
	Details     string
	EvaluatedAt time.Time
}

CriterionEvaluation records evaluation results for a run.

func EvaluateCriteria

func EvaluateCriteria(ctx context.Context, worktreePath string, workingDir string, output string, criteria []SuccessCriterion) []CriterionEvaluation

EvaluateCriteria evaluates success criteria for a run and returns evaluations.

type CriterionType

type CriterionType string

CriterionType defines supported evaluation types.

const (
	CriterionTestPass   CriterionType = "test_pass"
	CriterionFileExists CriterionType = "file_exists"
	CriterionContains   CriterionType = "contains"
	CriterionCommand    CriterionType = "command"
	CriterionManual     CriterionType = "manual"
)

type Dependencies

type Dependencies struct {
	Config         *config.Config
	ModelManager   *model.Manager
	ProjectContext *projectcontext.ProjectContext
	Telemetry      *telemetry.Hub
	Notify         *notify.Manager
	Worktree       parallel.WorktreeManager
	Store          *Store
}

Dependencies bundles the shared dependencies for the runner.

type Experiment

type Experiment struct {
	ID          string
	Name        string
	Description string
	Hypothesis  string
	Task        Task
	Variants    []Variant
	Criteria    []SuccessCriterion
	Status      ExperimentStatus
	CreatedAt   time.Time
	CompletedAt *time.Time
}

Experiment groups variants for a single comparison run.

type ExperimentStatus

type ExperimentStatus string

ExperimentStatus captures lifecycle state for an experiment.

const (
	ExperimentPending   ExperimentStatus = "pending"
	ExperimentRunning   ExperimentStatus = "running"
	ExperimentCompleted ExperimentStatus = "completed"
	ExperimentFailed    ExperimentStatus = "failed"
	ExperimentCancelled ExperimentStatus = "cancelled"
)

type Ranking

type Ranking struct {
	VariantID string
	Score     float64
	Rank      int
}

Ranking captures ordering for variants.

type ReplayConfig

type ReplayConfig struct {
	SourceSessionID    string
	NewModelID         string
	NewProviderID      string
	NewSystemPrompt    *string
	NewTemperature     *float64
	DeterministicTools bool
}

ReplayConfig specifies how to replay a session.

type Replayer

type Replayer struct {
	// contains filtered or unexported fields
}

Replayer replays a stored session with new configuration.

func NewReplayer

func NewReplayer(store *storage.Store, runner *Runner) (*Replayer, error)

NewReplayer constructs a replayer.

func (*Replayer) Replay

func (r *Replayer) Replay(ctx context.Context, cfg ReplayConfig) (*Run, error)

Replay re-executes a session with updated model configuration.

type Reporter

type Reporter struct {
	// contains filtered or unexported fields
}

Reporter formats experiment results for humans.

func NewReporter

func NewReporter() *Reporter

NewReporter creates a Reporter instance.

func NewReporterWithComparator

func NewReporterWithComparator(comparator *Comparator) *Reporter

NewReporterWithComparator creates a reporter that can compare stored runs.

func (*Reporter) ComparisonMarkdown

func (r *Reporter) ComparisonMarkdown(exp *Experiment) (string, error)

ComparisonMarkdown renders a Markdown report from persisted experiment runs.

func (*Reporter) MarkdownTable

func (r *Reporter) MarkdownTable(exp *Experiment, results []*parallel.AgentResult) string

MarkdownTable renders a Markdown summary table for the experiment results.

type Run

type Run struct {
	ID           string
	ExperimentID string
	VariantID    string
	SessionID    string
	Branch       string
	Status       RunStatus
	Output       string
	Files        []string
	Metrics      RunMetrics
	Error        *string
	StartedAt    time.Time
	CompletedAt  *time.Time
}

Run captures a single execution of a variant.

type RunMetrics

type RunMetrics struct {
	DurationMs       int64
	PromptTokens     int
	CompletionTokens int
	TotalCost        float64
	ToolCalls        int
	ToolSuccesses    int
	ToolFailures     int
	FilesModified    int
	LinesChanged     int
}

RunMetrics captures measurable outcomes.

type RunStatus

type RunStatus string

RunStatus captures lifecycle state for a run.

const (
	RunPending   RunStatus = "pending"
	RunRunning   RunStatus = "running"
	RunCompleted RunStatus = "completed"
	RunFailed    RunStatus = "failed"
	RunCancelled RunStatus = "cancelled"
)

type Runner

type Runner struct {
	// contains filtered or unexported fields
}

Runner executes experiments across multiple variants.

func NewRunner

func NewRunner(cfg RunnerConfig, deps Dependencies) (*Runner, error)

NewRunner constructs a runner with the required dependencies.

func (*Runner) RunExperiment

func (r *Runner) RunExperiment(ctx context.Context, exp *Experiment) ([]*parallel.AgentResult, error)

RunExperiment executes all variants and returns their results.

type RunnerConfig

type RunnerConfig struct {
	MaxConcurrent  int
	DefaultTimeout time.Duration
	CleanupOnDone  bool
}

RunnerConfig controls experiment execution behavior.

type Store

type Store struct {
	// contains filtered or unexported fields
}

Store manages experiment persistence.

func NewStore

func NewStore(db *sql.DB) *Store

NewStore constructs an experiment store from a database handle.

func NewStoreFromStorage

func NewStoreFromStorage(store *storage.Store) *Store

NewStoreFromStorage constructs an experiment store from the main storage store.

func (*Store) CreateExperiment

func (s *Store) CreateExperiment(exp *Experiment) error

CreateExperiment persists a new experiment along with variants and criteria.

func (*Store) FindExperimentByName

func (s *Store) FindExperimentByName(name string) (*Experiment, error)

FindExperimentByName loads the most recent experiment with the given name.

func (*Store) GetExperiment

func (s *Store) GetExperiment(id string) (*Experiment, error)

GetExperiment loads a single experiment with variants and criteria.

func (*Store) GetRun

func (s *Store) GetRun(runID string) (*Run, error)

GetRun fetches a single run by ID.

func (*Store) ListEvaluationsByExperiment

func (s *Store) ListEvaluationsByExperiment(experimentID string) (map[string][]CriterionEvaluation, error)

ListEvaluationsByExperiment returns evaluations keyed by run ID.

func (*Store) ListExperiments

func (s *Store) ListExperiments(limit int, status ExperimentStatus) ([]Experiment, error)

ListExperiments returns recent experiments, optionally filtered by status.

func (*Store) ListRuns

func (s *Store) ListRuns(experimentID string) ([]Run, error)

ListRuns returns runs for an experiment.

func (*Store) ReplaceEvaluations

func (s *Store) ReplaceEvaluations(runID string, evals []CriterionEvaluation) error

ReplaceEvaluations overwrites evaluations for a run.

func (*Store) SaveRun

func (s *Store) SaveRun(run *Run) error

SaveRun inserts or updates a run record.

func (*Store) UpdateExperimentStatus

func (s *Store) UpdateExperimentStatus(id string, status ExperimentStatus, completedAt *time.Time) error

UpdateExperimentStatus updates experiment status and completion timestamp.

type SuccessCriterion

type SuccessCriterion struct {
	ID     int64
	Name   string
	Type   CriterionType
	Target string
	Weight float64
}

SuccessCriterion defines how to evaluate a run.

type Task

type Task struct {
	Prompt     string
	Context    map[string]string
	WorkingDir string
	Timeout    time.Duration
	Files      []string // Explicit file paths for scope conflict detection
	Scope      []string // Glob patterns for scope conflict detection (e.g., "pkg/auth/...")
}

Task describes what each variant should execute.

type TerminalReporter

type TerminalReporter struct {
	// contains filtered or unexported fields
}

TerminalReporter renders experiment results with colors and charts.

func NewTerminalReporter

func NewTerminalReporter(comparator *Comparator) *TerminalReporter

NewTerminalReporter creates a reporter for terminal output.

func NewTerminalReporterWithOutput

func NewTerminalReporterWithOutput(out io.Writer, comparator *Comparator) *TerminalReporter

NewTerminalReporterWithOutput creates a reporter with custom output.

func (*TerminalReporter) RenderCompact

func (r *TerminalReporter) RenderCompact(exp *Experiment) error

RenderCompact renders a compact one-line summary per variant.

func (*TerminalReporter) RenderReport

func (r *TerminalReporter) RenderReport(exp *Experiment) error

RenderReport renders a full experiment report with charts.

func (*TerminalReporter) SetNoColor

func (r *TerminalReporter) SetNoColor(noColor bool)

SetNoColor sets whether color output is disabled.

type Variant

type Variant struct {
	ID           string
	Name         string
	ModelID      string
	ProviderID   string
	SystemPrompt *string
	Temperature  *float64
	MaxTokens    *int
	ToolsAllowed []string
	CustomConfig map[string]any
	Files        []string // Override task-level file scope for this variant
	Scope        []string // Override task-level glob scope for this variant
}

Variant describes a model configuration to test.

type VariantReport

type VariantReport struct {
	VariantID      string
	VariantName    string
	ModelID        string
	Status         RunStatus
	Metrics        RunMetrics
	CriteriaScore  float64
	CriteriaPassed []string
	CriteriaFailed []string
	OutputPreview  string
	Error          string
}

VariantReport captures metrics and criteria results per variant.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL