experiment

package

Version: v1.1.0 (latest) · Published: Apr 6, 2026 · License: MIT · Imports: 29 · Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/odvcencio/buckley

Links

Open Source Insights

Documentation ¶

Index ¶

Variables
type Comparator
- func NewComparator(store *Store) *Comparator
- func (c *Comparator) Compare(exp *Experiment) (*ComparisonReport, error)
type ComparisonReport
type CoordinatedRunner
- func NewCoordinatedRunner(cfg RunnerConfig, deps Dependencies, repoPath string) (*CoordinatedRunner, error)
- func (r *CoordinatedRunner) PreviewExecution(exp *Experiment) *parallel.ExecutionPreview
- func (r *CoordinatedRunner) RunCoordinated(ctx context.Context, exp *Experiment, targetBranch string) (*parallel.ExecutionReport, error)
- func (r *CoordinatedRunner) SetConflictHandler(fn func(parallel.ConflictEvent))
- func (r *CoordinatedRunner) SetMergeHandler(fn func(parallel.MergeEvent))
- func (r *CoordinatedRunner) SetPartitionHandler(fn func(parallel.PartitionEvent))
type CriterionEvaluation
- func EvaluateCriteria(ctx context.Context, worktreePath string, workingDir string, output string, criteria []SuccessCriterion) []CriterionEvaluation
type CriterionType
type Dependencies
type Experiment
type ExperimentStatus
type Ranking
type ReplayConfig
type Replayer
- func NewReplayer(store *storage.Store, runner *Runner) (*Replayer, error)
- func (r *Replayer) Replay(ctx context.Context, cfg ReplayConfig) (*Run, error)
type Reporter
- func NewReporter() *Reporter
- func NewReporterWithComparator(comparator *Comparator) *Reporter
- func (r *Reporter) ComparisonMarkdown(exp *Experiment) (string, error)
- func (r *Reporter) MarkdownTable(exp *Experiment, results []*parallel.AgentResult) string
type Run
type RunMetrics
type RunStatus
type Runner
- func NewRunner(cfg RunnerConfig, deps Dependencies) (*Runner, error)
- func (r *Runner) RunExperiment(ctx context.Context, exp *Experiment) ([]*parallel.AgentResult, error)
type RunnerConfig
type Store
- func NewStore(db *sql.DB) *Store
- func NewStoreFromStorage(store *storage.Store) *Store
- func (s *Store) CreateExperiment(exp *Experiment) error
- func (s *Store) FindExperimentByName(name string) (*Experiment, error)
- func (s *Store) GetExperiment(id string) (*Experiment, error)
- func (s *Store) GetRun(runID string) (*Run, error)
- func (s *Store) ListEvaluationsByExperiment(experimentID string) (map[string][]CriterionEvaluation, error)
- func (s *Store) ListExperiments(limit int, status ExperimentStatus) ([]Experiment, error)
- func (s *Store) ListRuns(experimentID string) ([]Run, error)
- func (s *Store) ReplaceEvaluations(runID string, evals []CriterionEvaluation) error
- func (s *Store) SaveRun(run *Run) error
- func (s *Store) UpdateExperimentStatus(id string, status ExperimentStatus, completedAt *time.Time) error
type SuccessCriterion
type Task
type TerminalReporter
- func NewTerminalReporter(comparator *Comparator) *TerminalReporter
- func NewTerminalReporterWithOutput(out io.Writer, comparator *Comparator) *TerminalReporter
- func (r *TerminalReporter) RenderCompact(exp *Experiment) error
- func (r *TerminalReporter) RenderReport(exp *Experiment) error
- func (r *TerminalReporter) SetNoColor(noColor bool)
type Variant
type VariantReport

Constants ¶

This section is empty.

Variables ¶

View Source

var ErrStoreUnavailable = errors.New("experiment store unavailable")

ErrStoreUnavailable indicates the experiment store is not configured.

Functions ¶

This section is empty.

Types ¶

type Comparator ¶

type Comparator struct {
	// contains filtered or unexported fields
}

Comparator analyzes experiment results and computes rankings.

func NewComparator ¶

func NewComparator(store *Store) *Comparator

NewComparator constructs a comparator for experiment results.

func (*Comparator) Compare ¶

func (c *Comparator) Compare(exp *Experiment) (*ComparisonReport, error)

Compare loads runs and evaluations and produces a comparison report.

type ComparisonReport ¶

type ComparisonReport struct {
	ExperimentID string
	Variants     []VariantReport
	Rankings     []Ranking
	Summary      string
}

ComparisonReport summarizes results for an experiment.

type CoordinatedRunner ¶

type CoordinatedRunner struct {
	*Runner
	// contains filtered or unexported fields
}

CoordinatedRunner wraps Runner with conflict-aware scheduling.

func NewCoordinatedRunner ¶

func NewCoordinatedRunner(cfg RunnerConfig, deps Dependencies, repoPath string) (*CoordinatedRunner, error)

NewCoordinatedRunner creates a runner with conflict-aware coordination.

func (*CoordinatedRunner) PreviewExecution ¶

func (r *CoordinatedRunner) PreviewExecution(exp *Experiment) *parallel.ExecutionPreview

PreviewExecution returns a preview of how tasks would be scheduled.

func (*CoordinatedRunner) RunCoordinated ¶

func (r *CoordinatedRunner) RunCoordinated(ctx context.Context, exp *Experiment, targetBranch string) (*parallel.ExecutionReport, error)

RunCoordinated executes the experiment with conflict-aware scheduling. Tasks with overlapping file scopes are automatically serialized into waves.

func (*CoordinatedRunner) SetConflictHandler ¶

func (r *CoordinatedRunner) SetConflictHandler(fn func(parallel.ConflictEvent))

SetConflictHandler sets a callback for conflict events.

func (*CoordinatedRunner) SetMergeHandler ¶

func (r *CoordinatedRunner) SetMergeHandler(fn func(parallel.MergeEvent))

SetMergeHandler sets a callback for merge events.

func (*CoordinatedRunner) SetPartitionHandler ¶

func (r *CoordinatedRunner) SetPartitionHandler(fn func(parallel.PartitionEvent))

SetPartitionHandler sets a callback for partition events.

type CriterionEvaluation ¶

type CriterionEvaluation struct {
	ID          int64
	RunID       string
	CriterionID int64
	Passed      bool
	Score       float64
	Details     string
	EvaluatedAt time.Time
}

CriterionEvaluation records evaluation results for a run.

func EvaluateCriteria ¶

func EvaluateCriteria(ctx context.Context, worktreePath string, workingDir string, output string, criteria []SuccessCriterion) []CriterionEvaluation

EvaluateCriteria evaluates success criteria for a run and returns evaluations.

type CriterionType ¶

type CriterionType string

CriterionType defines supported evaluation types.

const (
	CriterionTestPass   CriterionType = "test_pass"
	CriterionFileExists CriterionType = "file_exists"
	CriterionContains   CriterionType = "contains"
	CriterionCommand    CriterionType = "command"
	CriterionManual     CriterionType = "manual"
)

type Dependencies ¶

type Dependencies struct {
	Config         *config.Config
	ModelManager   *model.Manager
	ProjectContext *projectcontext.ProjectContext
	Telemetry      *telemetry.Hub
	Notify         *notify.Manager
	Worktree       parallel.WorktreeManager
	Store          *Store
}

Dependencies bundles the shared dependencies for the runner.

type Experiment ¶

type Experiment struct {
	ID          string
	Name        string
	Description string
	Hypothesis  string
	Task        Task
	Variants    []Variant
	Criteria    []SuccessCriterion
	Status      ExperimentStatus
	CreatedAt   time.Time
	CompletedAt *time.Time
}

Experiment groups variants for a single comparison run.

type ExperimentStatus ¶

type ExperimentStatus string

ExperimentStatus captures lifecycle state for an experiment.

const (
	ExperimentPending   ExperimentStatus = "pending"
	ExperimentRunning   ExperimentStatus = "running"
	ExperimentCompleted ExperimentStatus = "completed"
	ExperimentFailed    ExperimentStatus = "failed"
	ExperimentCancelled ExperimentStatus = "cancelled"
)

type Ranking ¶

type Ranking struct {
	VariantID string
	Score     float64
	Rank      int
}

Ranking captures ordering for variants.

type ReplayConfig ¶

type ReplayConfig struct {
	SourceSessionID    string
	NewModelID         string
	NewProviderID      string
	NewSystemPrompt    *string
	NewTemperature     *float64
	DeterministicTools bool
}

ReplayConfig specifies how to replay a session.

type Replayer ¶

type Replayer struct {
	// contains filtered or unexported fields
}

Replayer replays a stored session with new configuration.

func NewReplayer ¶

func NewReplayer(store *storage.Store, runner *Runner) (*Replayer, error)

NewReplayer constructs a replayer.

func (*Replayer) Replay ¶

func (r *Replayer) Replay(ctx context.Context, cfg ReplayConfig) (*Run, error)

Replay re-executes a session with updated model configuration.

type Reporter ¶

type Reporter struct {
	// contains filtered or unexported fields
}

Reporter formats experiment results for humans.

func NewReporter ¶

func NewReporter() *Reporter

NewReporter creates a Reporter instance.

func NewReporterWithComparator ¶

func NewReporterWithComparator(comparator *Comparator) *Reporter

NewReporterWithComparator creates a reporter that can compare stored runs.

func (*Reporter) ComparisonMarkdown ¶

func (r *Reporter) ComparisonMarkdown(exp *Experiment) (string, error)

ComparisonMarkdown renders a markdown report from persisted experiment runs.

func (*Reporter) MarkdownTable ¶

func (r *Reporter) MarkdownTable(exp *Experiment, results []*parallel.AgentResult) string

MarkdownTable renders a markdown summary table for the experiment results.

type Run ¶

type Run struct {
	ID           string
	ExperimentID string
	VariantID    string
	SessionID    string
	Branch       string
	Status       RunStatus
	Output       string
	Files        []string
	Metrics      RunMetrics
	Error        *string
	StartedAt    time.Time
	CompletedAt  *time.Time
}

Run captures a single execution of a variant.

type RunMetrics ¶

type RunMetrics struct {
	DurationMs       int64
	PromptTokens     int
	CompletionTokens int
	TotalCost        float64
	ToolCalls        int
	ToolSuccesses    int
	ToolFailures     int
	FilesModified    int
	LinesChanged     int
}

RunMetrics captures measurable outcomes.

type RunStatus ¶

type RunStatus string

RunStatus captures lifecycle state for a run.

const (
	RunPending   RunStatus = "pending"
	RunRunning   RunStatus = "running"
	RunCompleted RunStatus = "completed"
	RunFailed    RunStatus = "failed"
	RunCancelled RunStatus = "cancelled"
)

type Runner ¶

type Runner struct {
	// contains filtered or unexported fields
}

Runner executes experiments across multiple variants.

func NewRunner ¶

func NewRunner(cfg RunnerConfig, deps Dependencies) (*Runner, error)

NewRunner constructs a runner with the required dependencies.

func (*Runner) RunExperiment ¶

func (r *Runner) RunExperiment(ctx context.Context, exp *Experiment) ([]*parallel.AgentResult, error)

RunExperiment executes all variants and returns their results.

type RunnerConfig ¶

type RunnerConfig struct {
	MaxConcurrent  int
	DefaultTimeout time.Duration
	CleanupOnDone  bool
}

RunnerConfig controls experiment execution behavior.

type Store ¶

type Store struct {
	// contains filtered or unexported fields
}

Store manages experiment persistence.

func NewStore ¶

func NewStore(db *sql.DB) *Store

NewStore constructs an experiment store from a database handle.

func NewStoreFromStorage ¶

func NewStoreFromStorage(store *storage.Store) *Store

NewStoreFromStorage constructs an experiment store from the main storage store.

func (*Store) CreateExperiment ¶

func (s *Store) CreateExperiment(exp *Experiment) error

CreateExperiment persists a new experiment along with variants and criteria.

func (*Store) FindExperimentByName ¶

func (s *Store) FindExperimentByName(name string) (*Experiment, error)

FindExperimentByName loads the most recent experiment with the given name.

func (*Store) GetExperiment ¶

func (s *Store) GetExperiment(id string) (*Experiment, error)

GetExperiment loads a single experiment with variants and criteria.

func (*Store) GetRun ¶

func (s *Store) GetRun(runID string) (*Run, error)

GetRun fetches a single run by ID.

func (*Store) ListEvaluationsByExperiment ¶

func (s *Store) ListEvaluationsByExperiment(experimentID string) (map[string][]CriterionEvaluation, error)

ListEvaluationsByExperiment returns evaluations keyed by run ID.

func (*Store) ListExperiments ¶

func (s *Store) ListExperiments(limit int, status ExperimentStatus) ([]Experiment, error)

ListExperiments returns recent experiments, optionally filtered by status.

func (*Store) ListRuns ¶

func (s *Store) ListRuns(experimentID string) ([]Run, error)

ListRuns returns runs for an experiment.

func (*Store) ReplaceEvaluations ¶

func (s *Store) ReplaceEvaluations(runID string, evals []CriterionEvaluation) error

ReplaceEvaluations overwrites evaluations for a run.

func (*Store) SaveRun ¶

func (s *Store) SaveRun(run *Run) error

SaveRun inserts or updates a run record.

func (*Store) UpdateExperimentStatus ¶

func (s *Store) UpdateExperimentStatus(id string, status ExperimentStatus, completedAt *time.Time) error

UpdateExperimentStatus updates experiment status and completion timestamp.

type SuccessCriterion ¶

type SuccessCriterion struct {
	ID     int64
	Name   string
	Type   CriterionType
	Target string
	Weight float64
}

SuccessCriterion defines how to evaluate a run.

type Task ¶

type Task struct {
	Prompt     string
	Context    map[string]string
	WorkingDir string
	Timeout    time.Duration
	Files      []string // Explicit file paths for scope conflict detection
	Scope      []string // Glob patterns for scope conflict detection (e.g., "pkg/auth/...")
}

Task describes what each variant should execute.

type TerminalReporter ¶

type TerminalReporter struct {
	// contains filtered or unexported fields
}

TerminalReporter renders experiment results with colors and charts.

func NewTerminalReporter ¶

func NewTerminalReporter(comparator *Comparator) *TerminalReporter

NewTerminalReporter creates a reporter for terminal output.

func NewTerminalReporterWithOutput ¶

func NewTerminalReporterWithOutput(out io.Writer, comparator *Comparator) *TerminalReporter

NewTerminalReporterWithOutput creates a reporter with custom output.

func (*TerminalReporter) RenderCompact ¶

func (r *TerminalReporter) RenderCompact(exp *Experiment) error

RenderCompact renders a compact one-line summary per variant.

func (*TerminalReporter) RenderReport ¶

func (r *TerminalReporter) RenderReport(exp *Experiment) error

RenderReport renders a full experiment report with charts.

func (*TerminalReporter) SetNoColor ¶

func (r *TerminalReporter) SetNoColor(noColor bool)

SetNoColor sets whether color output is disabled.

type Variant ¶

type Variant struct {
	ID           string
	Name         string
	ModelID      string
	ProviderID   string
	SystemPrompt *string
	Temperature  *float64
	MaxTokens    *int
	ToolsAllowed []string
	CustomConfig map[string]any
	Files        []string // Override task-level file scope for this variant
	Scope        []string // Override task-level glob scope for this variant
}

Variant describes a model configuration to test.

type VariantReport ¶

type VariantReport struct {
	VariantID      string
	VariantName    string
	ModelID        string
	Status         RunStatus
	Metrics        RunMetrics
	CriteriaScore  float64
	CriteriaPassed []string
	CriteriaFailed []string
	OutputPreview  string
	Error          string
}

VariantReport captures metrics and criteria results per variant.

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL