evalharness

package
v1.1.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 17, 2026 License: MIT Imports: 29 Imported by: 0

Documentation

Index

Constants

View Source
// Lane identifiers. Each value is identical to the JSON key of the matching
// counter field in LaneCounts (for example LaneDocsPlans and
// LaneCounts.DocsPlans both use "docs_plans").
const (
	LaneDocsPlans          = "docs_plans"
	LaneTestCase           = "test_case"
	LaneCodeComment        = "code_comment"
	LaneSourceContextOther = "source_context_other"
	LanePackedSections     = "packed_sections"
)
View Source
// Canonical lane identifiers.
//
// NOTE(review): presumably the values stored in ArtifactGrade.CanonicalLane
// and in the Lane field of the LaneMetric entries under
// Result.CanonicalLanes; confirm against the code that assigns them.
const (
	CanonicalLaneIntent        = "intent"
	CanonicalLaneModel         = "model"
	CanonicalLaneProtocol      = "protocol"
	CanonicalLaneTemplate      = "template"
	CanonicalLaneTrace         = "trace"
	CanonicalLaneSourceContext = "source_context"
	CanonicalLaneUnknown       = "unknown"
)
View Source
// Corpus source (CorpusSource*) and product path (ProductPath*) identifiers.
//
// NOTE(review): presumably the accepted values of Options.CorpusSource /
// Result.CorpusSource and of Result.ProductPath respectively; how a corpus
// source maps to a product path is not visible here and should be confirmed.
const (
	CorpusSourceFilesystemFixture = "filesystem_fixture"
	CorpusSourceSQLiteIndex       = "sqlite_index"
	ProductPathLabOnly            = "lab_only"
	ProductPathIndexedHarness     = "indexed_harness"
	ProductPathLiveCLICommand     = "live_cli_command"
)

Variables

This section is empty.

Functions

func CheckSummaryThresholds

func CheckSummaryThresholds(r *Result, opts Options) []string

func FormatJSON

func FormatJSON(r *Result) ([]byte, error)

func FormatText

func FormatText(r *Result) string

Types

type AgentMetrics

// AgentMetrics is the JSON-serializable set of must-hit, first-rank and
// token-budget sufficiency figures. It is embedded by value as
// Result.AgentMetrics and Summary.AgentMetrics.
//
// NOTE(review): the float64 MustHitAt* fields look like rates over all cases
// (their per-case counterparts in CaseAgentMetrics are bools); confirm against
// the aggregation code.
type AgentMetrics struct {
	MustHitAt1                      float64                  `json:"must_hit_at_1"`
	MustHitAt3                      float64                  `json:"must_hit_at_3"`
	MustHitAt5                      float64                  `json:"must_hit_at_5"`
	MustHitAt10                     float64                  `json:"must_hit_at_10"`
	MeanFirstMustRank               float64                  `json:"mean_first_must_rank,omitempty"`
	MeanFirstUsefulRank             float64                  `json:"mean_first_useful_rank,omitempty"`
	ContextSufficiencyAtTokenBudget []TokenBudgetSufficiency `json:"context_sufficiency_at_token_budget"`
	LowPrecisionSufficientCases     int                      `json:"low_precision_sufficient_cases"`
}

type ApproxTokenCounter

// ApproxTokenCounter is a field-less type whose listed methods, Count and
// Name, have the signatures required by the TokenCounter interface.
//
// NOTE(review): the counting heuristic itself is not visible here.
type ApproxTokenCounter struct{}

func (ApproxTokenCounter) Count

func (ApproxTokenCounter) Count(text string) int

func (ApproxTokenCounter) Name

func (ApproxTokenCounter) Name() string

func (ApproxTokenCounter) Profile

type ArtifactGrade

// ArtifactGrade is the JSON record of how a single artifact path was graded:
// its lane and optional canonical lane, a grade label with a numeric weight,
// and exact / same-cluster / hard-negative flags.
type ArtifactGrade struct {
	Path          string  `json:"path"`
	Lane          string  `json:"lane"`
	CanonicalLane string  `json:"canonical_lane,omitempty"`
	Grade         string  `json:"grade"`
	Weight        float64 `json:"weight"`
	Exact         bool    `json:"exact"`
	SameCluster   bool    `json:"same_cluster,omitempty"`
	HardNegative  bool    `json:"hard_negative,omitempty"`
}

type ArtifactReason

// ArtifactReason is an alias for retrieval.Reason, so the two names denote
// the same type.
type ArtifactReason = retrieval.Reason

type BaselineMetrics

// BaselineMetrics describes one named baseline: its file scope, whether it
// includes source candidates, its token and artifact counts, the artifact
// list, and relevant / irrelevant counts. CaseResult.Baselines holds a slice
// of these.
type BaselineMetrics struct {
	Name                     string   `json:"name"`
	FileScope                string   `json:"file_scope"`
	IncludesSourceCandidates bool     `json:"includes_source_candidates"`
	Tokens                   int      `json:"tokens"`
	ArtifactCount            int      `json:"artifact_count"`
	Artifacts                []string `json:"artifacts"`
	RelevantIncluded         int      `json:"relevant_included"`
	IrrelevantCount          int      `json:"irrelevant_count"`
}

type BudgetEvent

// BudgetEvent is one entry of BudgetReport.Applied: a name, a before/after
// pair of integers and an optional message.
//
// NOTE(review): Before and After presumably hold an item count before and
// after a limit was enforced; confirm against the code that records events.
type BudgetEvent struct {
	Name    string `json:"name"`
	Before  int    `json:"before"`
	After   int    `json:"after"`
	Message string `json:"message,omitempty"`
}

type BudgetReport

// BudgetReport carries the Max* limits together with the list of applied
// BudgetEvents. Its limit fields share their names with the Max* fields of
// Options. Every field is tagged omitempty, so a zero BudgetReport encodes as
// an empty JSON object.
type BudgetReport struct {
	MaxCorpusFiles       int           `json:"max_corpus_files,omitempty"`
	MaxSourceFiles       int           `json:"max_source_files,omitempty"`
	MaxTestCaseArtifacts int           `json:"max_test_case_artifacts,omitempty"`
	MaxCodeComments      int           `json:"max_code_comments,omitempty"`
	MaxCaseSeconds       int           `json:"max_case_seconds,omitempty"`
	Applied              []BudgetEvent `json:"applied,omitempty"`
}

type CaseAgentMetrics

// CaseAgentMetrics is the per-case counterpart of AgentMetrics: artifact
// counts, three precision figures, first-rank positions, boolean must-hit
// flags, and per-grade / per-lane counts. CaseResult uses it for the primary,
// related and graph-context result sets.
//
// NOTE(review): FirstMustRank and FirstUsefulRank are omitempty, so a value
// of 0 is dropped from the JSON; presumably ranks are 1-based and 0 means
// "none found" — confirm.
type CaseAgentMetrics struct {
	IncludedArtifacts         int         `json:"included_artifacts"`
	ExactRelevantArtifacts    int         `json:"exact_relevant_artifacts"`
	SameClusterArtifacts      int         `json:"same_cluster_artifacts"`
	HardNegativeArtifacts     int         `json:"hard_negative_artifacts"`
	StrictPrecision           float64     `json:"strict_precision"`
	GradedPrecision           float64     `json:"graded_precision"`
	PenalizedUtilityPrecision float64     `json:"penalized_utility_precision"`
	FirstMustRank             int         `json:"first_must_rank,omitempty"`
	FirstUsefulRank           int         `json:"first_useful_rank,omitempty"`
	MustHitAt1                bool        `json:"must_hit_at_1"`
	MustHitAt3                bool        `json:"must_hit_at_3"`
	MustHitAt5                bool        `json:"must_hit_at_5"`
	MustHitAt10               bool        `json:"must_hit_at_10"`
	GradeCounts               GradeCounts `json:"grade_counts"`
	LaneCounts                LaneCounts  `json:"lane_counts"`
}

type CaseFile

// CaseFile is the YAML document shape for a set of cases: a fixture version,
// an eval stage and the list of CaseSpec entries. It carries only yaml tags.
type CaseFile struct {
	FixtureVersion string     `yaml:"fixture_version"`
	EvalStage      string     `yaml:"eval_stage"`
	Cases          []CaseSpec `yaml:"cases"`
}

type CaseResult

// CaseResult is the JSON record for a single evaluated case; Result.Cases
// holds a slice of these. It groups token counts and reductions, recall and
// precision figures, the included / missed artifact lists, optional graph,
// related, combined-tiered and packed-section data, sufficiency results,
// baselines and threshold failures.
//
// Struct-valued optional fields use the omitzero tag option rather than
// omitempty: encoding/json never treats a struct value as "empty", so
// omitempty on such a field has no effect. omitzero is honored from Go 1.24
// onward; older toolchains ignore the unknown option and keep emitting the
// field, which is what omitempty did anyway.
type CaseResult struct {
	ID                               string                     `json:"id"`
	Query                            string                     `json:"query"`
	CaseDurationMS                   int64                      `json:"case_duration_ms,omitempty"`
	CaseBudgetExceeded               bool                       `json:"case_budget_exceeded,omitempty"`
	CaseBudgetSeconds                int                        `json:"case_budget_seconds,omitempty"`
	DevSpecsTokens                   int                        `json:"devspecs_tokens"`
	FullPlanningTokens               int                        `json:"full_planning_tokens"`
	AllMarkdownTokens                int                        `json:"all_markdown_tokens"`
	FullCandidateCorpusTokens        int                        `json:"full_candidate_corpus_tokens"`
	QueryFileBaselineTokens          int                        `json:"query_file_baseline_tokens"`
	PreBudgetDevSpecsTokens          int                        `json:"pre_budget_devspecs_tokens,omitempty"`
	ContextTokenBudget               int                        `json:"context_token_budget,omitempty"`
	ContextBudgetDroppedCount        int                        `json:"context_budget_dropped_count,omitempty"`
	ContextBudgetDroppedArtifacts    []string                   `json:"context_budget_dropped_artifacts,omitempty"`
	TokenReductionVsFullPlanning     float64                    `json:"token_reduction_vs_full_planning"`
	TokenReductionVsAllMarkdown      float64                    `json:"token_reduction_vs_all_markdown"`
	TokenReductionVsFullCandidate    float64                    `json:"token_reduction_vs_full_candidate_corpus"`
	TokenReductionVsQueryFile        float64                    `json:"token_reduction_vs_query_file_baseline"`
	ExpectedRelevantCount            int                        `json:"expected_relevant_count"`
	RelevantRetrieved                int                        `json:"relevant_retrieved"`
	ArtifactRecall                   float64                    `json:"artifact_recall"`
	MustExpectedCount                int                        `json:"must_expected_count"`
	MustRelevantRetrieved            int                        `json:"must_relevant_retrieved"`
	MustHaveRecall                   float64                    `json:"must_have_recall"`
	HelpfulExpectedCount             int                        `json:"helpful_expected_count"`
	HelpfulRelevantRetrieved         int                        `json:"helpful_relevant_retrieved"`
	HelpfulRecall                    float64                    `json:"helpful_recall"`
	BackgroundExpectedCount          int                        `json:"background_expected_count"`
	BackgroundRelevantRetrieved      int                        `json:"background_relevant_retrieved"`
	BackgroundRecall                 float64                    `json:"background_recall"`
	ArtifactsIncluded                []string                   `json:"artifacts_included"`
	ArtifactReasons                  []ArtifactReason           `json:"artifact_reasons"`
	PackDiagnostics                  *retrieval.RoleGroupedPack `json:"pack_diagnostics,omitempty"`
	PackSummary                      *retrieval.PackSummary     `json:"pack_summary,omitempty"`
	GraphContext                     *GraphContext              `json:"graph_context,omitempty"`
	GraphDiagnostics                 *GraphDiagnostics          `json:"graph_diagnostics,omitempty"`
	GraphContextArtifacts            []string                   `json:"graph_context_artifacts,omitempty"`
	GraphContextArtifactReasons      []ArtifactReason           `json:"graph_context_artifact_reasons,omitempty"`
	GraphContextDevSpecsTokens       int                        `json:"graph_context_devspecs_tokens,omitempty"`
	GraphContextRelevantIncluded     []string                   `json:"graph_context_relevant_included,omitempty"`
	GraphContextIrrelevantIncluded   []string                   `json:"graph_context_irrelevant_included,omitempty"`
	GraphContextArtifactPrecision    float64                    `json:"graph_context_artifact_precision,omitempty"`
	GraphContextAgentMetrics         *CaseAgentMetrics          `json:"graph_context_agent_metrics,omitempty"`
	GraphContextArtifactGrades       []ArtifactGrade            `json:"graph_context_artifact_grades,omitempty"`
	GraphAssistedRelevantIncluded    []string                   `json:"graph_assisted_relevant_included,omitempty"`
	RelatedArtifacts                 []string                   `json:"related_artifacts,omitempty"`
	RelatedArtifactReasons           []ArtifactReason           `json:"related_artifact_reasons,omitempty"`
	RelatedDevSpecsTokens            int                        `json:"related_devspecs_tokens,omitempty"`
	RelatedRelevantIncluded          []string                   `json:"related_relevant_included,omitempty"`
	RelatedIrrelevantIncluded        []string                   `json:"related_irrelevant_included,omitempty"`
	RelatedArtifactPrecision         float64                    `json:"related_artifact_precision,omitempty"`
	// RelatedAgentMetrics is a struct value (unlike the pointer-typed
	// GraphContextAgentMetrics), so it needs omitzero to be dropped when unset.
	RelatedAgentMetrics              CaseAgentMetrics           `json:"related_agent_metrics,omitzero"`
	RelatedArtifactGrades            []ArtifactGrade            `json:"related_artifact_grades,omitempty"`
	CombinedTieredArtifacts          []string                   `json:"combined_tiered_artifacts,omitempty"`
	CombinedTieredDevSpecsTokens     int                        `json:"combined_tiered_devspecs_tokens,omitempty"`
	// CombinedTieredContextSufficiency is a struct value; omitzero drops it
	// only when every field, including each slice, is its zero value (nil).
	CombinedTieredContextSufficiency SufficiencyResult          `json:"combined_tiered_context_sufficiency,omitzero"`
	PackedSectionArtifacts           []string                   `json:"packed_section_artifacts,omitempty"`
	PackedSectionCount               int                        `json:"packed_section_count,omitempty"`
	SectionSelectedArtifacts         []string                   `json:"section_selected_artifacts,omitempty"`
	SectionSelectedCount             int                        `json:"section_selected_count,omitempty"`
	FullFileArtifactCount            int                        `json:"full_file_artifact_count,omitempty"`
	TestCaseArtifactCount            int                        `json:"test_case_artifact_count,omitempty"`
	CodeCommentArtifactCount         int                        `json:"code_comment_artifact_count,omitempty"`
	RelevantIncluded                 []string                   `json:"relevant_included"`
	IrrelevantIncluded               []string                   `json:"irrelevant_included"`
	ArtifactPrecision                float64                    `json:"artifact_precision"`
	MissedExpectedRelevant           []string                   `json:"missed_expected_relevant"`
	MissedMustConceptDiagnostics     []ConceptMissDiagnostic    `json:"missed_must_concept_diagnostics,omitempty"`
	PrimaryFalsePositiveDiagnostics  []FalsePositiveExample     `json:"primary_false_positive_diagnostics,omitempty"`
	UnexpectedExcludedHits           []string                   `json:"unexpected_excluded_hits"`
	ExpectedAvailableCount           int                        `json:"expected_available_count"`
	ExpectedMissingFromCorpus        []string                   `json:"expected_missing_from_corpus,omitempty"`
	MissedAfterDiscovery             []string                   `json:"missed_after_discovery,omitempty"`
	DiscoveryCoverage                float64                    `json:"discovery_coverage"`
	RetrievalCoverageOfDiscovered    float64                    `json:"retrieval_coverage_of_discovered"`
	ContextSufficiency               SufficiencyResult          `json:"context_sufficiency"`
	AgentMetrics                     CaseAgentMetrics           `json:"agent_metrics"`
	ArtifactGrades                   []ArtifactGrade            `json:"artifact_grades,omitempty"`
	Baselines                        []BaselineMetrics          `json:"baselines"`
	ThresholdFailures                []string                   `json:"threshold_failures,omitempty"`
}

type CaseSpec

// CaseSpec is one evaluation case as read from YAML and echoed as JSON: an
// ID, the query, the expected relevant artifacts, expected exclusions, an
// optional expected-status map and optional success criteria.
//
// SuccessCriteria is a struct value, which encoding/json never considers
// "empty", so its JSON tag uses omitzero (honored from Go 1.24; ignored by
// older toolchains, which keep emitting the field as before) instead of the
// ineffective omitempty. The yaml tags are unchanged.
type CaseSpec struct {
	ID               string             `yaml:"id" json:"id"`
	Query            string             `yaml:"query" json:"query"`
	ExpectedRelevant []ExpectedArtifact `yaml:"expected_relevant" json:"expected_relevant"`
	ExpectedExcluded []string           `yaml:"expected_excluded" json:"expected_excluded"`
	ExpectedStatus   map[string]string  `yaml:"expected_status" json:"expected_status,omitempty"`
	SuccessCriteria  SuccessCriteria    `yaml:"success_criteria" json:"success_criteria,omitzero"`
}

type CommandCaseOutput

// CommandCaseOutput is the per-case value in the map returned by a
// CommandRunner: the retrieved candidates, a context string, artifact
// reasons, and optional graph context / diagnostics with their own candidate
// and reason lists. It has no struct tags.
type CommandCaseOutput struct {
	Artifacts                   []retrieval.Candidate
	Context                     string
	ArtifactReasons             []ArtifactReason
	GraphContext                *GraphContext
	GraphDiagnostics            *GraphDiagnostics
	GraphContextArtifacts       []retrieval.Candidate
	GraphContextArtifactReasons []ArtifactReason
}

type CommandRunner

// CommandRunner is a function type that receives a fixture path and the case
// specs and returns a map of CommandCaseOutput values plus an error. It is
// supplied through Options.CommandRunner.
//
// NOTE(review): the map is presumably keyed by CaseSpec.ID and fixtureAbs is
// presumably an absolute path; confirm against the caller.
type CommandRunner func(fixtureAbs string, cases []CaseSpec) (map[string]CommandCaseOutput, error)

type ConceptMissDiagnostic

// ConceptMissDiagnostic is the JSON record attached to an expected path in
// CaseResult.MissedMustConceptDiagnostics: whether the path was in the
// candidate pool, an optional concept rank and score, and the matched
// compacts, phrases, path terms and glossary entries.
type ConceptMissDiagnostic struct {
	ExpectedPath     string   `json:"expected_path"`
	InCandidatePool  bool     `json:"in_candidate_pool"`
	ConceptRank      int      `json:"concept_rank,omitempty"`
	ConceptScore     float64  `json:"concept_score,omitempty"`
	MatchedCompacts  []string `json:"matched_compacts,omitempty"`
	MatchedPhrases   []string `json:"matched_phrases,omitempty"`
	MatchedPathTerms []string `json:"matched_path_terms,omitempty"`
	GlossaryMatches  []string `json:"glossary_matches,omitempty"`
	GlossaryEvidence []string `json:"glossary_evidence,omitempty"`
}

type CorpusSlice

// CorpusSlice summarizes one subset of the corpus: its file scope, whether it
// includes source candidates, file and token counts, and an optional artifact
// list. CorpusSummary is composed of four CorpusSlice values.
type CorpusSlice struct {
	FileScope                string   `json:"file_scope"`
	IncludesSourceCandidates bool     `json:"includes_source_candidates"`
	Files                    int      `json:"files"`
	Tokens                   int      `json:"tokens"`
	Artifacts                []string `json:"artifacts,omitempty"`
}

type CorpusSummary

// CorpusSummary groups four CorpusSlice values: planning artifacts, markdown
// files, source context candidates and the full candidate corpus. It is
// reported as Result.Corpus.
type CorpusSummary struct {
	PlanningArtifacts       CorpusSlice `json:"planning_artifacts"`
	MarkdownFiles           CorpusSlice `json:"markdown_files"`
	SourceContextCandidates CorpusSlice `json:"source_context_candidates"`
	FullCandidateCorpus     CorpusSlice `json:"full_candidate_corpus"`
}

type Diagnostics

// Diagnostics is the run-level diagnostic section reported as
// Result.Diagnostics: expected / available / missing counts, two coverage
// ratios, the missing and missed path lists, and optional per-role,
// miss-class, false-positive, extension and unindexed-document summaries,
// plus optional OpenSpec metrics.
type Diagnostics struct {
	ExpectedRelevantCount          int                           `json:"expected_relevant_count"`
	ExpectedAvailableCount         int                           `json:"expected_available_count"`
	ExpectedMissingFromCorpusCount int                           `json:"expected_missing_from_corpus_count"`
	MissedAfterDiscoveryCount      int                           `json:"missed_after_discovery_count"`
	DiscoveryCoverage              float64                       `json:"discovery_coverage"`
	RetrievalCoverageOfDiscovered  float64                       `json:"retrieval_coverage_of_discovered"`
	ExpectedMissingFromCorpus      []string                      `json:"expected_missing_from_corpus,omitempty"`
	MissedAfterDiscovery           []string                      `json:"missed_after_discovery,omitempty"`
	RoleSummaries                  []RoleDiagnostic              `json:"role_summaries,omitempty"`
	MissClassSummaries             []MissClassDiagnostic         `json:"miss_class_summaries,omitempty"`
	FalsePositiveSummaries         []FalsePositiveDiagnostic     `json:"false_positive_summaries,omitempty"`
	ExtensionSummaries             []ExtensionDiagnostic         `json:"extension_summaries,omitempty"`
	UnindexedDocumentSummaries     []UnindexedDocumentDiagnostic `json:"unindexed_document_summaries,omitempty"`
	OpenSpec                       *openspecmetrics.Metrics      `json:"openspec,omitempty"`
}

type ExpectedArtifact

// ExpectedArtifact is one entry of CaseSpec.ExpectedRelevant: an artifact
// path and an importance label. The type also declares a custom
// UnmarshalYAML method, so its YAML decoding is not purely tag-driven.
//
// NOTE(review): the accepted Importance values are not visible here;
// presumably they correspond to the must / helpful / background grades used
// elsewhere in the package — confirm.
type ExpectedArtifact struct {
	Path       string `yaml:"path" json:"path"`
	Importance string `yaml:"importance" json:"importance"`
}

func (*ExpectedArtifact) UnmarshalYAML

func (a *ExpectedArtifact) UnmarshalYAML(value *yaml.Node) error

type ExtensionDiagnostic

// ExtensionDiagnostic is one entry of Diagnostics.ExtensionSummaries,
// carrying expected / retrieved / missing / false-positive counts for a
// file extension and role, plus optional example strings.
//
// PrimaryFalsePositiveGrades is a struct value, which encoding/json never
// considers "empty", so its tag uses omitzero (honored from Go 1.24; ignored
// by older toolchains, which keep emitting the field as before) instead of
// the ineffective omitempty.
type ExtensionDiagnostic struct {
	Extension                  string      `json:"extension"`
	Role                       string      `json:"role"`
	Expected                   int         `json:"expected"`
	ExactRetrieved             int         `json:"exact_retrieved"`
	MissingFromCorpus          int         `json:"missing_from_corpus"`
	MissedAfterDiscovery       int         `json:"missed_after_discovery"`
	PrimaryFalsePositive       int         `json:"primary_false_positive"`
	PrimaryFalsePositiveGrades GradeCounts `json:"primary_false_positive_grades,omitzero"`
	Examples                   []string    `json:"examples,omitempty"`
}

type FalsePositiveDiagnostic

// FalsePositiveDiagnostic is one entry of Diagnostics.FalsePositiveSummaries:
// a group identified by class, query type, lane, role and reason class, with
// its grade counts, a total count and optional FalsePositiveExample entries.
type FalsePositiveDiagnostic struct {
	Class       string                 `json:"class"`
	QueryType   string                 `json:"query_type"`
	Lane        string                 `json:"lane"`
	Role        string                 `json:"role"`
	ReasonClass string                 `json:"reason_class"`
	GradeCounts GradeCounts            `json:"grade_counts"`
	Count       int                    `json:"count"`
	Examples    []FalsePositiveExample `json:"examples,omitempty"`
}

type FalsePositiveExample

// FalsePositiveExample describes a single false-positive artifact: the case
// it belongs to, its query type, path, position, lane, role, grade, weight
// and reason class, with optional reason strings. It appears both in
// FalsePositiveDiagnostic.Examples and in
// CaseResult.PrimaryFalsePositiveDiagnostics.
type FalsePositiveExample struct {
	CaseID      string   `json:"case_id"`
	QueryType   string   `json:"query_type"`
	Path        string   `json:"path"`
	Position    int      `json:"position"`
	Lane        string   `json:"lane"`
	Role        string   `json:"role"`
	Grade       string   `json:"grade"`
	Weight      float64  `json:"weight"`
	ReasonClass string   `json:"reason_class"`
	Reasons     []string `json:"reasons,omitempty"`
}

type File

// File is an alias for retrieval.Candidate, so the two names denote the same
// type.
type File = retrieval.Candidate

type GradeCounts

// GradeCounts holds one integer counter per grade label: must, helpful,
// background, same_cluster, unlabeled and hard_negative. No field is
// omitempty, so all six keys are always present in the JSON object.
type GradeCounts struct {
	Must         int `json:"must"`
	Helpful      int `json:"helpful"`
	Background   int `json:"background"`
	SameCluster  int `json:"same_cluster"`
	Unlabeled    int `json:"unlabeled"`
	HardNegative int `json:"hard_negative"`
}

type GraphCandidate

// GraphCandidate is the JSON record for one graph candidate, used in
// GraphContextGroup.Items and GraphDiagnostics.Candidates. Every field is
// omitempty, so only populated attributes are emitted.
//
// NOTE(review): the exact meaning of SeedPath, AdmissionEdgeType, Receipt and
// SupportReceipts is defined by the graph code, which is not visible here.
type GraphCandidate struct {
	ID                string   `json:"id,omitempty"`
	ShortID           string   `json:"short_id,omitempty"`
	Path              string   `json:"path,omitempty"`
	SourcePath        string   `json:"source_path,omitempty"`
	Kind              string   `json:"kind,omitempty"`
	Subtype           string   `json:"subtype,omitempty"`
	Title             string   `json:"title,omitempty"`
	Role              string   `json:"role,omitempty"`
	RoleReason        string   `json:"role_reason,omitempty"`
	SeedPath          string   `json:"seed_path,omitempty"`
	AdmissionEdgeType string   `json:"admission_edge_type,omitempty"`
	Confidence        float64  `json:"confidence,omitempty"`
	Weight            float64  `json:"weight,omitempty"`
	SourceSignal      string   `json:"source_signal,omitempty"`
	CompanionDerived  bool     `json:"companion_derived,omitempty"`
	Receipt           string   `json:"receipt,omitempty"`
	SupportReceipts   []string `json:"support_receipts,omitempty"`
}

type GraphContext

// GraphContext is the grouped graph-context payload carried by
// CommandCaseOutput.GraphContext and CaseResult.GraphContext: a mode, an
// optional evidence mode and title, candidate / suppressed counts, a count
// map, the candidate groups and free-form notes.
type GraphContext struct {
	Mode            string              `json:"mode"`
	EvidenceMode    string              `json:"evidence_mode,omitempty"`
	Title           string              `json:"title,omitempty"`
	CandidateCount  int                 `json:"candidate_count"`
	SuppressedCount int                 `json:"suppressed_count,omitempty"`
	Counts          map[string]int      `json:"counts,omitempty"`
	Groups          []GraphContextGroup `json:"groups,omitempty"`
	Notes           []string            `json:"notes,omitempty"`
}

type GraphContextGroup

// GraphContextGroup is one entry of GraphContext.Groups: a role, a title and
// the GraphCandidate items in that group. Items has no omitempty, so a nil
// slice encodes as JSON null.
type GraphContextGroup struct {
	Role  string           `json:"role"`
	Title string           `json:"title"`
	Items []GraphCandidate `json:"items"`
}

type GraphDiagnostics

// GraphDiagnostics is the flat diagnostic view of a graph pass: mode, seed /
// candidate / suppressed counts, a count map, the candidate list, the
// suppressed entries and free-form notes. It is carried by
// CommandCaseOutput.GraphDiagnostics and CaseResult.GraphDiagnostics.
type GraphDiagnostics struct {
	Mode            string             `json:"mode"`
	SeedCount       int                `json:"seed_count"`
	CandidateCount  int                `json:"candidate_count"`
	SuppressedCount int                `json:"suppressed_count,omitempty"`
	Counts          map[string]int     `json:"counts,omitempty"`
	Candidates      []GraphCandidate   `json:"candidates,omitempty"`
	Suppressed      []GraphSuppression `json:"suppressed,omitempty"`
	Notes           []string           `json:"notes,omitempty"`
}

type GraphSuppression

// GraphSuppression is one entry of GraphDiagnostics.Suppressed: an optional
// path, seed path, edge type and confidence, plus a reason. Reason is the
// only field that is always emitted.
type GraphSuppression struct {
	Path       string  `json:"path,omitempty"`
	SeedPath   string  `json:"seed_path,omitempty"`
	EdgeType   string  `json:"edge_type,omitempty"`
	Confidence float64 `json:"confidence,omitempty"`
	Reason     string  `json:"reason"`
}

type IndexCacheReport

// IndexCacheReport is the optional index-cache section of a Result
// (Result.IndexCache): whether the cache was enabled and hit, plus an
// optional key, path, schema version, reason and two fingerprints.
//
// NOTE(review): presumably populated when Options.IndexCacheDir is set;
// confirm against Run.
type IndexCacheReport struct {
	Enabled               bool   `json:"enabled"`
	Hit                   bool   `json:"hit"`
	Key                   string `json:"key,omitempty"`
	Path                  string `json:"path,omitempty"`
	SchemaVersion         int    `json:"schema_version,omitempty"`
	Reason                string `json:"reason,omitempty"`
	CorpusFingerprint     string `json:"corpus_fingerprint,omitempty"`
	ProvenanceFingerprint string `json:"provenance_fingerprint,omitempty"`
}

type LaneCounts

// LaneCounts holds one integer counter per lane. Its JSON keys are identical
// to the values of the Lane* constants (LaneDocsPlans, LaneTestCase,
// LaneCodeComment, LaneSourceContextOther, LanePackedSections).
type LaneCounts struct {
	DocsPlans          int `json:"docs_plans"`
	TestCase           int `json:"test_case"`
	CodeComment        int `json:"code_comment"`
	SourceContextOther int `json:"source_context_other"`
	PackedSections     int `json:"packed_sections"`
}

type LaneMetric

// LaneMetric is the per-lane metrics row used by both Result.LaneMetrics and
// Result.CanonicalLanes: case counts, artifact counts, a graded relevance
// weight, and optional precision / recall figures and packed-section count.
//
// The precision and recall fields are omitempty, so a value of exactly 0 is
// absent from the JSON rather than emitted as 0.
type LaneMetric struct {
	Lane                   string  `json:"lane"`
	Cases                  int     `json:"cases"`
	CasesWithIncluded      int     `json:"cases_with_included"`
	CasesWithExpected      int     `json:"cases_with_expected"`
	IncludedArtifacts      int     `json:"included_artifacts"`
	ExactRelevantArtifacts int     `json:"exact_relevant_artifacts"`
	SameClusterArtifacts   int     `json:"same_cluster_artifacts"`
	HardNegativeArtifacts  int     `json:"hard_negative_artifacts"`
	ExpectedArtifacts      int     `json:"expected_artifacts"`
	GradedRelevanceWeight  float64 `json:"graded_relevance_weight"`
	StrictPrecision        float64 `json:"strict_precision,omitempty"`
	GradedPrecision        float64 `json:"graded_precision,omitempty"`
	Recall                 float64 `json:"recall,omitempty"`
	PackedSectionCount     int     `json:"packed_section_count,omitempty"`
}

type MissClassDiagnostic

// MissClassDiagnostic is one entry of Diagnostics.MissClassSummaries: a class
// label, a count and optional example strings.
type MissClassDiagnostic struct {
	Class    string   `json:"class"`
	Count    int      `json:"count"`
	Examples []string `json:"examples,omitempty"`
}

type Options

// Options is the configuration value accepted by Run and
// CheckSummaryThresholds. It carries output and threshold settings, the
// corpus source and command under test, pluggable CommandRunner /
// TokenCounter / Retriever implementations, feature and experiment switches,
// budget limits and progress reporting settings. It has no struct tags.
//
// NOTE(review): the Min* thresholds are pointers, presumably so that nil can
// mean "no threshold configured"; the zero-value defaults of the remaining
// fields are decided by Run, which is not visible here — confirm both.
type Options struct {
	JSON                            bool
	MinRecall                       *float64
	MinMeanRecall                   *float64
	MinMustRecall                   *float64
	MinSufficiency                  *float64
	MinReductionFull                *float64
	CorpusSource                    string
	CommandUnderTest                string
	FindRuntime                     string
	CommandRunner                   CommandRunner
	TokenCounter                    TokenCounter
	Retriever                       retrieval.Retriever
	TestCaseArtifacts               bool
	CodeCommentArtifacts            bool
	DisableSectionAwareRetrieval    bool
	ExperimentalBalancedEvidence    bool
	ExperimentalBudgetedPacking     bool
	ExperimentalConceptBackfill     bool
	ExperimentalGlossaryConcepts    bool
	ExperimentalTieredConceptOutput bool
	ExperimentalAnchorFirstRanking  bool
	ExperimentalAnchorFirstMode     string
	ExperimentalSupportDocs         bool
	PackDiagnostics                 bool
	GraphDiagnostics                bool
	ContextTokenBudget              int
	IndexCacheDir                   string
	RefreshIndexCache               bool
	MaxCorpusFiles                  int
	MaxSourceFiles                  int
	MaxTestCaseArtifacts            int
	MaxCodeComments                 int
	MaxCaseSeconds                  int
	ProgressWriter                  io.Writer
	ProgressInterval                time.Duration
}

type ParetoSummary

// ParetoSummary is the compact block reported as Summary.Pareto: two mean
// token-reduction figures, mean recall and precision figures, and the context
// sufficiency pass rate. Each of its fields has a same-named counterpart
// directly on Summary.
type ParetoSummary struct {
	MeanTokenReductionVsFullPlanning      float64 `json:"mean_token_reduction_vs_full_planning"`
	MeanTokenReductionVsQueryFileBaseline float64 `json:"mean_token_reduction_vs_query_file_baseline"`
	MeanArtifactRecall                    float64 `json:"mean_artifact_recall"`
	MeanMustHaveRecall                    float64 `json:"mean_must_have_recall"`
	MeanArtifactPrecision                 float64 `json:"mean_artifact_precision"`
	MeanGradedPrecision                   float64 `json:"mean_graded_precision"`
	MeanPenalizedUtilityPrecision         float64 `json:"mean_penalized_utility_precision"`
	ContextSufficiencyPassRate            float64 `json:"context_sufficiency_pass_rate"`
}

type PhaseTelemetry

// PhaseTelemetry is one entry of Result.PhaseTelemetry: a phase name, start
// and end timestamps held as strings, a duration in milliseconds, a status
// and optional count / detail maps.
//
// NOTE(review): the timestamp string format is not visible here (presumably
// RFC 3339); confirm against the code that fills StartedAt / EndedAt.
type PhaseTelemetry struct {
	Name       string            `json:"name"`
	StartedAt  string            `json:"started_at"`
	EndedAt    string            `json:"ended_at"`
	DurationMS int64             `json:"duration_ms"`
	Status     string            `json:"status"`
	Counts     map[string]int    `json:"counts,omitempty"`
	Details    map[string]string `json:"details,omitempty"`
}

type PricingProfile

// PricingProfile names a pricing scheme and optionally gives input and output
// prices; per the JSON keys these are USD per one million tokens. It is used
// as Result.PricingProfile and TokenizerProfile.Pricing.
type PricingProfile struct {
	Name              string  `json:"name"`
	InputUSDPer1MTok  float64 `json:"input_usd_per_1m_tokens,omitempty"`
	OutputUSDPer1MTok float64 `json:"output_usd_per_1m_tokens,omitempty"`
}

type ProfiledTokenCounter

// ProfiledTokenCounter is a TokenCounter that can additionally report a
// TokenizerProfile through its Profile method.
type ProfiledTokenCounter interface {
	TokenCounter
	Profile() TokenizerProfile
}

type Result

// Result is the top-level report returned by Run and consumed by FormatJSON,
// FormatText and CheckSummaryThresholds. It records the fixture and run
// identity, the retriever and token counter names with their tokenizer and
// pricing profiles, the corpus summary, the aggregate summary, diagnostics,
// agent and lane metrics, optional notes / telemetry / cache / budget
// sections, and the per-case results.
//
// Budgets is a struct value, which encoding/json never considers "empty", so
// its tag uses omitzero (honored from Go 1.24; ignored by older toolchains,
// which keep emitting the field as before) instead of the ineffective
// omitempty. With omitzero an unset BudgetReport is dropped rather than
// encoded as an empty object.
type Result struct {
	Fixture          string            `json:"fixture"`
	FixtureVersion   string            `json:"fixture_version"`
	EvalStage        string            `json:"eval_stage"`
	CorpusSource     string            `json:"corpus_source"`
	ProductPath      string            `json:"product_path"`
	CommandUnderTest string            `json:"command_under_test,omitempty"`
	FindRuntime      string            `json:"find_runtime,omitempty"`
	Retriever        string            `json:"retriever"`
	TokenCounter     string            `json:"token_counter"`
	TokenizerProfile TokenizerProfile  `json:"tokenizer_profile"`
	PricingProfile   PricingProfile    `json:"pricing_profile"`
	ResultsFile      string            `json:"results_file,omitempty"`
	Corpus           CorpusSummary     `json:"corpus"`
	Summary          Summary           `json:"summary"`
	Diagnostics      Diagnostics       `json:"diagnostics"`
	AgentMetrics     AgentMetrics      `json:"agent_metrics"`
	LaneMetrics      []LaneMetric      `json:"lane_metrics"`
	CanonicalLanes   []LaneMetric      `json:"canonical_lane_metrics,omitempty"`
	MetricNotes      map[string]string `json:"metric_notes,omitempty"`
	PhaseTelemetry   []PhaseTelemetry  `json:"phase_telemetry,omitempty"`
	IndexCache       *IndexCacheReport `json:"index_cache,omitempty"`
	Budgets          BudgetReport      `json:"budgets,omitzero"`
	Cases            []CaseResult      `json:"cases"`
}

func Run

func Run(fixture string, opts Options) (*Result, error)

type RoleDiagnostic

// RoleDiagnostic is one entry of Diagnostics.RoleSummaries: for a single
// role, the expected / available / retrieved / irrelevant / missing / missed
// counts and two coverage ratios. All fields are always emitted.
type RoleDiagnostic struct {
	Role                          string  `json:"role"`
	Expected                      int     `json:"expected"`
	ExpectedAvailable             int     `json:"expected_available"`
	Retrieved                     int     `json:"retrieved"`
	IrrelevantRetrieved           int     `json:"irrelevant_retrieved"`
	MissingFromCorpus             int     `json:"missing_from_corpus"`
	MissedAfterDiscovery          int     `json:"missed_after_discovery"`
	DiscoveryCoverage             float64 `json:"discovery_coverage"`
	RetrievalCoverageOfDiscovered float64 `json:"retrieval_coverage_of_discovered"`
}

type SuccessCriteria

// SuccessCriteria lists the terms and artifacts a case's context must and
// must not contain. It is read from YAML as CaseSpec.SuccessCriteria and has
// a Configured method.
//
// NOTE(review): LegacyMustNotContain maps the older "must_not_contain" key;
// whether it is treated as terms or as artifacts is decided by code that is
// not visible here — confirm.
type SuccessCriteria struct {
	MustContainTerms        []string `yaml:"must_contain_terms" json:"must_contain_terms,omitempty"`
	MustContainArtifacts    []string `yaml:"must_contain_artifacts" json:"must_contain_artifacts,omitempty"`
	MustNotContainTerms     []string `yaml:"must_not_contain_terms" json:"must_not_contain_terms,omitempty"`
	MustNotContainArtifacts []string `yaml:"must_not_contain_artifacts" json:"must_not_contain_artifacts,omitempty"`
	LegacyMustNotContain    []string `yaml:"must_not_contain" json:"must_not_contain,omitempty"`
}

func (SuccessCriteria) Configured

func (c SuccessCriteria) Configured() bool

type SufficiencyResult

// SufficiencyResult is the outcome of a context sufficiency check: whether
// criteria were configured, whether the check passed, and the lists of
// missing terms / artifacts, forbidden terms / artifacts that were present,
// and failure messages.
//
// None of the slice fields is omitempty, so a nil slice encodes as JSON null
// and an empty non-nil slice as [].
type SufficiencyResult struct {
	Configured                bool     `json:"configured"`
	Passed                    bool     `json:"passed"`
	MissingTerms              []string `json:"missing_terms"`
	MissingArtifacts          []string `json:"missing_artifacts"`
	ForbiddenTermsPresent     []string `json:"forbidden_terms_present"`
	ForbiddenArtifactsPresent []string `json:"forbidden_artifacts_present"`
	Failures                  []string `json:"failures"`
}

type Summary

// Summary is the aggregate section reported as Result.Summary: the case
// count, median / mean token reductions, mean recall and precision figures,
// grade counts, optional related / graph-context / pack / combined-tiered
// aggregates, context sufficiency totals, agent metrics, the Pareto block,
// the worst-recall and largest-context case identifiers, and the number of
// failed thresholds.
//
// RelatedGradeCounts and GraphContextGradeCounts are struct values, which
// encoding/json never considers "empty", so their tags use omitzero (honored
// from Go 1.24; ignored by older toolchains, which keep emitting the fields
// as before) instead of the ineffective omitempty. The always-present
// GradeCounts field is unchanged.
type Summary struct {
	Cases                                    int           `json:"cases"`
	MedianTokenReductionVsFullPlanning       float64       `json:"median_token_reduction_vs_full_planning"`
	MeanTokenReductionVsFullPlanning         float64       `json:"mean_token_reduction_vs_full_planning"`
	MedianTokenReductionVsQueryFileBaseline  float64       `json:"median_token_reduction_vs_query_file_baseline"`
	MeanTokenReductionVsQueryFileBaseline    float64       `json:"mean_token_reduction_vs_query_file_baseline"`
	MeanArtifactRecall                       float64       `json:"mean_artifact_recall"`
	MeanMustHaveRecall                       float64       `json:"mean_must_have_recall"`
	MeanHelpfulRecall                        float64       `json:"mean_helpful_recall"`
	MeanBackgroundRecall                     float64       `json:"mean_background_recall"`
	MeanArtifactPrecision                    float64       `json:"mean_artifact_precision"`
	MeanGradedPrecision                      float64       `json:"mean_graded_precision"`
	MeanPenalizedUtilityPrecision            float64       `json:"mean_penalized_utility_precision"`
	GradeCounts                              GradeCounts   `json:"grade_counts"`
	RelatedCases                             int           `json:"related_cases,omitempty"`
	RelatedArtifactCount                     int           `json:"related_artifact_count,omitempty"`
	RelatedRelevantCount                     int           `json:"related_relevant_count,omitempty"`
	MeanRelatedArtifactPrecision             float64       `json:"mean_related_artifact_precision,omitempty"`
	MeanRelatedGradedPrecision               float64       `json:"mean_related_graded_precision,omitempty"`
	RelatedGradeCounts                       GradeCounts   `json:"related_grade_counts,omitzero"`
	GraphContextCases                        int           `json:"graph_context_cases,omitempty"`
	GraphContextArtifactCount                int           `json:"graph_context_artifact_count,omitempty"`
	GraphContextRelevantCount                int           `json:"graph_context_relevant_count,omitempty"`
	GraphAssistedRelevantCount               int           `json:"graph_assisted_relevant_count,omitempty"`
	MeanGraphContextArtifactPrecision        float64       `json:"mean_graph_context_artifact_precision,omitempty"`
	MeanGraphContextGradedPrecision          float64       `json:"mean_graph_context_graded_precision,omitempty"`
	GraphContextGradeCounts                  GradeCounts   `json:"graph_context_grade_counts,omitzero"`
	PackDiagnosticCases                      int           `json:"pack_diagnostic_cases,omitempty"`
	PackIncludedArtifactCount                int           `json:"pack_included_artifact_count,omitempty"`
	PackExcludedNoiseCount                   int           `json:"pack_excluded_noise_count,omitempty"`
	MeanPackIncludedArtifacts                float64       `json:"mean_pack_included_artifacts,omitempty"`
	MeanPackRoleDiversity                    float64       `json:"mean_pack_role_diversity,omitempty"`
	PackCasesWithBackgroundDecisions         int           `json:"pack_cases_with_background_decisions,omitempty"`
	PackCasesWithImplementation              int           `json:"pack_cases_with_implementation_surface,omitempty"`
	PackCasesWithBehaviorTests               int           `json:"pack_cases_with_behavior_tests,omitempty"`
	PackCasesWithConfigSchema                int           `json:"pack_cases_with_config_schema,omitempty"`
	PackCasesWithOpenWork                    int           `json:"pack_cases_with_open_work,omitempty"`
	PackCasesWithSupportingContext           int           `json:"pack_cases_with_supporting_context,omitempty"`
	PackCasesWithExcludedNoise               int           `json:"pack_cases_with_excluded_noise,omitempty"`
	CombinedTieredContextSufficiencyCases    int           `json:"combined_tiered_context_sufficiency_cases,omitempty"`
	CombinedTieredContextSufficiencyPassed   int           `json:"combined_tiered_context_sufficiency_passed,omitempty"`
	CombinedTieredContextSufficiencyPassRate float64       `json:"combined_tiered_context_sufficiency_pass_rate,omitempty"`
	ContextSufficiencyCases                  int           `json:"context_sufficiency_cases"`
	ContextSufficiencyPassed                 int           `json:"context_sufficiency_passed"`
	ContextSufficiencyPassRate               float64       `json:"context_sufficiency_pass_rate"`
	AgentMetrics                             AgentMetrics  `json:"agent_metrics"`
	Pareto                                   ParetoSummary `json:"pareto"`
	WorstRecallCase                          string        `json:"worst_recall_case"`
	LargestTokenContextCase                  string        `json:"largest_token_context_case"`
	FailedThresholdCount                     int           `json:"failed_threshold_count,omitempty"`
}

type TokenBudgetSufficiency

// TokenBudgetSufficiency is one entry of
// AgentMetrics.ContextSufficiencyAtTokenBudget: a token budget with the
// number of eligible cases, the number that passed, and the pass rate.
//
// NOTE(review): PassRate is presumably PassedCases / EligibleCases; the
// handling of zero eligible cases is not visible here — confirm.
type TokenBudgetSufficiency struct {
	BudgetTokens  int     `json:"budget_tokens"`
	EligibleCases int     `json:"eligible_cases"`
	PassedCases   int     `json:"passed_cases"`
	PassRate      float64 `json:"pass_rate"`
}

type TokenCounter

// TokenCounter is the interface for pluggable token counting, supplied
// through Options.TokenCounter. Count returns an integer for a piece of text
// and Name returns a string identifier for the counter.
//
// NOTE(review): Name is presumably what ends up in Result.TokenCounter;
// confirm against Run.
type TokenCounter interface {
	Count(text string) int
	Name() string
}

type TokenizerProfile

// TokenizerProfile describes the tokenizer behind a token counter: a name and
// provider, with an optional model, approximation note and pricing profile.
// It is returned by ProfiledTokenCounter.Profile and reported as
// Result.TokenizerProfile.
//
// Pricing is a struct value, which encoding/json never considers "empty", so
// its tag uses omitzero (honored from Go 1.24; ignored by older toolchains,
// which keep emitting the field as before) instead of the ineffective
// omitempty. With omitzero an unset PricingProfile is dropped rather than
// encoded as {"name":""}.
type TokenizerProfile struct {
	Name          string         `json:"name"`
	Provider      string         `json:"provider"`
	Model         string         `json:"model,omitempty"`
	Approximation string         `json:"approximation,omitempty"`
	Pricing       PricingProfile `json:"pricing,omitzero"`
}

type UnindexedDocumentDiagnostic

// UnindexedDocumentDiagnostic is one entry of
// Diagnostics.UnindexedDocumentSummaries: an extension and role with a count
// and optional example strings.
type UnindexedDocumentDiagnostic struct {
	Extension string   `json:"extension"`
	Role      string   `json:"role"`
	Count     int      `json:"count"`
	Examples  []string `json:"examples,omitempty"`
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL