Documentation
¶
Overview ¶
Package evaluate provides LLM-as-judge scoring orchestration for skills.
It exposes the evaluation logic (caching, scoring, aggregation) as a library so that both the CLI and enterprise variants can reuse it.
Index ¶
Examples ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func FindParentSkillDir ¶
FindParentSkillDir walks up from filePath looking for a directory containing SKILL.md.
Types ¶
type Options ¶
type Options struct {
Rescore bool
SkillOnly bool
RefsOnly bool
MaxLen int
CacheDir string // Override cache directory; defaults to judge.CacheDir(skillDir) when empty
Progress ProgressFunc // Optional progress callback; nil means no output
}
Options controls what gets scored.
type ProgressFunc ¶
ProgressFunc receives progress events during evaluation. event identifies the kind of event (e.g. "scoring", "cached", "warning", "error"). detail provides human-readable context.
type Result ¶
type Result struct {
SkillDir string
SkillScores *judge.SkillScores
RefResults []RefResult
RefAggregate *judge.RefScores
}
Result holds the complete scoring output for one skill.
func EvaluateSingleFile ¶
func EvaluateSingleFile(ctx context.Context, absPath string, client judge.LLMClient, opts Options) (*Result, error)
EvaluateSingleFile scores a single reference .md file.
func EvaluateSkill ¶
func EvaluateSkill(ctx context.Context, dir string, client judge.LLMClient, opts Options) (*Result, error)
EvaluateSkill scores a skill directory (SKILL.md and/or reference files).
Example ¶
This example demonstrates scoring a skill directory with caching and progress reporting. It requires a valid API key, so it is not executed as a test.
package main

import (
	"context"
	"fmt"
	"log"
	"os"

	"github.com/agent-ecosystem/skill-validator/evaluate"
	"github.com/agent-ecosystem/skill-validator/judge"
)

func main() {
	// Build the judge LLM client; the API key is read from the environment.
	client, err := judge.NewClient(judge.ClientOptions{
		Provider: "anthropic",
		APIKey:   os.Getenv("ANTHROPIC_API_KEY"),
	})
	if err != nil {
		// log.Fatal exits with a clear message; panic would dump a stack
		// trace for an expected failure such as a missing API key.
		log.Fatal(err)
	}

	// Score the skill directory, printing progress events as they arrive.
	result, err := evaluate.EvaluateSkill(context.Background(), "./my-skill", client, evaluate.Options{
		MaxLen: judge.DefaultMaxContentLen,
		Progress: func(event, detail string) {
			fmt.Printf("[%s] %s\n", event, detail)
		},
	})
	if err != nil {
		log.Fatal(err)
	}

	// SKILL.md scores (nil when SKILL.md was not scored).
	if result.SkillScores != nil {
		fmt.Printf("Overall: %.2f/5\n", result.SkillScores.Overall)
		fmt.Printf("Assessment: %s\n", result.SkillScores.BriefAssessment)
	}

	// Per-reference-file scores.
	for _, ref := range result.RefResults {
		fmt.Printf("%s: %.2f/5\n", ref.File, ref.Scores.Overall)
	}

	// Aggregated reference scores (nil when no references were scored).
	if result.RefAggregate != nil {
		fmt.Printf("References average: %.2f/5\n", result.RefAggregate.Overall)
	}
}
Example (RefsOnly) ¶
This example shows how to score only reference files, skipping SKILL.md.
package main

import (
	"context"
	"fmt"
	"log"
	"os"

	"github.com/agent-ecosystem/skill-validator/evaluate"
	"github.com/agent-ecosystem/skill-validator/judge"
)

func main() {
	// Build an OpenAI-backed judge client with an explicit model.
	client, err := judge.NewClient(judge.ClientOptions{
		Provider: "openai",
		APIKey:   os.Getenv("OPENAI_API_KEY"),
		Model:    "gpt-4o",
	})
	if err != nil {
		// log.Fatal exits with a clear message; panic would dump a stack
		// trace for an expected failure such as a missing API key.
		log.Fatal(err)
	}

	// RefsOnly skips SKILL.md and scores only reference files.
	result, err := evaluate.EvaluateSkill(context.Background(), "./my-skill", client, evaluate.Options{
		RefsOnly: true,
		MaxLen:   judge.DefaultMaxContentLen,
	})
	if err != nil {
		log.Fatal(err)
	}

	// Print one score line per reference file.
	for _, ref := range result.RefResults {
		fmt.Printf("%s: %.2f/5\n", ref.File, ref.Scores.Overall)
	}
}