evaluate

package
v1.1.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 5, 2026 | License: MIT | Imports: 12 | Imported by: 0

Documentation

Overview

Package evaluate provides LLM-as-judge scoring orchestration for skills.

It exposes the evaluation logic (caching, scoring, aggregation) as a library so that both the CLI and enterprise variants can reuse it.

Index

Examples

Constants

This section is empty.

Variables

This section is empty.

Functions

func FindParentSkillDir

func FindParentSkillDir(filePath string) (string, error)

FindParentSkillDir walks up from filePath looking for a directory containing SKILL.md.

Types

type Options

// Options controls what gets scored.
type Options struct {
	// Rescore presumably forces fresh scoring rather than reusing cached results — TODO confirm.
	Rescore   bool
	// SkillOnly presumably restricts scoring to SKILL.md (the counterpart of RefsOnly) — TODO confirm.
	SkillOnly bool
	// RefsOnly scores only reference files, skipping SKILL.md.
	RefsOnly  bool
	// MaxLen is presumably the maximum content length submitted for scoring; the
	// package examples pass judge.DefaultMaxContentLen — TODO confirm units and zero-value behavior.
	MaxLen    int
	CacheDir  string       // Override cache directory; defaults to judge.CacheDir(skillDir) when empty
	Progress  ProgressFunc // Optional progress callback; nil means no output
}

Options controls what gets scored.

type ProgressFunc

// ProgressFunc receives progress events during evaluation.
// event identifies the kind of event (e.g. "scoring", "cached", "warning",
// "error"); detail provides human-readable context.
type ProgressFunc func(event string, detail string)

ProgressFunc receives progress events during evaluation. The event argument identifies the kind of event (e.g. "scoring", "cached", "warning", "error"), and the detail argument provides human-readable context.

type RefResult

// RefResult holds scoring output for a single reference file.
type RefResult struct {
	// File identifies the reference file that was scored (presumably its path or name — TODO confirm which).
	File   string
	// Scores holds the scores for File. NOTE(review): this is a pointer; whether it can be nil is not documented here.
	Scores *judge.RefScores
}

RefResult holds scoring output for a single reference file.

type Result

// Result holds the complete scoring output for one skill.
type Result struct {
	// SkillDir is presumably the skill directory that was evaluated — TODO confirm.
	SkillDir     string
	// SkillScores holds the SKILL.md scores; the package example nil-checks it before use.
	SkillScores  *judge.SkillScores
	// RefResults holds the per-reference-file scoring results.
	RefResults   []RefResult
	// RefAggregate holds the aggregated reference scores; the package example nil-checks it before use.
	RefAggregate *judge.RefScores
}

Result holds the complete scoring output for one skill.

func EvaluateSingleFile

func EvaluateSingleFile(ctx context.Context, absPath string, client judge.LLMClient, opts Options) (*Result, error)

EvaluateSingleFile scores a single reference .md file.

func EvaluateSkill

func EvaluateSkill(ctx context.Context, dir string, client judge.LLMClient, opts Options) (*Result, error)

EvaluateSkill scores a skill directory (SKILL.md and/or reference files).

Example

This example demonstrates scoring a skill directory with caching and progress reporting. It requires a valid API key, so it is not executed as a test.

package main

import (
	"context"
	"fmt"
	"os"

	"github.com/agent-ecosystem/skill-validator/evaluate"
	"github.com/agent-ecosystem/skill-validator/judge"
)

// main scores the skill directory ./my-skill using a judge client configured
// for the "anthropic" provider, echoing every progress event to stdout and then
// printing the SKILL.md scores, each reference file's score, and the aggregated
// reference score. The API key is read from ANTHROPIC_API_KEY. Any client
// construction or evaluation error aborts the program via panic.
func main() {
	client, err := judge.NewClient(judge.ClientOptions{
		Provider: "anthropic",
		APIKey:   os.Getenv("ANTHROPIC_API_KEY"),
	})
	if err != nil {
		panic(err)
	}

	result, err := evaluate.EvaluateSkill(context.Background(), "./my-skill", client, evaluate.Options{
		MaxLen: judge.DefaultMaxContentLen,
		Progress: func(event, detail string) {
			fmt.Printf("[%s] %s\n", event, detail)
		},
	})
	if err != nil {
		panic(err)
	}

	// SKILL.md scores
	if result.SkillScores != nil {
		fmt.Printf("Overall: %.2f/5\n", result.SkillScores.Overall)
		fmt.Printf("Assessment: %s\n", result.SkillScores.BriefAssessment)
	}

	// Reference file scores
	for _, ref := range result.RefResults {
		// Scores is a pointer just like SkillScores and RefAggregate, so guard
		// it the same way instead of dereferencing blindly.
		if ref.Scores == nil {
			fmt.Printf("%s: not scored\n", ref.File)
			continue
		}
		fmt.Printf("%s: %.2f/5\n", ref.File, ref.Scores.Overall)
	}

	// Aggregated reference scores
	if result.RefAggregate != nil {
		fmt.Printf("References average: %.2f/5\n", result.RefAggregate.Overall)
	}
}
Example (RefsOnly)

This example shows how to score only reference files, skipping SKILL.md.

package main

import (
	"context"
	"fmt"
	"os"

	"github.com/agent-ecosystem/skill-validator/evaluate"
	"github.com/agent-ecosystem/skill-validator/judge"
)

// main scores only the reference files of the skill in ./my-skill (RefsOnly
// skips SKILL.md) using a judge client configured for the "openai" provider
// with model "gpt-4o", then prints each reference file's overall score. The
// API key is read from OPENAI_API_KEY. Any client construction or evaluation
// error aborts the program via panic.
func main() {
	client, err := judge.NewClient(judge.ClientOptions{
		Provider: "openai",
		APIKey:   os.Getenv("OPENAI_API_KEY"),
		Model:    "gpt-4o",
	})
	if err != nil {
		panic(err)
	}

	result, err := evaluate.EvaluateSkill(context.Background(), "./my-skill", client, evaluate.Options{
		RefsOnly: true,
		MaxLen:   judge.DefaultMaxContentLen,
	})
	if err != nil {
		panic(err)
	}

	for _, ref := range result.RefResults {
		// Scores is a pointer; skip the dereference when no scores are attached
		// rather than risking a nil-pointer panic.
		if ref.Scores == nil {
			fmt.Printf("%s: not scored\n", ref.File)
			continue
		}
		fmt.Printf("%s: %.2f/5\n", ref.File, ref.Scores.Overall)
	}
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL