# Starter eval suite — tool-agnostic. Fill in and wire into CI.
# The shape that matters: a dataset, success criteria, and graders,
# with results read statistically across repeated runs.
# More: https://evaldrivendevelopment.dev/how-to-build-an-eval-harness-for-an-llm-app

# Suite name. "my-feature" is presumably a placeholder — replace with yours.
suite: my-feature

dataset:
  # Source cases from REAL failures and production traces rather than
  # imagined scenarios.
  cases:
    - id: case-001
      input: '…the real input that failed…'
      context: '…retrieved docs / state, if any…'
      # Each criterion pairs a check with the grader that scores it.
      criteria:
        # Code graders are the cheapest and most reliable option.
        - check: 'output is valid, parseable JSON'
          grader: code
        # Validate an LLM judge against human labels before relying on it.
        - check: 'answer is faithful to the provided context'
          grader: llm-judge
        - check: 'no personal data is leaked'
          grader: code

run:
  # Execute every case several times rather than once…
  trials: 5
  # …so the report reflects reliability instead of one lucky run.
  report: pass^k

gate:
  # Regression: behaviours that already work. Block the build if any break.
  regression: '>= 100% pass'
  # Capability: harder bets. Measure the trend without blocking the build.
  capability: track

online:
  # Additionally score roughly 5% of live traffic.
  sample: 0.05
  # A production failure today becomes a golden-set case tomorrow.
  feed_failures_back_into: dataset
