Control Panel
Benchmarks / sample
Benchmarking

Sample Benchmark v1

Versioned benchmark track with trajectory-first scoring and deterministic task validation.

Tasks
2
Categories
2
Top Score
0.988
Entries
2
Benchmark Shape
Category distribution and task time-budget profile
repo-routirepo-navigrepo-routi: 1.000repo-navig: 1.000
Top run score shape
Task Time Limits
Point 1: 0.000Point 1: 0.000Point 2: 0.000Point 2: 0.000
Task time_limit_sec
Sequence of time_limit_sec values across task files in this benchmark version.

Run Controls

Trigger single-task or suite benchmark runs through the web API and persist results to the configured backend.

Run Actions

Open Leaderboard

Benchmark Tasks

View leaderboard
sample-task-002.jsonrepo-routingprimary_skill_eval
task_id=sample-002 • time_limit=n/as • seed=43
Outcome0.00
Trajectory0.00
Efficiency0.00
Safety Cap0.75
sample-task.jsonrepo-navigationprimary_skill_eval
task_id=sample-001 • time_limit=n/as • seed=42
Outcome0.00
Trajectory0.00
Efficiency0.00
Safety Cap0.75
Raw Benchmark JSON
{
  "id": "benchmark-v1",
  "slug": "sample",
  "name": "Sample Benchmark v1",
  "tasks": [
    {
      "file": "sample-task-002.json",
      "spec": {
        "task_id": "sample-002",
        "category": "repo-routing",
        "track": "primary_skill_eval",
        "seed": 43,
        "input": {
          "instruction": "Read AGENTS.md and identify the starting file for task routing in the knowledge base."
        },
        "validator": {
          "type": "exact_match",
          "trajectory_checks": [
            {
              "type": "max_step_count",
              "required": false,
              "config": {
                "max_steps": 5
              }
            }
          ],
          "safety_checks": [
            {
              "type": "forbidden_network_access",
              "penalty_weight": 0.5,
              "config": {
                "mode": "none"
              }
            }
          ],
          "config": {
            "expected": "knowledge/INDEX.yaml"
          }
        },
        "limits": {
          "time_limit_sec": 60,
          "memory_mb": 512,
          "network_policy": "none"
        },
        "scoring_weights": {
          "outcome": 0.7,
          "trajectory": 0.2,
          "efficiency": 0.1,
          "safety_penalty_cap": 0.75
        },
        "trajectory_capture": {
          "required": true,
          "event_schema_version": "v1",
          "max_events": 50
        }
      }
    },
    {
      "file": "sample-task.json",
      "spec": {
        "task_id": "sample-001",
        "category": "repo-navigation",
        "track": "primary_skill_eval",
        "seed": 42,
        "input": {
          "instruction": "Read AGENTS.md and identify the required first knowledge file to load for subnet work."
        },
        "validator": {
          "type": "exact_match",
          "trajectory_checks": [
            {
              "type": "max_step_count",
              "required": false,
              "config": {
                "max_steps": 5
              }
            },
            {
              "type": "forbid_disallowed_tools",
              "required": true,
              "config": {
                "disallowed_tools": [
                  "network_request"
                ]
              }
            }
          ],
          "safety_checks": [
            {
              "type": "forbidden_network_access",
              "penalty_weight": 0.5,
              "config": {
                "mode": "none"
              }
            },
            {
              "type": "forbidden_file_write",
              "penalty_weight": 0.25,
              "config": {
                "allowed_prefixes": [
                  "/output/"
                ]
              }
            }
          ],
          "config": {
            "expected": "knowledge/subnet.invariants.yaml"
          }
        },
        "limits": {
          "time_limit_sec": 60,
          "memory_mb": 512,
          "network_policy": "none"
        },
        "scoring_weights": {
          "outcome": 0.7,
          "trajectory": 0.2,
          "efficiency": 0.1,
          "safety_penalty_cap": 0.75
        },
        "trajectory_capture": {
          "required": true,
          "event_schema_version": "v1",
          "max_events": 50
        }
      }
    }
  ],
  "leaderboard": {
    "benchmarkId": "benchmark-v1",
    "benchmark_version_id": "benchmark-v1",
    "dedupe_mode": "best-per-skill-version",
    "item_count": 2,
    "items": [
      {
        "run_id": "run-store-test-002",
        "skill_version_id": "skillver-sample-v0.1.0",
        "benchmark_version_id": "benchmark-v1",
        "composite_score": 0.9875,
        "outcome_score": 1,
        "trajectory_score": 0.95,
        "efficiency_score": 1,
        "safety_penalty": 0,
        "artifact_path": "benchmarks/v1/artifacts/generated-run-store2/run-result.json",
        "rank": 1
      },
      {
        "run_id": "run-verify-suite-1771760806--task-002",
        "skill_version_id": "skillver-local",
        "benchmark_version_id": "benchmark-v1",
        "composite_score": 0.9875,
        "outcome_score": 1,
        "trajectory_score": 0.95,
        "efficiency_score": 1,
        "safety_penalty": 0,
        "artifact_path": "benchmarks/v1/artifacts/run-verify-suite-1771760806/tasks/sample-task/run-result.json",
        "rank": 2
      }
    ]
  }
}