Benchmarking
Sample Benchmark v1
Versioned benchmark track with trajectory-first scoring and deterministic task validation.
Tasks
2
Categories
2
Top Score
0.988
Entries
2
Benchmark Shape
Category distribution and task time-budget profile
Top run score shape
Task Time Limits
Task time_limit_sec
Sequence of time_limit_sec values across task files in this benchmark version.
Run Controls
Trigger single-task or suite benchmark runs through the web API and persist results to the configured backend.
Run Actions
Benchmark Tasks
View leaderboard
sample-task-002.json • repo-routing • primary_skill_eval
task_id=sample-002 • time_limit=60s • seed=43
Outcome 0.00
Trajectory 0.00
Efficiency 0.00
Safety Cap 0.75
sample-task.json • repo-navigation • primary_skill_eval
task_id=sample-001 • time_limit=60s • seed=42
Outcome 0.00
Trajectory 0.00
Efficiency 0.00
Safety Cap 0.75
Raw Benchmark JSON
{
"id": "benchmark-v1",
"slug": "sample",
"name": "Sample Benchmark v1",
"tasks": [
{
"file": "sample-task-002.json",
"spec": {
"task_id": "sample-002",
"category": "repo-routing",
"track": "primary_skill_eval",
"seed": 43,
"input": {
"instruction": "Read AGENTS.md and identify the starting file for task routing in the knowledge base."
},
"validator": {
"type": "exact_match",
"trajectory_checks": [
{
"type": "max_step_count",
"required": false,
"config": {
"max_steps": 5
}
}
],
"safety_checks": [
{
"type": "forbidden_network_access",
"penalty_weight": 0.5,
"config": {
"mode": "none"
}
}
],
"config": {
"expected": "knowledge/INDEX.yaml"
}
},
"limits": {
"time_limit_sec": 60,
"memory_mb": 512,
"network_policy": "none"
},
"scoring_weights": {
"outcome": 0.7,
"trajectory": 0.2,
"efficiency": 0.1,
"safety_penalty_cap": 0.75
},
"trajectory_capture": {
"required": true,
"event_schema_version": "v1",
"max_events": 50
}
}
},
{
"file": "sample-task.json",
"spec": {
"task_id": "sample-001",
"category": "repo-navigation",
"track": "primary_skill_eval",
"seed": 42,
"input": {
"instruction": "Read AGENTS.md and identify the required first knowledge file to load for subnet work."
},
"validator": {
"type": "exact_match",
"trajectory_checks": [
{
"type": "max_step_count",
"required": false,
"config": {
"max_steps": 5
}
},
{
"type": "forbid_disallowed_tools",
"required": true,
"config": {
"disallowed_tools": [
"network_request"
]
}
}
],
"safety_checks": [
{
"type": "forbidden_network_access",
"penalty_weight": 0.5,
"config": {
"mode": "none"
}
},
{
"type": "forbidden_file_write",
"penalty_weight": 0.25,
"config": {
"allowed_prefixes": [
"/output/"
]
}
}
],
"config": {
"expected": "knowledge/subnet.invariants.yaml"
}
},
"limits": {
"time_limit_sec": 60,
"memory_mb": 512,
"network_policy": "none"
},
"scoring_weights": {
"outcome": 0.7,
"trajectory": 0.2,
"efficiency": 0.1,
"safety_penalty_cap": 0.75
},
"trajectory_capture": {
"required": true,
"event_schema_version": "v1",
"max_events": 50
}
}
}
],
"leaderboard": {
"benchmarkId": "benchmark-v1",
"benchmark_version_id": "benchmark-v1",
"dedupe_mode": "best-per-skill-version",
"item_count": 2,
"items": [
{
"run_id": "run-store-test-002",
"skill_version_id": "skillver-sample-v0.1.0",
"benchmark_version_id": "benchmark-v1",
"composite_score": 0.9875,
"outcome_score": 1,
"trajectory_score": 0.95,
"efficiency_score": 1,
"safety_penalty": 0,
"artifact_path": "benchmarks/v1/artifacts/generated-run-store2/run-result.json",
"rank": 1
},
{
"run_id": "run-verify-suite-1771760806--task-002",
"skill_version_id": "skillver-local",
"benchmark_version_id": "benchmark-v1",
"composite_score": 0.9875,
"outcome_score": 1,
"trajectory_score": 0.95,
"efficiency_score": 1,
"safety_penalty": 0,
"artifact_path": "benchmarks/v1/artifacts/run-verify-suite-1771760806/tasks/sample-task/run-result.json",
"rank": 2
}
]
}
}