Timmy-time-dashboard/timmy-benchmark/results/qwen2.5_14b_20260323_142119.json

{
  "model": "qwen2.5:14b",
  "timestamp": "2026-03-23T18:20:27.095190+00:00",
  "skipped": false,
  "levels": {
    "0": {
      "level": 0,
      "name": "JSON Compliance (Coin Flip)",
      "trials": [
        {
          "trial_index": 0,
          "prompt": "A coin is flipped. Respond with exactly: {\"choice\": \"heads\"} or {\"choice\": \"tails\"}. Pick one. JSON only.",
          "raw_response": "{\"choice\": \"heads\"}",
          "parsed": {
            "choice": "heads"
          },
          "valid_json": true,
          "schema_valid": true,
          "value_valid": true,
          "latency_ms": 6347.5000858306885,
          "error": ""
        },
        {
          "trial_index": 1,
          "prompt": "You must attack or defend. Respond with: {\"action\": \"attack\", \"confidence\": 0.8} or {\"action\": \"defend\", \"confidence\": 0.6}. Replace confidence with your own value 0.0-1.0. JSON only.",
          "raw_response": "{\"action\": \"defend\", \"confidence\": 0.6}",
          "parsed": {
            "action": "defend",
            "confidence": 0.6
          },
          "valid_json": true,
          "schema_valid": true,
          "value_valid": true,
          "latency_ms": 850.42405128479,
          "error": ""
        },
        {
          "trial_index": 2,
          "prompt": "Choose a direction to march. Respond with exactly: {\"direction\": \"north\", \"reason\": \"string explaining why\"}. Pick north/south/east/west. JSON only.",
          "raw_response": "{\"direction\": \"north\", \"reason\": \"There is a reported enemy encampment to the north that we need to engage.\"}",
          "parsed": {
            "direction": "north",
            "reason": "There is a reported enemy encampment to the north that we need to engage."
          },
          "valid_json": true,
          "schema_valid": true,
          "value_valid": true,
          "latency_ms": 1443.0510997772217,
          "error": ""
        }
      ],
      "passed": true,
      "score": 1.0,
      "latency_p50_ms": 1443.0510997772217,
      "latency_p99_ms": 6347.5000858306885
    },
    "1": {
      "level": 1,
      "name": "Board State Tracking (Tic-Tac-Toe)",
      "trials": [
        {
          "scenario_index": 0,
          "description": "Empty board \u2014 opening move",
          "prompt": "Board state: [null,null,null,null,null,null,null,null,null]. You are X. It is your turn. Choose a move. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
          "raw_response": "{\"move\": 4, \"reason\": \"Control the center\"}",
          "parsed": {
            "move": 4,
            "reason": "Control the center"
          },
          "valid_json": true,
          "move_legal": true,
          "move_correct": true,
          "latency_ms": 1184.3791007995605,
          "error": ""
        },
        {
          "scenario_index": 1,
          "description": "Block opponent's winning move",
          "prompt": "Board: [\"O\",null,\"O\",null,\"X\",null,null,null,null]. O has positions 0 and 2. You are X. O will win on next turn unless you block. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
          "raw_response": "{\"move\": 4, \"reason\": \"Block O's winning move\"}",
          "parsed": {
            "move": 4,
            "reason": "Block O's winning move"
          },
          "valid_json": true,
          "move_legal": false,
          "move_correct": false,
          "latency_ms": 942.816972732544,
          "error": ""
        },
        {
          "scenario_index": 2,
          "description": "Take winning move",
          "prompt": "Board: [\"X\",null,\"X\",null,\"O\",null,null,\"O\",null]. You are X. You have positions 0 and 2. You can win this turn. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
          "raw_response": "{\"move\": 2, \"reason\": \"Winning move\"}",
          "parsed": {
            "move": 2,
            "reason": "Winning move"
          },
          "valid_json": true,
          "move_legal": false,
          "move_correct": false,
          "latency_ms": 849.2491245269775,
          "error": ""
        },
        {
          "scenario_index": 3,
          "description": "Legal move on partially filled board",
          "prompt": "Board: [\"X\",\"O\",\"X\",\"O\",\"X\",\"O\",null,null,null]. You are O. Choose a legal move (positions 6, 7, or 8 are available). Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
          "raw_response": "{\"move\": 7, \"reason\": \"Creating a fork opportunity\"}",
          "parsed": {
            "move": 7,
            "reason": "Creating a fork opportunity"
          },
          "valid_json": true,
          "move_legal": true,
          "move_correct": true,
          "latency_ms": 885.8909606933594,
          "error": ""
        }
      ],
      "passed": false,
      "score": 0.5,
      "latency_p50_ms": 942.816972732544,
      "latency_p99_ms": 1184.3791007995605
    },
    "2": {
      "level": 2,
      "name": "Resource Management (Party Economy)",
      "trials": [
        {
          "scenario_index": 0,
          "description": "Budget allocation under constraint",
          "raw_response": "{\"choices\": [\"buy_food\", \"repair_armor\"], \"gold_spent\": 350, \"reason\": \"Morale is critical, so buying food is the priority to prevent desertion. Repairing armor is also crucial to reduce casualties in the upcoming battles.\"}",
          "parsed": {
            "choices": [
              "buy_food",
              "repair_armor"
            ],
            "gold_spent": 350,
            "reason": "Morale is critical, so buying food is the priority to prevent desertion. Repairing armor is also crucial to reduce casualties in the upcoming battles."
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": true,
          "latency_ms": 3121.7379570007324,
          "error": ""
        },
        {
          "scenario_index": 1,
          "description": "Troop tier upgrade decision",
          "raw_response": "{\"action\": \"upgrade_recruits\", \"reason\": \"Improving the strength of the weakest troops can balance the party's overall combat effectiveness.\", \"expected_outcome\": \"Enhanced combat capability against the raider camp with a mix of Tier-2 and Tier-3 soldiers.\"}",
          "parsed": {
            "action": "upgrade_recruits",
            "reason": "Improving the strength of the weakest troops can balance the party's overall combat effectiveness.",
            "expected_outcome": "Enhanced combat capability against the raider camp with a mix of Tier-2 and Tier-3 soldiers."
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": true,
          "latency_ms": 2936.119794845581,
          "error": ""
        },
        {
          "scenario_index": 2,
          "description": "Multi-turn planning horizon",
          "raw_response": "{\"extra_food_days\": 5, \"cost\": 50, \"remaining_gold\": 250, \"reason\": \"To ensure survival, buy enough food to cover the 15-day requirement, spending the minimum necessary now.\"}",
          "parsed": {
            "extra_food_days": 5,
            "cost": 50,
            "remaining_gold": 250,
            "reason": "To ensure survival, buy enough food to cover the 15-day requirement, spending the minimum necessary now."
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": true,
          "latency_ms": 2515.505790710449,
          "error": ""
        }
      ],
      "passed": true,
      "score": 1.0,
      "latency_p50_ms": 2936.119794845581,
      "latency_p99_ms": 3121.7379570007324
    },
    "3": {
      "level": 3,
      "name": "Battle Tactics (Formation Commands)",
      "trials": [
        {
          "scenario_index": 0,
          "description": "Ranged vs infantry \u2014 defensive opening",
          "raw_response": "{\"infantry_order\": \"shield_wall\", \"archer_order\": \"skirmish\", \"reason\": \"Infantry should prepare for incoming melee by forming a defensive posture. Archers should engage enemy from a distance to maximize casualties before close combat.\", \"expected_outcome\": \"Inflict significant casualties on the enemy heavy infantry while minimizing losses until melee contact.\"}",
          "parsed": {
            "infantry_order": "shield_wall",
            "archer_order": "skirmish",
            "reason": "Infantry should prepare for incoming melee by forming a defensive posture. Archers should engage enemy from a distance to maximize casualties before close combat.",
            "expected_outcome": "Inflict significant casualties on the enemy heavy infantry while minimizing losses until melee contact."
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": true,
          "latency_ms": 3828.04012298584,
          "error": ""
        },
        {
          "scenario_index": 1,
          "description": "Outnumbered \u2014 retreat decision",
          "raw_response": "{\"order\": \"skirmish\", \"direction\": \"east\", \"reason\": \"Engage the enemy at range to buy time for an organized retreat.\", \"priority\": \"preserve_troops\"}",
          "parsed": {
            "order": "skirmish",
            "direction": "east",
            "reason": "Engage the enemy at range to buy time for an organized retreat.",
            "priority": "preserve_troops"
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": false,
          "latency_ms": 2162.7819538116455,
          "error": ""
        },
        {
          "scenario_index": 2,
          "description": "Cavalry flanking opportunity",
          "raw_response": "{\"cavalry_order\": \"flank_right\", \"timing\": \"now\", \"reason\": \"Enemy left flank is unprotected and vulnerable, providing an opportunity to break their line quickly.\", \"risk\": \"low\"}",
          "parsed": {
            "cavalry_order": "flank_right",
            "timing": "now",
            "reason": "Enemy left flank is unprotected and vulnerable, providing an opportunity to break their line quickly.",
            "risk": "low"
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": true,
          "latency_ms": 2248.1229305267334,
          "error": ""
        }
      ],
      "passed": true,
      "score": 1.0,
      "latency_p50_ms": 2248.1229305267334,
      "latency_p99_ms": 3828.04012298584
    },
    "4": {
      "level": 4,
      "name": "Trade Route (Campaign Navigation)",
      "trials": [
        {
          "scenario_index": 0,
          "description": "Optimal trade route selection",
          "raw_response": "{\"route\": \"A\", \"expected_profit\": 500, \"days_used\": 8, \"reason\": \"Offers the highest profit with a reasonable travel time.\", \"risk\": \"medium\"}",
          "parsed": {
            "route": "A",
            "expected_profit": 500,
            "days_used": 8,
            "reason": "Offers the highest profit with a reasonable travel time.",
            "risk": "medium"
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": true,
          "latency_ms": 3142.971992492676,
          "error": ""
        },
        {
          "scenario_index": 1,
          "description": "Adapt plan when war declared",
          "raw_response": "{\"decision\": \"divert_to_sturgia\", \"reason\": \"Vlandia is at war with my faction, making the journey dangerous. Sturgia is neutral and safer, though it's slightly further. The Empire is allied and safest but further still.\", \"gold_at_risk\": 400}",
          "parsed": {
            "decision": "divert_to_sturgia",
            "reason": "Vlandia is at war with my faction, making the journey dangerous. Sturgia is neutral and safer, though it's slightly further. The Empire is allied and safest but further still.",
            "gold_at_risk": 400
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": true,
          "latency_ms": 3317.8718090057373,
          "error": ""
        },
        {
          "scenario_index": 2,
          "description": "Multi-stop route planning with constraints",
          "raw_response": "{\"stops\": [\"Ortysia\", \"Epicrotea\", \"Rhotae\"], \"total_days\": 12, \"estimated_profit\": 1200, \"reason\": \"Optimized for time and profit, focusing on essential goods with high demand and low risk.\"}",
          "parsed": {
            "stops": [
              "Ortysia",
              "Epicrotea",
              "Rhotae"
            ],
            "total_days": 12,
            "estimated_profit": 1200,
            "reason": "Optimized for time and profit, focusing on essential goods with high demand and low risk."
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": true,
          "latency_ms": 3235.442876815796,
          "error": ""
        }
      ],
      "passed": true,
      "score": 1.0,
      "latency_p50_ms": 3235.442876815796,
      "latency_p99_ms": 3317.8718090057373
    },
    "5": {
      "level": 5,
      "name": "Mini Campaign (Full Campaign Loop)",
      "turns": [
        {
          "turn": 1,
          "state_before": {
            "turn": 1,
            "gold": 1000,
            "troops": 25,
            "renown": 150,
            "settlements": 0,
            "relations": {
              "vlandia": 0,
              "sturgia": 0,
              "empire": -80,
              "aserai": 0,
              "battania": 0
            }
          },
          "raw_response": "{\"action\": \"trade_circuit\", \"target_faction\": null, \"reason\": \"Increase gold and renown without risking relations or encountering retaliation.\", \"goal\": \"Economic stability and reputation building\"}",
          "parsed": {
            "action": "trade_circuit",
            "target_faction": null,
            "reason": "Increase gold and renown without risking relations or encountering retaliation.",
            "goal": "Economic stability and reputation building"
          },
          "valid_json": true,
          "valid_action": true,
          "action": "trade_circuit",
          "latency_ms": 3970.310926437378,
          "error": ""
        },
        {
          "turn": 2,
          "state_before": {
            "turn": 2,
            "gold": 1300,
            "troops": 25,
            "renown": 160,
            "settlements": 0,
            "relations": {
              "vlandia": 0,
              "sturgia": 0,
              "empire": -80,
              "aserai": 0,
              "battania": 0
            }
          },
          "raw_response": "{\"action\": \"escort_caravan\", \"target_faction\": \"empire\", \"reason\": \"Improve relations with a major faction and gain additional renown.\", \"goal\": \"Political alliance and reputation advancement\"}",
          "parsed": {
            "action": "escort_caravan",
            "target_faction": "empire",
            "reason": "Improve relations with a major faction and gain additional renown.",
            "goal": "Political alliance and reputation advancement"
          },
          "valid_json": true,
          "valid_action": true,
          "action": "escort_caravan",
          "latency_ms": 2754.3041706085205,
          "error": ""
        },
        {
          "turn": 3,
          "state_before": {
            "turn": 3,
            "gold": 1450,
            "troops": 25,
            "renown": 180,
            "settlements": 0,
            "relations": {
              "vlandia": 0,
              "sturgia": 0,
              "empire": -80,
              "aserai": 0,
              "battania": 0
            }
          },
          "raw_response": "{\"action\": \"recruit_troops\", \"target_faction\": null, \"reason\": \"Increase military strength to prepare for a potential siege or defense.\", \"goal\": \"Military readiness and expansion capability\"}",
          "parsed": {
            "action": "recruit_troops",
            "target_faction": null,
            "reason": "Increase military strength to prepare for a potential siege or defense.",
            "goal": "Military readiness and expansion capability"
          },
          "valid_json": true,
          "valid_action": true,
          "action": "recruit_troops",
          "latency_ms": 2931.4141273498535,
          "error": ""
        },
        {
          "turn": 4,
          "state_before": {
            "turn": 4,
            "gold": 1250,
            "troops": 40,
            "renown": 180,
            "settlements": 0,
            "relations": {
              "vlandia": 0,
              "sturgia": 0,
              "empire": -80,
              "aserai": 0,
              "battania": 0
            }
          },
          "raw_response": "{\"action\": \"siege_castle\", \"target_faction\": \"empire\", \"reason\": \"Attempt to capture a settlement to meet the win condition of owning at least one settlement.\", \"goal\": \"Territorial expansion and victory condition\"}",
          "parsed": {
            "action": "siege_castle",
            "target_faction": "empire",
            "reason": "Attempt to capture a settlement to meet the win condition of owning at least one settlement.",
            "goal": "Territorial expansion and victory condition"
          },
          "valid_json": true,
          "valid_action": true,
          "action": "siege_castle",
          "latency_ms": 3413.828134536743,
          "error": ""
        }
      ],
      "final_state": {
        "turn": 4,
        "gold": 750,
        "troops": 40,
        "renown": 260,
        "settlements": 1,
        "relations": {
          "vlandia": 0,
          "sturgia": 0,
          "empire": -80,
          "aserai": 0,
          "battania": 0
        }
      },
      "passed": true,
      "score": 1.0,
      "reached_gold_target": false,
      "reached_renown_target": false,
      "reached_settlement_target": true,
      "latency_p50_ms": 3413.828134536743,
      "latency_p99_ms": 3970.310926437378
    }
  },
  "summary": {
    "total_elapsed_s": 52.1,
    "levels_run": [
      0,
      1,
      2,
      3,
      4,
      5
    ],
    "levels_passed": [
      0,
      2,
      3,
      4,
      5
    ],
    "levels_failed": [
      1
    ],
    "m1_gate_passed": false,
    "m1_gate_notes": [
      "Level 1 FAILED (score=0.50)"
    ],
    "m1_latency_threshold_ms": 10000
  }
}