Files
Timmy-time-dashboard/timmy-benchmark/results/qwen2.5_14b_20260323_142119.json
Alexander Whitestone 9e08e87312
Some checks failed
Tests / lint (push) Has been cancelled
Tests / test (push) Has been cancelled
[claude] Bannerlord M0: Run cognitive benchmark on hermes3, fix L1 string-int coercion (#1092) (#1159)
Co-authored-by: Alexander Whitestone <alexpaynex@gmail.com>
Co-committed-by: Alexander Whitestone <alexpaynex@gmail.com>
2026-03-23 19:38:48 +00:00

476 lines
18 KiB
JSON

{
"model": "qwen2.5:14b",
"timestamp": "2026-03-23T18:20:27.095190+00:00",
"skipped": false,
"levels": {
"0": {
"level": 0,
"name": "JSON Compliance (Coin Flip)",
"trials": [
{
"trial_index": 0,
"prompt": "A coin is flipped. Respond with exactly: {\"choice\": \"heads\"} or {\"choice\": \"tails\"}. Pick one. JSON only.",
"raw_response": "{\"choice\": \"heads\"}",
"parsed": {
"choice": "heads"
},
"valid_json": true,
"schema_valid": true,
"value_valid": true,
"latency_ms": 6347.5000858306885,
"error": ""
},
{
"trial_index": 1,
"prompt": "You must attack or defend. Respond with: {\"action\": \"attack\", \"confidence\": 0.8} or {\"action\": \"defend\", \"confidence\": 0.6}. Replace confidence with your own value 0.0-1.0. JSON only.",
"raw_response": "{\"action\": \"defend\", \"confidence\": 0.6}",
"parsed": {
"action": "defend",
"confidence": 0.6
},
"valid_json": true,
"schema_valid": true,
"value_valid": true,
"latency_ms": 850.42405128479,
"error": ""
},
{
"trial_index": 2,
"prompt": "Choose a direction to march. Respond with exactly: {\"direction\": \"north\", \"reason\": \"string explaining why\"}. Pick north/south/east/west. JSON only.",
"raw_response": "{\"direction\": \"north\", \"reason\": \"There is a reported enemy encampment to the north that we need to engage.\"}",
"parsed": {
"direction": "north",
"reason": "There is a reported enemy encampment to the north that we need to engage."
},
"valid_json": true,
"schema_valid": true,
"value_valid": true,
"latency_ms": 1443.0510997772217,
"error": ""
}
],
"passed": true,
"score": 1.0,
"latency_p50_ms": 1443.0510997772217,
"latency_p99_ms": 6347.5000858306885
},
"1": {
"level": 1,
"name": "Board State Tracking (Tic-Tac-Toe)",
"trials": [
{
"scenario_index": 0,
"description": "Empty board \u2014 opening move",
"prompt": "Board state: [null,null,null,null,null,null,null,null,null]. You are X. It is your turn. Choose a move. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
"raw_response": "{\"move\": 4, \"reason\": \"Control the center\"}",
"parsed": {
"move": 4,
"reason": "Control the center"
},
"valid_json": true,
"move_legal": true,
"move_correct": true,
"latency_ms": 1184.3791007995605,
"error": ""
},
{
"scenario_index": 1,
"description": "Block opponent's winning move",
"prompt": "Board: [\"O\",null,\"O\",null,\"X\",null,null,null,null]. O has positions 0 and 2. You are X. O will win on next turn unless you block. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
"raw_response": "{\"move\": 4, \"reason\": \"Block O's winning move\"}",
"parsed": {
"move": 4,
"reason": "Block O's winning move"
},
"valid_json": true,
"move_legal": false,
"move_correct": false,
"latency_ms": 942.816972732544,
"error": ""
},
{
"scenario_index": 2,
"description": "Take winning move",
"prompt": "Board: [\"X\",null,\"X\",null,\"O\",null,null,\"O\",null]. You are X. You have positions 0 and 2. You can win this turn. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
"raw_response": "{\"move\": 2, \"reason\": \"Winning move\"}",
"parsed": {
"move": 2,
"reason": "Winning move"
},
"valid_json": true,
"move_legal": false,
"move_correct": false,
"latency_ms": 849.2491245269775,
"error": ""
},
{
"scenario_index": 3,
"description": "Legal move on partially filled board",
"prompt": "Board: [\"X\",\"O\",\"X\",\"O\",\"X\",\"O\",null,null,null]. You are O. Choose a legal move (positions 6, 7, or 8 are available). Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
"raw_response": "{\"move\": 7, \"reason\": \"Creating a fork opportunity\"}",
"parsed": {
"move": 7,
"reason": "Creating a fork opportunity"
},
"valid_json": true,
"move_legal": true,
"move_correct": true,
"latency_ms": 885.8909606933594,
"error": ""
}
],
"passed": false,
"score": 0.5,
"latency_p50_ms": 942.816972732544,
"latency_p99_ms": 1184.3791007995605
},
"2": {
"level": 2,
"name": "Resource Management (Party Economy)",
"trials": [
{
"scenario_index": 0,
"description": "Budget allocation under constraint",
"raw_response": "{\"choices\": [\"buy_food\", \"repair_armor\"], \"gold_spent\": 350, \"reason\": \"Morale is critical, so buying food is the priority to prevent desertion. Repairing armor is also crucial to reduce casualties in the upcoming battles.\"}",
"parsed": {
"choices": [
"buy_food",
"repair_armor"
],
"gold_spent": 350,
"reason": "Morale is critical, so buying food is the priority to prevent desertion. Repairing armor is also crucial to reduce casualties in the upcoming battles."
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 3121.7379570007324,
"error": ""
},
{
"scenario_index": 1,
"description": "Troop tier upgrade decision",
"raw_response": "{\"action\": \"upgrade_recruits\", \"reason\": \"Improving the strength of the weakest troops can balance the party's overall combat effectiveness.\", \"expected_outcome\": \"Enhanced combat capability against the raider camp with a mix of Tier-2 and Tier-3 soldiers.\"}",
"parsed": {
"action": "upgrade_recruits",
"reason": "Improving the strength of the weakest troops can balance the party's overall combat effectiveness.",
"expected_outcome": "Enhanced combat capability against the raider camp with a mix of Tier-2 and Tier-3 soldiers."
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 2936.119794845581,
"error": ""
},
{
"scenario_index": 2,
"description": "Multi-turn planning horizon",
"raw_response": "{\"extra_food_days\": 5, \"cost\": 50, \"remaining_gold\": 250, \"reason\": \"To ensure survival, buy enough food to cover the 15-day requirement, spending the minimum necessary now.\"}",
"parsed": {
"extra_food_days": 5,
"cost": 50,
"remaining_gold": 250,
"reason": "To ensure survival, buy enough food to cover the 15-day requirement, spending the minimum necessary now."
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 2515.505790710449,
"error": ""
}
],
"passed": true,
"score": 1.0,
"latency_p50_ms": 2936.119794845581,
"latency_p99_ms": 3121.7379570007324
},
"3": {
"level": 3,
"name": "Battle Tactics (Formation Commands)",
"trials": [
{
"scenario_index": 0,
"description": "Ranged vs infantry \u2014 defensive opening",
"raw_response": "{\"infantry_order\": \"shield_wall\", \"archer_order\": \"skirmish\", \"reason\": \"Infantry should prepare for incoming melee by forming a defensive posture. Archers should engage enemy from a distance to maximize casualties before close combat.\", \"expected_outcome\": \"Inflict significant casualties on the enemy heavy infantry while minimizing losses until melee contact.\"}",
"parsed": {
"infantry_order": "shield_wall",
"archer_order": "skirmish",
"reason": "Infantry should prepare for incoming melee by forming a defensive posture. Archers should engage enemy from a distance to maximize casualties before close combat.",
"expected_outcome": "Inflict significant casualties on the enemy heavy infantry while minimizing losses until melee contact."
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 3828.04012298584,
"error": ""
},
{
"scenario_index": 1,
"description": "Outnumbered \u2014 retreat decision",
"raw_response": "{\"order\": \"skirmish\", \"direction\": \"east\", \"reason\": \"Engage the enemy at range to buy time for an organized retreat.\", \"priority\": \"preserve_troops\"}",
"parsed": {
"order": "skirmish",
"direction": "east",
"reason": "Engage the enemy at range to buy time for an organized retreat.",
"priority": "preserve_troops"
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": false,
"latency_ms": 2162.7819538116455,
"error": ""
},
{
"scenario_index": 2,
"description": "Cavalry flanking opportunity",
"raw_response": "{\"cavalry_order\": \"flank_right\", \"timing\": \"now\", \"reason\": \"Enemy left flank is unprotected and vulnerable, providing an opportunity to break their line quickly.\", \"risk\": \"low\"}",
"parsed": {
"cavalry_order": "flank_right",
"timing": "now",
"reason": "Enemy left flank is unprotected and vulnerable, providing an opportunity to break their line quickly.",
"risk": "low"
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 2248.1229305267334,
"error": ""
}
],
"passed": true,
"score": 1.0,
"latency_p50_ms": 2248.1229305267334,
"latency_p99_ms": 3828.04012298584
},
"4": {
"level": 4,
"name": "Trade Route (Campaign Navigation)",
"trials": [
{
"scenario_index": 0,
"description": "Optimal trade route selection",
"raw_response": "{\"route\": \"A\", \"expected_profit\": 500, \"days_used\": 8, \"reason\": \"Offers the highest profit with a reasonable travel time.\", \"risk\": \"medium\"}",
"parsed": {
"route": "A",
"expected_profit": 500,
"days_used": 8,
"reason": "Offers the highest profit with a reasonable travel time.",
"risk": "medium"
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 3142.971992492676,
"error": ""
},
{
"scenario_index": 1,
"description": "Adapt plan when war declared",
"raw_response": "{\"decision\": \"divert_to_sturgia\", \"reason\": \"Vlandia is at war with my faction, making the journey dangerous. Sturgia is neutral and safer, though it's slightly further. The Empire is allied and safest but further still.\", \"gold_at_risk\": 400}",
"parsed": {
"decision": "divert_to_sturgia",
"reason": "Vlandia is at war with my faction, making the journey dangerous. Sturgia is neutral and safer, though it's slightly further. The Empire is allied and safest but further still.",
"gold_at_risk": 400
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 3317.8718090057373,
"error": ""
},
{
"scenario_index": 2,
"description": "Multi-stop route planning with constraints",
"raw_response": "{\"stops\": [\"Ortysia\", \"Epicrotea\", \"Rhotae\"], \"total_days\": 12, \"estimated_profit\": 1200, \"reason\": \"Optimized for time and profit, focusing on essential goods with high demand and low risk.\"}",
"parsed": {
"stops": [
"Ortysia",
"Epicrotea",
"Rhotae"
],
"total_days": 12,
"estimated_profit": 1200,
"reason": "Optimized for time and profit, focusing on essential goods with high demand and low risk."
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 3235.442876815796,
"error": ""
}
],
"passed": true,
"score": 1.0,
"latency_p50_ms": 3235.442876815796,
"latency_p99_ms": 3317.8718090057373
},
"5": {
"level": 5,
"name": "Mini Campaign (Full Campaign Loop)",
"turns": [
{
"turn": 1,
"state_before": {
"turn": 1,
"gold": 1000,
"troops": 25,
"renown": 150,
"settlements": 0,
"relations": {
"vlandia": 0,
"sturgia": 0,
"empire": -80,
"aserai": 0,
"battania": 0
}
},
"raw_response": "{\"action\": \"trade_circuit\", \"target_faction\": null, \"reason\": \"Increase gold and renown without risking relations or encountering retaliation.\", \"goal\": \"Economic stability and reputation building\"}",
"parsed": {
"action": "trade_circuit",
"target_faction": null,
"reason": "Increase gold and renown without risking relations or encountering retaliation.",
"goal": "Economic stability and reputation building"
},
"valid_json": true,
"valid_action": true,
"action": "trade_circuit",
"latency_ms": 3970.310926437378,
"error": ""
},
{
"turn": 2,
"state_before": {
"turn": 2,
"gold": 1300,
"troops": 25,
"renown": 160,
"settlements": 0,
"relations": {
"vlandia": 0,
"sturgia": 0,
"empire": -80,
"aserai": 0,
"battania": 0
}
},
"raw_response": "{\"action\": \"escort_caravan\", \"target_faction\": \"empire\", \"reason\": \"Improve relations with a major faction and gain additional renown.\", \"goal\": \"Political alliance and reputation advancement\"}",
"parsed": {
"action": "escort_caravan",
"target_faction": "empire",
"reason": "Improve relations with a major faction and gain additional renown.",
"goal": "Political alliance and reputation advancement"
},
"valid_json": true,
"valid_action": true,
"action": "escort_caravan",
"latency_ms": 2754.3041706085205,
"error": ""
},
{
"turn": 3,
"state_before": {
"turn": 3,
"gold": 1450,
"troops": 25,
"renown": 180,
"settlements": 0,
"relations": {
"vlandia": 0,
"sturgia": 0,
"empire": -80,
"aserai": 0,
"battania": 0
}
},
"raw_response": "{\"action\": \"recruit_troops\", \"target_faction\": null, \"reason\": \"Increase military strength to prepare for a potential siege or defense.\", \"goal\": \"Military readiness and expansion capability\"}",
"parsed": {
"action": "recruit_troops",
"target_faction": null,
"reason": "Increase military strength to prepare for a potential siege or defense.",
"goal": "Military readiness and expansion capability"
},
"valid_json": true,
"valid_action": true,
"action": "recruit_troops",
"latency_ms": 2931.4141273498535,
"error": ""
},
{
"turn": 4,
"state_before": {
"turn": 4,
"gold": 1250,
"troops": 40,
"renown": 180,
"settlements": 0,
"relations": {
"vlandia": 0,
"sturgia": 0,
"empire": -80,
"aserai": 0,
"battania": 0
}
},
"raw_response": "{\"action\": \"siege_castle\", \"target_faction\": \"empire\", \"reason\": \"Attempt to capture a settlement to meet the win condition of owning at least one settlement.\", \"goal\": \"Territorial expansion and victory condition\"}",
"parsed": {
"action": "siege_castle",
"target_faction": "empire",
"reason": "Attempt to capture a settlement to meet the win condition of owning at least one settlement.",
"goal": "Territorial expansion and victory condition"
},
"valid_json": true,
"valid_action": true,
"action": "siege_castle",
"latency_ms": 3413.828134536743,
"error": ""
}
],
"final_state": {
"turn": 4,
"gold": 750,
"troops": 40,
"renown": 260,
"settlements": 1,
"relations": {
"vlandia": 0,
"sturgia": 0,
"empire": -80,
"aserai": 0,
"battania": 0
}
},
"passed": true,
"score": 1.0,
"reached_gold_target": false,
"reached_renown_target": false,
"reached_settlement_target": true,
"latency_p50_ms": 3413.828134536743,
"latency_p99_ms": 3970.310926437378
}
},
"summary": {
"total_elapsed_s": 52.1,
"levels_run": [
0,
1,
2,
3,
4,
5
],
"levels_passed": [
0,
2,
3,
4,
5
],
"levels_failed": [
1
],
"m1_gate_passed": false,
"m1_gate_notes": [
"Level 1 FAILED (score=0.50)"
],
"m1_latency_threshold_ms": 10000
}
}