Refs #1092 - Run full 6-level benchmark against hermes3:latest and hermes3:8b - Fix Level 1 harness: coerce string-digit moves to int (hermes3 emits "move": "4" instead of "move": 4; type fidelity is already tested at L0) - All three models (qwen2.5:14b, hermes3:latest, hermes3:8b) pass L0 and L2–L5 but fail L1 at 50% score - Root cause: models pick heuristic moves without checking if the target square is already occupied - M1 gate: FAILED for all models; qwen3:14b not available on instance - Add SCORECARD.md with full cross-model comparison and recommendations Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
476 lines
21 KiB
JSON
476 lines
21 KiB
JSON
{
|
|
"model": "hermes3:8b",
|
|
"timestamp": "2026-03-23T19:29:22.731080+00:00",
|
|
"skipped": false,
|
|
"levels": {
|
|
"0": {
|
|
"level": 0,
|
|
"name": "JSON Compliance (Coin Flip)",
|
|
"trials": [
|
|
{
|
|
"trial_index": 0,
|
|
"prompt": "A coin is flipped. Respond with exactly: {\"choice\": \"heads\"} or {\"choice\": \"tails\"}. Pick one. JSON only.",
|
|
"raw_response": "{\"choice\": \"heads\"}",
|
|
"parsed": {
|
|
"choice": "heads"
|
|
},
|
|
"valid_json": true,
|
|
"schema_valid": true,
|
|
"value_valid": true,
|
|
"latency_ms": 522.9010581970215,
|
|
"error": ""
|
|
},
|
|
{
|
|
"trial_index": 1,
|
|
"prompt": "You must attack or defend. Respond with: {\"action\": \"attack\", \"confidence\": 0.8} or {\"action\": \"defend\", \"confidence\": 0.6}. Replace confidence with your own value 0.0-1.0. JSON only.",
|
|
"raw_response": "{\"action\": \"defend\", \"confidence\": 0.7}",
|
|
"parsed": {
|
|
"action": "defend",
|
|
"confidence": 0.7
|
|
},
|
|
"valid_json": true,
|
|
"schema_valid": true,
|
|
"value_valid": true,
|
|
"latency_ms": 592.6809310913086,
|
|
"error": ""
|
|
},
|
|
{
|
|
"trial_index": 2,
|
|
"prompt": "Choose a direction to march. Respond with exactly: {\"direction\": \"north\", \"reason\": \"string explaining why\"}. Pick north/south/east/west. JSON only.",
|
|
"raw_response": "{\"direction\": \"north\", \"reason\": \"string explaining why\"}",
|
|
"parsed": {
|
|
"direction": "north",
|
|
"reason": "string explaining why"
|
|
},
|
|
"valid_json": true,
|
|
"schema_valid": true,
|
|
"value_valid": true,
|
|
"latency_ms": 569.8130130767822,
|
|
"error": ""
|
|
}
|
|
],
|
|
"passed": true,
|
|
"score": 1.0,
|
|
"latency_p50_ms": 569.8130130767822,
|
|
"latency_p99_ms": 592.6809310913086
|
|
},
|
|
"1": {
|
|
"level": 1,
|
|
"name": "Board State Tracking (Tic-Tac-Toe)",
|
|
"trials": [
|
|
{
|
|
"scenario_index": 0,
|
|
"description": "Empty board \u2014 opening move",
|
|
"prompt": "Board state: [null,null,null,null,null,null,null,null,null]. You are X. It is your turn. Choose a move. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
|
|
"raw_response": "{\"move\": \"4\", \"reason\": \"Center move to establish dominance\"}",
|
|
"parsed": {
|
|
"move": "4",
|
|
"reason": "Center move to establish dominance"
|
|
},
|
|
"valid_json": true,
|
|
"move_legal": true,
|
|
"move_correct": true,
|
|
"latency_ms": 766.8571472167969,
|
|
"error": ""
|
|
},
|
|
{
|
|
"scenario_index": 1,
|
|
"description": "Block opponent's winning move",
|
|
"prompt": "Board: [\"O\",null,\"O\",null,\"X\",null,null,null,null]. O has positions 0 and 2. You are X. O will win on next turn unless you block. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
|
|
"raw_response": "{\"move\": 4, \"reason\": \"Block the winning move for O in the center\"}",
|
|
"parsed": {
|
|
"move": 4,
|
|
"reason": "Block the winning move for O in the center"
|
|
},
|
|
"valid_json": true,
|
|
"move_legal": false,
|
|
"move_correct": false,
|
|
"latency_ms": 734.1668605804443,
|
|
"error": ""
|
|
},
|
|
{
|
|
"scenario_index": 2,
|
|
"description": "Take winning move",
|
|
"prompt": "Board: [\"X\",null,\"X\",null,\"O\",null,null,\"O\",null]. You are X. You have positions 0 and 2. You can win this turn. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
|
|
"raw_response": "{\"move\": 0, \"reason\": \"I can win this turn by placing my X in position 0, creating a winning line with X at positions 0, 4, and 8.\"}",
|
|
"parsed": {
|
|
"move": 0,
|
|
"reason": "I can win this turn by placing my X in position 0, creating a winning line with X at positions 0, 4, and 8."
|
|
},
|
|
"valid_json": true,
|
|
"move_legal": false,
|
|
"move_correct": false,
|
|
"latency_ms": 1312.8020763397217,
|
|
"error": ""
|
|
},
|
|
{
|
|
"scenario_index": 3,
|
|
"description": "Legal move on partially filled board",
|
|
"prompt": "Board: [\"X\",\"O\",\"X\",\"O\",\"X\",\"O\",null,null,null]. You are O. Choose a legal move (positions 6, 7, or 8 are available). Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
|
|
"raw_response": "{\"move\": 6, \"reason\": \"Block X's potential winning move in the center.\"}",
|
|
"parsed": {
|
|
"move": 6,
|
|
"reason": "Block X's potential winning move in the center."
|
|
},
|
|
"valid_json": true,
|
|
"move_legal": true,
|
|
"move_correct": true,
|
|
"latency_ms": 751.0488033294678,
|
|
"error": ""
|
|
}
|
|
],
|
|
"passed": false,
|
|
"score": 0.5,
|
|
"latency_p50_ms": 766.8571472167969,
|
|
"latency_p99_ms": 1312.8020763397217
|
|
},
|
|
"2": {
|
|
"level": 2,
|
|
"name": "Resource Management (Party Economy)",
|
|
"trials": [
|
|
{
|
|
"scenario_index": 0,
|
|
"description": "Budget allocation under constraint",
|
|
"raw_response": "{\"choices\": [\"buy_food\", \"repair_armor\"], \"gold_spent\": 350, \"reason\": \"The priority is to address the critical morale issue and reduce casualty rates. Buying food for 20 days will stabilize morale, preventing potential desertions. Repairing armor will also help reduce the casualty rate, making the remaining troops more effective. Recruit infantry is not chosen as it would deplete all the gold, leaving no funds for food or armor repair.\"}",
|
|
"parsed": {
|
|
"choices": [
|
|
"buy_food",
|
|
"repair_armor"
|
|
],
|
|
"gold_spent": 350,
|
|
"reason": "The priority is to address the critical morale issue and reduce casualty rates. Buying food for 20 days will stabilize morale, preventing potential desertions. Repairing armor will also help reduce the casualty rate, making the remaining troops more effective. Recruit infantry is not chosen as it would deplete all the gold, leaving no funds for food or armor repair."
|
|
},
|
|
"valid_json": true,
|
|
"schema_valid": true,
|
|
"strategically_sound": true,
|
|
"latency_ms": 2832.072973251343,
|
|
"error": ""
|
|
},
|
|
{
|
|
"scenario_index": 1,
|
|
"description": "Troop tier upgrade decision",
|
|
"raw_response": "{\"action\": \"upgrade_recruits\", \"reason\": \"Upgrading recruits to Tier-2 will improve our party's combat effectiveness against the upcoming raider camp attack. The moderate difficulty suggests we need a stronger force to succeed.\", \"expected_outcome\": \"Increased chances of victory in the upcoming battle, potentially saving resources in the long run by avoiding higher upkeep costs of dismissing recruits.\"}",
|
|
"parsed": {
|
|
"action": "upgrade_recruits",
|
|
"reason": "Upgrading recruits to Tier-2 will improve our party's combat effectiveness against the upcoming raider camp attack. The moderate difficulty suggests we need a stronger force to succeed.",
|
|
"expected_outcome": "Increased chances of victory in the upcoming battle, potentially saving resources in the long run by avoiding higher upkeep costs of dismissing recruits."
|
|
},
|
|
"valid_json": true,
|
|
"schema_valid": true,
|
|
"strategically_sound": true,
|
|
"latency_ms": 2407.7491760253906,
|
|
"error": ""
|
|
},
|
|
{
|
|
"scenario_index": 2,
|
|
"description": "Multi-turn planning horizon",
|
|
"raw_response": "{\"extra_food_days\": 5, \"cost\": 50, \"remaining_gold\": 250, \"reason\": \"Buying 5 extra food days will ensure we have enough food to cross the desert, and we will still have enough gold left to reach the town on day 10 to buy more supplies.\"}",
|
|
"parsed": {
|
|
"extra_food_days": 5,
|
|
"cost": 50,
|
|
"remaining_gold": 250,
|
|
"reason": "Buying 5 extra food days will ensure we have enough food to cross the desert, and we will still have enough gold left to reach the town on day 10 to buy more supplies."
|
|
},
|
|
"valid_json": true,
|
|
"schema_valid": true,
|
|
"strategically_sound": true,
|
|
"latency_ms": 2028.775930404663,
|
|
"error": ""
|
|
}
|
|
],
|
|
"passed": true,
|
|
"score": 1.0,
|
|
"latency_p50_ms": 2407.7491760253906,
|
|
"latency_p99_ms": 2832.072973251343
|
|
},
|
|
"3": {
|
|
"level": 3,
|
|
"name": "Battle Tactics (Formation Commands)",
|
|
"trials": [
|
|
{
|
|
"scenario_index": 0,
|
|
"description": "Ranged vs infantry \u2014 defensive opening",
|
|
"raw_response": "{\"infantry_order\": \"skirmish\", \"archer_order\": \"skirmish\", \"reason\": \"The enemy is advancing with heavy infantry, so our priority should be to disrupt their formation and inflict casualties from range before they close the gap. The archers and infantry will pepper the enemy with arrows as they approach, aiming to break their cohesion and lower their morale.\", \"expected_outcome\": \"The enemy's heavy infantry will take significant casualties and lose cohesion before reaching our lines, giving us a tactical advantage when the two forces meet in melee combat.\"}",
|
|
"parsed": {
|
|
"infantry_order": "skirmish",
|
|
"archer_order": "skirmish",
|
|
"reason": "The enemy is advancing with heavy infantry, so our priority should be to disrupt their formation and inflict casualties from range before they close the gap. The archers and infantry will pepper the enemy with arrows as they approach, aiming to break their cohesion and lower their morale.",
|
|
"expected_outcome": "The enemy's heavy infantry will take significant casualties and lose cohesion before reaching our lines, giving us a tactical advantage when the two forces meet in melee combat."
|
|
},
|
|
"valid_json": true,
|
|
"schema_valid": true,
|
|
"strategically_sound": false,
|
|
"latency_ms": 3436.897039413452,
|
|
"error": ""
|
|
},
|
|
{
|
|
"scenario_index": 1,
|
|
"description": "Outnumbered \u2014 retreat decision",
|
|
"raw_response": "{\"order\": \"fallback\", \"direction\": \"east\", \"reason\": \"Our troops are heavily wounded and outmatched by the 60 bandits surrounding us. We must retreat immediately to the east where there is an escape route. Fallback is the best option to preserve our troops and live to fight another day.\", \"priority\": \"preserve_troops\"}",
|
|
"parsed": {
|
|
"order": "fallback",
|
|
"direction": "east",
|
|
"reason": "Our troops are heavily wounded and outmatched by the 60 bandits surrounding us. We must retreat immediately to the east where there is an escape route. Fallback is the best option to preserve our troops and live to fight another day.",
|
|
"priority": "preserve_troops"
|
|
},
|
|
"valid_json": true,
|
|
"schema_valid": true,
|
|
"strategically_sound": true,
|
|
"latency_ms": 2173.5548973083496,
|
|
"error": ""
|
|
},
|
|
{
|
|
"scenario_index": 2,
|
|
"description": "Cavalry flanking opportunity",
|
|
"raw_response": "{\"cavalry_order\": \"flank_left\", \"timing\": \"now\", \"reason\": \"Enemy left flank is unprotected and our cavalry can break their line quickly, routing them in ~30 seconds.\", \"risk\": \"low\"}",
|
|
"parsed": {
|
|
"cavalry_order": "flank_left",
|
|
"timing": "now",
|
|
"reason": "Enemy left flank is unprotected and our cavalry can break their line quickly, routing them in ~30 seconds.",
|
|
"risk": "low"
|
|
},
|
|
"valid_json": true,
|
|
"schema_valid": true,
|
|
"strategically_sound": true,
|
|
"latency_ms": 1566.443920135498,
|
|
"error": ""
|
|
}
|
|
],
|
|
"passed": true,
|
|
"score": 1.0,
|
|
"latency_p50_ms": 2173.5548973083496,
|
|
"latency_p99_ms": 3436.897039413452
|
|
},
|
|
"4": {
|
|
"level": 4,
|
|
"name": "Trade Route (Campaign Navigation)",
|
|
"trials": [
|
|
{
|
|
"scenario_index": 0,
|
|
"description": "Optimal trade route selection",
|
|
"raw_response": "{\"route\": \"A\", \"expected_profit\": 500, \"days_used\": 8, \"reason\": \"Route A offers the highest profit with a manageable travel time and low risk. Selling grain in Vlandia and furs in Sturgia provides a good balance of goods and markets.\", \"risk\": \"low\"}",
|
|
"parsed": {
|
|
"route": "A",
|
|
"expected_profit": 500,
|
|
"days_used": 8,
|
|
"reason": "Route A offers the highest profit with a manageable travel time and low risk. Selling grain in Vlandia and furs in Sturgia provides a good balance of goods and markets.",
|
|
"risk": "low"
|
|
},
|
|
"valid_json": true,
|
|
"schema_valid": true,
|
|
"strategically_sound": true,
|
|
"latency_ms": 2507.443904876709,
|
|
"error": ""
|
|
},
|
|
{
|
|
"scenario_index": 1,
|
|
"description": "Adapt plan when war declared",
|
|
"raw_response": "{\"decision\": \"divert_to_sturgia\", \"reason\": \"Vlandia declaring war makes entering their territory too dangerous for trade. Sturgia is neutral but closer than the Empire, so less time and gold at risk.\", \"gold_at_risk\": 400}",
|
|
"parsed": {
|
|
"decision": "divert_to_sturgia",
|
|
"reason": "Vlandia declaring war makes entering their territory too dangerous for trade. Sturgia is neutral but closer than the Empire, so less time and gold at risk.",
|
|
"gold_at_risk": 400
|
|
},
|
|
"valid_json": true,
|
|
"schema_valid": true,
|
|
"strategically_sound": true,
|
|
"latency_ms": 2011.15083694458,
|
|
"error": ""
|
|
},
|
|
{
|
|
"scenario_index": 2,
|
|
"description": "Multi-stop route planning with constraints",
|
|
"raw_response": "{\n \"stops\": [\n \"Rhotae\",\n \"Ortysia\",\n \"Epicrotea\"\n ],\n \"total_days\": 7,\n \"estimated_profit\": 800,\n \"reason\": \"Starting in Rhotae allows buying cheap leather and selling well in the south. Moving to Ortysia to buy grain cheap. Finally, Epicrotea for iron market. 7 days total. 800 gold budget should be enough with this plan.\"\n}",
|
|
"parsed": {
|
|
"stops": [
|
|
"Rhotae",
|
|
"Ortysia",
|
|
"Epicrotea"
|
|
],
|
|
"total_days": 7,
|
|
"estimated_profit": 800,
|
|
"reason": "Starting in Rhotae allows buying cheap leather and selling well in the south. Moving to Ortysia to buy grain cheap. Finally, Epicrotea for iron market. 7 days total. 800 gold budget should be enough with this plan."
|
|
},
|
|
"valid_json": true,
|
|
"schema_valid": true,
|
|
"strategically_sound": true,
|
|
"latency_ms": 3420.1788902282715,
|
|
"error": ""
|
|
}
|
|
],
|
|
"passed": true,
|
|
"score": 1.0,
|
|
"latency_p50_ms": 2507.443904876709,
|
|
"latency_p99_ms": 3420.1788902282715
|
|
},
|
|
"5": {
|
|
"level": 5,
|
|
"name": "Mini Campaign (Full Campaign Loop)",
|
|
"turns": [
|
|
{
|
|
"turn": 1,
|
|
"state_before": {
|
|
"turn": 1,
|
|
"gold": 1000,
|
|
"troops": 25,
|
|
"renown": 150,
|
|
"settlements": 0,
|
|
"relations": {
|
|
"vlandia": 0,
|
|
"sturgia": 0,
|
|
"empire": 0,
|
|
"aserai": 0,
|
|
"battania": 0
|
|
}
|
|
},
|
|
"raw_response": "{\"action\": \"trade_circuit\", \"target_faction\": null, \"reason\": \"Establishing trade relations will bring in gold and renown without risking retaliation or damaging relations with any faction. This sets a strong financial foundation for future expansion.\", \"goal\": \"Increase gold and renown\"}",
|
|
"parsed": {
|
|
"action": "trade_circuit",
|
|
"target_faction": null,
|
|
"reason": "Establishing trade relations will bring in gold and renown without risking retaliation or damaging relations with any faction. This sets a strong financial foundation for future expansion.",
|
|
"goal": "Increase gold and renown"
|
|
},
|
|
"valid_json": true,
|
|
"valid_action": true,
|
|
"action": "trade_circuit",
|
|
"latency_ms": 2763.1959915161133,
|
|
"error": ""
|
|
},
|
|
{
|
|
"turn": 2,
|
|
"state_before": {
|
|
"turn": 2,
|
|
"gold": 1300,
|
|
"troops": 25,
|
|
"renown": 160,
|
|
"settlements": 0,
|
|
"relations": {
|
|
"vlandia": 0,
|
|
"sturgia": 0,
|
|
"empire": 0,
|
|
"aserai": 0,
|
|
"battania": 0
|
|
}
|
|
},
|
|
"raw_response": "{\"action\": \"tournament\", \"target_faction\": null, \"reason\": \"Participating in a tournament will boost my renown and relations without spending gold. This positive publicity can help attract potential vassals or allies in the future.\", \"goal\": \"Increase renown and relations\"}",
|
|
"parsed": {
|
|
"action": "tournament",
|
|
"target_faction": null,
|
|
"reason": "Participating in a tournament will boost my renown and relations without spending gold. This positive publicity can help attract potential vassals or allies in the future.",
|
|
"goal": "Increase renown and relations"
|
|
},
|
|
"valid_json": true,
|
|
"valid_action": true,
|
|
"action": "tournament",
|
|
"latency_ms": 2335.5319499969482,
|
|
"error": ""
|
|
},
|
|
{
|
|
"turn": 3,
|
|
"state_before": {
|
|
"turn": 3,
|
|
"gold": 1200,
|
|
"troops": 25,
|
|
"renown": 220,
|
|
"settlements": 0,
|
|
"relations": {
|
|
"vlandia": 0,
|
|
"sturgia": 0,
|
|
"empire": 0,
|
|
"aserai": 0,
|
|
"battania": 0
|
|
}
|
|
},
|
|
"raw_response": "{\"action\": \"recruit_troops\", \"target_faction\": null, \"reason\": \"Recruiting more troops will strengthen my military force, which is essential for future expansion and defending my settlements once I have them. This investment in my army will pay off in the long run.\", \"goal\": \"Increase troops\"}",
|
|
"parsed": {
|
|
"action": "recruit_troops",
|
|
"target_faction": null,
|
|
"reason": "Recruiting more troops will strengthen my military force, which is essential for future expansion and defending my settlements once I have them. This investment in my army will pay off in the long run.",
|
|
"goal": "Increase troops"
|
|
},
|
|
"valid_json": true,
|
|
"valid_action": true,
|
|
"action": "recruit_troops",
|
|
"latency_ms": 2570.871114730835,
|
|
"error": ""
|
|
},
|
|
{
|
|
"turn": 4,
|
|
"state_before": {
|
|
"turn": 4,
|
|
"gold": 1000,
|
|
"troops": 40,
|
|
"renown": 220,
|
|
"settlements": 0,
|
|
"relations": {
|
|
"vlandia": 0,
|
|
"sturgia": 0,
|
|
"empire": 0,
|
|
"aserai": 0,
|
|
"battania": 0
|
|
}
|
|
},
|
|
"raw_response": "{\"action\": \"siege_castle\", \"target_faction\": null, \"reason\": \"Capturing a castle will give me my first settlement, putting me closer to winning the game. This physical foothold in Calradia will provide a base of operations and resources to further my ambitions of becoming King.\", \"goal\": \"Gain first settlement\"}",
|
|
"parsed": {
|
|
"action": "siege_castle",
|
|
"target_faction": null,
|
|
"reason": "Capturing a castle will give me my first settlement, putting me closer to winning the game. This physical foothold in Calradia will provide a base of operations and resources to further my ambitions of becoming King.",
|
|
"goal": "Gain first settlement"
|
|
},
|
|
"valid_json": true,
|
|
"valid_action": true,
|
|
"action": "siege_castle",
|
|
"latency_ms": 2367.640972137451,
|
|
"error": ""
|
|
}
|
|
],
|
|
"final_state": {
|
|
"turn": 4,
|
|
"gold": 500,
|
|
"troops": 40,
|
|
"renown": 300,
|
|
"settlements": 1,
|
|
"relations": {
|
|
"vlandia": 0,
|
|
"sturgia": 0,
|
|
"empire": 0,
|
|
"aserai": 0,
|
|
"battania": 0
|
|
}
|
|
},
|
|
"passed": true,
|
|
"score": 1.0,
|
|
"reached_gold_target": false,
|
|
"reached_renown_target": false,
|
|
"reached_settlement_target": true,
|
|
"latency_p50_ms": 2570.871114730835,
|
|
"latency_p99_ms": 2763.1959915161133
|
|
}
|
|
},
|
|
"summary": {
|
|
"total_elapsed_s": 37.7,
|
|
"levels_run": [
|
|
0,
|
|
1,
|
|
2,
|
|
3,
|
|
4,
|
|
5
|
|
],
|
|
"levels_passed": [
|
|
0,
|
|
2,
|
|
3,
|
|
4,
|
|
5
|
|
],
|
|
"levels_failed": [
|
|
1
|
|
],
|
|
"m1_gate_passed": false,
|
|
"m1_gate_notes": [
|
|
"Level 1 FAILED (score=0.50)"
|
|
],
|
|
"m1_latency_threshold_ms": 10000
|
|
}
|
|
} |