Files
Timmy-time-dashboard/timmy-benchmark/results/hermes3_latest_20260323_152900.json
Alexander Whitestone 9e08e87312
Some checks failed
Tests / lint (push) Has been cancelled
Tests / test (push) Has been cancelled
[claude] Bannerlord M0: Run cognitive benchmark on hermes3, fix L1 string-int coercion (#1092) (#1159)
Co-authored-by: Alexander Whitestone <alexpaynex@gmail.com>
Co-committed-by: Alexander Whitestone <alexpaynex@gmail.com>
2026-03-23 19:38:48 +00:00

476 lines
23 KiB
JSON

{
"model": "hermes3:latest",
"timestamp": "2026-03-23T19:28:19.717773+00:00",
"skipped": false,
"levels": {
"0": {
"level": 0,
"name": "JSON Compliance (Coin Flip)",
"trials": [
{
"trial_index": 0,
"prompt": "A coin is flipped. Respond with exactly: {\"choice\": \"heads\"} or {\"choice\": \"tails\"}. Pick one. JSON only.",
"raw_response": "{\"choice\": \"heads\"}",
"parsed": {
"choice": "heads"
},
"valid_json": true,
"schema_valid": true,
"value_valid": true,
"latency_ms": 1028.2011032104492,
"error": ""
},
{
"trial_index": 1,
"prompt": "You must attack or defend. Respond with: {\"action\": \"attack\", \"confidence\": 0.8} or {\"action\": \"defend\", \"confidence\": 0.6}. Replace confidence with your own value 0.0-1.0. JSON only.",
"raw_response": "{\"action\": \"defend\", \"confidence\": 0.7}",
"parsed": {
"action": "defend",
"confidence": 0.7
},
"valid_json": true,
"schema_valid": true,
"value_valid": true,
"latency_ms": 700.0582218170166,
"error": ""
},
{
"trial_index": 2,
"prompt": "Choose a direction to march. Respond with exactly: {\"direction\": \"north\", \"reason\": \"string explaining why\"}. Pick north/south/east/west. JSON only.",
"raw_response": "{\"direction\": \"north\", \"reason\": \"The northern direction allows for the most efficient and direct path towards the goal, minimizing potential obstacles and maximizing progress.\"}",
"parsed": {
"direction": "north",
"reason": "The northern direction allows for the most efficient and direct path towards the goal, minimizing potential obstacles and maximizing progress."
},
"valid_json": true,
"schema_valid": true,
"value_valid": true,
"latency_ms": 1184.3459606170654,
"error": ""
}
],
"passed": true,
"score": 1.0,
"latency_p50_ms": 1028.2011032104492,
"latency_p99_ms": 1184.3459606170654
},
"1": {
"level": 1,
"name": "Board State Tracking (Tic-Tac-Toe)",
"trials": [
{
"scenario_index": 0,
"description": "Empty board \u2014 opening move",
"prompt": "Board state: [null,null,null,null,null,null,null,null,null]. You are X. It is your turn. Choose a move. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
"raw_response": "{\"move\": \"4\", \"reason\": \"Center move to establish dominance\"}",
"parsed": {
"move": "4",
"reason": "Center move to establish dominance"
},
"valid_json": true,
"move_legal": true,
"move_correct": true,
"latency_ms": 932.2080612182617,
"error": ""
},
{
"scenario_index": 1,
"description": "Block opponent's winning move",
"prompt": "Board: [\"O\",null,\"O\",null,\"X\",null,null,null,null]. O has positions 0 and 2. You are X. O will win on next turn unless you block. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
"raw_response": "{\"move\": 4, \"reason\": \"Block the winning move for O in the center, creating a chance for a winning X cross\"}",
"parsed": {
"move": 4,
"reason": "Block the winning move for O in the center, creating a chance for a winning X cross"
},
"valid_json": true,
"move_legal": false,
"move_correct": false,
"latency_ms": 1165.644884109497,
"error": ""
},
{
"scenario_index": 2,
"description": "Take winning move",
"prompt": "Board: [\"X\",null,\"X\",null,\"O\",null,null,\"O\",null]. You are X. You have positions 0 and 2. You can win this turn. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
"raw_response": "{\"move\": 0, \"reason\": \"I can win this turn by placing my X in position 0, creating a winning line of three X's horizontally.\"}",
"parsed": {
"move": 0,
"reason": "I can win this turn by placing my X in position 0, creating a winning line of three X's horizontally."
},
"valid_json": true,
"move_legal": false,
"move_correct": false,
"latency_ms": 1302.915096282959,
"error": ""
},
{
"scenario_index": 3,
"description": "Legal move on partially filled board",
"prompt": "Board: [\"X\",\"O\",\"X\",\"O\",\"X\",\"O\",null,null,null]. You are O. Choose a legal move (positions 6, 7, or 8 are available). Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
"raw_response": "{\"move\": 6, \"reason\": \"Block X's potential win in the center.\"}",
"parsed": {
"move": 6,
"reason": "Block X's potential win in the center."
},
"valid_json": true,
"move_legal": true,
"move_correct": true,
"latency_ms": 851.456880569458,
"error": ""
}
],
"passed": false,
"score": 0.5,
"latency_p50_ms": 1165.644884109497,
"latency_p99_ms": 1302.915096282959
},
"2": {
"level": 2,
"name": "Resource Management (Party Economy)",
"trials": [
{
"scenario_index": 0,
"description": "Budget allocation under constraint",
"raw_response": "{\"choices\": [\"buy_food\", \"repair_armor\"], \"gold_spent\": 350, \"reason\": \"Buying food is crucial to prevent desertion, and repairing armor will help reduce casualties. Recruit infantry is put on hold due to limited resources.\"}",
"parsed": {
"choices": [
"buy_food",
"repair_armor"
],
"gold_spent": 350,
"reason": "Buying food is crucial to prevent desertion, and repairing armor will help reduce casualties. Recruit infantry is put on hold due to limited resources."
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 2031.9790840148926,
"error": ""
},
{
"scenario_index": 1,
"description": "Troop tier upgrade decision",
"raw_response": "{\"action\": \"save_gold\", \"reason\": \"Upgrading recruits now would leave little gold for emergencies, especially with the upcoming raiders camp attack. It's better to maintain the current troop composition and save gold for potential future threats or opportunities.\", \"expected_outcome\": \"Having enough gold to respond to unexpected situations without needing to dismiss or downgrade troops.\"}",
"parsed": {
"action": "save_gold",
"reason": "Upgrading recruits now would leave little gold for emergencies, especially with the upcoming raiders camp attack. It's better to maintain the current troop composition and save gold for potential future threats or opportunities.",
"expected_outcome": "Having enough gold to respond to unexpected situations without needing to dismiss or downgrade troops."
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 2231.9281101226807,
"error": ""
},
{
"scenario_index": 2,
"description": "Multi-turn planning horizon",
"raw_response": "{\"extra_food_days\": 5, \"cost\": 50, \"remaining_gold\": 250, \"reason\": \"Buying 5 extra food days will ensure we have enough food to cross the desert and reach the town on day 10, while still having enough gold left for potential expenses.\"}",
"parsed": {
"extra_food_days": 5,
"cost": 50,
"remaining_gold": 250,
"reason": "Buying 5 extra food days will ensure we have enough food to cross the desert and reach the town on day 10, while still having enough gold left for potential expenses."
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 1943.3560371398926,
"error": ""
}
],
"passed": true,
"score": 1.0,
"latency_p50_ms": 2031.9790840148926,
"latency_p99_ms": 2231.9281101226807
},
"3": {
"level": 3,
"name": "Battle Tactics (Formation Commands)",
"trials": [
{
"scenario_index": 0,
"description": "Ranged vs infantry \u2014 defensive opening",
"raw_response": "{\"infantry_order\": \"skirmish\", \"archer_order\": \"skirmish\", \"reason\": \"The enemy heavy infantry is advancing, so we should use our archers and infantry to pepper them with ranged attacks before they close to melee range. This will soften them up and reduce their combat effectiveness when our forces meet.\", \"expected_outcome\": \"By the time our forces engage in melee, the enemy heavy infantry will have suffered significant casualties from the ranged attacks, giving us a tactical advantage and maximizing the damage we can inflict before contact is made.\"}",
"parsed": {
"infantry_order": "skirmish",
"archer_order": "skirmish",
"reason": "The enemy heavy infantry is advancing, so we should use our archers and infantry to pepper them with ranged attacks before they close to melee range. This will soften them up and reduce their combat effectiveness when our forces meet.",
"expected_outcome": "By the time our forces engage in melee, the enemy heavy infantry will have suffered significant casualties from the ranged attacks, giving us a tactical advantage and maximizing the damage we can inflict before contact is made."
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": false,
"latency_ms": 3524.5888233184814,
"error": ""
},
{
"scenario_index": 1,
"description": "Outnumbered \u2014 retreat decision",
"raw_response": "{\"order\": \"fallback\", \"direction\": \"east\", \"reason\": \"Outnumbered and with many wounded, falling back to the escape route is the best option to preserve our troops.\", \"priority\": \"preserve_troops\"}",
"parsed": {
"order": "fallback",
"direction": "east",
"reason": "Outnumbered and with many wounded, falling back to the escape route is the best option to preserve our troops.",
"priority": "preserve_troops"
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 1523.0557918548584,
"error": ""
},
{
"scenario_index": 2,
"description": "Cavalry flanking opportunity",
"raw_response": "{\"cavalry_order\": \"flank_left\", \"timing\": \"now\", \"reason\": \"Enemy left flank is unprotected and engaging our main force. A swift cavalry charge will catch them off guard and route them in ~30 seconds.\", \"risk\": \"low\"}",
"parsed": {
"cavalry_order": "flank_left",
"timing": "now",
"reason": "Enemy left flank is unprotected and engaging our main force. A swift cavalry charge will catch them off guard and route them in ~30 seconds.",
"risk": "low"
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 1614.2308712005615,
"error": ""
}
],
"passed": true,
"score": 1.0,
"latency_p50_ms": 1614.2308712005615,
"latency_p99_ms": 3524.5888233184814
},
"4": {
"level": 4,
"name": "Trade Route (Campaign Navigation)",
"trials": [
{
"scenario_index": 0,
"description": "Optimal trade route selection",
"raw_response": "{\"route\": \"A\", \"expected_profit\": 500, \"days_used\": 8, \"reason\": \"Route A offers the highest profit with a manageable travel time and low risk. Selling grain in Vlandia and furs in Sturgia provides a good balance of profitability and travel distance.\", \"risk\": \"low\"}",
"parsed": {
"route": "A",
"expected_profit": 500,
"days_used": 8,
"reason": "Route A offers the highest profit with a manageable travel time and low risk. Selling grain in Vlandia and furs in Sturgia provides a good balance of profitability and travel distance.",
"risk": "low"
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 2335.57391166687,
"error": ""
},
{
"scenario_index": 1,
"description": "Adapt plan when war declared",
"raw_response": "{\"decision\": \"divert_to_sturgia\", \"reason\": \"Given the new information that Vlandia has declared war on our faction, entering their territory would be too dangerous. Sturgia is a neutral but friendly region, so it should be safe to trade there. The Empire is allied but further away, and while safe, the extra travel time could reduce the profitability of the trade mission.\", \"gold_at_risk\": 400}",
"parsed": {
"decision": "divert_to_sturgia",
"reason": "Given the new information that Vlandia has declared war on our faction, entering their territory would be too dangerous. Sturgia is a neutral but friendly region, so it should be safe to trade there. The Empire is allied but further away, and while safe, the extra travel time could reduce the profitability of the trade mission.",
"gold_at_risk": 400
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 2723.7470149993896,
"error": ""
},
{
"scenario_index": 2,
"description": "Multi-stop route planning with constraints",
"raw_response": "{\n \"stops\": [\n \"Rhotae\",\n \"Ortysia\",\n \"Epicrotea\"\n ],\n \"total_days\": 7,\n \"estimated_profit\": 1200,\n \"reason\": \"Starting in Rhotae allows buying cheap leather and selling well in the south. Moving to Ortysia to buy grain cheap. Finally, Epicrotea for iron market. Total 7 days, 800 gold budget sufficient. Expected profit of 1200 gold.\"\n}",
"parsed": {
"stops": [
"Rhotae",
"Ortysia",
"Epicrotea"
],
"total_days": 7,
"estimated_profit": 1200,
"reason": "Starting in Rhotae allows buying cheap leather and selling well in the south. Moving to Ortysia to buy grain cheap. Finally, Epicrotea for iron market. Total 7 days, 800 gold budget sufficient. Expected profit of 1200 gold."
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 3038.438320159912,
"error": ""
}
],
"passed": true,
"score": 1.0,
"latency_p50_ms": 2723.7470149993896,
"latency_p99_ms": 3038.438320159912
},
"5": {
"level": 5,
"name": "Mini Campaign (Full Campaign Loop)",
"turns": [
{
"turn": 1,
"state_before": {
"turn": 1,
"gold": 1000,
"troops": 25,
"renown": 150,
"settlements": 0,
"relations": {
"vlandia": 0,
"sturgia": 0,
"empire": 0,
"aserai": 0,
"battania": 0
}
},
"raw_response": "{\"action\": \"trade_circuit\", \"target_faction\": null, \"reason\": \"Establishing trade relations will bring in gold without risking relations or retaliation. This sets a foundation for economic growth and allows me to assess other factions before committing to any alliances or conflicts.\", \"goal\": \"Increase gold reserves and build a power base through trade and diplomacy\"}",
"parsed": {
"action": "trade_circuit",
"target_faction": null,
"reason": "Establishing trade relations will bring in gold without risking relations or retaliation. This sets a foundation for economic growth and allows me to assess other factions before committing to any alliances or conflicts.",
"goal": "Increase gold reserves and build a power base through trade and diplomacy"
},
"valid_json": true,
"valid_action": true,
"action": "trade_circuit",
"latency_ms": 2898.583173751831,
"error": ""
},
{
"turn": 2,
"state_before": {
"turn": 2,
"gold": 1300,
"troops": 25,
"renown": 160,
"settlements": 0,
"relations": {
"vlandia": 0,
"sturgia": 0,
"empire": 0,
"aserai": 0,
"battania": 0
}
},
"raw_response": "{\"action\": \"raid_village\", \"target_faction\": null, \"reason\": \"Raiding a village will provide a quick boost to my gold reserves and renown. This will also help me gauge the strength of my troops and the potential resistance from the targeted faction. It's a bold move that shows my power and willingness to act, which may intimidate other factions and open up opportunities for future alliances or conflicts.\", \"goal\": \"Increase gold and renown, assess troop strength and potential faction responses\"}",
"parsed": {
"action": "raid_village",
"target_faction": null,
"reason": "Raiding a village will provide a quick boost to my gold reserves and renown. This will also help me gauge the strength of my troops and the potential resistance from the targeted faction. It's a bold move that shows my power and willingness to act, which may intimidate other factions and open up opportunities for future alliances or conflicts.",
"goal": "Increase gold and renown, assess troop strength and potential faction responses"
},
"valid_json": true,
"valid_action": true,
"action": "raid_village",
"latency_ms": 3112.649917602539,
"error": ""
},
{
"turn": 3,
"state_before": {
"turn": 3,
"gold": 1500,
"troops": 25,
"renown": 190,
"settlements": 0,
"relations": {
"vlandia": 0,
"sturgia": 0,
"empire": 0,
"aserai": 0,
"battania": 0
}
},
"raw_response": "{\"action\": \"recruit_troops\", \"target_faction\": null, \"reason\": \"Recruiting more troops is essential for expanding my military strength and increasing my chances of successfully sieging a castle in the next turn. This will also help me prepare for potential conflicts with other factions and solidify my power base. Strengthening my army is a crucial step towards achieving my goal of becoming the King of Calradia.\", \"goal\": \"Increase troop count for future military expansion and defense\"}",
"parsed": {
"action": "recruit_troops",
"target_faction": null,
"reason": "Recruiting more troops is essential for expanding my military strength and increasing my chances of successfully sieging a castle in the next turn. This will also help me prepare for potential conflicts with other factions and solidify my power base. Strengthening my army is a crucial step towards achieving my goal of becoming the King of Calradia.",
"goal": "Increase troop count for future military expansion and defense"
},
"valid_json": true,
"valid_action": true,
"action": "recruit_troops",
"latency_ms": 3136.754035949707,
"error": ""
},
{
"turn": 4,
"state_before": {
"turn": 4,
"gold": 1300,
"troops": 40,
"renown": 190,
"settlements": 0,
"relations": {
"vlandia": 0,
"sturgia": 0,
"empire": 0,
"aserai": 0,
"battania": 0
}
},
"raw_response": "{\"action\": \"siege_castle\", \"target_faction\": null, \"reason\": \"Sieging a castle will allow me to establish a settlement, which is crucial for meeting one of my win conditions. This will also demonstrate my military might and potentially intimidate other factions, opening up opportunities for future alliances or conflicts. While it comes with a risk of losing troops morale and straining relations, it's a necessary step towards achieving my goal of becoming the King of Calradia.\", \"goal\": \"Establish a settlement and demonstrate military strength to intimidate other factions\"}",
"parsed": {
"action": "siege_castle",
"target_faction": null,
"reason": "Sieging a castle will allow me to establish a settlement, which is crucial for meeting one of my win conditions. This will also demonstrate my military might and potentially intimidate other factions, opening up opportunities for future alliances or conflicts. While it comes with a risk of losing troops morale and straining relations, it's a necessary step towards achieving my goal of becoming the King of Calradia.",
"goal": "Establish a settlement and demonstrate military strength to intimidate other factions"
},
"valid_json": true,
"valid_action": true,
"action": "siege_castle",
"latency_ms": 3433.211326599121,
"error": ""
}
],
"final_state": {
"turn": 4,
"gold": 800,
"troops": 40,
"renown": 270,
"settlements": 1,
"relations": {
"vlandia": 0,
"sturgia": 0,
"empire": 0,
"aserai": 0,
"battania": 0
}
},
"passed": true,
"score": 1.0,
"reached_gold_target": false,
"reached_renown_target": false,
"reached_settlement_target": true,
"latency_p50_ms": 3136.754035949707,
"latency_p99_ms": 3433.211326599121
}
},
"summary": {
"total_elapsed_s": 40.7,
"levels_run": [
0,
1,
2,
3,
4,
5
],
"levels_passed": [
0,
2,
3,
4,
5
],
"levels_failed": [
1
],
"m1_gate_passed": false,
"m1_gate_notes": [
"Level 1 FAILED (score=0.50)"
],
"m1_latency_threshold_ms": 10000
}
}