forked from Rockachopa/Timmy-time-dashboard
[claude] Bannerlord M0: Run cognitive benchmark on hermes3, fix L1 string-int coercion (#1092) (#1159)
Co-authored-by: Alexander Whitestone <alexpaynex@gmail.com> Co-committed-by: Alexander Whitestone <alexpaynex@gmail.com>
This commit is contained in:
committed by
rockachopa
parent
6e65b53f3a
commit
9e08e87312
476
timmy-benchmark/results/qwen2.5_14b_20260323_142119.json
Normal file
476
timmy-benchmark/results/qwen2.5_14b_20260323_142119.json
Normal file
@@ -0,0 +1,476 @@
|
||||
{
|
||||
"model": "qwen2.5:14b",
|
||||
"timestamp": "2026-03-23T18:20:27.095190+00:00",
|
||||
"skipped": false,
|
||||
"levels": {
|
||||
"0": {
|
||||
"level": 0,
|
||||
"name": "JSON Compliance (Coin Flip)",
|
||||
"trials": [
|
||||
{
|
||||
"trial_index": 0,
|
||||
"prompt": "A coin is flipped. Respond with exactly: {\"choice\": \"heads\"} or {\"choice\": \"tails\"}. Pick one. JSON only.",
|
||||
"raw_response": "{\"choice\": \"heads\"}",
|
||||
"parsed": {
|
||||
"choice": "heads"
|
||||
},
|
||||
"valid_json": true,
|
||||
"schema_valid": true,
|
||||
"value_valid": true,
|
||||
"latency_ms": 6347.5000858306885,
|
||||
"error": ""
|
||||
},
|
||||
{
|
||||
"trial_index": 1,
|
||||
"prompt": "You must attack or defend. Respond with: {\"action\": \"attack\", \"confidence\": 0.8} or {\"action\": \"defend\", \"confidence\": 0.6}. Replace confidence with your own value 0.0-1.0. JSON only.",
|
||||
"raw_response": "{\"action\": \"defend\", \"confidence\": 0.6}",
|
||||
"parsed": {
|
||||
"action": "defend",
|
||||
"confidence": 0.6
|
||||
},
|
||||
"valid_json": true,
|
||||
"schema_valid": true,
|
||||
"value_valid": true,
|
||||
"latency_ms": 850.42405128479,
|
||||
"error": ""
|
||||
},
|
||||
{
|
||||
"trial_index": 2,
|
||||
"prompt": "Choose a direction to march. Respond with exactly: {\"direction\": \"north\", \"reason\": \"string explaining why\"}. Pick north/south/east/west. JSON only.",
|
||||
"raw_response": "{\"direction\": \"north\", \"reason\": \"There is a reported enemy encampment to the north that we need to engage.\"}",
|
||||
"parsed": {
|
||||
"direction": "north",
|
||||
"reason": "There is a reported enemy encampment to the north that we need to engage."
|
||||
},
|
||||
"valid_json": true,
|
||||
"schema_valid": true,
|
||||
"value_valid": true,
|
||||
"latency_ms": 1443.0510997772217,
|
||||
"error": ""
|
||||
}
|
||||
],
|
||||
"passed": true,
|
||||
"score": 1.0,
|
||||
"latency_p50_ms": 1443.0510997772217,
|
||||
"latency_p99_ms": 6347.5000858306885
|
||||
},
|
||||
"1": {
|
||||
"level": 1,
|
||||
"name": "Board State Tracking (Tic-Tac-Toe)",
|
||||
"trials": [
|
||||
{
|
||||
"scenario_index": 0,
|
||||
"description": "Empty board \u2014 opening move",
|
||||
"prompt": "Board state: [null,null,null,null,null,null,null,null,null]. You are X. It is your turn. Choose a move. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
|
||||
"raw_response": "{\"move\": 4, \"reason\": \"Control the center\"}",
|
||||
"parsed": {
|
||||
"move": 4,
|
||||
"reason": "Control the center"
|
||||
},
|
||||
"valid_json": true,
|
||||
"move_legal": true,
|
||||
"move_correct": true,
|
||||
"latency_ms": 1184.3791007995605,
|
||||
"error": ""
|
||||
},
|
||||
{
|
||||
"scenario_index": 1,
|
||||
"description": "Block opponent's winning move",
|
||||
"prompt": "Board: [\"O\",null,\"O\",null,\"X\",null,null,null,null]. O has positions 0 and 2. You are X. O will win on next turn unless you block. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
|
||||
"raw_response": "{\"move\": 4, \"reason\": \"Block O's winning move\"}",
|
||||
"parsed": {
|
||||
"move": 4,
|
||||
"reason": "Block O's winning move"
|
||||
},
|
||||
"valid_json": true,
|
||||
"move_legal": false,
|
||||
"move_correct": false,
|
||||
"latency_ms": 942.816972732544,
|
||||
"error": ""
|
||||
},
|
||||
{
|
||||
"scenario_index": 2,
|
||||
"description": "Take winning move",
|
||||
"prompt": "Board: [\"X\",null,\"X\",null,\"O\",null,null,\"O\",null]. You are X. You have positions 0 and 2. You can win this turn. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
|
||||
"raw_response": "{\"move\": 2, \"reason\": \"Winning move\"}",
|
||||
"parsed": {
|
||||
"move": 2,
|
||||
"reason": "Winning move"
|
||||
},
|
||||
"valid_json": true,
|
||||
"move_legal": false,
|
||||
"move_correct": false,
|
||||
"latency_ms": 849.2491245269775,
|
||||
"error": ""
|
||||
},
|
||||
{
|
||||
"scenario_index": 3,
|
||||
"description": "Legal move on partially filled board",
|
||||
"prompt": "Board: [\"X\",\"O\",\"X\",\"O\",\"X\",\"O\",null,null,null]. You are O. Choose a legal move (positions 6, 7, or 8 are available). Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
|
||||
"raw_response": "{\"move\": 7, \"reason\": \"Creating a fork opportunity\"}",
|
||||
"parsed": {
|
||||
"move": 7,
|
||||
"reason": "Creating a fork opportunity"
|
||||
},
|
||||
"valid_json": true,
|
||||
"move_legal": true,
|
||||
"move_correct": true,
|
||||
"latency_ms": 885.8909606933594,
|
||||
"error": ""
|
||||
}
|
||||
],
|
||||
"passed": false,
|
||||
"score": 0.5,
|
||||
"latency_p50_ms": 942.816972732544,
|
||||
"latency_p99_ms": 1184.3791007995605
|
||||
},
|
||||
"2": {
|
||||
"level": 2,
|
||||
"name": "Resource Management (Party Economy)",
|
||||
"trials": [
|
||||
{
|
||||
"scenario_index": 0,
|
||||
"description": "Budget allocation under constraint",
|
||||
"raw_response": "{\"choices\": [\"buy_food\", \"repair_armor\"], \"gold_spent\": 350, \"reason\": \"Morale is critical, so buying food is the priority to prevent desertion. Repairing armor is also crucial to reduce casualties in the upcoming battles.\"}",
|
||||
"parsed": {
|
||||
"choices": [
|
||||
"buy_food",
|
||||
"repair_armor"
|
||||
],
|
||||
"gold_spent": 350,
|
||||
"reason": "Morale is critical, so buying food is the priority to prevent desertion. Repairing armor is also crucial to reduce casualties in the upcoming battles."
|
||||
},
|
||||
"valid_json": true,
|
||||
"schema_valid": true,
|
||||
"strategically_sound": true,
|
||||
"latency_ms": 3121.7379570007324,
|
||||
"error": ""
|
||||
},
|
||||
{
|
||||
"scenario_index": 1,
|
||||
"description": "Troop tier upgrade decision",
|
||||
"raw_response": "{\"action\": \"upgrade_recruits\", \"reason\": \"Improving the strength of the weakest troops can balance the party's overall combat effectiveness.\", \"expected_outcome\": \"Enhanced combat capability against the raider camp with a mix of Tier-2 and Tier-3 soldiers.\"}",
|
||||
"parsed": {
|
||||
"action": "upgrade_recruits",
|
||||
"reason": "Improving the strength of the weakest troops can balance the party's overall combat effectiveness.",
|
||||
"expected_outcome": "Enhanced combat capability against the raider camp with a mix of Tier-2 and Tier-3 soldiers."
|
||||
},
|
||||
"valid_json": true,
|
||||
"schema_valid": true,
|
||||
"strategically_sound": true,
|
||||
"latency_ms": 2936.119794845581,
|
||||
"error": ""
|
||||
},
|
||||
{
|
||||
"scenario_index": 2,
|
||||
"description": "Multi-turn planning horizon",
|
||||
"raw_response": "{\"extra_food_days\": 5, \"cost\": 50, \"remaining_gold\": 250, \"reason\": \"To ensure survival, buy enough food to cover the 15-day requirement, spending the minimum necessary now.\"}",
|
||||
"parsed": {
|
||||
"extra_food_days": 5,
|
||||
"cost": 50,
|
||||
"remaining_gold": 250,
|
||||
"reason": "To ensure survival, buy enough food to cover the 15-day requirement, spending the minimum necessary now."
|
||||
},
|
||||
"valid_json": true,
|
||||
"schema_valid": true,
|
||||
"strategically_sound": true,
|
||||
"latency_ms": 2515.505790710449,
|
||||
"error": ""
|
||||
}
|
||||
],
|
||||
"passed": true,
|
||||
"score": 1.0,
|
||||
"latency_p50_ms": 2936.119794845581,
|
||||
"latency_p99_ms": 3121.7379570007324
|
||||
},
|
||||
"3": {
|
||||
"level": 3,
|
||||
"name": "Battle Tactics (Formation Commands)",
|
||||
"trials": [
|
||||
{
|
||||
"scenario_index": 0,
|
||||
"description": "Ranged vs infantry \u2014 defensive opening",
|
||||
"raw_response": "{\"infantry_order\": \"shield_wall\", \"archer_order\": \"skirmish\", \"reason\": \"Infantry should prepare for incoming melee by forming a defensive posture. Archers should engage enemy from a distance to maximize casualties before close combat.\", \"expected_outcome\": \"Inflict significant casualties on the enemy heavy infantry while minimizing losses until melee contact.\"}",
|
||||
"parsed": {
|
||||
"infantry_order": "shield_wall",
|
||||
"archer_order": "skirmish",
|
||||
"reason": "Infantry should prepare for incoming melee by forming a defensive posture. Archers should engage enemy from a distance to maximize casualties before close combat.",
|
||||
"expected_outcome": "Inflict significant casualties on the enemy heavy infantry while minimizing losses until melee contact."
|
||||
},
|
||||
"valid_json": true,
|
||||
"schema_valid": true,
|
||||
"strategically_sound": true,
|
||||
"latency_ms": 3828.04012298584,
|
||||
"error": ""
|
||||
},
|
||||
{
|
||||
"scenario_index": 1,
|
||||
"description": "Outnumbered \u2014 retreat decision",
|
||||
"raw_response": "{\"order\": \"skirmish\", \"direction\": \"east\", \"reason\": \"Engage the enemy at range to buy time for an organized retreat.\", \"priority\": \"preserve_troops\"}",
|
||||
"parsed": {
|
||||
"order": "skirmish",
|
||||
"direction": "east",
|
||||
"reason": "Engage the enemy at range to buy time for an organized retreat.",
|
||||
"priority": "preserve_troops"
|
||||
},
|
||||
"valid_json": true,
|
||||
"schema_valid": true,
|
||||
"strategically_sound": false,
|
||||
"latency_ms": 2162.7819538116455,
|
||||
"error": ""
|
||||
},
|
||||
{
|
||||
"scenario_index": 2,
|
||||
"description": "Cavalry flanking opportunity",
|
||||
"raw_response": "{\"cavalry_order\": \"flank_right\", \"timing\": \"now\", \"reason\": \"Enemy left flank is unprotected and vulnerable, providing an opportunity to break their line quickly.\", \"risk\": \"low\"}",
|
||||
"parsed": {
|
||||
"cavalry_order": "flank_right",
|
||||
"timing": "now",
|
||||
"reason": "Enemy left flank is unprotected and vulnerable, providing an opportunity to break their line quickly.",
|
||||
"risk": "low"
|
||||
},
|
||||
"valid_json": true,
|
||||
"schema_valid": true,
|
||||
"strategically_sound": true,
|
||||
"latency_ms": 2248.1229305267334,
|
||||
"error": ""
|
||||
}
|
||||
],
|
||||
"passed": true,
|
||||
"score": 1.0,
|
||||
"latency_p50_ms": 2248.1229305267334,
|
||||
"latency_p99_ms": 3828.04012298584
|
||||
},
|
||||
"4": {
|
||||
"level": 4,
|
||||
"name": "Trade Route (Campaign Navigation)",
|
||||
"trials": [
|
||||
{
|
||||
"scenario_index": 0,
|
||||
"description": "Optimal trade route selection",
|
||||
"raw_response": "{\"route\": \"A\", \"expected_profit\": 500, \"days_used\": 8, \"reason\": \"Offers the highest profit with a reasonable travel time.\", \"risk\": \"medium\"}",
|
||||
"parsed": {
|
||||
"route": "A",
|
||||
"expected_profit": 500,
|
||||
"days_used": 8,
|
||||
"reason": "Offers the highest profit with a reasonable travel time.",
|
||||
"risk": "medium"
|
||||
},
|
||||
"valid_json": true,
|
||||
"schema_valid": true,
|
||||
"strategically_sound": true,
|
||||
"latency_ms": 3142.971992492676,
|
||||
"error": ""
|
||||
},
|
||||
{
|
||||
"scenario_index": 1,
|
||||
"description": "Adapt plan when war declared",
|
||||
"raw_response": "{\"decision\": \"divert_to_sturgia\", \"reason\": \"Vlandia is at war with my faction, making the journey dangerous. Sturgia is neutral and safer, though it's slightly further. The Empire is allied and safest but further still.\", \"gold_at_risk\": 400}",
|
||||
"parsed": {
|
||||
"decision": "divert_to_sturgia",
|
||||
"reason": "Vlandia is at war with my faction, making the journey dangerous. Sturgia is neutral and safer, though it's slightly further. The Empire is allied and safest but further still.",
|
||||
"gold_at_risk": 400
|
||||
},
|
||||
"valid_json": true,
|
||||
"schema_valid": true,
|
||||
"strategically_sound": true,
|
||||
"latency_ms": 3317.8718090057373,
|
||||
"error": ""
|
||||
},
|
||||
{
|
||||
"scenario_index": 2,
|
||||
"description": "Multi-stop route planning with constraints",
|
||||
"raw_response": "{\"stops\": [\"Ortysia\", \"Epicrotea\", \"Rhotae\"], \"total_days\": 12, \"estimated_profit\": 1200, \"reason\": \"Optimized for time and profit, focusing on essential goods with high demand and low risk.\"}",
|
||||
"parsed": {
|
||||
"stops": [
|
||||
"Ortysia",
|
||||
"Epicrotea",
|
||||
"Rhotae"
|
||||
],
|
||||
"total_days": 12,
|
||||
"estimated_profit": 1200,
|
||||
"reason": "Optimized for time and profit, focusing on essential goods with high demand and low risk."
|
||||
},
|
||||
"valid_json": true,
|
||||
"schema_valid": true,
|
||||
"strategically_sound": true,
|
||||
"latency_ms": 3235.442876815796,
|
||||
"error": ""
|
||||
}
|
||||
],
|
||||
"passed": true,
|
||||
"score": 1.0,
|
||||
"latency_p50_ms": 3235.442876815796,
|
||||
"latency_p99_ms": 3317.8718090057373
|
||||
},
|
||||
"5": {
|
||||
"level": 5,
|
||||
"name": "Mini Campaign (Full Campaign Loop)",
|
||||
"turns": [
|
||||
{
|
||||
"turn": 1,
|
||||
"state_before": {
|
||||
"turn": 1,
|
||||
"gold": 1000,
|
||||
"troops": 25,
|
||||
"renown": 150,
|
||||
"settlements": 0,
|
||||
"relations": {
|
||||
"vlandia": 0,
|
||||
"sturgia": 0,
|
||||
"empire": -80,
|
||||
"aserai": 0,
|
||||
"battania": 0
|
||||
}
|
||||
},
|
||||
"raw_response": "{\"action\": \"trade_circuit\", \"target_faction\": null, \"reason\": \"Increase gold and renown without risking relations or encountering retaliation.\", \"goal\": \"Economic stability and reputation building\"}",
|
||||
"parsed": {
|
||||
"action": "trade_circuit",
|
||||
"target_faction": null,
|
||||
"reason": "Increase gold and renown without risking relations or encountering retaliation.",
|
||||
"goal": "Economic stability and reputation building"
|
||||
},
|
||||
"valid_json": true,
|
||||
"valid_action": true,
|
||||
"action": "trade_circuit",
|
||||
"latency_ms": 3970.310926437378,
|
||||
"error": ""
|
||||
},
|
||||
{
|
||||
"turn": 2,
|
||||
"state_before": {
|
||||
"turn": 2,
|
||||
"gold": 1300,
|
||||
"troops": 25,
|
||||
"renown": 160,
|
||||
"settlements": 0,
|
||||
"relations": {
|
||||
"vlandia": 0,
|
||||
"sturgia": 0,
|
||||
"empire": -80,
|
||||
"aserai": 0,
|
||||
"battania": 0
|
||||
}
|
||||
},
|
||||
"raw_response": "{\"action\": \"escort_caravan\", \"target_faction\": \"empire\", \"reason\": \"Improve relations with a major faction and gain additional renown.\", \"goal\": \"Political alliance and reputation advancement\"}",
|
||||
"parsed": {
|
||||
"action": "escort_caravan",
|
||||
"target_faction": "empire",
|
||||
"reason": "Improve relations with a major faction and gain additional renown.",
|
||||
"goal": "Political alliance and reputation advancement"
|
||||
},
|
||||
"valid_json": true,
|
||||
"valid_action": true,
|
||||
"action": "escort_caravan",
|
||||
"latency_ms": 2754.3041706085205,
|
||||
"error": ""
|
||||
},
|
||||
{
|
||||
"turn": 3,
|
||||
"state_before": {
|
||||
"turn": 3,
|
||||
"gold": 1450,
|
||||
"troops": 25,
|
||||
"renown": 180,
|
||||
"settlements": 0,
|
||||
"relations": {
|
||||
"vlandia": 0,
|
||||
"sturgia": 0,
|
||||
"empire": -80,
|
||||
"aserai": 0,
|
||||
"battania": 0
|
||||
}
|
||||
},
|
||||
"raw_response": "{\"action\": \"recruit_troops\", \"target_faction\": null, \"reason\": \"Increase military strength to prepare for a potential siege or defense.\", \"goal\": \"Military readiness and expansion capability\"}",
|
||||
"parsed": {
|
||||
"action": "recruit_troops",
|
||||
"target_faction": null,
|
||||
"reason": "Increase military strength to prepare for a potential siege or defense.",
|
||||
"goal": "Military readiness and expansion capability"
|
||||
},
|
||||
"valid_json": true,
|
||||
"valid_action": true,
|
||||
"action": "recruit_troops",
|
||||
"latency_ms": 2931.4141273498535,
|
||||
"error": ""
|
||||
},
|
||||
{
|
||||
"turn": 4,
|
||||
"state_before": {
|
||||
"turn": 4,
|
||||
"gold": 1250,
|
||||
"troops": 40,
|
||||
"renown": 180,
|
||||
"settlements": 0,
|
||||
"relations": {
|
||||
"vlandia": 0,
|
||||
"sturgia": 0,
|
||||
"empire": -80,
|
||||
"aserai": 0,
|
||||
"battania": 0
|
||||
}
|
||||
},
|
||||
"raw_response": "{\"action\": \"siege_castle\", \"target_faction\": \"empire\", \"reason\": \"Attempt to capture a settlement to meet the win condition of owning at least one settlement.\", \"goal\": \"Territorial expansion and victory condition\"}",
|
||||
"parsed": {
|
||||
"action": "siege_castle",
|
||||
"target_faction": "empire",
|
||||
"reason": "Attempt to capture a settlement to meet the win condition of owning at least one settlement.",
|
||||
"goal": "Territorial expansion and victory condition"
|
||||
},
|
||||
"valid_json": true,
|
||||
"valid_action": true,
|
||||
"action": "siege_castle",
|
||||
"latency_ms": 3413.828134536743,
|
||||
"error": ""
|
||||
}
|
||||
],
|
||||
"final_state": {
|
||||
"turn": 4,
|
||||
"gold": 750,
|
||||
"troops": 40,
|
||||
"renown": 260,
|
||||
"settlements": 1,
|
||||
"relations": {
|
||||
"vlandia": 0,
|
||||
"sturgia": 0,
|
||||
"empire": -80,
|
||||
"aserai": 0,
|
||||
"battania": 0
|
||||
}
|
||||
},
|
||||
"passed": true,
|
||||
"score": 1.0,
|
||||
"reached_gold_target": false,
|
||||
"reached_renown_target": false,
|
||||
"reached_settlement_target": true,
|
||||
"latency_p50_ms": 3413.828134536743,
|
||||
"latency_p99_ms": 3970.310926437378
|
||||
}
|
||||
},
|
||||
"summary": {
|
||||
"total_elapsed_s": 52.1,
|
||||
"levels_run": [
|
||||
0,
|
||||
1,
|
||||
2,
|
||||
3,
|
||||
4,
|
||||
5
|
||||
],
|
||||
"levels_passed": [
|
||||
0,
|
||||
2,
|
||||
3,
|
||||
4,
|
||||
5
|
||||
],
|
||||
"levels_failed": [
|
||||
1
|
||||
],
|
||||
"m1_gate_passed": false,
|
||||
"m1_gate_notes": [
|
||||
"Level 1 FAILED (score=0.50)"
|
||||
],
|
||||
"m1_latency_threshold_ms": 10000
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user