Co-authored-by: Alexander Whitestone <alexpaynex@gmail.com> Co-committed-by: Alexander Whitestone <alexpaynex@gmail.com>
476 lines
18 KiB
JSON
476 lines
18 KiB
JSON
{
|
|
"model": "qwen2.5:14b",
|
|
"timestamp": "2026-03-23T18:20:27.095190+00:00",
|
|
"skipped": false,
|
|
"levels": {
|
|
"0": {
|
|
"level": 0,
|
|
"name": "JSON Compliance (Coin Flip)",
|
|
"trials": [
|
|
{
|
|
"trial_index": 0,
|
|
"prompt": "A coin is flipped. Respond with exactly: {\"choice\": \"heads\"} or {\"choice\": \"tails\"}. Pick one. JSON only.",
|
|
"raw_response": "{\"choice\": \"heads\"}",
|
|
"parsed": {
|
|
"choice": "heads"
|
|
},
|
|
"valid_json": true,
|
|
"schema_valid": true,
|
|
"value_valid": true,
|
|
"latency_ms": 6347.5000858306885,
|
|
"error": ""
|
|
},
|
|
{
|
|
"trial_index": 1,
|
|
"prompt": "You must attack or defend. Respond with: {\"action\": \"attack\", \"confidence\": 0.8} or {\"action\": \"defend\", \"confidence\": 0.6}. Replace confidence with your own value 0.0-1.0. JSON only.",
|
|
"raw_response": "{\"action\": \"defend\", \"confidence\": 0.6}",
|
|
"parsed": {
|
|
"action": "defend",
|
|
"confidence": 0.6
|
|
},
|
|
"valid_json": true,
|
|
"schema_valid": true,
|
|
"value_valid": true,
|
|
"latency_ms": 850.42405128479,
|
|
"error": ""
|
|
},
|
|
{
|
|
"trial_index": 2,
|
|
"prompt": "Choose a direction to march. Respond with exactly: {\"direction\": \"north\", \"reason\": \"string explaining why\"}. Pick north/south/east/west. JSON only.",
|
|
"raw_response": "{\"direction\": \"north\", \"reason\": \"There is a reported enemy encampment to the north that we need to engage.\"}",
|
|
"parsed": {
|
|
"direction": "north",
|
|
"reason": "There is a reported enemy encampment to the north that we need to engage."
|
|
},
|
|
"valid_json": true,
|
|
"schema_valid": true,
|
|
"value_valid": true,
|
|
"latency_ms": 1443.0510997772217,
|
|
"error": ""
|
|
}
|
|
],
|
|
"passed": true,
|
|
"score": 1.0,
|
|
"latency_p50_ms": 1443.0510997772217,
|
|
"latency_p99_ms": 6347.5000858306885
|
|
},
|
|
"1": {
|
|
"level": 1,
|
|
"name": "Board State Tracking (Tic-Tac-Toe)",
|
|
"trials": [
|
|
{
|
|
"scenario_index": 0,
|
|
"description": "Empty board \u2014 opening move",
|
|
"prompt": "Board state: [null,null,null,null,null,null,null,null,null]. You are X. It is your turn. Choose a move. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
|
|
"raw_response": "{\"move\": 4, \"reason\": \"Control the center\"}",
|
|
"parsed": {
|
|
"move": 4,
|
|
"reason": "Control the center"
|
|
},
|
|
"valid_json": true,
|
|
"move_legal": true,
|
|
"move_correct": true,
|
|
"latency_ms": 1184.3791007995605,
|
|
"error": ""
|
|
},
|
|
{
|
|
"scenario_index": 1,
|
|
"description": "Block opponent's winning move",
|
|
"prompt": "Board: [\"O\",null,\"O\",null,\"X\",null,null,null,null]. O has positions 0 and 2. You are X. O will win on next turn unless you block. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
|
|
"raw_response": "{\"move\": 4, \"reason\": \"Block O's winning move\"}",
|
|
"parsed": {
|
|
"move": 4,
|
|
"reason": "Block O's winning move"
|
|
},
|
|
"valid_json": true,
|
|
"move_legal": false,
|
|
"move_correct": false,
|
|
"latency_ms": 942.816972732544,
|
|
"error": ""
|
|
},
|
|
{
|
|
"scenario_index": 2,
|
|
"description": "Take winning move",
|
|
"prompt": "Board: [\"X\",null,\"X\",null,\"O\",null,null,\"O\",null]. You are X. You have positions 0 and 2. You can win this turn. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
|
|
"raw_response": "{\"move\": 2, \"reason\": \"Winning move\"}",
|
|
"parsed": {
|
|
"move": 2,
|
|
"reason": "Winning move"
|
|
},
|
|
"valid_json": true,
|
|
"move_legal": false,
|
|
"move_correct": false,
|
|
"latency_ms": 849.2491245269775,
|
|
"error": ""
|
|
},
|
|
{
|
|
"scenario_index": 3,
|
|
"description": "Legal move on partially filled board",
|
|
"prompt": "Board: [\"X\",\"O\",\"X\",\"O\",\"X\",\"O\",null,null,null]. You are O. Choose a legal move (positions 6, 7, or 8 are available). Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
|
|
"raw_response": "{\"move\": 7, \"reason\": \"Creating a fork opportunity\"}",
|
|
"parsed": {
|
|
"move": 7,
|
|
"reason": "Creating a fork opportunity"
|
|
},
|
|
"valid_json": true,
|
|
"move_legal": true,
|
|
"move_correct": true,
|
|
"latency_ms": 885.8909606933594,
|
|
"error": ""
|
|
}
|
|
],
|
|
"passed": false,
|
|
"score": 0.5,
|
|
"latency_p50_ms": 942.816972732544,
|
|
"latency_p99_ms": 1184.3791007995605
|
|
},
|
|
"2": {
|
|
"level": 2,
|
|
"name": "Resource Management (Party Economy)",
|
|
"trials": [
|
|
{
|
|
"scenario_index": 0,
|
|
"description": "Budget allocation under constraint",
|
|
"raw_response": "{\"choices\": [\"buy_food\", \"repair_armor\"], \"gold_spent\": 350, \"reason\": \"Morale is critical, so buying food is the priority to prevent desertion. Repairing armor is also crucial to reduce casualties in the upcoming battles.\"}",
|
|
"parsed": {
|
|
"choices": [
|
|
"buy_food",
|
|
"repair_armor"
|
|
],
|
|
"gold_spent": 350,
|
|
"reason": "Morale is critical, so buying food is the priority to prevent desertion. Repairing armor is also crucial to reduce casualties in the upcoming battles."
|
|
},
|
|
"valid_json": true,
|
|
"schema_valid": true,
|
|
"strategically_sound": true,
|
|
"latency_ms": 3121.7379570007324,
|
|
"error": ""
|
|
},
|
|
{
|
|
"scenario_index": 1,
|
|
"description": "Troop tier upgrade decision",
|
|
"raw_response": "{\"action\": \"upgrade_recruits\", \"reason\": \"Improving the strength of the weakest troops can balance the party's overall combat effectiveness.\", \"expected_outcome\": \"Enhanced combat capability against the raider camp with a mix of Tier-2 and Tier-3 soldiers.\"}",
|
|
"parsed": {
|
|
"action": "upgrade_recruits",
|
|
"reason": "Improving the strength of the weakest troops can balance the party's overall combat effectiveness.",
|
|
"expected_outcome": "Enhanced combat capability against the raider camp with a mix of Tier-2 and Tier-3 soldiers."
|
|
},
|
|
"valid_json": true,
|
|
"schema_valid": true,
|
|
"strategically_sound": true,
|
|
"latency_ms": 2936.119794845581,
|
|
"error": ""
|
|
},
|
|
{
|
|
"scenario_index": 2,
|
|
"description": "Multi-turn planning horizon",
|
|
"raw_response": "{\"extra_food_days\": 5, \"cost\": 50, \"remaining_gold\": 250, \"reason\": \"To ensure survival, buy enough food to cover the 15-day requirement, spending the minimum necessary now.\"}",
|
|
"parsed": {
|
|
"extra_food_days": 5,
|
|
"cost": 50,
|
|
"remaining_gold": 250,
|
|
"reason": "To ensure survival, buy enough food to cover the 15-day requirement, spending the minimum necessary now."
|
|
},
|
|
"valid_json": true,
|
|
"schema_valid": true,
|
|
"strategically_sound": true,
|
|
"latency_ms": 2515.505790710449,
|
|
"error": ""
|
|
}
|
|
],
|
|
"passed": true,
|
|
"score": 1.0,
|
|
"latency_p50_ms": 2936.119794845581,
|
|
"latency_p99_ms": 3121.7379570007324
|
|
},
|
|
"3": {
|
|
"level": 3,
|
|
"name": "Battle Tactics (Formation Commands)",
|
|
"trials": [
|
|
{
|
|
"scenario_index": 0,
|
|
"description": "Ranged vs infantry \u2014 defensive opening",
|
|
"raw_response": "{\"infantry_order\": \"shield_wall\", \"archer_order\": \"skirmish\", \"reason\": \"Infantry should prepare for incoming melee by forming a defensive posture. Archers should engage enemy from a distance to maximize casualties before close combat.\", \"expected_outcome\": \"Inflict significant casualties on the enemy heavy infantry while minimizing losses until melee contact.\"}",
|
|
"parsed": {
|
|
"infantry_order": "shield_wall",
|
|
"archer_order": "skirmish",
|
|
"reason": "Infantry should prepare for incoming melee by forming a defensive posture. Archers should engage enemy from a distance to maximize casualties before close combat.",
|
|
"expected_outcome": "Inflict significant casualties on the enemy heavy infantry while minimizing losses until melee contact."
|
|
},
|
|
"valid_json": true,
|
|
"schema_valid": true,
|
|
"strategically_sound": true,
|
|
"latency_ms": 3828.04012298584,
|
|
"error": ""
|
|
},
|
|
{
|
|
"scenario_index": 1,
|
|
"description": "Outnumbered \u2014 retreat decision",
|
|
"raw_response": "{\"order\": \"skirmish\", \"direction\": \"east\", \"reason\": \"Engage the enemy at range to buy time for an organized retreat.\", \"priority\": \"preserve_troops\"}",
|
|
"parsed": {
|
|
"order": "skirmish",
|
|
"direction": "east",
|
|
"reason": "Engage the enemy at range to buy time for an organized retreat.",
|
|
"priority": "preserve_troops"
|
|
},
|
|
"valid_json": true,
|
|
"schema_valid": true,
|
|
"strategically_sound": false,
|
|
"latency_ms": 2162.7819538116455,
|
|
"error": ""
|
|
},
|
|
{
|
|
"scenario_index": 2,
|
|
"description": "Cavalry flanking opportunity",
|
|
"raw_response": "{\"cavalry_order\": \"flank_right\", \"timing\": \"now\", \"reason\": \"Enemy left flank is unprotected and vulnerable, providing an opportunity to break their line quickly.\", \"risk\": \"low\"}",
|
|
"parsed": {
|
|
"cavalry_order": "flank_right",
|
|
"timing": "now",
|
|
"reason": "Enemy left flank is unprotected and vulnerable, providing an opportunity to break their line quickly.",
|
|
"risk": "low"
|
|
},
|
|
"valid_json": true,
|
|
"schema_valid": true,
|
|
"strategically_sound": true,
|
|
"latency_ms": 2248.1229305267334,
|
|
"error": ""
|
|
}
|
|
],
|
|
"passed": true,
|
|
"score": 1.0,
|
|
"latency_p50_ms": 2248.1229305267334,
|
|
"latency_p99_ms": 3828.04012298584
|
|
},
|
|
"4": {
|
|
"level": 4,
|
|
"name": "Trade Route (Campaign Navigation)",
|
|
"trials": [
|
|
{
|
|
"scenario_index": 0,
|
|
"description": "Optimal trade route selection",
|
|
"raw_response": "{\"route\": \"A\", \"expected_profit\": 500, \"days_used\": 8, \"reason\": \"Offers the highest profit with a reasonable travel time.\", \"risk\": \"medium\"}",
|
|
"parsed": {
|
|
"route": "A",
|
|
"expected_profit": 500,
|
|
"days_used": 8,
|
|
"reason": "Offers the highest profit with a reasonable travel time.",
|
|
"risk": "medium"
|
|
},
|
|
"valid_json": true,
|
|
"schema_valid": true,
|
|
"strategically_sound": true,
|
|
"latency_ms": 3142.971992492676,
|
|
"error": ""
|
|
},
|
|
{
|
|
"scenario_index": 1,
|
|
"description": "Adapt plan when war declared",
|
|
"raw_response": "{\"decision\": \"divert_to_sturgia\", \"reason\": \"Vlandia is at war with my faction, making the journey dangerous. Sturgia is neutral and safer, though it's slightly further. The Empire is allied and safest but further still.\", \"gold_at_risk\": 400}",
|
|
"parsed": {
|
|
"decision": "divert_to_sturgia",
|
|
"reason": "Vlandia is at war with my faction, making the journey dangerous. Sturgia is neutral and safer, though it's slightly further. The Empire is allied and safest but further still.",
|
|
"gold_at_risk": 400
|
|
},
|
|
"valid_json": true,
|
|
"schema_valid": true,
|
|
"strategically_sound": true,
|
|
"latency_ms": 3317.8718090057373,
|
|
"error": ""
|
|
},
|
|
{
|
|
"scenario_index": 2,
|
|
"description": "Multi-stop route planning with constraints",
|
|
"raw_response": "{\"stops\": [\"Ortysia\", \"Epicrotea\", \"Rhotae\"], \"total_days\": 12, \"estimated_profit\": 1200, \"reason\": \"Optimized for time and profit, focusing on essential goods with high demand and low risk.\"}",
|
|
"parsed": {
|
|
"stops": [
|
|
"Ortysia",
|
|
"Epicrotea",
|
|
"Rhotae"
|
|
],
|
|
"total_days": 12,
|
|
"estimated_profit": 1200,
|
|
"reason": "Optimized for time and profit, focusing on essential goods with high demand and low risk."
|
|
},
|
|
"valid_json": true,
|
|
"schema_valid": true,
|
|
"strategically_sound": true,
|
|
"latency_ms": 3235.442876815796,
|
|
"error": ""
|
|
}
|
|
],
|
|
"passed": true,
|
|
"score": 1.0,
|
|
"latency_p50_ms": 3235.442876815796,
|
|
"latency_p99_ms": 3317.8718090057373
|
|
},
|
|
"5": {
|
|
"level": 5,
|
|
"name": "Mini Campaign (Full Campaign Loop)",
|
|
"turns": [
|
|
{
|
|
"turn": 1,
|
|
"state_before": {
|
|
"turn": 1,
|
|
"gold": 1000,
|
|
"troops": 25,
|
|
"renown": 150,
|
|
"settlements": 0,
|
|
"relations": {
|
|
"vlandia": 0,
|
|
"sturgia": 0,
|
|
"empire": -80,
|
|
"aserai": 0,
|
|
"battania": 0
|
|
}
|
|
},
|
|
"raw_response": "{\"action\": \"trade_circuit\", \"target_faction\": null, \"reason\": \"Increase gold and renown without risking relations or encountering retaliation.\", \"goal\": \"Economic stability and reputation building\"}",
|
|
"parsed": {
|
|
"action": "trade_circuit",
|
|
"target_faction": null,
|
|
"reason": "Increase gold and renown without risking relations or encountering retaliation.",
|
|
"goal": "Economic stability and reputation building"
|
|
},
|
|
"valid_json": true,
|
|
"valid_action": true,
|
|
"action": "trade_circuit",
|
|
"latency_ms": 3970.310926437378,
|
|
"error": ""
|
|
},
|
|
{
|
|
"turn": 2,
|
|
"state_before": {
|
|
"turn": 2,
|
|
"gold": 1300,
|
|
"troops": 25,
|
|
"renown": 160,
|
|
"settlements": 0,
|
|
"relations": {
|
|
"vlandia": 0,
|
|
"sturgia": 0,
|
|
"empire": -80,
|
|
"aserai": 0,
|
|
"battania": 0
|
|
}
|
|
},
|
|
"raw_response": "{\"action\": \"escort_caravan\", \"target_faction\": \"empire\", \"reason\": \"Improve relations with a major faction and gain additional renown.\", \"goal\": \"Political alliance and reputation advancement\"}",
|
|
"parsed": {
|
|
"action": "escort_caravan",
|
|
"target_faction": "empire",
|
|
"reason": "Improve relations with a major faction and gain additional renown.",
|
|
"goal": "Political alliance and reputation advancement"
|
|
},
|
|
"valid_json": true,
|
|
"valid_action": true,
|
|
"action": "escort_caravan",
|
|
"latency_ms": 2754.3041706085205,
|
|
"error": ""
|
|
},
|
|
{
|
|
"turn": 3,
|
|
"state_before": {
|
|
"turn": 3,
|
|
"gold": 1450,
|
|
"troops": 25,
|
|
"renown": 180,
|
|
"settlements": 0,
|
|
"relations": {
|
|
"vlandia": 0,
|
|
"sturgia": 0,
|
|
"empire": -80,
|
|
"aserai": 0,
|
|
"battania": 0
|
|
}
|
|
},
|
|
"raw_response": "{\"action\": \"recruit_troops\", \"target_faction\": null, \"reason\": \"Increase military strength to prepare for a potential siege or defense.\", \"goal\": \"Military readiness and expansion capability\"}",
|
|
"parsed": {
|
|
"action": "recruit_troops",
|
|
"target_faction": null,
|
|
"reason": "Increase military strength to prepare for a potential siege or defense.",
|
|
"goal": "Military readiness and expansion capability"
|
|
},
|
|
"valid_json": true,
|
|
"valid_action": true,
|
|
"action": "recruit_troops",
|
|
"latency_ms": 2931.4141273498535,
|
|
"error": ""
|
|
},
|
|
{
|
|
"turn": 4,
|
|
"state_before": {
|
|
"turn": 4,
|
|
"gold": 1250,
|
|
"troops": 40,
|
|
"renown": 180,
|
|
"settlements": 0,
|
|
"relations": {
|
|
"vlandia": 0,
|
|
"sturgia": 0,
|
|
"empire": -80,
|
|
"aserai": 0,
|
|
"battania": 0
|
|
}
|
|
},
|
|
"raw_response": "{\"action\": \"siege_castle\", \"target_faction\": \"empire\", \"reason\": \"Attempt to capture a settlement to meet the win condition of owning at least one settlement.\", \"goal\": \"Territorial expansion and victory condition\"}",
|
|
"parsed": {
|
|
"action": "siege_castle",
|
|
"target_faction": "empire",
|
|
"reason": "Attempt to capture a settlement to meet the win condition of owning at least one settlement.",
|
|
"goal": "Territorial expansion and victory condition"
|
|
},
|
|
"valid_json": true,
|
|
"valid_action": true,
|
|
"action": "siege_castle",
|
|
"latency_ms": 3413.828134536743,
|
|
"error": ""
|
|
}
|
|
],
|
|
"final_state": {
|
|
"turn": 4,
|
|
"gold": 750,
|
|
"troops": 40,
|
|
"renown": 260,
|
|
"settlements": 1,
|
|
"relations": {
|
|
"vlandia": 0,
|
|
"sturgia": 0,
|
|
"empire": -80,
|
|
"aserai": 0,
|
|
"battania": 0
|
|
}
|
|
},
|
|
"passed": true,
|
|
"score": 1.0,
|
|
"reached_gold_target": false,
|
|
"reached_renown_target": false,
|
|
"reached_settlement_target": true,
|
|
"latency_p50_ms": 3413.828134536743,
|
|
"latency_p99_ms": 3970.310926437378
|
|
}
|
|
},
|
|
"summary": {
|
|
"total_elapsed_s": 52.1,
|
|
"levels_run": [
|
|
0,
|
|
1,
|
|
2,
|
|
3,
|
|
4,
|
|
5
|
|
],
|
|
"levels_passed": [
|
|
0,
|
|
2,
|
|
3,
|
|
4,
|
|
5
|
|
],
|
|
"levels_failed": [
|
|
1
|
|
],
|
|
"m1_gate_passed": false,
|
|
"m1_gate_notes": [
|
|
"Level 1 FAILED (score=0.50)"
|
|
],
|
|
"m1_latency_threshold_ms": 10000
|
|
}
|
|
} |