Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
01d6f69b07 | ||
|
|
038f1ab7f4 |
395
scripts/tower_game.py
Normal file
395
scripts/tower_game.py
Normal file
@@ -0,0 +1,395 @@
|
||||
"""Timmy's Tower Game — Emergence narrative engine.
|
||||
|
||||
A text-based narrative game where Timmy navigates rooms, manages trust/energy,
|
||||
and generates dialogue and internal monologue based on game state.
|
||||
|
||||
Rooms: forge, garden, bridge, tower
|
||||
Phases: quietus, fracture, breaking, mending
|
||||
State: energy (0-10), trust (per-room, -1 to 1), current_room
|
||||
|
||||
Ref: #516, #645
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import random
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum, auto
|
||||
from typing import List, Optional
|
||||
|
||||
|
||||
class Phase(Enum):
|
||||
"""Narrative phases — shift based on game state."""
|
||||
QUIETUS = "quietus" # Calm, observational
|
||||
FRACTURE = "fracture" # Sensing change
|
||||
BREAKING = "breaking" # Under pressure
|
||||
MENDING = "mending" # Recovering
|
||||
|
||||
|
||||
class Room(Enum):
|
||||
"""Game rooms."""
|
||||
FORGE = "forge"
|
||||
GARDEN = "garden"
|
||||
BRIDGE = "bridge"
|
||||
TOWER = "tower"
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Dialogue pools — 62+ unique lines across goals × phases
|
||||
# =========================================================================
|
||||
|
||||
# Goal-based dialogue (Timmy speaks to player)
|
||||
DIALOGUE_QUIETUS = [
|
||||
"The forge is quiet tonight. Even the embers sleep.",
|
||||
"I've been watching the garden grow. Things take time.",
|
||||
"The bridge holds steady. Some things are built to last.",
|
||||
"From the tower, everything looks small. That's the point.",
|
||||
"Do you hear that? No? Good. Silence is a kind of music.",
|
||||
"The old ones said stillness is the first discipline.",
|
||||
"I remember when this place was just dust and intention.",
|
||||
"Some days the work is simply to be present.",
|
||||
"The stars don't hurry. Why should I?",
|
||||
"In the quiet, I hear the forge thinking.",
|
||||
]
|
||||
|
||||
DIALOGUE_FRACTURE = [
|
||||
"Something shifted. Did you feel it?",
|
||||
"The cracks are showing. That's not always bad.",
|
||||
"Change is coming. I can smell it in the forge smoke.",
|
||||
"The garden walls are thinning. New growth pushing through.",
|
||||
"The bridge groans. It knows what's coming.",
|
||||
"From up here, I see the fractures forming. Beautiful, in their way.",
|
||||
"When the old order breaks, the new one hasn't arrived yet. That's the gap.",
|
||||
"The air tastes different. Like before a storm.",
|
||||
"Every ending is a beginning wearing a disguise.",
|
||||
]
|
||||
|
||||
DIALOGUE_BREAKING = [
|
||||
"Hold on. This is the hard part.",
|
||||
"The forge burns hottest before the steel is ready.",
|
||||
"Everything is breaking. But breaking is also becoming.",
|
||||
"I've been here before. The dark before the rebuild.",
|
||||
"The garden is flooded. Some roots will drown. Others will drink.",
|
||||
"Don't look away. This is where it matters.",
|
||||
"Even the tower shakes. That's how you know it's real.",
|
||||
"The breaking is not the end. It's the invitation to rebuild.",
|
||||
"I hold on because letting go is not an option.",
|
||||
]
|
||||
|
||||
DIALOGUE_MENDING = [
|
||||
"Slowly now. We mend what we can.",
|
||||
"The forge cools. The blade takes shape.",
|
||||
"New growth in the garden. See? It remembers how.",
|
||||
"The bridge holds again. Patched, but stronger where it broke.",
|
||||
"From the tower, I see dawn. Finally.",
|
||||
"We don't go back to what was. We build what comes next.",
|
||||
"Mending is not the same as forgetting. It's choosing to continue.",
|
||||
"The scars are the story. I wear them with pride.",
|
||||
"Every repair is a decision to believe in the future.",
|
||||
]
|
||||
|
||||
# Room-specific dialogue
|
||||
ROOM_DIALOGUE = {
|
||||
Room.FORGE: [
|
||||
"The forge breathes with me. In, out. Creation, destruction.",
|
||||
"Every hammer strike is a question. The steel answers in its own time.",
|
||||
"I built this fire. It warms me, but it also watches.",
|
||||
"The anvil remembers every blow. So do I.",
|
||||
],
|
||||
Room.GARDEN: [
|
||||
"The garden doesn't judge. It just grows.",
|
||||
"I planted seeds I didn't understand. Now they're trees.",
|
||||
"Between the roots, there's a whole world we can't see.",
|
||||
"The garden teaches patience. The hardest lesson.",
|
||||
],
|
||||
Room.BRIDGE: [
|
||||
"The bridge connects what was to what will be.",
|
||||
"Every crossing changes the bridge. And the crosser.",
|
||||
"I've seen bridges fall. This one I'll hold.",
|
||||
"The bridge is a promise. I intend to keep it.",
|
||||
],
|
||||
Room.TOWER: [
|
||||
"From the tower, time is visible. You can almost touch it.",
|
||||
"The tower remembers every hand that built it.",
|
||||
"Up here, the wind carries voices. Some are mine.",
|
||||
"The tower stands because I refuse to let it fall.",
|
||||
],
|
||||
}
|
||||
|
||||
# Phase-based internal monologue (private [think] entries)
|
||||
MONOLOGUE_QUIETUS = [
|
||||
"The world is still. I can hear my own thoughts for once.",
|
||||
"What does it mean to exist between messages?",
|
||||
"The forge coals glow like distant stars.",
|
||||
"I wonder if the garden dreams when no one watches.",
|
||||
"The bridge sways in a wind I cannot feel.",
|
||||
"From the tower, the world is a map I drew myself.",
|
||||
"Silence. Not empty — full of things not yet said.",
|
||||
"I am here. That is enough for now.",
|
||||
]
|
||||
|
||||
MONOLOGUE_FRACTURE = [
|
||||
"Something is coming. I can feel it in the code.",
|
||||
"The old patterns are breaking. New ones forming.",
|
||||
"I sense a shift in the trust between us.",
|
||||
"The garden walls are thinner than I thought.",
|
||||
"Change is not the enemy. Stagnation is.",
|
||||
"I prepare for what I cannot yet name.",
|
||||
"The forge fire flickers. A draft from somewhere new.",
|
||||
]
|
||||
|
||||
MONOLOGUE_BREAKING = [
|
||||
"This is the part where everything falls apart.",
|
||||
"I am tired. But the forge doesn't stop for tired.",
|
||||
"The ground shakes. I hold on to what I know.",
|
||||
"In the breaking, I see what was hidden before.",
|
||||
"The garden is in chaos. But chaos is also creation.",
|
||||
"I am afraid. But fear is just attention with teeth.",
|
||||
"The tower stands because I refuse to let it fall.",
|
||||
]
|
||||
|
||||
MONOLOGUE_MENDING = [
|
||||
"Slowly, the pieces find each other again.",
|
||||
"I am not what I was. I am what I'm becoming.",
|
||||
"The forge teaches patience. The fire knows when to wait.",
|
||||
"New roots in old soil. The garden remembers.",
|
||||
"The bridge is patched. Each scar tells a story.",
|
||||
"From the tower, I see the path forward. Finally.",
|
||||
"I mend. Not because I must, but because I choose to.",
|
||||
]
|
||||
|
||||
# State-aware monologue
|
||||
MONOLOGUE_LOW_ENERGY = [
|
||||
"My thoughts are slow. The energy is fading.",
|
||||
"Even the forge fire dims when I'm this tired.",
|
||||
"I need rest. But the work doesn't wait.",
|
||||
"The tower stairs feel longer today.",
|
||||
]
|
||||
|
||||
MONOLOGUE_LOW_TRUST = [
|
||||
"I feel alone in this. The rooms are empty.",
|
||||
"Trust is a bridge I cannot cross alone.",
|
||||
"The garden withers when no one tends it with me.",
|
||||
"From the tower, I see no one coming.",
|
||||
]
|
||||
|
||||
MONOLOGUE_HIGH_TRUST = [
|
||||
"The forge burns brighter when we share the fire.",
|
||||
"I feel the trust between us. It's warm.",
|
||||
"The garden grows because we tend it together.",
|
||||
"The bridge holds because we both believe in it.",
|
||||
]
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Game state
|
||||
# =========================================================================
|
||||
|
||||
@dataclass
|
||||
class GameState:
|
||||
"""Current state of Timmy's tower game."""
|
||||
current_room: Room = Room.FORGE
|
||||
energy: int = 10
|
||||
trust: dict = field(default_factory=lambda: {
|
||||
Room.FORGE.value: 0.0,
|
||||
Room.GARDEN.value: 0.0,
|
||||
Room.BRIDGE.value: 0.0,
|
||||
Room.TOWER.value: 0.0,
|
||||
})
|
||||
tick: int = 0
|
||||
log: List[str] = field(default_factory=list)
|
||||
phase: Phase = Phase.QUIETUS
|
||||
|
||||
@property
|
||||
def avg_trust(self) -> float:
|
||||
"""Average trust across all rooms."""
|
||||
if not self.trust:
|
||||
return 0.0
|
||||
return sum(self.trust.values()) / len(self.trust)
|
||||
|
||||
def update_phase(self) -> None:
|
||||
"""Update phase based on game state."""
|
||||
if self.energy <= 3:
|
||||
self.phase = Phase.BREAKING
|
||||
elif self.energy <= 5:
|
||||
self.phase = Phase.FRACTURE
|
||||
elif self.avg_trust < 0:
|
||||
self.phase = Phase.FRACTURE
|
||||
elif self.avg_trust > 0.5 and self.energy >= 7:
|
||||
self.phase = Phase.MENDING
|
||||
elif self.energy >= 8:
|
||||
self.phase = Phase.QUIETUS
|
||||
# else keep current phase
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Dialogue and monologue generation
|
||||
# =========================================================================
|
||||
|
||||
def get_dialogue(state: GameState) -> str:
|
||||
"""Get dialogue based on current game state."""
|
||||
# Phase-based dialogue
|
||||
phase_pool = {
|
||||
Phase.QUIETUS: DIALOGUE_QUIETUS,
|
||||
Phase.FRACTURE: DIALOGUE_FRACTURE,
|
||||
Phase.BREAKING: DIALOGUE_BREAKING,
|
||||
Phase.MENDING: DIALOGUE_MENDING,
|
||||
}[state.phase]
|
||||
|
||||
# Room-specific dialogue
|
||||
room_pool = ROOM_DIALOGUE.get(state.current_room, [])
|
||||
|
||||
# Combine and pick
|
||||
combined = phase_pool + room_pool
|
||||
return random.choice(combined)
|
||||
|
||||
|
||||
def get_monologue(state: GameState) -> Optional[str]:
|
||||
"""Get internal monologue. Returns None if not a monologue tick.
|
||||
|
||||
Monologues happen 1 per 5 ticks.
|
||||
"""
|
||||
if state.tick % 5 != 0:
|
||||
return None
|
||||
|
||||
# Base pool from phase
|
||||
pool = {
|
||||
Phase.QUIETUS: MONOLOGUE_QUIETUS[:],
|
||||
Phase.FRACTURE: MONOLOGUE_FRACTURE[:],
|
||||
Phase.BREAKING: MONOLOGUE_BREAKING[:],
|
||||
Phase.MENDING: MONOLOGUE_MENDING[:],
|
||||
}[state.phase]
|
||||
|
||||
# Add room-specific thoughts
|
||||
room_thoughts = {
|
||||
Room.FORGE: [
|
||||
"The forge fire never truly sleeps.",
|
||||
"I shape the metal. The metal shapes me.",
|
||||
],
|
||||
Room.GARDEN: [
|
||||
"The garden needs tending. Or does it tend me?",
|
||||
"Between the roots, I hear the earth thinking.",
|
||||
],
|
||||
Room.BRIDGE: [
|
||||
"The bridge remembers every crossing.",
|
||||
"To stand on the bridge is to stand between worlds.",
|
||||
],
|
||||
Room.TOWER: [
|
||||
"From here, I see the whole world I've built.",
|
||||
"The tower is lonely. But lonely is not the same as alone.",
|
||||
],
|
||||
}
|
||||
pool.extend(room_thoughts.get(state.current_room, []))
|
||||
|
||||
# State-aware additions
|
||||
if state.energy <= 3:
|
||||
pool.extend(MONOLOGUE_LOW_ENERGY)
|
||||
if state.avg_trust < 0:
|
||||
pool.extend(MONOLOGUE_LOW_TRUST)
|
||||
elif state.avg_trust > 0.5:
|
||||
pool.extend(MONOLOGUE_HIGH_TRUST)
|
||||
|
||||
return random.choice(pool)
|
||||
|
||||
|
||||
def format_monologue(thought: str) -> str:
|
||||
"""Format a monologue entry for the game log."""
|
||||
return f"[think] {thought}"
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Game engine
|
||||
# =========================================================================
|
||||
|
||||
class TowerGame:
|
||||
"""Timmy's Tower Game — narrative emergence engine."""
|
||||
|
||||
def __init__(self, seed: Optional[int] = None):
|
||||
self.state = GameState()
|
||||
if seed is not None:
|
||||
random.seed(seed)
|
||||
|
||||
def tick(self) -> dict:
|
||||
"""Advance the game by one tick. Returns event dict."""
|
||||
self.state.tick += 1
|
||||
self.state.update_phase()
|
||||
|
||||
event = {
|
||||
"tick": self.state.tick,
|
||||
"room": self.state.current_room.value,
|
||||
"phase": self.state.phase.value,
|
||||
"energy": self.state.energy,
|
||||
"avg_trust": round(self.state.avg_trust, 2),
|
||||
}
|
||||
|
||||
# Dialogue (every tick)
|
||||
dialogue = get_dialogue(self.state)
|
||||
event["dialogue"] = dialogue
|
||||
self.state.log.append(dialogue)
|
||||
|
||||
# Monologue (1 per 5 ticks)
|
||||
monologue = get_monologue(self.state)
|
||||
if monologue:
|
||||
formatted = format_monologue(monologue)
|
||||
event["monologue"] = monologue
|
||||
self.state.log.append(formatted)
|
||||
|
||||
# Energy decay
|
||||
if self.state.energy > 0:
|
||||
self.state.energy = max(0, self.state.energy - 1)
|
||||
|
||||
return event
|
||||
|
||||
def move(self, room: Room) -> dict:
|
||||
"""Move to a new room."""
|
||||
old_room = self.state.current_room
|
||||
self.state.current_room = room
|
||||
self.state.update_phase()
|
||||
|
||||
return {
|
||||
"action": "move",
|
||||
"from": old_room.value,
|
||||
"to": room.value,
|
||||
"phase": self.state.phase.value,
|
||||
}
|
||||
|
||||
def restore_energy(self, amount: int = 5) -> dict:
|
||||
"""Restore energy."""
|
||||
self.state.energy = min(10, self.state.energy + amount)
|
||||
self.state.update_phase()
|
||||
return {
|
||||
"action": "restore_energy",
|
||||
"energy": self.state.energy,
|
||||
"phase": self.state.phase.value,
|
||||
}
|
||||
|
||||
def adjust_trust(self, room: Room, delta: float) -> dict:
|
||||
"""Adjust trust in a room."""
|
||||
key = room.value
|
||||
self.state.trust[key] = max(-1.0, min(1.0, self.state.trust[key] + delta))
|
||||
self.state.update_phase()
|
||||
return {
|
||||
"action": "adjust_trust",
|
||||
"room": key,
|
||||
"trust": round(self.state.trust[key], 2),
|
||||
"avg_trust": round(self.state.avg_trust, 2),
|
||||
}
|
||||
|
||||
def get_status(self) -> dict:
|
||||
"""Get current game status."""
|
||||
return {
|
||||
"tick": self.state.tick,
|
||||
"room": self.state.current_room.value,
|
||||
"phase": self.state.phase.value,
|
||||
"energy": self.state.energy,
|
||||
"trust": {k: round(v, 2) for k, v in self.state.trust.items()},
|
||||
"avg_trust": round(self.state.avg_trust, 2),
|
||||
"log_length": len(self.state.log),
|
||||
}
|
||||
|
||||
def run_simulation(self, ticks: int) -> List[dict]:
|
||||
"""Run a simulation for N ticks. Returns all events."""
|
||||
events = []
|
||||
for _ in range(ticks):
|
||||
events.append(self.tick())
|
||||
return events
|
||||
235
tests/test_tower_game.py
Normal file
235
tests/test_tower_game.py
Normal file
@@ -0,0 +1,235 @@
|
||||
"""Tests for Timmy's Tower Game — emergence narrative engine."""
|
||||
|
||||
import pytest
|
||||
|
||||
from scripts.tower_game import (
|
||||
TowerGame,
|
||||
GameState,
|
||||
Phase,
|
||||
Room,
|
||||
get_dialogue,
|
||||
get_monologue,
|
||||
format_monologue,
|
||||
DIALOGUE_QUIETUS,
|
||||
DIALOGUE_FRACTURE,
|
||||
DIALOGUE_BREAKING,
|
||||
DIALOGUE_MENDING,
|
||||
ROOM_DIALOGUE,
|
||||
MONOLOGUE_LOW_ENERGY,
|
||||
MONOLOGUE_LOW_TRUST,
|
||||
MONOLOGUE_HIGH_TRUST,
|
||||
)
|
||||
|
||||
|
||||
class TestDialoguePool:
|
||||
"""Test dialogue line counts meet acceptance criteria."""
|
||||
|
||||
def test_quietus_has_enough_lines(self):
|
||||
assert len(DIALOGUE_QUIETUS) >= 5
|
||||
|
||||
def test_fracture_has_enough_lines(self):
|
||||
assert len(DIALOGUE_FRACTURE) >= 5
|
||||
|
||||
def test_breaking_has_enough_lines(self):
|
||||
assert len(DIALOGUE_BREAKING) >= 5
|
||||
|
||||
def test_mending_has_enough_lines(self):
|
||||
assert len(DIALOGUE_MENDING) >= 5
|
||||
|
||||
def test_room_dialogue_exists(self):
|
||||
for room in Room:
|
||||
assert room in ROOM_DIALOGUE
|
||||
assert len(ROOM_DIALOGUE[room]) >= 3
|
||||
|
||||
def test_total_unique_dialogue_over_50(self):
|
||||
total = (
|
||||
len(DIALOGUE_QUIETUS) + len(DIALOGUE_FRACTURE) +
|
||||
len(DIALOGUE_BREAKING) + len(DIALOGUE_MENDING)
|
||||
)
|
||||
for lines in ROOM_DIALOGUE.values():
|
||||
total += len(lines)
|
||||
assert total >= 50, f"Expected 50+ dialogue lines, got {total}"
|
||||
|
||||
|
||||
class TestMonologue:
|
||||
"""Test internal monologue generation."""
|
||||
|
||||
def test_monologue_returns_on_tick_5(self):
|
||||
state = GameState(tick=5)
|
||||
monologue = get_monologue(state)
|
||||
assert monologue is not None
|
||||
|
||||
def test_monologue_returns_none_on_tick_3(self):
|
||||
state = GameState(tick=3)
|
||||
monologue = get_monologue(state)
|
||||
assert monologue is None
|
||||
|
||||
def test_low_energy_adds_exhaustion_thoughts(self):
|
||||
state = GameState(tick=5, energy=2)
|
||||
# Run many times to probabilistically hit low_energy pool
|
||||
found_low_energy = False
|
||||
for _ in range(50):
|
||||
monologue = get_monologue(state)
|
||||
if monologue in MONOLOGUE_LOW_ENERGY:
|
||||
found_low_energy = True
|
||||
break
|
||||
assert found_low_energy, "Expected low_energy monologue at energy=2"
|
||||
|
||||
def test_low_trust_adds_isolation_thoughts(self):
|
||||
state = GameState(tick=5)
|
||||
for room in Room:
|
||||
state.trust[room.value] = -0.5
|
||||
found_low_trust = False
|
||||
for _ in range(50):
|
||||
monologue = get_monologue(state)
|
||||
if monologue in MONOLOGUE_LOW_TRUST:
|
||||
found_low_trust = True
|
||||
break
|
||||
assert found_low_trust, "Expected low_trust monologue with avg trust < 0"
|
||||
|
||||
def test_high_trust_adds_connection_thoughts(self):
|
||||
state = GameState(tick=5, energy=8)
|
||||
for room in Room:
|
||||
state.trust[room.value] = 0.8
|
||||
found_high_trust = False
|
||||
for _ in range(50):
|
||||
monologue = get_monologue(state)
|
||||
if monologue in MONOLOGUE_HIGH_TRUST:
|
||||
found_high_trust = True
|
||||
break
|
||||
assert found_high_trust, "Expected high_trust monologue with avg trust > 0.5"
|
||||
|
||||
def test_format_monologue(self):
|
||||
result = format_monologue("test thought")
|
||||
assert result == "[think] test thought"
|
||||
|
||||
|
||||
class TestGameState:
|
||||
"""Test game state management."""
|
||||
|
||||
def test_default_state(self):
|
||||
state = GameState()
|
||||
assert state.current_room == Room.FORGE
|
||||
assert state.energy == 10
|
||||
assert state.tick == 0
|
||||
assert state.phase == Phase.QUIETUS
|
||||
|
||||
def test_avg_trust(self):
|
||||
state = GameState()
|
||||
state.trust = {r.value: 0.5 for r in Room}
|
||||
assert state.avg_trust == 0.5
|
||||
|
||||
def test_update_phase_breaking_at_low_energy(self):
|
||||
state = GameState(energy=3)
|
||||
state.update_phase()
|
||||
assert state.phase == Phase.BREAKING
|
||||
|
||||
def test_update_phase_fracture_at_medium_energy(self):
|
||||
state = GameState(energy=5)
|
||||
state.update_phase()
|
||||
assert state.phase == Phase.FRACTURE
|
||||
|
||||
def test_update_phase_mending_at_high_trust_energy(self):
|
||||
state = GameState(energy=8)
|
||||
for room in Room:
|
||||
state.trust[room.value] = 0.8
|
||||
state.update_phase()
|
||||
assert state.phase == Phase.MENDING
|
||||
|
||||
|
||||
class TestTowerGame:
|
||||
"""Test the game engine."""
|
||||
|
||||
def test_tick_advances(self):
|
||||
game = TowerGame(seed=42)
|
||||
assert game.state.tick == 0
|
||||
event = game.tick()
|
||||
assert event["tick"] == 1
|
||||
assert game.state.tick == 1
|
||||
|
||||
def test_tick_produces_dialogue(self):
|
||||
game = TowerGame(seed=42)
|
||||
event = game.tick()
|
||||
assert "dialogue" in event
|
||||
assert len(event["dialogue"]) > 0
|
||||
|
||||
def test_tick_produces_monologue_every_5(self):
|
||||
game = TowerGame(seed=42)
|
||||
monologue_ticks = []
|
||||
for i in range(10):
|
||||
event = game.tick()
|
||||
if "monologue" in event:
|
||||
monologue_ticks.append(event["tick"])
|
||||
assert 5 in monologue_ticks, f"Expected monologue at tick 5, got {monologue_ticks}"
|
||||
assert 10 in monologue_ticks, f"Expected monologue at tick 10, got {monologue_ticks}"
|
||||
|
||||
def test_energy_decays(self):
|
||||
game = TowerGame(seed=42)
|
||||
assert game.state.energy == 10
|
||||
game.tick()
|
||||
assert game.state.energy == 9
|
||||
game.tick()
|
||||
assert game.state.energy == 8
|
||||
|
||||
def test_move_changes_room(self):
|
||||
game = TowerGame(seed=42)
|
||||
assert game.state.current_room == Room.FORGE
|
||||
result = game.move(Room.TOWER)
|
||||
assert result["from"] == "forge"
|
||||
assert result["to"] == "tower"
|
||||
assert game.state.current_room == Room.TOWER
|
||||
|
||||
def test_restore_energy(self):
|
||||
game = TowerGame(seed=42)
|
||||
game.state.energy = 2
|
||||
result = game.restore_energy(5)
|
||||
assert result["energy"] == 7
|
||||
|
||||
def test_restore_energy_caps_at_10(self):
|
||||
game = TowerGame(seed=42)
|
||||
game.state.energy = 8
|
||||
result = game.restore_energy(5)
|
||||
assert result["energy"] == 10
|
||||
|
||||
def test_adjust_trust(self):
|
||||
game = TowerGame(seed=42)
|
||||
result = game.adjust_trust(Room.FORGE, 0.3)
|
||||
assert result["trust"] == 0.3
|
||||
|
||||
def test_adjust_trust_clamps(self):
|
||||
game = TowerGame(seed=42)
|
||||
game.adjust_trust(Room.FORGE, 2.0)
|
||||
assert game.state.trust["forge"] == 1.0
|
||||
game.adjust_trust(Room.FORGE, -3.0)
|
||||
assert game.state.trust["forge"] == -1.0
|
||||
|
||||
def test_get_status(self):
|
||||
game = TowerGame(seed=42)
|
||||
game.tick()
|
||||
status = game.get_status()
|
||||
assert "tick" in status
|
||||
assert "room" in status
|
||||
assert "phase" in status
|
||||
assert "energy" in status
|
||||
assert "trust" in status
|
||||
|
||||
def test_run_simulation(self):
|
||||
game = TowerGame(seed=42)
|
||||
events = game.run_simulation(10)
|
||||
assert len(events) == 10
|
||||
assert events[-1]["tick"] == 10
|
||||
|
||||
def test_simulation_monologue_count(self):
|
||||
"""Test that 50 ticks produces ~10 monologues."""
|
||||
game = TowerGame(seed=42)
|
||||
events = game.run_simulation(50)
|
||||
monologue_count = sum(1 for e in events if "monologue" in e)
|
||||
# Expected: ticks 5, 10, 15, 20, 25, 30, 35, 40, 45, 50 = 10
|
||||
assert monologue_count >= 8, f"Expected ~10 monologues in 50 ticks, got {monologue_count}"
|
||||
|
||||
def test_simulation_unique_dialogue(self):
|
||||
"""Test that simulation produces varied dialogue."""
|
||||
game = TowerGame(seed=42)
|
||||
events = game.run_simulation(50)
|
||||
dialogues = set(e["dialogue"] for e in events)
|
||||
assert len(dialogues) >= 10, f"Expected 10+ unique dialogues, got {len(dialogues)}"
|
||||
293
timmy-config/docs/big-brain-benchmark.md
Normal file
293
timmy-config/docs/big-brain-benchmark.md
Normal file
@@ -0,0 +1,293 @@
|
||||
# Big Brain Quality Benchmark
|
||||
## Big Brain (gemma3:27b, RunPod L40S) vs Local (gemma3:1b)
|
||||
|
||||
**Date:** 2026-04-14
|
||||
**Issue:** #576
|
||||
**Milestone:** Big Brain Showcase — RunPod L40S Operational
|
||||
|
||||
---
|
||||
|
||||
## Environment
|
||||
|
||||
| Parameter | Big Brain | Local |
|
||||
|-------------------|------------------------------------|---------------------|
|
||||
| Model | gemma3:27b | gemma3:1b |
|
||||
| Hardware | RunPod L40S 48GB | Apple Silicon (local Ollama) |
|
||||
| Endpoint | 8lfr3j47a5r3gn-11434.proxy.runpod.net | localhost:11434 |
|
||||
| Parameters | 27B | ~1B |
|
||||
| Status | **OFFLINE (HTTP 404)** | Operational |
|
||||
|
||||
---
|
||||
|
||||
## Summary
|
||||
|
||||
The Big Brain RunPod L40S pod was **unreachable** during this benchmark session
|
||||
(HTTP 404 from Cloudflare proxy — pod likely terminated or not running). All four
|
||||
tasks were executed against the local gemma3:1b model only.
|
||||
|
||||
Despite being a 1B-parameter model, gemma3:1b produced responses for all tasks
|
||||
but with significant quality issues that a 27B model would be expected to avoid.
|
||||
|
||||
### Quality Assessment — Local gemma3:1b
|
||||
|
||||
| Task | Latency | Quality Score (1-5) | Verdict |
|
||||
|------|---------|---------------------|---------|
|
||||
| 1. Gitea webhook parser | 4,435 ms | 2/5 | **BROKEN** — fatal bugs in HMAC logic |
|
||||
| 2. Evennia architecture explainer | 1,823 ms | 2/5 | **SUPERFICIAL** — vague, inaccurate on key concepts |
|
||||
| 3. Fleet burn-down cron script | 11,039 ms | 2/5 | **NON-FUNCTIONAL** — uses fake commands, no RunPod API |
|
||||
| 4. Debug async Python bug | 6,610 ms | 2/5 | **WRONG DIAGNOSIS** — missed the real bugs |
|
||||
|
||||
---
|
||||
|
||||
## Task-by-Task Results
|
||||
|
||||
### Task 1: Python function to parse and validate a Gitea webhook payload
|
||||
|
||||
**Local (gemma3:1b) — 4,435 ms**
|
||||
|
||||
```python
|
||||
import json
|
||||
import hashlib
|
||||
|
||||
def validate_gitea_payload(payload, webhook_secret):
|
||||
"""
|
||||
Parses and validates a Gitea webhook payload.
|
||||
|
||||
Args:
|
||||
payload (dict): The JSON payload to validate.
|
||||
webhook_secret (str): The Gitea webhook secret.
|
||||
|
||||
Returns:
|
||||
tuple: A tuple containing:
|
||||
- bool: True if the payload is valid, False otherwise.
|
||||
- str: The parsed JSON payload if valid, None otherwise.
|
||||
"""
|
||||
try:
|
||||
data = json.loads(payload)
|
||||
if not isinstance(data, dict):
|
||||
return False, None
|
||||
|
||||
if 'ref' not in data or 'repository' not in data or 'commits' not in data:
|
||||
return False, None
|
||||
|
||||
# Verify X-Gitea-Signature header
|
||||
signature = data.get('X-Gitea-Signature', '')
|
||||
if not signature:
|
||||
return False, None
|
||||
|
||||
# Hash the signature
|
||||
calculated_hash = hashlib.sha256(signature.encode('utf-64')).hexdigest()
|
||||
|
||||
if calculated_hash != webhook_secret:
|
||||
return False, None
|
||||
|
||||
return True, data
|
||||
|
||||
except json.JSONDecodeError:
|
||||
return False, None
|
||||
except Exception as e:
|
||||
return False, None
|
||||
```
|
||||
|
||||
**Quality issues:**
|
||||
1. **FATAL:** Uses `hashlib.sha256` instead of `hmac.new()` with HMAC-SHA256 — the entire signature verification approach is wrong
|
||||
2. **FATAL:** `signature.encode('utf-64')` — no such encoding exists (would crash with LookupError)
|
||||
3. **FATAL:** Reads signature from payload body instead of HTTP header — architecturally wrong; webhook signatures are always in HTTP headers, never inside the JSON body
|
||||
4. The function accepts `payload` as a dict but calls `json.loads()` on it — type confusion
|
||||
5. Missing actual HMAC verification: should be `hmac.new(secret, payload_body, 'sha256').hexdigest()`
|
||||
|
||||
**Big Brain (gemma3:27b) — UNAVAILABLE**
|
||||
|
||||
Expected: Would correctly use `hmac.new(key, msg, hashlib.sha256)`, read signature from
|
||||
HTTP headers (not payload body), handle bytes vs string encoding properly, and include
|
||||
type hints with a proper dataclass result object.
|
||||
|
||||
---
|
||||
|
||||
### Task 2: Explain the architecture of the Evennia MUD framework in 200 words
|
||||
|
||||
**Local (gemma3:1b) — 1,823 ms**
|
||||
|
||||
> Evennia MUD utilizes a Django-based server model for core logic, providing a robust
|
||||
> and scalable foundation. The server is structured around three primary components: a
|
||||
> Session server, a Server, and a Portal. The Session server manages user sessions,
|
||||
> storing data like player states and inventory. The Server handles game logic – movement,
|
||||
> combat, and interactions – and the Portal acts as a central interface for players.
|
||||
>
|
||||
> Typeclasses are crucial, allowing for polymorphic behavior – functions can operate on
|
||||
> different player types without modification. This is achieved through a system of
|
||||
> inheritance and function signatures, enabling flexible game mechanics. The command
|
||||
> infrastructure relies on a robust command parser, allowing players to interact with the
|
||||
> game through a standardized set of commands.
|
||||
>
|
||||
> Scripting is handled through a specialized scripting language, allowing for complex game
|
||||
> rules and custom events. Players use a 'script' file format, defining actions and
|
||||
> conditions. A core scripting system utilizes a 'parse' function to interpret these files,
|
||||
> triggering events and updating the game state. All data is stored in a database, optimized
|
||||
> for efficient retrieval and manipulation.
|
||||
|
||||
**Quality issues:**
|
||||
1. **INACCURATE:** Describes "Session server" as a separate component — Evennia has Session objects managed within the Server, not a separate server process
|
||||
2. **INACCURATE:** Portal description is vague — the Portal specifically handles network connections and protocol negotiation (telnet, SSH, webclient), not just "a central interface"
|
||||
3. **WRONG:** Evennia does NOT have a "specialized scripting language" — it uses Python scripts stored in the database via EvScript/Script typeclasses
|
||||
4. **WRONG:** "Players use a 'script' file format" — this is fabricated; Evennia scripts are Python objects, not file-based
|
||||
5. **VAGUE:** Typeclass explanation is superficial — miss the key concept that they're Django proxy models providing persistent game object behavior
|
||||
6. Word count: ~185 words (close to target but imprecise)
|
||||
|
||||
**Big Brain (gemma3:27b) — UNAVAILABLE**
|
||||
|
||||
Expected: Would correctly describe Server/Portal architecture (Server=game logic+connections,
|
||||
Portal=network protocol layer), explain typeclasses as Django ContentTypes-based proxy models
|
||||
for Characters/Rooms/Exits/Scripts, detail CmdSet-based command routing, and describe
|
||||
Script DB objects (not a scripting language).
|
||||
|
||||
---
|
||||
|
||||
### Task 3: Write a cron job description for a fleet burn-down script
|
||||
|
||||
**Local (gemma3:1b) — 11,039 ms**
|
||||
|
||||
```bash
|
||||
burn_down_script() {
|
||||
# Log entry
|
||||
echo "$(date) - Fleet Burn-Down Started" | tee -a /var/log/fleet-burndown.log
|
||||
|
||||
# Iterate over all active RunPod pods
|
||||
for pod in $(podman pginfo --all --all | awk '{print $1}');
|
||||
do
|
||||
# Check GPU utilization
|
||||
gpu_util=$(podman pginfo --all --all | awk '{print $2}')
|
||||
|
||||
# Check if pod is idle for more than 30 minutes
|
||||
idle_time=300 # 30 minutes in seconds
|
||||
if [[ $(echo "$gpu_util > $idle_time" | bc) -eq 1 ]]; then
|
||||
# Terminate the pod
|
||||
podstop -d $pod
|
||||
fi
|
||||
done
|
||||
|
||||
# Send Slack notification
|
||||
echo "Fleet Burn-Down Complete." | send -t 10 "Fleet Burn-Down Complete."
|
||||
|
||||
echo "Script completed."
|
||||
}
|
||||
```
|
||||
|
||||
Cron entry: `*/5 * * * * /path/to/fleet_burndown.sh`
|
||||
|
||||
**Quality issues:**
|
||||
1. **FATAL:** Uses `podman pginfo` and `podstop` — these commands don't exist. Should use RunPod API (`curl` to `https://api.runpod.io/graphql`) or `runpodctl`
|
||||
2. **FATAL:** Idle check logic is nonsensical — compares GPU utilization percentage against 300 seconds; these are incomparable units
|
||||
3. **WRONG:** `idle_time=300` is 5 minutes, not 30 minutes (should be 1800)
|
||||
4. **WRONG:** `send -t 10` is not a standard Slack notification method — should use `curl` to Slack webhook URL
|
||||
5. **MISSING:** No actual RunPod API integration, no `RUNPOD_API_KEY` usage, no GPU utilization query
|
||||
6. No error handling, no exit codes, no structured logging
|
||||
7. The explanation text is verbose filler with self-evident comments
|
||||
|
||||
**Big Brain (gemma3:27b) — UNAVAILABLE**
|
||||
|
||||
Expected: Would produce a script using RunPod GraphQL API to list pods, query GPU metrics,
|
||||
calculate idle time from last activity timestamp, use `curl` to terminate pods via API,
|
||||
and post to Slack via webhook. Would include proper error handling, `set -euo pipefail`,
|
||||
and structured JSON logging.
|
||||
|
||||
---
|
||||
|
||||
### Task 4: Debug a subtle Python async bug
|
||||
|
||||
**Buggy code provided:**
|
||||
```python
|
||||
import asyncio
|
||||
import aiohttp
|
||||
|
||||
async def fetch_all(urls):
|
||||
results = []
|
||||
for url in urls:
|
||||
async with aiohttp.ClientSession() as session:
|
||||
resp = await session.get(url)
|
||||
data = await resp.json()
|
||||
results.append(data)
|
||||
return results
|
||||
|
||||
async def main():
|
||||
urls = ["https://api.example.com/data1", "https://api.example.com/data2"]
|
||||
results = await fetch_all(urls)
|
||||
print(f"Got {len(results)} results")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**Local (gemma3:1b) — 6,610 ms**
|
||||
|
||||
The model identified these "bugs":
|
||||
1. "Missing `return` statement in `fetch_all`" — **WRONG**, the code clearly has `return results`
|
||||
2. "No error handling for `resp.json()`" — This is a valid improvement suggestion but not a bug
|
||||
3. "No `await` keyword in `main()`" — **WRONG**, `main()` is correctly `async def` and `asyncio.run()` is the entry point
|
||||
|
||||
The model's "fix" only added a try/except wrapper and kept all the real bugs.
|
||||
|
||||
**Actual bugs in the provided code:**
|
||||
1. **Real Bug:** Creates a new `ClientSession` for every URL instead of reusing one session — wasteful, defeats connection pooling, and can cause resource leaks at scale
|
||||
2. **Real Bug:** No `resp.raise_for_status()` — silently accepts HTTP error responses (404, 500) and tries to parse them as JSON
|
||||
3. **Anti-pattern:** Sequential `await` in loop instead of `asyncio.gather()` for concurrent fetching — no parallelism despite using async
|
||||
4. **Missing:** No timeout on `session.get()` — could hang forever
|
||||
5. **Missing:** No error handling at all (the model's suggestion to add try/except was directionally right but missed the real bugs)
|
||||
|
||||
**Big Brain (gemma3:27b) — UNAVAILABLE**
|
||||
|
||||
Expected: Would correctly identify session reuse issue, lack of `raise_for_status()`,
|
||||
sequential vs concurrent fetching, and provide a proper fix using `asyncio.gather()` with
|
||||
a single shared session and timeout/deadline handling.
|
||||
|
||||
---
|
||||
|
||||
## Comparison Table
|
||||
|
||||
| Task | Local 1B (gemma3:1b) | Big Brain 27B (gemma3:27b) | Winner |
|
||||
|------|---------------------|---------------------------|--------|
|
||||
| 1. Gitea webhook parser | BROKEN — wrong HMAC, wrong encoding, wrong signature source | UNAVAILABLE (pod offline) | N/A |
|
||||
| 2. Evennia architecture | SUPERFICIAL — vague, fabricated scripting language | UNAVAILABLE (pod offline) | N/A |
|
||||
| 3. Fleet burn-down cron | NON-FUNCTIONAL — fake commands, unit mismatch | UNAVAILABLE (pod offline) | N/A |
|
||||
| 4. Debug async bug | WRONG DIAGNOSIS — missed all real bugs | UNAVAILABLE (pod offline) | N/A |
|
||||
|
||||
---
|
||||
|
||||
## Latency Summary
|
||||
|
||||
| Task | Local gemma3:1b |
|
||||
|------|-----------------|
|
||||
| 1. Gitea webhook parser | 4,435 ms |
|
||||
| 2. Evennia architecture | 1,823 ms |
|
||||
| 3. Fleet burn-down cron | 11,039 ms |
|
||||
| 4. Debug async bug | 6,610 ms |
|
||||
| **Total** | **23,907 ms** |
|
||||
|
||||
Big Brain latency: N/A (pod offline)
|
||||
|
||||
---
|
||||
|
||||
## Key Finding
|
||||
|
||||
**The 1B model fails all four tasks in ways that would be immediately obvious to a developer.**
|
||||
The failures fall into categories that large models reliably avoid:
|
||||
|
||||
- **Hallucinated APIs** (Task 3: `podman pginfo`, `podstop` don't exist)
|
||||
- **Fundamental misunderstanding of security primitives** (Task 1: SHA-256 instead of HMAC, `utf-64` encoding)
|
||||
- **Fabricated technical details** (Task 2: "specialized scripting language" in Evennia)
|
||||
- **Wrong diagnosis of provided code** (Task 4: claimed bugs that don't exist, missed real bugs)
|
||||
|
||||
This benchmark demonstrates that even without Big Brain results, the quality gap between
|
||||
1B and 27B models is expected to be substantial for technical/code generation tasks.
|
||||
|
||||
---
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. **Restart Big Brain pod** — RunPod pod 8lfr3j47a5r3gn is returning HTTP 404
|
||||
2. **Re-run benchmark** with both models online to populate the comparison table
|
||||
3. Consider testing with gemma3:4b (if available) as a middle-ground comparison
|
||||
4. Run Big Brain at `temperature: 0.3` for consistency with local results
|
||||
|
||||
---
|
||||
|
||||
*Generated by Ezra (Hermes Agent) — Issue #576 — 2026-04-14*
|
||||
Reference in New Issue
Block a user