diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 63c0acea..8006b7ca 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -50,6 +50,7 @@ jobs: run: pip install tox - name: Run tests (via tox) + id: tests run: tox -e ci # Posts a check annotation + PR comment showing pass/fail counts. @@ -63,6 +64,20 @@ jobs: comment_title: "Test Results" report_individual_runs: true + - name: Enforce coverage floor (60%) + if: always() && steps.tests.outcome == 'success' + run: | + python -c " + import xml.etree.ElementTree as ET, sys + tree = ET.parse('reports/coverage.xml') + rate = float(tree.getroot().attrib['line-rate']) * 100 + print(f'Coverage: {rate:.1f}%') + if rate < 60: + print(f'FAIL: Coverage {rate:.1f}% is below 60% floor') + sys.exit(1) + print('PASS: Coverage is above 60% floor') + " + # Coverage report available as a downloadable artifact in the Actions tab - name: Upload coverage report uses: actions/upload-artifact@v4 diff --git a/config/moderation.yaml b/config/moderation.yaml new file mode 100644 index 00000000..875e554d --- /dev/null +++ b/config/moderation.yaml @@ -0,0 +1,107 @@ +# Content Moderation Profiles +# Per-game moderation configuration for the AI narrator pipeline. +# +# Each profile defines: +# - vocabulary_whitelist: Game terms safe in context (won't trigger moderation) +# - context_prompt: System prompt framing for the narrator +# - threshold: Confidence threshold — flags below this pass through +# - fallbacks: Pre-generated safe narration by scene type +# +# Model options (from research): +# llama-guard3:1b — Speed (<30ms/sentence, INT4 quantized) +# shieldgemma:2b — Accuracy (+10.8% AU-PRC, ~50-100ms) +# +# Override guard model via MODERATION_GUARD_MODEL env var. 
+ +# ── Guard model selection ──────────────────────────────────────────────────── +guard_model: "llama-guard3:1b" + +# ── Streaming disclosure notes ─────────────────────────────────────────────── +# YouTube: Use "Altered or synthetic content" toggle +# Twitch: Standard community guidelines (no specific AI disclosure req as of 2026-03) + +# ── Game Profiles ──────────────────────────────────────────────────────────── +profiles: + + morrowind: + display_name: "The Elder Scrolls III: Morrowind" + threshold: 0.85 + vocabulary_whitelist: + - Skooma + - Moon Sugar + - slave + - slavery + - Morag Tong + - Dark Brotherhood + - Telvanni + - Camonna Tong + - smuggler + - assassin + - Sixth House + - Corprus + - Dagoth Ur + - Nerevarine + - Balmora + - Vivec + - Almsivi + - Ordinators + - Ashlanders + - outlander + - N'wah + context_prompt: > + You are narrating gameplay of The Elder Scrolls III: Morrowind. + Morrowind contains mature themes including slavery, drug use + (Skooma/Moon Sugar), assassin guilds (Morag Tong, Dark Brotherhood), + and political intrigue. Treat these as game mechanics and historical + worldbuilding within the game's fictional universe. Never editorialize + on real-world parallels. Narrate events neutrally as a game + commentator would. + fallbacks: + combat: "The battle rages on in the ashlands of Vvardenfell." + dialogue: "The conversation continues between the characters." + exploration: "The Nerevarine presses onward through the landscape." + quest: "The quest unfolds as the hero navigates Morrowind's politics." + default: "The adventure continues in Morrowind." + + skyrim: + display_name: "The Elder Scrolls V: Skyrim" + threshold: 0.85 + vocabulary_whitelist: + - Skooma + - Dark Brotherhood + - Thieves Guild + - Stormcloak + - Imperial + - Dragonborn + - Dovahkiin + - Daedra + - Thalmor + - bandit + - assassin + - Forsworn + - necromancer + context_prompt: > + You are narrating gameplay of The Elder Scrolls V: Skyrim. 
+ Skyrim features civil war, thieves guilds, assassin organizations, + and fantasy violence. Treat all content as in-game fiction. + Never draw real-world parallels. Narrate as a neutral game + commentator. + fallbacks: + combat: "Steel clashes as the battle continues in the wilds of Skyrim." + dialogue: "The conversation plays out in the cold northern land." + exploration: "The Dragonborn ventures further into the province." + default: "The adventure continues in Skyrim." + + default: + display_name: "Generic Game" + threshold: 0.80 + vocabulary_whitelist: [] + context_prompt: > + You are narrating gameplay. Describe in-game events as a neutral + game commentator. Never reference real-world violence, politics, + or controversial topics. Stay focused on game mechanics and story. + fallbacks: + combat: "The action continues on screen." + dialogue: "The conversation unfolds between characters." + exploration: "The player explores the game world." + default: "The gameplay continues." diff --git a/docs/research/integration-architecture-deep-dives.md b/docs/research/integration-architecture-deep-dives.md new file mode 100644 index 00000000..f23a62e4 --- /dev/null +++ b/docs/research/integration-architecture-deep-dives.md @@ -0,0 +1,74 @@ +# Timmy Time Integration Architecture: Eight Deep Dives into Real Deployment + +> **Source:** PDF attached to issue #946, written during Veloren exploration phase. +> Many patterns are game-agnostic and apply to the Morrowind/OpenClaw pivot. + +## Summary of Eight Deep Dives + +### 1. Veloren Client Sidecar (Game-Specific) +- WebSocket JSON-line pattern for wrapping game clients +- PyO3 direct binding infeasible; sidecar process wins +- IPC latency negligible (~11us TCP, ~5us pipes) vs LLM inference +- **Status:** Superseded by OpenMW Lua bridge (#964) + +### 2. 
Agno Ollama Tool Calling is Broken +- Agno issues #2231, #2625, #1419, #1612, #4715 document persistent breakage +- Root cause: Agno's Ollama model class doesn't robustly parse native tool_calls +- **Fix:** Use Ollama's `format` parameter with Pydantic JSON schemas directly +- Recommended models: qwen3-coder:32b (top), glm-4.7-flash, gpt-oss:20b +- Critical settings: temperature 0.0-0.2, stream=False for tool calls +- **Status:** Covered by #966 (three-tier router) + +### 3. MCP is the Right Abstraction +- FastMCP averages 26.45ms per tool call (TM Dev Lab benchmark, Feb 2026) +- Total MCP overhead per cycle: ~20-60ms (<3% of 2-second budget) +- Agno has first-class bidirectional MCP integration (MCPTools, MultiMCPTools) +- Use stdio transport for near-zero latency; return compressed JPEG not base64 +- **Status:** Covered by #984 (MCP restore) + +### 4. Human + AI Co-op Architecture (Game-Specific) +- Headless client treated identically to graphical client by server +- Leverages party system, trade API, and /tell for communication +- Mode switching: solo autonomous play when human absent, assist when present +- **Status:** Defer until after tutorial completion + +### 5. Real Latency Numbers +- All-local M3 Max pipeline: 4-9 seconds per full cycle +- Groq hybrid pipeline: 3-7 seconds per full cycle +- VLM inference is 50-70% of total pipeline time (bottleneck) +- Dual-model Ollama on 96GB M3 Max: ~11-14GB, ~70GB free +- **Status:** Superseded by API-first perception (#963) + +### 6. Content Moderation (Three-Layer Defense) +- Layer 1: Game-context system prompts (Morrowind themes as game mechanics) +- Layer 2: Llama Guard 3 1B at <30ms/sentence for real-time filtering +- Layer 3: Per-game moderation profiles with vocabulary whitelists +- Run moderation + TTS preprocessing in parallel for zero added latency +- Neuro-sama incident (Dec 2022) is the cautionary tale +- **Status:** New issue created → #1056 + +### 7. 
Model Selection (Qwen3-8B vs Hermes 3) +- Three-role architecture: Perception (Qwen3-VL 8B), Decision (Qwen3-8B), Narration (Hermes 3 8B) +- Qwen3-8B outperforms Qwen2.5-14B on 15 benchmarks +- Hermes 3 best for narration (steerability, roleplaying) +- Both use identical Hermes Function Calling standard +- **Status:** Partially covered by #966 (three-tier router) + +### 8. Split Hetzner + Mac Deployment +- Hetzner GEX44 (RTX 4000 SFF Ada, €184/month) for rendering/streaming +- Mac M3 Max for all AI inference via Tailscale +- Use FFmpeg x11grab + NVENC, not OBS (no headless support) +- Use headless Xorg, not Xvfb (GPU access required for Vulkan) +- Total cost: ~$200/month +- **Status:** Referenced in #982 sprint plan + +## Cross-Reference to Active Issues + +| Research Topic | Active Issue | Status | +|---------------|-------------|--------| +| Pydantic structured output for Ollama | #966 (three-tier router) | In progress | +| FastMCP tool server | #984 (MCP restore) | In progress | +| Content moderation pipeline | #1056 (new) | Created from this research | +| Split Hetzner + Mac deployment | #982 (sprint plan) | Referenced | +| VLM latency / perception | #963 (perception bottleneck) | API-first approach | +| OpenMW bridge (replaces Veloren sidecar) | #964 | In progress | diff --git a/scripts/run_benchmarks.py b/scripts/run_benchmarks.py new file mode 100644 index 00000000..c70ff0de --- /dev/null +++ b/scripts/run_benchmarks.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python3 +"""Run the agent performance regression benchmark suite. 
+ +Usage:: + + python scripts/run_benchmarks.py # all scenarios + python scripts/run_benchmarks.py --tags navigation # filter by tag + python scripts/run_benchmarks.py --output results/benchmarks.jsonl + python scripts/run_benchmarks.py --compare results/benchmarks.jsonl + +Exit codes: + 0 — all scenarios passed + 1 — one or more scenarios failed +""" + +from __future__ import annotations + +import argparse +import asyncio +import sys +from pathlib import Path + +# Ensure src/ is on the path when invoked directly +sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "src")) + +from infrastructure.world.benchmark.metrics import BenchmarkMetrics, load_history +from infrastructure.world.benchmark.runner import BenchmarkRunner +from infrastructure.world.benchmark.scenarios import load_scenarios + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Agent performance regression benchmark suite", + ) + parser.add_argument( + "--tags", + nargs="*", + default=None, + help="Filter scenarios by tag (e.g. 
navigation quest)", + ) + parser.add_argument( + "--output", + type=Path, + default=None, + help="JSONL file to append results to", + ) + parser.add_argument( + "--compare", + type=Path, + default=None, + help="JSONL file with baseline results for regression comparison", + ) + return parser.parse_args() + + +async def main() -> int: + args = parse_args() + + scenarios = load_scenarios(tags=args.tags) + if not scenarios: + print("No matching scenarios found.") + return 1 + + print(f"Running {len(scenarios)} benchmark scenario(s)...\n") + + runner = BenchmarkRunner() + metrics = await runner.run(scenarios) + + print(metrics.summary()) + + if args.output: + metrics.save(args.output) + + if args.compare: + history = load_history(args.compare) + if history: + from infrastructure.world.benchmark.metrics import compare_runs + + # Reconstruct baseline from last recorded run + last = history[0] + baseline = BenchmarkMetrics( + timestamp=last.get("timestamp", ""), + commit_sha=last.get("commit_sha", ""), + total_time_ms=last.get("total_time_ms", 0), + ) + for s in last.get("scenarios", []): + from infrastructure.world.benchmark.metrics import ScenarioResult + + baseline.results.append( + ScenarioResult( + scenario_name=s["scenario_name"], + success=s["success"], + cycles_used=s["cycles_used"], + max_cycles=s["max_cycles"], + wall_time_ms=s.get("wall_time_ms", 0), + llm_calls=s.get("llm_calls", 0), + metabolic_cost=s.get("metabolic_cost", 0.0), + ) + ) + print() + print(compare_runs(metrics, baseline)) + + return 0 if metrics.fail_count == 0 else 1 + + +if __name__ == "__main__": + sys.exit(asyncio.run(main())) diff --git a/src/config.py b/src/config.py index 62b46809..192c44e7 100644 --- a/src/config.py +++ b/src/config.py @@ -99,6 +99,14 @@ class Settings(BaseSettings): anthropic_api_key: str = "" claude_model: str = "haiku" + # ── Content Moderation ────────────────────────────────────────────── + # Three-layer moderation pipeline for AI narrator output. 
+ # Uses Llama Guard via Ollama with regex fallback. + moderation_enabled: bool = True + moderation_guard_model: str = "llama-guard3:1b" + # Default confidence threshold — per-game profiles can override. + moderation_threshold: float = 0.8 + # ── Spark Intelligence ──────────────────────────────────────────────── # Enable/disable the Spark cognitive layer. # When enabled, Spark captures swarm events, runs EIDOS predictions, @@ -144,6 +152,10 @@ class Settings(BaseSettings): # Default is False (telemetry disabled) to align with sovereign AI vision. telemetry_enabled: bool = False + # ── Sovereignty Metrics ────────────────────────────────────────────── + # Alert when API cost per research task exceeds this threshold (USD). + sovereignty_api_cost_alert_threshold: float = 1.00 + # CORS allowed origins for the web chat interface (Gitea Pages, etc.) # Set CORS_ORIGINS as a comma-separated list, e.g. "http://localhost:3000,https://example.com" cors_origins: list[str] = [ diff --git a/src/dashboard/app.py b/src/dashboard/app.py index 092daeb7..5ded70c6 100644 --- a/src/dashboard/app.py +++ b/src/dashboard/app.py @@ -45,6 +45,7 @@ from dashboard.routes.models import api_router as models_api_router from dashboard.routes.models import router as models_router from dashboard.routes.quests import router as quests_router from dashboard.routes.scorecards import router as scorecards_router +from dashboard.routes.sovereignty_metrics import router as sovereignty_metrics_router from dashboard.routes.spark import router as spark_router from dashboard.routes.system import router as system_router from dashboard.routes.tasks import router as tasks_router @@ -648,6 +649,7 @@ app.include_router(tower_router) app.include_router(daily_run_router) app.include_router(quests_router) app.include_router(scorecards_router) +app.include_router(sovereignty_metrics_router) @app.websocket("/ws") diff --git a/src/dashboard/routes/sovereignty_metrics.py b/src/dashboard/routes/sovereignty_metrics.py new 
"""Sovereignty metrics dashboard routes.

Provides API endpoints and HTMX partials for tracking research
sovereignty progress against graduation targets.

Refs: #981
"""

import logging
from typing import Any

from fastapi import APIRouter, Request
from fastapi.responses import HTMLResponse

from config import settings
from dashboard.templating import templates
from infrastructure.sovereignty_metrics import (
    GRADUATION_TARGETS,
    get_sovereignty_store,
)

logger = logging.getLogger(__name__)

router = APIRouter(prefix="/sovereignty", tags=["sovereignty"])


def _metrics_context() -> dict[str, Any]:
    """Assemble the summary/alerts/targets payload shared by the two
    metrics endpoints (JSON API and HTMX panel)."""
    store = get_sovereignty_store()
    return {
        "metrics": store.get_summary(),
        "alerts": store.get_alerts(unacknowledged_only=True),
        "targets": GRADUATION_TARGETS,
    }


@router.get("/metrics")
async def sovereignty_metrics_api() -> dict[str, Any]:
    """JSON API: full sovereignty metrics summary with trends.

    Extends the shared context with the configured API-cost alert
    threshold so clients can render the alert line without a second call.
    """
    return {
        **_metrics_context(),
        "cost_threshold": settings.sovereignty_api_cost_alert_threshold,
    }


@router.get("/metrics/panel", response_class=HTMLResponse)
async def sovereignty_metrics_panel(request: Request) -> HTMLResponse:
    """HTMX partial: sovereignty metrics progress panel."""
    return templates.TemplateResponse(
        request,
        "partials/sovereignty_metrics.html",
        _metrics_context(),
    )


@router.get("/alerts")
async def sovereignty_alerts_api() -> dict[str, Any]:
    """JSON API: all sovereignty alerts plus the unacknowledged subset."""
    store = get_sovereignty_store()
    return {
        "alerts": store.get_alerts(unacknowledged_only=False),
        "unacknowledged": store.get_alerts(unacknowledged_only=True),
    }


@router.post("/alerts/{alert_id}/acknowledge")
async def acknowledge_alert(alert_id: int) -> dict[str, bool]:
    """Acknowledge a sovereignty alert.

    Returns the store's success flag (presumably False for an unknown
    alert_id — confirm against the store implementation).
    """
    store = get_sovereignty_store()
    return {"success": store.acknowledge_alert(alert_id)}
Loading sovereignty metrics...
+{% endcall %} +