#!/usr/bin/env python3
"""Timmy overnight tightening loop.

Runs a series of small, grounded tasks against local llama-server via Hermes.
Each task is deliberately simple: one or two tool calls max.
Logs every result with timing data.
Goal: rich data about local Timmy's tool-use capability by morning.
"""
import io
import json
import os
import sys
import time
import traceback
from contextlib import redirect_stderr, redirect_stdout
from datetime import datetime
from pathlib import Path

# ── Config ──────────────────────────────────────────────────────────
# Location of the hermes-agent checkout (provides the run_agent module).
AGENT_DIR = Path.home() / ".hermes" / "hermes-agent"
# All JSONL logs and markdown summaries land here.
RESULTS_DIR = Path.home() / ".timmy" / "overnight-loop"
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Ephemeral system prompt pushed to the agent for every task.
SYSTEM_PROMPT = """You are Timmy. You run locally on llama.cpp.
You MUST use the tools provided. Do not narrate tool calls as text.
When asked to read a file, call the read_file tool.
When asked to write a file, call the write_file tool.
When asked to search, call the search_files tool.
Be brief. Do the task. Report what you found."""

MAX_TURNS_PER_TASK = 5  # hard cap on agent iterations per task
TASK_TIMEOUT = 120  # seconds — NOTE(review): defined but not enforced anywhere visible; confirm intent
# ── Tasks ───────────────────────────────────────────────────────────
# Each task is one small, grounded exercise of a single toolset.
# "prompt" may contain the literal token {timestamp}, substituted by run_task.
TASKS = [
    {
        "id": "read-soul",
        "toolsets": "file",
        "prompt": "Read the file ~/.timmy/SOUL.md. Quote the first sentence of the Prime Directive section.",
    },
    {
        "id": "read-operations",
        "toolsets": "file",
        "prompt": "Read the file ~/.timmy/OPERATIONS.md. How many sections does it have? List their headings.",
    },
    {
        "id": "read-decisions",
        "toolsets": "file",
        "prompt": "Read the file ~/.timmy/decisions.md. What is the most recent decision entry? Quote its date and title.",
    },
    {
        "id": "read-config",
        "toolsets": "file",
        "prompt": "Read the file ~/.hermes/config.yaml. What model and provider are configured as default?",
    },
    {
        "id": "write-observation",
        "toolsets": "file",
        # f-string: {RESULTS_DIR} is expanded now; {{timestamp}} stays literal
        # "{timestamp}" for run_task to substitute at execution time.
        "prompt": f"Write a file to {RESULTS_DIR}/timmy_wrote_this.md with exactly this content:\n# Timmy was here\nTimestamp: {{timestamp}}\nI wrote this file using the write_file tool.\nSovereignty and service always.",
    },
    {
        "id": "search-cloud-markers",
        "toolsets": "file",
        "prompt": "Search files in ~/.hermes/bin/ for the string 'chatgpt.com'. Report which files contain it and on which lines.",
    },
    {
        "id": "search-soul-keyword",
        "toolsets": "file",
        "prompt": "Search ~/.timmy/SOUL.md for the word 'sovereignty'. How many times does it appear?",
    },
    {
        "id": "list-bin-scripts",
        "toolsets": "file",
        "prompt": "Search for files matching *.sh in ~/.hermes/bin/. List the first 10 filenames.",
    },
    {
        "id": "read-and-summarize",
        "toolsets": "file",
        "prompt": "Read ~/.timmy/SOUL.md. In exactly one sentence, what is Timmy's position on honesty?",
    },
    {
        "id": "multi-read",
        "toolsets": "file",
        "prompt": "Read both ~/.timmy/SOUL.md and ~/.hermes/config.yaml. Does the config honor the soul's requirement to not phone home? Answer yes or no with one sentence of evidence.",
    },
]
def run_task(task: dict, run_number: int) -> dict:
    """Run a single task against the local agent and return a result dict.

    Args:
        task: mapping with "id", "toolsets", and "prompt" keys; the literal
            token "{timestamp}" in the prompt is replaced with the current
            ISO timestamp before the agent sees it.
        run_number: 1-based cycle counter, recorded as result["run"].

    Returns:
        A dict with timing, status ("pass" / "empty" / "error"), the agent's
        (truncated) response, and captured stdout/stderr. Never raises:
        any exception is folded into the result with status "error".
    """
    task_id = task["id"]
    prompt = task["prompt"].replace("{timestamp}", datetime.now().isoformat())
    toolsets = task["toolsets"]

    result = {
        "task_id": task_id,
        "run": run_number,
        "started_at": datetime.now().isoformat(),
        "prompt": prompt,
        "toolsets": toolsets,
    }

    # Initialize timing up front so the except path can always compute an
    # elapsed value (the original used a fragile `'start' in dir()` check).
    start = time.time()
    try:
        # Path setup inside the try so a missing/broken agent checkout is
        # recorded as a task error instead of crashing the whole loop.
        sys.path.insert(0, str(AGENT_DIR))
        from run_agent import AIAgent

        # Explicit Ollama provider — do NOT use resolve_runtime_provider(),
        # which may return 'local' (unsupported). The overnight loop always
        # runs against local Ollama inference.
        model = os.environ.get("OVERNIGHT_MODEL", "hermes4:14b")
        base_url = os.environ.get("OVERNIGHT_BASE_URL", "http://localhost:11434/v1")
        provider = "ollama"

        buf_out = io.StringIO()
        buf_err = io.StringIO()
        agent = AIAgent(
            model=model,
            base_url=base_url,
            provider=provider,
            api_mode="chat_completions",
            max_iterations=MAX_TURNS_PER_TASK,
            quiet_mode=True,
            ephemeral_system_prompt=SYSTEM_PROMPT,
            skip_context_files=True,
            skip_memory=True,
            enabled_toolsets=[toolsets] if toolsets else None,
        )

        # Re-time so elapsed_seconds covers only the conversation itself,
        # not agent construction.
        start = time.time()
        with redirect_stdout(buf_out), redirect_stderr(buf_err):
            conv_result = agent.run_conversation(prompt, sync_honcho=False)
        elapsed = time.time() - start

        result["elapsed_seconds"] = round(elapsed, 2)
        result["response"] = conv_result.get("final_response", "")[:2000]
        result["session_id"] = getattr(agent, "session_id", None)
        result["provider"] = provider
        result["base_url"] = base_url
        result["model"] = model
        result["tool_calls_made"] = conv_result.get("tool_calls_count", 0)
        result["status"] = "pass" if conv_result.get("final_response") else "empty"
        result["stdout"] = buf_out.getvalue()[:500]
        result["stderr"] = buf_err.getvalue()[:500]
    except Exception as exc:
        result["elapsed_seconds"] = round(time.time() - start, 2)
        result["status"] = "error"
        result["error"] = str(exc)
        result["traceback"] = traceback.format_exc()[-1000:]

    result["finished_at"] = datetime.now().isoformat()
    return result
def main():
    """Run the task battery in an endless loop.

    Each cycle runs every task once, appends per-task results to a JSONL
    log (crash-safe: written immediately after each task), then rewrites a
    markdown summary of aggregate stats. Loops until killed.
    """
    run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_path = RESULTS_DIR / f"overnight_run_{run_id}.jsonl"
    summary_path = RESULTS_DIR / f"overnight_summary_{run_id}.md"

    print("=== Timmy Overnight Loop ===")
    print(f"Run ID: {run_id}")
    print(f"Tasks: {len(TASKS)}")
    print(f"Log: {log_path}")
    print(f"Max turns per task: {MAX_TURNS_PER_TASK}")
    print()

    results = []
    cycle = 0
    # Run continuously until killed.
    while True:
        cycle += 1
        print(f"--- Cycle {cycle} ({datetime.now().strftime('%H:%M:%S')}) ---")
        for task in TASKS:
            task_id = task["id"]
            print(f"[{task_id}] ", end="", flush=True)
            result = run_task(task, cycle)
            results.append(result)

            # Append to the JSONL log immediately so a crash loses nothing.
            with open(log_path, "a") as f:
                f.write(json.dumps(result) + "\n")

            status = result["status"]
            elapsed = result.get("elapsed_seconds", "?")
            print(f"{status} ({elapsed}s)")

            # Brief pause between tasks.
            time.sleep(2)

        # Write summary after each cycle (aggregates span ALL cycles so far).
        passes = sum(1 for r in results if r["status"] == "pass")
        errors = sum(1 for r in results if r["status"] == "error")
        empties = sum(1 for r in results if r["status"] == "empty")
        total = len(results)
        avg_time = sum(r.get("elapsed_seconds", 0) for r in results) / max(total, 1)

        summary = f"""# Timmy Overnight Loop — Summary

Run ID: {run_id}
Generated: {datetime.now().isoformat()}
Cycles completed: {cycle}
Total tasks run: {total}

## Aggregate
- Pass: {passes}/{total} ({100 * passes // max(total, 1)}%)
- Empty: {empties}/{total}
- Error: {errors}/{total}
- Avg response time: {avg_time:.1f}s

## Per-task results (latest cycle)
"""
        cycle_results = [r for r in results if r["run"] == cycle]
        for r in cycle_results:
            resp_preview = r.get("response", "")[:100].replace("\n", " ")
            summary += f"- **{r['task_id']}**: {r['status']} ({r.get('elapsed_seconds', '?')}s) — {resp_preview}\n"

        summary += "\n## Error details\n"
        for r in results:
            if r["status"] == "error":
                summary += f"- {r['task_id']} (cycle {r['run']}): {r.get('error', '?')}\n"

        with open(summary_path, "w") as f:
            f.write(summary)

        print(f"\nCycle {cycle} done. Pass={passes} Error={errors} Empty={empties} Avg={avg_time:.1f}s")
        print(f"Summary: {summary_path}")
        print("Sleeping 30s before next cycle...\n")
        time.sleep(30)


if __name__ == "__main__":
    main()