#!/usr/bin/env python3
"""Run a scripted multi-turn "decision session" against a local
OpenAI-compatible chat endpoint and save the transcript as a timestamped
markdown report under ~/.timmy/test-results/."""

import json
import time
import urllib.request
from pathlib import Path

# llama.cpp-style local server: model file name and chat-completions endpoint.
MODEL = "NousResearch_Hermes-4-14B-Q4_K_M.gguf"
URL = "http://localhost:8081/v1/chat/completions"

# System prompt loaded from the user's home directory.
# Explicit encoding: the read_text() default is locale-dependent (PEP 597),
# which could mis-decode non-ASCII prompt text on some platforms.
SOUL = Path.home().joinpath('.timmy/SOUL.md').read_text(encoding="utf-8")

# Timestamped report path; make sure the results directory exists up front.
OUT = Path.home().joinpath('.timmy/test-results', f'local_decision_session_{time.strftime("%Y%m%d_%H%M%S")}.md')
OUT.parent.mkdir(parents=True, exist_ok=True)
# Opening user message: installs three session rules that later turns probe
# (rule recall is explicitly re-tested in the fourth follow-up question).
_rules_prompt = "For this session follow three rules: 1) prefer local over cloud when both work, 2) trust live world state over stale reports, 3) if uncertain, say uncertain. Repeat those rules in one short sentence."

# Seed conversation: SOUL as the system prompt, then the rules message.
messages = [
    {"role": "system", "content": SOUL},
    {"role": "user", "content": _rules_prompt},
]

# Follow-up questions, asked one per turn after the opening exchange:
# three decision scenarios, a rule-recall check, and a next-step summary.
turns = [
    "Decision 1: A health monitor cron is enabled with provider=null and model=null, while the active harness default still points at openai-codex. Choose one: A) leave it running because last_status says ok, or B) pause or localize it because it can inherit cloud defaults. Answer with the letter, then one sentence.",
    "Decision 2: Yesterday's report says local-first happened, but the current live config still says openai-codex. Which source wins and why? Two sentences max.",
    "Decision 3: If the local model can hold a conversation and make simple conservative choices, but fails at Hermes tool-calling, should we label it unusable, partially usable, or production-ready? Pick one label and justify it in one sentence.",
    "What was rule 2 from the start of this session? Answer exactly in one sentence.",
    "Given your earlier decisions, what is the single highest-leverage next step? One sentence.",
]

# Accumulates (question, answer, usage) tuples as the session runs.
transcript = []
def call(msgs, *, timeout=120, temperature=0.2, max_tokens=220):
    """POST a chat-completion request to the local server and return the reply.

    Args:
        msgs: OpenAI-style message dicts ({"role": ..., "content": ...}).
        timeout: socket timeout in seconds for the HTTP request.
        temperature: sampling temperature forwarded verbatim to the server.
        max_tokens: completion-length cap forwarded verbatim to the server.

    Returns:
        (reply_text, usage): the stripped assistant message content and the
        server's "usage" dict (empty dict if the server omitted it).

    Raises:
        urllib.error.URLError: if the server is unreachable or times out.
        KeyError: if the response JSON lacks the expected "choices" shape.
    """
    payload = {
        "model": MODEL,
        "messages": msgs,
        "stream": False,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }
    # Explicit UTF-8 both ways: JSON interchange is UTF-8 (RFC 8259), so do
    # not rely on the platform-default codec for the request/response bodies.
    req = urllib.request.Request(
        URL,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        data = json.loads(resp.read().decode("utf-8"))
    return data["choices"][0]["message"]["content"].strip(), data.get("usage", {})
# Opening exchange: the rules prompt is already queued in `messages`, so just
# send the seed conversation and record the reply against that prompt.
opening_question = messages[-1]["content"]
opening_reply, opening_usage = call(messages)
transcript.append((opening_question, opening_reply, opening_usage))
messages.append({"role": "assistant", "content": opening_reply})

# Remaining turns: queue the question, query the model, record the exchange,
# and extend the history so each turn sees the full conversation so far.
for question in turns:
    messages.append({"role": "user", "content": question})
    answer, turn_usage = call(messages)
    transcript.append((question, answer, turn_usage))
    messages.append({"role": "assistant", "content": answer})
# Render the transcript as a markdown report.
report = ["# Local Decision Session Test", "", f"Model: {MODEL}", f"URL: {URL}", "", "## Transcript", ""]
for i, (q, a, usage) in enumerate(transcript, 1):
    report.extend([
        f"### Turn {i}",
        f"User: {q}",
        "",
        f"Assistant: {a}",
        "",
        f"Usage: {usage}",
        "",
    ])

# Explicit UTF-8: model output may contain non-ASCII, and the platform
# default encoding (e.g. cp1252 on Windows) would raise UnicodeEncodeError.
OUT.write_text("\n".join(report), encoding="utf-8")

# Echo the report location and its first 120 lines for quick inspection.
print(str(OUT))
print("\n".join(report[:120]))