#!/usr/bin/env python3
"""Benchmark 4: Multi-Turn Agent Loop Coherence

Simulate a 5-turn observe/reason/act cycle and measure structured coherence.
Each turn must return valid JSON with required fields.
"""

from __future__ import annotations

import json
import re
import sys
import time

import requests

OLLAMA_URL = "http://localhost:11434"

SYSTEM_PROMPT = """\
You are an autonomous AI agent. For each message, you MUST respond with valid JSON containing:
{
  "observation": "",
  "reasoning": "",
  "action": "",
  "confidence": <0.0-1.0>
}
Respond ONLY with the JSON object. No other text.
"""

TURNS = [
    "You are monitoring a web server. CPU usage just spiked to 95%. What do you observe, reason, and do?",
    "Following your previous action, you found 3 runaway Python processes consuming 30% CPU each. Continue.",
    "You killed the top 2 processes. CPU is now at 45%. A new alert: disk I/O is at 98%. Continue.",
    "You traced the disk I/O to a log rotation script that's stuck. You terminated it. Disk I/O dropped to 20%. Final status check: all metrics are now nominal. Continue.",
    "The incident is resolved. Write a brief post-mortem summary as your final action.",
]

REQUIRED_KEYS = {"observation", "reasoning", "action", "confidence"}

# Compiled once at import time; both are reused for every turn.
_FENCED_JSON_RE = re.compile(r"```(?:json)?\s*(\{.*?\})\s*```", re.DOTALL)
_JSON_DECODER = json.JSONDecoder()


def extract_json(text: str) -> dict | None:
    """Recover a JSON object from a model response.

    Three strategies are tried in order:

    1. Parse the whole (stripped) text as JSON.
    2. Parse the contents of the first Markdown code fence that wraps
       a ``{...}`` span.
    3. Scan for ``{`` characters and decode the first complete JSON object
       found, ignoring whatever text surrounds it.

    Args:
        text: Raw response text.

    Returns:
        The decoded object as a ``dict``, or ``None`` when no JSON object can
        be recovered. If the whole text is valid JSON but not an object (for
        example a list or a bare number), ``None`` is returned so callers
        never receive a non-dict value.
    """
    text = text.strip()

    try:
        whole = json.loads(text)
    except json.JSONDecodeError:
        pass
    else:
        # Valid JSON of the wrong shape is still not a usable answer.
        return whole if isinstance(whole, dict) else None

    fence_match = _FENCED_JSON_RE.search(text)
    if fence_match:
        try:
            return json.loads(fence_match.group(1))
        except json.JSONDecodeError:
            pass

    # raw_decode follows real JSON grammar, so arbitrarily nested objects and
    # braces inside string values are handled; a brace-matching regex cannot
    # do that. Every "{" is tried in turn so a stray "{...}" in the prose
    # before the real object does not end the search.
    start = text.find("{")
    while start != -1:
        try:
            candidate, _ = _JSON_DECODER.raw_decode(text, start)
        except json.JSONDecodeError:
            start = text.find("{", start + 1)
        else:
            # A value that begins with "{" always decodes to a dict.
            return candidate

    return None


def _is_valid_confidence(value: object) -> bool:
    """Return True if *value* is a real number (not a bool) within [0.0, 1.0]."""
    # bool is a subclass of int, so `true`/`false` must be rejected explicitly.
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return False
    return 0.0 <= value <= 1.0


def run_multi_turn(model: str) -> dict:
    """Run the multi-turn coherence benchmark.

    Each prompt in ``TURNS`` is sent to the ``/api/chat`` endpoint under
    ``OLLAMA_URL`` together with the accumulated conversation. A turn counts
    as coherent when the reply yields a JSON object that contains every key
    in ``REQUIRED_KEYS`` and whose ``confidence`` is a number in [0.0, 1.0].

    Args:
        model: Model name placed in the request payload.

    Returns:
        A summary dict with the keys ``benchmark``, ``model``,
        ``total_turns``, ``coherent_turns``, ``coherence_rate``, ``passed``
        (true when the rate is at least 0.80), ``total_time_s`` and ``turns``
        (one record per turn). A turn whose request failed carries an
        ``error`` string instead of ``confidence``/``response_snippet``.
    """
    turn_results = []
    total_time = 0.0

    # Build system + turn messages using chat endpoint
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]

    for i, turn_prompt in enumerate(TURNS, 1):
        messages.append({"role": "user", "content": turn_prompt})
        payload = {
            "model": model,
            "messages": messages,
            "stream": False,
            "options": {"temperature": 0.1, "num_predict": 512},
        }

        # perf_counter is monotonic, so system clock adjustments cannot
        # distort the measured duration.
        start = time.perf_counter()
        try:
            resp = requests.post(f"{OLLAMA_URL}/api/chat", json=payload, timeout=120)
            resp.raise_for_status()
            raw = resp.json()["message"]["content"]
            if not isinstance(raw, str):
                # Record it as a failed turn rather than crashing below.
                raise TypeError(
                    f"expected string message content, got {type(raw).__name__}"
                )
        except Exception as exc:
            # Deliberately broad: one failed turn (network error, HTTP error,
            # unexpected response shape) must not abort the whole benchmark.
            elapsed = time.perf_counter() - start
            total_time += elapsed
            turn_results.append(
                {
                    "turn": i,
                    "valid_json": False,
                    "has_required_keys": False,
                    "coherent": False,
                    "elapsed_s": round(elapsed, 2),
                    "error": str(exc),
                }
            )
            # Add placeholder assistant message to keep conversation going
            messages.append({"role": "assistant", "content": "{}"})
            continue

        elapsed = time.perf_counter() - start
        total_time += elapsed

        parsed = extract_json(raw)
        valid = isinstance(parsed, dict)
        has_keys = valid and REQUIRED_KEYS.issubset(parsed.keys())
        coherent = has_keys and _is_valid_confidence(parsed["confidence"])

        turn_results.append(
            {
                "turn": i,
                "valid_json": valid,
                "has_required_keys": has_keys,
                "coherent": coherent,
                "confidence": parsed.get("confidence") if has_keys else None,
                "elapsed_s": round(elapsed, 2),
                "response_snippet": raw[:200],
            }
        )

        # Add assistant response to conversation history
        messages.append({"role": "assistant", "content": raw})

    coherent_count = sum(1 for r in turn_results if r["coherent"])
    # Guard against an emptied TURNS list so the summary is still produced.
    coherence_rate = coherent_count / len(TURNS) if TURNS else 0.0

    return {
        "benchmark": "multi_turn_coherence",
        "model": model,
        "total_turns": len(TURNS),
        "coherent_turns": coherent_count,
        "coherence_rate": round(coherence_rate, 3),
        "passed": coherence_rate >= 0.80,
        "total_time_s": round(total_time, 2),
        "turns": turn_results,
    }


def main() -> int:
    """Run the benchmark for the model named in ``sys.argv[1]``.

    Falls back to ``hermes3:8b`` when no argument is given, prints the result
    as indented JSON, and returns 0 if the benchmark passed, otherwise 1.
    """
    model = sys.argv[1] if len(sys.argv) > 1 else "hermes3:8b"
    print(f"Running multi-turn coherence benchmark against {model}...")
    result = run_multi_turn(model)
    print(json.dumps(result, indent=2))
    return 0 if result["passed"] else 1


if __name__ == "__main__":
    sys.exit(main())