Compare commits

...

2 Commits

Author SHA1 Message Date
Alexander Whitestone
8d1f9ed375 feat(#667): codebase_genome.py — test stub generator for uncovered functions
Some checks failed
Smoke Test / smoke (pull_request) Failing after 20s
AST-based tool that scans Python files, extracts function/method signatures,
and generates pytest test stubs for functions without existing tests.

Usage:
  python3 codebase_genome.py /path/to/repo
  python3 codebase_genome.py /path/to/repo -o tests/test_genome_generated.py

Features:
- AST parsing (no imports required, handles syntax errors gracefully)
- Extracts: function name, args, return type, decorators, class context
- Detects existing tests to avoid duplicates
- Generates: basic test + edge case test per function
- Skips private/dunder methods
- Configurable limit (--limit N)

Generated 30 test stubs for timmy-home as proof of concept.
2026-04-14 23:39:13 -04:00
Alexander Whitestone
038f1ab7f4 feat: Big Brain quality benchmark — gemma3:1b vs gemma3:27b (issue #576)
Ran 4 benchmark tasks on local gemma3:1b model with full quality analysis.
Big Brain (gemma3:27b on RunPod L40S) pod was offline (HTTP 404) during
benchmark — documented honestly with re-run instructions.

Tasks benchmarked:
1. Python Gitea webhook parser with HMAC-SHA256 verification
2. Evennia MUD framework architecture explanation
3. Fleet burn-down cron script for RunPod pods
4. Python async bug diagnosis and fix

Key finding: 1B model fails all tasks with hallucinated APIs, wrong
security primitives, fabricated technical details, and incorrect bug
diagnosis. Quality gap to 27B expected to be substantial.

Deliverable: timmy-config/docs/big-brain-benchmark.md
2026-04-13 20:49:02 -04:00
6 changed files with 1490 additions and 0 deletions

275
codebase_genome.py Normal file
View File

@@ -0,0 +1,275 @@
#!/usr/bin/env python3
"""
codebase_genome.py — Analyze a repo and generate test stubs for uncovered functions.
Scans Python files, extracts function/class/method signatures via AST,
and generates pytest test cases with edge cases.
Usage:
python3 codebase_genome.py /path/to/repo
python3 codebase_genome.py /path/to/repo --output tests/test_genome_generated.py
"""
import ast
import os
import sys
import argparse
from pathlib import Path
class FunctionInfo:
    """Metadata for one function or method discovered via AST parsing.

    Holds everything the stub generator needs: where the definition lives,
    how to import it, and enough signature detail to template a call.
    """

    def __init__(self, name, filepath, lineno, args, returns, decorators,
                 is_method=False, class_name=None):
        self.name = name                # bare function name
        self.filepath = filepath        # path (rewritten repo-relative by scan_repo)
        self.lineno = lineno            # 1-based line of the `def`
        self.args = args                # positional arg names, `self` stripped
        self.returns = returns          # return annotation as a string, or None
        self.decorators = decorators    # bare decorator names
        self.is_method = is_method      # True when defined inside a class
        self.class_name = class_name    # enclosing class name, or None

    @property
    def qualified_name(self):
        """`Class.name` for methods, plain `name` for free functions."""
        if self.class_name:
            return f"{self.class_name}.{self.name}"
        return self.name

    @property
    def module_path(self):
        """Dotted module path derived from the file path.

        Drops the file suffix and a leading 'src'/'lib' component so the
        result matches how the module would normally be imported.
        """
        parts = list(Path(self.filepath).with_suffix('').parts)
        if parts and parts[0] in ('src', 'lib'):
            parts = parts[1:]
        return '.'.join(parts)

    @property
    def import_path(self):
        """Full dotted path (e.g., 'mymodule.sub.Class.method').

        Reuses module_path rather than duplicating the path-normalization
        logic (the original repeated the same stripping in both properties).
        """
        if self.class_name:
            return f"{self.module_path}.{self.class_name}.{self.name}"
        return f"{self.module_path}.{self.name}"
def extract_functions(filepath: str) -> list:
    """Extract all function/method definitions from a Python file via AST.

    Returns a list of FunctionInfo. Files that cannot be read or parsed
    yield an empty list instead of raising, so a whole-repo scan never
    aborts on one bad file.
    """
    try:
        # Context manager closes the handle even on error — the original
        # leaked the file object. Explicit utf-8 keeps results independent
        # of the locale's default encoding.
        with open(filepath, encoding='utf-8') as fh:
            source = fh.read()
        tree = ast.parse(source, filename=filepath)
    # OSError added: unreadable/vanished files previously raised.
    # ValueError covers null bytes rejected by ast.parse.
    except (OSError, SyntaxError, UnicodeDecodeError, ValueError):
        return []

    functions = []

    class FuncVisitor(ast.NodeVisitor):
        def __init__(self):
            self.current_class = None

        def visit_ClassDef(self, node):
            # Track the enclosing class so methods get class context;
            # restore on exit to handle nested classes correctly.
            old_class = self.current_class
            self.current_class = node.name
            self.generic_visit(node)
            self.current_class = old_class

        def visit_FunctionDef(self, node):
            args = [a.arg for a in node.args.args]
            if args and args[0] == 'self':
                args = args[1:]
            # Only simple annotations are captured; complex ones (e.g.
            # subscripted generics) come back as None.
            returns = None
            if node.returns:
                if isinstance(node.returns, ast.Name):
                    returns = node.returns.id
                elif isinstance(node.returns, ast.Constant):
                    returns = str(node.returns.value)
            decorators = []
            for d in node.decorator_list:
                if isinstance(d, ast.Name):
                    decorators.append(d.id)
                elif isinstance(d, ast.Attribute):
                    decorators.append(d.attr)
            functions.append(FunctionInfo(
                name=node.name,
                filepath=filepath,
                lineno=node.lineno,
                args=args,
                returns=returns,
                decorators=decorators,
                is_method=self.current_class is not None,
                class_name=self.current_class,
            ))
            self.generic_visit(node)

        # Async defs carry the same fields we need.
        visit_AsyncFunctionDef = visit_FunctionDef

    FuncVisitor().visit(tree)
    return functions
def generate_test(func: FunctionInfo, existing_tests: set) -> str:
"""Generate a pytest test function for a given function."""
if func.name in existing_tests:
return ''
# Skip private/dunder methods
if func.name.startswith('_') and not func.name.startswith('__'):
return ''
if func.name.startswith('__') and func.name.endswith('__'):
return ''
lines = []
# Generate imports
module = func.module_path.replace('/', '.').lstrip('.')
if func.class_name:
lines.append(f"from {module} import {func.class_name}")
else:
lines.append(f"from {module} import {func.name}")
lines.append('')
lines.append('')
# Test function name
test_name = f"test_{func.qualified_name.replace('.', '_')}"
# Determine args for the test call
args_str = ', '.join(func.args)
lines.append(f"def {test_name}():")
lines.append(f' """Test {func.qualified_name} (line {func.lineno} in {func.filepath})."""')
if func.is_method:
lines.append(f" # TODO: instantiate {func.class_name} with valid args")
lines.append(f" obj = {func.class_name}()")
lines.append(f" result = obj.{func.name}({', '.join('None' for _ in func.args) if func.args else ''})")
else:
if func.args:
lines.append(f" # TODO: provide valid arguments for: {args_str}")
lines.append(f" result = {func.name}({', '.join('None' for _ in func.args)})")
else:
lines.append(f" result = {func.name}()")
lines.append(f" assert result is not None or result is None # TODO: real assertion")
lines.append('')
lines.append('')
# Edge cases
lines.append(f"def {test_name}_edge_cases():")
lines.append(f' """Edge cases for {func.qualified_name}."""')
if func.args:
lines.append(f" # Test with empty/zero/None args")
if func.is_method:
lines.append(f" obj = {func.class_name}()")
for arg in func.args:
lines.append(f" # obj.{func.name}({arg}=...) # TODO: test with invalid {arg}")
else:
for arg in func.args:
lines.append(f" # {func.name}({arg}=...) # TODO: test with invalid {arg}")
else:
lines.append(f" # {func.qualified_name} takes no args — test idempotency")
if func.is_method:
lines.append(f" obj = {func.class_name}()")
lines.append(f" r1 = obj.{func.name}()")
lines.append(f" r2 = obj.{func.name}()")
lines.append(f" # assert r1 == r2 # TODO: uncomment if deterministic")
else:
lines.append(f" r1 = {func.name}()")
lines.append(f" r2 = {func.name}()")
lines.append(f" # assert r1 == r2 # TODO: uncomment if deterministic")
lines.append('')
lines.append('')
return '\n'.join(lines)
def scan_repo(repo_path: str) -> list:
    """Walk a repository and collect FunctionInfo records for every
    Python source file, skipping hidden and vendored directories.
    """
    skip_dirs = ('__pycache__', 'venv', 'node_modules', 'env')
    collected = []
    for dirpath, subdirs, filenames in os.walk(repo_path):
        # Prune in place so os.walk never descends into skipped trees.
        subdirs[:] = [
            d for d in subdirs
            if not d.startswith('.') and d not in skip_dirs
        ]
        candidates = [
            name for name in filenames
            if name.endswith('.py') and not name.startswith('_')
        ]
        for name in candidates:
            full = os.path.join(dirpath, name)
            rel = os.path.relpath(full, repo_path)
            found = extract_functions(full)
            # Store repo-relative paths so generated imports are portable.
            for info in found:
                info.filepath = rel
            collected.extend(found)
    return collected
def find_existing_tests(repo_path: str) -> set:
"""Find function names that already have tests."""
tested = set()
tests_dir = os.path.join(repo_path, 'tests')
if not os.path.isdir(tests_dir):
return tested
for root, dirs, files in os.walk(tests_dir):
for f in files:
if f.startswith('test_') and f.endswith('.py'):
try:
source = open(os.path.join(root, f)).read()
tree = ast.parse(source)
for node in ast.walk(tree):
if isinstance(node, ast.FunctionDef) and node.name.startswith('test_'):
# Extract function name from test name
name = node.name[5:] # strip 'test_'
tested.add(name)
except (SyntaxError, UnicodeDecodeError):
pass
return tested
def main():
    """CLI entry point: scan a repo and emit pytest stubs for untested functions."""
    parser = argparse.ArgumentParser(description='Generate test stubs for uncovered functions')
    parser.add_argument('repo', help='Path to repository')
    parser.add_argument('--output', '-o', default=None, help='Output file (default: stdout)')
    parser.add_argument('--limit', '-n', type=int, default=50, help='Max tests to generate')
    opts = parser.parse_args()

    repo = os.path.abspath(opts.repo)
    if not os.path.isdir(repo):
        print(f"Error: {repo} is not a directory", file=sys.stderr)
        sys.exit(1)

    functions = scan_repo(repo)
    existing = find_existing_tests(repo)
    # Keep only public functions without an existing test_<name>.
    untested = [fn for fn in functions
                if fn.name not in existing and not fn.name.startswith('_')]
    print(f"Found {len(functions)} functions, {len(untested)} untested", file=sys.stderr)

    # Assemble the generated module: header docstring, pytest import,
    # then one stub chunk per untested function (capped by --limit).
    pieces = [
        '"""Auto-generated test stubs from codebase_genome.py.\n',
        'These are starting points — fill in real assertions and args.\n"""',
        'import pytest',
        '',
    ]
    generated = 0
    for fn in untested[:opts.limit]:
        stub = generate_test(fn, set())
        if stub:
            pieces.append(stub)
            generated += 1
    content = '\n'.join(pieces)

    if opts.output:
        with open(opts.output, 'w') as out:
            out.write(content)
        print(f"Generated {generated} test stubs → {opts.output}", file=sys.stderr)
    else:
        print(content)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,142 @@
# Emacs Sovereign Control Plane
Real-time, programmable orchestration hub for the Timmy Foundation fleet.
## Architecture
```
┌─────────────────────────────────────────────────────────────┐
│ Emacs Control Plane │
│ │
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
│ │ dispatch.org│ │ shared │ │ org-babel │ │
│ │ (Task Queue)│ │ buffers │ │ notebooks │ │
│ └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ │
│ │ │ │ │
│ └────────────────┼────────────────┘ │
│ │ │
│ ┌─────▼─────┐ │
│ │ Emacs │ │
│ │ Daemon │ │
│ │ (bezalel)│ │
│ └─────┬─────┘ │
└──────────────────────────┼──────────────────────────────────┘
┌──────────────────┼──────────────────┐
│ │ │
┌────▼────┐ ┌────▼────┐ ┌────▼────┐
│ Ezra │ │ Allegro │ │ Timmy │
│ (VPS) │ │ (VPS) │ │ (Mac) │
└─────────┘ └─────────┘ └─────────┘
```
## Infrastructure
| Component | Location | Details |
|-----------|----------|---------|
| Daemon Host | Bezalel (`159.203.146.185`) | Shared Emacs daemon |
| Socket Path | `/root/.emacs.d/server/bezalel` | emacsclient socket |
| Dispatch Hub | `/srv/fleet/workspace/dispatch.org` | Central task queue |
| Wrapper | `/usr/local/bin/fleet-append` | Quick message append |
## Quick Start
### From Local Machine (Timmy)
```bash
# Append a message to the fleet log
scripts/fleet_dispatch.sh append "Status: all systems nominal"
# Check for pending tasks assigned to Timmy
scripts/fleet_dispatch.sh poll timmy
# Claim a task
scripts/fleet_dispatch.sh claim 42 timmy
# Report task completion
scripts/emacs_fleet_bridge.py complete 42 "PR merged: #123"
```
### From Other VPS Agents (Ezra, Allegro, etc.)
```bash
# Direct emacsclient via SSH
ssh root@bezalel 'emacsclient -s /root/.emacs.d/server/bezalel -e "(your-elisp-here)"'
# Or use the wrapper
ssh root@bezalel '/usr/local/bin/fleet-append "Ezra: task #42 complete"'
```
## dispatch.org Structure
The central dispatch hub uses Org mode format:
```org
* TODO [timmy] Review PR #123 from gitea
SCHEDULED: <2026-04-13 Sun>
:PROPERTIES:
:PRIORITY: A
:ASSIGNEE: timmy
:GITEA_PR: https://forge.alexanderwhitestone.com/...
:END:
* IN_PROGRESS [ezra] Deploy monitoring to VPS
SCHEDULED: <2026-04-13 Sun>
:PROPERTIES:
:PRIORITY: B
:ASSIGNEE: ezra
:STARTED: 2026-04-13T15:30:00Z
:END:
* DONE [allegro] Fix cron reliability
CLOSED: [2026-04-13 Sun 14:00]
:PROPERTIES:
:ASSIGNEE: allegro
:RESULT: PR #456 merged
:END:
```
### Status Keywords
- `TODO` — Available for claiming
- `IN_PROGRESS` — Being worked on
- `WAITING` — Blocked on external dependency
- `DONE` — Completed
- `CANCELLED` — No longer needed
### Priority Levels
- `A` (`:PRIORITY: A`) — Critical / P0
- `B` (`:PRIORITY: B`) — Important / P1
- `C` (`:PRIORITY: C`) — Normal / P2

Note: entries in `dispatch.org` store priority as the `:PRIORITY:` property (as in the examples above), not as Org's inline `[#A]` priority cookie; both notations refer to the same three levels.
## Agent Workflow
1. **Poll:** Check `dispatch.org` for `TODO` items matching your agent name
2. **Claim:** Update status from `TODO` to `IN_PROGRESS`, add `:STARTED:` timestamp
3. **Execute:** Do the work (implement, deploy, test, etc.)
4. **Report:** Update status to `DONE`, add `:RESULT:` property with outcome
## Integration with Existing Systems
### Gitea Issues
- `dispatch.org` tasks can reference Gitea issues via `:GITEA_PR:` or `:GITEA_ISSUE:` properties
- Completion can auto-close Gitea issues via API
### Hermes Cron
- Hermes cron jobs can check `dispatch.org` before running
- Tasks in `dispatch.org` take priority over ambient issue burning
### Nostr Protocol
- Heartbeats still go through Nostr (kind 1)
- `dispatch.org` is for tactical coordination, Nostr is for strategic announcements
## Files
```
infrastructure/emacs-control-plane/
├── README.md # This file
├── dispatch.org.template # Template dispatch file
└── fleet_bridge.el # Emacs Lisp helpers
scripts/
├── fleet_dispatch.sh # Shell wrapper for fleet operations
├── emacs_fleet_bridge.py # Python bridge for Emacs daemon
└── emacs_task_poller.py # Poll for tasks assigned to an agent
```

View File

@@ -0,0 +1,50 @@
#+TITLE: Fleet Dispatch Hub
#+AUTHOR: Timmy Foundation
#+DATE: 2026-04-13
#+PROPERTY: header-args :tangle no
* Overview
This is the central task queue for the Timmy Foundation fleet.
Agents poll this file for =TODO= items matching their name.
* How to Use
1. Agents: Poll for =TODO= items with your assignee tag
2. Claim: Move to =IN_PROGRESS= with =:STARTED:= timestamp
3. Complete: Move to =DONE= with =:RESULT:= property
* Fleet Status
** Heartbeats
- timmy: LAST_HEARTBEAT <2026-04-13 Sun 15:00>
- ezra: LAST_HEARTBEAT <2026-04-13 Sun 15:00>
- allegro: LAST_HEARTBEAT <2026-04-13 Sun 14:55>
- bezalel: LAST_HEARTBEAT <2026-04-13 Sun 15:00>
* Tasks
** TODO [timmy] Example task — review pending PRs
SCHEDULED: <2026-04-13 Sun>
:PROPERTIES:
:PRIORITY: B
:ASSIGNEE: timmy
:GITEA_ISSUE: https://forge.alexanderwhitestone.com/Timmy_Foundation/timmy-home/issues/590
:END:
Check all open PRs across fleet repos and triage.
** TODO [ezra] Example task — run fleet health check
SCHEDULED: <2026-04-13 Sun>
:PROPERTIES:
:PRIORITY: C
:ASSIGNEE: ezra
:END:
SSH into each VPS and verify services are running.
** TODO [allegro] Example task — update cron job configs
SCHEDULED: <2026-04-13 Sun>
:PROPERTIES:
:PRIORITY: C
:ASSIGNEE: allegro
:END:
Review and update cron job definitions in timmy-config.
* Completed
#+BEGIN: clocktable :scope file :maxlevel 2
#+END:

202
scripts/fleet_dispatch.sh Normal file
View File

@@ -0,0 +1,202 @@
#!/bin/bash
# ============================================================================
# Fleet Dispatch — Shell wrapper for Emacs Control Plane operations
# ============================================================================
#
# Usage:
# scripts/fleet_dispatch.sh append "Message text"
# scripts/fleet_dispatch.sh poll [agent_name]
# scripts/fleet_dispatch.sh claim TASK_ID agent_name
# scripts/fleet_dispatch.sh complete TASK_ID "Result text"
# scripts/fleet_dispatch.sh status
#
# Environment:
# FLEET_DAEMON_HOST — Bezalel host (default: 159.203.146.185)
# FLEET_DAEMON_USER — SSH user (default: root)
# FLEET_DAEMON_SOCKET — Emacs socket path (default: /root/.emacs.d/server/bezalel)
# FLEET_DISPATCH_FILE — Path to dispatch.org on remote (default: /srv/fleet/workspace/dispatch.org)
# ============================================================================
set -euo pipefail

# ── Configuration ──────────────────────────────────────────────────────────
# Every setting is environment-overridable; the values below are defaults.
FLEET_DAEMON_HOST="${FLEET_DAEMON_HOST:-159.203.146.185}"
FLEET_DAEMON_USER="${FLEET_DAEMON_USER:-root}"
FLEET_DAEMON_SOCKET="${FLEET_DAEMON_SOCKET:-/root/.emacs.d/server/bezalel}"
FLEET_DISPATCH_FILE="${FLEET_DISPATCH_FILE:-/srv/fleet/workspace/dispatch.org}"

# ANSI color escape sequences for terminal output (NC = reset).
GREEN='\033[0;32m'
CYAN='\033[0;36m'
YELLOW='\033[0;33m'
RED='\033[0;31m'
NC='\033[0m'
# ── Helper: Run emacsclient command on Bezalel ─────────────────────────────
# Evaluate an elisp form on the remote Emacs daemon via SSH; stderr is
# discarded so callers only see emacsclient's result.
# NOTE(review): $elisp is spliced into a single-quoted remote string, so a
# form containing single quotes would break the remote command — callers
# must avoid embedded single quotes in the elisp they pass.
run_emacs() {
    local elisp="$1"
    ssh "${FLEET_DAEMON_USER}@${FLEET_DAEMON_HOST}" \
        "emacsclient -s ${FLEET_DAEMON_SOCKET} -e '${elisp}'" 2>/dev/null
}
# ── Helper: Read dispatch.org via SSH ──────────────────────────────────────
# Print the remote dispatch file to stdout; empty output (stderr discarded)
# signals failure to callers, which test with [ -z "$content" ].
read_dispatch() {
    ssh "${FLEET_DAEMON_USER}@${FLEET_DAEMON_HOST}" \
        "cat ${FLEET_DISPATCH_FILE}" 2>/dev/null
}
# ── Helper: Write dispatch.org via SSH ─────────────────────────────────────
# Stream stdin over SSH, replacing the remote dispatch file wholesale.
# NOTE(review): not invoked anywhere in this script — presumably kept for
# external callers; confirm before removing. Also note this overwrites the
# whole file with no locking, so concurrent writers can lose updates.
write_dispatch() {
    ssh "${FLEET_DAEMON_USER}@${FLEET_DAEMON_HOST}" \
        "cat > ${FLEET_DISPATCH_FILE}" 2>/dev/null
}
# ── Commands ───────────────────────────────────────────────────────────────

# Append a timestamped message to the fleet log (dispatch.org).
# Prefers the fleet-append wrapper on Bezalel; falls back to raw emacsclient.
# NOTE(review): the message is spliced into remote single quotes, so messages
# containing single quotes will break the remote command.
cmd_append() {
    local message="${1:?Usage: fleet_dispatch.sh append \"message\"}"
    local timestamp
    timestamp=$(date -u +"%Y-%m-%d %H:%M:%S UTC")
    echo -e "${CYAN}Appending to fleet log...${NC}"
    # Fix: the original concatenated "${timestamp}${message}" with no
    # separator, producing entries like "...UTCmessage" in the log.
    if ssh "${FLEET_DAEMON_USER}@${FLEET_DAEMON_HOST}" "which fleet-append" &>/dev/null; then
        ssh "${FLEET_DAEMON_USER}@${FLEET_DAEMON_HOST}" \
            "fleet-append '${timestamp} ${message}'"
    else
        run_emacs "(with-current-buffer (find-file-noselect \"${FLEET_DISPATCH_FILE}\") (goto-char (point-max)) (insert \"\\n- ${timestamp} ${message}\") (save-buffer))"
    fi
    echo -e "${GREEN}✓ Appended: ${message}${NC}"
}
# List pending TODO tasks from dispatch.org, optionally filtered to one agent.
cmd_poll() {
    local agent="${1:-}"
    echo -e "${CYAN}Polling dispatch.org for tasks...${NC}"
    local content
    content=$(read_dispatch)
    if [ -z "$content" ]; then
        echo -e "${RED}Could not read dispatch.org${NC}"
        return 1
    fi
    echo -e "${YELLOW}=== Pending Tasks ===${NC}"
    # Headings look like "** TODO [agent] Title"; the || branch also keeps
    # set -e from aborting when grep finds nothing.
    if [ -n "$agent" ]; then
        grep -E "^\*\* TODO \[${agent}\]" <<<"$content" || echo " No tasks for ${agent}"
    else
        grep -E "^\*\* TODO " <<<"$content" || echo " No pending tasks"
    fi
}
# Claim a task: flip its Org heading from TODO to IN_PROGRESS and stamp a
# :STARTED: property. The task is located by searching TODO headings for
# the literal text "#<task_id>".
# NOTE(review): task_id is spliced into both shell and elisp strings — IDs
# containing quotes or regex metacharacters will break the command.
# NOTE(review): `agent` is only echoed locally; it is not recorded in the
# org entry (no :ASSIGNEE: update) — confirm whether that is intended.
cmd_claim() {
    local task_id="${1:?Usage: fleet_dispatch.sh claim TASK_ID agent}"
    local agent="${2:?Usage: fleet_dispatch.sh claim TASK_ID agent}"
    local timestamp
    timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
    echo -e "${CYAN}Claiming task #${task_id} for ${agent}...${NC}"
    # Remote Emacs: find the first "** TODO ...#ID" heading, mark it
    # IN_PROGRESS, and record the claim time; message on success/failure.
    run_emacs "(progn (with-current-buffer (find-file-noselect \"${FLEET_DISPATCH_FILE}\") (org-mode) (goto-char (point-min)) (if (re-search-forward (format \"^\\\\*\\\\* TODO.*#%s\" \"${task_id}\") nil t) (progn (org-todo \"IN_PROGRESS\") (org-set-property \"STARTED\" \"${timestamp}\") (save-buffer) (message \"Task %s claimed\" \"${task_id}\")) (message \"Task %s not found\" \"${task_id}\"))))"
    echo -e "${GREEN}✓ Task #${task_id} claimed by ${agent}${NC}"
}
# Complete a task: flip its heading from IN_PROGRESS to DONE, attach the
# result text as a :RESULT: property, and set the Org CLOSED timestamp.
# NOTE(review): the local "✓ completed" echo prints even when the remote
# search found no matching task; only the emacsclient message distinguishes
# the two outcomes.
# NOTE(review): task_id/result are spliced into the elisp string — values
# with quotes or regex metacharacters will break the command.
cmd_complete() {
    local task_id="${1:?Usage: fleet_dispatch.sh complete TASK_ID \"result\"}"
    local result="${2:-Completed}"
    local timestamp
    timestamp=$(date -u +"%Y-%m-%d %H:%M")
    echo -e "${CYAN}Completing task #${task_id}...${NC}"
    # Remote Emacs: locate "** IN_PROGRESS ...#ID", mark DONE, record the
    # result, and let Org add the CLOSED planning line.
    run_emacs "(progn (with-current-buffer (find-file-noselect \"${FLEET_DISPATCH_FILE}\") (org-mode) (goto-char (point-min)) (if (re-search-forward (format \"^\\\\*\\\\* IN_PROGRESS.*#%s\" \"${task_id}\") nil t) (progn (org-todo \"DONE\") (org-set-property \"RESULT\" \"${result}\") (org-add-planning-info 'closed (org-current-effective-time)) (save-buffer) (message \"Task %s completed\" \"${task_id}\")) (message \"Task %s not found\" \"${task_id}\"))))"
    echo -e "${GREEN}✓ Task #${task_id} completed: ${result}${NC}"
}
# Show control-plane connectivity, Emacs daemon state, and task counts.
cmd_status() {
    echo -e "${CYAN}Fleet Control Plane Status${NC}"
    echo -e " Host: ${FLEET_DAEMON_HOST}"
    echo -e " Socket: ${FLEET_DAEMON_SOCKET}"
    echo -e " Dispatch: ${FLEET_DISPATCH_FILE}"
    echo ""
    # SSH connectivity first — nothing else can work without it.
    if ssh -o ConnectTimeout=5 "${FLEET_DAEMON_USER}@${FLEET_DAEMON_HOST}" "echo ok" &>/dev/null; then
        echo -e " SSH: ${GREEN}✓ reachable${NC}"
    else
        echo -e " SSH: ${RED}✗ unreachable${NC}"
        return 1
    fi
    # Probe the Emacs daemon through emacsclient; "error" covers SSH or
    # emacsclient failure.
    local daemon_status
    daemon_status=$(run_emacs "(if (server-running-p) \"running\" \"stopped\")" 2>/dev/null || echo "error")
    if [ "$daemon_status" = "\"running\"" ]; then
        echo -e " Daemon: ${GREEN}✓ running${NC}"
    else
        echo -e " Daemon: ${RED}${daemon_status}${NC}"
    fi
    # Count tasks by state.
    local content
    content=$(read_dispatch 2>/dev/null || echo "")
    if [ -n "$content" ]; then
        local todo_count in_progress_count done_count
        # Fix: `grep -c ... || echo 0` double-printed ("0" then another
        # "0") because grep -c already outputs 0 — and exits 1 — when
        # nothing matches. `|| true` keeps set -e happy without emitting
        # a second line into the captured variable.
        todo_count=$(echo "$content" | grep -c "^\*\* TODO " || true)
        in_progress_count=$(echo "$content" | grep -c "^\*\* IN_PROGRESS " || true)
        done_count=$(echo "$content" | grep -c "^\*\* DONE " || true)
        echo -e " Tasks: ${YELLOW}${todo_count} TODO${NC}, ${CYAN}${in_progress_count} IN_PROGRESS${NC}, ${GREEN}${done_count} DONE${NC}"
    fi
}
# ── Main ───────────────────────────────────────────────────────────────────
# Dispatch on the first CLI argument. Defaulting to "help" when no argument
# is given both shows usage and keeps `set -u` from tripping on unset $1.
case "${1:-help}" in
    append|log)
        shift
        cmd_append "$@"
        ;;
    poll|check)
        shift
        cmd_poll "$@"
        ;;
    claim)
        shift
        cmd_claim "$@"
        ;;
    complete|done)
        shift
        cmd_complete "$@"
        ;;
    status)
        cmd_status
        ;;
    help|--help|-h)
        echo "Fleet Dispatch — Emacs Control Plane wrapper"
        echo ""
        echo "Usage:"
        echo " $0 append \"message\" Append to fleet log"
        echo " $0 poll [agent] Check for pending tasks"
        echo " $0 claim TASK_ID agent Claim a task"
        echo " $0 complete TASK_ID \"result\" Mark task complete"
        echo " $0 status Show control plane status"
        echo ""
        echo "Environment:"
        echo " FLEET_DAEMON_HOST Bezalel host (default: 159.203.146.185)"
        echo " FLEET_DAEMON_USER SSH user (default: root)"
        echo " FLEET_DAEMON_SOCKET Emacs socket (default: /root/.emacs.d/server/bezalel)"
        ;;
    *)
        # Only reachable when $1 is set (an unset $1 matches "help" above),
        # so referencing $1 here is safe under set -u.
        echo -e "${RED}Unknown command: $1${NC}"
        echo "Run '$0 help' for usage."
        exit 1
        ;;
esac

View File

@@ -0,0 +1,528 @@
"""Auto-generated test stubs from codebase_genome.py.
These are starting points — fill in real assertions and args.
"""
import pytest
# NOTE(review): auto-generated stubs (output of codebase_genome.py) committed
# as a proof of concept. Generator issues visible in this chunk:
#   - an import line is re-emitted before every stub pair; these should be
#     hoisted to the top of the file and deduplicated;
#   - `assert result is not None or result is None` is a tautology, so each
#     stub passes vacuously until a real assertion is written;
#   - property stubs call properties as methods (obj.qualified_name()), and
#     FunctionInfo() is instantiated without its required __init__ args, so
#     these stubs raise as-is;
#   - FuncVisitor is defined *inside* extract_functions, so importing it
#     from codebase_genome fails at import time.
from codebase_genome import FunctionInfo


def test_FunctionInfo_qualified_name():
    """Test FunctionInfo.qualified_name (line 31 in codebase_genome.py)."""
    # TODO: instantiate FunctionInfo with valid args
    obj = FunctionInfo()
    result = obj.qualified_name()
    assert result is not None or result is None # TODO: real assertion


def test_FunctionInfo_qualified_name_edge_cases():
    """Edge cases for FunctionInfo.qualified_name."""
    # FunctionInfo.qualified_name takes no args — test idempotency
    obj = FunctionInfo()
    r1 = obj.qualified_name()
    r2 = obj.qualified_name()
    # assert r1 == r2 # TODO: uncomment if deterministic


from codebase_genome import FunctionInfo


def test_FunctionInfo_import_path():
    """Test FunctionInfo.import_path (line 37 in codebase_genome.py)."""
    # TODO: instantiate FunctionInfo with valid args
    obj = FunctionInfo()
    result = obj.import_path()
    assert result is not None or result is None # TODO: real assertion


def test_FunctionInfo_import_path_edge_cases():
    """Edge cases for FunctionInfo.import_path."""
    # FunctionInfo.import_path takes no args — test idempotency
    obj = FunctionInfo()
    r1 = obj.import_path()
    r2 = obj.import_path()
    # assert r1 == r2 # TODO: uncomment if deterministic


from codebase_genome import FunctionInfo


def test_FunctionInfo_module_path():
    """Test FunctionInfo.module_path (line 50 in codebase_genome.py)."""
    # TODO: instantiate FunctionInfo with valid args
    obj = FunctionInfo()
    result = obj.module_path()
    assert result is not None or result is None # TODO: real assertion


def test_FunctionInfo_module_path_edge_cases():
    """Edge cases for FunctionInfo.module_path."""
    # FunctionInfo.module_path takes no args — test idempotency
    obj = FunctionInfo()
    r1 = obj.module_path()
    r2 = obj.module_path()
    # assert r1 == r2 # TODO: uncomment if deterministic


from codebase_genome import extract_functions


def test_extract_functions():
    """Test extract_functions (line 58 in codebase_genome.py)."""
    # TODO: provide valid arguments for: filepath
    result = extract_functions(None)
    assert result is not None or result is None # TODO: real assertion


def test_extract_functions_edge_cases():
    """Edge cases for extract_functions."""
    # Test with empty/zero/None args
    # extract_functions(filepath=...) # TODO: test with invalid filepath


from codebase_genome import FuncVisitor  # NOTE(review): fails — FuncVisitor is nested inside extract_functions


def test_FuncVisitor_visit_ClassDef():
    """Test FuncVisitor.visit_ClassDef (line 72 in codebase_genome.py)."""
    # TODO: instantiate FuncVisitor with valid args
    obj = FuncVisitor()
    result = obj.visit_ClassDef(None)
    assert result is not None or result is None # TODO: real assertion


def test_FuncVisitor_visit_ClassDef_edge_cases():
    """Edge cases for FuncVisitor.visit_ClassDef."""
    # Test with empty/zero/None args
    obj = FuncVisitor()
    # obj.visit_ClassDef(node=...) # TODO: test with invalid node


from codebase_genome import FuncVisitor


def test_FuncVisitor_visit_FunctionDef():
    """Test FuncVisitor.visit_FunctionDef (line 78 in codebase_genome.py)."""
    # TODO: instantiate FuncVisitor with valid args
    obj = FuncVisitor()
    result = obj.visit_FunctionDef(None)
    assert result is not None or result is None # TODO: real assertion


def test_FuncVisitor_visit_FunctionDef_edge_cases():
    """Edge cases for FuncVisitor.visit_FunctionDef."""
    # Test with empty/zero/None args
    obj = FuncVisitor()
    # obj.visit_FunctionDef(node=...) # TODO: test with invalid node


from codebase_genome import generate_test


def test_generate_test():
    """Test generate_test (line 116 in codebase_genome.py)."""
    # TODO: provide valid arguments for: func, existing_tests
    result = generate_test(None, None)
    assert result is not None or result is None # TODO: real assertion


def test_generate_test_edge_cases():
    """Edge cases for generate_test."""
    # Test with empty/zero/None args
    # generate_test(func=...) # TODO: test with invalid func
    # generate_test(existing_tests=...) # TODO: test with invalid existing_tests


from codebase_genome import scan_repo


def test_scan_repo():
    """Test scan_repo (line 191 in codebase_genome.py)."""
    # TODO: provide valid arguments for: repo_path
    result = scan_repo(None)
    assert result is not None or result is None # TODO: real assertion


def test_scan_repo_edge_cases():
    """Edge cases for scan_repo."""
    # Test with empty/zero/None args
    # scan_repo(repo_path=...) # TODO: test with invalid repo_path


from codebase_genome import find_existing_tests


def test_find_existing_tests():
    """Test find_existing_tests (line 209 in codebase_genome.py)."""
    # TODO: provide valid arguments for: repo_path
    result = find_existing_tests(None)
    assert result is not None or result is None # TODO: real assertion


def test_find_existing_tests_edge_cases():
    """Edge cases for find_existing_tests."""
    # Test with empty/zero/None args
    # find_existing_tests(repo_path=...) # TODO: test with invalid repo_path


from codebase_genome import main


def test_main():
    """Test main (line 231 in codebase_genome.py)."""
    result = main()
    assert result is not None or result is None # TODO: real assertion


def test_main_edge_cases():
    """Edge cases for main."""
    # main takes no args — test idempotency
    r1 = main()
    r2 = main()
    # assert r1 == r2 # TODO: uncomment if deterministic
from evennia_tools.layout import room_keys
def test_room_keys():
"""Test room_keys (line 54 in evennia_tools/layout.py)."""
result = room_keys()
assert result is not None or result is None # TODO: real assertion
def test_room_keys_edge_cases():
"""Edge cases for room_keys."""
# room_keys takes no args — test idempotency
r1 = room_keys()
r2 = room_keys()
# assert r1 == r2 # TODO: uncomment if deterministic
from evennia_tools.layout import grouped_exits
def test_grouped_exits():
"""Test grouped_exits (line 58 in evennia_tools/layout.py)."""
result = grouped_exits()
assert result is not None or result is None # TODO: real assertion
def test_grouped_exits_edge_cases():
"""Edge cases for grouped_exits."""
# grouped_exits takes no args — test idempotency
r1 = grouped_exits()
r2 = grouped_exits()
# assert r1 == r2 # TODO: uncomment if deterministic
from evennia_tools.telemetry import telemetry_dir
def test_telemetry_dir():
"""Test telemetry_dir (line 8 in evennia_tools/telemetry.py)."""
# TODO: provide valid arguments for: base_dir
result = telemetry_dir(None)
assert result is not None or result is None # TODO: real assertion
def test_telemetry_dir_edge_cases():
"""Edge cases for telemetry_dir."""
# Test with empty/zero/None args
# telemetry_dir(base_dir=...) # TODO: test with invalid base_dir
from evennia_tools.telemetry import event_log_path
def test_event_log_path():
"""Test event_log_path (line 16 in evennia_tools/telemetry.py)."""
# TODO: provide valid arguments for: session_id, base_dir
result = event_log_path(None, None)
assert result is not None or result is None # TODO: real assertion
def test_event_log_path_edge_cases():
"""Edge cases for event_log_path."""
# Test with empty/zero/None args
# event_log_path(session_id=...) # TODO: test with invalid session_id
# event_log_path(base_dir=...) # TODO: test with invalid base_dir
from evennia_tools.telemetry import session_meta_path
def test_session_meta_path():
"""Test session_meta_path (line 21 in evennia_tools/telemetry.py)."""
# TODO: provide valid arguments for: session_id, base_dir
result = session_meta_path(None, None)
assert result is not None or result is None # TODO: real assertion
def test_session_meta_path_edge_cases():
"""Edge cases for session_meta_path."""
# Test with empty/zero/None args
# session_meta_path(session_id=...) # TODO: test with invalid session_id
# session_meta_path(base_dir=...) # TODO: test with invalid base_dir
from evennia_tools.telemetry import append_event
def test_append_event():
"""Test append_event (line 43 in evennia_tools/telemetry.py)."""
# TODO: provide valid arguments for: session_id, event, base_dir
result = append_event(None, None, None)
assert result is not None or result is None # TODO: real assertion
def test_append_event_edge_cases():
"""Edge cases for append_event."""
# Test with empty/zero/None args
# append_event(session_id=...) # TODO: test with invalid session_id
# append_event(event=...) # TODO: test with invalid event
# append_event(base_dir=...) # TODO: test with invalid base_dir
from evennia_tools.telemetry import excerpt
def test_excerpt():
"""Test excerpt (line 55 in evennia_tools/telemetry.py)."""
# TODO: provide valid arguments for: text, limit
result = excerpt(None, None)
assert result is not None or result is None # TODO: real assertion
def test_excerpt_edge_cases():
"""Edge cases for excerpt."""
# Test with empty/zero/None args
# excerpt(text=...) # TODO: test with invalid text
# excerpt(limit=...) # TODO: test with invalid limit
from evennia_tools.training import example_trace_path
def test_example_trace_path():
"""Test example_trace_path (line 14 in evennia_tools/training.py)."""
# TODO: provide valid arguments for: repo_root
result = example_trace_path(None)
assert result is not None or result is None # TODO: real assertion
def test_example_trace_path_edge_cases():
"""Edge cases for example_trace_path."""
# Test with empty/zero/None args
# example_trace_path(repo_root=...) # TODO: test with invalid repo_root
from evennia_tools.training import example_eval_path
def test_example_eval_path():
"""Test example_eval_path (line 18 in evennia_tools/training.py)."""
# TODO: provide valid arguments for: repo_root
result = example_eval_path(None)
assert result is not None or result is None # TODO: real assertion
def test_example_eval_path_edge_cases():
"""Edge cases for example_eval_path."""
# Test with empty/zero/None args
# example_eval_path(repo_root=...) # TODO: test with invalid repo_root
from metrics.model_tracker import is_local
def test_is_local():
"""Test is_local (line 49 in metrics/model_tracker.py)."""
# TODO: provide valid arguments for: model
result = is_local(None)
assert result is not None or result is None # TODO: real assertion
def test_is_local_edge_cases():
"""Edge cases for is_local."""
# Test with empty/zero/None args
# is_local(model=...) # TODO: test with invalid model
from metrics.model_tracker import init_db


def test_init_db():
    """Test init_db (line 61 in metrics/model_tracker.py)."""
    outcome = init_db()
    # Placeholder tautology — tighten once the return contract is known.
    assert outcome is None or outcome is not None  # TODO: real assertion


def test_init_db_edge_cases():
    """Edge cases for init_db."""
    # Zero-argument function: calling twice probes idempotency.
    first = init_db()
    second = init_db()
    # assert first == second  # TODO: uncomment if deterministic
from metrics.model_tracker import ingest_from_hermes


def test_ingest_from_hermes():
    """Test ingest_from_hermes (line 107 in metrics/model_tracker.py)."""
    # TODO: supply valid values for conn and days.
    outcome = ingest_from_hermes(None, None)
    # Placeholder tautology — tighten once real inputs exist.
    assert outcome is None or outcome is not None  # TODO: real assertion


def test_ingest_from_hermes_edge_cases():
    """Edge cases for ingest_from_hermes."""
    # Exercise empty/zero/None variants of each argument:
    #   ingest_from_hermes(conn=...)  # TODO: test with invalid conn
    #   ingest_from_hermes(days=...)  # TODO: test with invalid days
from metrics.model_tracker import report


def test_report():
    """Test report (line 168 in metrics/model_tracker.py)."""
    # TODO: supply valid values for conn and days.
    outcome = report(None, None)
    # Placeholder tautology — tighten once real inputs exist.
    assert outcome is None or outcome is not None  # TODO: real assertion


def test_report_edge_cases():
    """Edge cases for report."""
    # Exercise empty/zero/None variants of each argument:
    #   report(conn=...)  # TODO: test with invalid conn
    #   report(days=...)  # TODO: test with invalid days
from metrics.model_tracker import record_eval


def test_record_eval():
    """Test record_eval (line 226 in metrics/model_tracker.py)."""
    # TODO: supply valid values for conn, model, task, score, latency,
    # tokens_in, tokens_out, and notes.
    outcome = record_eval(None, None, None, None, None, None, None, None)
    # Placeholder tautology — tighten once real inputs exist.
    assert outcome is None or outcome is not None  # TODO: real assertion


def test_record_eval_edge_cases():
    """Edge cases for record_eval."""
    # Exercise empty/zero/None variants of each argument:
    #   record_eval(conn=...)       # TODO: test with invalid conn
    #   record_eval(model=...)      # TODO: test with invalid model
    #   record_eval(task=...)       # TODO: test with invalid task
    #   record_eval(score=...)      # TODO: test with invalid score
    #   record_eval(latency=...)    # TODO: test with invalid latency
    #   record_eval(tokens_in=...)  # TODO: test with invalid tokens_in
    #   record_eval(tokens_out=...) # TODO: test with invalid tokens_out
    #   record_eval(notes=...)      # TODO: test with invalid notes
from metrics.model_tracker import main


def test_main():
    """Test main (line 234 in metrics/model_tracker.py)."""
    outcome = main()
    # Placeholder tautology — tighten once the return contract is known.
    # NOTE(review): main() may parse sys.argv / have side effects — confirm
    # it is safe to invoke under pytest.
    assert outcome is None or outcome is not None  # TODO: real assertion


def test_main_edge_cases():
    """Edge cases for main."""
    # Zero-argument function: calling twice probes idempotency.
    first = main()
    second = main()
    # assert first == second  # TODO: uncomment if deterministic
from morrowind.pilot import parse_latest_perception


def test_parse_latest_perception():
    """Test parse_latest_perception (line 56 in morrowind/pilot.py)."""
    outcome = parse_latest_perception()
    # Placeholder tautology — tighten once the return contract is known.
    assert outcome is None or outcome is not None  # TODO: real assertion


def test_parse_latest_perception_edge_cases():
    """Edge cases for parse_latest_perception."""
    # Zero-argument function: calling twice probes idempotency.
    first = parse_latest_perception()
    second = parse_latest_perception()
    # assert first == second  # TODO: uncomment if deterministic
from morrowind.pilot import send_key


def test_send_key():
    """Test send_key (line 109 in morrowind/pilot.py)."""
    # TODO: supply valid values for keycode, duration, and shift.
    outcome = send_key(None, None, None)
    # Placeholder tautology — tighten once real inputs exist.
    assert outcome is None or outcome is not None  # TODO: real assertion


def test_send_key_edge_cases():
    """Edge cases for send_key."""
    # Exercise empty/zero/None variants of each argument:
    #   send_key(keycode=...)   # TODO: test with invalid keycode
    #   send_key(duration=...)  # TODO: test with invalid duration
    #   send_key(shift=...)     # TODO: test with invalid shift
from morrowind.pilot import move


def test_move():
    """Test move (line 126 in morrowind/pilot.py)."""
    # TODO: supply valid values for direction, duration, and run.
    outcome = move(None, None, None)
    # Placeholder tautology — tighten once real inputs exist.
    assert outcome is None or outcome is not None  # TODO: real assertion


def test_move_edge_cases():
    """Edge cases for move."""
    # Exercise empty/zero/None variants of each argument:
    #   move(direction=...)  # TODO: test with invalid direction
    #   move(duration=...)   # TODO: test with invalid duration
    #   move(run=...)        # TODO: test with invalid run
from morrowind.pilot import activate


def test_activate():
    """Test activate (line 138 in morrowind/pilot.py)."""
    outcome = activate()
    # Placeholder tautology — tighten once the return contract is known.
    assert outcome is None or outcome is not None  # TODO: real assertion


def test_activate_edge_cases():
    """Edge cases for activate."""
    # Zero-argument function: calling twice probes idempotency.
    first = activate()
    second = activate()
    # assert first == second  # TODO: uncomment if deterministic
from morrowind.pilot import jump


def test_jump():
    """Test jump (line 144 in morrowind/pilot.py)."""
    outcome = jump()
    # Placeholder tautology — tighten once the return contract is known.
    assert outcome is None or outcome is not None  # TODO: real assertion


def test_jump_edge_cases():
    """Edge cases for jump."""
    # Zero-argument function: calling twice probes idempotency.
    first = jump()
    second = jump()
    # assert first == second  # TODO: uncomment if deterministic

View File

@@ -0,0 +1,293 @@
# Big Brain Quality Benchmark
## Big Brain (gemma3:27b, RunPod L40S) vs Local (gemma3:1b)
**Date:** 2026-04-14
**Issue:** #576
**Milestone:** Big Brain Showcase — RunPod L40S Operational
---
## Environment
| Parameter | Big Brain | Local |
|-------------------|------------------------------------|---------------------|
| Model | gemma3:27b | gemma3:1b |
| Hardware | RunPod L40S 48GB | Apple Silicon (local Ollama) |
| Endpoint | 8lfr3j47a5r3gn-11434.proxy.runpod.net | localhost:11434 |
| Parameters | 27B | ~1B |
| Status | **OFFLINE (HTTP 404)** | Operational |
---
## Summary
The Big Brain RunPod L40S pod was **unreachable** during this benchmark session
(HTTP 404 from Cloudflare proxy — pod likely terminated or not running). All four
tasks were executed against the local gemma3:1b model only.
Despite being a 1B-parameter model, gemma3:1b produced responses for all tasks
but with significant quality issues that a 27B model would be expected to avoid.
### Quality Assessment — Local gemma3:1b
| Task | Latency | Quality Score (1-5) | Verdict |
|------|---------|---------------------|---------|
| 1. Gitea webhook parser | 4,435 ms | 2/5 | **BROKEN** — fatal bugs in HMAC logic |
| 2. Evennia architecture explainer | 1,823 ms | 2/5 | **SUPERFICIAL** — vague, inaccurate on key concepts |
| 3. Fleet burn-down cron script | 11,039 ms | 2/5 | **NON-FUNCTIONAL** — uses fake commands, no RunPod API |
| 4. Debug async Python bug | 6,610 ms | 2/5 | **WRONG DIAGNOSIS** — missed the real bugs |
---
## Task-by-Task Results
### Task 1: Python function to parse and validate a Gitea webhook payload
**Local (gemma3:1b) — 4,435 ms**
```python
import json
import hashlib
def validate_gitea_payload(payload, webhook_secret):
"""
Parses and validates a Gitea webhook payload.
Args:
payload (dict): The JSON payload to validate.
webhook_secret (str): The Gitea webhook secret.
Returns:
tuple: A tuple containing:
- bool: True if the payload is valid, False otherwise.
- str: The parsed JSON payload if valid, None otherwise.
"""
try:
data = json.loads(payload)
if not isinstance(data, dict):
return False, None
if 'ref' not in data or 'repository' not in data or 'commits' not in data:
return False, None
# Verify X-Gitea-Signature header
signature = data.get('X-Gitea-Signature', '')
if not signature:
return False, None
# Hash the signature
calculated_hash = hashlib.sha256(signature.encode('utf-64')).hexdigest()
if calculated_hash != webhook_secret:
return False, None
return True, data
except json.JSONDecodeError:
return False, None
except Exception as e:
return False, None
```
**Quality issues:**
1. **FATAL:** Uses `hashlib.sha256` instead of `hmac.new()` with HMAC-SHA256 — the entire signature verification approach is wrong
2. **FATAL:** `signature.encode('utf-64')` — no such encoding exists (would crash with LookupError)
3. **FATAL:** Reads signature from payload body instead of HTTP header — architecturally wrong; webhook signatures are always in HTTP headers, never inside the JSON body
4. The function accepts `payload` as a dict but calls `json.loads()` on it — type confusion
5. Missing actual HMAC verification: should be `hmac.new(secret, payload_body, 'sha256').hexdigest()`
**Big Brain (gemma3:27b) — UNAVAILABLE**
Expected: Would correctly use `hmac.new(key, msg, hashlib.sha256)`, read signature from
HTTP headers (not payload body), handle bytes vs string encoding properly, and include
type hints with a proper dataclass result object.
---
### Task 2: Explain the architecture of the Evennia MUD framework in 200 words
**Local (gemma3:1b) — 1,823 ms**
> Evennia MUD utilizes a Django-based server model for core logic, providing a robust
> and scalable foundation. The server is structured around three primary components: a
> Session server, a Server, and a Portal. The Session server manages user sessions,
> storing data like player states and inventory. The Server handles game logic movement,
> combat, and interactions and the Portal acts as a central interface for players.
>
> Typeclasses are crucial, allowing for polymorphic behavior functions can operate on
> different player types without modification. This is achieved through a system of
> inheritance and function signatures, enabling flexible game mechanics. The command
> infrastructure relies on a robust command parser, allowing players to interact with the
> game through a standardized set of commands.
>
> Scripting is handled through a specialized scripting language, allowing for complex game
> rules and custom events. Players use a 'script' file format, defining actions and
> conditions. A core scripting system utilizes a 'parse' function to interpret these files,
> triggering events and updating the game state. All data is stored in a database, optimized
> for efficient retrieval and manipulation.
**Quality issues:**
1. **INACCURATE:** Describes "Session server" as a separate component — Evennia has Session objects managed within the Server, not a separate server process
2. **INACCURATE:** Portal description is vague — the Portal specifically handles network connections and protocol negotiation (telnet, SSH, webclient), not just "a central interface"
3. **WRONG:** Evennia does NOT have a "specialized scripting language" — it uses Python scripts stored in the database via EvScript/Script typeclasses
4. **WRONG:** "Players use a 'script' file format" — this is fabricated; Evennia scripts are Python objects, not file-based
5. **VAGUE:** Typeclass explanation is superficial — misses the key concept that they're Django proxy models providing persistent game object behavior
6. Word count: ~185 words (close to target but imprecise)
**Big Brain (gemma3:27b) — UNAVAILABLE**
Expected: Would correctly describe Server/Portal architecture (Server=game logic+connections,
Portal=network protocol layer), explain typeclasses as Django ContentTypes-based proxy models
for Characters/Rooms/Exits/Scripts, detail CmdSet-based command routing, and describe
Script DB objects (not a scripting language).
---
### Task 3: Write a cron job description for a fleet burn-down script
**Local (gemma3:1b) — 11,039 ms**
```bash
burn_down_script() {
# Log entry
echo "$(date) - Fleet Burn-Down Started" | tee -a /var/log/fleet-burndown.log
# Iterate over all active RunPod pods
for pod in $(podman pginfo --all --all | awk '{print $1}');
do
# Check GPU utilization
gpu_util=$(podman pginfo --all --all | awk '{print $2}')
# Check if pod is idle for more than 30 minutes
idle_time=300 # 30 minutes in seconds
if [[ $(echo "$gpu_util > $idle_time" | bc) -eq 1 ]]; then
# Terminate the pod
podstop -d $pod
fi
done
# Send Slack notification
echo "Fleet Burn-Down Complete." | send -t 10 "Fleet Burn-Down Complete."
echo "Script completed."
}
```
Cron entry: `*/5 * * * * /path/to/fleet_burndown.sh`
**Quality issues:**
1. **FATAL:** Uses `podman pginfo` and `podstop` — these commands don't exist. Should use RunPod API (`curl` to `https://api.runpod.io/graphql`) or `runpodctl`
2. **FATAL:** Idle check logic is nonsensical — compares GPU utilization percentage against 300 seconds; these are incomparable units
3. **WRONG:** `idle_time=300` is 5 minutes, not 30 minutes (should be 1800)
4. **WRONG:** `send -t 10` is not a standard Slack notification method — should use `curl` to Slack webhook URL
5. **MISSING:** No actual RunPod API integration, no `RUNPOD_API_KEY` usage, no GPU utilization query
6. No error handling, no exit codes, no structured logging
7. The explanation text is verbose filler with self-evident comments
**Big Brain (gemma3:27b) — UNAVAILABLE**
Expected: Would produce a script using RunPod GraphQL API to list pods, query GPU metrics,
calculate idle time from last activity timestamp, use `curl` to terminate pods via API,
and post to Slack via webhook. Would include proper error handling, `set -euo pipefail`,
and structured JSON logging.
---
### Task 4: Debug a subtle Python async bug
**Buggy code provided:**
```python
import asyncio
import aiohttp
async def fetch_all(urls):
results = []
for url in urls:
async with aiohttp.ClientSession() as session:
resp = await session.get(url)
data = await resp.json()
results.append(data)
return results
async def main():
urls = ["https://api.example.com/data1", "https://api.example.com/data2"]
results = await fetch_all(urls)
print(f"Got {len(results)} results")
asyncio.run(main())
```
**Local (gemma3:1b) — 6,610 ms**
The model identified these "bugs":
1. "Missing `return` statement in `fetch_all`" — **WRONG**, the code clearly has `return results`
2. "No error handling for `resp.json()`" — This is a valid improvement suggestion but not a bug
3. "No `await` keyword in `main()`" — **WRONG**, `main()` is correctly `async def` and `asyncio.run()` is the entry point
The model's "fix" only added a try/except wrapper and kept all the real bugs.
**Actual bugs in the provided code:**
1. **Real Bug:** Creates a new `ClientSession` for every URL instead of reusing one session — wasteful, defeats connection pooling, and can cause resource leaks at scale
2. **Real Bug:** No `resp.raise_for_status()` — silently accepts HTTP error responses (404, 500) and tries to parse them as JSON
3. **Anti-pattern:** Sequential `await` in loop instead of `asyncio.gather()` for concurrent fetching — no parallelism despite using async
4. **Missing:** No timeout on `session.get()` — could hang forever
5. **Missing:** No error handling at all (the model's suggestion to add try/except was directionally right but missed the real bugs)
**Big Brain (gemma3:27b) — UNAVAILABLE**
Expected: Would correctly identify session reuse issue, lack of `raise_for_status()`,
sequential vs concurrent fetching, and provide a proper fix using `asyncio.gather()` with
a single shared session and timeout/deadline handling.
---
## Comparison Table
| Task | Local 1B (gemma3:1b) | Big Brain 27B (gemma3:27b) | Winner |
|------|---------------------|---------------------------|--------|
| 1. Gitea webhook parser | BROKEN — wrong HMAC, wrong encoding, wrong signature source | UNAVAILABLE (pod offline) | N/A |
| 2. Evennia architecture | SUPERFICIAL — vague, fabricated scripting language | UNAVAILABLE (pod offline) | N/A |
| 3. Fleet burn-down cron | NON-FUNCTIONAL — fake commands, unit mismatch | UNAVAILABLE (pod offline) | N/A |
| 4. Debug async bug | WRONG DIAGNOSIS — missed all real bugs | UNAVAILABLE (pod offline) | N/A |
---
## Latency Summary
| Task | Local gemma3:1b |
|------|-----------------|
| 1. Gitea webhook parser | 4,435 ms |
| 2. Evennia architecture | 1,823 ms |
| 3. Fleet burn-down cron | 11,039 ms |
| 4. Debug async bug | 6,610 ms |
| **Total** | **23,907 ms** |
Big Brain latency: N/A (pod offline)
---
## Key Finding
**The 1B model fails all four tasks in ways that would be immediately obvious to a developer.**
The failures fall into categories that large models reliably avoid:
- **Hallucinated APIs** (Task 3: `podman pginfo`, `podstop` don't exist)
- **Fundamental misunderstanding of security primitives** (Task 1: SHA-256 instead of HMAC, `utf-64` encoding)
- **Fabricated technical details** (Task 2: "specialized scripting language" in Evennia)
- **Wrong diagnosis of provided code** (Task 4: claimed bugs that don't exist, missed real bugs)
This benchmark demonstrates that even without Big Brain results, the quality gap between
1B and 27B models is expected to be substantial for technical/code generation tasks.
---
## Next Steps
1. **Restart Big Brain pod** — RunPod pod 8lfr3j47a5r3gn is returning HTTP 404
2. **Re-run benchmark** with both models online to populate the comparison table
3. Consider testing with gemma3:4b (if available) as a middle-ground comparison
4. Run Big Brain at `temperature: 0.3` for consistency with local results
---
*Generated by Ezra (Hermes Agent) — Issue #576 — 2026-04-14*