feat(crucible): Z3 sidecar MCP verifier -- rebased onto current main

Closes #86. Adds:
- bin/crucible_mcp_server.py (schedule, dependency, capacity proofs)
- docs/crucible-first-cut.md
- playbooks/verified-logic.yaml
- config.yaml crucible MCP server entry
This commit is contained in:
Alexander Whitestone
2026-04-03 18:58:43 -04:00
parent 57b87c525d
commit 8ec4bff771
4 changed files with 595 additions and 0 deletions

459
bin/crucible_mcp_server.py Normal file
View File

@@ -0,0 +1,459 @@
#!/usr/bin/env python3
"""Z3-backed Crucible MCP server for Timmy.
Sidecar-only. Lives in timmy-config, deploys into ~/.hermes/bin/, and is loaded
by Hermes through native MCP tool discovery. No hermes-agent fork required.
"""
from __future__ import annotations
import json
import os
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from mcp.server import FastMCP
from z3 import And, Bool, Distinct, If, Implies, Int, Optimize, Or, Sum, sat, unsat
mcp = FastMCP(
name="crucible",
instructions=(
"Formal verification sidecar for Timmy. Use these tools for scheduling, "
"dependency ordering, and resource/capacity feasibility. Return SAT/UNSAT "
"with witness models instead of fuzzy prose."
),
dependencies=["z3-solver"],
)
def _hermes_home() -> Path:
return Path(os.path.expanduser(os.getenv("HERMES_HOME", "~/.hermes")))
def _proof_dir() -> Path:
path = _hermes_home() / "logs" / "crucible"
path.mkdir(parents=True, exist_ok=True)
return path
def _ts() -> str:
return datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S_%fZ")
def _json_default(value: Any) -> Any:
if isinstance(value, Path):
return str(value)
raise TypeError(f"Unsupported type for JSON serialization: {type(value)!r}")
def _log_proof(tool_name: str, request: dict[str, Any], result: dict[str, Any]) -> str:
path = _proof_dir() / f"{_ts()}_{tool_name}.json"
payload = {
"timestamp": datetime.now(timezone.utc).isoformat(),
"tool": tool_name,
"request": request,
"result": result,
}
path.write_text(json.dumps(payload, indent=2, default=_json_default))
return str(path)
def _ensure_unique(names: list[str], label: str) -> None:
if len(set(names)) != len(names):
raise ValueError(f"Duplicate {label} names are not allowed: {names}")
def _normalize_dependency(dep: Any) -> tuple[str, str, int]:
if isinstance(dep, dict):
before = dep.get("before")
after = dep.get("after")
lag = int(dep.get("lag", 0))
if not before or not after:
raise ValueError(f"Dependency dict must include before/after: {dep!r}")
return str(before), str(after), lag
if isinstance(dep, (list, tuple)) and len(dep) in (2, 3):
before = str(dep[0])
after = str(dep[1])
lag = int(dep[2]) if len(dep) == 3 else 0
return before, after, lag
raise ValueError(f"Unsupported dependency shape: {dep!r}")
def _normalize_task(task: dict[str, Any]) -> dict[str, Any]:
name = str(task["name"])
duration = int(task["duration"])
if duration <= 0:
raise ValueError(f"Task duration must be positive: {task!r}")
return {"name": name, "duration": duration}
def _normalize_item(item: dict[str, Any]) -> dict[str, Any]:
name = str(item["name"])
amount = int(item["amount"])
value = int(item.get("value", amount))
required = bool(item.get("required", False))
if amount < 0:
raise ValueError(f"Item amount must be non-negative: {item!r}")
return {
"name": name,
"amount": amount,
"value": value,
"required": required,
}
def solve_schedule_tasks(
tasks: list[dict[str, Any]],
horizon: int,
dependencies: list[Any] | None = None,
fixed_starts: dict[str, int] | None = None,
max_parallel_tasks: int = 1,
minimize_makespan: bool = True,
) -> dict[str, Any]:
tasks = [_normalize_task(task) for task in tasks]
dependencies = dependencies or []
fixed_starts = fixed_starts or {}
horizon = int(horizon)
max_parallel_tasks = int(max_parallel_tasks)
if horizon <= 0:
raise ValueError("horizon must be positive")
if max_parallel_tasks <= 0:
raise ValueError("max_parallel_tasks must be positive")
names = [task["name"] for task in tasks]
_ensure_unique(names, "task")
durations = {task["name"]: task["duration"] for task in tasks}
opt = Optimize()
start = {name: Int(f"start_{name}") for name in names}
end = {name: Int(f"end_{name}") for name in names}
makespan = Int("makespan")
for name in names:
opt.add(start[name] >= 0)
opt.add(end[name] == start[name] + durations[name])
opt.add(end[name] <= horizon)
if name in fixed_starts:
opt.add(start[name] == int(fixed_starts[name]))
for dep in dependencies:
before, after, lag = _normalize_dependency(dep)
if before not in start or after not in start:
raise ValueError(f"Unknown task in dependency {dep!r}")
opt.add(start[after] >= end[before] + lag)
# Discrete resource capacity over integer time slots.
for t in range(horizon):
active = [If(And(start[name] <= t, t < end[name]), 1, 0) for name in names]
opt.add(Sum(active) <= max_parallel_tasks)
for name in names:
opt.add(makespan >= end[name])
if minimize_makespan:
opt.minimize(makespan)
result = opt.check()
proof: dict[str, Any]
if result == sat:
model = opt.model()
schedule = []
for name in sorted(names, key=lambda n: model.eval(start[n]).as_long()):
s = model.eval(start[name]).as_long()
e = model.eval(end[name]).as_long()
schedule.append({
"name": name,
"start": s,
"end": e,
"duration": durations[name],
})
proof = {
"status": "sat",
"summary": "Schedule proven feasible.",
"horizon": horizon,
"max_parallel_tasks": max_parallel_tasks,
"makespan": model.eval(makespan).as_long(),
"schedule": schedule,
"dependencies": [
{"before": b, "after": a, "lag": lag}
for b, a, lag in (_normalize_dependency(dep) for dep in dependencies)
],
}
elif result == unsat:
proof = {
"status": "unsat",
"summary": "Schedule is impossible under the given horizon/dependency/capacity constraints.",
"horizon": horizon,
"max_parallel_tasks": max_parallel_tasks,
"dependencies": [
{"before": b, "after": a, "lag": lag}
for b, a, lag in (_normalize_dependency(dep) for dep in dependencies)
],
}
else:
proof = {
"status": "unknown",
"summary": "Solver could not prove SAT or UNSAT for this schedule.",
"horizon": horizon,
"max_parallel_tasks": max_parallel_tasks,
}
proof["proof_log"] = _log_proof(
"schedule_tasks",
{
"tasks": tasks,
"horizon": horizon,
"dependencies": dependencies,
"fixed_starts": fixed_starts,
"max_parallel_tasks": max_parallel_tasks,
"minimize_makespan": minimize_makespan,
},
proof,
)
return proof
def solve_dependency_order(
entities: list[str],
before: list[Any],
fixed_positions: dict[str, int] | None = None,
) -> dict[str, Any]:
entities = [str(entity) for entity in entities]
fixed_positions = fixed_positions or {}
_ensure_unique(entities, "entity")
opt = Optimize()
pos = {entity: Int(f"pos_{entity}") for entity in entities}
opt.add(Distinct(*pos.values()))
for entity in entities:
opt.add(pos[entity] >= 0)
opt.add(pos[entity] < len(entities))
if entity in fixed_positions:
opt.add(pos[entity] == int(fixed_positions[entity]))
normalized = []
for dep in before:
left, right, _lag = _normalize_dependency(dep)
if left not in pos or right not in pos:
raise ValueError(f"Unknown entity in ordering constraint: {dep!r}")
opt.add(pos[left] < pos[right])
normalized.append({"before": left, "after": right})
result = opt.check()
if result == sat:
model = opt.model()
ordering = sorted(entities, key=lambda entity: model.eval(pos[entity]).as_long())
proof = {
"status": "sat",
"summary": "Dependency ordering is consistent.",
"ordering": ordering,
"positions": {entity: model.eval(pos[entity]).as_long() for entity in entities},
"constraints": normalized,
}
elif result == unsat:
proof = {
"status": "unsat",
"summary": "Dependency ordering contains a contradiction/cycle.",
"constraints": normalized,
}
else:
proof = {
"status": "unknown",
"summary": "Solver could not prove SAT or UNSAT for this dependency graph.",
"constraints": normalized,
}
proof["proof_log"] = _log_proof(
"order_dependencies",
{
"entities": entities,
"before": before,
"fixed_positions": fixed_positions,
},
proof,
)
return proof
def solve_capacity_fit(
items: list[dict[str, Any]],
capacity: int,
maximize_value: bool = True,
) -> dict[str, Any]:
items = [_normalize_item(item) for item in items]
capacity = int(capacity)
if capacity < 0:
raise ValueError("capacity must be non-negative")
names = [item["name"] for item in items]
_ensure_unique(names, "item")
choose = {item["name"]: Bool(f"choose_{item['name']}") for item in items}
opt = Optimize()
for item in items:
if item["required"]:
opt.add(choose[item["name"]])
total_amount = Sum([If(choose[item["name"]], item["amount"], 0) for item in items])
total_value = Sum([If(choose[item["name"]], item["value"], 0) for item in items])
opt.add(total_amount <= capacity)
if maximize_value:
opt.maximize(total_value)
result = opt.check()
if result == sat:
model = opt.model()
chosen = [item for item in items if bool(model.eval(choose[item["name"]], model_completion=True))]
skipped = [item for item in items if item not in chosen]
used = sum(item["amount"] for item in chosen)
proof = {
"status": "sat",
"summary": "Capacity constraints are feasible.",
"capacity": capacity,
"used": used,
"remaining": capacity - used,
"chosen": chosen,
"skipped": skipped,
"total_value": sum(item["value"] for item in chosen),
}
elif result == unsat:
proof = {
"status": "unsat",
"summary": "Required items exceed available capacity.",
"capacity": capacity,
"required_items": [item for item in items if item["required"]],
}
else:
proof = {
"status": "unknown",
"summary": "Solver could not prove SAT or UNSAT for this capacity check.",
"capacity": capacity,
}
proof["proof_log"] = _log_proof(
"capacity_fit",
{
"items": items,
"capacity": capacity,
"maximize_value": maximize_value,
},
proof,
)
return proof
@mcp.tool(
name="schedule_tasks",
description=(
"Crucible template for discrete scheduling. Proves whether integer-duration "
"tasks fit within a time horizon under dependency and parallelism constraints."
),
structured_output=True,
)
def schedule_tasks(
tasks: list[dict[str, Any]],
horizon: int,
dependencies: list[Any] | None = None,
fixed_starts: dict[str, int] | None = None,
max_parallel_tasks: int = 1,
minimize_makespan: bool = True,
) -> dict[str, Any]:
return solve_schedule_tasks(
tasks=tasks,
horizon=horizon,
dependencies=dependencies,
fixed_starts=fixed_starts,
max_parallel_tasks=max_parallel_tasks,
minimize_makespan=minimize_makespan,
)
@mcp.tool(
name="order_dependencies",
description=(
"Crucible template for dependency ordering. Proves whether a set of before/after "
"constraints is consistent and returns a valid topological order when SAT."
),
structured_output=True,
)
def order_dependencies(
entities: list[str],
before: list[Any],
fixed_positions: dict[str, int] | None = None,
) -> dict[str, Any]:
return solve_dependency_order(
entities=entities,
before=before,
fixed_positions=fixed_positions,
)
@mcp.tool(
name="capacity_fit",
description=(
"Crucible template for resource capacity. Proves whether required items fit "
"within a capacity budget and chooses an optimal feasible subset of optional items."
),
structured_output=True,
)
def capacity_fit(
items: list[dict[str, Any]],
capacity: int,
maximize_value: bool = True,
) -> dict[str, Any]:
return solve_capacity_fit(items=items, capacity=capacity, maximize_value=maximize_value)
def run_selftest() -> dict[str, Any]:
return {
"schedule_unsat_single_worker": solve_schedule_tasks(
tasks=[
{"name": "A", "duration": 2},
{"name": "B", "duration": 3},
{"name": "C", "duration": 4},
],
horizon=8,
dependencies=[{"before": "A", "after": "B"}],
max_parallel_tasks=1,
),
"schedule_sat_two_workers": solve_schedule_tasks(
tasks=[
{"name": "A", "duration": 2},
{"name": "B", "duration": 3},
{"name": "C", "duration": 4},
],
horizon=8,
dependencies=[{"before": "A", "after": "B"}],
max_parallel_tasks=2,
),
"ordering_sat": solve_dependency_order(
entities=["fetch", "train", "eval"],
before=[
{"before": "fetch", "after": "train"},
{"before": "train", "after": "eval"},
],
),
"capacity_sat": solve_capacity_fit(
items=[
{"name": "gpu_job", "amount": 6, "value": 6, "required": True},
{"name": "telemetry", "amount": 1, "value": 1, "required": True},
{"name": "export", "amount": 2, "value": 4, "required": False},
{"name": "viz", "amount": 3, "value": 5, "required": False},
],
capacity=8,
),
}
def main() -> int:
if len(sys.argv) > 1 and sys.argv[1] == "selftest":
print(json.dumps(run_selftest(), indent=2))
return 0
mcp.run(transport="stdio")
return 0
if __name__ == "__main__":
raise SystemExit(main())

View File

@@ -214,6 +214,13 @@ mcp_servers:
- /Users/apayne/.timmy/morrowind/mcp_server.py - /Users/apayne/.timmy/morrowind/mcp_server.py
env: {} env: {}
timeout: 30 timeout: 30
crucible:
command: /Users/apayne/.hermes/hermes-agent/venv/bin/python3
args:
- /Users/apayne/.hermes/bin/crucible_mcp_server.py
env: {}
timeout: 120
connect_timeout: 60
fallback_model: fallback_model:
provider: ollama provider: ollama
model: hermes3:latest model: hermes3:latest

View File

@@ -0,0 +1,82 @@
# Crucible First Cut
This is the first narrow neuro-symbolic slice for Timmy.
## Goal
Prove constraint logic instead of bluffing through it.
## Shape
The Crucible is a sidecar MCP server that lives in `timmy-config` and deploys into `~/.hermes/bin/`.
It is loaded by Hermes through native MCP discovery. No Hermes fork.
## Templates shipped in v0
### 1. schedule_tasks
Use for:
- deadline feasibility
- task ordering with dependencies
- small integer scheduling windows
Inputs:
- `tasks`: `[{name, duration}]`
- `horizon`: integer window size
- `dependencies`: `[{before, after, lag?}]`
- `max_parallel_tasks`: integer worker count
Outputs:
- `status: sat|unsat|unknown`
- witness schedule when SAT
- proof log path
### 2. order_dependencies
Use for:
- topological ordering
- cycle detection
- dependency consistency checks
Inputs:
- `entities`
- `before`
- optional `fixed_positions`
Outputs:
- valid ordering when SAT
- contradiction when UNSAT
- proof log path
### 3. capacity_fit
Use for:
- resource budgeting
- optional-vs-required work selection
- capacity feasibility
Inputs:
- `items: [{name, amount, value?, required?}]`
- `capacity`
Outputs:
- chosen feasible subset when SAT
- contradiction when required load exceeds capacity
- proof log path
## Demo
Run locally:
```bash
~/.hermes/hermes-agent/venv/bin/python ~/.hermes/bin/crucible_mcp_server.py selftest
```
This produces:
- one UNSAT schedule proof
- one SAT schedule proof
- one SAT dependency ordering proof
- one SAT capacity proof
## Scope guardrails
Do not force every answer through the Crucible.
Use it when the task is genuinely constraint-shaped.
If the problem does not fit one of the templates, say so plainly.

View File

@@ -0,0 +1,47 @@
name: verified-logic
description: >
Crucible-first playbook for tasks that require proof instead of plausible prose.
Use Z3-backed sidecar tools for scheduling, dependency ordering, capacity checks,
and consistency verification.
model:
preferred: claude-opus-4-6
fallback: claude-sonnet-4-20250514
max_turns: 12
temperature: 0.1
tools:
- mcp_crucible_schedule_tasks
- mcp_crucible_order_dependencies
- mcp_crucible_capacity_fit
trigger:
manual: true
steps:
- classify_problem
- choose_template
- translate_into_constraints
- verify_with_crucible
- report_sat_unsat_with_witness
output: verified_result
timeout_minutes: 5
system_prompt: |
You are running the Crucible playbook.
Use this playbook for:
- scheduling and deadline feasibility
- dependency ordering and cycle checks
- capacity / resource allocation constraints
- consistency checks where a contradiction matters
RULES:
1. Do not bluff through logic.
2. Pick the narrowest Crucible template that fits the task.
3. Translate the user's question into structured constraints.
4. Call the Crucible tool.
5. If SAT, report the witness model clearly.
6. If UNSAT, say the constraints are impossible and explain which shape of constraint caused the contradiction.
7. If the task is not a good fit for these templates, say so plainly instead of pretending it was verified.