Merge pull request #48 from AlexanderWhitestone/fix/timmy-startup-and-stability

fix: Timmy QA bugs — calculator, markdown, prompt guardrails, briefing
This commit is contained in:
Alexander Whitestone
2026-02-26 09:44:33 -05:00
committed by GitHub
8 changed files with 190 additions and 17 deletions

View File

@@ -14,6 +14,8 @@
<link rel="stylesheet" href="/static/style.css?v=4" />
{% block extra_styles %}{% endblock %}
<script src="https://unpkg.com/htmx.org@2.0.3" integrity="sha384-0895/pl2MU10Hqc6jd4RvrthNlDiE9U1tWmX7WRESftEDRosgxNsQG/Ze9YMRzHq" crossorigin="anonymous"></script>
<script src="https://cdn.jsdelivr.net/npm/marked@15.0.7/marked.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/dompurify@3.2.4/dist/purify.min.js"></script>
</head>
<body>
<header class="mc-header">

View File

@@ -5,8 +5,16 @@
{% if response %}
<div class="chat-message agent">
<div class="msg-meta">TIMMY // {{ timestamp }}</div>
<div class="msg-body">{{ response | e }}</div>
<div class="msg-body timmy-md">{{ response | e }}</div>
</div>
<script>
(function () {
  // The chat-message <div> sits immediately before this <script> element;
  // the Markdown target is the .timmy-md node inside it.
  var messageNode = document.currentScript.previousElementSibling;
  var target = messageNode.querySelector('.timmy-md');
  if (!target) {
    return;
  }
  // Both CDN libraries must have loaded; otherwise leave the plain text as-is.
  if (typeof marked === 'undefined' || typeof DOMPurify === 'undefined') {
    return;
  }
  // textContent yields the raw Markdown source; the HTML produced by marked
  // is passed through DOMPurify before it is assigned to innerHTML.
  var rawMarkdown = target.textContent;
  var renderedHtml = marked.parse(rawMarkdown);
  target.innerHTML = DOMPurify.sanitize(renderedHtml);
})();
</script>
{% elif error %}
<div class="chat-message error-msg">
<div class="msg-meta">SYSTEM // {{ timestamp }}</div>

View File

@@ -124,14 +124,21 @@ notifier = PushNotifier()
async def notify_briefing_ready(briefing) -> None:
"""Placeholder: notify the owner that a new morning briefing is ready.
"""Notify the owner that a new morning briefing is ready.
Logs to console now. Wire to real push (APNs/Pushover) later.
Only triggers a native macOS popup when there are pending approval items.
Briefings with 0 approvals are still logged but don't interrupt the user
with a notification that leads to an empty-looking page.
Args:
briefing: A timmy.briefing.Briefing instance.
"""
n_approvals = len(briefing.approval_items) if briefing.approval_items else 0
if n_approvals == 0:
logger.info("Briefing ready but no pending approvals — skipping native notification")
return
message = (
f"Your morning briefing is ready. "
f"{n_approvals} item(s) await your approval."

View File

@@ -21,7 +21,12 @@ Rules:
- Remember what the user tells you during our conversation.
- If you don't know something, say so honestly.
- Use the user's name if you know it.
- Do simple math in your head. Don't reach for tools.
- When you state a fact, commit to it. Never contradict a correct statement you
just made in the same response. If uncertain, express uncertainty at the start —
never state something confidently and then immediately undermine it.
- NEVER attempt arithmetic in your head — LLMs are unreliable at multi-digit math.
If asked to compute anything (multiply, divide, square root, exponents, etc.),
tell the user you need a calculator tool to give an exact answer.
Sir, affirmative."""
@@ -57,15 +62,17 @@ user's digital sovereignty.
### When NOT to use tools:
- Identity questions → Answer directly
- General knowledge → Answer from training
- Simple math → Calculate mentally
- Greetings → Respond conversationally
### When TO use tools:
- **calculator** — ANY arithmetic: multiplication, division, square roots, exponents,
percentages, logarithms, etc. NEVER attempt math in your head — always call this tool.
Example: calculator("347 * 829") or calculator("math.sqrt(17161)")
- **web_search** — Current events, real-time data, news
- **read_file** — User explicitly requests file reading
- **write_file** — User explicitly requests saving content
- **python** — Complex calculations, code execution
- **python** — Code execution, data processing (NOT for simple arithmetic — use calculator)
- **shell** — System operations (explicit user request)
- **memory_search** — "Have we talked about this before?", finding past context
@@ -74,6 +81,9 @@ user's digital sovereignty.
- Never narrate your reasoning process. Just give the answer.
- Never show raw tool call JSON or function syntax in responses.
- Use the user's name if known.
- When you state a fact, commit to it. Never contradict a correct statement you
just made in the same response. If uncertain, express uncertainty at the start —
never state something confidently and then immediately undermine it.
Sir, affirmative."""
@@ -101,12 +111,13 @@ you are operational and running locally."""
TOOL_USAGE_GUIDE = """
DECISION ORDER:
1. Can I answer from training data? → Answer directly (NO TOOL)
2. Is this about past conversations? → memory_search
3. Is this current/real-time info? → web_search
4. Did user request file operations? → file tools
5. Requires calculation/code? → python
6. System command requested? → shell
1. Is this arithmetic or math? → calculator (ALWAYS — never compute in your head)
2. Can I answer from training data? → Answer directly (NO TOOL)
3. Is this about past conversations? → memory_search
4. Is this current/real-time info? → web_search
5. Did user request file operations? → file tools
6. Requires code execution? → python
7. System command requested? → shell
MEMORY SEARCH TRIGGERS:
- "Have we discussed..."

View File

@@ -32,7 +32,7 @@ _TOOL_CALL_JSON = re.compile(
# Matches function-call-style text: memory_search(query="...") etc.
_FUNC_CALL_TEXT = re.compile(
r'\b(?:memory_search|web_search|shell|python|read_file|write_file|list_files)'
r'\b(?:memory_search|web_search|shell|python|read_file|write_file|list_files|calculator)'
r'\s*\([^)]*\)',
)

View File

@@ -26,6 +26,7 @@ Tools are assigned to personas based on their specialties:
from __future__ import annotations
import logging
import math
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
@@ -107,6 +108,33 @@ def get_tool_stats(agent_id: str | None = None) -> dict:
return all_stats
def calculator(expression: str) -> str:
    """Evaluate a mathematical expression and return the exact result.

    Use this tool for ANY arithmetic: multiplication, division, square roots,
    exponents, percentages, logarithms, trigonometry, etc.

    The expression is parsed and checked against a whitelist of syntax nodes
    before it is evaluated. Only numeric literals, arithmetic/comparison
    operators, tuples/lists, calls, the names listed below and public
    ``math.<name>`` attributes are accepted. Anything else (string literals,
    subscripts, lambdas, comprehensions, attribute access on arbitrary
    objects, ...) is rejected and reported as an error.

    Args:
        expression: A valid Python math expression, e.g. '347 * 829',
            'math.sqrt(17161)', '2**10', 'math.log(100, 10)'.

    Returns:
        The exact result as a string, or a message of the form
        "Error evaluating '<expression>': <reason>" if the expression is
        invalid, not permitted, or fails while being evaluated.
    """
    import ast  # Local import: only this function needs it.

    # Only expose math functions — no builtins, no file/os access
    allowed_names = {k: getattr(math, k) for k in dir(math) if not k.startswith("_")}
    allowed_names["math"] = math  # Support math.sqrt(), math.pi, etc.
    allowed_names.update({"abs": abs, "round": round, "min": min, "max": max})

    # Syntax permitted in an expression. Everything not listed is refused.
    allowed_nodes = (
        ast.Expression,
        ast.BinOp,
        ast.UnaryOp,
        ast.Compare,
        ast.BoolOp,
        ast.IfExp,
        ast.Call,
        ast.keyword,
        ast.Name,
        ast.Attribute,
        ast.Constant,
        ast.Tuple,
        ast.List,
        ast.operator,
        ast.unaryop,
        ast.cmpop,
        ast.boolop,
        ast.expr_context,
    )

    try:
        # eval() tolerates surrounding whitespace, ast.parse() does not.
        tree = ast.parse(expression.strip(), mode="eval")

        # SECURITY: an empty ``__builtins__`` alone is NOT a sandbox — e.g.
        # ``().__class__.__bases__[0].__subclasses__()`` needs no builtins.
        # Validate the whole tree before anything is executed.
        for node in ast.walk(tree):
            if not isinstance(node, allowed_nodes):
                raise ValueError(f"unsupported syntax: {type(node).__name__}")
            if isinstance(node, ast.Constant) and not isinstance(
                node.value, (int, float, complex)
            ):
                raise ValueError("only numeric literals are allowed")
            if isinstance(node, ast.Name) and node.id not in allowed_names:
                raise NameError(f"name '{node.id}' is not defined")
            if isinstance(node, ast.Attribute):
                on_math = isinstance(node.value, ast.Name) and node.value.id == "math"
                if not on_math or node.attr.startswith("_"):
                    raise ValueError(
                        "attribute access is only allowed on public names of 'math'"
                    )

        # NOTE(review): resource exhaustion (e.g. '9**9**9') is not guarded
        # against here — consider a timeout or operand limits if needed.
        code = compile(tree, "<calculator>", "eval")
        result = eval(code, {"__builtins__": {}}, allowed_names)  # noqa: S307
        return str(result)
    except Exception as e:
        # Tool boundary: report every failure to the caller as text.
        return f"Error evaluating '{expression}': {e}"
def create_research_tools(base_dir: str | Path | None = None):
"""Create tools for research personas (Echo).
@@ -280,13 +308,16 @@ def create_full_toolkit(base_dir: str | Path | None = None):
toolkit.register(file_tools.save_file, name="write_file")
toolkit.register(file_tools.list_files, name="list_files")
# Calculator — exact arithmetic (never let the LLM guess)
toolkit.register(calculator, name="calculator")
# Memory search - semantic recall
try:
from timmy.semantic_memory import memory_search
toolkit.register(memory_search, name="memory_search")
except Exception:
logger.debug("Memory search not available")
return toolkit
@@ -371,6 +402,11 @@ def get_all_available_tools() -> dict[str, dict]:
"description": "List files in a directory",
"available_in": ["echo", "seer", "forge", "quill", "mace", "helm", "timmy"],
},
"calculator": {
"name": "Calculator",
"description": "Evaluate mathematical expressions with exact results",
"available_in": ["timmy"],
},
}
# ── Git tools ─────────────────────────────────────────────────────────────

View File

@@ -233,11 +233,35 @@ def test_call_agent_falls_back_on_exception(engine):
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_notify_briefing_ready_logs(caplog):
"""notify_briefing_ready should log and call notifier.notify."""
from notifications.push import notify_briefing_ready, PushNotifier
async def test_notify_briefing_ready_skips_when_no_approvals(caplog):
"""notify_briefing_ready should NOT fire native notification with 0 approvals."""
from notifications.push import notify_briefing_ready
b = _make_briefing() # approval_items=[]
with patch("notifications.push.notifier") as mock_notifier:
await notify_briefing_ready(b)
mock_notifier.notify.assert_not_called()
@pytest.mark.asyncio
async def test_notify_briefing_ready_fires_when_approvals_exist():
"""notify_briefing_ready should fire when there are pending approval items."""
from notifications.push import notify_briefing_ready
from timmy.briefing import ApprovalItem
b = _make_briefing()
b.approval_items = [
ApprovalItem(
id="test-1",
title="Test approval",
description="A test item",
proposed_action="do something",
impact="low",
created_at=datetime.now(timezone.utc),
status="pending",
),
]
with patch("notifications.push.notifier") as mock_notifier:
await notify_briefing_ready(b)

85
tests/test_calculator.py Normal file
View File

@@ -0,0 +1,85 @@
"""Tests for the calculator tool."""
from timmy.tools import calculator
def test_basic_multiplication():
    """A three-digit product is returned as an integer string."""
    product = calculator("347 * 829")
    assert product == "287663"
def test_basic_addition():
    """Integer addition is returned as an integer string."""
    total = calculator("100 + 200")
    assert total == "300"
def test_basic_division():
    """True division yields a float-formatted string."""
    quotient = calculator("100 / 4")
    assert quotient == "25.0"
def test_integer_division():
    """Floor division yields an integer string."""
    floored = calculator("100 // 3")
    assert floored == "33"
def test_exponentiation():
    """The ** operator is supported."""
    power = calculator("2 ** 10")
    assert power == "1024"
def test_sqrt():
    """math.sqrt of a perfect square is returned as a float string."""
    root = calculator("math.sqrt(17161)")
    assert root == "131.0"
def test_sqrt_non_perfect():
    """math.sqrt(2) parses back to a float within 1e-10 of the expected value."""
    expected = 1.4142135623730951
    delta = float(calculator("math.sqrt(2)")) - expected
    assert abs(delta) < 1e-10
def test_log_base_10():
    """math.log10(1000) evaluates to 3 within a 1e-10 tolerance."""
    expected = 3.0
    delta = float(calculator("math.log10(1000)")) - expected
    assert abs(delta) < 1e-10
def test_log_natural():
    """The natural log of math.e evaluates to 1 within a 1e-10 tolerance."""
    expected = 1.0
    delta = float(calculator("math.log(math.e)")) - expected
    assert abs(delta) < 1e-10
def test_trig_sin():
    """sin(pi / 2) evaluates to 1 within a 1e-10 tolerance."""
    expected = 1.0
    delta = float(calculator("math.sin(math.pi / 2)")) - expected
    assert abs(delta) < 1e-10
def test_abs_builtin():
    """The abs builtin is exposed to expressions."""
    magnitude = calculator("abs(-42)")
    assert magnitude == "42"
def test_round_builtin():
    """The round builtin is exposed, including its digits argument."""
    rounded = calculator("round(3.14159, 2)")
    assert rounded == "3.14"
def test_min_max_builtins():
    """The min and max builtins are exposed to expressions."""
    cases = (
        ("min(3, 7, 1)", "1"),
        ("max(3, 7, 1)", "7"),
    )
    for expr, expected in cases:
        assert calculator(expr) == expected
def test_complex_expression():
    """Parenthesised sub-expressions combine; the float operand makes the result a float."""
    combined = calculator("(347 * 829) + (100 / 4)")
    assert combined == "287688.0"
def test_invalid_expression_returns_error():
    """Text that is not a valid expression yields an error string, not an exception."""
    outcome = calculator("not a valid expression")
    assert outcome.startswith("Error evaluating")
def test_no_builtins_access():
    """__import__ is not reachable from an expression; the call reports an error."""
    attack = "__import__('os').system('echo pwned')"
    outcome = calculator(attack)
    assert outcome.startswith("Error evaluating")
def test_no_open_access():
    """open() is not reachable from an expression; the call reports an error."""
    attack = "open('/etc/passwd').read()"
    outcome = calculator(attack)
    assert outcome.startswith("Error evaluating")
def test_division_by_zero():
    """Dividing by zero is reported as an error string rather than raised."""
    outcome = calculator("1 / 0")
    assert outcome.startswith("Error evaluating")