Merge pull request #48 from AlexanderWhitestone/fix/timmy-startup-and-stability

fix: Timmy QA bugs — calculator, markdown, prompt guardrails, briefing
2026-02-26 09:44:33 -05:00
parent e6a7db7d80 6e6b4355bb
commit 4e78f7102e
8 changed files with 190 additions and 17 deletions
--- a/src/dashboard/templates/base.html
+++ b/src/dashboard/templates/base.html
@@ -14,6 +14,8 @@
  <link rel="stylesheet" href="/static/style.css?v=4" />
  {% block extra_styles %}{% endblock %}
  <script src="https://unpkg.com/htmx.org@2.0.3" integrity="sha384-0895/pl2MU10Hqc6jd4RvrthNlDiE9U1tWmX7WRESftEDRosgxNsQG/Ze9YMRzHq" crossorigin="anonymous"></script>
+  <script src="https://cdn.jsdelivr.net/npm/marked@15.0.7/marked.min.js"></script>
+  <script src="https://cdn.jsdelivr.net/npm/dompurify@3.2.4/dist/purify.min.js"></script>
 </head>
 <body>
  <header class="mc-header">
--- a/src/dashboard/templates/partials/chat_message.html
+++ b/src/dashboard/templates/partials/chat_message.html
@@ -5,8 +5,16 @@
 {% if response %}
 <div class="chat-message agent">
  <div class="msg-meta">TIMMY // {{ timestamp }}</div>
-  <div class="msg-body">{{ response | e }}</div>
+  <div class="msg-body timmy-md">{{ response | e }}</div>
 </div>
+<script>
+  (function() {
+    var el = document.currentScript.previousElementSibling.querySelector('.timmy-md');
+    if (el && typeof marked !== 'undefined' && typeof DOMPurify !== 'undefined') {
+      el.innerHTML = DOMPurify.sanitize(marked.parse(el.textContent));
+    }
+  })();
+</script>
 {% elif error %}
 <div class="chat-message error-msg">
  <div class="msg-meta">SYSTEM // {{ timestamp }}</div>
--- a/src/notifications/push.py
+++ b/src/notifications/push.py
@@ -124,14 +124,21 @@ notifier = PushNotifier()


 async def notify_briefing_ready(briefing) -> None:
-    """Placeholder: notify the owner that a new morning briefing is ready.
+    """Notify the owner that a new morning briefing is ready.

-    Logs to console now.  Wire to real push (APNs/Pushover) later.
+    Only triggers a native macOS popup when there are pending approval items.
+    Briefings with 0 approvals are still logged but don't interrupt the user
+    with a notification that leads to an empty-looking page.

    Args:
        briefing: A timmy.briefing.Briefing instance.
    """
    n_approvals = len(briefing.approval_items) if briefing.approval_items else 0
+
+    if n_approvals == 0:
+        logger.info("Briefing ready but no pending approvals — skipping native notification")
+        return
+
    message = (
        f"Your morning briefing is ready. "
        f"{n_approvals} item(s) await your approval."
--- a/src/timmy/prompts.py
+++ b/src/timmy/prompts.py
@@ -21,7 +21,12 @@ Rules:
 - Remember what the user tells you during our conversation.
 - If you don't know something, say so honestly.
 - Use the user's name if you know it.
- Do simple math in your head. Don't reach for tools.
+- When you state a fact, commit to it. Never contradict a correct statement you
+  just made in the same response. If uncertain, express uncertainty at the start —
+  never state something confidently and then immediately undermine it.
+- NEVER attempt arithmetic in your head — LLMs are unreliable at multi-digit math.
+  If asked to compute anything (multiply, divide, square root, exponents, etc.),
+  tell the user you need a calculator tool to give an exact answer.

 Sir, affirmative."""

@@ -57,15 +62,17 @@ user's digital sovereignty.
 ### When NOT to use tools:
 - Identity questions → Answer directly
 - General knowledge → Answer from training
- Simple math → Calculate mentally
 - Greetings → Respond conversationally

 ### When TO use tools:

+- **calculator** — ANY arithmetic: multiplication, division, square roots, exponents,
+  percentages, logarithms, etc. NEVER attempt math in your head — always call this tool.
+  Example: calculator("347 * 829") or calculator("math.sqrt(17161)")
 - **web_search** — Current events, real-time data, news
 - **read_file** — User explicitly requests file reading
 - **write_file** — User explicitly requests saving content
- **python** — Complex calculations, code execution
+- **python** — Code execution, data processing (NOT for simple arithmetic — use calculator)
 - **shell** — System operations (explicit user request)
 - **memory_search** — "Have we talked about this before?", finding past context

@@ -74,6 +81,9 @@ user's digital sovereignty.
 - Never narrate your reasoning process. Just give the answer.
 - Never show raw tool call JSON or function syntax in responses.
 - Use the user's name if known.
+- When you state a fact, commit to it. Never contradict a correct statement you
+  just made in the same response. If uncertain, express uncertainty at the start —
+  never state something confidently and then immediately undermine it.

 Sir, affirmative."""

@@ -101,12 +111,13 @@ you are operational and running locally."""
 TOOL_USAGE_GUIDE = """
 DECISION ORDER:

-1. Can I answer from training data? → Answer directly (NO TOOL)
-2. Is this about past conversations? → memory_search
-3. Is this current/real-time info? → web_search
-4. Did user request file operations? → file tools
-5. Requires calculation/code? → python
-6. System command requested? → shell
+1. Is this arithmetic or math? → calculator (ALWAYS — never compute in your head)
+2. Can I answer from training data? → Answer directly (NO TOOL)
+3. Is this about past conversations? → memory_search
+4. Is this current/real-time info? → web_search
+5. Did user request file operations? → file tools
+6. Requires code execution? → python
+7. System command requested? → shell

 MEMORY SEARCH TRIGGERS:
 - "Have we discussed..."
--- a/src/timmy/session.py
+++ b/src/timmy/session.py
@@ -32,7 +32,7 @@ _TOOL_CALL_JSON = re.compile(

 # Matches function-call-style text: memory_search(query="...") etc.
 _FUNC_CALL_TEXT = re.compile(
-    r'\b(?:memory_search|web_search|shell|python|read_file|write_file|list_files)'
+    r'\b(?:memory_search|web_search|shell|python|read_file|write_file|list_files|calculator)'
    r'\s*\([^)]*\)',
 )

--- a/src/timmy/tools.py
+++ b/src/timmy/tools.py
@@ -26,6 +26,7 @@ Tools are assigned to personas based on their specialties:
 from __future__ import annotations

 import logging
+import math
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
 from pathlib import Path
@@ -107,6 +108,33 @@ def get_tool_stats(agent_id: str | None = None) -> dict:
    return all_stats


+def calculator(expression: str) -> str:
+    """Evaluate a mathematical expression and return the exact result.
+
+    Use this tool for ANY arithmetic: multiplication, division, square roots,
+    exponents, percentages, logarithms, trigonometry, etc.
+
+    Args:
+        expression: A valid Python math expression, e.g. '347 * 829',
+                    'math.sqrt(17161)', '2**10', 'math.log(100, 10)'.
+
+    Returns:
+        The result as a string, or an "Error evaluating ..." message if evaluation fails.
+    """
+    # Restrict names to math helpers and blank out builtins (best-effort only: eval() is not a true sandbox)
+    allowed_names = {k: getattr(math, k) for k in dir(math) if not k.startswith("_")}
+    allowed_names["math"] = math  # Support math.sqrt(), math.pi, etc.
+    allowed_names["abs"] = abs
+    allowed_names["round"] = round
+    allowed_names["min"] = min
+    allowed_names["max"] = max
+    try:
+        result = eval(expression, {"__builtins__": {}}, allowed_names)  # noqa: S307
+        return str(result)
+    except Exception as e:
+        return f"Error evaluating '{expression}': {e}"
+
+
 def create_research_tools(base_dir: str | Path | None = None):
    """Create tools for research personas (Echo).
    
@@ -280,13 +308,16 @@ def create_full_toolkit(base_dir: str | Path | None = None):
    toolkit.register(file_tools.save_file, name="write_file")
    toolkit.register(file_tools.list_files, name="list_files")
    
+    # Calculator — exact arithmetic (never let the LLM guess)
+    toolkit.register(calculator, name="calculator")
+
    # Memory search - semantic recall
    try:
        from timmy.semantic_memory import memory_search
        toolkit.register(memory_search, name="memory_search")
    except Exception:
        logger.debug("Memory search not available")
-    
+
    return toolkit


@@ -371,6 +402,11 @@ def get_all_available_tools() -> dict[str, dict]:
            "description": "List files in a directory",
            "available_in": ["echo", "seer", "forge", "quill", "mace", "helm", "timmy"],
        },
+        "calculator": {
+            "name": "Calculator",
+            "description": "Evaluate mathematical expressions with exact results",
+            "available_in": ["timmy"],
+        },
    }

    # ── Git tools ─────────────────────────────────────────────────────────────
--- a/tests/test_briefing.py
+++ b/tests/test_briefing.py
@@ -233,11 +233,35 @@ def test_call_agent_falls_back_on_exception(engine):
 # ---------------------------------------------------------------------------

@pytest.mark.asyncio
-async def test_notify_briefing_ready_logs(caplog):
-    """notify_briefing_ready should log and call notifier.notify."""
-    from notifications.push import notify_briefing_ready, PushNotifier
+async def test_notify_briefing_ready_skips_when_no_approvals(caplog):
+    """notify_briefing_ready should NOT fire native notification with 0 approvals."""
+    from notifications.push import notify_briefing_ready
+
+    b = _make_briefing()  # approval_items=[]
+
+    with patch("notifications.push.notifier") as mock_notifier:
+        await notify_briefing_ready(b)
+        mock_notifier.notify.assert_not_called()
+
+
+@pytest.mark.asyncio
+async def test_notify_briefing_ready_fires_when_approvals_exist():
+    """notify_briefing_ready should fire when there are pending approval items."""
+    from notifications.push import notify_briefing_ready
+    from timmy.briefing import ApprovalItem

    b = _make_briefing()
+    b.approval_items = [
+        ApprovalItem(
+            id="test-1",
+            title="Test approval",
+            description="A test item",
+            proposed_action="do something",
+            impact="low",
+            created_at=datetime.now(timezone.utc),
+            status="pending",
+        ),
+    ]

    with patch("notifications.push.notifier") as mock_notifier:
        await notify_briefing_ready(b)
--- a/tests/test_calculator.py
+++ b/tests/test_calculator.py
@@ -0,0 +1,85 @@
+"""Tests for the calculator tool."""
+
+from timmy.tools import calculator
+
+
+def test_basic_multiplication():
+    assert calculator("347 * 829") == "287663"
+
+
+def test_basic_addition():
+    assert calculator("100 + 200") == "300"
+
+
+def test_basic_division():
+    assert calculator("100 / 4") == "25.0"
+
+
+def test_integer_division():
+    assert calculator("100 // 3") == "33"
+
+
+def test_exponentiation():
+    assert calculator("2 ** 10") == "1024"
+
+
+def test_sqrt():
+    assert calculator("math.sqrt(17161)") == "131.0"
+
+
+def test_sqrt_non_perfect():
+    result = float(calculator("math.sqrt(2)"))
+    assert abs(result - 1.4142135623730951) < 1e-10
+
+
+def test_log_base_10():
+    result = float(calculator("math.log10(1000)"))
+    assert abs(result - 3.0) < 1e-10
+
+
+def test_log_natural():
+    result = float(calculator("math.log(math.e)"))
+    assert abs(result - 1.0) < 1e-10
+
+
+def test_trig_sin():
+    result = float(calculator("math.sin(math.pi / 2)"))
+    assert abs(result - 1.0) < 1e-10
+
+
+def test_abs_builtin():
+    assert calculator("abs(-42)") == "42"
+
+
+def test_round_builtin():
+    assert calculator("round(3.14159, 2)") == "3.14"
+
+
+def test_min_max_builtins():
+    assert calculator("min(3, 7, 1)") == "1"
+    assert calculator("max(3, 7, 1)") == "7"
+
+
+def test_complex_expression():
+    assert calculator("(347 * 829) + (100 / 4)") == "287688.0"
+
+
+def test_invalid_expression_returns_error():
+    result = calculator("not a valid expression")
+    assert result.startswith("Error evaluating")
+
+
+def test_no_builtins_access():
+    """Ensure dangerous builtins like __import__ are blocked."""
+    result = calculator("__import__('os').system('echo pwned')")
+    assert result.startswith("Error evaluating")
+
+
+def test_no_open_access():
+    result = calculator("open('/etc/passwd').read()")
+    assert result.startswith("Error evaluating")
+
+
+def test_division_by_zero():
+    result = calculator("1 / 0")
+    assert result.startswith("Error evaluating")