docs: gap analysis implementation status tracker (#658)

Resolves #658. Maps the gap analysis findings to current implementation status across all 6 categories. Tracks: memory/search, multi-agent, inference, orchestration, safety/crisis, accuracy measurement. Shows which gaps are filled (by PR number), which are in PR, and which remain future work.
2026-04-15 11:31:25 -04:00
4 changed files with 92 additions and 431 deletions
--- a/agent/context_faithful.py
+++ b/agent/context_faithful.py
@@ -1,293 +0,0 @@
 """Context-Faithful Prompting — Make LLMs Use Retrieved Context.
 Addresses the R@5 vs E2E accuracy gap by prompting the LLM to actually
 use the retrieved context instead of relying on parametric knowledge.
 Research: Context-faithful prompting achieves +5-15 E2E accuracy gains.
 Key patterns:
 1. Context-before-question structure (attention bias)
 2. Explicit "use the context" instruction
 3. Citation requirement (which passage used)
 4. Confidence calibration
 5. "I don't know" escape hatch
 Usage:
    from agent.context_faithful import build_context_faithful_prompt
    prompt = build_context_faithful_prompt(passages, query)
 """
 from __future__ import annotations
 import os
 from typing import Any, Dict, List, Optional
 # Configuration
 # All settings are read from the environment once, when this module is imported.
 # Boolean flags default to enabled; only the values "false", "0" or "no"
 # (compared case-insensitively) turn a flag off.
 # Master switch: when off, build_context_faithful_prompt() uses _fallback_prompt().
 CFAITHFUL_ENABLED = os.getenv("CFAITHFUL_ENABLED", "true").lower() not in ("false", "0", "no")
 # Default for whether CITATION_INSTRUCTION is added to the system prompt.
 CFAITHFUL_REQUIRE_CITATION = os.getenv("CFAITHFUL_REQUIRE_CITATION", "true").lower() not in ("false", "0", "no")
 # Default for whether CONFIDENCE_INSTRUCTION is added to the system prompt.
 CFAITHFUL_CONFIDENCE = os.getenv("CFAITHFUL_CONFIDENCE", "true").lower() not in ("false", "0", "no")
 # Character budget for passage content; int() raises ValueError at import time
 # if the variable is set to a non-integer value.
 CFAITHFUL_MAX_CONTEXT_CHARS = int(os.getenv("CFAITHFUL_MAX_CONTEXT_CHARS", "8000"))
 # ---------------------------------------------------------------------------
 # Prompt Templates
 # ---------------------------------------------------------------------------
 # Core instruction: forces the LLM to ground in context and gives it an
 # explicit "I don't know" answer to use when the context is insufficient.
 CONTEXT_FAITHFUL_INSTRUCTION = (
    "You must answer based ONLY on the provided context below. "
    "Do not use any prior knowledge or make assumptions beyond what is stated in the context. "
    "If the context does not contain enough information to answer the question, "
    "you MUST say: \"I don't know based on the provided context.\" "
    "Do not guess. Do not fill in gaps with your training data."
 )
 # Citation instruction: forces the LLM to cite which passage it used.
 # The "[Passage N]" form matches the headers emitted by _format_passages().
 CITATION_INSTRUCTION = (
    "For each claim in your answer, cite the specific passage number "
    "(e.g., [Passage 1], [Passage 3]) that supports it. "
    "If you cannot cite a passage for a claim, do not include that claim."
 )
 # Confidence instruction: calibrates the LLM's certainty on a 1-5 scale and
 # fixes the output format ("Confidence: N/5").
 CONFIDENCE_INSTRUCTION = (
    "After your answer, rate your confidence on a scale of 1-5:\n"
    "1 = The context barely addresses the question\n"
    "2 = Some relevant information but incomplete\n"
    "3 = The context provides a partial answer\n"
    "4 = The context provides a clear answer with minor gaps\n"
    "5 = The context fully answers the question\n"
    "Format: Confidence: N/5"
 )
 def build_context_faithful_prompt(
    passages: List[Dict[str, Any]],
    query: str,
    require_citation: Optional[bool] = None,
    include_confidence: Optional[bool] = None,
    max_context_chars: int = CFAITHFUL_MAX_CONTEXT_CHARS,
 ) -> Dict[str, str]:
    """Assemble the system/user prompt pair that grounds the LLM in retrieved passages.

    The user prompt places the numbered context block ahead of the question.
    The system prompt always carries the core grounding instruction and
    optionally the citation and confidence instructions.

    Args:
        passages: Passage dicts; the text is taken from 'content', 'text',
            'snippet' or 'summary' (see _format_passages).
        query: The user's question.
        require_citation: True/False to force the citation instruction on or
            off; None defers to CFAITHFUL_REQUIRE_CITATION.
        include_confidence: True/False to force the confidence instruction on
            or off; None defers to CFAITHFUL_CONFIDENCE.
        max_context_chars: Character budget handed to _format_passages.

    Returns:
        Dict with 'system' and 'user' prompt strings. When CFAITHFUL_ENABLED
        is off, the plain prompt from _fallback_prompt is returned instead.
    """
    if not CFAITHFUL_ENABLED:
        return _fallback_prompt(passages, query)

    # None means "no override": fall back to the module-level defaults.
    cite = CFAITHFUL_REQUIRE_CITATION if require_citation is None else require_citation
    rate = CFAITHFUL_CONFIDENCE if include_confidence is None else include_confidence

    # Numbered passages so the model has something concrete to cite.
    numbered_context = _format_passages(passages, max_context_chars)

    instructions = [CONTEXT_FAITHFUL_INSTRUCTION]
    if cite:
        instructions.append(CITATION_INSTRUCTION)
    if rate:
        instructions.append(CONFIDENCE_INSTRUCTION)

    # Context deliberately precedes the question in the user turn.
    user_sections = [
        f"CONTEXT:\n{numbered_context}",
        "---",
        f"QUESTION: {query}",
        "Answer the question using ONLY the context above.",
    ]
    return {
        "system": "\n\n".join(instructions),
        "user": "\n\n".join(user_sections),
    }
 def _format_passages(
    passages: List[Dict[str, Any]],
    max_chars: int,
 ) -> str:
    """Format passages with numbering for citation reference."""
    lines = []
    total_chars = 0
    for idx, passage in enumerate(passages, 1):
        content = (
            passage.get("content")
            or passage.get("text")
            or passage.get("snippet")
            or passage.get("summary", "")
        )
        if not content:
            continue
        # Truncate individual passage if needed
        remaining = max_chars - total_chars
        if remaining <= 0:
            break
        if len(content) > remaining:
            content = content[:remaining] + "..."
        source = passage.get("session_id") or passage.get("source", "")
        header = f"[Passage {idx}"
        if source:
            header += f" — {source}"
        header += "]"
        lines.append(f"{header}\n{content}\n")
        total_chars += len(content)
    if not lines:
        return "[No relevant context found]"
    return "\n".join(lines)
 def _fallback_prompt(
    passages: List[Dict[str, Any]],
    query: str,
 ) -> Dict[str, str]:
    """Build the plain prompt pair used when context-faithful prompting is disabled.

    The passages are still numbered via _format_passages (with the default
    character budget), but no grounding, citation or confidence instructions
    are added.
    """
    plain_context = _format_passages(passages, CFAITHFUL_MAX_CONTEXT_CHARS)
    system_text = "Answer the user's question based on the provided context."
    user_text = f"Context:\n{plain_context}\n\nQuestion: {query}"
    return {"system": system_text, "user": user_text}
 # ---------------------------------------------------------------------------
 # Summarization Integration
 # ---------------------------------------------------------------------------
 def build_summarization_prompt(
    conversation_text: str,
    query: str,
    session_meta: Dict[str, Any],
 ) -> Dict[str, str]:
    """Build a grounded summarization prompt pair for one session transcript.

    Intended as a replacement for the prompt built inside _summarize_session
    in session_search_tool.py.

    Args:
        conversation_text: The transcript to summarize; inserted verbatim.
        query: The search topic the summary should focus on.
        session_meta: Session metadata; 'source' and 'started_at' are read,
            each defaulting to "unknown" when absent.

    Returns:
        Dict with 'system' and 'user' prompt strings.
    """
    origin = session_meta.get("source", "unknown")
    began = session_meta.get("started_at", "unknown")

    checklist = "\n".join([
        "1. What the user asked about or wanted to accomplish",
        "2. What actions were taken and what the outcomes were",
        "3. Key decisions, solutions found, or conclusions reached",
        "4. Specific commands, files, URLs, or technical details",
        "5. Anything left unresolved",
    ])
    system = (
        f"You are reviewing a past conversation transcript. {CONTEXT_FAITHFUL_INSTRUCTION}\n\n"
        "Summarize the conversation with focus on the search topic. Include:\n"
        f"{checklist}\n\n"
        "Cite specific parts of the transcript (e.g., 'In the conversation, the user...'). "
        "If the transcript doesn't contain information relevant to the search topic, "
        "say so explicitly rather than inventing details."
    )

    # Transcript first, then the topic and metadata, then the task.
    user_lines = [
        f"CONTEXT (conversation transcript):\n{conversation_text}",
        "",
        "---",
        "",
        f"SEARCH TOPIC: {query}",
        f"Session source: {origin}",
        f"Session date: {began}",
        "",
        f"Summarize this conversation with focus on: {query}",
    ]
    return {"system": system, "user": "\n".join(user_lines)}
 # ---------------------------------------------------------------------------
 # Answer Generation
 # ---------------------------------------------------------------------------
 def build_answer_prompt(
    passages: List[Dict[str, Any]],
    query: str,
    conversation_context: Optional[str] = None,
 ) -> Dict[str, str]:
    """Build a grounded prompt pair for direct question answering.

    Unlike build_context_faithful_prompt, the citation and confidence
    instructions are always included and the CFAITHFUL_ENABLED switch is not
    consulted.

    Args:
        passages: Passage dicts rendered via _format_passages with the
            default character budget.
        query: The user's question.
        conversation_context: Optional recent conversation text; only its
            first 2000 characters are included.

    Returns:
        Dict with 'system' and 'user' prompt strings.
    """
    numbered_context = _format_passages(passages, CFAITHFUL_MAX_CONTEXT_CHARS)

    sections = [f"CONTEXT:\n{numbered_context}"]
    if conversation_context:
        sections.append(f"RECENT CONVERSATION:\n{conversation_context[:2000]}")
    sections.append(f"---\n\nQUESTION: {query}")
    sections.append("\nAnswer based ONLY on the context above.")

    system = (
        f"{CONTEXT_FAITHFUL_INSTRUCTION}\n\n"
        f"{CITATION_INSTRUCTION}\n\n"
        f"{CONFIDENCE_INSTRUCTION}"
    )
    return {"system": system, "user": "\n\n".join(sections)}
 # ---------------------------------------------------------------------------
 # Quality Metrics
 # ---------------------------------------------------------------------------
 def assess_context_faithfulness(
    answer: str,
    passages: List[Dict[str, Any]],
 ) -> Dict[str, Any]:
    """Heuristically assess how faithfully an answer uses the provided context.

    No LLM call is made. Three signals are combined:
    - "I don't know" detection: an answer that admits the context is
      insufficient is treated as faithful.
    - Citation count: number of ``[Passage N]`` references in the answer.
    - Grounding ratio: fraction of the answer's distinct whitespace-separated
      tokens (lower-cased) that also occur in the passages' text.

    Args:
        answer: The model's answer text.
        passages: Passage dicts. Text is taken from 'content', 'text',
            'snippet' or 'summary', the same fallback chain that
            _format_passages uses when building the prompt. ``None`` is
            treated as no passages.

    Returns:
        - Empty answer: ``{"faithful": False, "reason": "empty_answer"}``.
        - Honest unknown: ``{"faithful": True, "reason": "honest_unknown",
          "citations": 0}``.
        - Otherwise: a dict with 'faithful', 'citations', 'grounding_ratio'
          (rounded to 3 places) and 'reason'. 'reason' reflects the grounding
          ratio only, so a cited answer can be faithful with reason
          "weak_grounding".
    """
    import re

    if not answer:
        return {"faithful": False, "reason": "empty_answer"}

    answer_lower = answer.lower()

    # Check for the "I don't know" escape hatch. Models often emit a
    # typographic apostrophe (U+2019), so normalise it before matching.
    escape_probe = answer_lower.replace("\u2019", "'")
    if "don't know" in escape_probe or "does not contain" in escape_probe:
        return {"faithful": True, "reason": "honest_unknown", "citations": 0}

    # Count citations (case-sensitive, matching the prompt's header format).
    citation_count = len(re.findall(r"\[Passage \d+\]", answer))

    # Grounding ratio: how many answer words appear in the context. The
    # 'summary' fallback keeps this in step with what _format_passages shows
    # the model; without it summary-only passages would always score 0.
    context_text = " ".join(
        (
            p.get("content")
            or p.get("text")
            or p.get("snippet")
            or p.get("summary")
            or ""
        ).lower()
        for p in (passages or ())
    )
    answer_words = set(answer_lower.split())
    context_words = set(context_text.split())
    overlap = len(answer_words & context_words)
    grounding_ratio = overlap / len(answer_words) if answer_words else 0

    well_grounded = grounding_ratio > 0.3
    return {
        "faithful": well_grounded or citation_count > 0,
        "citations": citation_count,
        "grounding_ratio": round(grounding_ratio, 3),
        "reason": "grounded" if well_grounded else "weak_grounding",
    }
--- a/docs/gap-analysis-status.md
+++ b/docs/gap-analysis-status.md
@@ -0,0 +1,70 @@
 # Gap Analysis: Actual System vs SOTA — Implementation Status Tracker
 Tracks issue #658: maps each finding of the gap analysis to its current implementation status.
 ## Gap Categories
 ### 1. Memory & Search
 | Gap | Target | Status | PR |
 |-----|--------|--------|-----|
 | Semantic search (R@5) | 95-99% | RIDER: +25% E2E | #782 |
 | Hybrid search | Vector + FTS5 + HRR | Hybrid search module | #729 |
 | Context-faithful prompting | +11-14% E2E | Context-faithful module | #786 |
 | Accuracy benchmarks | Measured | benchmark_r5_e2e.py | #790 |
 | Vector embeddings | ChromaDB | Not yet (Qdrant fallback) | Future |
 ### 2. Multi-Agent Coordination
 | Gap | Target | Status | PR |
 |-----|--------|--------|-----|
 | Three-tier memory | Unified | Fragmented (pieces exist) | #653 |
 | DAG task routing | GraphFlow-style | Not implemented | Future |
 | Fleet diary | Structured logs | Not implemented | Future |
 ### 3. Inference Optimization
 | Gap | Target | Status | PR |
 |-----|--------|--------|-----|
 | Cost tracking | $/1M tokens | task_cost_breakdown.py | fleet-ops#267 |
 | Fallback chain | Explicit | Provider routing exists | Existing |
 | vLLM + FP8 | 60% cost reduction | Not yet | Future |
 ### 4. Workflow Orchestration
 | Gap | Target | Status | PR |
 |-----|--------|--------|-----|
 | Retry with backoff | Built-in | Partial (cron retry) | Existing |
 | Task dependencies | Pipeline chaining | Not implemented | Future |
 | Concurrency control | Worker pool | File lock (single) | Existing |
 ### 5. Safety & Crisis
 | Gap | Target | Status | PR |
 |-----|--------|--------|-----|
 | Crisis detection | F1>0.85 | Crisis protocol + SHIELD | #785 |
 | Human confirmation | Tier system | Approval tiers | #697 |
 | 988 Lifeline | Auto-display | Crisis resources | #783 |
 | Emotional presence | Patterns | Research doc | #788 |
 | SOUL.md protocol | Implemented | Crisis protocol | #785 |
 ### 6. Accuracy Measurement
 | Gap | Target | Status | PR |
 |-----|--------|--------|-----|
 | R@5 measurement | Automated | benchmark_r5_e2e.py | #790 |
 | E2E accuracy | Measured | benchmark_r5_e2e.py | #790 |
 | Gap analysis | Documented | r5-vs-e2e-gap-analysis.md | #790 |
 ## Implementation Priority
 1. **DONE:** Crisis support (SOUL.md, 988, detection)
 2. **DONE:** Safety (approval tiers, SHIELD)
 3. **DONE:** Retrieval improvement (RIDER, hybrid search, context-faithful)
 4. **DONE:** Accuracy measurement (benchmark script)
 5. **IN PR:** Cost tracking (task_cost_breakdown.py)
 6. **FUTURE:** DAG routing, pub-sub messaging, vLLM deployment
 ## Key Insight
 The biggest gap was MEASUREMENT: we had no way of knowing whether our systems actually worked. Issue #657 (accuracy measurement) addressed this first, followed by the retrieval improvements that narrow the gap between R@5 and E2E accuracy.
--- a/tests/test_context_faithful_prompting.py
+++ b/tests/test_context_faithful_prompting.py
@@ -1,133 +0,0 @@
 """Tests for Context-Faithful Prompting — issue #667."""
 import pytest
 from agent.context_faithful import (
    build_context_faithful_prompt,
    build_summarization_prompt,
    build_answer_prompt,
    assess_context_faithfulness,
    CONTEXT_FAITHFUL_INSTRUCTION,
    CITATION_INSTRUCTION,
    CONFIDENCE_INSTRUCTION,
 )
 class TestBuildContextFaithfulPrompt:
    """Tests for build_context_faithful_prompt()."""

    def test_returns_system_and_user(self):
        """The result exposes both prompt halves."""
        passages = [{"content": "Paris is the capital of France.", "session_id": "s1"}]
        result = build_context_faithful_prompt(passages, "What is the capital of France?")
        assert "system" in result
        assert "user" in result

    def test_system_has_use_context_instruction(self):
        """The system prompt tells the model to rely on the context."""
        passages = [{"content": "test content", "session_id": "s1"}]
        result = build_context_faithful_prompt(passages, "test query")
        assert "context" in result["system"].lower()

    def test_system_has_dont_know_escape(self):
        """The system prompt offers the "I don't know" escape hatch."""
        passages = [{"content": "test", "session_id": "s1"}]
        result = build_context_faithful_prompt(passages, "q")
        assert "don't know" in result["system"].lower()

    def test_user_has_context_before_question(self):
        """Context must precede the question in the user prompt."""
        passages = [{"content": "Test content here.", "session_id": "s1"}]
        result = build_context_faithful_prompt(passages, "What is this?")
        context_pos = result["user"].find("CONTEXT")
        question_pos = result["user"].find("QUESTION")
        # find() returns -1 when missing; require both markers to be present
        # so a missing CONTEXT marker cannot satisfy the ordering check.
        assert context_pos != -1
        assert question_pos != -1
        assert context_pos < question_pos

    def test_passages_are_numbered(self):
        """Each passage gets a 1-based number for citation."""
        passages = [
            {"content": "First passage.", "session_id": "s1"},
            {"content": "Second passage.", "session_id": "s2"},
        ]
        result = build_context_faithful_prompt(passages, "q")
        assert "Passage 1" in result["user"]
        assert "Passage 2" in result["user"]

    def test_citation_instruction_included_by_default(self):
        """Citation instruction is present unless disabled."""
        passages = [{"content": "test", "session_id": "s1"}]
        result = build_context_faithful_prompt(passages, "q")
        assert "cite" in result["system"].lower() or "[Passage" in result["system"]

    def test_confidence_calibration_included_by_default(self):
        """Confidence instruction is present unless disabled."""
        passages = [{"content": "test", "session_id": "s1"}]
        result = build_context_faithful_prompt(passages, "q")
        assert "confidence" in result["system"].lower() or "1-5" in result["system"]

    def test_can_disable_citation(self):
        """require_citation=False removes the citation instruction."""
        passages = [{"content": "test", "session_id": "s1"}]
        result = build_context_faithful_prompt(passages, "q", require_citation=False)
        # The previous `A or B` form could never fail: the instruction text
        # never contains the word "citation", so B was always true.
        assert CITATION_INSTRUCTION not in result["system"]
        assert "cite" not in result["system"].lower()

    def test_empty_passages_handled(self):
        """An empty passage list still yields a complete prompt pair."""
        result = build_context_faithful_prompt([], "test query")
        assert "system" in result
        assert "user" in result
 class TestBuildSummarizationPrompt:
    """Tests for build_summarization_prompt()."""

    def test_includes_transcript(self):
        """Transcript text and search topic both reach the user prompt."""
        transcript = "User: Hello\nAssistant: Hi"
        meta = {"source": "cli", "started_at": "2024-01-01"}
        user_prompt = build_summarization_prompt(transcript, "greeting", meta)["user"]
        for expected in ("Hello", "greeting"):
            assert expected in user_prompt

    def test_has_context_faithful_instruction(self):
        """The system prompt refers to the context even with empty metadata."""
        system_prompt = build_summarization_prompt("text", "q", {})["system"]
        assert "context" in system_prompt.lower()
 class TestBuildAnswerPrompt:
    """Tests for build_answer_prompt()."""

    def test_returns_prompts(self):
        """Both prompt halves exist and the passage text reaches the user prompt."""
        prompts = build_answer_prompt(
            [{"content": "Answer is 42.", "session_id": "s1"}],
            "What is the answer?",
        )
        for key in ("system", "user"):
            assert key in prompts
        assert "42" in prompts["user"]

    def test_includes_conversation_context(self):
        """Recent conversation text is embedded in the user prompt."""
        prompts = build_answer_prompt(
            [{"content": "info", "session_id": "s1"}],
            "q",
            conversation_context="Previous message",
        )
        assert "Previous message" in prompts["user"]
 class TestAssessContextFaithfulness:
    """Tests for the assess_context_faithfulness() heuristic."""

    def test_empty_answer_not_faithful(self):
        """An empty answer is never faithful."""
        result = assess_context_faithfulness("", [])
        assert result["faithful"] is False

    def test_honest_unknown_is_faithful(self):
        """Admitting the context is insufficient counts as faithful."""
        result = assess_context_faithfulness(
            "I don't know based on the provided context.",
            [{"content": "unrelated", "session_id": "s1"}],
        )
        assert result["faithful"] is True

    def test_cited_answer_is_faithful(self):
        """A [Passage N] citation makes the answer faithful."""
        result = assess_context_faithfulness(
            "The capital is Paris [Passage 1].",
            [{"content": "Paris is the capital.", "session_id": "s1"}],
        )
        assert result["faithful"] is True
        assert result["citations"] >= 1

    def test_grounded_answer_is_faithful(self):
        """High token overlap with the context is faithful without citations."""
        result = assess_context_faithfulness(
            "The system uses SQLite for storage with FTS5 indexing.",
            [{"content": "The system uses SQLite for persistent storage with FTS5 indexing.", "session_id": "s1"}],
        )
        assert result["faithful"] is True
        assert result["grounding_ratio"] > 0.3

    def test_ungrounded_answer_not_faithful(self):
        """Low overlap and no citations must be reported as not faithful."""
        result = assess_context_faithfulness(
            "The system uses PostgreSQL with MongoDB sharding.",
            [{"content": "SQLite storage with FTS5.", "session_id": "s1"}],
        )
        assert result["grounding_ratio"] < 0.3
        # The test name promises "not faithful"; assert the verdict itself,
        # not just the ratio that feeds it.
        assert result["faithful"] is False
--- a/tools/session_search_tool.py
+++ b/tools/session_search_tool.py
@@ -176,11 +176,28 @@ async def _summarize_session(
    conversation_text: str, query: str, session_meta: Dict[str, Any]
 ) -> Optional[str]:
    """Summarize a single session conversation focused on the search query."""
-    # Context-faithful prompting: force LLM to ground in transcript
+    system_prompt = (
-    from agent.context_faithful import build_summarization_prompt
+        "You are reviewing a past conversation transcript to help recall what happened. "
-    prompts = build_summarization_prompt(conversation_text, query, session_meta)
+        "Summarize the conversation with a focus on the search topic. Include:\n"
-    system_prompt = prompts["system"]
+        "1. What the user asked about or wanted to accomplish\n"
-    user_prompt = prompts["user"]
+        "2. What actions were taken and what the outcomes were\n"
        "3. Key decisions, solutions found, or conclusions reached\n"
        "4. Any specific commands, files, URLs, or technical details that were important\n"
        "5. Anything left unresolved or notable\n\n"
        "Be thorough but concise. Preserve specific details (commands, paths, error messages) "
        "that would be useful to recall. Write in past tense as a factual recap."
    )
    source = session_meta.get("source", "unknown")
    started = _format_timestamp(session_meta.get("started_at"))
    user_prompt = (
        f"Search topic: {query}\n"
        f"Session source: {source}\n"
        f"Session date: {started}\n\n"
        f"CONVERSATION TRANSCRIPT:\n{conversation_text}\n\n"
        f"Summarize this conversation with focus on: {query}"
    )
    max_retries = 3
    for attempt in range(max_retries):