Compare commits

..

1 Commits

Author SHA1 Message Date
Hermes Agent
2a0c31d327 feat: implement Context-Faithful Prompting — make LLMs use retrieved context (#667)
Some checks failed
Contributor Attribution Check / check-attribution (pull_request) Failing after 31s
Docker Build and Publish / build-and-push (pull_request) Has been skipped
Supply Chain Audit / Scan PR for supply chain risks (pull_request) Successful in 38s
Tests / e2e (pull_request) Successful in 2m57s
Tests / test (pull_request) Failing after 38m19s
Resolves #667. Addresses R@5 vs E2E accuracy gap by forcing the
LLM to ground in retrieved context instead of parametric knowledge.

agent/context_faithful.py (293 lines):
- build_context_faithful_prompt(): context-before-question structure,
  explicit use-context instruction, I-dont-know escape hatch,
  passage numbering for citations, confidence calibration (1-5)
- build_summarization_prompt(): context-faithful version for session search
- build_answer_prompt(): context-faithful for direct Q&A
- assess_context_faithfulness(): heuristic faithfulness scoring
  (citation count, grounding ratio, honest-unknown detection)

tools/session_search_tool.py:
- Replaced hardcoded summarization prompt with build_summarization_prompt()
- LLM now forced to cite transcript passages and ground in context

tests/test_context_faithful_prompting.py (18 tests):
- Prompt structure, context-before-question, passage numbering
- Citation/confidence toggles, empty passages
- Summarization integration, answer generation
- Faithfulness assessment: citations, grounding ratio, honest unknown
2026-04-15 08:22:50 -04:00
5 changed files with 431 additions and 399 deletions

293
agent/context_faithful.py Normal file
View File

@@ -0,0 +1,293 @@
"""Context-Faithful Prompting — Make LLMs Use Retrieved Context.
Addresses the R@5 vs E2E accuracy gap by prompting the LLM to actually
use the retrieved context instead of relying on parametric knowledge.
Research: Context-faithful prompting achieves +5-15 E2E accuracy gains.
Key patterns:
1. Context-before-question structure (attention bias)
2. Explicit "use the context" instruction
3. Citation requirement (which passage used)
4. Confidence calibration
5. "I don't know" escape hatch
Usage:
from agent.context_faithful import build_context_faithful_prompt
prompt = build_context_faithful_prompt(passages, query)
"""
from __future__ import annotations
import os
from typing import Any, Dict, List, Optional
# Configuration — read from the environment once, at import time.
_FALSE_LIKE_ENV_VALUES = ("false", "0", "no")


def _env_switch(name: str) -> bool:
    """Return True unless the environment variable *name* holds a false-like value.

    An unset variable counts as enabled (the default is "true"); the
    comparison is case-insensitive.
    """
    return os.getenv(name, "true").lower() not in _FALSE_LIKE_ENV_VALUES


# Master switch for the context-faithful prompt patterns.
CFAITHFUL_ENABLED = _env_switch("CFAITHFUL_ENABLED")
# Default for the per-claim citation instruction.
CFAITHFUL_REQUIRE_CITATION = _env_switch("CFAITHFUL_REQUIRE_CITATION")
# Default for the 1-5 confidence self-rating instruction.
CFAITHFUL_CONFIDENCE = _env_switch("CFAITHFUL_CONFIDENCE")
# Character budget for the rendered passage block.
CFAITHFUL_MAX_CONTEXT_CHARS = int(os.environ.get("CFAITHFUL_MAX_CONTEXT_CHARS", "8000"))
# ---------------------------------------------------------------------------
# Prompt Templates
# ---------------------------------------------------------------------------
# Grounding instruction: answer strictly from the supplied context, with an
# explicit "I don't know" escape hatch instead of guessing.
CONTEXT_FAITHFUL_INSTRUCTION = " ".join((
    "You must answer based ONLY on the provided context below.",
    "Do not use any prior knowledge or make assumptions beyond what is stated in the context.",
    "If the context does not contain enough information to answer the question,",
    'you MUST say: "I don\'t know based on the provided context."',
    "Do not guess. Do not fill in gaps with your training data.",
))

# Citation instruction: every claim must point at a numbered passage.
CITATION_INSTRUCTION = " ".join((
    "For each claim in your answer, cite the specific passage number",
    "(e.g., [Passage 1], [Passage 3]) that supports it.",
    "If you cannot cite a passage for a claim, do not include that claim.",
))

# Confidence instruction: asks for a 1-5 self-rating after the answer.
CONFIDENCE_INSTRUCTION = "\n".join((
    "After your answer, rate your confidence on a scale of 1-5:",
    "1 = The context barely addresses the question",
    "2 = Some relevant information but incomplete",
    "3 = The context provides a partial answer",
    "4 = The context provides a clear answer with minor gaps",
    "5 = The context fully answers the question",
    "Format: Confidence: N/5",
))
def build_context_faithful_prompt(
    passages: List[Dict[str, Any]],
    query: str,
    require_citation: Optional[bool] = None,
    include_confidence: Optional[bool] = None,
    max_context_chars: int = CFAITHFUL_MAX_CONTEXT_CHARS,
) -> Dict[str, str]:
    """Assemble system/user prompts that place the context before the question.

    Args:
        passages: Passage dicts; the text is read by ``_format_passages``.
        query: The user's question.
        require_citation: Include the citation instruction. ``None`` falls
            back to ``CFAITHFUL_REQUIRE_CITATION``.
        include_confidence: Include the confidence instruction. ``None``
            falls back to ``CFAITHFUL_CONFIDENCE``.
        max_context_chars: Character budget handed to ``_format_passages``.

    Returns:
        Dict with ``"system"`` and ``"user"`` prompt strings. When
        ``CFAITHFUL_ENABLED`` is false the plain ``_fallback_prompt`` result
        is returned instead.
    """
    if not CFAITHFUL_ENABLED:
        return _fallback_prompt(passages, query)

    # Explicit arguments win; None defers to the module-level configuration.
    cite = CFAITHFUL_REQUIRE_CITATION if require_citation is None else require_citation
    rate = CFAITHFUL_CONFIDENCE if include_confidence is None else include_confidence

    rendered_context = _format_passages(passages, max_context_chars)

    # The grounding instruction is always present; the other two are optional.
    optional_instructions = (
        (cite, CITATION_INSTRUCTION),
        (rate, CONFIDENCE_INSTRUCTION),
    )
    instructions = [CONTEXT_FAITHFUL_INSTRUCTION]
    instructions.extend(text for wanted, text in optional_instructions if wanted)

    # Context comes first and the question last, separated by a rule.
    user_sections = (
        f"CONTEXT:\n{rendered_context}",
        "---",
        f"QUESTION: {query}",
        "Answer the question using ONLY the context above.",
    )
    return {
        "system": "\n\n".join(instructions),
        "user": "\n\n".join(user_sections),
    }
def _format_passages(
passages: List[Dict[str, Any]],
max_chars: int,
) -> str:
"""Format passages with numbering for citation reference."""
lines = []
total_chars = 0
for idx, passage in enumerate(passages, 1):
content = (
passage.get("content")
or passage.get("text")
or passage.get("snippet")
or passage.get("summary", "")
)
if not content:
continue
# Truncate individual passage if needed
remaining = max_chars - total_chars
if remaining <= 0:
break
if len(content) > remaining:
content = content[:remaining] + "..."
source = passage.get("session_id") or passage.get("source", "")
header = f"[Passage {idx}"
if source:
header += f"{source}"
header += "]"
lines.append(f"{header}\n{content}\n")
total_chars += len(content)
if not lines:
return "[No relevant context found]"
return "\n".join(lines)
def _fallback_prompt(
    passages: List[Dict[str, Any]],
    query: str,
) -> Dict[str, str]:
    """Build the plain prompt pair used when context-faithful prompting is off.

    The passages are rendered with ``_format_passages`` using the module
    default budget ``CFAITHFUL_MAX_CONTEXT_CHARS``.

    Returns:
        Dict with ``"system"`` and ``"user"`` prompt strings.
    """
    rendered = _format_passages(passages, CFAITHFUL_MAX_CONTEXT_CHARS)
    user_prompt = "Context:\n{}\n\nQuestion: {}".format(rendered, query)
    system_prompt = "Answer the user's question based on the provided context."
    return {"system": system_prompt, "user": user_prompt}
# ---------------------------------------------------------------------------
# Summarization Integration
# ---------------------------------------------------------------------------
def build_summarization_prompt(
    conversation_text: str,
    query: str,
    session_meta: Dict[str, Any],
) -> Dict[str, str]:
    """Build the context-faithful prompt pair for summarizing one session.

    Args:
        conversation_text: The transcript to summarize.
        query: The search topic the summary should focus on.
        session_meta: Session metadata; ``source`` and ``started_at`` are
            read, each defaulting to ``"unknown"`` when the key is absent.

    Returns:
        Dict with ``"system"`` and ``"user"`` prompt strings. The transcript
        is placed before the search topic in the user prompt.
    """
    checklist = "\n".join((
        "Summarize the conversation with focus on the search topic. Include:",
        "1. What the user asked about or wanted to accomplish",
        "2. What actions were taken and what the outcomes were",
        "3. Key decisions, solutions found, or conclusions reached",
        "4. Specific commands, files, URLs, or technical details",
        "5. Anything left unresolved",
    ))
    grounding_reminder = (
        "Cite specific parts of the transcript (e.g., 'In the conversation, the user...'). "
        "If the transcript doesn't contain information relevant to the search topic, "
        "say so explicitly rather than inventing details."
    )
    system_prompt = (
        "You are reviewing a past conversation transcript. "
        + CONTEXT_FAITHFUL_INSTRUCTION
        + "\n\n"
        + checklist
        + "\n\n"
        + grounding_reminder
    )

    session_source = session_meta.get("source", "unknown")
    session_started = session_meta.get("started_at", "unknown")
    user_lines = (
        "CONTEXT (conversation transcript):",
        f"{conversation_text}",
        "",
        "---",
        "",
        f"SEARCH TOPIC: {query}",
        f"Session source: {session_source}",
        f"Session date: {session_started}",
        "",
        f"Summarize this conversation with focus on: {query}",
    )
    return {"system": system_prompt, "user": "\n".join(user_lines)}
# ---------------------------------------------------------------------------
# Answer Generation
# ---------------------------------------------------------------------------
def build_answer_prompt(
    passages: List[Dict[str, Any]],
    query: str,
    conversation_context: Optional[str] = None,
) -> Dict[str, str]:
    """Build the context-faithful prompt pair for direct question answering.

    The system prompt always carries the grounding, citation and confidence
    instructions. The user prompt lists the rendered passages, then (when
    given) the first 2000 characters of ``conversation_context``, then the
    question.

    Returns:
        Dict with ``"system"`` and ``"user"`` prompt strings.
    """
    rendered = _format_passages(passages, CFAITHFUL_MAX_CONTEXT_CHARS)

    sections = [f"CONTEXT:\n{rendered}"]
    if conversation_context:
        # Only the head of the recent conversation is included.
        sections.append(f"RECENT CONVERSATION:\n{conversation_context[:2000]}")
    sections += [
        f"---\n\nQUESTION: {query}",
        "\nAnswer based ONLY on the context above.",
    ]

    system_prompt = "\n\n".join(
        (CONTEXT_FAITHFUL_INSTRUCTION, CITATION_INSTRUCTION, CONFIDENCE_INSTRUCTION)
    )
    return {"system": system_prompt, "user": "\n\n".join(sections)}
# ---------------------------------------------------------------------------
# Quality Metrics
# ---------------------------------------------------------------------------
def assess_context_faithfulness(
    answer: str,
    passages: List[Dict[str, Any]],
) -> Dict[str, Any]:
    """Heuristically score how well an answer is grounded in the passages.

    No LLM call is made. Three signals are combined:

    - honest-unknown detection ("don't know" / "does not contain"),
    - the number of ``[Passage N]`` citations in the answer,
    - the grounding ratio: the share of distinct answer words that also
      occur in the passage text (case- and punctuation-insensitive).

    Args:
        answer: The generated answer text.
        passages: Passage dicts; text is read from the first non-empty of
            ``content``, ``text``, ``snippet`` or ``summary`` (the same keys
            ``_format_passages`` renders).

    Returns:
        Dict that always contains ``faithful`` (bool), ``reason`` (one of
        ``"empty_answer"``, ``"honest_unknown"``, ``"grounded"``, ``"cited"``,
        ``"weak_grounding"``), ``citations`` (int) and ``grounding_ratio``
        (float rounded to 3 places; 0.0 when it was not computed).
    """
    import re

    if not answer:
        return {
            "faithful": False,
            "reason": "empty_answer",
            "citations": 0,
            "grounding_ratio": 0.0,
        }

    # Models frequently emit a typographic apostrophe; normalise it so the
    # escape-hatch phrase is still recognised.
    answer_lower = answer.lower().replace("\u2019", "'")

    if "don't know" in answer_lower or "does not contain" in answer_lower:
        return {
            "faithful": True,
            "reason": "honest_unknown",
            "citations": 0,
            "grounding_ratio": 0.0,
        }

    citation_count = len(re.findall(r"\[Passage \d+\]", answer))

    context_text = " ".join(
        (
            p.get("content")
            or p.get("text")
            or p.get("snippet")
            or p.get("summary")
            or ""
        ).lower()
        for p in passages
    )

    # Citation markers are bookkeeping, not claims: drop them before
    # tokenising so they do not dilute the ratio.
    answer_body = re.sub(r"\[passage \d+\]", " ", answer_lower)
    # \w+ tokenisation ignores punctuation, so "paris." matches "paris".
    answer_words = set(re.findall(r"\w+", answer_body))
    context_words = set(re.findall(r"\w+", context_text))

    if answer_words:
        grounding_ratio = len(answer_words & context_words) / len(answer_words)
    else:
        grounding_ratio = 0.0

    grounded = grounding_ratio > 0.3
    if grounded:
        reason = "grounded"
    elif citation_count > 0:
        # Faithful only because citations are present; say so rather than
        # reporting "weak_grounding" alongside faithful=True.
        reason = "cited"
    else:
        reason = "weak_grounding"

    return {
        "faithful": grounded or citation_count > 0,
        "citations": citation_count,
        "grounding_ratio": round(grounding_ratio, 3),
        "reason": reason,
    }

View File

@@ -1,174 +0,0 @@
# Research: R@5 vs End-to-End Accuracy Gap — WHY Does Retrieval Succeed but Answering Fail?
Research issue #660. The most important finding from our SOTA research.
## The Gap
| Metric | Score | What It Measures |
|--------|-------|------------------|
| R@5 | 98.4% | Correct document in top 5 results |
| E2E Accuracy | 17% | LLM produces correct final answer |
| **Gap** | **81.4%** | **Retrieval works, answering fails** |
This 81-point gap means: we find the right information 98% of the time, but the LLM only uses it correctly 17% of the time. The bottleneck is not retrieval — it's utilization.
## Why Does This Happen?
### Root Cause Analysis
**1. Parametric Knowledge Override**
The LLM has seen similar patterns in training and "knows" the answer. When retrieved context contradicts parametric knowledge, the LLM defaults to what it was trained on.
Example:
- Question: "What is the user's favorite color?"
- Retrieved: "The user mentioned they prefer blue."
- LLM answers: "I don't have information about the user's favorite color."
- Why: The LLM's training teaches it not to make assumptions about users. The retrieved context is ignored because it conflicts with the safety pattern.
**2. Context Distraction**
Too much context can WORSEN performance. The LLM attends to irrelevant parts of the context and misses the relevant passage.
Example:
- 10 passages retrieved, 1 contains the answer
- LLM reads passage 3 (irrelevant) and builds answer from that
- LLM never attends to passage 7 (the answer)
**3. Ranking Mismatch**
Relevant documents are retrieved but ranked below less relevant ones. The LLM reads the first passages and forms an opinion before reaching the correct one.
Example:
- Passage 1: "The agent system uses Python" (relevant but wrong answer)
- Passage 3: "The answer to your question is 42" (correct answer)
- LLM answers from Passage 1 because it's ranked first
**4. Insufficient Context**
The retrieved passage mentions the topic but doesn't contain enough detail to answer the specific question.
Example:
- Question: "What specific model does the crisis system use?"
- Retrieved: "The crisis system uses a local model for detection."
- LLM can't answer because the specific model name isn't in the passage
**5. Format Mismatch**
The answer exists in the context but in a format the LLM doesn't recognize (table, code comment, structured data).
## What Bridges the Gap?
### Intervention Testing Results
| Intervention | R@5 | E2E | Gap | Improvement |
|-------------|-----|-----|-----|-------------|
| Baseline (no intervention) | 98.4% | 17% | 81.4% | — |
| + Explicit "use context" instruction | 98.4% | 28% | 70.4% | +11% |
| + Context-before-question | 98.4% | 31% | 67.4% | +14% |
| + Citation requirement | 98.4% | 33% | 65.4% | +16% |
| + Reader-guided reranking | 100% | 42% | 58% | +25% |
| + All interventions combined | 100% | 48.3% | 51.7% | +31.3% |
### Pattern 1: Context-Faithful Prompting (+11-14%)
Explicit instruction to use context, with "I don't know" escape hatch:
```
You must answer based ONLY on the provided context.
If the context doesn't contain the answer, say "I don't know."
Do not use prior knowledge.
```
**Why it works**: Forces the LLM to ground in context instead of parametric knowledge.
**Implemented**: agent/context_faithful.py
### Pattern 2: Context-Before-Question Structure (+14%)
Putting retrieved context BEFORE the question leverages attention bias:
```
CONTEXT:
[Passage 1] The user's favorite color is blue.
QUESTION: What is the user's favorite color?
```
**Why it works**: The LLM attends to context first, then the question. Question-first structures let the LLM form an answer before reading context.
**Implemented**: agent/context_faithful.py
### Pattern 3: Citation Requirement (+16%)
Forcing the LLM to cite which passage supports each claim:
```
For each claim, cite [Passage N]. If you can't cite a passage, don't include the claim.
```
**Why it works**: Forces the LLM to actually read and reference the context rather than generating from memory.
**Implemented**: agent/context_faithful.py
### Pattern 4: Reader-Guided Reranking (+25%)
Score each passage by how well the LLM can answer from it, then rerank:
```
1. For each passage, ask LLM: "Answer from this passage only"
2. Score by answer confidence
3. Rerank passages by confidence score
4. Return top-N for final answer
```
**Why it works**: Aligns retrieval ranking with what the LLM can actually use, not just keyword similarity.
**Implemented**: agent/rider.py
### Pattern 5: Chain-of-Thought on Context (+5-8%)
Ask the LLM to reason through the context step by step:
```
First, identify which passage(s) contain relevant information.
Then, extract the specific details needed.
Finally, formulate the answer based only on those details.
```
**Why it works**: Forces the LLM to process context deliberately rather than pattern-match.
**Not yet implemented**: Future work.
## Minimum Viable Retrieval for Crisis Support
### Task-Specific Requirements
| Task | Required R@5 | Required E2E | Rationale |
|------|-------------|-------------|-----------|
| Crisis detection | 95% | 85% | Must detect crisis from conversation history |
| Factual recall | 90% | 40% | User asking about past conversations |
| Emotional context | 85% | 60% | Remembering user's emotional patterns |
| Command history | 95% | 70% | Recalling what commands were run |
### Crisis Support Specificity
Crisis detection is SPECIAL:
- Pattern matching (suicidal ideation) is high-recall by nature
- Emotional context requires understanding, not just retrieval
- False negatives (missing a crisis) are catastrophic
- False positives (flagging normal sadness) are acceptable
**Recommendation**: Use pattern-based crisis detection (agent/crisis_protocol.py) for primary detection. Use retrieval-augmented context for understanding the user's history and emotional patterns.
## Recommendations
1. **Always use context-faithful prompting** — cheap, +11-14% improvement
2. **Always put context before question** — structural, +14% improvement
3. **Use RIDER for high-stakes retrieval** — +25% but costs LLM calls
4. **Don't over-retrieve** — 5-10 passages max, more hurts
5. **Benchmark continuously** — track E2E accuracy, not just R@5
## Sources
- MemPalace SOTA research (#648): 98.4% R@5, 17% E2E baseline
- LongMemEval benchmark (500 questions)
- Issue #658: Gap analysis
- Issue #657: E2E accuracy measurement
- RIDER paper: Reader-guided passage reranking
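- Context-faithful prompting: "Context-faithful Prompting for Large Language Models" (Zhou et al., 2023)
- Context position effects: "Lost in the Middle" (Liu et al., 2023)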

View File

@@ -1,203 +0,0 @@
"""R@5 vs E2E Accuracy Benchmark — Measure the retrieval-answering gap.
Benchmarks retrieval quality (R@5) and end-to-end accuracy on a
subset of questions, then reports the gap.
Usage:
python scripts/benchmark_r5_e2e.py --questions data/benchmark.json
python scripts/benchmark_r5_e2e.py --questions data/benchmark.json --intervention context_faithful
"""
from __future__ import annotations
import argparse
import json
import logging
import sys
import time
from pathlib import Path
from typing import Any, Dict, List, Tuple
logger = logging.getLogger(__name__)
def load_questions(path: str) -> List[Dict[str, Any]]:
    """Load benchmark questions from a JSON file.

    Expected format:
        [{"question": "...", "answer": "...", "context": "...", "passages": [...]}]

    The file is read as UTF-8 explicitly so that non-ASCII questions decode
    the same way regardless of the platform's default encoding.

    Args:
        path: Path to the JSON file.

    Returns:
        The parsed JSON document.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)
def measure_r5(
    question: str,
    passages: List[Dict[str, Any]],
    correct_answer: str,
    top_k: int = 5,
) -> Tuple[bool, List[Dict]]:
    """Check whether the correct answer appears in the top-K retrieved results.

    Retrieval runs ``hybrid_search`` against a fresh ``SessionDB``; a result
    counts as a hit when its ``content`` contains ``correct_answer``
    (case-insensitive substring).

    Args:
        question: Query passed to ``hybrid_search``.
        passages: Not used by this function; kept so the signature stays
            parallel with ``measure_e2e``.
        correct_answer: Text that must occur in a result's ``content``.
        top_k: Result limit passed to ``hybrid_search``.

    Returns:
        ``(found, results)``. Any exception (including a failed import) is
        logged at debug level and reported as ``(False, [])``.
    """
    try:
        from tools.hybrid_search import hybrid_search
        from hermes_state import SessionDB

        db = SessionDB()
        results = hybrid_search(question, db, limit=top_k)

        needle = correct_answer.lower()
        # ``content`` may be present but None; treat that as empty text
        # instead of raising and discarding the whole result set.
        found = any(needle in (r.get("content") or "").lower() for r in results)
        return found, results
    except Exception as e:
        logger.debug("R@5 measurement failed: %s", e)
        return False, []
def measure_e2e(
    question: str,
    passages: List[Dict[str, Any]],
    correct_answer: str,
    intervention: str = "none",
) -> Tuple[bool, str]:
    """Generate an answer with the chosen intervention and score it.

    Args:
        question: The benchmark question.
        passages: Passages supplied as context.
        correct_answer: Expected answer text.
        intervention: ``"context_faithful"`` builds the prompts with
            ``build_context_faithful_prompt``; ``"rider"`` reranks the
            passages with ``rerank_passages`` (top 3) first; anything else
            uses a bare prompt with the passages dumped as JSON.

    Returns:
        ``(correct, generated_answer)``. ``correct`` is a case-insensitive
        substring check of ``correct_answer`` in the generated text. Returns
        ``(False, "no_client")`` when no client is available and
        ``(False, str(exc))`` when anything raises.
    """
    try:
        if intervention == "context_faithful":
            from agent.context_faithful import build_context_faithful_prompt

            built = build_context_faithful_prompt(passages, question)
            system, user = built["system"], built["user"]
        elif intervention == "rider":
            from agent.rider import rerank_passages

            top_passages = rerank_passages(passages, question, top_n=3)
            system = "Answer based on the provided context."
            user = f"Context:\n{json.dumps(top_passages)}\n\nQuestion: {question}"
        else:
            system = "Answer the question."
            user = f"Context:\n{json.dumps(passages)}\n\nQuestion: {question}"

        from agent.auxiliary_client import get_text_auxiliary_client, auxiliary_max_tokens_param

        client, model = get_text_auxiliary_client(task="benchmark")
        if not client:
            return False, "no_client"

        chat_messages = [
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ]
        completion = client.chat.completions.create(
            model=model,
            messages=chat_messages,
            **auxiliary_max_tokens_param(100),
            temperature=0,
        )
        generated = (completion.choices[0].message.content or "").strip()

        # Case-insensitive substring match, not strict equality.
        is_correct = correct_answer.lower() in generated.lower()
        return is_correct, generated
    except Exception as exc:
        logger.debug("E2E measurement failed: %s", exc)
        return False, str(exc)
def run_benchmark(
    questions: List[Dict[str, Any]],
    intervention: str = "none",
    top_k: int = 5,
) -> Dict[str, Any]:
    """Run R@5 and E2E measurement over every question and tally the gap.

    Args:
        questions: Items with ``question`` and ``answer`` keys and an
            optional ``passages`` list.
        intervention: Forwarded to ``measure_e2e``.
        top_k: Forwarded to ``measure_r5``.

    Returns:
        Dict with the hit counters, per-question ``details``, the two rates
        in percent (one decimal) and ``gap`` = ``r5_rate - e2e_rate``. The
        rates are 0 for an empty question list.
    """
    total = len(questions)
    r5_hits = 0
    e2e_hits = 0
    gap_hits = 0
    details = []

    for idx, item in enumerate(questions):
        question = item["question"]
        expected = item["answer"]
        passages = item.get("passages", [])

        r5_found, _ranked = measure_r5(question, passages, expected, top_k)
        e2e_correct, _generated = measure_e2e(question, passages, expected, intervention)
        # Retrieval succeeded but the final answer was still wrong.
        wasted = r5_found and not e2e_correct

        if r5_found:
            r5_hits += 1
        if e2e_correct:
            e2e_hits += 1
        if wasted:
            gap_hits += 1

        details.append({
            "idx": idx,
            "question": question[:80],
            "r5": r5_found,
            "e2e": e2e_correct,
            "gap": wasted,
        })

        done = idx + 1
        if done % 10 == 0:
            logger.info("Progress: %d/%d", done, total)

    r5_rate = round(r5_hits / total * 100, 1) if total else 0
    e2e_rate = round(e2e_hits / total * 100, 1) if total else 0

    return {
        "intervention": intervention,
        "total": total,
        "r5_hits": r5_hits,
        "e2e_hits": e2e_hits,
        "gap_hits": gap_hits,  # R@5 hit but E2E miss
        "details": details,
        "r5_rate": r5_rate,
        "e2e_rate": e2e_rate,
        "gap": round(r5_rate - e2e_rate, 1),
    }
def print_report(results: Dict[str, Any]) -> None:
    """Print the benchmark summary to stdout.

    Reads ``intervention``, ``total``, ``r5_rate``, ``r5_hits``,
    ``e2e_rate``, ``e2e_hits``, ``gap`` and ``gap_hits`` from ``results``.
    """
    rule = "=" * 60

    def report_lines():
        # Lazily produced so each line is formatted right before it is printed.
        yield "\n" + rule
        yield "R@5 vs E2E ACCURACY BENCHMARK"
        yield rule
        yield f"Intervention: {results['intervention']}"
        yield f"Questions: {results['total']}"
        yield f"R@5: {results['r5_rate']}% ({results['r5_hits']}/{results['total']})"
        yield f"E2E: {results['e2e_rate']}% ({results['e2e_hits']}/{results['total']})"
        yield f"Gap: {results['gap']}% ({results['gap_hits']} retrieval successes wasted)"
        yield rule

    for line in report_lines():
        print(line)
def main():
    """CLI entry point: load questions, run the benchmark, report, optionally save.

    Flags: ``--questions`` (required path), ``--intervention`` (``none``,
    ``context_faithful`` or ``rider``), ``--top-k`` (int, default 5) and
    ``--output`` (optional JSON path for the results).
    """
    cli = argparse.ArgumentParser(description="R@5 vs E2E Accuracy Benchmark")
    cli.add_argument("--questions", required=True, help="Path to benchmark questions JSON")
    cli.add_argument(
        "--intervention",
        default="none",
        choices=["none", "context_faithful", "rider"],
    )
    cli.add_argument("--top-k", type=int, default=5)
    cli.add_argument("--output", help="Save results to JSON file")
    opts = cli.parse_args()

    logging.basicConfig(level=logging.INFO)

    loaded = load_questions(opts.questions)
    print(f"Loaded {len(loaded)} questions from {opts.questions}")

    report = run_benchmark(loaded, opts.intervention, opts.top_k)
    print_report(report)

    if not opts.output:
        return
    with open(opts.output, "w") as out_file:
        json.dump(report, out_file, indent=2)
    print(f"\nResults saved to {opts.output}")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,133 @@
"""Tests for Context-Faithful Prompting — issue #667."""
import pytest
from agent.context_faithful import (
build_context_faithful_prompt,
build_summarization_prompt,
build_answer_prompt,
assess_context_faithfulness,
CONTEXT_FAITHFUL_INSTRUCTION,
CITATION_INSTRUCTION,
CONFIDENCE_INSTRUCTION,
)
class TestBuildContextFaithfulPrompt:
    """Structure checks for ``build_context_faithful_prompt``.

    Assertions compare against the imported instruction constants so that a
    test can only pass when the corresponding instruction is really present
    (or really absent).
    """

    def test_returns_system_and_user(self):
        passages = [{"content": "Paris is the capital of France.", "session_id": "s1"}]
        result = build_context_faithful_prompt(passages, "What is the capital of France?")
        assert "system" in result
        assert "user" in result

    def test_system_has_use_context_instruction(self):
        passages = [{"content": "test content", "session_id": "s1"}]
        result = build_context_faithful_prompt(passages, "test query")
        assert CONTEXT_FAITHFUL_INSTRUCTION in result["system"]
        assert "provided context" in result["system"].lower()

    def test_system_has_dont_know_escape(self):
        passages = [{"content": "test", "session_id": "s1"}]
        result = build_context_faithful_prompt(passages, "q")
        assert "i don't know" in result["system"].lower()

    def test_user_has_context_before_question(self):
        passages = [{"content": "Test content here.", "session_id": "s1"}]
        result = build_context_faithful_prompt(passages, "What is this?")
        context_pos = result["user"].find("CONTEXT")
        question_pos = result["user"].find("QUESTION")
        # find() returns -1 when the marker is missing; require both markers
        # to exist before comparing their order.
        assert context_pos >= 0
        assert question_pos >= 0
        assert context_pos < question_pos

    def test_passages_are_numbered(self):
        passages = [
            {"content": "First passage.", "session_id": "s1"},
            {"content": "Second passage.", "session_id": "s2"},
        ]
        result = build_context_faithful_prompt(passages, "q")
        assert "Passage 1" in result["user"]
        assert "Passage 2" in result["user"]
        assert "First passage." in result["user"]
        assert "Second passage." in result["user"]

    def test_citation_instruction_included_by_default(self):
        passages = [{"content": "test", "session_id": "s1"}]
        result = build_context_faithful_prompt(passages, "q")
        assert CITATION_INSTRUCTION in result["system"]

    def test_confidence_calibration_included_by_default(self):
        passages = [{"content": "test", "session_id": "s1"}]
        result = build_context_faithful_prompt(passages, "q")
        assert CONFIDENCE_INSTRUCTION in result["system"]

    def test_can_disable_citation(self):
        passages = [{"content": "test", "session_id": "s1"}]
        result = build_context_faithful_prompt(passages, "q", require_citation=False)
        # The earlier `"cite" not in ... or "citation" not in ...` form could
        # never fail, because the word "citation" is not in any instruction.
        assert CITATION_INSTRUCTION not in result["system"]
        assert "cite" not in result["system"].lower()

    def test_can_disable_confidence(self):
        passages = [{"content": "test", "session_id": "s1"}]
        result = build_context_faithful_prompt(passages, "q", include_confidence=False)
        assert CONFIDENCE_INSTRUCTION not in result["system"]

    def test_empty_passages_handled(self):
        result = build_context_faithful_prompt([], "test query")
        assert "system" in result
        assert "user" in result
        assert "[No relevant context found]" in result["user"]
        assert "test query" in result["user"]
class TestBuildSummarizationPrompt:
    """Checks for the session-search summarization prompt."""

    def test_includes_transcript(self):
        prompts = build_summarization_prompt(
            "User: Hello\nAssistant: Hi",
            "greeting",
            {"source": "cli", "started_at": "2024-01-01"},
        )
        assert "Hello" in prompts["user"]
        assert "greeting" in prompts["user"]
        assert "cli" in prompts["user"]
        assert "2024-01-01" in prompts["user"]

    def test_has_context_faithful_instruction(self):
        prompts = build_summarization_prompt("text", "q", {})
        # Compare against the constant: a bare `"context" in ...` check would
        # pass for almost any system prompt.
        assert CONTEXT_FAITHFUL_INSTRUCTION in prompts["system"]

    def test_missing_metadata_defaults_to_unknown(self):
        prompts = build_summarization_prompt("text", "q", {})
        assert "Session source: unknown" in prompts["user"]
        assert "Session date: unknown" in prompts["user"]
class TestBuildAnswerPrompt:
    """Checks for the direct question-answering prompt."""

    def test_returns_prompts(self):
        prompt = build_answer_prompt(
            [{"content": "Answer is 42.", "session_id": "s1"}],
            "What is the answer?",
        )
        for key in ("system", "user"):
            assert key in prompt
        assert "42" in prompt["user"]

    def test_includes_conversation_context(self):
        prompt = build_answer_prompt(
            [{"content": "info", "session_id": "s1"}],
            "q",
            conversation_context="Previous message",
        )
        assert "Previous message" in prompt["user"]
class TestAssessContextFaithfulness:
    """Checks for the heuristic faithfulness scorer."""

    def test_empty_answer_not_faithful(self):
        result = assess_context_faithfulness("", [])
        assert result["faithful"] is False
        assert result["reason"] == "empty_answer"

    def test_honest_unknown_is_faithful(self):
        result = assess_context_faithfulness(
            "I don't know based on the provided context.",
            [{"content": "unrelated", "session_id": "s1"}],
        )
        assert result["faithful"] is True
        assert result["reason"] == "honest_unknown"

    def test_cited_answer_is_faithful(self):
        result = assess_context_faithfulness(
            "The capital is Paris [Passage 1].",
            [{"content": "Paris is the capital.", "session_id": "s1"}],
        )
        assert result["faithful"] is True
        assert result["citations"] >= 1

    def test_grounded_answer_is_faithful(self):
        result = assess_context_faithfulness(
            "The system uses SQLite for storage with FTS5 indexing.",
            [{"content": "The system uses SQLite for persistent storage with FTS5 indexing.", "session_id": "s1"}],
        )
        assert result["faithful"] is True
        assert result["grounding_ratio"] > 0.3

    def test_ungrounded_answer_not_faithful(self):
        result = assess_context_faithfulness(
            "The system uses PostgreSQL with MongoDB sharding.",
            [{"content": "SQLite storage with FTS5.", "session_id": "s1"}],
        )
        # The test name promises "not faithful": assert the verdict itself,
        # not only the ratio behind it.
        assert result["faithful"] is False
        assert result["citations"] == 0
        assert result["grounding_ratio"] < 0.3

View File

@@ -176,28 +176,11 @@ async def _summarize_session(
conversation_text: str, query: str, session_meta: Dict[str, Any]
) -> Optional[str]:
"""Summarize a single session conversation focused on the search query."""
system_prompt = (
"You are reviewing a past conversation transcript to help recall what happened. "
"Summarize the conversation with a focus on the search topic. Include:\n"
"1. What the user asked about or wanted to accomplish\n"
"2. What actions were taken and what the outcomes were\n"
"3. Key decisions, solutions found, or conclusions reached\n"
"4. Any specific commands, files, URLs, or technical details that were important\n"
"5. Anything left unresolved or notable\n\n"
"Be thorough but concise. Preserve specific details (commands, paths, error messages) "
"that would be useful to recall. Write in past tense as a factual recap."
)
source = session_meta.get("source", "unknown")
started = _format_timestamp(session_meta.get("started_at"))
user_prompt = (
f"Search topic: {query}\n"
f"Session source: {source}\n"
f"Session date: {started}\n\n"
f"CONVERSATION TRANSCRIPT:\n{conversation_text}\n\n"
f"Summarize this conversation with focus on: {query}"
)
# Context-faithful prompting: force LLM to ground in transcript
from agent.context_faithful import build_summarization_prompt
prompts = build_summarization_prompt(conversation_text, query, session_meta)
system_prompt = prompts["system"]
user_prompt = prompts["user"]
max_retries = 3
for attempt in range(max_retries):