336 lines
12 KiB
Python
336 lines
12 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""Deep Dive Quality Evaluation Framework — Issue #830
|
||
|
|
|
||
|
|
Scores generated briefings against a multi-dimensional rubric.
|
||
|
|
Detects drift across consecutive runs. Supports A/B prompt testing.
|
||
|
|
|
||
|
|
Usage:
|
||
|
|
python3 quality_eval.py /path/to/briefing_20260405_124506.json
|
||
|
|
python3 quality_eval.py /path/to/briefing.json --previous /path/to/briefing_yesterday.json
|
||
|
|
python3 quality_eval.py /path/to/briefing.json --json
|
||
|
|
"""
|
||
|
|
|
||
|
|
import argparse
import json
import math
import sys
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
# Rubric configuration (tunable)
# ---------------------------------------------------------------------------

# Target briefing length in words; score_conciseness peaks at the band's
# midpoint and decays linearly outside it.
TARGET_WORD_COUNT_MIN = 600
TARGET_WORD_COUNT_MAX = 1200
# Target spoken duration in minutes (derived from word count at ~130 wpm by
# _estimate_audio_minutes).
TARGET_AUDIO_MINUTES_MIN = 10
TARGET_AUDIO_MINUTES_MAX = 15
# NOTE(review): not referenced anywhere in this file — presumably reserved
# for a future cap on source counts; confirm before removing.
MAX_SOURCES_EXPECTED = 12

# Case-insensitive substrings counted by score_relevance; full marks at
# roughly 30% coverage of this list.
RELEVANCE_KEYWORDS = [
    "llm", "agent", "architecture", "hermes", "tool use", "mcp",
    "reinforcement learning", "rlhf", "grpo", "transformer",
    "local model", "llama.cpp", "gemma", "inference", "alignment",
    "fleet", "timmy", "nexus", "openclaw", "sovereign",
]

# Case-insensitive substrings counted by score_actionability — words that
# signal explicit recommendations, risks, or next steps.
ACTIONABILITY_MARKERS = [
    "implication", "recommend", "should", "next step", "action",
    "deploy", "integrate", "watch", "risk", "opportunity",
]

# Case-insensitive substrings counted by score_grounding — fleet-specific
# names and terms that show injected context was actually used.
GROUNDING_MARKERS = [
    "fleet", "repo", "issue", "pr ", "commit", "milestone",
    "wizard", "hermes", "timmy", "nexus", "openclaw", "bezalel",
]
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
class QualityReport:
    """Aggregated quality scores for a single briefing evaluation.

    All scores are on a 0-100 scale. ``drift_score`` is populated only when
    a previous briefing was supplied for comparison.
    """

    briefing_path: str
    overall_score: float            # weighted blend of the five dimensions
    relevance_score: float          # AI/ML keyword coverage
    grounding_score: float          # fleet-context usage
    conciseness_score: float        # distance from target length band
    actionability_score: float      # presence of recommendations/next steps
    source_diversity_score: float   # unique source domains (or item count)
    # Similarity to the previous briefing (100 = identical vocabulary);
    # None when no previous briefing was provided.
    drift_score: Optional[float] = None
    # Fix: these were annotated ``List[str] = None``, which is an incorrect
    # type (None is not a List). Annotate as Optional; __post_init__ still
    # normalizes None to fresh lists, so existing callers are unaffected.
    warnings: Optional[List[str]] = None
    recommendations: Optional[List[str]] = None

    def __post_init__(self) -> None:
        # Normalize None defaults to fresh lists so callers can always
        # append/iterate without None checks. (A plain mutable default
        # would be shared across instances, hence this pattern.)
        if self.warnings is None:
            self.warnings = []
        if self.recommendations is None:
            self.recommendations = []
|
||
|
|
|
||
|
|
|
||
|
|
def load_briefing(path: Path) -> Dict[str, Any]:
    """Read and parse a briefing JSON payload from *path*."""
    raw = path.read_text(encoding="utf-8")
    return json.loads(raw)
|
||
|
|
|
||
|
|
|
||
|
|
def _word_count(text: str) -> int:
|
||
|
|
return len(text.split())
|
||
|
|
|
||
|
|
|
||
|
|
def _estimate_audio_minutes(word_count: int, wpm: int = 130) -> float:
|
||
|
|
return round(word_count / wpm, 1)
|
||
|
|
|
||
|
|
|
||
|
|
def score_relevance(briefing: Dict[str, Any]) -> tuple[float, List[str]]:
    """Score how well the briefing covers AI/ML topics relevant to Hermes work."""
    text = _extract_full_text(briefing).lower()
    matched = [kw for kw in RELEVANCE_KEYWORDS if kw in text]
    hits = len(matched)

    # Full marks at ~30% keyword coverage; capped at 100.
    denominator = max(len(RELEVANCE_KEYWORDS) * 0.3, 1)
    score = min(100.0, hits / denominator * 100.0)

    warnings: List[str] = []
    if hits < 3:
        warnings.append("Briefing lacks AI/ML relevance keywords.")

    return round(score, 1), warnings
|
||
|
|
|
||
|
|
|
||
|
|
def score_grounding(briefing: Dict[str, Any]) -> tuple[float, List[str]]:
    """Score how well the briefing incorporates fleet context."""
    text = _extract_full_text(briefing).lower()
    injected_ctx = briefing.get("fleet_context") or briefing.get("context") or {}

    hits = len([marker for marker in GROUNDING_MARKERS if marker in text])
    # Full marks at ~20% marker coverage; capped at 100.
    score = min(100.0, hits / max(len(GROUNDING_MARKERS) * 0.2, 1) * 100.0)

    # Halve the score when context was injected but barely referenced.
    if injected_ctx and hits < 2:
        score = score * 0.5

    warnings: List[str] = []
    if not injected_ctx:
        warnings.append("No fleet_context found in briefing payload.")
    elif hits < 2:
        warnings.append("Fleet context was injected but not referenced in briefing text.")

    return round(score, 1), warnings
|
||
|
|
|
||
|
|
|
||
|
|
def score_conciseness(briefing: Dict[str, Any]) -> tuple[float, List[str]]:
    """Score whether briefing length lands in the target zone."""
    wc = _word_count(_extract_full_text(briefing))
    audio_min = _estimate_audio_minutes(wc)

    warnings: List[str] = []
    wc_band = f"{TARGET_WORD_COUNT_MIN}-{TARGET_WORD_COUNT_MAX}"
    if wc < TARGET_WORD_COUNT_MIN:
        warnings.append(f"Briefing too short ({wc} words). Target: {wc_band}.")
    elif wc > TARGET_WORD_COUNT_MAX:
        warnings.append(f"Briefing too long ({wc} words). Target: {wc_band}.")

    audio_band = f"{TARGET_AUDIO_MINUTES_MIN}-{TARGET_AUDIO_MINUTES_MAX}"
    if audio_min < TARGET_AUDIO_MINUTES_MIN:
        warnings.append(f"Audio estimate too short ({audio_min} min). Target: {audio_band}.")
    elif audio_min > TARGET_AUDIO_MINUTES_MAX:
        warnings.append(f"Audio estimate too long ({audio_min} min). Target: {audio_band}.")

    # Score peaks at the center of the word-count band and falls off
    # linearly with distance from it (zero only at twice the band width).
    center_wc = (TARGET_WORD_COUNT_MIN + TARGET_WORD_COUNT_MAX) / 2
    max_dev = max(center_wc, TARGET_WORD_COUNT_MAX - center_wc) * 2
    score = max(0.0, 100.0 - abs(wc - center_wc) / max_dev * 100.0)

    return round(score, 1), warnings
|
||
|
|
|
||
|
|
|
||
|
|
def score_actionability(briefing: Dict[str, Any]) -> tuple[float, List[str]]:
    """Score whether the briefing contains explicit recommendations or next steps."""
    text = _extract_full_text(briefing).lower()
    found = [marker for marker in ACTIONABILITY_MARKERS if marker in text]

    # Full marks at ~30% marker coverage; capped at 100.
    score = min(100.0, len(found) / max(len(ACTIONABILITY_MARKERS) * 0.3, 1) * 100.0)

    warnings: List[str] = []
    if len(found) < 2:
        warnings.append("Briefing lacks explicit actionability markers (recommendations, next steps, risks).")

    return round(score, 1), warnings
|
||
|
|
|
||
|
|
|
||
|
|
def score_source_diversity(briefing: Dict[str, Any]) -> tuple[float, List[str]]:
    """Score whether the briefing draws from a healthy variety of sources.

    Prefers the explicit ``sources`` list (scored on unique URL domains,
    5+ domains = full marks); falls back to the ``items_ranked`` count as
    a proxy (8+ items = full marks) when no sources are listed.

    Returns:
        (score, warnings) where score is 0-100.
    """
    sources = briefing.get("sources", [])
    if not sources and "items_ranked" in briefing:
        # Fallback: use items_ranked count as proxy
        n = briefing.get("items_ranked", 0)
        score = min(100.0, (n / 8) * 100.0)
        warnings: List[str] = []
        if n < 5:
            warnings.append(f"Only {n} items ranked — source diversity may be low.")
        return round(score, 1), warnings

    domains = set()
    for src in sources:
        url = src.get("url", "")
        if url:
            # Fix: the previous hand-rolled split ("//" check + index [2])
            # misparsed unusual URLs (e.g. "a//b" yielded "b"). urlparse
            # handles schemed and scheme-relative URLs; for scheme-less
            # URLs ("example.com/x") netloc is empty, so fall back to the
            # original first-segment behavior.
            netloc = urlparse(url).netloc
            domains.add(netloc or url.split("/")[0])

    score = min(100.0, (len(domains) / 5) * 100.0)
    warnings = []
    if len(domains) < 3:
        warnings.append(f"Only {len(domains)} unique sources — diversity may be low.")

    return round(score, 1), warnings
|
||
|
|
|
||
|
|
|
||
|
|
def detect_drift(current: Dict[str, Any], previous: Dict[str, Any]) -> tuple[float, List[str]]:
    """Detect content drift between two briefings using simple overlap heuristics."""
    curr_words = set(_extract_full_text(current).lower().split())
    prev_words = set(_extract_full_text(previous).lower().split())

    if not curr_words or not prev_words:
        return 0.0, ["Cannot compute drift — empty briefing text."]

    # Jaccard similarity of vocabularies, scaled so 100 = identical and
    # 0 = completely different.
    shared = curr_words & prev_words
    combined = curr_words | prev_words
    jaccard = len(shared) / len(combined)
    score = round(jaccard * 100, 1)

    warnings: List[str] = []
    if score < 15:
        warnings.append(f"High drift detected (Jaccard={jaccard:.2f}). Briefings share very little vocabulary.")
    elif score > 85:
        warnings.append(f"Low drift (Jaccard={jaccard:.2f}). Briefings may be repetitive or stale.")

    return score, warnings
|
||
|
|
|
||
|
|
|
||
|
|
def _extract_full_text(briefing: Dict[str, Any]) -> str:
|
||
|
|
"""Best-effort extraction of briefing text from payload variants."""
|
||
|
|
candidates = [
|
||
|
|
briefing.get("briefing_text"),
|
||
|
|
briefing.get("text"),
|
||
|
|
briefing.get("summary"),
|
||
|
|
briefing.get("content"),
|
||
|
|
]
|
||
|
|
for c in candidates:
|
||
|
|
if c and isinstance(c, str):
|
||
|
|
return c
|
||
|
|
|
||
|
|
# If briefing has sections
|
||
|
|
sections = briefing.get("sections", [])
|
||
|
|
if sections:
|
||
|
|
return "\n\n".join(str(s.get("text", s)) for s in sections)
|
||
|
|
|
||
|
|
# If briefing has ranked items
|
||
|
|
items = briefing.get("ranked_items", briefing.get("items", []))
|
||
|
|
if items:
|
||
|
|
return "\n\n".join(
|
||
|
|
f"{i.get('title', '')}\n{i.get('summary', i.get('text', ''))}" for i in items
|
||
|
|
)
|
||
|
|
|
||
|
|
return json.dumps(briefing, indent=2)
|
||
|
|
|
||
|
|
|
||
|
|
def evaluate(briefing_path: Path, previous_path: Optional[Path] = None) -> QualityReport:
    """Run every rubric dimension against a briefing and build a QualityReport."""
    briefing = load_briefing(briefing_path)

    rel_score, rel_warn = score_relevance(briefing)
    grd_score, grd_warn = score_grounding(briefing)
    con_score, con_warn = score_conciseness(briefing)
    act_score, act_warn = score_actionability(briefing)
    div_score, div_warn = score_source_diversity(briefing)

    warnings = [*rel_warn, *grd_warn, *con_warn, *act_warn, *div_warn]

    # Weighted blend: relevance/grounding dominate, diversity counts least.
    overall = round(
        rel_score * 0.25
        + grd_score * 0.25
        + con_score * 0.20
        + act_score * 0.20
        + div_score * 0.10,
        1,
    )

    recommendations = []
    if overall < 60:
        recommendations.append("CRITICAL: Briefing quality is below acceptable threshold. Review synthesis prompt and source configuration.")
    # Per-dimension advice for any dimension scoring below 50.
    dimension_advice = [
        (rel_score, "Relevance is low. Expand keyword list or tighten source aggregation."),
        (grd_score, "Grounding is weak. Verify fleet_context injection is working and prompt references it explicitly."),
        (con_score, "Length is off-target. Adjust synthesis prompt word-count guidance or ranking threshold."),
        (act_score, "Actionability is low. Add explicit instructions to the synthesis prompt to include 'Implications' and 'Recommended Actions' sections."),
    ]
    recommendations.extend(msg for value, msg in dimension_advice if value < 50)

    drift_score = None
    if previous_path:
        drift_score, drift_warn = detect_drift(briefing, load_briefing(previous_path))
        warnings.extend(drift_warn)

    return QualityReport(
        briefing_path=str(briefing_path),
        overall_score=overall,
        relevance_score=rel_score,
        grounding_score=grd_score,
        conciseness_score=con_score,
        actionability_score=act_score,
        source_diversity_score=div_score,
        drift_score=drift_score,
        warnings=warnings,
        recommendations=recommendations,
    )
|
||
|
|
|
||
|
|
|
||
|
|
def print_report(report: QualityReport, json_mode: bool = False):
    """Render a QualityReport to stdout.

    Args:
        report: The evaluation result to display.
        json_mode: When True, emit the report as indented JSON
            (machine-readable) and skip the human-readable layout.
    """
    if json_mode:
        # Dataclass -> plain dict so the full report serializes cleanly.
        print(json.dumps(asdict(report), indent=2))
        return

    # Human-readable framed layout: header, per-dimension scores, then
    # optional warning/recommendation lists.
    # NOTE(review): label padding appears to have been collapsed by
    # formatting at some point — confirm column alignment against the
    # intended output before relying on exact spacing.
    print("=" * 70)
    print(" DEEP DIVE QUALITY EVALUATION REPORT")
    print("=" * 70)
    print(f" Briefing : {report.briefing_path}")
    print(f" Overall : {report.overall_score}/100")
    print("-" * 70)
    print(f" Relevance : {report.relevance_score:>6}/100")
    print(f" Grounding : {report.grounding_score:>6}/100")
    print(f" Conciseness : {report.conciseness_score:>6}/100")
    print(f" Actionability : {report.actionability_score:>6}/100")
    print(f" Source Diversity : {report.source_diversity_score:>6}/100")
    # drift_score is Optional — only present when evaluate() was given a
    # previous briefing for comparison.
    if report.drift_score is not None:
        print(f" Drift vs Previous: {report.drift_score:>6}/100")
    print("-" * 70)

    if report.warnings:
        print("\n⚠️ WARNINGS:")
        for w in report.warnings:
            print(f" • {w}")

    if report.recommendations:
        print("\n💡 RECOMMENDATIONS:")
        for r in report.recommendations:
            print(f" • {r}")

    print("=" * 70)
|
||
|
|
|
||
|
|
|
||
|
|
def main():
    """CLI entry point: evaluate one briefing, print a report, set exit code.

    Exit codes: 0 = acceptable quality, 1 = bad arguments / missing file,
    2 = critically low quality (overall score below 50).
    """
    parser = argparse.ArgumentParser(description="Evaluate Deep Dive briefing quality")
    parser.add_argument("briefing", type=Path, help="Path to briefing JSON")
    parser.add_argument("--previous", type=Path, help="Path to previous briefing JSON for drift detection")
    parser.add_argument("--json", action="store_true", help="Output JSON")
    args = parser.parse_args()

    if not args.briefing.exists():
        print(f"Error: briefing not found: {args.briefing}", file=sys.stderr)
        sys.exit(1)
    # Fix: a missing --previous path previously surfaced as a raw
    # FileNotFoundError traceback from load_briefing; fail cleanly instead,
    # mirroring the check on the positional argument.
    if args.previous and not args.previous.exists():
        print(f"Error: previous briefing not found: {args.previous}", file=sys.stderr)
        sys.exit(1)

    report = evaluate(args.briefing, args.previous)
    print_report(report, json_mode=args.json)

    # Exit non-zero if quality is critically low
    sys.exit(0 if report.overall_score >= 50 else 2)


if __name__ == "__main__":
    main()
|