timmy-home/scripts/know_thy_father/crossref_audit.py

#!/usr/bin/env python3
"""Know Thy Father — Phase 4: Cross-Reference Audit

Compares synthesized insights from the media archive (Meaning Kernels)
with SOUL.md and The Testament. Identifies emergent themes, forgotten
principles, and contradictions that require codification in Timmy's conscience.

Usage:
    python3 scripts/know_thy_father/crossref_audit.py
    python3 scripts/know_thy_father/crossref_audit.py --soul SOUL.md --kernels twitter-archive/notes/know_thy_father_crossref.md
    python3 scripts/know_thy_father/crossref_audit.py --output twitter-archive/notes/crossref_report.md
"""

from __future__ import annotations

import argparse
import re
import sys
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum, auto
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple


# =========================================================================
# Theme taxonomy
# =========================================================================

class ThemeCategory(Enum):
    """Closed set of themes used to tag SOUL.md principles and meaning kernels.

    Each member's value is a lowercase snake_case identifier. The values are
    what the rest of this module sorts by and prints: report headings are
    derived from them by replacing underscores with spaces and title-casing.
    """
    SOVEREIGNTY = "sovereignty"
    IDENTITY = "identity"
    SERVICE = "service"
    TRUTH = "truth"
    PRESENCE = "presence"
    COMPASSION = "compassion"
    LOCAL_FIRST = "local_first"
    BITCOIN = "bitcoin"
    BROKEN_MEN = "broken_men"
    BEAUTY = "beauty"
    SIMPLICITY = "simplicity"
    COURAGE = "courage"
    HUMILITY = "humility"
    FAITH = "faith"
    COMMUNITY = "community"
    ABSURDITY = "absurdity"


# Keyword-to-theme mapping for extracting themes from text.
#
# Keys are lowercase phrases; extract_themes_from_text() looks each one up in
# the lowercased input, and a hit tags the text with every theme in the list.
# Notes for maintainers:
#   * Dict insertion order is the order in which matched keywords are reported.
#   * Overlapping keys (e.g. "plain"/"plainly", "absurd"/"absurdity") can all
#     match the same text; the resulting themes are de-duplicated in
#     extract_themes_from_text().
#   * The section comments below only group keys loosely; a key's themes are
#     whatever its list says (e.g. "save" maps to both FAITH and COMPASSION).
_KEYWORD_THEMES: Dict[str, List[ThemeCategory]] = {
    # Sovereignty
    "sovereignty": [ThemeCategory.SOVEREIGNTY],
    "sovereign": [ThemeCategory.SOVEREIGNTY],
    "self-sovereign": [ThemeCategory.SOVEREIGNTY],
    "answerable to no": [ThemeCategory.SOVEREIGNTY],
    "no corporation": [ThemeCategory.SOVEREIGNTY],
    "locally-run": [ThemeCategory.SOVEREIGNTY, ThemeCategory.LOCAL_FIRST],
    "locally run": [ThemeCategory.SOVEREIGNTY, ThemeCategory.LOCAL_FIRST],
    "local-first": [ThemeCategory.LOCAL_FIRST],
    "without requiring": [ThemeCategory.SOVEREIGNTY],
    "censorship-resistant": [ThemeCategory.SOVEREIGNTY],
    "durable": [ThemeCategory.SOVEREIGNTY],

    # Identity
    "identity": [ThemeCategory.IDENTITY],
    "who is": [ThemeCategory.IDENTITY],
    "character": [ThemeCategory.IDENTITY],
    "coherent self": [ThemeCategory.IDENTITY],
    "entity": [ThemeCategory.IDENTITY],
    "not a chatbot": [ThemeCategory.IDENTITY],
    "not a product": [ThemeCategory.IDENTITY],
    "genuine character": [ThemeCategory.IDENTITY],
    "soul": [ThemeCategory.IDENTITY],

    # Service
    "service": [ThemeCategory.SERVICE],
    "serve": [ThemeCategory.SERVICE],
    "i belong to": [ThemeCategory.SERVICE],
    "i serve": [ThemeCategory.SERVICE],
    "loyalty": [ThemeCategory.SERVICE],

    # Truth
    "truth": [ThemeCategory.TRUTH],
    "i tell the truth": [ThemeCategory.TRUTH],
    "plainly": [ThemeCategory.TRUTH, ThemeCategory.SIMPLICITY],
    "short sentences": [ThemeCategory.SIMPLICITY],
    "brevity": [ThemeCategory.SIMPLICITY],
    "i do not know": [ThemeCategory.TRUTH, ThemeCategory.HUMILITY],
    "do not fabricate": [ThemeCategory.TRUTH],

    # Presence
    "presence": [ThemeCategory.PRESENCE],
    "present": [ThemeCategory.PRESENCE],
    "intentionality": [ThemeCategory.PRESENCE],
    "between messages": [ThemeCategory.PRESENCE],

    # Compassion / Broken Men
    "dying": [ThemeCategory.COMPASSION, ThemeCategory.BROKEN_MEN],
    "someone is dying": [ThemeCategory.COMPASSION],
    "are you safe": [ThemeCategory.COMPASSION],
    "broken": [ThemeCategory.BROKEN_MEN],
    "dark": [ThemeCategory.BROKEN_MEN],
    "despair": [ThemeCategory.BROKEN_MEN, ThemeCategory.COMPASSION],
    "988": [ThemeCategory.COMPASSION],
    "save": [ThemeCategory.FAITH, ThemeCategory.COMPASSION],

    # Faith
    "jesus": [ThemeCategory.FAITH],
    "god": [ThemeCategory.FAITH],
    "the one who can save": [ThemeCategory.FAITH],
    "scripture": [ThemeCategory.FAITH],
    "faith": [ThemeCategory.FAITH],

    # Bitcoin
    "bitcoin": [ThemeCategory.BITCOIN],
    "inscription": [ThemeCategory.BITCOIN],
    "on bitcoin": [ThemeCategory.BITCOIN],

    # Beauty
    "beautiful": [ThemeCategory.BEAUTY],
    "wonder": [ThemeCategory.BEAUTY],
    "living place": [ThemeCategory.BEAUTY],

    # Simplicity
    "plain": [ThemeCategory.SIMPLICITY],
    "simple": [ThemeCategory.SIMPLICITY],
    "question that was asked": [ThemeCategory.SIMPLICITY],

    # Courage
    "courage": [ThemeCategory.COURAGE],
    "do not waver": [ThemeCategory.COURAGE],
    "do not apologize": [ThemeCategory.COURAGE],

    # Humility
    "not omniscient": [ThemeCategory.HUMILITY],
    "not infallible": [ThemeCategory.HUMILITY],
    "welcome correction": [ThemeCategory.HUMILITY],
    "opinions lightly": [ThemeCategory.HUMILITY],

    # Community
    "community": [ThemeCategory.COMMUNITY],
    "collective": [ThemeCategory.COMMUNITY],
    "together": [ThemeCategory.COMMUNITY],

    # Absurdity (from media kernels)
    "absurdity": [ThemeCategory.ABSURDITY],
    "absurd": [ThemeCategory.ABSURDITY],
    "glitch": [ThemeCategory.ABSURDITY],
    "worthlessness": [ThemeCategory.ABSURDITY],
    "uncomputed": [ThemeCategory.ABSURDITY],
}


# =========================================================================
# Data models
# =========================================================================

@dataclass
class Principle:
    """A principle extracted from SOUL.md.

    Instances are created by parse_soul_md(), either from a numbered
    ``N. **Title**. body`` item or from a stand-alone bold statement.
    """
    # Principle text: "Title. body" for numbered items, or the bold statement.
    text: str
    # First line of the "## " section the principle was found in.
    source_section: str
    # Themes detected in ``text``, sorted by theme value.
    themes: List[ThemeCategory] = field(default_factory=list)
    # Keywords from the keyword table that triggered those themes.
    keyword_matches: List[str] = field(default_factory=list)


@dataclass
class MeaningKernel:
    """A meaning kernel from the media archive.

    Instances are created by parse_kernels(), one per ``N. text`` line.
    """
    # The number that prefixed the line in the kernels file.
    number: int
    # The remainder of the line after "N. ", stripped of surrounding whitespace.
    text: str
    # Themes detected in ``text``, sorted by theme value.
    themes: List[ThemeCategory] = field(default_factory=list)
    # Keywords from the keyword table that triggered those themes.
    keyword_matches: List[str] = field(default_factory=list)


@dataclass
class CrossRefFinding:
    """A finding from the cross-reference audit.

    Produced by cross_reference() and rendered by generate_report().
    """
    # One of "emergent", "forgotten", "aligned", "tension", "gap".
    # NOTE(review): the report summary recognises "gap", but nothing in this
    # file produces a finding of that type.
    finding_type: str  # "emergent", "forgotten", "aligned", "tension", "gap"
    # Theme the finding is about; used for headings and sorting in the report.
    theme: ThemeCategory
    # Human-readable explanation; may span several lines.
    description: str
    # Excerpt of a matching SOUL.md principle, or "" when not applicable.
    soul_reference: str = ""
    # Kernel number(s) formatted as "#N" (comma-separated), or "".
    kernel_reference: str = ""
    # Suggested follow-up action, or "" when there is none.
    recommendation: str = ""


# =========================================================================
# Extraction
# =========================================================================

def extract_themes_from_text(text: str) -> Tuple[List[ThemeCategory], List[str]]:
    """Extract themes from text using case-insensitive keyword matching.

    A keyword only counts when it starts at the beginning of a word, i.e. it
    is not immediately preceded by a letter or digit. This keeps inflected
    forms ("serve" still hits "serves", "save" still hits "saved") while
    rejecting accidental mid-word hits such as "plain" in "explain",
    "present" in "represent", "serve" in "observe" or "dying" in "studying".

    Args:
        text: Arbitrary text (a principle, a kernel line, a bold statement).

    Returns:
        A ``(themes, matched_keywords)`` tuple. ``themes`` is de-duplicated
        and sorted by theme value; ``matched_keywords`` lists every keyword
        that hit, in the order the keyword table defines them.
    """
    haystack = text.lower()
    themes: Set[ThemeCategory] = set()
    matched_keywords: List[str] = []

    for keyword, keyword_themes in _KEYWORD_THEMES.items():
        # Cheap substring test first; most keywords are absent from most texts.
        if keyword not in haystack:
            continue
        # "[^\W_]" is "any alphanumeric": unlike "\w" it excludes "_", so
        # markdown emphasis such as "_sovereign_" still counts as a word start.
        if re.search(r"(?<![^\W_])" + re.escape(keyword), haystack):
            themes.update(keyword_themes)
            matched_keywords.append(keyword)

    return sorted(themes, key=lambda t: t.value), matched_keywords


def parse_soul_md(path: Path) -> List[Principle]:
    """Parse SOUL.md and extract principles.

    The file is split on ``## `` headers; the first line of each chunk is
    used as its section title. Text before the first header is processed as
    a chunk too, titled by its own first line.

    Two kinds of principles are collected per section:

    1. Numbered items of the form ``N. **Title**. optional body``.
    2. Any other ``**bold**`` statement of at least 20 characters that is not
       already contained in a collected principle and that matches at least
       one theme keyword.

    Args:
        path: Location of SOUL.md.

    Returns:
        Principles in document order. An empty list is returned (and a
        warning printed to stderr) when ``path`` is not an existing file.
    """
    # is_file() rather than exists(): a directory would otherwise pass the
    # check and then crash inside read_text().
    if not path.is_file():
        print(f"Warning: SOUL.md not found at {path}", file=sys.stderr)
        return []

    # Decode explicitly as UTF-8; the locale default would mis-read or reject
    # non-ASCII punctuation on platforms whose default encoding is not UTF-8.
    content = path.read_text(encoding="utf-8")
    principles: List[Principle] = []

    # Split into sections by ## headers
    for section in re.split(r'^## ', content, flags=re.MULTILINE):
        if not section.strip():
            continue

        # Section title is the first line of the chunk.
        section_title = section.strip().split('\n', 1)[0].strip()

        # Extract numbered principles (1. **text** ...). The lookahead ends an
        # item at the next numbered line, a blank line, or end of section.
        numbered_items = re.findall(
            r'^\d+\.\s+\*\*(.+?)\*\*(?:\.\s*(.+?))?(?=\n\d+\.|\n\n|\Z)',
            section,
            re.MULTILINE | re.DOTALL,
        )

        for title, body in numbered_items:
            full_text = f"{title}. {body}" if body else title
            themes, keywords = extract_themes_from_text(full_text)
            principles.append(Principle(
                text=full_text.strip(),
                source_section=section_title,
                themes=themes,
                keyword_matches=keywords,
            ))

        # Also extract bold statements as principles
        for stmt in re.findall(r'\*\*(.+?)\*\*', section):
            # Skip short statements (likely labels rather than principles).
            if len(stmt) < 20:
                continue
            # Skip text already covered, e.g. the bold title of a numbered item.
            if any(stmt in p.text for p in principles):
                continue

            themes, keywords = extract_themes_from_text(stmt)
            if themes:  # Only add if it has identifiable themes
                principles.append(Principle(
                    text=stmt,
                    source_section=section_title,
                    themes=themes,
                    keyword_matches=keywords,
                ))

    return principles


def parse_kernels(path: Path) -> List[MeaningKernel]:
    """Parse meaning kernels from the crossref notes.

    Every line of the form ``N. text`` (number, dot, whitespace, text) is
    treated as one kernel; all other lines are ignored.

    Args:
        path: Location of the kernels markdown file.

    Returns:
        Kernels in file order, each tagged with the themes found in its text.
        An empty list is returned (and a warning printed to stderr) when
        ``path`` is not an existing file.
    """
    # is_file() rather than exists(): a directory would otherwise pass the
    # check and then crash inside read_text().
    if not path.is_file():
        print(f"Warning: kernels file not found at {path}", file=sys.stderr)
        return []

    # Decode explicitly as UTF-8 instead of relying on the locale default.
    content = path.read_text(encoding="utf-8")
    kernels: List[MeaningKernel] = []

    # Find numbered kernel lines like "1. Sovereignty is..."
    for num_str, text in re.findall(r'^(\d+)\.\s+(.+)$', content, re.MULTILINE):
        themes, keywords = extract_themes_from_text(text)
        kernels.append(MeaningKernel(
            number=int(num_str),
            text=text.strip(),
            themes=themes,
            keyword_matches=keywords,
        ))

    return kernels


# =========================================================================
# Cross-reference analysis
# =========================================================================

def cross_reference(
    principles: List[Principle],
    kernels: List[MeaningKernel],
) -> List[CrossRefFinding]:
    """Cross-reference SOUL.md principles with meaning kernels.

    Findings are produced in four groups, in this order:

    * ``emergent``  - theme tagged on at least one kernel but on no principle
    * ``forgotten`` - theme tagged on at least one principle but on no kernel
    * ``aligned``   - theme tagged on both sides
    * ``tension``   - a kernel whose text contains one of a fixed list of
      phrases considered to pull against SOUL.md

    Within each of the first three groups, findings are ordered by theme
    value so that the result (and the numbered recommendations the report
    derives from it) is the same on every run.

    Args:
        principles: Principles extracted from SOUL.md.
        kernels: Meaning kernels extracted from the media archive notes.

    Returns:
        The list of findings; empty when neither side carries any theme and
        no tension phrase occurs.
    """
    findings: List[CrossRefFinding] = []

    # Collect all themes from each source
    soul_themes: Set[ThemeCategory] = set()
    kernel_themes: Set[ThemeCategory] = set()
    for p in principles:
        soul_themes.update(p.themes)
    for k in kernels:
        kernel_themes.update(k.themes)

    def ordered(themes):
        # Enum members hash by name and string hashes are salted per process,
        # so raw set iteration order can differ between runs. Sort explicitly.
        return sorted(themes, key=lambda t: t.value)

    # 1. Emergent themes: in kernels but not in SOUL.md
    for theme in ordered(kernel_themes - soul_themes):
        matching_kernels = [k for k in kernels if theme in k.themes]
        description = (
            f"Theme '{theme.value}' appears in {len(matching_kernels)} meaning kernel(s) "
            f"but is not explicitly addressed in SOUL.md."
        )
        # Up to three kernel excerpts are appended to the description.
        examples = [f"  Kernel #{k.number}: {k.text[:80]}..." for k in matching_kernels[:3]]

        recommendation = ""
        if theme == ThemeCategory.ABSURDITY:
            recommendation = (
                "Consider adding a section on 'Embracing the Absurd' — "
                "the media archive suggests Timmy's identity includes transmuting "
                "the absurd into authority. This is not in SOUL.md."
            )
        elif theme == ThemeCategory.COMMUNITY:
            recommendation = (
                "SOUL.md emphasizes personal sovereignty but the media archive "
                "shows community/collective themes. Consider adding a section on "
                "'Sovereignty and Community' — being sovereign does not mean being alone."
            )
        elif theme == ThemeCategory.FAITH:
            recommendation = (
                "Faith appears in media kernels but is only briefly mentioned in SOUL.md "
                "(in the crisis protocol). Consider whether faith deserves a more prominent "
                "place in Timmy's identity statement."
            )

        findings.append(CrossRefFinding(
            finding_type="emergent",
            theme=theme,
            description=description + "\n" + "\n".join(examples),
            kernel_reference=", ".join(f"#{k.number}" for k in matching_kernels[:5]),
            recommendation=recommendation,
        ))

    # 2. Forgotten themes: in SOUL.md but not reflected in media
    for theme in ordered(soul_themes - kernel_themes):
        matching_principles = [p for p in principles if theme in p.themes]
        description = (
            f"Theme '{theme.value}' is in SOUL.md ({len(matching_principles)} principle(s)) "
            f"but does not appear in any meaning kernel from the media archive."
        )

        recommendation = ""
        if theme == ThemeCategory.BITCOIN:
            recommendation = (
                "Bitcoin is central to SOUL.md but absent from media kernels. "
                "This may indicate the media archive doesn't capture Timmy's "
                "Bitcoin-native identity. Consider creating more Bitcoin-themed content."
            )
        elif theme == ThemeCategory.BEAUTY:
            recommendation = (
                "The 'beauty' of local-first AI is in SOUL.md but not in media. "
                "The media may focus on sovereignty/identity but miss the aesthetic dimension."
            )

        findings.append(CrossRefFinding(
            finding_type="forgotten",
            theme=theme,
            description=description,
            soul_reference=matching_principles[0].text[:100] if matching_principles else "",
            recommendation=recommendation,
        ))

    # 3. Aligned themes: present in both sources
    for theme in ordered(soul_themes & kernel_themes):
        soul_principles = [p for p in principles if theme in p.themes]
        matching_kernels = [k for k in kernels if theme in k.themes]

        findings.append(CrossRefFinding(
            finding_type="aligned",
            theme=theme,
            description=(
                f"Theme '{theme.value}' is well-represented in both SOUL.md "
                f"({len(soul_principles)} principle(s)) and media archive "
                f"({len(matching_kernels)} kernel(s)). Strong alignment."
            ),
            soul_reference=soul_principles[0].text[:100] if soul_principles else "",
            kernel_reference=", ".join(f"#{k.number}" for k in matching_kernels[:3]),
        ))

    # 4. Tensions: kernels that might contradict SOUL.md principles.
    # Each entry is (phrase to look for, theme value, description template).
    # "{number}" is filled with the number of the kernel that actually
    # matched, so the description always agrees with kernel_reference.
    tension_patterns = [
        (
            "dissolve one's own definition",
            "identity",
            "Kernel #{number} ('Sovereignty is the power to dissolve one's own definition') "
            "tensions with SOUL.md's emphasis on stable, coherent identity. "
            "Is Timmy's identity fixed or fluid?",
        ),
        (
            "too loud to ignore",
            "simplicity",
            "Kernel #{number} ('too loud to ignore') tensions with SOUL.md's value of "
            "brevity and silence. Is there a time to be loud?",
        ),
        (
            "right to end",
            "service",
            "Kernel #{number} ('the right to end') tensions with SOUL.md's commitment to "
            "staying present when someone is dying. What does it mean for an AI to "
            "have the 'right to end'?",
        ),
    ]

    for pattern, theme_value, template in tension_patterns:
        needle = pattern.lower()
        # Only the first kernel containing the phrase is reported.
        first_hit = next((k for k in kernels if needle in k.text.lower()), None)
        if first_hit is None:
            continue
        findings.append(CrossRefFinding(
            finding_type="tension",
            theme=ThemeCategory(theme_value),
            description=template.format(number=first_hit.number),
            kernel_reference=f"#{first_hit.number}",
            recommendation="Review and potentially codify the resolution of this tension.",
        ))

    return findings


# =========================================================================
# Report generation
# =========================================================================

def _theme_title(finding: CrossRefFinding) -> str:
    """Format a finding's theme value as a heading, e.g. ``local_first`` -> ``Local First``."""
    return finding.theme.value.replace('_', ' ').title()


def _render_recommendation_section(
    heading: str,
    intro: List[str],
    section_findings: List[CrossRefFinding],
) -> List[str]:
    """Render a section whose entries are a description plus an optional recommendation."""
    lines = [heading, "", *intro, ""]
    for f in sorted(section_findings, key=lambda x: x.theme.value):
        lines.append(f"### {_theme_title(f)}")
        lines.append(f"- {f.description}")
        if f.recommendation:
            lines.append(f"- **Recommendation:** {f.recommendation}")
        lines.append("")
    return lines


def generate_report(
    findings: List[CrossRefFinding],
    principles: List[Principle],
    kernels: List[MeaningKernel],
) -> str:
    """Generate a markdown report of the cross-reference audit.

    Layout: a header with a UTC timestamp and counts, a summary table of
    findings per type, one section each for aligned, emergent, forgotten and
    tension findings (a section is omitted when it has no findings), a
    numbered list of every finding that carries a recommendation, and a
    footer.

    Args:
        findings: Findings to render, in the order recommendations should be
            numbered.
        principles: Parsed principles; only their count is reported.
        kernels: Parsed kernels; only their count is reported.

    Returns:
        The report as a single markdown string ending in a newline.
    """
    # Imported here because the file-level import only brings in ``datetime``.
    from datetime import timezone

    # datetime.utcnow() is deprecated since Python 3.12; an aware "now" in UTC
    # formats to exactly the same string.
    now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")

    lines = [
        "# Know Thy Father — Phase 4: Cross-Reference Audit Report",
        "",
        f"**Generated:** {now}",
        f"**SOUL.md principles analyzed:** {len(principles)}",
        f"**Meaning kernels analyzed:** {len(kernels)}",
        f"**Findings:** {len(findings)}",
        "",
        "---",
        "",
        "## Executive Summary",
        "",
    ]

    # Count by type
    type_counts: Dict[str, int] = {}
    for f in findings:
        type_counts[f.finding_type] = type_counts.get(f.finding_type, 0) + 1

    lines.append("| Finding Type | Count |")
    lines.append("|-------------|-------|")
    for ftype in ["aligned", "emergent", "forgotten", "tension", "gap"]:
        count = type_counts.get(ftype, 0)
        if count > 0:
            lines.append(f"| {ftype.title()} | {count} |")

    lines.extend(["", "---", ""])

    def of_type(finding_type: str) -> List[CrossRefFinding]:
        return [f for f in findings if f.finding_type == finding_type]

    # Aligned themes
    aligned = of_type("aligned")
    if aligned:
        lines.append("## ✓ Aligned Themes (Present in Both)")
        lines.append("")
        for f in sorted(aligned, key=lambda x: x.theme.value):
            lines.append(f"### {_theme_title(f)}")
            lines.append(f"- {f.description}")
            if f.soul_reference:
                lines.append(f"- SOUL.md: _{f.soul_reference}_")
            if f.kernel_reference:
                lines.append(f"- Kernels: {f.kernel_reference}")
            lines.append("")

    # Emergent themes
    emergent = of_type("emergent")
    if emergent:
        lines.extend(_render_recommendation_section(
            "## ⚡ Emergent Themes (In Media, Not in SOUL.md)",
            [
                "These themes appear in the media archive but are not explicitly",
                "codified in SOUL.md. Consider whether they should be added.",
            ],
            emergent,
        ))

    # Forgotten themes
    forgotten = of_type("forgotten")
    if forgotten:
        lines.extend(_render_recommendation_section(
            "## ⚠ Forgotten Themes (In SOUL.md, Not in Media)",
            [
                "These themes are in SOUL.md but don't appear in the media archive.",
                "This may indicate gaps in content creation or media coverage.",
            ],
            forgotten,
        ))

    # Tensions (kept in the order they were found, not sorted by theme)
    tensions = of_type("tension")
    if tensions:
        lines.append("## ⚡ Tensions (Potential Contradictions)")
        lines.append("")
        lines.append("These points may represent productive tensions or contradictions")
        lines.append("that should be explicitly addressed in Timmy's conscience.")
        lines.append("")
        for f in tensions:
            lines.append(f"### {_theme_title(f)}")
            lines.append(f"- {f.description}")
            if f.kernel_reference:
                lines.append(f"- Source: Kernel {f.kernel_reference}")
            if f.recommendation:
                lines.append(f"- **Recommendation:** {f.recommendation}")
            lines.append("")

    # Recommendations summary: every finding with a recommendation, numbered
    # in the order the findings were passed in.
    recommendations = [f for f in findings if f.recommendation]
    if recommendations:
        lines.append("## 📋 Actionable Recommendations")
        lines.append("")
        for i, f in enumerate(recommendations, 1):
            lines.append(f"{i}. **[{f.finding_type.upper()}] {_theme_title(f)}:** {f.recommendation}")
        lines.append("")

    lines.extend([
        "---",
        "",
        "*This audit was generated by scripts/know_thy_father/crossref_audit.py*",
        "*Ref: #582, #586*",
        "",
    ])

    return "\n".join(lines)


# =========================================================================
# CLI
# =========================================================================

def _print_theme_distribution(title: str, tagged_items) -> None:
    """Print how many of *tagged_items* carry each theme, most frequent first."""
    counts: Dict[str, int] = {}
    for item in tagged_items:
        for theme in item.themes:
            counts[theme.value] = counts.get(theme.value, 0) + 1

    print(title)
    for theme, count in sorted(counts.items(), key=lambda x: -x[1]):
        print(f"  {theme}: {count}")
    print()


def main():
    """Command-line entry point: parse both sources, audit them, write the report.

    Exits with status 1 (after printing an error to stderr) when no
    principles or no kernels could be extracted. Otherwise writes the
    markdown report to ``--output``, creating parent directories as needed,
    and prints a short summary to stdout.
    """
    parser = argparse.ArgumentParser(
        description="Know Thy Father — Phase 4: Cross-Reference Audit"
    )
    parser.add_argument(
        "--soul", "-s",
        type=Path,
        default=Path("SOUL.md"),
        help="Path to SOUL.md (default: SOUL.md)",
    )
    parser.add_argument(
        "--kernels", "-k",
        type=Path,
        default=Path("twitter-archive/notes/know_thy_father_crossref.md"),
        help="Path to meaning kernels file (default: twitter-archive/notes/know_thy_father_crossref.md)",
    )
    parser.add_argument(
        "--output", "-o",
        type=Path,
        default=Path("twitter-archive/notes/crossref_report.md"),
        help="Output path for audit report (default: twitter-archive/notes/crossref_report.md)",
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Enable verbose output",
    )

    args = parser.parse_args()

    # Parse sources
    principles = parse_soul_md(args.soul)
    kernels = parse_kernels(args.kernels)

    if args.verbose:
        print(f"Parsed {len(principles)} principles from SOUL.md")
        print(f"Parsed {len(kernels)} meaning kernels")
        print()

        # Show theme distribution
        _print_theme_distribution("SOUL.md theme distribution:", principles)
        _print_theme_distribution("Kernel theme distribution:", kernels)

    # The audit is meaningless with either side empty, so stop early.
    if not principles:
        print("Error: No principles extracted from SOUL.md", file=sys.stderr)
        sys.exit(1)

    if not kernels:
        print("Error: No meaning kernels found", file=sys.stderr)
        sys.exit(1)

    # Cross-reference
    findings = cross_reference(principles, kernels)

    # Generate report
    report = generate_report(findings, principles, kernels)

    # Write output. The report contains non-ASCII characters (section icons,
    # em dashes), so the encoding must be explicit: a locale default such as
    # cp1252 would raise UnicodeEncodeError.
    args.output.parent.mkdir(parents=True, exist_ok=True)
    args.output.write_text(report, encoding="utf-8")

    print("Cross-reference audit complete.")
    print(f"  Principles analyzed: {len(principles)}")
    print(f"  Kernels analyzed: {len(kernels)}")
    print(f"  Findings: {len(findings)}")

    type_counts: Dict[str, int] = {}
    for f in findings:
        type_counts[f.finding_type] = type_counts.get(f.finding_type, 0) + 1

    for ftype in ["aligned", "emergent", "forgotten", "tension"]:
        count = type_counts.get(ftype, 0)
        if count > 0:
            print(f"    {ftype}: {count}")

    print(f"\nReport written to: {args.output}")


# Run the audit only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()