feat: Add diff analyzer tests (closes #176 )

feat: Add diff analyzer (closes #176 )
2026-04-15 03:57:21 +00:00 · 2026-04-15 03:56:27 +00:00
3 changed files with 405 additions and 276 deletions
--- a/scripts/diff_analyzer.py
+++ b/scripts/diff_analyzer.py
@@ -0,0 +1,216 @@
 #!/usr/bin/env python3
 """
 Diff Analyzer — Parse unified diffs and categorize every change.
 Pipeline 6.1 for Compounding Intelligence.
 """
 import re
 from dataclasses import dataclass, field, asdict
 from enum import Enum
 from typing import List, Dict, Any, Optional
 class ChangeCategory(Enum):
    """Labels that can be attached to a diff hunk.

    The values are the lowercase strings that ``Hunk.to_dict`` emits for its
    ``category`` field.
    """
    ADDED = "added"  # hunk contains "+" lines and no "-" lines
    DELETED = "deleted"  # hunk contains "-" lines and no "+" lines
    MODIFIED = "modified"  # hunk contains both "+" and "-" lines
    MOVED = "moved"  # NOTE(review): never assigned by DiffAnalyzer in this file
    CONTEXT = "context"  # hunk has neither "+" nor "-" lines; also the Hunk default
@dataclass
class Hunk:
    """One ``@@ ... @@`` section of a file's diff.

    Stores the text that trails the closing ``@@`` of the hunk header, the
    old/new start line and line count taken from that header, the hunk's
    lines, and the category assigned to the hunk.
    """
    header: str
    old_start: int
    old_count: int
    new_start: int
    new_count: int
    lines: List[str] = field(default_factory=list)
    category: ChangeCategory = ChangeCategory.CONTEXT

    def to_dict(self) -> Dict[str, Any]:
        """Return the fields as a plain dict, with ``category`` as its string value."""
        # Overriding the key inside the merge keeps it in its original position.
        return {**asdict(self), "category": self.category.value}
@dataclass
class FileChange:
    """Everything recorded about one file within a diff."""
    path: str
    old_path: Optional[str] = None  # previous path, set for renames
    hunks: List[Hunk] = field(default_factory=list)
    added_lines: int = 0
    deleted_lines: int = 0
    is_new: bool = False
    is_deleted: bool = False
    is_renamed: bool = False
    is_binary: bool = False

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict view; each hunk is converted with ``Hunk.to_dict``."""
        serialized_hunks = [hunk.to_dict() for hunk in self.hunks]
        return dict(
            path=self.path,
            old_path=self.old_path,
            hunks=serialized_hunks,
            added_lines=self.added_lines,
            deleted_lines=self.deleted_lines,
            is_new=self.is_new,
            is_deleted=self.is_deleted,
            is_renamed=self.is_renamed,
            is_binary=self.is_binary,
        )
@dataclass
class ChangeSummary:
    """Totals across a whole diff plus the per-file ``FileChange`` records."""
    files: List[FileChange] = field(default_factory=list)
    total_added: int = 0
    total_deleted: int = 0
    total_files_changed: int = 0
    total_hunks: int = 0
    new_files: int = 0
    deleted_files: int = 0
    renamed_files: int = 0
    binary_files: int = 0

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict view: the counters first, then the per-file list."""
        result: Dict[str, Any] = dict(
            total_files_changed=self.total_files_changed,
            total_added=self.total_added,
            total_deleted=self.total_deleted,
            total_hunks=self.total_hunks,
            new_files=self.new_files,
            deleted_files=self.deleted_files,
            renamed_files=self.renamed_files,
            binary_files=self.binary_files,
        )
        result["files"] = [entry.to_dict() for entry in self.files]
        return result
 class DiffAnalyzer:
     """Parse unified diff text into a structured ``ChangeSummary``.

     The input is split into per-file sections on ``diff --git`` lines. Each
     section is parsed in two phases: the header (everything before the first
     ``@@`` line) yields the path and the new/deleted/renamed/binary flags,
     and the remainder yields the hunks.

     Known limitations: a plain (non-git) unified diff covering several files
     is parsed as one section because it has no ``diff --git`` separators,
     and quoted or ambiguous paths in the ``diff --git`` line are not decoded.
     """

     HUNK_HEADER_RE = re.compile(r"^@@\s+-(\d+)(?:,(\d+))?\s+\+(\d+)(?:,(\d+))?\s+@@(.*)$")
     DIFF_FILE_RE = re.compile(r"^diff --git a/(.*) b/(.*)")
     RENAME_RE = re.compile(r"^rename from (.+)$")
     RENAME_TO_RE = re.compile(r"^rename to (.+)$")
     NEW_FILE_RE = re.compile(r"^new file mode")
     DELETED_FILE_RE = re.compile(r"^deleted file mode")
     # Also recognise the marker that precedes an embedded binary patch.
     BINARY_RE = re.compile(r"^(?:Binary files .* differ|GIT binary patch)")

     def analyze(self, diff_text: str) -> ChangeSummary:
         """Parse *diff_text* and return the aggregated ``ChangeSummary``.

         Empty or whitespace-only input returns an empty summary.
         """
         summary = ChangeSummary()
         if not diff_text or not diff_text.strip():
             return summary
         for section in self._split_files(diff_text):
             fc = self._parse_file_diff(section)
             summary.files.append(fc)
             summary.total_added += fc.added_lines
             summary.total_deleted += fc.deleted_lines
             summary.total_hunks += len(fc.hunks)
             if fc.is_new:
                 summary.new_files += 1
             if fc.is_deleted:
                 summary.deleted_files += 1
             if fc.is_renamed:
                 summary.renamed_files += 1
             if fc.is_binary:
                 summary.binary_files += 1
         summary.total_files_changed = len(summary.files)
         return summary

     def _split_files(self, diff_text: str) -> List[str]:
         """Split a multi-file diff into one text section per file.

         A section that neither starts with ``diff --git`` nor contains a hunk
         header (for example a commit message placed before the first file) is
         dropped, so it is not reported as a file with an empty path. A section
         without a ``diff --git`` line that does contain hunks is kept so plain
         unified diffs are still analysed.
         """
         sections: List[List[str]] = []
         current: List[str] = []
         for line in diff_text.split("\n"):
             if line.startswith("diff --git ") and current:
                 sections.append(current)
                 current = []
             current.append(line)
         if current:
             sections.append(current)
         return [
             "\n".join(section)
             for section in sections
             if section[0].startswith("diff --git ")
             or any(self.HUNK_HEADER_RE.match(entry) for entry in section)
         ]

     def _parse_file_diff(self, diff_text: str) -> FileChange:
         """Parse one file section into a ``FileChange``.

         Binary sections are returned without hunks.
         """
         lines = diff_text.split("\n")
         fc = FileChange(path="")
         # Everything before the first hunk header is file-level metadata.
         first_hunk = next(
             (idx for idx, line in enumerate(lines) if self.HUNK_HEADER_RE.match(line)),
             len(lines),
         )
         self._parse_header(lines[:first_hunk], fc)
         if fc.is_binary:
             return fc
         self._parse_hunks(lines[first_hunk:], fc)
         return fc

     def _parse_header(self, header_lines: List[str], fc: FileChange) -> None:
         """Fill the path, old_path and state flags of *fc* from the header lines."""
         rename_to: Optional[str] = None
         old_marker: Optional[str] = None
         new_marker: Optional[str] = None
         for line in header_lines:
             m = self.DIFF_FILE_RE.match(line)
             if m:
                 if not fc.path:
                     fc.path = m.group(2)
                 continue
             if self.NEW_FILE_RE.match(line):
                 fc.is_new = True
             elif self.DELETED_FILE_RE.match(line):
                 fc.is_deleted = True
             elif (m := self.RENAME_RE.match(line)):
                 fc.old_path = m.group(1)
                 fc.is_renamed = True
             elif (m := self.RENAME_TO_RE.match(line)):
                 rename_to = m.group(1)
             elif self.BINARY_RE.match(line):
                 fc.is_binary = True
                 break
             elif line.startswith("--- "):
                 old_marker = line[4:]
             elif line.startswith("+++ "):
                 new_marker = line[4:]
         if fc.is_renamed and rename_to is not None:
             fc.path = rename_to
         if not fc.path:
             # No usable "diff --git a/... b/..." line: fall back to ---/+++.
             fc.path = self._path_from_markers(old_marker, new_marker)

     @staticmethod
     def _path_from_markers(old_marker: Optional[str], new_marker: Optional[str]) -> str:
         """Derive a path from the ``+++``/``---`` header text, or return ``""``.

         The new-side marker is preferred. ``/dev/null`` is skipped, anything
         after a tab (the timestamp in plain ``diff -u`` output) is discarded,
         and a leading ``b/`` or ``a/`` prefix is removed.
         """
         for marker, prefix in ((new_marker, "b/"), (old_marker, "a/")):
             if not marker:
                 continue
             candidate = marker.split("\t", 1)[0].strip()
             if not candidate or candidate == "/dev/null":
                 continue
             if candidate.startswith(prefix):
                 candidate = candidate[len(prefix):]
             return candidate
         return ""

     def _parse_hunks(self, body_lines: List[str], fc: FileChange) -> None:
         """Collect the hunks found in *body_lines* onto *fc* and classify them.

         Each hunk consumes lines only until the old/new line counts declared
         in its header are used up. Text after a complete hunk (such as the
         ``-- `` signature that ends a mailed patch, or the ``---``/``+++``
         header of a following file in a plain unified diff) is therefore not
         counted as a change.
         """
         hunk: Optional[Hunk] = None
         old_left = 0
         new_left = 0
         for line in body_lines:
             m = self.HUNK_HEADER_RE.match(line)
             if m:
                 if hunk is not None:
                     self._classify_hunk(hunk, fc)
                     fc.hunks.append(hunk)
                 hunk = Hunk(
                     header=m.group(5).strip(),
                     old_start=int(m.group(1)),
                     # An omitted count means exactly one line.
                     old_count=int(m.group(2) or 1),
                     new_start=int(m.group(3)),
                     new_count=int(m.group(4) or 1),
                 )
                 old_left, new_left = hunk.old_count, hunk.new_count
                 continue
             if hunk is None or (old_left <= 0 and new_left <= 0):
                 continue
             if line.startswith("+"):
                 new_left -= 1
             elif line.startswith("-"):
                 old_left -= 1
             elif line.startswith(" "):
                 old_left -= 1
                 new_left -= 1
             else:
                 # e.g. "\ No newline at end of file": not part of the counts.
                 continue
             hunk.lines.append(line)
         if hunk is not None:
             self._classify_hunk(hunk, fc)
             fc.hunks.append(hunk)

     def _classify_hunk(self, hunk: Hunk, fc: FileChange) -> None:
         """Set ``hunk.category`` and add the hunk's +/- line counts to *fc*."""
         added = sum(1 for entry in hunk.lines if entry.startswith("+"))
         deleted = sum(1 for entry in hunk.lines if entry.startswith("-"))
         fc.added_lines += added
         fc.deleted_lines += deleted
         if added and deleted:
             hunk.category = ChangeCategory.MODIFIED
         elif added:
             hunk.category = ChangeCategory.ADDED
         elif deleted:
             hunk.category = ChangeCategory.DELETED
         else:
             hunk.category = ChangeCategory.CONTEXT
--- a/scripts/session_metadata.py
+++ b/scripts/session_metadata.py
@@ -1,276 +0,0 @@
 #!/usr/bin/env python3
 """
 session_metadata.py - Extract structured metadata from Hermes session transcripts.
 Works alongside session_reader.py to provide higher-level session analysis.
 """
 import json
 import re
 import sys
 from dataclasses import dataclass, asdict
 from datetime import datetime
 from pathlib import Path
 from typing import Dict, List, Optional, Any
 # Import from session_reader (the canonical reader)
 from session_reader import read_session
@dataclass
 class SessionSummary:
    """Structured summary of a Hermes session transcript.

    Instances are built by ``extract_session_metadata`` and serialised with
    ``dataclasses.asdict`` by the CLI and the directory processor.
    """
    session_id: str  # transcript file name without its extension
    model: str  # "model" of the last assistant entry that carries one, else "unknown"
    repo: str  # repo name matched in message content, else "unknown"
    outcome: str  # one of "success", "partial", "failure", "unknown"
    message_count: int  # number of entries returned by read_session
    tool_calls: int  # summed length of every entry's "tool_calls" list
    duration_estimate: str  # "<minutes>m" between first and last timestamp, or "unknown"
    key_actions: List[str]  # up to 10 distinct tool function names, first-seen order
    errors_encountered: List[str]  # up to 5 distinct messages from tool entries flagged is_error
    start_time: Optional[str] = None  # first timestamp seen in the transcript
    end_time: Optional[str] = None  # last timestamp seen in the transcript
    total_tokens_estimate: int = 0  # whitespace-separated word count of message content
    user_messages: int = 0  # entries with role "user"
    assistant_messages: int = 0  # entries with role "assistant"
    tool_outputs: int = 0  # entries with role "tool"
 def _parse_session_timestamp(value):
     """Return *value* parsed with the first matching known format, else ``None``."""
     if not isinstance(value, str):
         return None
     for fmt in ("%Y-%m-%dT%H:%M:%S.%fZ", "%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%d %H:%M:%S"):
         try:
             return datetime.strptime(value, fmt)
         except ValueError:
             continue
     return None


 def extract_session_metadata(file_path: str) -> SessionSummary:
     """
     Extract structured metadata from a Hermes session JSONL transcript.

     The transcript is loaded with ``session_reader.read_session``. A missing
     file does not raise: a summary with ``outcome="failure"`` and a single
     "File not found" error is returned instead.

     Args:
         file_path: Path to the transcript; its stem becomes ``session_id``.

     Returns:
         A ``SessionSummary`` holding message/tool counts, the detected model
         and repo, a duration estimate, and an outcome classification.
     """
     session_id = Path(file_path).stem

     # Tried in order for each message; the first pattern that matches wins.
     # The character classes use \s (whitespace). The earlier "\\s" inside a
     # raw string excluded a backslash and the letter "s" instead, which cut
     # repo names short at their first "s".
     repo_patterns = [
         r"(?:the-nexus|compounding-intelligence|timmy-config|hermes-agent)",
         r"(?:forge\.alexanderwhitestone\.com/([^/]+/[^/\s]+))",
         r"(?:github\.com/([^/]+/[^/\s]+))",
         r"(?:Timmy_Foundation/([^/\s]+))",
     ]

     try:
         # Use the canonical reader from session_reader.py
         messages = read_session(file_path)
     except FileNotFoundError:
         return SessionSummary(
             session_id=session_id,
             model="unknown",
             repo="unknown",
             outcome="failure",
             message_count=0,
             tool_calls=0,
             duration_estimate="0m",
             key_actions=[],
             errors_encountered=[f"File not found: {file_path}"]
         )

     model = "unknown"
     repo = "unknown"
     tool_calls_count = 0
     key_actions = []
     errors = []
     start_time = None
     end_time = None
     total_tokens = 0

     for entry in messages:
         role = entry.get("role")

         # The last assistant entry that names a model wins.
         if role == "assistant" and entry.get("model"):
             model = entry["model"]

         ts = entry.get("timestamp")
         if ts:
             if start_time is None:
                 start_time = ts
             end_time = ts

         tool_calls = entry.get("tool_calls")
         if tool_calls:
             tool_calls_count += len(tool_calls)
             for tc in tool_calls:
                 name = tc.get("function", {}).get("name")
                 if name:
                     action = str(name)
                     if action not in key_actions:
                         key_actions.append(action)

         # Rough token estimate: whitespace-separated words of the content.
         content = entry.get("content", "")
         if isinstance(content, str):
             total_tokens += len(content.split())
         elif isinstance(content, list):
             for item in content:
                 if isinstance(item, dict) and isinstance(item.get("text"), str):
                     total_tokens += len(item["text"].split())

         # A later message that mentions a repo overrides an earlier one.
         if content:
             content_str = str(content)
             for pattern in repo_patterns:
                 match = re.search(pattern, content_str, re.IGNORECASE)
                 if match:
                     repo = match.group(1) if match.groups() else match.group(0)
                     break

         if role == "tool" and entry.get("is_error"):
             error_msg = entry.get("content", "Unknown error")
             if isinstance(error_msg, str):
                 # Keep long errors, truncated, instead of dropping them, so
                 # they still influence the outcome classification.
                 errors.append(error_msg[:200])

     user_messages = sum(1 for m in messages if m.get("role") == "user")
     assistant_messages = sum(1 for m in messages if m.get("role") == "assistant")
     tool_outputs = sum(1 for m in messages if m.get("role") == "tool")

     # Each timestamp is parsed on its own, so the two may use different formats.
     duration_estimate = "unknown"
     start_dt = _parse_session_timestamp(start_time)
     end_dt = _parse_session_timestamp(end_time)
     if start_dt is not None and end_dt is not None:
         minutes = (end_dt - start_dt).total_seconds() / 60
         duration_estimate = f"{minutes:.0f}m"

     outcome = "unknown"
     if errors:
         fatal_errors = any(
             "405" in e or "permission" in e.lower() or "authentication" in e.lower()
             for e in errors
         )
         outcome = "failure" if fatal_errors else "partial"
     elif messages:
         # Without errors, success is inferred from the final assistant text.
         last_msg = messages[-1]
         if last_msg.get("role") == "assistant":
             content = last_msg.get("content", "")
             if isinstance(content, str):
                 success_indicators = ["done", "completed", "success", "merged", "pushed"]
                 if any(indicator in content.lower() for indicator in success_indicators):
                     outcome = "success"

     # key_actions is already duplicate-free; errors are de-duplicated in
     # first-seen order before the cap is applied.
     unique_actions = key_actions[:10]
     unique_errors = list(dict.fromkeys(errors))[:5]

     return SessionSummary(
         session_id=session_id,
         model=model,
         repo=repo,
         outcome=outcome,
         message_count=len(messages),
         tool_calls=tool_calls_count,
         duration_estimate=duration_estimate,
         key_actions=unique_actions,
         errors_encountered=unique_errors,
         start_time=start_time,
         end_time=end_time,
         total_tokens_estimate=total_tokens,
         user_messages=user_messages,
         assistant_messages=assistant_messages,
         tool_outputs=tool_outputs
     )
 def process_session_directory(directory_path: str, output_file: Optional[str] = None) -> List[SessionSummary]:
     """
     Summarise every ``*.jsonl`` file directly inside *directory_path*.

     Files are processed in sorted order and progress is reported on stderr.
     When *output_file* is given, the summaries are also written there as a
     JSON array. A missing directory, or one without JSONL files, yields an
     empty list after a message on stderr.
     """
     root = Path(directory_path)
     if not root.exists():
         print(f"Error: Directory {directory_path} does not exist", file=sys.stderr)
         return []

     session_files = sorted(root.glob("*.jsonl"))
     if not session_files:
         print(f"Warning: No JSONL files found in {directory_path}", file=sys.stderr)
         return []

     summaries = []
     for session_file in session_files:
         print(f"Processing {session_file.name}...", file=sys.stderr)
         summaries.append(extract_session_metadata(str(session_file)))

     if output_file:
         with open(output_file, 'w', encoding='utf-8') as f:
             json.dump([asdict(item) for item in summaries], f, indent=2)
         print(f"Wrote {len(summaries)} summaries to {output_file}", file=sys.stderr)
     return summaries
 def main():
     """Command-line entry point: summarise one transcript or a directory of them.

     Output goes to the ``--output`` file when given, otherwise to stdout as
     JSON. Exits with status 1 when the path is neither a file nor a directory.
     """
     import argparse

     parser = argparse.ArgumentParser(description="Extract metadata from Hermes session JSONL transcripts")
     parser.add_argument("path", help="Path to JSONL file or directory of session files")
     parser.add_argument("-o", "--output", help="Output JSON file (default: stdout)")
     parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")
     args = parser.parse_args()

     target = Path(args.path)

     if target.is_file():
         single = asdict(extract_session_metadata(str(target)))
         if not args.output:
             print(json.dumps(single, indent=2))
             return
         with open(args.output, 'w') as f:
             json.dump(single, f, indent=2)
         print(f"Wrote summary to {args.output}", file=sys.stderr)
         return

     if target.is_dir():
         # The directory processor writes the output file itself when asked to.
         collected = process_session_directory(str(target), args.output)
         if not args.output:
             print(json.dumps([asdict(item) for item in collected], indent=2))
         return

     print(f"Error: {args.path} is not a file or directory", file=sys.stderr)
     sys.exit(1)


 if __name__ == "__main__":
     main()
--- a/scripts/test_diff_analyzer.py
+++ b/scripts/test_diff_analyzer.py
@@ -0,0 +1,189 @@
 #!/usr/bin/env python3
 """Tests for scripts/diff_analyzer.py — 10 tests."""
 import sys
 import os
 # Put this file's directory (or "." when dirname is empty) first on sys.path.
 sys.path.insert(0, os.path.dirname(__file__) or ".")
 import importlib.util
 # Load diff_analyzer.py from the same directory by file path, under the
 # module name "da", instead of relying on a package import.
 spec = importlib.util.spec_from_file_location("da", os.path.join(os.path.dirname(__file__) or ".", "diff_analyzer.py"))
 mod = importlib.util.module_from_spec(spec)
 spec.loader.exec_module(mod)
 # Names used by the tests below.
 DiffAnalyzer = mod.DiffAnalyzer
 ChangeCategory = mod.ChangeCategory
 # Fixture: a newly created file whose single hunk adds three lines.
 SAMPLE_ADD = """diff --git a/new.py b/new.py
 new file mode 100644
 --- /dev/null
 +++ b/new.py
@@ -0,0 +1,3 @@
 +def hello():
 +    print("world")
 +    return True
 """
 # Fixture: a deleted file whose single hunk removes two lines.
 SAMPLE_DELETE = """diff --git a/old.py b/old.py
 deleted file mode 100644
 --- a/old.py
 +++ /dev/null
@@ -1,2 +0,0 @@
 -def goodbye():
 -    pass
 """
 # Fixture: one hunk that removes one line and adds two, with context lines.
 SAMPLE_MODIFY = """diff --git a/app.py b/app.py
 --- a/app.py
 +++ b/app.py
@@ -1,3 +1,4 @@
 def main():
 -    print("old")
 +    print("new")
 +    print("extra")
     return 0
 """
 # Fixture: a rename from old_name.py to new_name.py with a one-line change.
 SAMPLE_RENAME = """diff --git a/old_name.py b/new_name.py
 rename from old_name.py
 rename to new_name.py
 --- a/old_name.py
 +++ b/new_name.py
@@ -1,1 +1,1 @@
 -old content
 +new content
 """
 # Fixture: two file sections in one diff; the second is a new file.
 SAMPLE_MULTI = """diff --git a/a.py b/a.py
 --- a/a.py
 +++ b/a.py
@@ -1,1 +1,2 @@
 existing
 +added line
 diff --git b/b.py b/b.py
 new file mode 100644
 --- /dev/null
 +++ b/b.py
@@ -0,0 +1,1 @@
 +new file
 """
 # Fixture: a binary file change, which carries no hunks.
 SAMPLE_BINARY = """diff --git a/img.png b/img.png
 Binary files a/img.png and b/img.png differ
 """
 def test_empty():
     """An empty diff string produces a summary that lists no files."""
     summary = DiffAnalyzer().analyze("")
     assert summary.total_files_changed == 0
     print("PASS: test_empty")
 def test_addition():
     """A new-file diff counts its added lines and is categorised ADDED."""
     summary = DiffAnalyzer().analyze(SAMPLE_ADD)
     first_hunk = summary.files[0].hunks[0]
     assert summary.total_files_changed == 1
     assert summary.total_added == 3
     assert summary.total_deleted == 0
     assert summary.new_files == 1
     assert first_hunk.category == ChangeCategory.ADDED
     print("PASS: test_addition")
 def test_deletion():
     """A deleted-file diff counts its removed lines and is categorised DELETED."""
     summary = DiffAnalyzer().analyze(SAMPLE_DELETE)
     assert summary.total_deleted == 2
     assert summary.deleted_files == 1
     first_hunk = summary.files[0].hunks[0]
     assert first_hunk.category == ChangeCategory.DELETED
     print("PASS: test_deletion")
 def test_modification():
     """A hunk with both additions and removals is categorised MODIFIED."""
     summary = DiffAnalyzer().analyze(SAMPLE_MODIFY)
     assert summary.total_added == 2
     assert summary.total_deleted == 1
     first_hunk = summary.files[0].hunks[0]
     assert first_hunk.category == ChangeCategory.MODIFIED
     print("PASS: test_modification")
 def test_rename():
     """A rename records both the old and new paths and sets the renamed flag."""
     summary = DiffAnalyzer().analyze(SAMPLE_RENAME)
     renamed = summary.files[0]
     assert summary.renamed_files == 1
     assert renamed.old_path == "old_name.py"
     assert renamed.path == "new_name.py"
     assert renamed.is_renamed is True
     print("PASS: test_rename")
 def test_multiple_files():
     """A diff with two file sections reports two files, one of them new."""
     summary = DiffAnalyzer().analyze(SAMPLE_MULTI)
     assert summary.total_files_changed == 2
     assert summary.new_files == 1
     print("PASS: test_multiple_files")
 def test_binary():
     """A binary file change is flagged as binary and carries no hunks."""
     summary = DiffAnalyzer().analyze(SAMPLE_BINARY)
     binary_file = summary.files[0]
     assert summary.binary_files == 1
     assert binary_file.is_binary is True
     assert len(binary_file.hunks) == 0
     print("PASS: test_binary")
 def test_to_dict():
     """The summary's dict form exposes the counters and a list of files."""
     as_dict = DiffAnalyzer().analyze(SAMPLE_MODIFY).to_dict()
     assert "total_files_changed" in as_dict
     assert "files" in as_dict
     assert isinstance(as_dict["files"], list)
     print("PASS: test_to_dict")
 def test_context_only():
    diff = """diff --git a/f.py b/f.py
 --- a/f.py
 +++ b/f.py
@@ -1,3 +1,3 @@
 line1
 -old
 +new
 line3
 """
    a = DiffAnalyzer()
    s = a.analyze(diff)
    # Has both added and deleted = MODIFIED
    assert s.files[0].hunks[0].category == ChangeCategory.MODIFIED
    print("PASS: test_context_only")
 def test_multi_hunk():
    diff = """diff --git a/f.py b/f.py
 --- a/f.py
 +++ b/f.py
@@ -1,1 +1,2 @@
 existing
 +first addition
@@ -10,1 +11,2 @@
 more
 +second addition
 """
    a = DiffAnalyzer()
    s = a.analyze(diff)
    assert s.total_hunks == 2
    assert s.total_added == 2
    print("PASS: test_multi_hunk")
 def run_all():
     """Run every test in declaration order, then print the closing summary line."""
     checks = (
         test_empty,
         test_addition,
         test_deletion,
         test_modification,
         test_rename,
         test_multiple_files,
         test_binary,
         test_to_dict,
         test_context_only,
         test_multi_hunk,
     )
     for check in checks:
         check()
     print("\nAll 10 tests passed!")


 if __name__ == "__main__":
     run_all()
Author	SHA1	Message	Date
Alexander Whitestone	d664119b9c	feat: Add diff analyzer tests (closes #176 )	2026-04-15 03:57:21 +00:00
Alexander Whitestone	764414d4d5	feat: Add diff analyzer (closes #176 )	2026-04-15 03:56:27 +00:00