feat: Add diff analyzer tests (closes #176 )

feat: Add diff analyzer (closes #176 )
2026-04-15 03:57:21 +00:00 · 2026-04-15 03:56:27 +00:00
3 changed files with 405 additions and 276 deletions
--- a/scripts/diff_analyzer.py
+++ b/scripts/diff_analyzer.py
@@ -0,0 +1,216 @@
+#!/usr/bin/env python3
+"""
+Diff Analyzer — Parse unified diffs and categorize every change.
+
+Pipeline 6.1 for Compounding Intelligence.
+"""
+
+import re
+from dataclasses import dataclass, field, asdict
+from enum import Enum
+from typing import List, Dict, Any, Optional
+
+
+class ChangeCategory(Enum):
+    ADDED = "added"
+    DELETED = "deleted"
+    MODIFIED = "modified"
+    MOVED = "moved"
+    CONTEXT = "context"
+
+
+@dataclass
+class Hunk:
+    """A single diff hunk with header, line ranges, and category."""
+    header: str
+    old_start: int
+    old_count: int
+    new_start: int
+    new_count: int
+    lines: List[str] = field(default_factory=list)
+    category: ChangeCategory = ChangeCategory.CONTEXT
+
+    def to_dict(self) -> Dict[str, Any]:
+        d = asdict(self)
+        d["category"] = self.category.value
+        return d
+
+
+@dataclass
+class FileChange:
+    """A single file's changes."""
+    path: str
+    old_path: Optional[str] = None  # For renames
+    hunks: List[Hunk] = field(default_factory=list)
+    added_lines: int = 0
+    deleted_lines: int = 0
+    is_new: bool = False
+    is_deleted: bool = False
+    is_renamed: bool = False
+    is_binary: bool = False
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            "path": self.path,
+            "old_path": self.old_path,
+            "hunks": [h.to_dict() for h in self.hunks],
+            "added_lines": self.added_lines,
+            "deleted_lines": self.deleted_lines,
+            "is_new": self.is_new,
+            "is_deleted": self.is_deleted,
+            "is_renamed": self.is_renamed,
+            "is_binary": self.is_binary,
+        }
+
+
+@dataclass
+class ChangeSummary:
+    """Aggregate stats + per-file breakdown."""
+    files: List[FileChange] = field(default_factory=list)
+    total_added: int = 0
+    total_deleted: int = 0
+    total_files_changed: int = 0
+    total_hunks: int = 0
+    new_files: int = 0
+    deleted_files: int = 0
+    renamed_files: int = 0
+    binary_files: int = 0
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            "total_files_changed": self.total_files_changed,
+            "total_added": self.total_added,
+            "total_deleted": self.total_deleted,
+            "total_hunks": self.total_hunks,
+            "new_files": self.new_files,
+            "deleted_files": self.deleted_files,
+            "renamed_files": self.renamed_files,
+            "binary_files": self.binary_files,
+            "files": [f.to_dict() for f in self.files],
+        }
+
+
+class DiffAnalyzer:
+    """Parses unified diff format and produces structured ChangeSummary."""
+
+    HUNK_HEADER_RE = re.compile(r"^@@\s+-(\d+)(?:,(\d+))?\s+\+(\d+)(?:,(\d+))?\s+@@(.*)$")
+    DIFF_FILE_RE = re.compile(r"^diff --git a/(.*) b/(.*)")
+    RENAME_RE = re.compile(r"^rename from (.+)$")
+    RENAME_TO_RE = re.compile(r"^rename to (.+)$")
+    NEW_FILE_RE = re.compile(r"^new file mode")
+    DELETED_FILE_RE = re.compile(r"^deleted file mode")
+    BINARY_RE = re.compile(r"^Binary files .* differ")
+
+    def analyze(self, diff_text: str) -> ChangeSummary:
+        """Parse a unified diff and return a ChangeSummary."""
+        summary = ChangeSummary()
+        if not diff_text or not diff_text.strip():
+            return summary
+
+        # Split diff into per-file sections
+        file_diffs = self._split_files(diff_text)
+
+        for file_diff in file_diffs:
+            fc = self._parse_file_diff(file_diff)
+            summary.files.append(fc)
+            summary.total_added += fc.added_lines
+            summary.total_deleted += fc.deleted_lines
+            summary.total_hunks += len(fc.hunks)
+            if fc.is_new:
+                summary.new_files += 1
+            if fc.is_deleted:
+                summary.deleted_files += 1
+            if fc.is_renamed:
+                summary.renamed_files += 1
+            if fc.is_binary:
+                summary.binary_files += 1
+
+        summary.total_files_changed = len(summary.files)
+        return summary
+
+    def _split_files(self, diff_text: str) -> List[str]:
+        """Split a multi-file diff into individual file diffs."""
+        lines = diff_text.split("\n")
+        chunks = []
+        current = []
+        for line in lines:
+            if line.startswith("diff --git ") and current:
+                chunks.append("\n".join(current))
+                current = [line]
+            else:
+                current.append(line)
+        if current:
+            chunks.append("\n".join(current))
+        return chunks
+
+    def _parse_file_diff(self, diff_text: str) -> FileChange:
+        """Parse a single file's diff section."""
+        lines = diff_text.split("\n")
+        fc = FileChange(path="")
+
+        # Extract file paths
+        for line in lines:
+            m = self.DIFF_FILE_RE.match(line)
+            if m:
+                fc.path = m.group(2)
+                break
+
+        # Check for special states
+        for line in lines:
+            if self.NEW_FILE_RE.match(line):
+                fc.is_new = True
+            elif self.DELETED_FILE_RE.match(line):
+                fc.is_deleted = True
+            elif self.RENAME_RE.match(line):
+                fc.old_path = m.group(1) if (m := self.RENAME_RE.match(line)) else None
+                fc.is_renamed = True
+            elif self.BINARY_RE.match(line):
+                fc.is_binary = True
+                return fc  # No hunks for binary
+
+        # Rename TO
+        for line in lines:
+            m = self.RENAME_TO_RE.match(line)
+            if m and fc.is_renamed:
+                fc.path = m.group(1)
+
+        # Parse hunks
+        current_hunk = None
+        for line in lines:
+            m = self.HUNK_HEADER_RE.match(line)
+            if m:
+                if current_hunk:
+                    self._classify_hunk(current_hunk, fc)
+                    fc.hunks.append(current_hunk)
+                current_hunk = Hunk(
+                    header=m.group(5).strip(),
+                    old_start=int(m.group(1)),
+                    old_count=int(m.group(2) or 1),
+                    new_start=int(m.group(3)),
+                    new_count=int(m.group(4) or 1),
+                )
+            elif current_hunk and (line.startswith("+") or line.startswith("-") or line.startswith(" ")):
+                current_hunk.lines.append(line)
+
+        if current_hunk:
+            self._classify_hunk(current_hunk, fc)
+            fc.hunks.append(current_hunk)
+
+        return fc
+
+    def _classify_hunk(self, hunk: Hunk, fc: FileChange):
+        """Classify a hunk and count lines."""
+        added = sum(1 for l in hunk.lines if l.startswith("+"))
+        deleted = sum(1 for l in hunk.lines if l.startswith("-"))
+
+        fc.added_lines += added
+        fc.deleted_lines += deleted
+
+        if added > 0 and deleted == 0:
+            hunk.category = ChangeCategory.ADDED
+        elif deleted > 0 and added == 0:
+            hunk.category = ChangeCategory.DELETED
+        elif added > 0 and deleted > 0:
+            hunk.category = ChangeCategory.MODIFIED
+        else:
+            hunk.category = ChangeCategory.CONTEXT
--- a/scripts/session_metadata.py
+++ b/scripts/session_metadata.py
@@ -1,276 +0,0 @@
-#!/usr/bin/env python3
-"""
-session_metadata.py - Extract structured metadata from Hermes session transcripts.
-Works alongside session_reader.py to provide higher-level session analysis.
-"""
-
-import json
-import re
-import sys
-from dataclasses import dataclass, asdict
-from datetime import datetime
-from pathlib import Path
-from typing import Dict, List, Optional, Any
-
-# Import from session_reader (the canonical reader)
-from session_reader import read_session
-
-
-@dataclass
-class SessionSummary:
-    """Structured summary of a Hermes session transcript."""
-    session_id: str
-    model: str
-    repo: str
-    outcome: str
-    message_count: int
-    tool_calls: int
-    duration_estimate: str
-    key_actions: List[str]
-    errors_encountered: List[str]
-    start_time: Optional[str] = None
-    end_time: Optional[str] = None
-    total_tokens_estimate: int = 0
-    user_messages: int = 0
-    assistant_messages: int = 0
-    tool_outputs: int = 0
-
-
-def extract_session_metadata(file_path: str) -> SessionSummary:
-    """
-    Extract structured metadata from a Hermes session JSONL transcript.
-    Uses session_reader.read_session() for file reading.
-    """
-    session_id = Path(file_path).stem
-    messages = []
-    model = "unknown"
-    repo = "unknown"
-    tool_calls_count = 0
-    key_actions = []
-    errors = []
-    start_time = None
-    end_time = None
-    total_tokens = 0
-    
-    # Common repo patterns to look for
-    repo_patterns = [
-        r"(?:the-nexus|compounding-intelligence|timmy-config|hermes-agent)",
-        r"(?:forge\.alexanderwhitestone\.com/([^/]+/[^/\\s]+))",
-        r"(?:github\.com/([^/]+/[^/\\s]+))",
-        r"(?:Timmy_Foundation/([^/\\s]+))",
-    ]
-    
-    try:
-        # Use the canonical reader from session_reader.py
-        messages = read_session(file_path)
-    except FileNotFoundError:
-        return SessionSummary(
-            session_id=session_id,
-            model="unknown",
-            repo="unknown",
-            outcome="failure",
-            message_count=0,
-            tool_calls=0,
-            duration_estimate="0m",
-            key_actions=[],
-            errors_encountered=[f"File not found: {file_path}"]
-        )
-    
-    # Process messages for metadata
-    for entry in messages:
-        # Extract model from assistant messages
-        if entry.get("role") == "assistant" and entry.get("model"):
-            model = entry["model"]
-        
-        # Extract timestamps
-        if entry.get("timestamp"):
-            ts = entry["timestamp"]
-            if start_time is None:
-                start_time = ts
-            end_time = ts
-        
-        # Count tool calls
-        if entry.get("tool_calls"):
-            tool_calls_count += len(entry["tool_calls"])
-            for tc in entry["tool_calls"]:
-                if tc.get("function", {}).get("name"):
-                    action = f"{tc['function']['name']}"
-                    if action not in key_actions:
-                        key_actions.append(action)
-        
-        # Estimate tokens from content length
-        content = entry.get("content", "")
-        if isinstance(content, str):
-            total_tokens += len(content.split())
-        elif isinstance(content, list):
-            for item in content:
-                if isinstance(item, dict) and "text" in item:
-                    total_tokens += len(item["text"].split())
-        
-        # Look for repo mentions in content
-        if entry.get("content"):
-            content_str = str(entry["content"])
-            for pattern in repo_patterns:
-                match = re.search(pattern, content_str, re.IGNORECASE)
-                if match:
-                    if match.groups():
-                        repo = match.group(1)
-                    else:
-                        repo = match.group(0)
-                    break
-        
-        # Look for error messages
-        if entry.get("role") == "tool" and entry.get("is_error"):
-            error_msg = entry.get("content", "Unknown error")
-            if isinstance(error_msg, str) and len(error_msg) < 200:
-                errors.append(error_msg[:200])
-    
-    # Count message types
-    user_messages = sum(1 for m in messages if m.get("role") == "user")
-    assistant_messages = sum(1 for m in messages if m.get("role") == "assistant")
-    tool_outputs = sum(1 for m in messages if m.get("role") == "tool")
-    
-    # Calculate duration estimate
-    duration_estimate = "unknown"
-    if start_time and end_time:
-        try:
-            # Try to parse timestamps
-            start_dt = None
-            end_dt = None
-            
-            # Handle various timestamp formats
-            for fmt in ["%Y-%m-%dT%H:%M:%S.%fZ", "%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%d %H:%M:%S"]:
-                try:
-                    if start_dt is None:
-                        start_dt = datetime.strptime(start_time, fmt)
-                    if end_dt is None:
-                        end_dt = datetime.strptime(end_time, fmt)
-                except ValueError:
-                    continue
-            
-            if start_dt and end_dt:
-                duration = end_dt - start_dt
-                minutes = duration.total_seconds() / 60
-                duration_estimate = f"{minutes:.0f}m"
-        except Exception:
-            pass
-    
-    # Classify outcome
-    outcome = "unknown"
-    if errors:
-        # Check if any errors are fatal
-        fatal_errors = any("405" in e or "permission" in e.lower() or "authentication" in e.lower() 
-                          for e in errors)
-        if fatal_errors:
-            outcome = "failure"
-        else:
-            outcome = "partial"
-    elif messages:
-        # Check last message for success indicators
-        last_msg = messages[-1]
-        if last_msg.get("role") == "assistant":
-            content = last_msg.get("content", "")
-            if isinstance(content, str):
-                success_indicators = ["done", "completed", "success", "merged", "pushed"]
-                if any(indicator in content.lower() for indicator in success_indicators):
-                    outcome = "success"
-                else:
-                    outcome = "unknown"
-    
-    # Deduplicate key actions (keep unique, limit to 10)
-    unique_actions = []
-    for action in key_actions:
-        if action not in unique_actions:
-            unique_actions.append(action)
-        if len(unique_actions) >= 10:
-            break
-    
-    # Deduplicate errors (keep unique, limit to 5)
-    unique_errors = []
-    for error in errors:
-        if error not in unique_errors:
-            unique_errors.append(error)
-        if len(unique_errors) >= 5:
-            break
-    
-    return SessionSummary(
-        session_id=session_id,
-        model=model,
-        repo=repo,
-        outcome=outcome,
-        message_count=len(messages),
-        tool_calls=tool_calls_count,
-        duration_estimate=duration_estimate,
-        key_actions=unique_actions,
-        errors_encountered=unique_errors,
-        start_time=start_time,
-        end_time=end_time,
-        total_tokens_estimate=total_tokens,
-        user_messages=user_messages,
-        assistant_messages=assistant_messages,
-        tool_outputs=tool_outputs
-    )
-
-
-def process_session_directory(directory_path: str, output_file: Optional[str] = None) -> List[SessionSummary]:
-    """
-    Process all JSONL files in a directory.
-    """
-    directory = Path(directory_path)
-    if not directory.exists():
-        print(f"Error: Directory {directory_path} does not exist", file=sys.stderr)
-        return []
-    
-    jsonl_files = list(directory.glob("*.jsonl"))
-    if not jsonl_files:
-        print(f"Warning: No JSONL files found in {directory_path}", file=sys.stderr)
-        return []
-    
-    summaries = []
-    for jsonl_file in sorted(jsonl_files):
-        print(f"Processing {jsonl_file.name}...", file=sys.stderr)
-        summary = extract_session_metadata(str(jsonl_file))
-        summaries.append(summary)
-    
-    if output_file:
-        with open(output_file, 'w', encoding='utf-8') as f:
-            json.dump([asdict(s) for s in summaries], f, indent=2)
-        print(f"Wrote {len(summaries)} summaries to {output_file}", file=sys.stderr)
-    
-    return summaries
-
-
-def main():
-    """CLI entry point."""
-    import argparse
-    
-    parser = argparse.ArgumentParser(description="Extract metadata from Hermes session JSONL transcripts")
-    parser.add_argument("path", help="Path to JSONL file or directory of session files")
-    parser.add_argument("-o", "--output", help="Output JSON file (default: stdout)")
-    parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")
-    
-    args = parser.parse_args()
-    
-    path = Path(args.path)
-    
-    if path.is_file():
-        summary = extract_session_metadata(str(path))
-        if args.output:
-            with open(args.output, 'w') as f:
-                json.dump(asdict(summary), f, indent=2)
-            print(f"Wrote summary to {args.output}", file=sys.stderr)
-        else:
-            print(json.dumps(asdict(summary), indent=2))
-    
-    elif path.is_dir():
-        summaries = process_session_directory(str(path), args.output)
-        if not args.output:
-            print(json.dumps([asdict(s) for s in summaries], indent=2))
-    
-    else:
-        print(f"Error: {args.path} is not a file or directory", file=sys.stderr)
-        sys.exit(1)
-
-
-if __name__ == "__main__":
-    main()
--- a/scripts/test_diff_analyzer.py
+++ b/scripts/test_diff_analyzer.py
@@ -0,0 +1,189 @@
+#!/usr/bin/env python3
+"""Tests for scripts/diff_analyzer.py — 10 tests."""
+
+import sys
+import os
+sys.path.insert(0, os.path.dirname(__file__) or ".")
+
+import importlib.util
+spec = importlib.util.spec_from_file_location("da", os.path.join(os.path.dirname(__file__) or ".", "diff_analyzer.py"))
+mod = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(mod)
+DiffAnalyzer = mod.DiffAnalyzer
+ChangeCategory = mod.ChangeCategory
+
+
+SAMPLE_ADD = """diff --git a/new.py b/new.py
+new file mode 100644
+--- /dev/null
++++ b/new.py
+@@ -0,0 +1,3 @@
++def hello():
++    print("world")
++    return True
+"""
+
+SAMPLE_DELETE = """diff --git a/old.py b/old.py
+deleted file mode 100644
+--- a/old.py
++++ /dev/null
+@@ -1,2 +0,0 @@
+-def goodbye():
+-    pass
+"""
+
+SAMPLE_MODIFY = """diff --git a/app.py b/app.py
+--- a/app.py
++++ b/app.py
+@@ -1,3 +1,4 @@
+ def main():
+-    print("old")
++    print("new")
++    print("extra")
+     return 0
+"""
+
+SAMPLE_RENAME = """diff --git a/old_name.py b/new_name.py
+rename from old_name.py
+rename to new_name.py
+--- a/old_name.py
++++ b/new_name.py
+@@ -1,1 +1,1 @@
+-old content
++new content
+"""
+
+SAMPLE_MULTI = """diff --git a/a.py b/a.py
+--- a/a.py
++++ b/a.py
+@@ -1,1 +1,2 @@
+ existing
++added line
+diff --git a/b.py b/b.py
+new file mode 100644
+--- /dev/null
++++ b/b.py
+@@ -0,0 +1,1 @@
++new file
+"""
+
+SAMPLE_BINARY = """diff --git a/img.png b/img.png
+Binary files a/img.png and b/img.png differ
+"""
+
+
+def test_empty():
+    a = DiffAnalyzer()
+    s = a.analyze("")
+    assert s.total_files_changed == 0
+    print("PASS: test_empty")
+
+def test_addition():
+    a = DiffAnalyzer()
+    s = a.analyze(SAMPLE_ADD)
+    assert s.total_files_changed == 1
+    assert s.total_added == 3
+    assert s.total_deleted == 0
+    assert s.new_files == 1
+    assert s.files[0].hunks[0].category == ChangeCategory.ADDED
+    print("PASS: test_addition")
+
+def test_deletion():
+    a = DiffAnalyzer()
+    s = a.analyze(SAMPLE_DELETE)
+    assert s.total_deleted == 2
+    assert s.deleted_files == 1
+    assert s.files[0].hunks[0].category == ChangeCategory.DELETED
+    print("PASS: test_deletion")
+
+def test_modification():
+    a = DiffAnalyzer()
+    s = a.analyze(SAMPLE_MODIFY)
+    assert s.total_added == 2
+    assert s.total_deleted == 1
+    assert s.files[0].hunks[0].category == ChangeCategory.MODIFIED
+    print("PASS: test_modification")
+
+def test_rename():
+    a = DiffAnalyzer()
+    s = a.analyze(SAMPLE_RENAME)
+    assert s.renamed_files == 1
+    assert s.files[0].old_path == "old_name.py"
+    assert s.files[0].path == "new_name.py"
+    assert s.files[0].is_renamed == True
+    print("PASS: test_rename")
+
+def test_multiple_files():
+    a = DiffAnalyzer()
+    s = a.analyze(SAMPLE_MULTI)
+    assert s.total_files_changed == 2
+    assert s.new_files == 1
+    print("PASS: test_multiple_files")
+
+def test_binary():
+    a = DiffAnalyzer()
+    s = a.analyze(SAMPLE_BINARY)
+    assert s.binary_files == 1
+    assert s.files[0].is_binary == True
+    assert len(s.files[0].hunks) == 0
+    print("PASS: test_binary")
+
+def test_to_dict():
+    a = DiffAnalyzer()
+    s = a.analyze(SAMPLE_MODIFY)
+    d = s.to_dict()
+    assert "total_files_changed" in d
+    assert "files" in d
+    assert isinstance(d["files"], list)
+    print("PASS: test_to_dict")
+
+def test_context_only():
+    diff = """diff --git a/f.py b/f.py
+--- a/f.py
++++ b/f.py
+@@ -1,3 +1,3 @@
+ line1
+-old
++new
+ line3
+"""
+    a = DiffAnalyzer()
+    s = a.analyze(diff)
+    # Has both added and deleted = MODIFIED
+    assert s.files[0].hunks[0].category == ChangeCategory.MODIFIED
+    print("PASS: test_context_only")
+
+def test_multi_hunk():
+    diff = """diff --git a/f.py b/f.py
+--- a/f.py
++++ b/f.py
+@@ -1,1 +1,2 @@
+ existing
++first addition
+@@ -10,1 +11,2 @@
+ more
++second addition
+"""
+    a = DiffAnalyzer()
+    s = a.analyze(diff)
+    assert s.total_hunks == 2
+    assert s.total_added == 2
+    print("PASS: test_multi_hunk")
+
+
+def run_all():
+    test_empty()
+    test_addition()
+    test_deletion()
+    test_modification()
+    test_rename()
+    test_multiple_files()
+    test_binary()
+    test_to_dict()
+    test_context_only()
+    test_multi_hunk()
+    print("\nAll 10 tests passed!")
+
+
+if __name__ == "__main__":
+    run_all()
Author	SHA1	Message	Date
Alexander Whitestone	d664119b9c	feat: Add diff analyzer tests (closes #176 )	2026-04-15 03:57:21 +00:00
Alexander Whitestone	764414d4d5	feat: Add diff analyzer (closes #176 )	2026-04-15 03:56:27 +00:00