feat: Add diff analyzer tests (closes #176 )

feat: Add diff analyzer (closes #176 )
2026-04-15 03:57:21 +00:00 · 2026-04-15 03:56:27 +00:00
4 changed files with 405 additions and 240 deletions
--- a/scripts/diff_analyzer.py
+++ b/scripts/diff_analyzer.py
@@ -0,0 +1,216 @@
 #!/usr/bin/env python3
 """
 Diff Analyzer — Parse unified diffs and categorize every change.
 Pipeline 6.1 for Compounding Intelligence.
 """
 import re
 from dataclasses import dataclass, field, asdict
 from enum import Enum
 from typing import List, Dict, Any, Optional
 class ChangeCategory(Enum):
    ADDED = "added"
    DELETED = "deleted"
    MODIFIED = "modified"
    MOVED = "moved"
    CONTEXT = "context"
@dataclass
 class Hunk:
    """A single diff hunk with header, line ranges, and category."""
    header: str
    old_start: int
    old_count: int
    new_start: int
    new_count: int
    lines: List[str] = field(default_factory=list)
    category: ChangeCategory = ChangeCategory.CONTEXT
    def to_dict(self) -> Dict[str, Any]:
        d = asdict(self)
        d["category"] = self.category.value
        return d
@dataclass
 class FileChange:
    """A single file's changes."""
    path: str
    old_path: Optional[str] = None  # For renames
    hunks: List[Hunk] = field(default_factory=list)
    added_lines: int = 0
    deleted_lines: int = 0
    is_new: bool = False
    is_deleted: bool = False
    is_renamed: bool = False
    is_binary: bool = False
    def to_dict(self) -> Dict[str, Any]:
        return {
            "path": self.path,
            "old_path": self.old_path,
            "hunks": [h.to_dict() for h in self.hunks],
            "added_lines": self.added_lines,
            "deleted_lines": self.deleted_lines,
            "is_new": self.is_new,
            "is_deleted": self.is_deleted,
            "is_renamed": self.is_renamed,
            "is_binary": self.is_binary,
        }
@dataclass
 class ChangeSummary:
    """Aggregate stats + per-file breakdown."""
    files: List[FileChange] = field(default_factory=list)
    total_added: int = 0
    total_deleted: int = 0
    total_files_changed: int = 0
    total_hunks: int = 0
    new_files: int = 0
    deleted_files: int = 0
    renamed_files: int = 0
    binary_files: int = 0
    def to_dict(self) -> Dict[str, Any]:
        return {
            "total_files_changed": self.total_files_changed,
            "total_added": self.total_added,
            "total_deleted": self.total_deleted,
            "total_hunks": self.total_hunks,
            "new_files": self.new_files,
            "deleted_files": self.deleted_files,
            "renamed_files": self.renamed_files,
            "binary_files": self.binary_files,
            "files": [f.to_dict() for f in self.files],
        }
 class DiffAnalyzer:
    """Parses unified diff format and produces structured ChangeSummary."""
    HUNK_HEADER_RE = re.compile(r"^@@\s+-(\d+)(?:,(\d+))?\s+\+(\d+)(?:,(\d+))?\s+@@(.*)$")
    DIFF_FILE_RE = re.compile(r"^diff --git a/(.*) b/(.*)")
    RENAME_RE = re.compile(r"^rename from (.+)$")
    RENAME_TO_RE = re.compile(r"^rename to (.+)$")
    NEW_FILE_RE = re.compile(r"^new file mode")
    DELETED_FILE_RE = re.compile(r"^deleted file mode")
    BINARY_RE = re.compile(r"^Binary files .* differ")
    def analyze(self, diff_text: str) -> ChangeSummary:
        """Parse a unified diff and return a ChangeSummary."""
        summary = ChangeSummary()
        if not diff_text or not diff_text.strip():
            return summary
        # Split diff into per-file sections
        file_diffs = self._split_files(diff_text)
        for file_diff in file_diffs:
            fc = self._parse_file_diff(file_diff)
            summary.files.append(fc)
            summary.total_added += fc.added_lines
            summary.total_deleted += fc.deleted_lines
            summary.total_hunks += len(fc.hunks)
            if fc.is_new:
                summary.new_files += 1
            if fc.is_deleted:
                summary.deleted_files += 1
            if fc.is_renamed:
                summary.renamed_files += 1
            if fc.is_binary:
                summary.binary_files += 1
        summary.total_files_changed = len(summary.files)
        return summary
    def _split_files(self, diff_text: str) -> List[str]:
        """Split a multi-file diff into individual file diffs."""
        lines = diff_text.split("\n")
        chunks = []
        current = []
        for line in lines:
            if line.startswith("diff --git ") and current:
                chunks.append("\n".join(current))
                current = [line]
            else:
                current.append(line)
        if current:
            chunks.append("\n".join(current))
        return chunks
    def _parse_file_diff(self, diff_text: str) -> FileChange:
        """Parse a single file's diff section."""
        lines = diff_text.split("\n")
        fc = FileChange(path="")
        # Extract file paths
        for line in lines:
            m = self.DIFF_FILE_RE.match(line)
            if m:
                fc.path = m.group(2)
                break
        # Check for special states
        for line in lines:
            if self.NEW_FILE_RE.match(line):
                fc.is_new = True
            elif self.DELETED_FILE_RE.match(line):
                fc.is_deleted = True
            elif self.RENAME_RE.match(line):
                fc.old_path = m.group(1) if (m := self.RENAME_RE.match(line)) else None
                fc.is_renamed = True
            elif self.BINARY_RE.match(line):
                fc.is_binary = True
                return fc  # No hunks for binary
        # Rename TO
        for line in lines:
            m = self.RENAME_TO_RE.match(line)
            if m and fc.is_renamed:
                fc.path = m.group(1)
        # Parse hunks
        current_hunk = None
        for line in lines:
            m = self.HUNK_HEADER_RE.match(line)
            if m:
                if current_hunk:
                    self._classify_hunk(current_hunk, fc)
                    fc.hunks.append(current_hunk)
                current_hunk = Hunk(
                    header=m.group(5).strip(),
                    old_start=int(m.group(1)),
                    old_count=int(m.group(2) or 1),
                    new_start=int(m.group(3)),
                    new_count=int(m.group(4) or 1),
                )
            elif current_hunk and (line.startswith("+") or line.startswith("-") or line.startswith(" ")):
                current_hunk.lines.append(line)
        if current_hunk:
            self._classify_hunk(current_hunk, fc)
            fc.hunks.append(current_hunk)
        return fc
    def _classify_hunk(self, hunk: Hunk, fc: FileChange):
        """Classify a hunk and count lines."""
        added = sum(1 for l in hunk.lines if l.startswith("+"))
        deleted = sum(1 for l in hunk.lines if l.startswith("-"))
        fc.added_lines += added
        fc.deleted_lines += deleted
        if added > 0 and deleted == 0:
            hunk.category = ChangeCategory.ADDED
        elif deleted > 0 and added == 0:
            hunk.category = ChangeCategory.DELETED
        elif added > 0 and deleted > 0:
            hunk.category = ChangeCategory.MODIFIED
        else:
            hunk.category = ChangeCategory.CONTEXT
--- a/scripts/gitea_issue_parser.py
+++ b/scripts/gitea_issue_parser.py
@@ -1,131 +0,0 @@
 #!/usr/bin/env python3
 """
 Gitea Issue Body Parser — Extract structured data from markdown issue bodies.
 Usage:
    cat issue_body.txt | python3 scripts/gitea_issue_parser.py --stdin --pretty
    python3 scripts/gitea_issue_parser.py --url https://forge.../api/v1/repos/.../issues/123 --pretty
    python3 scripts/gitea_issue_parser.py body.txt --title "Fix thing (#42)" --labels pipeline extraction
 """
 import argparse
 import json
 import re
 import sys
 from typing import Dict, List, Any, Optional
 def parse_issue_body(body: str, title: str = "", labels: List[str] = None) -> Dict[str, Any]:
    """Parse a Gitea issue markdown body into structured JSON.
    Extracted fields:
    - title: Issue title
    - context: Background/description section
    - criteria[]: Acceptance criteria (checkboxes or numbered lists)
    - labels[]: Issue labels
    - epic_ref: Parent/epic issue reference (from "Closes #N" or title)
    - sections{}: All ## sections as key-value pairs
    """
    result = {
        "title": title,
        "context": "",
        "criteria": [],
        "labels": labels or [],
        "epic_ref": None,
        "sections": {},
    }
    if not body:
        return result
    # Extract epic reference from title or body
    epic_patterns = [
        r"(?:closes|fixes|addresses|refs?)\s+#(\d+)",
        r"#(\d+)",
    ]
    for pattern in epic_patterns:
        match = re.search(pattern, (title + " " + body).lower())
        if match:
            result["epic_ref"] = int(match.group(1))
            break
    # Parse ## sections
    section_pattern = r"^##\s+(.+?)$\n((?:^(?!##\s).*$\n?)*)"
    for match in re.finditer(section_pattern, body, re.MULTILINE):
        section_name = match.group(1).strip().lower().replace(" ", "_")
        section_content = match.group(2).strip()
        result["sections"][section_name] = section_content
    # Extract acceptance criteria (checkboxes)
    checkbox_pattern = r"^\s*-\s*\[([ xX])\]\s*(.+)$"
    for match in re.finditer(checkbox_pattern, body, re.MULTILINE):
        checked = match.group(1).lower() == "x"
        text = match.group(2).strip()
        result["criteria"].append({"text": text, "checked": checked})
    # If no checkboxes, try numbered lists in "Acceptance Criteria" or "Criteria" section
    if not result["criteria"]:
        for section_name in ["acceptance_criteria", "criteria", "acceptance criteria"]:
            if section_name in result["sections"]:
                numbered = r"^\s*\d+\.\s*(.+)$"
                for match in re.finditer(numbered, result["sections"][section_name], re.MULTILINE):
                    result["criteria"].append({"text": match.group(1).strip(), "checked": False})
                break
    # Extract context (first section or first paragraph before any ## heading)
    first_heading = body.find("## ")
    if first_heading > 0:
        context_text = body[:first_heading].strip()
    else:
        context_text = body.split("\n\n")[0].strip()
    # Clean up: remove "## Context" or "## Problem" header if present
    context_text = re.sub(r"^#+\s*\w+\s*\n?", "", context_text).strip()
    result["context"] = context_text[:500]  # Cap at 500 chars
    return result
 def fetch_issue_from_url(url: str) -> Dict[str, Any]:
    """Fetch an issue from a Gitea API URL and parse it."""
    import urllib.request
    req = urllib.request.Request(url, headers={"Accept": "application/json"})
    with urllib.request.urlopen(req) as resp:
        data = json.loads(resp.read())
    return parse_issue_body(
        body=data.get("body", ""),
        title=data.get("title", ""),
        labels=[l["name"] for l in data.get("labels", [])]
    )
 def main():
    parser = argparse.ArgumentParser(description="Parse Gitea issue markdown into structured JSON")
    parser.add_argument("file", nargs="?", help="Issue body file (or use --stdin)")
    parser.add_argument("--stdin", action="store_true", help="Read from stdin")
    parser.add_argument("--url", help="Gitea API URL to fetch issue from")
    parser.add_argument("--title", default="", help="Issue title")
    parser.add_argument("--labels", nargs="*", default=[], help="Issue labels")
    parser.add_argument("--pretty", action="store_true", help="Pretty-print JSON output")
    args = parser.parse_args()
    if args.url:
        result = fetch_issue_from_url(args.url)
    elif args.stdin:
        body = sys.stdin.read()
        result = parse_issue_body(body, args.title, args.labels)
    elif args.file:
        with open(args.file) as f:
            body = f.read()
        result = parse_issue_body(body, args.title, args.labels)
    else:
        parser.print_help()
        sys.exit(1)
    indent = 2 if args.pretty else None
    print(json.dumps(result, indent=indent))
 if __name__ == "__main__":
    main()
--- a/scripts/test_diff_analyzer.py
+++ b/scripts/test_diff_analyzer.py
@@ -0,0 +1,189 @@
 #!/usr/bin/env python3
 """Tests for scripts/diff_analyzer.py — 10 tests."""
 import sys
 import os
 sys.path.insert(0, os.path.dirname(__file__) or ".")
 import importlib.util
 spec = importlib.util.spec_from_file_location("da", os.path.join(os.path.dirname(__file__) or ".", "diff_analyzer.py"))
 mod = importlib.util.module_from_spec(spec)
 spec.loader.exec_module(mod)
 DiffAnalyzer = mod.DiffAnalyzer
 ChangeCategory = mod.ChangeCategory
 SAMPLE_ADD = """diff --git a/new.py b/new.py
 new file mode 100644
 --- /dev/null
 +++ b/new.py
@@ -0,0 +1,3 @@
 +def hello():
 +    print("world")
 +    return True
 """
 SAMPLE_DELETE = """diff --git a/old.py b/old.py
 deleted file mode 100644
 --- a/old.py
 +++ /dev/null
@@ -1,2 +0,0 @@
 -def goodbye():
 -    pass
 """
 SAMPLE_MODIFY = """diff --git a/app.py b/app.py
 --- a/app.py
 +++ b/app.py
@@ -1,3 +1,4 @@
 def main():
 -    print("old")
 +    print("new")
 +    print("extra")
     return 0
 """
 SAMPLE_RENAME = """diff --git a/old_name.py b/new_name.py
 rename from old_name.py
 rename to new_name.py
 --- a/old_name.py
 +++ b/new_name.py
@@ -1,1 +1,1 @@
 -old content
 +new content
 """
 SAMPLE_MULTI = """diff --git a/a.py b/a.py
 --- a/a.py
 +++ b/a.py
@@ -1,1 +1,2 @@
 existing
 +added line
 diff --git b/b.py b/b.py
 new file mode 100644
 --- /dev/null
 +++ b/b.py
@@ -0,0 +1,1 @@
 +new file
 """
 SAMPLE_BINARY = """diff --git a/img.png b/img.png
 Binary files a/img.png and b/img.png differ
 """
 def test_empty():
    a = DiffAnalyzer()
    s = a.analyze("")
    assert s.total_files_changed == 0
    print("PASS: test_empty")
 def test_addition():
    a = DiffAnalyzer()
    s = a.analyze(SAMPLE_ADD)
    assert s.total_files_changed == 1
    assert s.total_added == 3
    assert s.total_deleted == 0
    assert s.new_files == 1
    assert s.files[0].hunks[0].category == ChangeCategory.ADDED
    print("PASS: test_addition")
 def test_deletion():
    a = DiffAnalyzer()
    s = a.analyze(SAMPLE_DELETE)
    assert s.total_deleted == 2
    assert s.deleted_files == 1
    assert s.files[0].hunks[0].category == ChangeCategory.DELETED
    print("PASS: test_deletion")
 def test_modification():
    a = DiffAnalyzer()
    s = a.analyze(SAMPLE_MODIFY)
    assert s.total_added == 2
    assert s.total_deleted == 1
    assert s.files[0].hunks[0].category == ChangeCategory.MODIFIED
    print("PASS: test_modification")
 def test_rename():
    a = DiffAnalyzer()
    s = a.analyze(SAMPLE_RENAME)
    assert s.renamed_files == 1
    assert s.files[0].old_path == "old_name.py"
    assert s.files[0].path == "new_name.py"
    assert s.files[0].is_renamed == True
    print("PASS: test_rename")
 def test_multiple_files():
    a = DiffAnalyzer()
    s = a.analyze(SAMPLE_MULTI)
    assert s.total_files_changed == 2
    assert s.new_files == 1
    print("PASS: test_multiple_files")
 def test_binary():
    a = DiffAnalyzer()
    s = a.analyze(SAMPLE_BINARY)
    assert s.binary_files == 1
    assert s.files[0].is_binary == True
    assert len(s.files[0].hunks) == 0
    print("PASS: test_binary")
 def test_to_dict():
    a = DiffAnalyzer()
    s = a.analyze(SAMPLE_MODIFY)
    d = s.to_dict()
    assert "total_files_changed" in d
    assert "files" in d
    assert isinstance(d["files"], list)
    print("PASS: test_to_dict")
 def test_context_only():
    diff = """diff --git a/f.py b/f.py
 --- a/f.py
 +++ b/f.py
@@ -1,3 +1,3 @@
 line1
 -old
 +new
 line3
 """
    a = DiffAnalyzer()
    s = a.analyze(diff)
    # Has both added and deleted = MODIFIED
    assert s.files[0].hunks[0].category == ChangeCategory.MODIFIED
    print("PASS: test_context_only")
 def test_multi_hunk():
    diff = """diff --git a/f.py b/f.py
 --- a/f.py
 +++ b/f.py
@@ -1,1 +1,2 @@
 existing
 +first addition
@@ -10,1 +11,2 @@
 more
 +second addition
 """
    a = DiffAnalyzer()
    s = a.analyze(diff)
    assert s.total_hunks == 2
    assert s.total_added == 2
    print("PASS: test_multi_hunk")
 def run_all():
    test_empty()
    test_addition()
    test_deletion()
    test_modification()
    test_rename()
    test_multiple_files()
    test_binary()
    test_to_dict()
    test_context_only()
    test_multi_hunk()
    print("\nAll 10 tests passed!")
 if __name__ == "__main__":
    run_all()
--- a/scripts/test_gitea_issue_parser.py
+++ b/scripts/test_gitea_issue_parser.py
@@ -1,109 +0,0 @@
 #!/usr/bin/env python3
 """Tests for scripts/gitea_issue_parser.py"""
 import sys
 import os
 sys.path.insert(0, os.path.dirname(__file__) or ".")
 # Import from sibling
 import importlib.util
 spec = importlib.util.spec_from_file_location("parser", os.path.join(os.path.dirname(__file__) or ".", "gitea_issue_parser.py"))
 mod = importlib.util.module_from_spec(spec)
 spec.loader.exec_module(mod)
 parse_issue_body = mod.parse_issue_body
 def test_basic_parsing():
    body = """## Context
 This is the background info.
 ## Acceptance Criteria
 - [ ] First criterion
 - [x] Second criterion (done)
 ## What to build
 Some description.
 """
    result = parse_issue_body(body, title="Test (#42)", labels=["bug"])
    assert result["title"] == "Test (#42)"
    assert result["labels"] == ["bug"]
    assert result["epic_ref"] == 42
    assert len(result["criteria"]) == 2
    assert result["criteria"][0]["text"] == "First criterion"
    assert result["criteria"][0]["checked"] == False
    assert result["criteria"][1]["checked"] == True
    assert "context" in result["sections"]
    print("PASS: test_basic_parsing")
 def test_numbered_criteria():
    body = """## Acceptance Criteria
 1. First item
 2. Second item
 3. Third item
 """
    result = parse_issue_body(body)
    assert len(result["criteria"]) == 3
    assert result["criteria"][0]["text"] == "First item"
    print("PASS: test_numbered_criteria")
 def test_epic_ref_from_body():
    body = "Closes #123\n\nSome description."
    result = parse_issue_body(body)
    assert result["epic_ref"] == 123
    print("PASS: test_epic_ref_from_body")
 def test_empty_body():
    result = parse_issue_body("")
    assert result["criteria"] == []
    assert result["context"] == ""
    assert result["sections"] == {}
    print("PASS: test_empty_body")
 def test_no_sections():
    body = "Just a plain issue body with no headings."
    result = parse_issue_body(body)
    assert result["context"] == "Just a plain issue body with no headings."
    print("PASS: test_no_sections")
 def test_multiple_sections():
    body = """## Problem
 Something is broken.
 ## Fix
 Do this instead.
 ## Notes
 Additional info.
 """
    result = parse_issue_body(body)
    assert "problem" in result["sections"]
    assert "fix" in result["sections"]
    assert "notes" in result["sections"]
    assert "Something is broken" in result["sections"]["problem"]
    print("PASS: test_multiple_sections")
 def run_all():
    test_basic_parsing()
    test_numbered_criteria()
    test_epic_ref_from_body()
    test_empty_body()
    test_no_sections()
    test_multiple_sections()
    print("\nAll 6 tests passed!")
 if __name__ == "__main__":
    run_all()
Author	SHA1	Message	Date
Alexander Whitestone	d664119b9c	feat: Add diff analyzer tests (closes #176 )	2026-04-15 03:57:21 +00:00
Alexander Whitestone	764414d4d5	feat: Add diff analyzer (closes #176 )	2026-04-15 03:56:27 +00:00