test: add tests for issue body parser (#90 )

feat: Gitea issue body parser for structured task extraction (#90 )
2026-04-15 03:34:33 +00:00 · 2026-04-15 03:34:30 +00:00
4 changed files with 273 additions and 405 deletions
--- a/scripts/diff_analyzer.py
+++ b/scripts/diff_analyzer.py
@@ -1,216 +0,0 @@
 #!/usr/bin/env python3
 """
 Diff Analyzer — Parse unified diffs and categorize every change.
 Pipeline 6.1 for Compounding Intelligence.
 """
 import re
 from dataclasses import dataclass, field, asdict
 from enum import Enum
 from typing import List, Dict, Any, Optional
 class ChangeCategory(Enum):
    """Kind of change a hunk represents; ``.value`` is the string emitted by ``Hunk.to_dict``."""
    ADDED = "added"  # hunk has "+" lines and no "-" lines
    DELETED = "deleted"  # hunk has "-" lines and no "+" lines
    MODIFIED = "modified"  # hunk has both "+" and "-" lines
    MOVED = "moved"  # NOTE(review): never assigned by _classify_hunk in this file — confirm intended use
    CONTEXT = "context"  # hunk has neither "+" nor "-" lines; also the default for a new Hunk
@dataclass
class Hunk:
    """One ``@@ ... @@`` section of a file diff.

    Stores the text that trails the hunk header, the old/new start and
    count numbers, the raw change lines, and the category assigned to it.
    """

    header: str
    old_start: int
    old_count: int
    new_start: int
    new_count: int
    lines: List[str] = field(default_factory=list)
    category: ChangeCategory = ChangeCategory.CONTEXT

    def to_dict(self) -> Dict[str, Any]:
        """Return the hunk as a plain dict with ``category`` as its string value."""
        # asdict() leaves the enum member in place; re-assigning the key keeps
        # its position in the dict while swapping in the serializable value.
        return {**asdict(self), "category": self.category.value}
@dataclass
class FileChange:
    """Everything recorded for one file of a diff: paths, hunks, line counts, state flags."""

    path: str
    old_path: Optional[str] = None  # previous path; only filled in for renames
    hunks: List[Hunk] = field(default_factory=list)
    added_lines: int = 0
    deleted_lines: int = 0
    is_new: bool = False
    is_deleted: bool = False
    is_renamed: bool = False
    is_binary: bool = False

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict view, expanding each hunk through ``Hunk.to_dict``."""
        serialized = {
            "path": self.path,
            "old_path": self.old_path,
            "hunks": [hunk.to_dict() for hunk in self.hunks],
        }
        # The remaining fields are plain ints/bools and are copied through as-is.
        for attr in (
            "added_lines",
            "deleted_lines",
            "is_new",
            "is_deleted",
            "is_renamed",
            "is_binary",
        ):
            serialized[attr] = getattr(self, attr)
        return serialized
@dataclass
class ChangeSummary:
    """Totals across a whole diff plus the per-file ``FileChange`` entries."""

    files: List[FileChange] = field(default_factory=list)
    total_added: int = 0
    total_deleted: int = 0
    total_files_changed: int = 0
    total_hunks: int = 0
    new_files: int = 0
    deleted_files: int = 0
    renamed_files: int = 0
    binary_files: int = 0

    def to_dict(self) -> Dict[str, Any]:
        """Return the counters followed by the serialized file list."""
        counter_names = (
            "total_files_changed",
            "total_added",
            "total_deleted",
            "total_hunks",
            "new_files",
            "deleted_files",
            "renamed_files",
            "binary_files",
        )
        report = {name: getattr(self, name) for name in counter_names}
        # "files" goes last so the key order matches the counters-then-detail layout.
        report["files"] = [entry.to_dict() for entry in self.files]
        return report
 class DiffAnalyzer:
    """Parses unified diff format and produces structured ChangeSummary.

    The input is split into one section per ``diff --git`` header; each
    section becomes a ``FileChange`` whose hunks are classified as added,
    deleted, modified or context. Text that precedes the first header and
    contains no hunk (commit message, blank lines) is ignored.
    """

    HUNK_HEADER_RE = re.compile(r"^@@\s+-(\d+)(?:,(\d+))?\s+\+(\d+)(?:,(\d+))?\s+@@(.*)$")
    DIFF_FILE_RE = re.compile(r"^diff --git a/(.*) b/(.*)")
    RENAME_RE = re.compile(r"^rename from (.+)$")
    RENAME_TO_RE = re.compile(r"^rename to (.+)$")
    NEW_FILE_RE = re.compile(r"^new file mode")
    DELETED_FILE_RE = re.compile(r"^deleted file mode")
    BINARY_RE = re.compile(r"^Binary files .* differ")

    def analyze(self, diff_text: str) -> ChangeSummary:
        """Parse a unified diff and return a ChangeSummary.

        Empty or whitespace-only input yields an empty summary.
        """
        summary = ChangeSummary()
        if not diff_text or not diff_text.strip():
            return summary

        for file_diff in self._split_files(diff_text):
            fc = self._parse_file_diff(file_diff)
            summary.files.append(fc)
            summary.total_added += fc.added_lines
            summary.total_deleted += fc.deleted_lines
            summary.total_hunks += len(fc.hunks)
            if fc.is_new:
                summary.new_files += 1
            if fc.is_deleted:
                summary.deleted_files += 1
            if fc.is_renamed:
                summary.renamed_files += 1
            if fc.is_binary:
                summary.binary_files += 1

        summary.total_files_changed = len(summary.files)
        return summary

    def _split_files(self, diff_text: str) -> List[str]:
        """Split a multi-file diff into individual file diffs.

        A new section starts at every ``diff --git`` line. Leading text that
        is not itself a file diff is dropped so it is not counted as a file.
        """
        chunks = []
        current = []
        for line in diff_text.split("\n"):
            if line.startswith("diff --git "):
                if self._is_file_section(current):
                    chunks.append("\n".join(current))
                current = [line]
            else:
                current.append(line)
        if self._is_file_section(current):
            chunks.append("\n".join(current))
        return chunks

    def _is_file_section(self, lines: List[str]) -> bool:
        """Return True when *lines* hold a file diff rather than preamble text."""
        if not lines:
            return False
        if lines[0].startswith("diff --git "):
            return True
        # Header-less input is still kept as long as it carries at least one
        # hunk header or a binary marker; anything else is preamble.
        return any(
            self.HUNK_HEADER_RE.match(line) or self.BINARY_RE.match(line)
            for line in lines
        )

    def _parse_file_diff(self, diff_text: str) -> FileChange:
        """Parse a single file's diff section into a FileChange."""
        lines = diff_text.split("\n")
        fc = FileChange(path="")

        # The path comes from the "b/" side of the first "diff --git" header.
        for line in lines:
            m = self.DIFF_FILE_RE.match(line)
            if m:
                fc.path = m.group(2)
                break

        # Extended header lines: new/deleted/renamed/binary markers.
        for line in lines:
            if self.NEW_FILE_RE.match(line):
                fc.is_new = True
            elif self.DELETED_FILE_RE.match(line):
                fc.is_deleted = True
            elif self.BINARY_RE.match(line):
                fc.is_binary = True
                return fc  # No hunks for binary
            else:
                m = self.RENAME_RE.match(line)
                if m:
                    fc.old_path = m.group(1)
                    fc.is_renamed = True

        # "rename to" overrides the header path, but only for actual renames.
        if fc.is_renamed:
            for line in lines:
                m = self.RENAME_TO_RE.match(line)
                if m:
                    fc.path = m.group(1)

        # Hunks: lines are collected only after the first "@@" header, so the
        # "--- a/..." and "+++ b/..." file header lines are never counted.
        current_hunk = None
        for line in lines:
            m = self.HUNK_HEADER_RE.match(line)
            if m:
                if current_hunk:
                    self._classify_hunk(current_hunk, fc)
                    fc.hunks.append(current_hunk)
                current_hunk = Hunk(
                    header=m.group(5).strip(),
                    old_start=int(m.group(1)),
                    # An omitted count in "@@ -N +M @@" means one line.
                    old_count=int(m.group(2) or 1),
                    new_start=int(m.group(3)),
                    new_count=int(m.group(4) or 1),
                )
            elif current_hunk and line.startswith(("+", "-", " ")):
                current_hunk.lines.append(line)

        if current_hunk:
            self._classify_hunk(current_hunk, fc)
            fc.hunks.append(current_hunk)
        return fc

    def _classify_hunk(self, hunk: Hunk, fc: FileChange):
        """Set the hunk's category and add its +/- line counts to *fc*."""
        added = sum(1 for entry in hunk.lines if entry.startswith("+"))
        deleted = sum(1 for entry in hunk.lines if entry.startswith("-"))
        fc.added_lines += added
        fc.deleted_lines += deleted

        if added > 0 and deleted == 0:
            hunk.category = ChangeCategory.ADDED
        elif deleted > 0 and added == 0:
            hunk.category = ChangeCategory.DELETED
        elif added > 0 and deleted > 0:
            hunk.category = ChangeCategory.MODIFIED
        else:
            hunk.category = ChangeCategory.CONTEXT
--- a/scripts/gitea_issue_parser.py
+++ b/scripts/gitea_issue_parser.py
@@ -0,0 +1,162 @@
 #!/usr/bin/env python3
 """
 Gitea Issue Body Parser
 Extracts structured data from Gitea issue markdown bodies:
 - Title
 - Context section
 - Acceptance criteria (checkboxes)
 - Labels
 - Epic/parent references
 Usage:
  python3 scripts/gitea_issue_parser.py <issue_body.txt
  python3 scripts/gitea_issue_parser.py --url https://forge.../api/v1/repos/.../issues/123
  echo "issue body" | python3 scripts/gitea_issue_parser.py --stdin
 Output: JSON with {title, context, criteria[], labels[], epic_ref, sections{}}
 """
 import argparse
 import json
 import re
 import sys
 from typing import Optional
 def parse_issue_body(body: str, title: str = "", labels: list = None) -> dict:
    """Parse a Gitea issue body into structured JSON.

    Args:
        body: Raw markdown body of the issue; may be empty or None.
        title: Issue title, used as a fallback source for the epic reference.
        labels: Label names to pass through unchanged.

    Returns:
        A dict with keys ``title``, ``context``, ``criteria`` (list of str),
        ``labels``, ``epic_ref`` (``"#N"`` or None) and ``sections`` (lower-cased
        ``##`` header text mapped to the section content).
    """
    result = {
        "title": title,
        "context": "",
        "criteria": [],
        "labels": labels or [],
        "epic_ref": None,
        "sections": {},
    }
    body = body or ""
    title_text = title or ""

    # Epic/parent reference. Priority: closing keyword, parent keyword,
    # first "#N" in the title, first "#N" in the opening 200 body characters.
    # The separator accepts "Closes #N" as well as "Epic: #N".
    ref_tail = r"(?::[ \t]*|\s+)#(\d+)"
    close_match = re.search(
        r"\b(?:close[sd]?|fix(?:e[sd])?|resolve[sd]?)" + ref_tail, body, re.IGNORECASE
    )
    part_match = re.search(
        r"\b(?:part\s+of|epic|parent|blocks?)" + ref_tail, body, re.IGNORECASE
    )
    title_refs = re.findall(r"#(\d+)", title_text)
    body_refs = re.findall(r"#(\d+)", body[:200])
    if close_match:
        result["epic_ref"] = f"#{close_match.group(1)}"
    elif part_match:
        result["epic_ref"] = f"#{part_match.group(1)}"
    elif title_refs:
        result["epic_ref"] = f"#{title_refs[0]}"
    elif body_refs:
        result["epic_ref"] = f"#{body_refs[0]}"

    if not body:
        return result

    # Split into sections by "## " headers. Text before the first header is
    # not stored. splitlines() also copes with CRLF bodies.
    sections = result["sections"]
    current_section = None
    current_content = []
    for line in body.splitlines():
        header_match = re.match(r"^##\s+(.+)$", line)
        if header_match:
            if current_section:
                sections[current_section] = "\n".join(current_content).strip()
            current_section = header_match.group(1).strip().lower()
            current_content = []
        else:
            current_content.append(line)
    if current_section:
        sections[current_section] = "\n".join(current_content).strip()

    # Context: first matching section name wins. The last three names are
    # fallbacks for change-description style bodies that have no "Context".
    for key in ("context", "background", "description", "problem",
                "summary", "overview", "changes"):
        if key in sections:
            result["context"] = sections[key]
            break

    # Checkbox items are matched one line at a time; [ \t] (not \s) keeps an
    # empty "- [ ]" from swallowing the following line as its text.
    checkbox_re = re.compile(
        r"^[ \t>]*(?:-[ \t]*|[*+][ \t]+)\[[ xX]?\][ \t]*(\S.*)$", re.MULTILINE
    )
    list_re = re.compile(r"^[ \t]*(?:\d+\.|[-*+])[ \t]+(\S.*)$", re.MULTILINE)

    criteria_section = None
    for key in ("acceptance criteria", "acceptance_criteria", "criteria",
                "requirements", "definition of done"):
        if key in sections:
            criteria_section = sections[key]
            break

    if criteria_section:
        result["criteria"] = [
            m.group(1).strip() for m in checkbox_re.finditer(criteria_section)
        ]
        # Plain numbered/bulleted lists count when the section has no checkboxes.
        if not result["criteria"]:
            result["criteria"] = [
                m.group(1).strip() for m in list_re.finditer(criteria_section)
            ]

    # No sectioned criteria found: scan the whole body for checkboxes.
    if not result["criteria"]:
        result["criteria"] = [m.group(1).strip() for m in checkbox_re.finditer(body)]

    return result
 def parse_from_url(api_url: str, token: str = None) -> dict:
    """Fetch an issue from a Gitea API URL and parse it.

    Args:
        api_url: URL that returns the issue as a JSON object.
        token: Optional API token, sent as an ``Authorization: token ...`` header.

    Returns:
        The result of ``parse_issue_body`` for the fetched title, body and labels.

    Raises:
        urllib.error.URLError: when the request fails.
        json.JSONDecodeError: when the response is not valid JSON.
    """
    import urllib.request

    headers = {}
    if token:
        headers["Authorization"] = f"token {token}"
    req = urllib.request.Request(api_url, headers=headers)
    # The context manager closes the response even if read/decoding raises.
    with urllib.request.urlopen(req, timeout=30) as response:
        resp = json.loads(response.read())

    # The API may send explicit nulls; "or" normalizes them to empty values.
    title = resp.get("title") or ""
    body = resp.get("body") or ""
    labels = [label["name"] for label in resp.get("labels") or []]
    return parse_issue_body(body, title, labels)
 def main():
    """Command-line entry point: read an issue, parse it, print JSON to stdout.

    Input source, in priority order: ``--url`` (fetched through the API),
    ``--stdin`` or a ``-`` positional, a file path positional, and finally
    redirected or piped stdin when no input option was given (the
    ``script.py <issue_body.txt`` form). Prints the help text and exits with
    status 1 when no input is available.
    """
    import os

    parser = argparse.ArgumentParser(description="Parse Gitea issue body into structured JSON")
    parser.add_argument("input", nargs="?", help="Issue body file (or - for stdin)")
    parser.add_argument("--url", help="Gitea API URL for the issue")
    parser.add_argument("--stdin", action="store_true", help="Read from stdin")
    parser.add_argument("--token", help="Gitea API token (or set GITEA_TOKEN env var)")
    parser.add_argument("--title", default="", help="Issue title (for epic ref extraction)")
    parser.add_argument("--labels", nargs="*", default=[], help="Issue labels")
    parser.add_argument("--pretty", action="store_true", help="Pretty-print JSON output")
    args = parser.parse_args()

    token = args.token or os.environ.get("GITEA_TOKEN")

    if args.url:
        result = parse_from_url(args.url, token)
    else:
        body = None
        if args.stdin or args.input == "-":
            body = sys.stdin.read()
        elif args.input:
            with open(args.input, encoding="utf-8") as f:
                body = f.read()
        elif sys.stdin is not None and not sys.stdin.isatty():
            # No explicit source, but stdin is redirected or piped: use it,
            # unless it turns out to be empty (e.g. /dev/null).
            piped = sys.stdin.read()
            if piped.strip():
                body = piped
        if body is None:
            parser.print_help()
            sys.exit(1)
        result = parse_issue_body(body, args.title, args.labels)

    indent = 2 if args.pretty else None
    print(json.dumps(result, indent=indent))
 # Run the CLI only when this file is executed as a script, not when it is imported.
 if __name__ == "__main__":
    main()
--- a/scripts/test_diff_analyzer.py
+++ b/scripts/test_diff_analyzer.py
@@ -1,189 +0,0 @@
 #!/usr/bin/env python3
 """Tests for scripts/diff_analyzer.py — 10 tests."""
 import sys
 import os
 # Put this file's directory on sys.path; dirname() is "" for a bare relative
 # filename, hence the "." fallback.
 sys.path.insert(0, os.path.dirname(__file__) or ".")
 import importlib.util
 # Load diff_analyzer.py by file path and register it under the spec name "da".
 spec = importlib.util.spec_from_file_location("da", os.path.join(os.path.dirname(__file__) or ".", "diff_analyzer.py"))
 mod = importlib.util.module_from_spec(spec)
 spec.loader.exec_module(mod)
 # Bind the two names the tests below use.
 DiffAnalyzer = mod.DiffAnalyzer
 ChangeCategory = mod.ChangeCategory
 # Fixture: a newly created file with a single three-line addition hunk.
 SAMPLE_ADD = """diff --git a/new.py b/new.py
 new file mode 100644
 --- /dev/null
 +++ b/new.py
@@ -0,0 +1,3 @@
 +def hello():
 +    print("world")
 +    return True
 """
 # Fixture: a deleted file with a single two-line deletion hunk.
 SAMPLE_DELETE = """diff --git a/old.py b/old.py
 deleted file mode 100644
 --- a/old.py
 +++ /dev/null
@@ -1,2 +0,0 @@
 -def goodbye():
 -    pass
 """
 # Fixture: one hunk with one removed and two added lines around context.
 SAMPLE_MODIFY = """diff --git a/app.py b/app.py
 --- a/app.py
 +++ b/app.py
@@ -1,3 +1,4 @@
 def main():
 -    print("old")
 +    print("new")
 +    print("extra")
     return 0
 """
 # Fixture: a rename ("rename from"/"rename to") that also changes one line.
 SAMPLE_RENAME = """diff --git a/old_name.py b/new_name.py
 rename from old_name.py
 rename to new_name.py
 --- a/old_name.py
 +++ b/new_name.py
@@ -1,1 +1,1 @@
 -old content
 +new content
 """
 # Fixture: two file sections in one diff, the second being a new file.
 # NOTE(review): the second header reads "b/b.py b/b.py" rather than
 # "a/b.py b/b.py"; DIFF_FILE_RE requires the "a/" prefix, so the parsed path
 # for that file stays "" — confirm whether this is a typo.
 SAMPLE_MULTI = """diff --git a/a.py b/a.py
 --- a/a.py
 +++ b/a.py
@@ -1,1 +1,2 @@
 existing
 +added line
 diff --git b/b.py b/b.py
 new file mode 100644
 --- /dev/null
 +++ b/b.py
@@ -0,0 +1,1 @@
 +new file
 """
 # Fixture: a binary file change, which carries no hunks.
 SAMPLE_BINARY = """diff --git a/img.png b/img.png
 Binary files a/img.png and b/img.png differ
 """
 def test_empty():
    """An empty diff string yields a summary with zero changed files."""
    summary = DiffAnalyzer().analyze("")
    assert summary.total_files_changed == 0
    print("PASS: test_empty")
 def test_addition():
    """A new-file diff counts three added lines in one ADDED hunk."""
    summary = DiffAnalyzer().analyze(SAMPLE_ADD)
    first_hunk = summary.files[0].hunks[0]
    assert summary.total_files_changed == 1
    assert summary.total_added == 3
    assert summary.total_deleted == 0
    assert summary.new_files == 1
    assert first_hunk.category == ChangeCategory.ADDED
    print("PASS: test_addition")
 def test_deletion():
    """A deleted-file diff counts two removed lines in one DELETED hunk."""
    summary = DiffAnalyzer().analyze(SAMPLE_DELETE)
    assert summary.total_deleted == 2
    assert summary.deleted_files == 1
    assert summary.files[0].hunks[0].category == ChangeCategory.DELETED
    print("PASS: test_deletion")
 def test_modification():
    """A hunk with both additions and removals is classified MODIFIED."""
    summary = DiffAnalyzer().analyze(SAMPLE_MODIFY)
    assert summary.total_added == 2
    assert summary.total_deleted == 1
    assert summary.files[0].hunks[0].category == ChangeCategory.MODIFIED
    print("PASS: test_modification")
 def test_rename():
    """A rename records both the old and the new path and sets the rename flag."""
    summary = DiffAnalyzer().analyze(SAMPLE_RENAME)
    renamed = summary.files[0]
    assert summary.renamed_files == 1
    assert renamed.old_path == "old_name.py"
    assert renamed.path == "new_name.py"
    assert renamed.is_renamed == True
    print("PASS: test_rename")
 def test_multiple_files():
    """A diff with two file sections reports two files, one of them new."""
    summary = DiffAnalyzer().analyze(SAMPLE_MULTI)
    assert summary.total_files_changed == 2
    assert summary.new_files == 1
    print("PASS: test_multiple_files")
 def test_binary():
    """A binary file change is flagged as binary and has no hunks."""
    summary = DiffAnalyzer().analyze(SAMPLE_BINARY)
    binary_file = summary.files[0]
    assert summary.binary_files == 1
    assert binary_file.is_binary == True
    assert len(binary_file.hunks) == 0
    print("PASS: test_binary")
 def test_to_dict():
    """The serialized summary exposes the counters and a list of files."""
    payload = DiffAnalyzer().analyze(SAMPLE_MODIFY).to_dict()
    assert "total_files_changed" in payload
    assert "files" in payload
    assert isinstance(payload["files"], list)
    print("PASS: test_to_dict")
 def test_context_only():
    diff = """diff --git a/f.py b/f.py
 --- a/f.py
 +++ b/f.py
@@ -1,3 +1,3 @@
 line1
 -old
 +new
 line3
 """
    a = DiffAnalyzer()
    s = a.analyze(diff)
    # Has both added and deleted = MODIFIED
    assert s.files[0].hunks[0].category == ChangeCategory.MODIFIED
    print("PASS: test_context_only")
 def test_multi_hunk():
    diff = """diff --git a/f.py b/f.py
 --- a/f.py
 +++ b/f.py
@@ -1,1 +1,2 @@
 existing
 +first addition
@@ -10,1 +11,2 @@
 more
 +second addition
 """
    a = DiffAnalyzer()
    s = a.analyze(diff)
    assert s.total_hunks == 2
    assert s.total_added == 2
    print("PASS: test_multi_hunk")
 def run_all():
    """Run every test in declaration order, then print the success banner."""
    suite = (
        test_empty,
        test_addition,
        test_deletion,
        test_modification,
        test_rename,
        test_multiple_files,
        test_binary,
        test_to_dict,
        test_context_only,
        test_multi_hunk,
    )
    # A failing assert propagates and stops the run before the banner.
    for case in suite:
        case()
    print("\nAll 10 tests passed!")
 # Run the whole suite when this file is executed directly.
 if __name__ == "__main__":
    run_all()
--- a/scripts/test_gitea_issue_parser.py
+++ b/scripts/test_gitea_issue_parser.py
@@ -0,0 +1,111 @@
 #!/usr/bin/env python3
 """Tests for gitea_issue_parser."""
 import json
 import sys
 import os
 sys.path.insert(0, os.path.dirname(__file__))
 from gitea_issue_parser import parse_issue_body
 def test_basic_structure():
    """Context, checkbox criteria and passed-in labels all come back in the result."""
    issue_text = """## Context
 This is the background.
 ## Acceptance Criteria
 - [ ] First criterion
 - [x] Second criterion (already done)
 - [ ] Third criterion
 ## Labels
 `pipeline`, `extraction`
 """
    given_labels = ["pipeline", "extraction"]
    parsed = parse_issue_body(issue_text, "Test Issue", given_labels)
    assert parsed["title"] == "Test Issue"
    assert "background" in parsed["context"].lower()
    assert len(parsed["criteria"]) == 3
    assert "First criterion" in parsed["criteria"]
    assert parsed["labels"] == ["pipeline", "extraction"]
    print("PASS: test_basic_structure")
 def test_epic_ref():
    """A "Closes #N" line in the body takes precedence over a reference in the title."""
    parsed = parse_issue_body("Closes #645\n\nSome description.", "feat: thing (#688)")
    assert parsed["epic_ref"] == "#645"
    print("PASS: test_epic_ref")
 def test_epic_ref_from_title():
    """With no reference in the body, the "#N" in the title is used."""
    issue_text = "Some description without close ref."
    issue_title = "feat: scene descriptions (#645)"
    parsed = parse_issue_body(issue_text, issue_title)
    assert parsed["epic_ref"] == "#645"
    print("PASS: test_epic_ref_from_title")
 def test_no_checkboxes():
    """A numbered list under Requirements is used when there are no checkboxes."""
    issue_text = """## Requirements
 1. First thing
 2. Second thing
 3. Third thing
 """
    parsed = parse_issue_body(issue_text)
    assert len(parsed["criteria"]) == 3
    print("PASS: test_no_checkboxes")
 def test_empty_body():
    """An empty body keeps the title and leaves criteria and context empty."""
    parsed = parse_issue_body("", "Empty Issue")
    assert parsed["title"] == "Empty Issue"
    assert parsed["criteria"] == []
    assert parsed["context"] == ""
    print("PASS: test_empty_body")
 def test_real_issue_format():
    """A change-description style body yields the closing ref and a context mentioning the change."""
    issue_text = """Closes #681
 ## Changes
 Add `#!/usr/bin/env python3` shebang to 6 Python scripts.
 ## Verification
 All 6 files confirmed missing shebangs before fix.
 ## Impact
 Scripts can now be executed directly.
 """
    parsed = parse_issue_body(issue_text, "fix: add python3 shebangs (#685)")
    assert parsed["epic_ref"] == "#681"
    assert "shebang" in parsed["context"].lower()
    print("PASS: test_real_issue_format")
 def test_all_sections_captured():
    """Each "##" header shows up in ``sections`` under its lower-cased name."""
    issue_text = """## Context
 Background info.
 ## Acceptance Criteria
 - [ ] Do thing
 ## Labels
 `test`
 """
    captured = parse_issue_body(issue_text)["sections"]
    assert "context" in captured
    assert "acceptance criteria" in captured
    print("PASS: test_all_sections_captured")
 if __name__ == "__main__":
    # Run the tests in declaration order; a failing assert stops the run
    # before the closing banner is printed.
    for check in (
        test_basic_structure,
        test_epic_ref,
        test_epic_ref_from_title,
        test_no_checkboxes,
        test_empty_body,
        test_real_issue_format,
        test_all_sections_captured,
    ):
        check()
    print("\nAll tests passed.")
Author	SHA1	Message	Date
Alexander Whitestone	2f38c52691	test: add tests for issue body parser (#90 )	2026-04-15 03:34:33 +00:00
Alexander Whitestone	b614220eff	feat: Gitea issue body parser for structured task extraction (#90 )	2026-04-15 03:34:30 +00:00