Compare commits

..

2 Commits

4 changed files with 273 additions and 405 deletions

View File

@@ -1,216 +0,0 @@
#!/usr/bin/env python3
"""
Diff Analyzer — Parse unified diffs and categorize every change.
Pipeline 6.1 for Compounding Intelligence.
"""
import re
from dataclasses import dataclass, field, asdict
from enum import Enum
from typing import List, Dict, Any, Optional
class ChangeCategory(Enum):
ADDED = "added"
DELETED = "deleted"
MODIFIED = "modified"
MOVED = "moved"
CONTEXT = "context"
@dataclass
class Hunk:
"""A single diff hunk with header, line ranges, and category."""
header: str
old_start: int
old_count: int
new_start: int
new_count: int
lines: List[str] = field(default_factory=list)
category: ChangeCategory = ChangeCategory.CONTEXT
def to_dict(self) -> Dict[str, Any]:
d = asdict(self)
d["category"] = self.category.value
return d
@dataclass
class FileChange:
"""A single file's changes."""
path: str
old_path: Optional[str] = None # For renames
hunks: List[Hunk] = field(default_factory=list)
added_lines: int = 0
deleted_lines: int = 0
is_new: bool = False
is_deleted: bool = False
is_renamed: bool = False
is_binary: bool = False
def to_dict(self) -> Dict[str, Any]:
return {
"path": self.path,
"old_path": self.old_path,
"hunks": [h.to_dict() for h in self.hunks],
"added_lines": self.added_lines,
"deleted_lines": self.deleted_lines,
"is_new": self.is_new,
"is_deleted": self.is_deleted,
"is_renamed": self.is_renamed,
"is_binary": self.is_binary,
}
@dataclass
class ChangeSummary:
"""Aggregate stats + per-file breakdown."""
files: List[FileChange] = field(default_factory=list)
total_added: int = 0
total_deleted: int = 0
total_files_changed: int = 0
total_hunks: int = 0
new_files: int = 0
deleted_files: int = 0
renamed_files: int = 0
binary_files: int = 0
def to_dict(self) -> Dict[str, Any]:
return {
"total_files_changed": self.total_files_changed,
"total_added": self.total_added,
"total_deleted": self.total_deleted,
"total_hunks": self.total_hunks,
"new_files": self.new_files,
"deleted_files": self.deleted_files,
"renamed_files": self.renamed_files,
"binary_files": self.binary_files,
"files": [f.to_dict() for f in self.files],
}
class DiffAnalyzer:
"""Parses unified diff format and produces structured ChangeSummary."""
HUNK_HEADER_RE = re.compile(r"^@@\s+-(\d+)(?:,(\d+))?\s+\+(\d+)(?:,(\d+))?\s+@@(.*)$")
DIFF_FILE_RE = re.compile(r"^diff --git a/(.*) b/(.*)")
RENAME_RE = re.compile(r"^rename from (.+)$")
RENAME_TO_RE = re.compile(r"^rename to (.+)$")
NEW_FILE_RE = re.compile(r"^new file mode")
DELETED_FILE_RE = re.compile(r"^deleted file mode")
BINARY_RE = re.compile(r"^Binary files .* differ")
def analyze(self, diff_text: str) -> ChangeSummary:
"""Parse a unified diff and return a ChangeSummary."""
summary = ChangeSummary()
if not diff_text or not diff_text.strip():
return summary
# Split diff into per-file sections
file_diffs = self._split_files(diff_text)
for file_diff in file_diffs:
fc = self._parse_file_diff(file_diff)
summary.files.append(fc)
summary.total_added += fc.added_lines
summary.total_deleted += fc.deleted_lines
summary.total_hunks += len(fc.hunks)
if fc.is_new:
summary.new_files += 1
if fc.is_deleted:
summary.deleted_files += 1
if fc.is_renamed:
summary.renamed_files += 1
if fc.is_binary:
summary.binary_files += 1
summary.total_files_changed = len(summary.files)
return summary
def _split_files(self, diff_text: str) -> List[str]:
"""Split a multi-file diff into individual file diffs."""
lines = diff_text.split("\n")
chunks = []
current = []
for line in lines:
if line.startswith("diff --git ") and current:
chunks.append("\n".join(current))
current = [line]
else:
current.append(line)
if current:
chunks.append("\n".join(current))
return chunks
def _parse_file_diff(self, diff_text: str) -> FileChange:
"""Parse a single file's diff section."""
lines = diff_text.split("\n")
fc = FileChange(path="")
# Extract file paths
for line in lines:
m = self.DIFF_FILE_RE.match(line)
if m:
fc.path = m.group(2)
break
# Check for special states
for line in lines:
if self.NEW_FILE_RE.match(line):
fc.is_new = True
elif self.DELETED_FILE_RE.match(line):
fc.is_deleted = True
elif self.RENAME_RE.match(line):
fc.old_path = m.group(1) if (m := self.RENAME_RE.match(line)) else None
fc.is_renamed = True
elif self.BINARY_RE.match(line):
fc.is_binary = True
return fc # No hunks for binary
# Rename TO
for line in lines:
m = self.RENAME_TO_RE.match(line)
if m and fc.is_renamed:
fc.path = m.group(1)
# Parse hunks
current_hunk = None
for line in lines:
m = self.HUNK_HEADER_RE.match(line)
if m:
if current_hunk:
self._classify_hunk(current_hunk, fc)
fc.hunks.append(current_hunk)
current_hunk = Hunk(
header=m.group(5).strip(),
old_start=int(m.group(1)),
old_count=int(m.group(2) or 1),
new_start=int(m.group(3)),
new_count=int(m.group(4) or 1),
)
elif current_hunk and (line.startswith("+") or line.startswith("-") or line.startswith(" ")):
current_hunk.lines.append(line)
if current_hunk:
self._classify_hunk(current_hunk, fc)
fc.hunks.append(current_hunk)
return fc
def _classify_hunk(self, hunk: Hunk, fc: FileChange):
"""Classify a hunk and count lines."""
added = sum(1 for l in hunk.lines if l.startswith("+"))
deleted = sum(1 for l in hunk.lines if l.startswith("-"))
fc.added_lines += added
fc.deleted_lines += deleted
if added > 0 and deleted == 0:
hunk.category = ChangeCategory.ADDED
elif deleted > 0 and added == 0:
hunk.category = ChangeCategory.DELETED
elif added > 0 and deleted > 0:
hunk.category = ChangeCategory.MODIFIED
else:
hunk.category = ChangeCategory.CONTEXT

View File

@@ -0,0 +1,162 @@
#!/usr/bin/env python3
"""
Gitea Issue Body Parser
Extracts structured data from Gitea issue markdown bodies:
- Title
- Context section
- Acceptance criteria (checkboxes)
- Labels
- Epic/parent references
Usage:
python3 scripts/gitea_issue_parser.py <issue_body.txt
python3 scripts/gitea_issue_parser.py --url https://forge.../api/v1/repos/.../issues/123
echo "issue body" | python3 scripts/gitea_issue_parser.py --stdin
Output: JSON with {title, context, criteria[], labels[], epic_ref}
"""
import argparse
import json
import re
import sys
from typing import Optional
def parse_issue_body(body: str, title: str = "", labels: list = None) -> dict:
"""Parse a Gitea issue body into structured JSON."""
result = {
"title": title,
"context": "",
"criteria": [],
"labels": labels or [],
"epic_ref": None,
"sections": {},
}
if not body:
return result
# Extract epic/parent reference from title or body
epic_pattern = r"#(\d+)"
title_refs = re.findall(epic_pattern, title)
body_refs = re.findall(epic_pattern, body[:200]) # Check early body refs
# Look for "Closes #N" or "Part of #N" or "Epic: #N"
close_match = re.search(r"(?:Closes?|Fixes?|Resolves?)\s+#(\d+)", body, re.IGNORECASE)
part_match = re.search(r"(?:Part of|Epic|Parent|Blocks?)\s+#(\d+)", body, re.IGNORECASE)
if close_match:
result["epic_ref"] = f"#{close_match.group(1)}"
elif part_match:
result["epic_ref"] = f"#{part_match.group(1)}"
elif title_refs:
result["epic_ref"] = f"#{title_refs[0]}"
elif body_refs:
result["epic_ref"] = f"#{body_refs[0]}"
# Split into sections by ## headers
section_pattern = r"^##\s+(.+)$"
lines = body.split("\n")
current_section = None
current_content = []
for line in lines:
header_match = re.match(section_pattern, line)
if header_match:
# Save previous section
if current_section:
result["sections"][current_section] = "\n".join(current_content).strip()
current_section = header_match.group(1).strip().lower()
current_content = []
else:
current_content.append(line)
# Save last section
if current_section:
result["sections"][current_section] = "\n".join(current_content).strip()
# Extract context
for key in ["context", "background", "description", "problem"]:
if key in result["sections"]:
result["context"] = result["sections"][key]
break
# Extract acceptance criteria (checkboxes)
criteria_section = None
for key in ["acceptance criteria", "acceptance_criteria", "criteria", "requirements", "definition of done"]:
if key in result["sections"]:
criteria_section = result["sections"][key]
break
if criteria_section:
checkbox_pattern = r"-\s*\[[ xX]?\]\s*(.+)"
for match in re.finditer(checkbox_pattern, criteria_section):
result["criteria"].append(match.group(1).strip())
# Also try plain numbered/bulleted lists if no checkboxes found
if not result["criteria"]:
list_pattern = r"^\s*(?:\d+\.|-|\*)\s+(.+)"
for match in re.finditer(list_pattern, criteria_section, re.MULTILINE):
result["criteria"].append(match.group(1).strip())
# If no sectioned criteria found, scan whole body for checkboxes
if not result["criteria"]:
for match in re.finditer(r"-\s*\[[ xX]?\]\s*(.+)", body):
result["criteria"].append(match.group(1).strip())
return result
def parse_from_url(api_url: str, token: str = None) -> dict:
"""Parse an issue from a Gitea API URL."""
import urllib.request
headers = {}
if token:
headers["Authorization"] = f"token {token}"
req = urllib.request.Request(api_url, headers=headers)
resp = json.loads(urllib.request.urlopen(req, timeout=30).read())
title = resp.get("title", "")
body = resp.get("body", "")
labels = [l["name"] for l in resp.get("labels", [])]
return parse_issue_body(body, title, labels)
def main():
parser = argparse.ArgumentParser(description="Parse Gitea issue body into structured JSON")
parser.add_argument("input", nargs="?", help="Issue body file (or - for stdin)")
parser.add_argument("--url", help="Gitea API URL for the issue")
parser.add_argument("--stdin", action="store_true", help="Read from stdin")
parser.add_argument("--token", help="Gitea API token (or set GITEA_TOKEN env var)")
parser.add_argument("--title", default="", help="Issue title (for epic ref extraction)")
parser.add_argument("--labels", nargs="*", default=[], help="Issue labels")
parser.add_argument("--pretty", action="store_true", help="Pretty-print JSON output")
args = parser.parse_args()
import os
token = args.token or os.environ.get("GITEA_TOKEN")
if args.url:
result = parse_from_url(args.url, token)
elif args.stdin or (args.input and args.input == "-"):
body = sys.stdin.read()
result = parse_issue_body(body, args.title, args.labels)
elif args.input:
with open(args.input) as f:
body = f.read()
result = parse_issue_body(body, args.title, args.labels)
else:
parser.print_help()
sys.exit(1)
indent = 2 if args.pretty else None
print(json.dumps(result, indent=indent))
if __name__ == "__main__":
main()

View File

@@ -1,189 +0,0 @@
#!/usr/bin/env python3
"""Tests for scripts/diff_analyzer.py — 10 tests."""
import sys
import os
sys.path.insert(0, os.path.dirname(__file__) or ".")
import importlib.util
spec = importlib.util.spec_from_file_location("da", os.path.join(os.path.dirname(__file__) or ".", "diff_analyzer.py"))
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)
DiffAnalyzer = mod.DiffAnalyzer
ChangeCategory = mod.ChangeCategory
SAMPLE_ADD = """diff --git a/new.py b/new.py
new file mode 100644
--- /dev/null
+++ b/new.py
@@ -0,0 +1,3 @@
+def hello():
+ print("world")
+ return True
"""
SAMPLE_DELETE = """diff --git a/old.py b/old.py
deleted file mode 100644
--- a/old.py
+++ /dev/null
@@ -1,2 +0,0 @@
-def goodbye():
- pass
"""
SAMPLE_MODIFY = """diff --git a/app.py b/app.py
--- a/app.py
+++ b/app.py
@@ -1,3 +1,4 @@
def main():
- print("old")
+ print("new")
+ print("extra")
return 0
"""
SAMPLE_RENAME = """diff --git a/old_name.py b/new_name.py
rename from old_name.py
rename to new_name.py
--- a/old_name.py
+++ b/new_name.py
@@ -1,1 +1,1 @@
-old content
+new content
"""
SAMPLE_MULTI = """diff --git a/a.py b/a.py
--- a/a.py
+++ b/a.py
@@ -1,1 +1,2 @@
existing
+added line
diff --git b/b.py b/b.py
new file mode 100644
--- /dev/null
+++ b/b.py
@@ -0,0 +1,1 @@
+new file
"""
SAMPLE_BINARY = """diff --git a/img.png b/img.png
Binary files a/img.png and b/img.png differ
"""
def test_empty():
a = DiffAnalyzer()
s = a.analyze("")
assert s.total_files_changed == 0
print("PASS: test_empty")
def test_addition():
a = DiffAnalyzer()
s = a.analyze(SAMPLE_ADD)
assert s.total_files_changed == 1
assert s.total_added == 3
assert s.total_deleted == 0
assert s.new_files == 1
assert s.files[0].hunks[0].category == ChangeCategory.ADDED
print("PASS: test_addition")
def test_deletion():
a = DiffAnalyzer()
s = a.analyze(SAMPLE_DELETE)
assert s.total_deleted == 2
assert s.deleted_files == 1
assert s.files[0].hunks[0].category == ChangeCategory.DELETED
print("PASS: test_deletion")
def test_modification():
a = DiffAnalyzer()
s = a.analyze(SAMPLE_MODIFY)
assert s.total_added == 2
assert s.total_deleted == 1
assert s.files[0].hunks[0].category == ChangeCategory.MODIFIED
print("PASS: test_modification")
def test_rename():
a = DiffAnalyzer()
s = a.analyze(SAMPLE_RENAME)
assert s.renamed_files == 1
assert s.files[0].old_path == "old_name.py"
assert s.files[0].path == "new_name.py"
assert s.files[0].is_renamed == True
print("PASS: test_rename")
def test_multiple_files():
a = DiffAnalyzer()
s = a.analyze(SAMPLE_MULTI)
assert s.total_files_changed == 2
assert s.new_files == 1
print("PASS: test_multiple_files")
def test_binary():
a = DiffAnalyzer()
s = a.analyze(SAMPLE_BINARY)
assert s.binary_files == 1
assert s.files[0].is_binary == True
assert len(s.files[0].hunks) == 0
print("PASS: test_binary")
def test_to_dict():
a = DiffAnalyzer()
s = a.analyze(SAMPLE_MODIFY)
d = s.to_dict()
assert "total_files_changed" in d
assert "files" in d
assert isinstance(d["files"], list)
print("PASS: test_to_dict")
def test_context_only():
diff = """diff --git a/f.py b/f.py
--- a/f.py
+++ b/f.py
@@ -1,3 +1,3 @@
line1
-old
+new
line3
"""
a = DiffAnalyzer()
s = a.analyze(diff)
# Has both added and deleted = MODIFIED
assert s.files[0].hunks[0].category == ChangeCategory.MODIFIED
print("PASS: test_context_only")
def test_multi_hunk():
diff = """diff --git a/f.py b/f.py
--- a/f.py
+++ b/f.py
@@ -1,1 +1,2 @@
existing
+first addition
@@ -10,1 +11,2 @@
more
+second addition
"""
a = DiffAnalyzer()
s = a.analyze(diff)
assert s.total_hunks == 2
assert s.total_added == 2
print("PASS: test_multi_hunk")
def run_all():
test_empty()
test_addition()
test_deletion()
test_modification()
test_rename()
test_multiple_files()
test_binary()
test_to_dict()
test_context_only()
test_multi_hunk()
print("\nAll 10 tests passed!")
if __name__ == "__main__":
run_all()

View File

@@ -0,0 +1,111 @@
#!/usr/bin/env python3
"""Tests for gitea_issue_parser."""
import json
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from gitea_issue_parser import parse_issue_body
def test_basic_structure():
body = """## Context
This is the background.
## Acceptance Criteria
- [ ] First criterion
- [x] Second criterion (already done)
- [ ] Third criterion
## Labels
`pipeline`, `extraction`
"""
result = parse_issue_body(body, "Test Issue", ["pipeline", "extraction"])
assert result["title"] == "Test Issue"
assert "background" in result["context"].lower()
assert len(result["criteria"]) == 3
assert "First criterion" in result["criteria"]
assert result["labels"] == ["pipeline", "extraction"]
print("PASS: test_basic_structure")
def test_epic_ref():
body = "Closes #645\n\nSome description."
result = parse_issue_body(body, "feat: thing (#688)")
assert result["epic_ref"] == "#645"
print("PASS: test_epic_ref")
def test_epic_ref_from_title():
body = "Some description without close ref."
result = parse_issue_body(body, "feat: scene descriptions (#645)")
assert result["epic_ref"] == "#645"
print("PASS: test_epic_ref_from_title")
def test_no_checkboxes():
body = """## Requirements
1. First thing
2. Second thing
3. Third thing
"""
result = parse_issue_body(body)
assert len(result["criteria"]) == 3
print("PASS: test_no_checkboxes")
def test_empty_body():
result = parse_issue_body("", "Empty Issue")
assert result["title"] == "Empty Issue"
assert result["criteria"] == []
assert result["context"] == ""
print("PASS: test_empty_body")
def test_real_issue_format():
body = """Closes #681
## Changes
Add `#!/usr/bin/env python3` shebang to 6 Python scripts.
## Verification
All 6 files confirmed missing shebangs before fix.
## Impact
Scripts can now be executed directly.
"""
result = parse_issue_body(body, "fix: add python3 shebangs (#685)")
assert result["epic_ref"] == "#681"
assert "shebang" in result["context"].lower()
print("PASS: test_real_issue_format")
def test_all_sections_captured():
body = """## Context
Background info.
## Acceptance Criteria
- [ ] Do thing
## Labels
`test`
"""
result = parse_issue_body(body)
assert "context" in result["sections"]
assert "acceptance criteria" in result["sections"]
print("PASS: test_all_sections_captured")
if __name__ == "__main__":
test_basic_structure()
test_epic_ref()
test_epic_ref_from_title()
test_no_checkboxes()
test_empty_body()
test_real_issue_format()
test_all_sections_captured()
print("\nAll tests passed.")