Files
compounding-intelligence/scripts/diff_analyzer.py

240 lines
7.5 KiB
Python

"""
Diff Analyzer — Pipeline 6.1

Reads PR diffs in unified format and categorizes each hunk as added, deleted,
or modified code. A "moved" category is also defined, but the hunk classifier
in this module does not currently assign it.
Produces a change summary with line and hunk counts per category.

Usage:
    from diff_analyzer import DiffAnalyzer
    analyzer = DiffAnalyzer()
    summary = analyzer.analyze(diff_text)
"""
from __future__ import annotations
import re
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Optional
class ChangeCategory(Enum):
    """Label describing what kind of change a diff hunk represents.

    Every member's value is the lowercase spelling of its name, which makes
    the values convenient for serialized output.
    """

    # Hunk contains added lines and no deleted lines.
    ADDED = "added"
    # Hunk contains deleted lines and no added lines.
    DELETED = "deleted"
    # Hunk contains both added and deleted lines.
    MODIFIED = "modified"
    # Reserved for relocated code; the classifier in this file never assigns it.
    MOVED = "moved"
    # Hunk contains neither added nor deleted lines (also the Hunk default).
    CONTEXT = "context"
@dataclass
class Hunk:
    """One ``@@ ... @@`` section of a unified diff together with its tallies."""

    # Raw hunk header line exactly as it appeared in the diff.
    header: str
    # First line number and line count on the old ("-") side of the header.
    old_start: int
    old_count: int
    # First line number and line count on the new ("+") side of the header.
    new_start: int
    new_count: int
    # Body lines of the hunk, stored verbatim (prefix character included).
    lines: List[str] = field(default_factory=list)
    # Classification of the hunk; stays CONTEXT until a classifier updates it.
    category: ChangeCategory = ChangeCategory.CONTEXT
    # Number of body lines starting with "-" (removed lines).
    old_lines: int = 0
    # Number of body lines starting with "+" (added lines).
    new_lines: int = 0
@dataclass
class FileChange:
    """Everything recorded about one file that appears in a diff."""

    # Path of the file after the change (the "b/" side of the file header).
    path: str
    # Path before the change; mainly of interest when the file was renamed.
    old_path: Optional[str] = None
    # Flags derived from the extended header lines of the file's diff section.
    is_new: bool = False
    is_deleted: bool = False
    is_renamed: bool = False
    # Hunks that belong to this file, in the order they were parsed.
    hunks: List[Hunk] = field(default_factory=list)
    # Per-file line tallies: "+" lines, "-" lines and unchanged context lines.
    added_lines: int = 0
    deleted_lines: int = 0
    context_lines: int = 0
@dataclass
class ChangeSummary:
    """Aggregated totals for a whole diff plus the per-file details."""

    # File-level counters.
    files_changed: int = 0
    files_added: int = 0
    files_deleted: int = 0
    files_renamed: int = 0
    files_modified: int = 0
    # Line-level counters summed over all files.
    total_added: int = 0
    total_deleted: int = 0
    total_context: int = 0
    # Hunk-level counters, one per hunk category.
    hunks_added: int = 0
    hunks_deleted: int = 0
    hunks_modified: int = 0
    hunks_moved: int = 0
    # Per-file records backing the totals above.
    file_changes: List[FileChange] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return a plain-dict view of the summary, suitable for JSON output.

        The counters are emitted under their attribute names, followed by a
        ``"files"`` list holding one dict per entry in ``file_changes``.
        Individual hunks are not included in the output.
        """
        counter_names = (
            "files_changed",
            "files_added",
            "files_deleted",
            "files_renamed",
            "files_modified",
            "total_added",
            "total_deleted",
            "total_context",
            "hunks_added",
            "hunks_deleted",
            "hunks_modified",
            "hunks_moved",
        )
        payload: dict = {name: getattr(self, name) for name in counter_names}
        payload["files"] = [
            {
                "path": entry.path,
                "old_path": entry.old_path,
                "is_new": entry.is_new,
                "is_deleted": entry.is_deleted,
                "is_renamed": entry.is_renamed,
                "added": entry.added_lines,
                "deleted": entry.deleted_lines,
                "context": entry.context_lines,
            }
            for entry in self.file_changes
        ]
        return payload
# Precompiled patterns for the header lines of a git-style unified diff.

# Hunk header: "@@ -<old_start>[,<old_count>] +<new_start>[,<new_count>] @@<rest>".
# Groups: 1 = old start, 2 = old count (optional), 3 = new start,
# 4 = new count (optional), 5 = any text following the closing "@@".
_HUNK_RE = re.compile(r"^@@\s+-(\d+)(?:,(\d+))?\s+\+(\d+)(?:,(\d+))?\s+@@(.*)$")

# Per-file header: "diff --git a/<old path> b/<new path>".
# Groups: 1 = old path, 2 = new path.
_FILE_HEADER_RE = re.compile(r"^diff --git a/(.*) b/(.*)$")

# Rename lines: "rename from <path>" fills group 1, "rename to <path>" group 2.
_RENAME_RE = re.compile(r"^rename from (.+)$|^rename to (.+)$")
class DiffAnalyzer:
    """Parses unified diffs and categorizes changes.

    The analyzer keeps no state between calls: every call to ``analyze``
    builds and returns a fresh ``ChangeSummary``.

    Known limitations of the current parser:
      * ``ChangeCategory.MOVED`` is never assigned, so ``hunks_moved`` stays 0.
      * Once a hunk is open, every following line is attributed to it until
        the next hunk or file header, so any trailer text after a file's last
        hunk is tallied by its first character as well.
    """

    def analyze(self, diff_text: str) -> ChangeSummary:
        """Analyze a unified diff string and return a ChangeSummary.

        Args:
            diff_text: Diff text in ``git diff`` format. Lines before the first
                ``diff --git`` header are ignored.

        Returns:
            A ChangeSummary holding one FileChange per ``diff --git`` header
            (with its hunks and line tallies) plus the aggregated totals.
        """
        summary = ChangeSummary()
        current_file: Optional[FileChange] = None
        current_hunk: Optional[Hunk] = None

        for line in diff_text.splitlines():
            file_match = _FILE_HEADER_RE.match(line)
            if file_match:
                # Flush the hunk still open for the previous file before
                # storing that file. Without this, the last hunk of every file
                # except the final one is missing from the hunk statistics.
                self._finish_file(summary, current_file, current_hunk)
                current_file = FileChange(
                    path=file_match.group(2),
                    old_path=file_match.group(1),
                )
                current_hunk = None
                continue

            if current_file is None:
                # Preamble before the first file header.
                continue

            # Extended header lines describing the file-level operation.
            if line.startswith("new file mode"):
                current_file.is_new = True
                continue
            if line.startswith("deleted file mode"):
                current_file.is_deleted = True
                continue
            if _RENAME_RE.match(line):
                current_file.is_renamed = True
                continue

            hunk_match = _HUNK_RE.match(line)
            if hunk_match:
                self._finish_hunk(current_file, current_hunk)
                current_hunk = Hunk(
                    header=line,
                    old_start=int(hunk_match.group(1)),
                    # An omitted count in a hunk header means one line.
                    old_count=int(hunk_match.group(2) or 1),
                    new_start=int(hunk_match.group(3)),
                    new_count=int(hunk_match.group(4) or 1),
                )
                continue

            if current_hunk is None:
                # Remaining per-file header lines ("index", "---", "+++", ...).
                continue

            # Hunk body: keep the raw line and tally it by its prefix.
            current_hunk.lines.append(line)
            if line.startswith("+"):
                current_hunk.new_lines += 1
                current_file.added_lines += 1
            elif line.startswith("-"):
                current_hunk.old_lines += 1
                current_file.deleted_lines += 1
            elif line.startswith(" "):
                current_file.context_lines += 1

        self._finish_file(summary, current_file, current_hunk)
        self._aggregate(summary)
        return summary

    def _finish_hunk(self, file_change: FileChange, hunk: Optional[Hunk]) -> None:
        """Classify a completed hunk and attach it to its file (no-op for None)."""
        if hunk is None:
            return
        self._classify_hunk(hunk)
        file_change.hunks.append(hunk)

    def _finish_file(
        self,
        summary: ChangeSummary,
        file_change: Optional[FileChange],
        hunk: Optional[Hunk],
    ) -> None:
        """Flush the pending hunk, then store the completed file in the summary."""
        if file_change is None:
            return
        self._finish_hunk(file_change, hunk)
        self._classify_file(file_change)
        summary.file_changes.append(file_change)

    def _aggregate(self, summary: ChangeSummary) -> None:
        """Fill the summary's total counters from its per-file records."""
        summary.files_changed = len(summary.file_changes)
        for fc in summary.file_changes:
            summary.total_added += fc.added_lines
            summary.total_deleted += fc.deleted_lines
            summary.total_context += fc.context_lines

            # Each file lands in exactly one bucket; new/deleted take
            # precedence over renamed, and everything else is "modified".
            if fc.is_new:
                summary.files_added += 1
            elif fc.is_deleted:
                summary.files_deleted += 1
            elif fc.is_renamed:
                summary.files_renamed += 1
            else:
                summary.files_modified += 1

            for h in fc.hunks:
                if h.category == ChangeCategory.ADDED:
                    summary.hunks_added += 1
                elif h.category == ChangeCategory.DELETED:
                    summary.hunks_deleted += 1
                elif h.category == ChangeCategory.MODIFIED:
                    summary.hunks_modified += 1
                elif h.category == ChangeCategory.MOVED:
                    summary.hunks_moved += 1

    def _classify_hunk(self, hunk: Hunk) -> None:
        """Set ``hunk.category`` from its added and deleted line counts.

        Only additions -> ADDED, only deletions -> DELETED, both -> MODIFIED,
        neither -> CONTEXT.
        """
        if hunk.new_lines > 0 and hunk.old_lines == 0:
            hunk.category = ChangeCategory.ADDED
        elif hunk.old_lines > 0 and hunk.new_lines == 0:
            hunk.category = ChangeCategory.DELETED
        elif hunk.new_lines > 0 and hunk.old_lines > 0:
            hunk.category = ChangeCategory.MODIFIED
        else:
            hunk.category = ChangeCategory.CONTEXT

    def _classify_file(self, fc: FileChange) -> None:
        """Final file classification hook; currently a no-op.

        The new/deleted/renamed flags are already set while the file's header
        lines are parsed, so nothing is left to do here.
        """