#!/usr/bin/env python3
"""
Codebase Genome Diff — Detect structural changes between two versions.

Compares two git refs (commits, branches, tags) and produces a human-readable
report of structural changes:
 • Added/removed/renamed files
 • Changed functions/classes (signature modifications)
 • New dependencies (imports, requirements, etc.)

Usage:
    python3 scripts/genome_diff.py --ref1 REF1 --ref2 REF2
    python3 scripts/genome_diff.py --ref1 main --ref2 feature-branch
    python3 scripts/genome_diff.py --ref1 v1.0 --ref2 v2.0 --output report.txt
"""

import argparse
import json
import os
import re
import subprocess
import sys
from dataclasses import asdict, dataclass, field, replace
from typing import Any, Dict, Iterator, List, Optional, Tuple

SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# Put this script's directory on the import path before importing diff_analyzer.
sys.path.insert(0, SCRIPT_DIR)
from diff_analyzer import DiffAnalyzer, ChangeCategory

# "diff --git a/<old> b/<new>" file header; group 1 is the new-side path.
_DIFF_HEADER_RE = re.compile(r'^diff --git a/.*? b/(.*?)$', re.MULTILINE)
# "@@ -old_start[,old_count] +new_start[,new_count] @@" hunk header.
_HUNK_HEADER_RE = re.compile(
    r'^@@\s+-(\d+)(?:,(\d+))?\s+\+(\d+)(?:,(\d+))?\s+@@'
)
# Added/removed Python definition line; group 2 is the keyword, group 3 the name.
_DEFINITION_RE = re.compile(r'^([+\-])[ \t]*(?:async[ \t]+)?(def|class)\s+(\w+)')
# Added/removed Python import line; group 2 or group 3 holds the module name.
_IMPORT_RE = re.compile(
    r'^([+\-])[ \t]*(?:import\s+([\w\.]+)|from\s+([\w\.]+)\s+import)'
)
# Paths that are treated as dependency manifests.
_MANIFEST_PATH_RE = re.compile(
    r'(?:^|/)(?:requirements[^/]*\.txt|requirements/[^/]+\.txt'
    r'|pyproject\.toml|setup\.py|Pipfile(?:\.lock)?)$'
)
# Marker printed in the human report for each change type.
_CHANGE_MARKERS = {'added': '+', 'removed': '-', 'modified': '~'}
# (file-change dict key, label) pairs rendered as tags in the human report.
_FILE_FLAG_LABELS = (
    ('is_new', "NEW"),
    ('is_deleted', "DELETED"),
    ('is_renamed', "RENAMED"),
    ('is_binary', "BINARY"),
)


@dataclass
class FunctionChange:
    """A function or class definition line that was added or removed."""

    file: str
    name: str
    kind: str  # the definition keyword as matched in the diff: 'def' or 'class'
    change_type: str  # 'added' or 'removed' (simplified)
    old_line: Optional[int] = None  # line in the old file (removed definitions)
    new_line: Optional[int] = None  # line in the new file (added definitions)


@dataclass
class DependencyChange:
    """An import line that was added/removed, or a changed dependency manifest."""

    file: str
    module: str  # imported module; "" for a manifest-level change
    change_type: str  # 'added' or 'removed' or 'modified'
    line: int = 0  # line of the import in the new/old file; 0 when unknown


@dataclass
class GenomeDiffReport:
    """Aggregated structural diff between two git refs."""

    ref1: str
    ref2: str
    file_changes: List[Dict[str, Any]] = field(default_factory=list)
    function_changes: List[FunctionChange] = field(default_factory=list)
    dependency_changes: List[DependencyChange] = field(default_factory=list)
    total_files_changed: int = 0
    total_functions_changed: int = 0
    total_dependencies_changed: int = 0

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serialisable dict with a summary and all change lists."""
        return {
            "ref1": self.ref1,
            "ref2": self.ref2,
            "summary": {
                "files": self.total_files_changed,
                "functions": self.total_functions_changed,
                "dependencies": self.total_dependencies_changed,
            },
            "file_changes": self.file_changes,
            "function_changes": [asdict(fc) for fc in self.function_changes],
            "dependency_changes": [asdict(dc) for dc in self.dependency_changes],
        }

    def human_report(self) -> str:
        """Render the report as plain text: summary, files, definitions, deps.

        Each entry of ``file_changes`` must provide the keys ``path``,
        ``added_lines`` and ``deleted_lines``; the ``is_*`` flags are optional.
        """
        lines = [
            f"Codebase Genome Diff: {self.ref1} → {self.ref2}",
            "=" * 60,
            f" Files changed: {self.total_files_changed}",
            f" Functions changed: {self.total_functions_changed}",
            f" Dependencies changed: {self.total_dependencies_changed}",
            "",
        ]

        for fc in self.file_changes:
            tags = [label for key, label in _FILE_FLAG_LABELS if fc.get(key)]
            kind_str = f" [{', '.join(tags)}]" if tags else ""
            lines.append(
                f" {fc['path']}{kind_str} "
                f"(+{fc['added_lines']}/-{fc['deleted_lines']})"
            )
        lines.append("")

        for fc in self.function_changes:
            op = _CHANGE_MARKERS.get(fc.change_type, '?')
            lines.append(f" [{op}] {fc.file}: {fc.kind} '{fc.name}'")
        lines.append("")

        for dc in self.dependency_changes:
            # 'modified' (manifest) entries get '~' rather than being shown
            # as removals, and a label instead of an empty module name.
            op = _CHANGE_MARKERS.get(dc.change_type, '?')
            lines.append(
                f" [{op}] {dc.file}: {dc.module or '(dependency manifest)'}"
            )
        lines.append("")

        return "\n".join(lines)


def run_git_diff(ref1: str, ref2: str) -> str:
    """Return the output of ``git diff --unified=0 ref1...ref2``.

    The three-dot range is passed to git unchanged; see the git-diff
    documentation for how it selects the comparison base. The command runs
    with this script's directory as the working directory.

    Output is decoded as UTF-8 with undecodable bytes replaced, so a diff that
    contains non-UTF-8 content cannot abort the run with a decode error.

    Exits the process with status 1 (after printing git's stderr) when git
    returns a status other than 0 or 1.
    """
    result = subprocess.run(
        ['git', 'diff', '--unified=0', f'{ref1}...{ref2}'],
        capture_output=True,
        encoding='utf-8',
        errors='replace',
        cwd=SCRIPT_DIR,
    )
    if result.returncode not in (0, 1):
        print(f"git diff failed: {result.stderr}", file=sys.stderr)
        sys.exit(1)
    return result.stdout


def _iter_changed_lines(
    diff_text: str,
) -> Iterator[Tuple[str, str, str, Optional[int]]]:
    """Yield ``(file, op, line, line_no)`` for every added/removed diff line.

    ``op`` is '+' or '-', ``line`` is the raw diff line including that prefix,
    and ``line_no`` is the line number in the new file (for '+') or the old
    file (for '-'), or None when no hunk header has been seen yet. ``file`` is
    the path from the latest ``diff --git`` header: "" before any header and
    "unknown" when the header cannot be parsed.
    """
    current_file = ""
    old_line: Optional[int] = None
    new_line: Optional[int] = None
    in_hunk = False

    for line in diff_text.split('\n'):
        if line.startswith('diff --git'):
            header = _DIFF_HEADER_RE.match(line)
            current_file = header.group(1) if header else "unknown"
            old_line = new_line = None
            in_hunk = False
            continue

        hunk = _HUNK_HEADER_RE.match(line)
        if hunk:
            old_line = int(hunk.group(1))
            new_line = int(hunk.group(3))
            in_hunk = True
            continue

        # "--- a/x" / "+++ b/x" file headers precede the first hunk; inside a
        # hunk the same prefixes are real content and must be counted.
        if not in_hunk and line.startswith(('+++ ', '--- ')):
            continue

        if line.startswith('+'):
            yield current_file, '+', line, new_line
            if new_line is not None:
                new_line += 1
        elif line.startswith('-'):
            yield current_file, '-', line, old_line
            if old_line is not None:
                old_line += 1
        elif line.startswith(' '):
            # Context line: present on both sides.
            if old_line is not None:
                old_line += 1
            if new_line is not None:
                new_line += 1
        # Anything else (e.g. "\ No newline at end of file") consumes no line.


def extract_function_changes(diff_text: str) -> List[FunctionChange]:
    """Find added/removed ``def``/``class`` lines in a unified diff.

    Every added or removed line advances the line counters, so the reported
    ``new_line`` (added) and ``old_line`` (removed) stay accurate across a
    hunk. ``async def`` is recognised and reported with kind 'def'. The
    ``file`` field is taken from the enclosing ``diff --git`` header ("" when
    the text has none).
    """
    changes: List[FunctionChange] = []
    for filepath, op, line, line_no in _iter_changed_lines(diff_text):
        found = _DEFINITION_RE.match(line)
        if not found:
            continue
        added = op == '+'
        changes.append(FunctionChange(
            file=filepath,
            name=found.group(3),
            kind=found.group(2),
            change_type="added" if added else "removed",
            old_line=None if added else line_no,
            new_line=line_no if added else None,
        ))
    return changes


def extract_dependency_changes(diff_text: str, analyzer: DiffAnalyzer) -> List[DependencyChange]:
    """Find import changes and changed dependency manifests in a diff.

    The diff is split per file with ``analyzer._split_files``; sections
    without a ``diff --git`` header are skipped. Each added/removed
    ``import x`` / ``from x import`` line yields one entry carrying its line
    number (0 when it cannot be determined). A file whose *path* identifies it
    as a dependency manifest (requirements*.txt, pyproject.toml, setup.py,
    Pipfile, Pipfile.lock) and that has at least one changed line yields a
    single 'modified' entry with an empty module name.
    """
    changes: List[DependencyChange] = []
    manifests_seen = set()

    for file_diff in analyzer._split_files(diff_text):
        header = _DIFF_HEADER_RE.search(file_diff)
        if not header:
            continue
        filepath = header.group(1)

        content_changed = False
        for _, op, line, line_no in _iter_changed_lines(file_diff):
            content_changed = True
            found = _IMPORT_RE.match(line)
            if not found:
                continue
            changes.append(DependencyChange(
                file=filepath,
                module=found.group(2) or found.group(3),
                change_type="added" if op == '+' else "removed",
                line=line_no or 0,
            ))

        # Decide by path, not by line content: a changed line that merely
        # mentions "setup.py" must not flag an unrelated file as a manifest.
        if (
            content_changed
            and filepath not in manifests_seen
            and _MANIFEST_PATH_RE.search(filepath)
        ):
            manifests_seen.add(filepath)
            changes.append(DependencyChange(
                file=filepath,
                module="",
                change_type="modified",
                line=0,
            ))
    return changes


def correlate_function_changes_with_files(diff_text: str, functions: List[FunctionChange]) -> List[FunctionChange]:
    """Attach file paths to definition changes found in ``diff_text``.

    The diff is scanned line by line, so a context line that follows a bare
    '+' or '-' line is never mistaken for a changed definition. When
    ``functions`` lines up one-to-one with the definitions found in the diff
    (same name, kind and change type at each position), the caller's entries
    are returned with ``file`` and any missing line numbers filled in.
    Otherwise the diff is treated as authoritative and the freshly scanned
    list is returned.
    """
    located = extract_function_changes(diff_text)
    if len(functions) != len(located):
        return located

    merged: List[FunctionChange] = []
    for given, found in zip(functions, located):
        same_definition = (
            (given.name, given.kind, given.change_type)
            == (found.name, found.kind, found.change_type)
        )
        if not same_definition:
            return located
        merged.append(replace(
            given,
            file=given.file or found.file,
            old_line=found.old_line if given.old_line is None else given.old_line,
            new_line=found.new_line if given.new_line is None else given.new_line,
        ))
    return merged


def main():
    """Command-line entry point: diff two refs and print or write the report."""
    parser = argparse.ArgumentParser(description="Codebase Genome Diff — structural changes between versions")
    parser.add_argument("--ref1", required=True, help="First git ref (commit, branch, tag)")
    parser.add_argument("--ref2", required=True, help="Second git ref")
    parser.add_argument("--output", help="Write report to file")
    parser.add_argument("--json", action="store_true", help="Output JSON instead of human report")
    args = parser.parse_args()

    try:
        diff_text = run_git_diff(args.ref1, args.ref2)
    except OSError as e:
        # Raised when the git executable cannot be started at all.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    if not diff_text.strip():
        print(f"No differences between {args.ref1} and {args.ref2}.")
        sys.exit(0)

    analyzer = DiffAnalyzer()
    summary = analyzer.analyze(diff_text)

    file_changes = [fc.to_dict() for fc in summary.files]
    func_changes = extract_function_changes(diff_text)
    func_changes = correlate_function_changes_with_files(diff_text, func_changes)
    dep_changes = extract_dependency_changes(diff_text, analyzer)

    report = GenomeDiffReport(
        ref1=args.ref1,
        ref2=args.ref2,
        file_changes=file_changes,
        function_changes=func_changes,
        dependency_changes=dep_changes,
        total_files_changed=len(file_changes),
        total_functions_changed=len(func_changes),
        total_dependencies_changed=len(dep_changes),
    )

    output = json.dumps(report.to_dict(), indent=2) if args.json else report.human_report()

    if args.output:
        # The human report contains non-ASCII characters ("→"), so the file
        # encoding is pinned instead of relying on the platform default.
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write(output + '\n')
        print(f"Report written to {args.output}")
    else:
        print(output)


if __name__ == '__main__':
    main()