#!/usr/bin/env python3 """ Performance Bottleneck Finder — Identify slow tests, builds, and CI steps. Analyzes: 1. Pytest output for slow tests 2. Build logs for slow steps 3. CI workflow durations 4. File system for large/slow artifacts Usage: python3 scripts/perf_bottleneck_finder.py --repo /path/to/repo python3 scripts/perf_bottleneck_finder.py --repo /path/to/repo --json python3 scripts/perf_bottleneck_finder.py --repo /path/to/repo --report metrics/perf_report.md Weekly cron: 0 9 * * 1 cd /path/to/compounding-intelligence && python3 scripts/perf_bottleneck_finder.py --repo /path/to/target --report metrics/perf_report.md """ import argparse import json import os import re import subprocess import sys from collections import defaultdict from dataclasses import dataclass, field, asdict from datetime import datetime, timezone from pathlib import Path from typing import Any, Dict, List, Optional, Tuple # ── Configuration ────────────────────────────────────────────────── SLOW_TEST_THRESHOLD_S = 2.0 # Tests slower than this are flagged SLOW_BUILD_STEP_THRESHOLD_S = 10.0 TOP_N_BOTTLENECKS = 10 # Report top N bottlenecks PYTEST_DURATIONS_COUNT = 20 # Number of slow tests to collect LOG_EXTENSIONS = {".log", ".txt"} @dataclass class Bottleneck: """A single performance bottleneck.""" category: str # "test", "build", "ci", "artifact", "import" name: str # What's slow duration_s: float # How long it takes severity: str # "critical", "warning", "info" recommendation: str # How to fix file_path: Optional[str] = None line_number: Optional[int] = None @dataclass class PerfReport: """Full performance report.""" timestamp: str repo_path: str bottlenecks: List[Bottleneck] = field(default_factory=list) summary: Dict[str, Any] = field(default_factory=dict) test_stats: Dict[str, Any] = field(default_factory=dict) build_stats: Dict[str, Any] = field(default_factory=dict) ci_stats: Dict[str, Any] = field(default_factory=dict) def to_dict(self) -> dict: d = asdict(self) return d # ── Test Analysis ────────────────────────────────────────────────── def find_slow_tests_pytest(repo_path: str) -> List[Bottleneck]: """Run pytest --durations and parse slow tests.""" bottlenecks = [] # Try to run pytest with durations try: result = subprocess.run( ["python3", "-m", "pytest", "--co", "-q", "--durations=0"], cwd=repo_path, capture_output=True, text=True, timeout=30 ) # If tests exist, try to get durations from last run durations_file = os.path.join(repo_path, ".pytest_cache", "v", "cache", "durations") if os.path.exists(durations_file): with open(durations_file) as f: for line in f: parts = line.strip().split() if len(parts) >= 2: try: duration = float(parts[0]) test_name = " ".join(parts[1:]) if duration > SLOW_TEST_THRESHOLD_S: severity = "critical" if duration > 10 else "warning" bottlenecks.append(Bottleneck( category="test", name=test_name, duration_s=duration, severity=severity, recommendation=f"Test takes {duration:.1f}s. Consider mocking slow I/O, using fixtures, or marking with @pytest.mark.slow." )) except ValueError: continue except (subprocess.TimeoutExpired, FileNotFoundError): pass return bottlenecks def find_slow_tests_by_scan(repo_path: str) -> List[Bottleneck]: """Scan test files for patterns that indicate slow tests.""" bottlenecks = [] test_patterns = [ (r"time\.sleep\((\d+(?:\.\d+)?)\)", "Contains time.sleep() — consider using mock or async wait"), (r"subprocess\.run\(.*timeout=(\d+)", "Subprocess with timeout — may block test"), (r"requests\.(get|post|put|delete)\(", "Real HTTP call — mock with responses or httpretty"), (r"open\\([^)]*)[\x27\x22]w[\x27\x22]", "File I/O in test — use tmp_path fixture"), ] for root, dirs, files in os.walk(repo_path): # Skip hidden and cache dirs dirs[:] = [d for d in dirs if not d.startswith(('.', '__pycache__', 'node_modules', '.git'))] for fname in files: if not (fname.startswith("test_") or fname.endswith("_test.py")): continue if not fname.endswith(".py"): continue fpath = os.path.join(root, fname) rel_path = os.path.relpath(fpath, repo_path) try: with open(fpath) as f: lines = f.readlines() except (PermissionError, UnicodeDecodeError): continue for i, line in enumerate(lines): for pattern, recommendation in test_patterns: match = re.search(pattern, line) if match: duration = 1.0 # Default estimate if "sleep" in pattern: try: duration = float(match.group(1)) except (ValueError, IndexError): duration = 1.0 elif "timeout" in pattern: try: duration = float(match.group(1)) except (ValueError, IndexError): duration = 10.0 else: duration = 2.0 # Estimated bottlenecks.append(Bottleneck( category="test", name=f"{rel_path}:{i+1}", duration_s=duration, severity="warning" if duration < 5 else "critical", recommendation=recommendation, file_path=rel_path, line_number=i + 1 )) return bottlenecks # ── Build Analysis ───────────────────────────────────────────────── def analyze_build_artifacts(repo_path: str) -> List[Bottleneck]: """Find large build artifacts that slow down builds.""" bottlenecks = [] large_dirs = { "node_modules": "Consider using npm ci --production or yarn --production", "__pycache__": "Consider .gitignore and cleaning before builds", ".tox": "Consider caching tox environments", ".pytest_cache": "Consider cleaning between CI runs", "dist": "Check if dist/ artifacts are being rebuilt unnecessarily", "build": "Check if build/ artifacts are being rebuilt unnecessarily", ".next": "Next.js cache — consider incremental builds", "venv": "Virtual env in repo — move outside or use Docker", } for dirname, recommendation in large_dirs.items(): dirpath = os.path.join(repo_path, dirname) if os.path.isdir(dirpath): total_size = 0 file_count = 0 for root, dirs, files in os.walk(dirpath): for f in files: try: fpath = os.path.join(root, f) total_size += os.path.getsize(fpath) file_count += 1 except OSError: pass if total_size > 10 * 1024 * 1024: # > 10MB size_mb = total_size / (1024 * 1024) bottlenecks.append(Bottleneck( category="build", name=f"{dirname}/ ({size_mb:.1f}MB, {file_count} files)", duration_s=size_mb * 0.5, # Rough estimate severity="critical" if size_mb > 100 else "warning", recommendation=recommendation )) return bottlenecks def analyze_makefile_targets(repo_path: str) -> List[Bottleneck]: """Analyze Makefile for potentially slow targets.""" bottlenecks = [] makefiles = [] for root, dirs, files in os.walk(repo_path): dirs[:] = [d for d in dirs if not d.startswith(('.', '__pycache__'))] for f in files: if f in ("Makefile", "makefile", "GNUmakefile"): makefiles.append(os.path.join(root, f)) slow_patterns = [ (r"pip install", "pip install without --no-deps or constraints"), (r"npm install(?!.*--production)", "npm install without --production flag"), (r"docker build", "Docker build — consider multi-stage and layer caching"), (r"pytest(?!.*-x|--maxfail)", "pytest without early exit on failure"), (r"mypy|mypy --strict", "Type checking — consider incremental mode"), ] for mfile in makefiles: rel_path = os.path.relpath(mfile, repo_path) try: with open(mfile) as f: content = f.read() except (PermissionError, UnicodeDecodeError): continue for pattern, recommendation in slow_patterns: if re.search(pattern, content): bottlenecks.append(Bottleneck( category="build", name=f"{rel_path}: {pattern}", duration_s=5.0, severity="info", recommendation=recommendation, file_path=rel_path )) return bottlenecks # ── CI Analysis ──────────────────────────────────────────────────── def analyze_github_actions(repo_path: str) -> List[Bottleneck]: """Analyze GitHub Actions workflow files for inefficiencies.""" bottlenecks = [] workflow_dir = os.path.join(repo_path, ".github", "workflows") if not os.path.isdir(workflow_dir): return bottlenecks slow_patterns = [ (r"runs-on:\s*ubuntu-latest", 0, "Consider caching dependencies between runs"), (r"npm install", 2, "Use npm ci instead of npm install for reproducible builds"), (r"pip install(?!.*--cache-dir)", 2, "Add --cache-dir or use pip cache action"), (r"docker build(?!.*--cache-from)", 5, "Use Docker layer caching"), (r"python -m pytest(?!.*-n|--numprocesses)", 3, "Consider pytest-xdist for parallel test execution"), ] for fname in os.listdir(workflow_dir): if not fname.endswith(('.yml', '.yaml')): continue fpath = os.path.join(workflow_dir, fname) try: with open(fpath) as f: content = f.read() except (PermissionError, UnicodeDecodeError): continue for pattern, est_savings, recommendation in slow_patterns: if re.search(pattern, content): bottlenecks.append(Bottleneck( category="ci", name=f"{fname}: {pattern}", duration_s=est_savings, severity="info", recommendation=recommendation, file_path=f".github/workflows/{fname}" )) return bottlenecks def analyze_gitea_ci(repo_path: str) -> List[Bottleneck]: """Analyze Gitea/Drone CI config files.""" bottlenecks = [] ci_files = [".gitea/workflows", ".drone.yml", ".woodpecker.yml"] for ci_path in ci_files: full_path = os.path.join(repo_path, ci_path) if os.path.isfile(full_path): try: with open(full_path) as f: content = f.read() except (PermissionError, UnicodeDecodeError): continue if "pip install" in content and "--cache-dir" not in content: bottlenecks.append(Bottleneck( category="ci", name=f"{ci_path}: pip without cache", duration_s=5.0, severity="warning", recommendation="Add --cache-dir or mount pip cache volume", file_path=ci_path )) elif os.path.isdir(full_path): for fname in os.listdir(full_path): if not fname.endswith(('.yml', '.yaml')): continue fpath = os.path.join(full_path, fname) try: with open(fpath) as f: content = f.read() except (PermissionError, UnicodeDecodeError): continue if "pip install" in content and "--cache-dir" not in content: bottlenecks.append(Bottleneck( category="ci", name=f"{ci_path}/{fname}: pip without cache", duration_s=5.0, severity="warning", recommendation="Add --cache-dir or mount pip cache volume", file_path=f"{ci_path}/{fname}" )) return bottlenecks # ── Import Analysis ──────────────────────────────────────────────── def find_slow_imports(repo_path: str) -> List[Bottleneck]: """Find Python files with heavy import chains.""" bottlenecks = [] heavy_imports = { "pandas": 0.5, "numpy": 0.3, "torch": 2.0, "tensorflow": 3.0, "scipy": 0.5, "matplotlib": 0.8, "sklearn": 0.5, "transformers": 1.5, } for root, dirs, files in os.walk(repo_path): dirs[:] = [d for d in dirs if not d.startswith(('.', '__pycache__', 'node_modules'))] for fname in files: if not fname.endswith(".py"): continue fpath = os.path.join(root, fname) rel_path = os.path.relpath(fpath, repo_path) try: with open(fpath) as f: lines = f.readlines() except (PermissionError, UnicodeDecodeError): continue for i, line in enumerate(lines): stripped = line.strip() if stripped.startswith("import ") or stripped.startswith("from "): for heavy, est_time in heavy_imports.items(): if heavy in stripped: bottlenecks.append(Bottleneck( category="import", name=f"{rel_path}:{i+1}: import {heavy}", duration_s=est_time, severity="info" if est_time < 1.0 else "warning", recommendation=f"Heavy import ({heavy} ~{est_time}s). Consider lazy import or conditional import.", file_path=rel_path, line_number=i + 1 )) return bottlenecks # ── Report Generation ────────────────────────────────────────────── def severity_sort_key(b: Bottleneck) -> Tuple[int, float]: """Sort by severity then duration.""" sev_order = {"critical": 0, "warning": 1, "info": 2} return (sev_order.get(b.severity, 3), -b.duration_s) def generate_report(repo_path: str) -> PerfReport: """Run all analyses and generate a performance report.""" report = PerfReport( timestamp=datetime.now(timezone.utc).isoformat(), repo_path=os.path.abspath(repo_path) ) # Collect all bottlenecks all_bottlenecks = [] print("Scanning for slow tests (pytest cache)...") all_bottlenecks.extend(find_slow_tests_pytest(repo_path)) print("Scanning for slow test patterns...") all_bottlenecks.extend(find_slow_tests_by_scan(repo_path)) print("Analyzing build artifacts...") all_bottlenecks.extend(analyze_build_artifacts(repo_path)) print("Analyzing Makefiles...") all_bottlenecks.extend(analyze_makefile_targets(repo_path)) print("Analyzing CI workflows...") all_bottlenecks.extend(analyze_github_actions(repo_path)) all_bottlenecks.extend(analyze_gitea_ci(repo_path)) print("Scanning for heavy imports...") all_bottlenecks.extend(find_slow_imports(repo_path)) # Sort by severity and duration all_bottlenecks.sort(key=severity_sort_key) report.bottlenecks = all_bottlenecks[:TOP_N_BOTTLENECKS * 2] # Keep more for stats # Compute summary by_category = defaultdict(list) for b in all_bottlenecks: by_category[b.category].append(b) report.summary = { "total_bottlenecks": len(all_bottlenecks), "critical": sum(1 for b in all_bottlenecks if b.severity == "critical"), "warning": sum(1 for b in all_bottlenecks if b.severity == "warning"), "info": sum(1 for b in all_bottlenecks if b.severity == "info"), "estimated_total_slowdown_s": sum(b.duration_s for b in all_bottlenecks), "by_category": {cat: len(items) for cat, items in by_category.items()}, } report.test_stats = { "slow_tests": len(by_category.get("test", [])), "total_estimated_s": sum(b.duration_s for b in by_category.get("test", [])), } report.build_stats = { "build_issues": len(by_category.get("build", [])), "total_estimated_s": sum(b.duration_s for b in by_category.get("build", [])), } report.ci_stats = { "ci_issues": len(by_category.get("ci", [])), "total_estimated_s": sum(b.duration_s for b in by_category.get("ci", [])), } return report def format_markdown(report: PerfReport) -> str: """Format report as markdown.""" lines = [] lines.append(f"# Performance Bottleneck Report") lines.append(f"") lines.append(f"Generated: {report.timestamp}") lines.append(f"Repository: {report.repo_path}") lines.append(f"") # Summary s = report.summary lines.append(f"## Summary") lines.append(f"") lines.append(f"- **Total bottlenecks:** {s['total_bottlenecks']}") lines.append(f"- **Critical:** {s['critical']} | **Warning:** {s['warning']} | **Info:** {s['info']}") lines.append(f"- **Estimated total slowdown:** {s['estimated_total_slowdown_s']:.1f}s") lines.append(f"- **By category:** {', '.join(f'{k}: {v}' for k, v in s['by_category'].items())}") lines.append(f"") # Top bottlenecks lines.append(f"## Top {min(TOP_N_BOTTLENECKS, len(report.bottlenecks))} Bottlenecks") lines.append(f"") for i, b in enumerate(report.bottlenecks[:TOP_N_BOTTLENECKS], 1): icon = {"critical": "🔴", "warning": "🟡", "info": "🔵"}.get(b.severity, "⚪") loc = f" ({b.file_path}:{b.line_number})" if b.file_path else "" lines.append(f"{i}. {icon} **{b.category}** — {b.name}{loc}") lines.append(f" - Duration: ~{b.duration_s:.1f}s | Severity: {b.severity}") lines.append(f" - Fix: {b.recommendation}") lines.append(f"") # Category breakdowns for cat in ["test", "build", "ci", "import"]: items = [b for b in report.bottlenecks if b.category == cat] if items: lines.append(f"## {cat.title()} Bottlenecks") lines.append(f"") for b in items: icon = {"critical": "🔴", "warning": "🟡", "info": "🔵"}.get(b.severity, "⚪") loc = f" ({b.file_path}:{b.line_number})" if b.file_path else "" lines.append(f"- {icon} {b.name}{loc} — ~{b.duration_s:.1f}s — {b.recommendation}") lines.append(f"") return "\n".join(lines) # ── Main ─────────────────────────────────────────────────────────── def main(): parser = argparse.ArgumentParser(description="Performance Bottleneck Finder") parser.add_argument("--repo", default=".", help="Path to repository to analyze") parser.add_argument("--json", action="store_true", help="Output as JSON") parser.add_argument("--report", help="Write markdown report to file") parser.add_argument("--threshold", type=float, default=SLOW_TEST_THRESHOLD_S, help="Slow test threshold in seconds") args = parser.parse_args() # Threshold override handled via module-level default # (scan_tests uses SLOW_TEST_THRESHOLD_S from module scope) if not os.path.isdir(args.repo): print(f"Error: {args.repo} is not a directory", file=sys.stderr) sys.exit(1) report = generate_report(args.repo) if args.json: print(json.dumps(report.to_dict(), indent=2)) else: md = format_markdown(report) if args.report: os.makedirs(os.path.dirname(args.report) or ".", exist_ok=True) with open(args.report, "w") as f: f.write(md) print(f"Report written to {args.report}") else: print(md) # Exit code: 1 if critical bottlenecks found if report.summary.get("critical", 0) > 0: sys.exit(1) if __name__ == "__main__": main()