From bd8e044fb841574df2f530588edffd8197ad1ee6 Mon Sep 17 00:00:00 2001
From: Alexander Whitestone
Date: Tue, 21 Apr 2026 11:19:07 +0000
Subject: [PATCH] fix(#211): remove corrupted file

---
 scripts/perf_bottleneck_finder.py | 551 ------------------------------
 1 file changed, 551 deletions(-)
 delete mode 100644 scripts/perf_bottleneck_finder.py

diff --git a/scripts/perf_bottleneck_finder.py b/scripts/perf_bottleneck_finder.py
deleted file mode 100644
index 6d43f0e..0000000
--- a/scripts/perf_bottleneck_finder.py
+++ /dev/null
@@ -1,551 +0,0 @@
-#!/usr/bin/env python3
-"""
-Performance Bottleneck Finder — Identify slow tests, builds, and CI steps.
-
-Analyzes:
-    1. Pytest output for slow tests
-    2. Build logs for slow steps
-    3. CI workflow durations
-    4. File system for large/slow artifacts
-
-Usage:
-    python3 scripts/perf_bottleneck_finder.py --repo /path/to/repo
-    python3 scripts/perf_bottleneck_finder.py --repo /path/to/repo --json
-    python3 scripts/perf_bottleneck_finder.py --repo /path/to/repo --report metrics/perf_report.md
-
-Weekly cron:
-    0 9 * * 1 cd /path/to/compounding-intelligence && python3 scripts/perf_bottleneck_finder.py --repo /path/to/target --report metrics/perf_report.md
-"""
-
-import argparse
-import json
-import os
-import re
-import subprocess
-import sys
-from collections import defaultdict
-from dataclasses import dataclass, field, asdict
-from datetime import datetime, timezone
-from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple
-
-
-# ── Configuration ──────────────────────────────────────────────────
-
-SLOW_TEST_THRESHOLD_S = 2.0         # Tests slower than this are flagged
-SLOW_BUILD_STEP_THRESHOLD_S = 10.0
-TOP_N_BOTTLENECKS = 10              # Report top N bottlenecks
-PYTEST_DURATIONS_COUNT = 20         # Number of slow tests to collect
-LOG_EXTENSIONS = {".log", ".txt"}
-
-
-@dataclass
-class Bottleneck:
-    """A single performance bottleneck."""
-    category: str        # "test", "build", "ci", "artifact", "import"
-    name: str            # What's slow
-    duration_s: float    # How long it takes
-    severity: str        # "critical", "warning", "info"
-    recommendation: str  # How to fix
-    file_path: Optional[str] = None
-    line_number: Optional[int] = None
-
-
-@dataclass
-class PerfReport:
-    """Full performance report."""
-    timestamp: str
-    repo_path: str
-    bottlenecks: List[Bottleneck] = field(default_factory=list)
-    summary: Dict[str, Any] = field(default_factory=dict)
-    test_stats: Dict[str, Any] = field(default_factory=dict)
-    build_stats: Dict[str, Any] = field(default_factory=dict)
-    ci_stats: Dict[str, Any] = field(default_factory=dict)
-
-    def to_dict(self) -> dict:
-        return asdict(self)
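-
-
-# Example (hypothetical values): a flagged slow test produces a record like
-#   Bottleneck(category="test", name="tests/test_io.py::test_sync_upload",
-#              duration_s=4.2, severity="warning",
-#              recommendation="Test takes 4.2s. Consider mocking slow I/O...",
-#              file_path="tests/test_io.py", line_number=17)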
category="test", - name=test_name, - duration_s=duration, - severity=severity, - recommendation=f"Test takes {duration:.1f}s. Consider mocking slow I/O, using fixtures, or marking with @pytest.mark.slow." - )) - except ValueError: - continue - except (subprocess.TimeoutExpired, FileNotFoundError): - pass - - return bottlenecks - - -def find_slow_tests_by_scan(repo_path: str) -> List[Bottleneck]: - """Scan test files for patterns that indicate slow tests.""" - bottlenecks = [] - test_patterns = [ - (r"time\.sleep\((\d+(?:\.\d+)?)\)", "Contains time.sleep() — consider using mock or async wait"), - (r"subprocess\.run\(.*timeout=(\d+)", "Subprocess with timeout — may block test"), - (r"requests\.(get|post|put|delete)\(", "Real HTTP call — mock with responses or httpretty"), - ("open\\([^)]*)[\047\042]w[\047\042]", "File I/O in test — use tmp_path fixture"), - ] - - for root, dirs, files in os.walk(repo_path): - # Skip hidden and cache dirs - dirs[:] = [d for d in dirs if not d.startswith(('.', '__pycache__', 'node_modules', '.git'))] - - for fname in files: - if not (fname.startswith("test_") or fname.endswith("_test.py")): - continue - if not fname.endswith(".py"): - continue - - fpath = os.path.join(root, fname) - rel_path = os.path.relpath(fpath, repo_path) - - try: - with open(fpath) as f: - lines = f.readlines() - except (PermissionError, UnicodeDecodeError): - continue - - for i, line in enumerate(lines): - for pattern, recommendation in test_patterns: - match = re.search(pattern, line) - if match: - duration = 1.0 # Default estimate - if "sleep" in pattern: - try: - duration = float(match.group(1)) - except (ValueError, IndexError): - duration = 1.0 - elif "timeout" in pattern: - try: - duration = float(match.group(1)) - except (ValueError, IndexError): - duration = 10.0 - else: - duration = 2.0 # Estimated - - bottlenecks.append(Bottleneck( - category="test", - name=f"{rel_path}:{i+1}", - duration_s=duration, - severity="warning" if duration < 5 else "critical", - recommendation=recommendation, - file_path=rel_path, - line_number=i + 1 - )) - - return bottlenecks - - -# ── Build Analysis ───────────────────────────────────────────────── - -def analyze_build_artifacts(repo_path: str) -> List[Bottleneck]: - """Find large build artifacts that slow down builds.""" - bottlenecks = [] - large_dirs = { - "node_modules": "Consider using npm ci --production or yarn --production", - "__pycache__": "Consider .gitignore and cleaning before builds", - ".tox": "Consider caching tox environments", - ".pytest_cache": "Consider cleaning between CI runs", - "dist": "Check if dist/ artifacts are being rebuilt unnecessarily", - "build": "Check if build/ artifacts are being rebuilt unnecessarily", - ".next": "Next.js cache — consider incremental builds", - "venv": "Virtual env in repo — move outside or use Docker", - } - - for dirname, recommendation in large_dirs.items(): - dirpath = os.path.join(repo_path, dirname) - if os.path.isdir(dirpath): - total_size = 0 - file_count = 0 - for root, dirs, files in os.walk(dirpath): - for f in files: - try: - fpath = os.path.join(root, f) - total_size += os.path.getsize(fpath) - file_count += 1 - except OSError: - pass - - if total_size > 10 * 1024 * 1024: # > 10MB - size_mb = total_size / (1024 * 1024) - bottlenecks.append(Bottleneck( - category="build", - name=f"{dirname}/ ({size_mb:.1f}MB, {file_count} files)", - duration_s=size_mb * 0.5, # Rough estimate - severity="critical" if size_mb > 100 else "warning", - recommendation=recommendation - )) - - return 
-
-
-def find_slow_tests_by_scan(repo_path: str) -> List[Bottleneck]:
-    """Scan test files for patterns that indicate slow tests."""
-    bottlenecks = []
-    test_patterns = [
-        (r"time\.sleep\((\d+(?:\.\d+)?)\)", "Contains time.sleep() — consider using mock or async wait"),
-        (r"subprocess\.run\(.*timeout=(\d+)", "Subprocess with timeout — may block test"),
-        (r"requests\.(get|post|put|delete)\(", "Real HTTP call — mock with responses or httpretty"),
-        (r"open\([^)]*[\047\042]w[\047\042]", "File I/O in test — use tmp_path fixture"),
-    ]
-
-    for root, dirs, files in os.walk(repo_path):
-        # Skip hidden and cache dirs
-        dirs[:] = [d for d in dirs if not d.startswith(('.', '__pycache__', 'node_modules', '.git'))]
-
-        for fname in files:
-            if not (fname.startswith("test_") or fname.endswith("_test.py")):
-                continue
-            if not fname.endswith(".py"):
-                continue
-
-            fpath = os.path.join(root, fname)
-            rel_path = os.path.relpath(fpath, repo_path)
-
-            try:
-                with open(fpath) as f:
-                    lines = f.readlines()
-            except (PermissionError, UnicodeDecodeError):
-                continue
-
-            for i, line in enumerate(lines):
-                for pattern, recommendation in test_patterns:
-                    match = re.search(pattern, line)
-                    if match:
-                        duration = 1.0  # Default estimate
-                        if "sleep" in pattern:
-                            try:
-                                duration = float(match.group(1))
-                            except (ValueError, IndexError):
-                                duration = 1.0
-                        elif "timeout" in pattern:
-                            try:
-                                duration = float(match.group(1))
-                            except (ValueError, IndexError):
-                                duration = 10.0
-                        else:
-                            duration = 2.0  # Estimated
-
-                        bottlenecks.append(Bottleneck(
-                            category="test",
-                            name=f"{rel_path}:{i+1}",
-                            duration_s=duration,
-                            severity="warning" if duration < 5 else "critical",
-                            recommendation=recommendation,
-                            file_path=rel_path,
-                            line_number=i + 1
-                        ))
-
-    return bottlenecks
-
-
-# ── Build Analysis ─────────────────────────────────────────────────
-
-def analyze_build_artifacts(repo_path: str) -> List[Bottleneck]:
-    """Find large build artifacts that slow down builds."""
-    bottlenecks = []
-    large_dirs = {
-        "node_modules": "Consider using npm ci --production or yarn --production",
-        "__pycache__": "Consider .gitignore and cleaning before builds",
-        ".tox": "Consider caching tox environments",
-        ".pytest_cache": "Consider cleaning between CI runs",
-        "dist": "Check if dist/ artifacts are being rebuilt unnecessarily",
-        "build": "Check if build/ artifacts are being rebuilt unnecessarily",
-        ".next": "Next.js cache — consider incremental builds",
-        "venv": "Virtual env in repo — move outside or use Docker",
-    }
-
-    for dirname, recommendation in large_dirs.items():
-        dirpath = os.path.join(repo_path, dirname)
-        if os.path.isdir(dirpath):
-            total_size = 0
-            file_count = 0
-            for root, dirs, files in os.walk(dirpath):
-                for f in files:
-                    try:
-                        fpath = os.path.join(root, f)
-                        total_size += os.path.getsize(fpath)
-                        file_count += 1
-                    except OSError:
-                        pass
-
-            if total_size > 10 * 1024 * 1024:  # > 10MB
-                size_mb = total_size / (1024 * 1024)
-                bottlenecks.append(Bottleneck(
-                    category="build",
-                    name=f"{dirname}/ ({size_mb:.1f}MB, {file_count} files)",
-                    duration_s=size_mb * 0.5,  # Rough estimate
-                    severity="critical" if size_mb > 100 else "warning",
-                    recommendation=recommendation
-                ))
-
-    return bottlenecks
-
-
-def analyze_makefile_targets(repo_path: str) -> List[Bottleneck]:
-    """Analyze Makefile for potentially slow targets."""
-    bottlenecks = []
-    makefiles = []
-
-    for root, dirs, files in os.walk(repo_path):
-        dirs[:] = [d for d in dirs if not d.startswith(('.', '__pycache__'))]
-        for f in files:
-            if f in ("Makefile", "makefile", "GNUmakefile"):
-                makefiles.append(os.path.join(root, f))
-
-    slow_patterns = [
-        (r"pip install", "pip install without --no-deps or constraints"),
-        (r"npm install(?!.*--production)", "npm install without --production flag"),
-        (r"docker build", "Docker build — consider multi-stage and layer caching"),
-        (r"pytest(?!.*(?:-x|--maxfail))", "pytest without early exit on failure"),
-        (r"mypy", "Type checking — consider incremental mode"),
-    ]
-
-    for mfile in makefiles:
-        rel_path = os.path.relpath(mfile, repo_path)
-        try:
-            with open(mfile) as f:
-                content = f.read()
-        except (PermissionError, UnicodeDecodeError):
-            continue
-
-        for pattern, recommendation in slow_patterns:
-            if re.search(pattern, content):
-                bottlenecks.append(Bottleneck(
-                    category="build",
-                    name=f"{rel_path}: {pattern}",
-                    duration_s=5.0,
-                    severity="info",
-                    recommendation=recommendation,
-                    file_path=rel_path
-                ))
-
-    return bottlenecks
-
-
-# ── CI Analysis ────────────────────────────────────────────────────
-
-def analyze_github_actions(repo_path: str) -> List[Bottleneck]:
-    """Analyze GitHub Actions workflow files for inefficiencies."""
-    bottlenecks = []
-    workflow_dir = os.path.join(repo_path, ".github", "workflows")
-
-    if not os.path.isdir(workflow_dir):
-        return bottlenecks
-
-    slow_patterns = [
-        (r"runs-on:\s*ubuntu-latest", 0, "Consider caching dependencies between runs"),
-        (r"npm install", 2, "Use npm ci instead of npm install for reproducible builds"),
-        (r"pip install(?!.*--cache-dir)", 2, "Add --cache-dir or use pip cache action"),
-        (r"docker build(?!.*--cache-from)", 5, "Use Docker layer caching"),
-        (r"python -m pytest(?!.*(?:-n|--numprocesses))", 3, "Consider pytest-xdist for parallel test execution"),
-    ]
-
-    for fname in os.listdir(workflow_dir):
-        if not fname.endswith(('.yml', '.yaml')):
-            continue
-
-        fpath = os.path.join(workflow_dir, fname)
-        try:
-            with open(fpath) as f:
-                content = f.read()
-        except (PermissionError, UnicodeDecodeError):
-            continue
-
-        for pattern, est_savings, recommendation in slow_patterns:
-            if re.search(pattern, content):
-                bottlenecks.append(Bottleneck(
-                    category="ci",
-                    name=f"{fname}: {pattern}",
-                    duration_s=est_savings,
-                    severity="info",
-                    recommendation=recommendation,
-                    file_path=f".github/workflows/{fname}"
-                ))
-
-    return bottlenecks
-
-
-def analyze_gitea_ci(repo_path: str) -> List[Bottleneck]:
-    """Analyze Gitea/Drone CI config files."""
-    bottlenecks = []
-    ci_files = [".gitea/workflows", ".drone.yml", ".woodpecker.yml"]
-
-    for ci_path in ci_files:
-        full_path = os.path.join(repo_path, ci_path)
-        if os.path.isfile(full_path):
-            try:
-                with open(full_path) as f:
-                    content = f.read()
-            except (PermissionError, UnicodeDecodeError):
-                continue
-
-            if "pip install" in content and "--cache-dir" not in content:
-                bottlenecks.append(Bottleneck(
-                    category="ci",
-                    name=f"{ci_path}: pip without cache",
-                    duration_s=5.0,
-                    severity="warning",
-                    recommendation="Add --cache-dir or mount pip cache volume",
-                    file_path=ci_path
-                ))
-
-        elif os.path.isdir(full_path):
-            for fname in os.listdir(full_path):
-                if not fname.endswith(('.yml', '.yaml')):
-                    continue
-                fpath = os.path.join(full_path, fname)
-                try:
-                    with open(fpath) as f:
-                        content = f.read()
-                except (PermissionError, UnicodeDecodeError):
-                    continue
-
-                if "pip install" in content and "--cache-dir" not in content:
-                    bottlenecks.append(Bottleneck(
-                        category="ci",
-                        name=f"{ci_path}/{fname}: pip without cache",
-                        duration_s=5.0,
-                        severity="warning",
-                        recommendation="Add --cache-dir or mount pip cache volume",
-                        file_path=f"{ci_path}/{fname}"
-                    ))
-
-    return bottlenecks
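-
-
-# A step the pip-cache checks above would flag, and the cached form they
-# recommend (hypothetical YAML):
-#   - run: pip install -r requirements.txt                          # flagged
-#   - run: pip install --cache-dir .pip-cache -r requirements.txt   # passes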
-
-
-# ── Import Analysis ────────────────────────────────────────────────
-
-def find_slow_imports(repo_path: str) -> List[Bottleneck]:
-    """Find Python files with heavy import chains."""
-    bottlenecks = []
-    heavy_imports = {
-        "pandas": 0.5,
-        "numpy": 0.3,
-        "torch": 2.0,
-        "tensorflow": 3.0,
-        "scipy": 0.5,
-        "matplotlib": 0.8,
-        "sklearn": 0.5,
-        "transformers": 1.5,
-    }
-
-    for root, dirs, files in os.walk(repo_path):
-        dirs[:] = [d for d in dirs if not d.startswith(('.', '__pycache__', 'node_modules'))]
-        for fname in files:
-            if not fname.endswith(".py"):
-                continue
-
-            fpath = os.path.join(root, fname)
-            rel_path = os.path.relpath(fpath, repo_path)
-
-            try:
-                with open(fpath) as f:
-                    lines = f.readlines()
-            except (PermissionError, UnicodeDecodeError):
-                continue
-
-            for i, line in enumerate(lines):
-                stripped = line.strip()
-                if stripped.startswith("import ") or stripped.startswith("from "):
-                    for heavy, est_time in heavy_imports.items():
-                        if heavy in stripped:
-                            bottlenecks.append(Bottleneck(
-                                category="import",
-                                name=f"{rel_path}:{i+1}: import {heavy}",
-                                duration_s=est_time,
-                                severity="info" if est_time < 1.0 else "warning",
-                                recommendation=f"Heavy import ({heavy} ~{est_time}s). Consider lazy import or conditional import.",
-                                file_path=rel_path,
-                                line_number=i + 1
-                            ))
-
-    return bottlenecks
-
-
-# ── Report Generation ──────────────────────────────────────────────
-
-def severity_sort_key(b: Bottleneck) -> Tuple[int, float]:
-    """Sort by severity then duration."""
-    sev_order = {"critical": 0, "warning": 1, "info": 2}
-    return (sev_order.get(b.severity, 3), -b.duration_s)
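-
-
-# Example (hypothetical values): critical sorts before warning, and longer
-# durations sort first within a severity, since the key negates duration_s:
-#   severity_sort_key(Bottleneck("ci", "a", 9.0, "critical", "")) == (0, -9.0)
-#   severity_sort_key(Bottleneck("test", "b", 3.0, "warning", "")) == (1, -3.0)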
-
-
-def generate_report(repo_path: str) -> PerfReport:
-    """Run all analyses and generate a performance report."""
-    report = PerfReport(
-        timestamp=datetime.now(timezone.utc).isoformat(),
-        repo_path=os.path.abspath(repo_path)
-    )
-
-    # Collect all bottlenecks
-    all_bottlenecks = []
-
-    print("Scanning for slow tests (pytest cache)...")
-    all_bottlenecks.extend(find_slow_tests_pytest(repo_path))
-
-    print("Scanning for slow test patterns...")
-    all_bottlenecks.extend(find_slow_tests_by_scan(repo_path))
-
-    print("Analyzing build artifacts...")
-    all_bottlenecks.extend(analyze_build_artifacts(repo_path))
-
-    print("Analyzing Makefiles...")
-    all_bottlenecks.extend(analyze_makefile_targets(repo_path))
-
-    print("Analyzing CI workflows...")
-    all_bottlenecks.extend(analyze_github_actions(repo_path))
-    all_bottlenecks.extend(analyze_gitea_ci(repo_path))
-
-    print("Scanning for heavy imports...")
-    all_bottlenecks.extend(find_slow_imports(repo_path))
-
-    # Sort by severity and duration
-    all_bottlenecks.sort(key=severity_sort_key)
-    report.bottlenecks = all_bottlenecks[:TOP_N_BOTTLENECKS * 2]  # Keep more for stats
-
-    # Compute summary
-    by_category = defaultdict(list)
-    for b in all_bottlenecks:
-        by_category[b.category].append(b)
-
-    report.summary = {
-        "total_bottlenecks": len(all_bottlenecks),
-        "critical": sum(1 for b in all_bottlenecks if b.severity == "critical"),
-        "warning": sum(1 for b in all_bottlenecks if b.severity == "warning"),
-        "info": sum(1 for b in all_bottlenecks if b.severity == "info"),
-        "estimated_total_slowdown_s": sum(b.duration_s for b in all_bottlenecks),
-        "by_category": {cat: len(items) for cat, items in by_category.items()},
-    }
-
-    report.test_stats = {
-        "slow_tests": len(by_category.get("test", [])),
-        "total_estimated_s": sum(b.duration_s for b in by_category.get("test", [])),
-    }
-
-    report.build_stats = {
-        "build_issues": len(by_category.get("build", [])),
-        "total_estimated_s": sum(b.duration_s for b in by_category.get("build", [])),
-    }
-
-    report.ci_stats = {
-        "ci_issues": len(by_category.get("ci", [])),
-        "total_estimated_s": sum(b.duration_s for b in by_category.get("ci", [])),
-    }
-
-    return report
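-
-
-# With --json, the summary above serializes roughly as (illustrative values):
-#   "summary": {"total_bottlenecks": 12, "critical": 1, "warning": 4,
-#               "info": 7, "estimated_total_slowdown_s": 48.5,
-#               "by_category": {"test": 5, "ci": 4, "build": 3}}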
-
-
-def format_markdown(report: PerfReport) -> str:
-    """Format report as markdown."""
-    lines = []
-    lines.append("# Performance Bottleneck Report")
-    lines.append("")
-    lines.append(f"Generated: {report.timestamp}")
-    lines.append(f"Repository: {report.repo_path}")
-    lines.append("")
-
-    # Summary
-    s = report.summary
-    lines.append("## Summary")
-    lines.append("")
-    lines.append(f"- **Total bottlenecks:** {s['total_bottlenecks']}")
-    lines.append(f"- **Critical:** {s['critical']} | **Warning:** {s['warning']} | **Info:** {s['info']}")
-    lines.append(f"- **Estimated total slowdown:** {s['estimated_total_slowdown_s']:.1f}s")
-    lines.append(f"- **By category:** {', '.join(f'{k}: {v}' for k, v in s['by_category'].items())}")
-    lines.append("")
-
-    # Top bottlenecks
-    lines.append(f"## Top {min(TOP_N_BOTTLENECKS, len(report.bottlenecks))} Bottlenecks")
-    lines.append("")
-
-    for i, b in enumerate(report.bottlenecks[:TOP_N_BOTTLENECKS], 1):
-        icon = {"critical": "🔴", "warning": "🟡", "info": "🔵"}.get(b.severity, "⚪")
-        loc = f" ({b.file_path}:{b.line_number})" if b.file_path else ""
-        lines.append(f"{i}. {icon} **{b.category}** — {b.name}{loc}")
-        lines.append(f"   - Duration: ~{b.duration_s:.1f}s | Severity: {b.severity}")
-        lines.append(f"   - Fix: {b.recommendation}")
-        lines.append("")
-
-    # Category breakdowns
-    for cat in ["test", "build", "ci", "import"]:
-        items = [b for b in report.bottlenecks if b.category == cat]
-        if items:
-            lines.append(f"## {cat.title()} Bottlenecks")
-            lines.append("")
-            for b in items:
-                icon = {"critical": "🔴", "warning": "🟡", "info": "🔵"}.get(b.severity, "⚪")
-                loc = f" ({b.file_path}:{b.line_number})" if b.file_path else ""
-                lines.append(f"- {icon} {b.name}{loc} — ~{b.duration_s:.1f}s — {b.recommendation}")
-            lines.append("")
-
-    return "\n".join(lines)
-
-
-# ── Main ───────────────────────────────────────────────────────────
-
-def main():
-    # Must precede the first read of SLOW_TEST_THRESHOLD_S below
-    # (global after use is a SyntaxError).
-    global SLOW_TEST_THRESHOLD_S
-
-    parser = argparse.ArgumentParser(description="Performance Bottleneck Finder")
-    parser.add_argument("--repo", default=".", help="Path to repository to analyze")
-    parser.add_argument("--json", action="store_true", help="Output as JSON")
-    parser.add_argument("--report", help="Write markdown report to file")
-    parser.add_argument("--threshold", type=float, default=SLOW_TEST_THRESHOLD_S,
-                        help="Slow test threshold in seconds")
-    args = parser.parse_args()
-
-    SLOW_TEST_THRESHOLD_S = args.threshold
-
-    if not os.path.isdir(args.repo):
-        print(f"Error: {args.repo} is not a directory", file=sys.stderr)
-        sys.exit(1)
-
-    report = generate_report(args.repo)
-
-    if args.json:
-        print(json.dumps(report.to_dict(), indent=2))
-    else:
-        md = format_markdown(report)
-        if args.report:
-            os.makedirs(os.path.dirname(args.report) or ".", exist_ok=True)
-            with open(args.report, "w") as f:
-                f.write(md)
-            print(f"Report written to {args.report}")
-        else:
-            print(md)
-
-    # Exit code: 1 if critical bottlenecks found
-    if report.summary.get("critical", 0) > 0:
-        sys.exit(1)
-
-
-if __name__ == "__main__":
-    main()