diff --git a/scripts/perf_bottleneck_finder.py b/scripts/perf_bottleneck_finder.py
index a9490d0..86e6fea 100644
--- a/scripts/perf_bottleneck_finder.py
+++ b/scripts/perf_bottleneck_finder.py
@@ -30,26 +30,30 @@ from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
 
-SLOW_TEST_THRESHOLD_S = 2.0
+# ── Configuration ──────────────────────────────────────────────────
+
+SLOW_TEST_THRESHOLD_S = 2.0  # Tests slower than this are flagged
 SLOW_BUILD_STEP_THRESHOLD_S = 10.0
-TOP_N_BOTTLENECKS = 10
-PYTEST_DURATIONS_COUNT = 20
+TOP_N_BOTTLENECKS = 10  # Report top N bottlenecks
+PYTEST_DURATIONS_COUNT = 20  # Number of slow tests to collect
 LOG_EXTENSIONS = {".log", ".txt"}
 
 
 @dataclass
 class Bottleneck:
-    category: str
-    name: str
-    duration_s: float
-    severity: str
-    recommendation: str
+    """A single performance bottleneck."""
+    category: str  # "test", "build", "ci", "artifact", "import"
+    name: str  # What's slow
+    duration_s: float  # How long it takes
+    severity: str  # "critical", "warning", "info"
+    recommendation: str  # How to fix
     file_path: Optional[str] = None
     line_number: Optional[int] = None
 
 
 @dataclass
 class PerfReport:
+    """Full performance report."""
    timestamp: str
     repo_path: str
     bottlenecks: List[Bottleneck] = field(default_factory=list)
@@ -59,16 +63,23 @@ class PerfReport:
     ci_stats: Dict[str, Any] = field(default_factory=dict)
 
     def to_dict(self) -> dict:
-        return asdict(self)
+        d = asdict(self)
+        return d
 
 
+# ── Test Analysis ──────────────────────────────────────────────────
+
 def find_slow_tests_pytest(repo_path: str) -> List[Bottleneck]:
+    """Run pytest --durations and parse slow tests."""
     bottlenecks = []
+
+    # Try to run pytest with durations
     try:
         subprocess.run(
             ["python3", "-m", "pytest", "--co", "-q", "--durations=0"],
             cwd=repo_path, capture_output=True, text=True, timeout=30
         )
+        # If tests exist, try to get durations from last run
         durations_file = os.path.join(repo_path, ".pytest_cache", "v", "cache", "durations")
         if os.path.exists(durations_file):
             with open(durations_file) as f:
@@ -81,45 +92,54 @@ def find_slow_tests_pytest(repo_path: str) -> List[Bottleneck]:
                 if duration > SLOW_TEST_THRESHOLD_S:
                     severity = "critical" if duration > 10 else "warning"
                     bottlenecks.append(Bottleneck(
-                        category="test", name=test_name, duration_s=duration,
+                        category="test",
+                        name=test_name,
+                        duration_s=duration,
                         severity=severity,
-                        recommendation=f"Test takes {duration:.1f}s. Consider mocking slow I/O."
+                        recommendation=f"Test takes {duration:.1f}s. Consider mocking slow I/O, using fixtures, or marking with @pytest.mark.slow."
                     ))
             except ValueError:
                 continue
     except (subprocess.TimeoutExpired, FileNotFoundError):
         pass
+
     return bottlenecks
 
 
 def find_slow_tests_by_scan(repo_path: str) -> List[Bottleneck]:
+    """Scan test files for patterns that indicate slow tests."""
     bottlenecks = []
     test_patterns = [
-        (r"time\.sleep\((\d+(?:\.\d+)?)\)", "Contains time.sleep() — consider using mock"),
+        (r"time\.sleep\((\d+(?:\.\d+)?)\)", "Contains time.sleep() — consider using mock or async wait"),
         (r"subprocess\.run\(.*timeout=(\d+)", "Subprocess with timeout — may block test"),
-        (r"requests\.(get|post|put|delete)\(", "Real HTTP call — mock with responses"),
-        (r"open\(.*[\'\"']w[\'\"']", "File I/O in test — use tmp_path fixture"),
+        (r"requests\.(get|post|put|delete)\(", "Real HTTP call — mock with responses or httpretty"),
+        (r"open\([^)]*['\"]w['\"]", "File I/O in test — use tmp_path fixture"),
     ]
 
     for root, dirs, files in os.walk(repo_path):
+        # Skip hidden and cache dirs
         dirs[:] = [d for d in dirs if not d.startswith(('.', '__pycache__', 'node_modules', '.git'))]
+
         for fname in files:
             if not (fname.startswith("test_") or fname.endswith("_test.py")):
                 continue
             if not fname.endswith(".py"):
                 continue
+
             fpath = os.path.join(root, fname)
             rel_path = os.path.relpath(fpath, repo_path)
+
             try:
                 with open(fpath) as f:
                     lines = f.readlines()
             except (PermissionError, UnicodeDecodeError):
                 continue
+
             for i, line in enumerate(lines):
                 for pattern, recommendation in test_patterns:
                     match = re.search(pattern, line)
                     if match:
-                        duration = 1.0
+                        duration = 1.0  # Default estimate
                         if "sleep" in pattern:
                             try:
                                 duration = float(match.group(1))
@@ -131,151 +151,227 @@ def find_slow_tests_by_scan(repo_path: str) -> List[Bottleneck]:
                             except (ValueError, IndexError):
                                 duration = 10.0
                         else:
-                            duration = 2.0
+                            duration = 2.0  # Estimated
+
                         bottlenecks.append(Bottleneck(
-                            category="test", name=f"{rel_path}:{i+1}", duration_s=duration,
+                            category="test",
+                            name=f"{rel_path}:{i+1}",
+                            duration_s=duration,
                             severity="warning" if duration < 5 else "critical",
-                            recommendation=recommendation, file_path=rel_path, line_number=i + 1
+                            recommendation=recommendation,
+                            file_path=rel_path,
+                            line_number=i + 1
                         ))
+
     return bottlenecks
 
 
+# ── Build Analysis ─────────────────────────────────────────────────
+
 def analyze_build_artifacts(repo_path: str) -> List[Bottleneck]:
+    """Find large build artifacts that slow down builds."""
     bottlenecks = []
     large_dirs = {
-        "node_modules": "Consider npm ci --production",
-        "__pycache__": "Consider .gitignore and cleaning",
+        "node_modules": "Consider using npm ci --production or yarn --production",
+        "__pycache__": "Consider .gitignore and cleaning before builds",
         ".tox": "Consider caching tox environments",
         ".pytest_cache": "Consider cleaning between CI runs",
-        "dist": "Check if dist/ rebuilt unnecessarily",
-        "build": "Check if build/ rebuilt unnecessarily",
-        ".next": "Consider incremental builds",
-        "venv": "Move outside repo or use Docker",
+        "dist": "Check if dist/ artifacts are being rebuilt unnecessarily",
+        "build": "Check if build/ artifacts are being rebuilt unnecessarily",
+        ".next": "Next.js cache — consider incremental builds",
+        "venv": "Virtual env in repo — move outside or use Docker",
     }
+
     for dirname, recommendation in large_dirs.items():
         dirpath = os.path.join(repo_path, dirname)
         if os.path.isdir(dirpath):
             total_size = 0
             file_count = 0
             for root, _, files in os.walk(dirpath):
                 for f in files:
                     try:
-                        total_size += os.path.getsize(os.path.join(root, f))
+                        fpath = os.path.join(root, f)
+                        total_size += os.path.getsize(fpath)
                         file_count += 1
                     except OSError:
                         pass
-            if total_size > 10 * 1024 * 1024:
+
+            if total_size > 10 * 1024 * 1024:  # > 10MB
                 size_mb = total_size / (1024 * 1024)
                 bottlenecks.append(Bottleneck(
                     category="build",
                     name=f"{dirname}/ ({size_mb:.1f}MB, {file_count} files)",
-                    duration_s=size_mb * 0.5,
+                    duration_s=size_mb * 0.5,  # Rough estimate
                     severity="critical" if size_mb > 100 else "warning",
                     recommendation=recommendation
                 ))
+
     return bottlenecks
 
 
 def analyze_makefile_targets(repo_path: str) -> List[Bottleneck]:
+    """Analyze Makefiles for potentially slow targets."""
     bottlenecks = []
-    slow_patterns = [
-        (r"pip install", "pip install without --no-deps"),
-        (r"npm install(?!.*--production)", "npm install without --production"),
-        (r"docker build", "Docker build — consider layer caching"),
-        (r"pytest(?!.*-x|--maxfail)", "pytest without early exit"),
-    ]
+    makefiles = []
+
     for root, dirs, files in os.walk(repo_path):
         dirs[:] = [d for d in dirs if not d.startswith(('.', '__pycache__'))]
         for f in files:
             if f in ("Makefile", "makefile", "GNUmakefile"):
-                fpath = os.path.join(root, f)
-                rel_path = os.path.relpath(fpath, repo_path)
-                try:
-                    with open(fpath) as fh:
-                        content = fh.read()
-                except (PermissionError, UnicodeDecodeError):
-                    continue
-                for pattern, recommendation in slow_patterns:
-                    if re.search(pattern, content):
-                        bottlenecks.append(Bottleneck(
-                            category="build", name=f"{rel_path}: {pattern}",
-                            duration_s=5.0, severity="info",
-                            recommendation=recommendation, file_path=rel_path
-                        ))
+                makefiles.append(os.path.join(root, f))
+
+    slow_patterns = [
+        (r"pip install", "pip install without --no-deps or constraints"),
+        (r"npm install(?!.*--production)", "npm install without --production flag"),
+        (r"docker build", "Docker build — consider multi-stage and layer caching"),
+        (r"pytest(?!.*(?:-x|--maxfail))", "pytest without early exit on failure"),
+        (r"mypy", "Type checking — consider incremental mode"),
+    ]
+
+    for mfile in makefiles:
+        rel_path = os.path.relpath(mfile, repo_path)
+        try:
+            with open(mfile) as f:
+                content = f.read()
+        except (PermissionError, UnicodeDecodeError):
+            continue
+
+        for pattern, recommendation in slow_patterns:
+            if re.search(pattern, content):
+                bottlenecks.append(Bottleneck(
+                    category="build",
+                    name=f"{rel_path}: {pattern}",
+                    duration_s=5.0,
+                    severity="info",
+                    recommendation=recommendation,
+                    file_path=rel_path
+                ))
+
     return bottlenecks
 
 
+# ── CI Analysis ────────────────────────────────────────────────────
+
 def analyze_github_actions(repo_path: str) -> List[Bottleneck]:
+    """Analyze GitHub Actions workflow files for inefficiencies."""
     bottlenecks = []
     workflow_dir = os.path.join(repo_path, ".github", "workflows")
+
     if not os.path.isdir(workflow_dir):
         return bottlenecks
+
     slow_patterns = [
-        (r"runs-on:\s*ubuntu-latest", 0, "Consider caching dependencies"),
-        (r"npm install", 2, "Use npm ci instead"),
-        (r"pip install(?!.*--cache-dir)", 2, "Add --cache-dir"),
+        (r"runs-on:\s*ubuntu-latest", 0, "Consider caching dependencies between runs"),
+        (r"npm install", 2, "Use npm ci instead of npm install for reproducible builds"),
+        (r"pip install(?!.*--cache-dir)", 2, "Add --cache-dir or use pip cache action"),
         (r"docker build(?!.*--cache-from)", 5, "Use Docker layer caching"),
+        (r"python -m pytest(?!.*(?:-n|--numprocesses))", 3, "Consider pytest-xdist for parallel test execution"),
     ]
+
     for fname in os.listdir(workflow_dir):
         if not fname.endswith(('.yml', '.yaml')):
             continue
+
         fpath = os.path.join(workflow_dir, fname)
         try:
             with open(fpath) as f:
                 content = f.read()
         except (PermissionError, UnicodeDecodeError):
             continue
+
         for pattern, est_savings, recommendation in slow_patterns:
             if re.search(pattern, content):
                 bottlenecks.append(Bottleneck(
-                    category="ci", name=f"{fname}: {pattern}",
-                    duration_s=est_savings, severity="info",
-                    recommendation=recommendation, file_path=f".github/workflows/{fname}"
+                    category="ci",
+                    name=f"{fname}: {pattern}",
+                    duration_s=est_savings,
+                    severity="info",
+                    recommendation=recommendation,
+                    file_path=f".github/workflows/{fname}"
                 ))
+
     return bottlenecks
 
 
 def analyze_gitea_ci(repo_path: str) -> List[Bottleneck]:
+    """Analyze Gitea/Drone CI config files."""
     bottlenecks = []
-    ci_dir = os.path.join(repo_path, ".gitea", "workflows")
-    if os.path.isdir(ci_dir):
-        for fname in os.listdir(ci_dir):
-            if not fname.endswith(('.yml', '.yaml')):
-                continue
-            fpath = os.path.join(ci_dir, fname)
+    ci_files = [".gitea/workflows", ".drone.yml", ".woodpecker.yml"]
+
+    for ci_path in ci_files:
+        full_path = os.path.join(repo_path, ci_path)
+        if os.path.isfile(full_path):
             try:
-                with open(fpath) as f:
+                with open(full_path) as f:
                     content = f.read()
             except (PermissionError, UnicodeDecodeError):
                 continue
+
             if "pip install" in content and "--cache-dir" not in content:
                 bottlenecks.append(Bottleneck(
-                    category="ci", name=f".gitea/workflows/{fname}: pip without cache",
-                    duration_s=5.0, severity="warning",
+                    category="ci",
+                    name=f"{ci_path}: pip without cache",
+                    duration_s=5.0,
+                    severity="warning",
                     recommendation="Add --cache-dir or mount pip cache volume",
-                    file_path=f".gitea/workflows/{fname}"
+                    file_path=ci_path
                 ))
+
+        elif os.path.isdir(full_path):
+            for fname in os.listdir(full_path):
+                if not fname.endswith(('.yml', '.yaml')):
+                    continue
+                fpath = os.path.join(full_path, fname)
+                try:
+                    with open(fpath) as f:
+                        content = f.read()
+                except (PermissionError, UnicodeDecodeError):
+                    continue
+
+                if "pip install" in content and "--cache-dir" not in content:
+                    bottlenecks.append(Bottleneck(
+                        category="ci",
+                        name=f"{ci_path}/{fname}: pip without cache",
+                        duration_s=5.0,
+                        severity="warning",
+                        recommendation="Add --cache-dir or mount pip cache volume",
+                        file_path=f"{ci_path}/{fname}"
+                    ))
+
     return bottlenecks
 
 
+# ── Import Analysis ────────────────────────────────────────────────
+
 def find_slow_imports(repo_path: str) -> List[Bottleneck]:
+    """Find Python files with heavy import chains."""
     bottlenecks = []
     heavy_imports = {
-        "pandas": 0.5, "numpy": 0.3, "torch": 2.0, "tensorflow": 3.0,
-        "scipy": 0.5, "matplotlib": 0.8, "sklearn": 0.5, "transformers": 1.5,
+        "pandas": 0.5,
+        "numpy": 0.3,
+        "torch": 2.0,
+        "tensorflow": 3.0,
+        "scipy": 0.5,
+        "matplotlib": 0.8,
+        "sklearn": 0.5,
+        "transformers": 1.5,
     }
+
     for root, dirs, files in os.walk(repo_path):
         dirs[:] = [d for d in dirs if not d.startswith(('.', '__pycache__', 'node_modules'))]
         for fname in files:
             if not fname.endswith(".py"):
                 continue
+
             fpath = os.path.join(root, fname)
             rel_path = os.path.relpath(fpath, repo_path)
+
             try:
                 with open(fpath) as f:
                     lines = f.readlines()
             except (PermissionError, UnicodeDecodeError):
                 continue
+
             for i, line in enumerate(lines):
                 stripped = line.strip()
                 if stripped.startswith("import ") or stripped.startswith("from "):
@@ -286,40 +382,56 @@ def find_slow_imports(repo_path: str) -> List[Bottleneck]:
                             name=f"{rel_path}:{i+1}: import {heavy}",
                             duration_s=est_time,
                             severity="info" if est_time < 1.0 else "warning",
-                            recommendation=f"Heavy import ({heavy} ~{est_time}s). Consider lazy import.",
-                            file_path=rel_path, line_number=i + 1
+                            recommendation=f"Heavy import ({heavy} ~{est_time}s). Consider lazy import or conditional import.",
+                            file_path=rel_path,
+                            line_number=i + 1
                         ))
+
     return bottlenecks
 
 
+# ── Report Generation ──────────────────────────────────────────────
+
 def severity_sort_key(b: Bottleneck) -> Tuple[int, float]:
+    """Sort by severity then duration."""
     sev_order = {"critical": 0, "warning": 1, "info": 2}
     return (sev_order.get(b.severity, 3), -b.duration_s)
 
 
 def generate_report(repo_path: str) -> PerfReport:
+    """Run all analyses and generate a performance report."""
     report = PerfReport(
         timestamp=datetime.now(timezone.utc).isoformat(),
         repo_path=os.path.abspath(repo_path)
     )
+
+    # Collect all bottlenecks
     all_bottlenecks = []
+
     print("Scanning for slow tests (pytest cache)...")
     all_bottlenecks.extend(find_slow_tests_pytest(repo_path))
+
     print("Scanning for slow test patterns...")
     all_bottlenecks.extend(find_slow_tests_by_scan(repo_path))
+
     print("Analyzing build artifacts...")
     all_bottlenecks.extend(analyze_build_artifacts(repo_path))
+
     print("Analyzing Makefiles...")
     all_bottlenecks.extend(analyze_makefile_targets(repo_path))
+
     print("Analyzing CI workflows...")
     all_bottlenecks.extend(analyze_github_actions(repo_path))
     all_bottlenecks.extend(analyze_gitea_ci(repo_path))
+
     print("Scanning for heavy imports...")
     all_bottlenecks.extend(find_slow_imports(repo_path))
 
+    # Sort by severity and duration
     all_bottlenecks.sort(key=severity_sort_key)
-    report.bottlenecks = all_bottlenecks[:TOP_N_BOTTLENECKS * 2]
+    report.bottlenecks = all_bottlenecks[:TOP_N_BOTTLENECKS * 2]  # Keep more for stats
 
+    # Compute summary
     by_category = defaultdict(list)
     for b in all_bottlenecks:
         by_category[b.category].append(b)
@@ -332,53 +444,81 @@ def generate_report(repo_path: str) -> PerfReport:
         "estimated_total_slowdown_s": sum(b.duration_s for b in all_bottlenecks),
         "by_category": {cat: len(items) for cat, items in by_category.items()},
     }
+
     report.test_stats = {
         "slow_tests": len(by_category.get("test", [])),
         "total_estimated_s": sum(b.duration_s for b in by_category.get("test", [])),
     }
+
     report.build_stats = {
         "build_issues": len(by_category.get("build", [])),
         "total_estimated_s": sum(b.duration_s for b in by_category.get("build", [])),
     }
+
     report.ci_stats = {
         "ci_issues": len(by_category.get("ci", [])),
         "total_estimated_s": sum(b.duration_s for b in by_category.get("ci", [])),
     }
+
     return report
 
 
 def format_markdown(report: PerfReport) -> str:
+    """Format report as markdown."""
     lines = []
-    lines.append("# Performance Bottleneck Report\n")
+    lines.append("# Performance Bottleneck Report")
+    lines.append("")
     lines.append(f"Generated: {report.timestamp}")
-    lines.append(f"Repository: {report.repo_path}\n")
+    lines.append(f"Repository: {report.repo_path}")
+    lines.append("")
+
+    # Summary
     s = report.summary
-    lines.append("## Summary\n")
+    lines.append("## Summary")
+    lines.append("")
     lines.append(f"- **Total bottlenecks:** {s['total_bottlenecks']}")
     lines.append(f"- **Critical:** {s['critical']} | **Warning:** {s['warning']} | **Info:** {s['info']}")
-    lines.append(f"- **Estimated total slowdown:** {s['estimated_total_slowdown_s']:.1f}s\n")
-    lines.append(f"## Top {min(TOP_N_BOTTLENECKS, len(report.bottlenecks))} Bottlenecks\n")
+    lines.append(f"- **Estimated total slowdown:** {s['estimated_total_slowdown_s']:.1f}s")
+    lines.append(f"- **By category:** {', '.join(f'{k}: {v}' for k, v in s['by_category'].items())}")
+    lines.append("")
+
+    # Top bottlenecks
lines.append(f"## Top {min(TOP_N_BOTTLENECKS, len(report.bottlenecks))} Bottlenecks") + lines.append(f"") + for i, b in enumerate(report.bottlenecks[:TOP_N_BOTTLENECKS], 1): - icon = {"critical": "CRIT", "warning": "WARN", "info": "INFO"}.get(b.severity, "?") + icon = {"critical": "🔴", "warning": "🟡", "info": "🔵"}.get(b.severity, "⚪") loc = f" ({b.file_path}:{b.line_number})" if b.file_path else "" - lines.append(f"{i}. [{icon}] **{b.category}** -- {b.name}{loc}") - lines.append(f" Duration: ~{b.duration_s:.1f}s | {b.recommendation} -") + lines.append(f"{i}. {icon} **{b.category}** — {b.name}{loc}") + lines.append(f" - Duration: ~{b.duration_s:.1f}s | Severity: {b.severity}") + lines.append(f" - Fix: {b.recommendation}") + lines.append(f"") + + # Category breakdowns + for cat in ["test", "build", "ci", "import"]: + items = [b for b in report.bottlenecks if b.category == cat] + if items: + lines.append(f"## {cat.title()} Bottlenecks") + lines.append(f"") + for b in items: + icon = {"critical": "🔴", "warning": "🟡", "info": "🔵"}.get(b.severity, "⚪") + loc = f" ({b.file_path}:{b.line_number})" if b.file_path else "" + lines.append(f"- {icon} {b.name}{loc} — ~{b.duration_s:.1f}s — {b.recommendation}") + lines.append(f"") + return " ".join(lines) +# ── Main ─────────────────────────────────────────────────────────── + def main(): parser = argparse.ArgumentParser(description="Performance Bottleneck Finder") parser.add_argument("--repo", default=".", help="Path to repository to analyze") parser.add_argument("--json", action="store_true", help="Output as JSON") parser.add_argument("--report", help="Write markdown report to file") - parser.add_argument("--threshold", type=float, default=SLOW_TEST_THRESHOLD_S) + parser.add_argument("--threshold", type=float, default=SLOW_TEST_THRESHOLD_S, + help="Slow test threshold in seconds") args = parser.parse_args() global SLOW_TEST_THRESHOLD_S @@ -402,6 +542,7 @@ def main(): else: print(md) + # Exit code: 1 if critical bottlenecks found if report.summary.get("critical", 0) > 0: sys.exit(1)