fix(#211): fix regex syntax error in test_patterns list

2026-04-21 11:20:29 +00:00
parent de37e743be
commit bfc1f5613b
1 changed files with 226 additions and 85 deletions
--- a/scripts/perf_bottleneck_finder.py
+++ b/scripts/perf_bottleneck_finder.py
@@ -30,26 +30,30 @@ from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple


-SLOW_TEST_THRESHOLD_S = 2.0
+# ── Configuration ──────────────────────────────────────────────────
+
+SLOW_TEST_THRESHOLD_S = 2.0      # Tests slower than this are flagged
 SLOW_BUILD_STEP_THRESHOLD_S = 10.0
-TOP_N_BOTTLENECKS = 10
-PYTEST_DURATIONS_COUNT = 20
+TOP_N_BOTTLENECKS = 10           # Report top N bottlenecks
+PYTEST_DURATIONS_COUNT = 20      # Number of slow tests to collect
 LOG_EXTENSIONS = {".log", ".txt"}


@dataclass
 class Bottleneck:
-    category: str
-    name: str
-    duration_s: float
-    severity: str
-    recommendation: str
+    """A single performance bottleneck."""
+    category: str          # "test", "build", "ci", "artifact", "import"
+    name: str              # What's slow
+    duration_s: float      # How long it takes
+    severity: str          # "critical", "warning", "info"
+    recommendation: str    # How to fix
    file_path: Optional[str] = None
    line_number: Optional[int] = None


@dataclass
 class PerfReport:
+    """Full performance report."""
    timestamp: str
    repo_path: str
    bottlenecks: List[Bottleneck] = field(default_factory=list)
@@ -59,16 +63,23 @@ class PerfReport:
    ci_stats: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict:
-        return asdict(self)
+        d = asdict(self)
+        return d


+# ── Test Analysis ──────────────────────────────────────────────────
+
 def find_slow_tests_pytest(repo_path: str) -> List[Bottleneck]:
+    """Run pytest --durations and parse slow tests."""
    bottlenecks = []
+
+    # Try to run pytest with durations
    try:
-        subprocess.run(
+        result = subprocess.run(
            ["python3", "-m", "pytest", "--co", "-q", "--durations=0"],
            cwd=repo_path, capture_output=True, text=True, timeout=30
        )
+        # If tests exist, try to get durations from last run
        durations_file = os.path.join(repo_path, ".pytest_cache", "v", "cache", "durations")
        if os.path.exists(durations_file):
            with open(durations_file) as f:
@@ -81,45 +92,54 @@ def find_slow_tests_pytest(repo_path: str) -> List[Bottleneck]:
                            if duration > SLOW_TEST_THRESHOLD_S:
                                severity = "critical" if duration > 10 else "warning"
                                bottlenecks.append(Bottleneck(
-                                    category="test", name=test_name, duration_s=duration,
+                                    category="test",
+                                    name=test_name,
+                                    duration_s=duration,
                                    severity=severity,
-                                    recommendation=f"Test takes {duration:.1f}s. Consider mocking slow I/O."
+                                    recommendation=f"Test takes {duration:.1f}s. Consider mocking slow I/O, using fixtures, or marking with @pytest.mark.slow."
                                ))
                        except ValueError:
                            continue
    except (subprocess.TimeoutExpired, FileNotFoundError):
        pass
+
    return bottlenecks


 def find_slow_tests_by_scan(repo_path: str) -> List[Bottleneck]:
+    """Scan test files for patterns that indicate slow tests."""
    bottlenecks = []
    test_patterns = [
-        (r"time\.sleep\((\d+(?:\.\d+)?)\)", "Contains time.sleep() — consider using mock"),
+        (r"time\.sleep\((\d+(?:\.\d+)?)\)", "Contains time.sleep() — consider using mock or async wait"),
        (r"subprocess\.run\(.*timeout=(\d+)", "Subprocess with timeout — may block test"),
-        (r"requests\.(get|post|put|delete)\(", "Real HTTP call — mock with responses"),
-        (r"open\(.*[\'\"']w[\'\"']", "File I/O in test — use tmp_path fixture"),
+        (r"requests\.(get|post|put|delete)\(", "Real HTTP call — mock with responses or httpretty"),
+        (r"open\([^)]*['\"]w['\"]", "File I/O in test — use tmp_path fixture"),
    ]

    for root, dirs, files in os.walk(repo_path):
+        # Skip hidden and cache dirs
        dirs[:] = [d for d in dirs if not d.startswith(('.', '__pycache__', 'node_modules', '.git'))]
+
        for fname in files:
            if not (fname.startswith("test_") or fname.endswith("_test.py")):
                continue
            if not fname.endswith(".py"):
                continue
+
            fpath = os.path.join(root, fname)
            rel_path = os.path.relpath(fpath, repo_path)
+
            try:
                with open(fpath) as f:
                    lines = f.readlines()
            except (PermissionError, UnicodeDecodeError):
                continue
+
            for i, line in enumerate(lines):
                for pattern, recommendation in test_patterns:
                    match = re.search(pattern, line)
                    if match:
-                        duration = 1.0
+                        duration = 1.0  # Default estimate
                        if "sleep" in pattern:
                            try:
                                duration = float(match.group(1))
@@ -131,151 +151,227 @@ def find_slow_tests_by_scan(repo_path: str) -> List[Bottleneck]:
                            except (ValueError, IndexError):
                                duration = 10.0
                        else:
-                            duration = 2.0
+                            duration = 2.0  # Estimated
+
                        bottlenecks.append(Bottleneck(
-                            category="test", name=f"{rel_path}:{i+1}", duration_s=duration,
+                            category="test",
+                            name=f"{rel_path}:{i+1}",
+                            duration_s=duration,
                            severity="warning" if duration < 5 else "critical",
-                            recommendation=recommendation, file_path=rel_path, line_number=i + 1
+                            recommendation=recommendation,
+                            file_path=rel_path,
+                            line_number=i + 1
                        ))
+
    return bottlenecks


+# ── Build Analysis ─────────────────────────────────────────────────
+
 def analyze_build_artifacts(repo_path: str) -> List[Bottleneck]:
+    """Find large build artifacts that slow down builds."""
    bottlenecks = []
    large_dirs = {
-        "node_modules": "Consider npm ci --production",
-        "__pycache__": "Consider .gitignore and cleaning",
+        "node_modules": "Consider using npm ci --production or yarn --production",
+        "__pycache__": "Consider .gitignore and cleaning before builds",
        ".tox": "Consider caching tox environments",
        ".pytest_cache": "Consider cleaning between CI runs",
-        "dist": "Check if dist/ rebuilt unnecessarily",
-        "build": "Check if build/ rebuilt unnecessarily",
-        ".next": "Consider incremental builds",
-        "venv": "Move outside repo or use Docker",
+        "dist": "Check if dist/ artifacts are being rebuilt unnecessarily",
+        "build": "Check if build/ artifacts are being rebuilt unnecessarily",
+        ".next": "Next.js cache — consider incremental builds",
+        "venv": "Virtual env in repo — move outside or use Docker",
    }
+
    for dirname, recommendation in large_dirs.items():
        dirpath = os.path.join(repo_path, dirname)
        if os.path.isdir(dirpath):
            total_size = 0
            file_count = 0
-            for root, _, files in os.walk(dirpath):
+            for root, dirs, files in os.walk(dirpath):
                for f in files:
                    try:
-                        total_size += os.path.getsize(os.path.join(root, f))
+                        fpath = os.path.join(root, f)
+                        total_size += os.path.getsize(fpath)
                        file_count += 1
                    except OSError:
                        pass
-            if total_size > 10 * 1024 * 1024:
+
+            if total_size > 10 * 1024 * 1024:  # > 10MB
                size_mb = total_size / (1024 * 1024)
                bottlenecks.append(Bottleneck(
                    category="build",
                    name=f"{dirname}/ ({size_mb:.1f}MB, {file_count} files)",
-                    duration_s=size_mb * 0.5,
+                    duration_s=size_mb * 0.5,  # Rough estimate
                    severity="critical" if size_mb > 100 else "warning",
                    recommendation=recommendation
                ))
+
    return bottlenecks


 def analyze_makefile_targets(repo_path: str) -> List[Bottleneck]:
+    """Analyze Makefile for potentially slow targets."""
    bottlenecks = []
-    slow_patterns = [
-        (r"pip install", "pip install without --no-deps"),
-        (r"npm install(?!.*--production)", "npm install without --production"),
-        (r"docker build", "Docker build — consider layer caching"),
-        (r"pytest(?!.*-x|--maxfail)", "pytest without early exit"),
-    ]
+    makefiles = []
+
    for root, dirs, files in os.walk(repo_path):
        dirs[:] = [d for d in dirs if not d.startswith(('.', '__pycache__'))]
        for f in files:
            if f in ("Makefile", "makefile", "GNUmakefile"):
-                fpath = os.path.join(root, f)
-                rel_path = os.path.relpath(fpath, repo_path)
-                try:
-                    with open(fpath) as fh:
-                        content = fh.read()
-                except (PermissionError, UnicodeDecodeError):
-                    continue
-                for pattern, recommendation in slow_patterns:
-                    if re.search(pattern, content):
-                        bottlenecks.append(Bottleneck(
-                            category="build", name=f"{rel_path}: {pattern}",
-                            duration_s=5.0, severity="info",
-                            recommendation=recommendation, file_path=rel_path
-                        ))
+                makefiles.append(os.path.join(root, f))
+
+    slow_patterns = [
+        (r"pip install", "pip install without --no-deps or constraints"),
+        (r"npm install(?!.*--production)", "npm install without --production flag"),
+        (r"docker build", "Docker build — consider multi-stage and layer caching"),
+        (r"pytest(?!.*(?:-x|--maxfail))", "pytest without early exit on failure"),  # alternation grouped so either flag anywhere after "pytest" suppresses the hit
+        (r"mypy|mypy --strict", "Type checking — consider incremental mode"),
+    ]
+
+    for mfile in makefiles:
+        rel_path = os.path.relpath(mfile, repo_path)
+        try:
+            with open(mfile) as f:
+                content = f.read()
+        except (PermissionError, UnicodeDecodeError):
+            continue
+
+        for pattern, recommendation in slow_patterns:
+            if re.search(pattern, content):
+                bottlenecks.append(Bottleneck(
+                    category="build",
+                    name=f"{rel_path}: {pattern}",
+                    duration_s=5.0,
+                    severity="info",
+                    recommendation=recommendation,
+                    file_path=rel_path
+                ))
+
    return bottlenecks


+# ── CI Analysis ────────────────────────────────────────────────────
+
 def analyze_github_actions(repo_path: str) -> List[Bottleneck]:
+    """Analyze GitHub Actions workflow files for inefficiencies."""
    bottlenecks = []
    workflow_dir = os.path.join(repo_path, ".github", "workflows")
+
    if not os.path.isdir(workflow_dir):
        return bottlenecks
+
    slow_patterns = [
-        (r"runs-on:\s*ubuntu-latest", 0, "Consider caching dependencies"),
-        (r"npm install", 2, "Use npm ci instead"),
-        (r"pip install(?!.*--cache-dir)", 2, "Add --cache-dir"),
+        (r"runs-on:\s*ubuntu-latest", 0, "Consider caching dependencies between runs"),
+        (r"npm install", 2, "Use npm ci instead of npm install for reproducible builds"),
+        (r"pip install(?!.*--cache-dir)", 2, "Add --cache-dir or use pip cache action"),
        (r"docker build(?!.*--cache-from)", 5, "Use Docker layer caching"),
+        (r"python -m pytest(?!.*-n|--numprocesses)", 3, "Consider pytest-xdist for parallel test execution"),
    ]
+
    for fname in os.listdir(workflow_dir):
        if not fname.endswith(('.yml', '.yaml')):
            continue
+
        fpath = os.path.join(workflow_dir, fname)
        try:
            with open(fpath) as f:
                content = f.read()
        except (PermissionError, UnicodeDecodeError):
            continue
+
        for pattern, est_savings, recommendation in slow_patterns:
            if re.search(pattern, content):
                bottlenecks.append(Bottleneck(
-                    category="ci", name=f"{fname}: {pattern}",
-                    duration_s=est_savings, severity="info",
-                    recommendation=recommendation, file_path=f".github/workflows/{fname}"
+                    category="ci",
+                    name=f"{fname}: {pattern}",
+                    duration_s=est_savings,
+                    severity="info",
+                    recommendation=recommendation,
+                    file_path=f".github/workflows/{fname}"
                ))
+
    return bottlenecks


 def analyze_gitea_ci(repo_path: str) -> List[Bottleneck]:
+    """Analyze Gitea/Drone CI config files."""
    bottlenecks = []
-    ci_dir = os.path.join(repo_path, ".gitea", "workflows")
-    if os.path.isdir(ci_dir):
-        for fname in os.listdir(ci_dir):
-            if not fname.endswith(('.yml', '.yaml')):
-                continue
-            fpath = os.path.join(ci_dir, fname)
+    ci_files = [".gitea/workflows", ".drone.yml", ".woodpecker.yml"]
+
+    for ci_path in ci_files:
+        full_path = os.path.join(repo_path, ci_path)
+        if os.path.isfile(full_path):
            try:
-                with open(fpath) as f:
+                with open(full_path) as f:
                    content = f.read()
            except (PermissionError, UnicodeDecodeError):
                continue
+
            if "pip install" in content and "--cache-dir" not in content:
                bottlenecks.append(Bottleneck(
-                    category="ci", name=f".gitea/workflows/{fname}: pip without cache",
-                    duration_s=5.0, severity="warning",
+                    category="ci",
+                    name=f"{ci_path}: pip without cache",
+                    duration_s=5.0,
+                    severity="warning",
                    recommendation="Add --cache-dir or mount pip cache volume",
-                    file_path=f".gitea/workflows/{fname}"
+                    file_path=ci_path
                ))
+
+        elif os.path.isdir(full_path):
+            for fname in os.listdir(full_path):
+                if not fname.endswith(('.yml', '.yaml')):
+                    continue
+                fpath = os.path.join(full_path, fname)
+                try:
+                    with open(fpath) as f:
+                        content = f.read()
+                except (PermissionError, UnicodeDecodeError):
+                    continue
+
+                if "pip install" in content and "--cache-dir" not in content:
+                    bottlenecks.append(Bottleneck(
+                        category="ci",
+                        name=f"{ci_path}/{fname}: pip without cache",
+                        duration_s=5.0,
+                        severity="warning",
+                        recommendation="Add --cache-dir or mount pip cache volume",
+                        file_path=f"{ci_path}/{fname}"
+                    ))
+
    return bottlenecks


+# ── Import Analysis ────────────────────────────────────────────────
+
 def find_slow_imports(repo_path: str) -> List[Bottleneck]:
+    """Find Python files with heavy import chains."""
    bottlenecks = []
    heavy_imports = {
-        "pandas": 0.5, "numpy": 0.3, "torch": 2.0, "tensorflow": 3.0,
-        "scipy": 0.5, "matplotlib": 0.8, "sklearn": 0.5, "transformers": 1.5,
+        "pandas": 0.5,
+        "numpy": 0.3,
+        "torch": 2.0,
+        "tensorflow": 3.0,
+        "scipy": 0.5,
+        "matplotlib": 0.8,
+        "sklearn": 0.5,
+        "transformers": 1.5,
    }
+
    for root, dirs, files in os.walk(repo_path):
        dirs[:] = [d for d in dirs if not d.startswith(('.', '__pycache__', 'node_modules'))]
        for fname in files:
            if not fname.endswith(".py"):
                continue
+
            fpath = os.path.join(root, fname)
            rel_path = os.path.relpath(fpath, repo_path)
+
            try:
                with open(fpath) as f:
                    lines = f.readlines()
            except (PermissionError, UnicodeDecodeError):
                continue
+
            for i, line in enumerate(lines):
                stripped = line.strip()
                if stripped.startswith("import ") or stripped.startswith("from "):
@@ -286,40 +382,56 @@ def find_slow_imports(repo_path: str) -> List[Bottleneck]:
                                name=f"{rel_path}:{i+1}: import {heavy}",
                                duration_s=est_time,
                                severity="info" if est_time < 1.0 else "warning",
-                                recommendation=f"Heavy import ({heavy} ~{est_time}s). Consider lazy import.",
-                                file_path=rel_path, line_number=i + 1
+                                recommendation=f"Heavy import ({heavy} ~{est_time}s). Consider lazy import or conditional import.",
+                                file_path=rel_path,
+                                line_number=i + 1
                            ))
+
    return bottlenecks


+# ── Report Generation ──────────────────────────────────────────────
+
 def severity_sort_key(b: Bottleneck) -> Tuple[int, float]:
+    """Sort by severity then duration."""
    sev_order = {"critical": 0, "warning": 1, "info": 2}
    return (sev_order.get(b.severity, 3), -b.duration_s)


 def generate_report(repo_path: str) -> PerfReport:
+    """Run all analyses and generate a performance report."""
    report = PerfReport(
        timestamp=datetime.now(timezone.utc).isoformat(),
        repo_path=os.path.abspath(repo_path)
    )
+
+    # Collect all bottlenecks
    all_bottlenecks = []
+
    print("Scanning for slow tests (pytest cache)...")
    all_bottlenecks.extend(find_slow_tests_pytest(repo_path))
+
    print("Scanning for slow test patterns...")
    all_bottlenecks.extend(find_slow_tests_by_scan(repo_path))
+
    print("Analyzing build artifacts...")
    all_bottlenecks.extend(analyze_build_artifacts(repo_path))
+
    print("Analyzing Makefiles...")
    all_bottlenecks.extend(analyze_makefile_targets(repo_path))
+
    print("Analyzing CI workflows...")
    all_bottlenecks.extend(analyze_github_actions(repo_path))
    all_bottlenecks.extend(analyze_gitea_ci(repo_path))
+
    print("Scanning for heavy imports...")
    all_bottlenecks.extend(find_slow_imports(repo_path))

+    # Sort by severity and duration
    all_bottlenecks.sort(key=severity_sort_key)
-    report.bottlenecks = all_bottlenecks[:TOP_N_BOTTLENECKS * 2]
+    report.bottlenecks = all_bottlenecks[:TOP_N_BOTTLENECKS * 2]  # Keep more for stats

+    # Compute summary
    by_category = defaultdict(list)
    for b in all_bottlenecks:
        by_category[b.category].append(b)
@@ -332,53 +444,81 @@ def generate_report(repo_path: str) -> PerfReport:
        "estimated_total_slowdown_s": sum(b.duration_s for b in all_bottlenecks),
        "by_category": {cat: len(items) for cat, items in by_category.items()},
    }
+
    report.test_stats = {
        "slow_tests": len(by_category.get("test", [])),
        "total_estimated_s": sum(b.duration_s for b in by_category.get("test", [])),
    }
+
    report.build_stats = {
        "build_issues": len(by_category.get("build", [])),
        "total_estimated_s": sum(b.duration_s for b in by_category.get("build", [])),
    }
+
    report.ci_stats = {
        "ci_issues": len(by_category.get("ci", [])),
        "total_estimated_s": sum(b.duration_s for b in by_category.get("ci", [])),
    }
+
    return report


 def format_markdown(report: PerfReport) -> str:
+    """Format report as markdown."""
    lines = []
-    lines.append("# Performance Bottleneck Report
-")
+    lines.append(f"# Performance Bottleneck Report")
+    lines.append(f"")
    lines.append(f"Generated: {report.timestamp}")
-    lines.append(f"Repository: {report.repo_path}
-")
+    lines.append(f"Repository: {report.repo_path}")
+    lines.append(f"")
+
+    # Summary
    s = report.summary
-    lines.append("## Summary
-")
+    lines.append(f"## Summary")
+    lines.append(f"")
    lines.append(f"- **Total bottlenecks:** {s['total_bottlenecks']}")
    lines.append(f"- **Critical:** {s['critical']} | **Warning:** {s['warning']} | **Info:** {s['info']}")
-    lines.append(f"- **Estimated total slowdown:** {s['estimated_total_slowdown_s']:.1f}s
-")
-    lines.append(f"## Top {min(TOP_N_BOTTLENECKS, len(report.bottlenecks))} Bottlenecks
-")
+    lines.append(f"- **Estimated total slowdown:** {s['estimated_total_slowdown_s']:.1f}s")
+    lines.append(f"- **By category:** {', '.join(f'{k}: {v}' for k, v in s['by_category'].items())}")
+    lines.append(f"")
+
+    # Top bottlenecks
+    lines.append(f"## Top {min(TOP_N_BOTTLENECKS, len(report.bottlenecks))} Bottlenecks")
+    lines.append(f"")
+
    for i, b in enumerate(report.bottlenecks[:TOP_N_BOTTLENECKS], 1):
-        icon = {"critical": "CRIT", "warning": "WARN", "info": "INFO"}.get(b.severity, "?")
+        icon = {"critical": "🔴", "warning": "🟡", "info": "🔵"}.get(b.severity, "⚪")
        loc = f" ({b.file_path}:{b.line_number})" if b.file_path else ""
-        lines.append(f"{i}. [{icon}] **{b.category}** -- {b.name}{loc}")
-        lines.append(f"   Duration: ~{b.duration_s:.1f}s | {b.recommendation}
-")
+        lines.append(f"{i}. {icon} **{b.category}** — {b.name}{loc}")
+        lines.append(f"   - Duration: ~{b.duration_s:.1f}s | Severity: {b.severity}")
+        lines.append(f"   - Fix: {b.recommendation}")
+        lines.append(f"")
+
+    # Category breakdowns
+    for cat in ["test", "build", "ci", "import"]:
+        items = [b for b in report.bottlenecks if b.category == cat]
+        if items:
+            lines.append(f"## {cat.title()} Bottlenecks")
+            lines.append(f"")
+            for b in items:
+                icon = {"critical": "🔴", "warning": "🟡", "info": "🔵"}.get(b.severity, "⚪")
+                loc = f" ({b.file_path}:{b.line_number})" if b.file_path else ""
+                lines.append(f"- {icon} {b.name}{loc} — ~{b.duration_s:.1f}s — {b.recommendation}")
+            lines.append(f"")
+
    return "
 ".join(lines)


+# ── Main ───────────────────────────────────────────────────────────
+
 def main():
    parser = argparse.ArgumentParser(description="Performance Bottleneck Finder")
    parser.add_argument("--repo", default=".", help="Path to repository to analyze")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    parser.add_argument("--report", help="Write markdown report to file")
-    parser.add_argument("--threshold", type=float, default=SLOW_TEST_THRESHOLD_S)
+    parser.add_argument("--threshold", type=float, default=SLOW_TEST_THRESHOLD_S,
+                        help="Slow test threshold in seconds")
    args = parser.parse_args()

    global SLOW_TEST_THRESHOLD_S
@@ -402,6 +542,7 @@ def main():
        else:
            print(md)

+    # Exit code: 1 if critical bottlenecks found
    if report.summary.get("critical", 0) > 0:
        sys.exit(1)