[claude] Poka-yoke: make test skips/flakes impossible to ignore (#1094) (#1104)

2026-04-07 14:38:49 +00:00
parent d0d655b42a
commit caa7823cdd
6 changed files with 559 additions and 0 deletions
--- a/scripts/flake_detector.py
+++ b/scripts/flake_detector.py
@@ -0,0 +1,256 @@
+#!/usr/bin/env python3
+"""Flake detector for the Nexus test suite.
+
+Reads pytest JSON reports (produced by pytest-json-report) and maintains a
+rolling history file at .test-history.json.  After each update it prints a
+report of any test whose pass rate has dropped below the 95 % consistency
+threshold and exits non-zero if any flaky tests are found.
+
+Usage
+-----
+Install pytest-json-report once::
+
+    pip install pytest-json-report
+
+Then run tests with JSON output::
+
+    pytest --json-report --json-report-file=.test-report.json
+
+Then call this script::
+
+    python scripts/flake_detector.py            # uses .test-report.json + .test-history.json
+    python scripts/flake_detector.py --report path/to/report.json
+    python scripts/flake_detector.py --history path/to/history.json
+    python scripts/flake_detector.py --threshold 0.90   # lower threshold for local dev
+
+The script is also safe to call with no report file — it will just print the
+current history statistics without updating anything.  Pass ``--no-update`` to
+get the same read-only report even when a report file is present.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+from typing import TypedDict
+
+
+# ---------------------------------------------------------------------------
+# Types
+# ---------------------------------------------------------------------------
+
+# One value of the history file's "tests" mapping: running counters for a
+# single pytest node id, updated by ingest_report() for each report entry.
+class TestRecord(TypedDict):
+    """Per-test rolling history."""
+    runs: int  # number of report entries ingested for this test
+    passes: int  # entries whose outcome was "passed"
+    failures: int  # entries whose outcome was "failed" or "error"
+    skips: int  # entries whose outcome was "skipped"
+    # Outcome string copied verbatim from the most recently ingested entry;
+    # ingest_report() stores "unknown" when the entry has no outcome field.
+    last_outcome: str  # "passed" | "failed" | "skipped" | "error"
+
+
+# Top-level shape of the JSON document stored at the history path.
+class HistoryFile(TypedDict):
+    total_runs: int  # incremented once per ingested report
+    tests: dict[str, TestRecord]  # keyed by the "nodeid" of each report entry
+
+
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+
+# Defaults for the --report / --history / --threshold command-line options.
+# Both paths are relative, so they resolve against the current working directory.
+DEFAULT_REPORT = Path(".test-report.json")
+DEFAULT_HISTORY = Path(".test-history.json")
+DEFAULT_THRESHOLD = 0.95  # 95 % consistency required
+
+
+# ---------------------------------------------------------------------------
+# Core helpers
+# ---------------------------------------------------------------------------
+
+def load_history(history_path: Path) -> HistoryFile:
+    """Load the rolling history from *history_path*.
+
+    Returns a fresh, empty history when the file does not exist.  A file that
+    exists but lacks the ``total_runs`` or ``tests`` key (for example one
+    that was hand-edited down to ``{}``) is completed with empty defaults so
+    that callers can index both keys unconditionally.
+
+    Raises:
+        ValueError: if the file is not valid JSON (``json.JSONDecodeError`` is
+            a ``ValueError`` subclass) or its top-level value is not an object.
+    """
+    try:
+        # EAFP: open directly instead of exists()-then-open, which can race
+        # with another process removing the file in between.
+        with history_path.open(encoding="utf-8") as fh:
+            history = json.load(fh)
+    except FileNotFoundError:
+        return {"total_runs": 0, "tests": {}}
+
+    if not isinstance(history, dict):
+        raise ValueError(
+            f"{history_path}: expected a JSON object, got {type(history).__name__}"
+        )
+    history.setdefault("total_runs", 0)
+    history.setdefault("tests", {})
+    return history
+
+
+def save_history(history: HistoryFile, history_path: Path) -> None:
+    """Write *history* to *history_path* as indented, key-sorted JSON.
+
+    The data is first written to a sibling ``<name>.tmp`` file and then moved
+    over the destination with ``Path.replace``, so a crash in the middle of
+    the write does not truncate an existing history.  Missing parent
+    directories are created.  A confirmation line is printed to stderr.
+    """
+    history_path.parent.mkdir(parents=True, exist_ok=True)
+    tmp_path = history_path.with_name(history_path.name + ".tmp")
+    with tmp_path.open("w", encoding="utf-8") as fh:
+        json.dump(history, fh, indent=2, sort_keys=True)
+    tmp_path.replace(history_path)
+    print(f"[flake-detector] History saved → {history_path}", file=sys.stderr)
+
+
+def ingest_report(report_path: Path, history: HistoryFile) -> int:
+    """Merge a pytest JSON report into *history*.  Returns the number of tests updated.
+
+    *history* is modified in place: ``total_runs`` grows by one, and every
+    entry of the report's ``tests`` list bumps the matching record's ``runs``
+    counter, overwrites its ``last_outcome`` and, for passed / failed /
+    error / skipped outcomes, increments the corresponding tally.  Entries
+    with any other outcome only count as a run.  Missing ``total_runs`` or
+    ``tests`` keys in *history* are created, so an empty dict is an
+    acceptable starting point.
+    """
+    with report_path.open(encoding="utf-8") as fh:
+        report = json.load(fh)
+
+    history["total_runs"] = history.get("total_runs", 0) + 1
+    # Tolerate a history without "tests", mirroring the .get() used above.
+    records = history.setdefault("tests", {})
+    # A report with "tests": null (or no such key) contributes no results.
+    tests_section = report.get("tests") or []
+
+    for test in tests_section:
+        node_id: str = test.get("nodeid", "unknown")
+        outcome: str = test.get("outcome", "unknown")
+
+        rec: TestRecord = records.setdefault(
+            node_id,
+            {"runs": 0, "passes": 0, "failures": 0, "skips": 0, "last_outcome": ""},
+        )
+        rec["runs"] += 1
+        rec["last_outcome"] = outcome
+        if outcome == "passed":
+            rec["passes"] += 1
+        elif outcome in ("failed", "error"):
+            # Errors are tallied together with ordinary failures.
+            rec["failures"] += 1
+        elif outcome == "skipped":
+            rec["skips"] += 1
+
+    return len(tests_section)
+
+
+def consistency(rec: TestRecord) -> float:
+    """Return how often the test's pass/fail outcome matched its dominant outcome.
+
+    Only runs that ended in a pass or a failure are considered.  Skipped runs
+    (and any other outcome that feeds neither tally) carry no pass/fail
+    signal and are left out of the denominator, so a test that is always
+    skipped is not reported as flaky.
+
+    A test that always passes → 1.0 (stable green).
+    A test that always fails  → 1.0 (stable red — broken, not flaky).
+    A test that passes 9 out of 10 decided runs → 0.9 (flaky).
+    A test with no pass/fail outcome at all → 1.0 (nothing to judge).
+
+    No minimum number of runs is enforced here; callers apply that cut-off.
+    """
+    passes = rec["passes"]
+    failures = rec["failures"]
+    decided = passes + failures
+    if decided == 0:
+        return 1.0
+    return max(passes, failures) / decided
+
+
+# find_flaky_tests() ignores every record whose "runs" counter is below this.
+MIN_RUNS = 5  # need at least this many runs before flagging
+
+
+def find_flaky_tests(
+    history: HistoryFile,
+    threshold: float = DEFAULT_THRESHOLD,
+) -> list[tuple[str, TestRecord, float]]:
+    """Collect every sufficiently-exercised test whose consistency is below *threshold*.
+
+    Records with fewer than MIN_RUNS runs are ignored.  Each result is a
+    ``(node_id, record, consistency_rate)`` tuple, and the list is ordered by
+    ascending rate, i.e. the least consistent test comes first.
+    """
+    # Score each eligible record exactly once, lazily.
+    scored = (
+        (name, record, consistency(record))
+        for name, record in history["tests"].items()
+        if record["runs"] >= MIN_RUNS
+    )
+    return sorted(
+        (entry for entry in scored if entry[2] < threshold),
+        key=lambda entry: entry[2],
+    )
+
+
+# ---------------------------------------------------------------------------
+# Reporting
+# ---------------------------------------------------------------------------
+
+def print_report(
+    flaky: list[tuple[str, TestRecord, float]],
+    history: HistoryFile,
+    threshold: float,
+) -> None:
+    """Print the flake report to stdout.
+
+    A summary header (suite runs, distinct tests, threshold, MIN_RUNS) is
+    always printed.  It is followed either by an all-clear line or by one
+    entry per tuple in *flaky* and a fixed list of follow-up actions.
+    """
+    rule = "=" * 70
+
+    header = [
+        f"\n{rule}",
+        "  FLAKE DETECTOR REPORT",
+        rule,
+        f"  Total suite runs tracked : {history.get('total_runs', 0)}",
+        f"  Total distinct tests     : {len(history['tests'])}",
+        f"  Consistency threshold    : {threshold:.0%}",
+        f"  Min runs before judging  : {MIN_RUNS}",
+        rule,
+    ]
+    print("\n".join(header))
+
+    if not flaky:
+        print("  ✓ No flaky tests detected — all tests above consistency threshold.")
+        print(f"{rule}\n")
+        return
+
+    print(f"  ✗ {len(flaky)} FLAKY TEST(S) DETECTED:\n")
+    for node_id, rec, rate in flaky:
+        print(f"  [{rate:.0%}] {node_id}")
+        counters = "  ".join(
+            f"{field}={rec[field]}"
+            for field in ("runs", "passes", "failures", "skips")
+        )
+        # The trailing newline yields the blank separator line after each entry.
+        print(f"       {counters}  last={rec['last_outcome']}\n")
+
+    footer = [
+        "  ACTION REQUIRED:",
+        "  1. Move each flaky test to tests/quarantine/",
+        "  2. File a tracking issue with [FLAKY] in the title",
+        "  3. Add @pytest.mark.quarantine(reason='#NNN') to the test",
+        "  See docs/QUARANTINE_PROCESS.md for full instructions.",
+        f"{rule}\n",
+    ]
+    print("\n".join(footer))
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+def _unit_interval(text: str) -> float:
+    """argparse ``type=`` converter: parse *text* as a float within [0, 1]."""
+    try:
+        value = float(text)
+    except ValueError:
+        raise argparse.ArgumentTypeError(f"invalid float value: {text!r}") from None
+    # The chained comparison is False for NaN as well, so NaN is rejected too.
+    if not 0.0 <= value <= 1.0:
+        raise argparse.ArgumentTypeError(f"must be between 0 and 1, got {text!r}")
+    return value
+
+
+def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
+    """Parse the command line (``sys.argv[1:]`` when *argv* is ``None``).
+
+    The returned namespace has ``report`` and ``history`` (``Path``),
+    ``threshold`` (``float``, validated to lie in [0, 1]) and ``no_update``
+    (``bool``).  An out-of-range or non-numeric ``--threshold`` makes
+    argparse print a usage error and exit, instead of silently flagging
+    every test (e.g. ``--threshold 95``) or none.
+    """
+    parser = argparse.ArgumentParser(
+        description="Detect flaky tests by analysing pytest JSON report history."
+    )
+    parser.add_argument(
+        "--report",
+        type=Path,
+        default=DEFAULT_REPORT,
+        help=f"Path to pytest JSON report file (default: {DEFAULT_REPORT})",
+    )
+    parser.add_argument(
+        "--history",
+        type=Path,
+        default=DEFAULT_HISTORY,
+        help=f"Path to rolling history JSON file (default: {DEFAULT_HISTORY})",
+    )
+    parser.add_argument(
+        "--threshold",
+        type=_unit_interval,
+        default=DEFAULT_THRESHOLD,
+        help=f"Consistency threshold 0–1 (default: {DEFAULT_THRESHOLD})",
+    )
+    parser.add_argument(
+        "--no-update",
+        action="store_true",
+        default=False,
+        help="Print current statistics without ingesting a new report",
+    )
+    return parser.parse_args(argv)
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Run the detector and return the process exit status.
+
+    Loads the history, ingests the report and saves the history again (unless
+    ``--no-update`` was given or the report file is missing), prints the
+    report, and returns 1 when at least one flaky test was found, else 0.
+    """
+    args = parse_args(argv)
+    history = load_history(args.history)
+
+    wants_update = not args.no_update
+    if wants_update and args.report.exists():
+        ingested = ingest_report(args.report, history)
+        print(
+            f"[flake-detector] Ingested {ingested} test results from {args.report}",
+            file=sys.stderr,
+        )
+        save_history(history, args.history)
+    elif wants_update:
+        # A missing report is not fatal: fall through and report the
+        # statistics already on record.
+        print(
+            f"[flake-detector] No report file at {args.report} — "
+            "run pytest with --json-report first.",
+            file=sys.stderr,
+        )
+
+    flaky = find_flaky_tests(history, threshold=args.threshold)
+    print_report(flaky, history, threshold=args.threshold)
+
+    return int(bool(flaky))
+
+
+# Script entry point: main()'s return value becomes the process exit status.
+if __name__ == "__main__":
+    sys.exit(main())