#!/usr/bin/env python3
"""Flake detector for the Nexus test suite.

Reads pytest JSON reports (produced by pytest-json-report) and maintains a
rolling history file at .test-history.json.  After each update it prints a
report of any test whose consistency (the share of its pass/fail runs that
match its dominant outcome) has dropped below the 95 % threshold, and exits
non-zero if any flaky tests are found.

Usage
-----
Install pytest-json-report once::

    pip install pytest-json-report

Then run tests with JSON output::

    pytest --json-report --json-report-file=.test-report.json

Then call this script::

    python scripts/flake_detector.py          # uses .test-report.json + .test-history.json
    python scripts/flake_detector.py --report path/to/report.json
    python scripts/flake_detector.py --history path/to/history.json
    python scripts/flake_detector.py --threshold 0.90   # lower threshold for local dev
    python scripts/flake_detector.py --no-update        # report on the history only

The script is also safe to call with no report file — it will just print the
current history statistics without updating anything.
"""

from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path
from typing import TypedDict

# ---------------------------------------------------------------------------
# Types
# ---------------------------------------------------------------------------


class TestRecord(TypedDict):
    """Per-test rolling history."""

    runs: int  # every appearance in a report, whatever the outcome
    passes: int
    failures: int  # "failed" and "error" outcomes are both counted here
    skips: int
    # Outcome string of the most recent run, copied verbatim from the report
    # ("passed" | "failed" | "skipped" | "error", or whatever else it held).
    last_outcome: str


class HistoryFile(TypedDict):
    """Top-level layout of the rolling history JSON file."""

    total_runs: int  # number of reports ingested so far
    tests: dict[str, TestRecord]  # keyed by pytest node id


# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

DEFAULT_REPORT = Path(".test-report.json")
DEFAULT_HISTORY = Path(".test-history.json")
DEFAULT_THRESHOLD = 0.95  # 95 % consistency required
MIN_RUNS = 5  # need at least this many pass/fail runs before flagging


# ---------------------------------------------------------------------------
# Core helpers
# ---------------------------------------------------------------------------


def load_history(history_path: Path) -> HistoryFile:
    """Load the rolling history from *history_path*.

    Returns an empty history when the file does not exist.  Missing
    top-level keys are filled in with empty defaults so later code can index
    ``history["tests"]`` safely.

    Raises:
        json.JSONDecodeError: if the file is not valid JSON.  A corrupt
            history is reported rather than silently discarded.
        ValueError: if the file's top-level JSON value is not an object.
    """
    if not history_path.exists():
        return {"total_runs": 0, "tests": {}}

    with history_path.open(encoding="utf-8") as fh:
        history = json.load(fh)

    if not isinstance(history, dict):
        raise ValueError(
            f"history file {history_path} must contain a JSON object, "
            f"got {type(history).__name__}"
        )
    history.setdefault("total_runs", 0)
    history.setdefault("tests", {})
    return history


def save_history(history: HistoryFile, history_path: Path) -> None:
    """Write *history* to *history_path* as indented, key-sorted JSON."""
    with history_path.open("w", encoding="utf-8") as fh:
        json.dump(history, fh, indent=2, sort_keys=True)
    print(f"[flake-detector] History saved → {history_path}", file=sys.stderr)


def ingest_report(report_path: Path, history: HistoryFile) -> int:
    """Merge a pytest JSON report into *history*.

    *history* is updated in place: ``total_runs`` is incremented once and
    each entry in the report's ``tests`` list bumps the counters of the
    matching record, creating the record on first sight.

    Returns the number of tests updated.
    """
    with report_path.open(encoding="utf-8") as fh:
        report = json.load(fh)

    history["total_runs"] = history.get("total_runs", 0) + 1
    records = history.setdefault("tests", {})

    # ``or []`` also covers a report whose "tests" key is explicitly null.
    tests_section = report.get("tests") or []
    for test in tests_section:
        node_id: str = test.get("nodeid", "unknown")
        outcome: str = test.get("outcome", "unknown")
        rec: TestRecord = records.setdefault(
            node_id,
            {"runs": 0, "passes": 0, "failures": 0, "skips": 0, "last_outcome": ""},
        )
        rec["runs"] += 1
        rec["last_outcome"] = outcome
        if outcome == "passed":
            rec["passes"] += 1
        elif outcome in ("failed", "error"):
            rec["failures"] += 1
        elif outcome == "skipped":
            rec["skips"] += 1
        # Any other outcome only counts towards ``runs``.

    return len(tests_section)


def _decisive_runs(rec: TestRecord) -> int:
    """Return how many runs of *rec* ended in a pass or a failure."""
    return rec["passes"] + rec["failures"]


def consistency(rec: TestRecord) -> float:
    """Return how often the test's outcome matched its dominant outcome.

    Only decisive runs (passes and failures) are considered.  Skips and
    other outcomes say nothing about flakiness, so they are left out of the
    denominator.

    A test that always passes → 1.0 (stable green).
    A test that always fails  → 1.0 (stable red — broken, not flaky).
    A test that passes 9 out of 10 times → 0.9 (flaky).
    A test with no decisive runs at all → 1.0 (nothing to judge).

    The MIN_RUNS cut-off is applied by find_flaky_tests, not here.
    """
    decisive = _decisive_runs(rec)
    if decisive == 0:
        return 1.0
    dominant = max(rec["passes"], rec["failures"])
    return dominant / decisive


def find_flaky_tests(
    history: HistoryFile,
    threshold: float = DEFAULT_THRESHOLD,
) -> list[tuple[str, TestRecord, float]]:
    """Return (node_id, record, consistency_rate) for all tests below threshold.

    Tests with fewer than MIN_RUNS decisive (pass/fail) runs are not judged.
    The result is sorted by consistency rate, worst first.
    """
    flaky: list[tuple[str, TestRecord, float]] = []
    for node_id, rec in history["tests"].items():
        if _decisive_runs(rec) < MIN_RUNS:
            continue
        rate = consistency(rec)
        if rate < threshold:
            flaky.append((node_id, rec, rate))
    flaky.sort(key=lambda item: item[2])  # worst first
    return flaky


# ---------------------------------------------------------------------------
# Reporting
# ---------------------------------------------------------------------------


def print_report(
    flaky: list[tuple[str, TestRecord, float]],
    history: HistoryFile,
    threshold: float,
) -> None:
    """Print the human-readable flake report to stdout.

    Always prints the summary header; then either an all-clear line or one
    entry per flaky test followed by the quarantine instructions.
    """
    total_tests = len(history["tests"])
    total_runs = history.get("total_runs", 0)

    print(f"\n{'=' * 70}")
    print("  FLAKE DETECTOR REPORT")
    print(f"{'=' * 70}")
    print(f"  Total suite runs tracked : {total_runs}")
    print(f"  Total distinct tests     : {total_tests}")
    print(f"  Consistency threshold    : {threshold:.0%}")
    print(f"  Min runs before judging  : {MIN_RUNS}")
    print(f"{'=' * 70}")

    if not flaky:
        print("  ✓  No flaky tests detected — all tests above consistency threshold.")
        print(f"{'=' * 70}\n")
        return

    print(f"  ✗  {len(flaky)} FLAKY TEST(S) DETECTED:\n")
    for node_id, rec, rate in flaky:
        print(f"  [{rate:.0%}] {node_id}")
        print(
            f"         runs={rec['runs']}  passes={rec['passes']}  "
            f"failures={rec['failures']}  skips={rec['skips']}  "
            f"last={rec['last_outcome']}"
        )
    print()
    print("  ACTION REQUIRED:")
    print("    1. Move each flaky test to tests/quarantine/")
    print("    2. File a tracking issue with [FLAKY] in the title")
    print("    3. Add @pytest.mark.quarantine(reason='#NNN') to the test")
    print("  See docs/QUARANTINE_PROCESS.md for full instructions.")
    print(f"{'=' * 70}\n")


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------


def _threshold_arg(value: str) -> float:
    """Parse a ``--threshold`` value, rejecting anything outside 0–1."""
    try:
        threshold = float(value)
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"invalid threshold value: {value!r}"
        ) from None
    # The chained comparison is also False for NaN, so NaN is rejected too.
    if not 0.0 <= threshold <= 1.0:
        raise argparse.ArgumentTypeError(
            f"threshold must be between 0 and 1, got {value!r}"
        )
    return threshold


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse command-line arguments (``sys.argv[1:]`` when *argv* is None).

    Exits via argparse's usual error path when an argument is invalid,
    including a ``--threshold`` outside the 0–1 range.
    """
    parser = argparse.ArgumentParser(
        description="Detect flaky tests by analysing pytest JSON report history."
    )
    parser.add_argument(
        "--report",
        type=Path,
        default=DEFAULT_REPORT,
        help=f"Path to pytest JSON report file (default: {DEFAULT_REPORT})",
    )
    parser.add_argument(
        "--history",
        type=Path,
        default=DEFAULT_HISTORY,
        help=f"Path to rolling history JSON file (default: {DEFAULT_HISTORY})",
    )
    parser.add_argument(
        "--threshold",
        type=_threshold_arg,
        default=DEFAULT_THRESHOLD,
        help=f"Consistency threshold 0–1 (default: {DEFAULT_THRESHOLD})",
    )
    parser.add_argument(
        "--no-update",
        action="store_true",
        default=False,
        help="Print current statistics without ingesting a new report",
    )
    return parser.parse_args(argv)


def main(argv: list[str] | None = None) -> int:
    """Run the flake detector and return the process exit code.

    Unless ``--no-update`` is given, the report file (when present) is
    merged into the history and the history is saved.  The flake report is
    then printed.

    Returns 1 when at least one flaky test was found, otherwise 0.
    """
    args = parse_args(argv)

    history = load_history(args.history)

    if not args.no_update:
        if not args.report.exists():
            print(
                f"[flake-detector] No report file at {args.report} — "
                "run pytest with --json-report first.",
                file=sys.stderr,
            )
            # Not a fatal error; just print current state.
        else:
            n = ingest_report(args.report, history)
            print(
                f"[flake-detector] Ingested {n} test results from {args.report}",
                file=sys.stderr,
            )
            save_history(history, args.history)

    flaky = find_flaky_tests(history, threshold=args.threshold)
    print_report(flaky, history, threshold=args.threshold)

    return 1 if flaky else 0


if __name__ == "__main__":
    sys.exit(main())