Files
the-nexus/scripts/flake_detector.py
2026-04-07 14:38:49 +00:00

257 lines
8.2 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""Flake detector for the Nexus test suite.
Reads pytest JSON reports (produced by pytest-json-report) and maintains a
rolling history file at .test-history.json. After each update it prints a
report of any test whose pass rate has dropped below the 95 % consistency
threshold and exits non-zero if any flaky tests are found.
Usage
-----
Install pytest-json-report once::
pip install pytest-json-report
Then run tests with JSON output::
pytest --json-report --json-report-file=.test-report.json
Then call this script::
python scripts/flake_detector.py # uses .test-report.json + .test-history.json
python scripts/flake_detector.py --report path/to/report.json
python scripts/flake_detector.py --history path/to/history.json
python scripts/flake_detector.py --threshold 0.90 # lower threshold for local dev
The script is also safe to call with no report file — it will just print the
current history statistics without updating anything.
"""
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
from typing import TypedDict
# ---------------------------------------------------------------------------
# Types
# ---------------------------------------------------------------------------
class TestRecord(TypedDict):
    """Per-test rolling history."""

    # Counters accumulated across every ingested pytest JSON report.
    runs: int      # total times this test appeared in a report
    passes: int    # runs whose outcome was "passed"
    failures: int  # runs whose outcome was "failed" or "error"
    skips: int     # runs whose outcome was "skipped"
    last_outcome: str  # "passed" | "failed" | "skipped" | "error"
class HistoryFile(TypedDict):
    """Top-level schema of the rolling history JSON file (.test-history.json)."""

    total_runs: int  # number of suite runs ingested so far
    tests: dict[str, TestRecord]  # keyed by pytest node id
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
DEFAULT_REPORT = Path(".test-report.json")  # produced by pytest --json-report
DEFAULT_HISTORY = Path(".test-history.json")  # rolling history maintained by this script
DEFAULT_THRESHOLD = 0.95  # 95 % consistency required
# ---------------------------------------------------------------------------
# Core helpers
# ---------------------------------------------------------------------------
def load_history(history_path: Path) -> HistoryFile:
    """Load the rolling history file, or return a fresh empty history.

    The two required keys (``total_runs`` and ``tests``) are filled in if
    absent so that later code (e.g. ``ingest_report``) can index them
    without a KeyError on hand-edited or older history files.
    """
    if history_path.exists():
        with history_path.open() as fh:
            history: HistoryFile = json.load(fh)
        # Normalise: guarantee the schema keys exist.
        history.setdefault("total_runs", 0)
        history.setdefault("tests", {})
        return history
    return {"total_runs": 0, "tests": {}}
def save_history(history: HistoryFile, history_path: Path) -> None:
    """Persist *history* to *history_path* as pretty-printed, key-sorted JSON."""
    serialized = json.dumps(history, indent=2, sort_keys=True)
    with history_path.open("w") as fh:
        fh.write(serialized)
    print(f"[flake-detector] History saved → {history_path}", file=sys.stderr)
def ingest_report(report_path: Path, history: HistoryFile) -> int:
    """Merge a pytest JSON report into *history* in place.

    Increments the suite-level run counter and each test's per-outcome
    counters. Outcomes ``failed`` and ``error`` both count as failures;
    any unrecognised outcome only bumps the run count.

    Returns the number of test entries found in the report.
    """
    with report_path.open() as fh:
        report = json.load(fh)
    history["total_runs"] = history.get("total_runs", 0) + 1
    # setdefault tolerates minimal/legacy history dicts that lack "tests".
    tests = history.setdefault("tests", {})
    tests_section = report.get("tests", [])
    for test in tests_section:
        node_id: str = test.get("nodeid", "unknown")
        outcome: str = test.get("outcome", "unknown")
        rec: TestRecord = tests.setdefault(
            node_id,
            {"runs": 0, "passes": 0, "failures": 0, "skips": 0, "last_outcome": ""},
        )
        rec["runs"] += 1
        rec["last_outcome"] = outcome
        if outcome == "passed":
            rec["passes"] += 1
        elif outcome in ("failed", "error"):
            rec["failures"] += 1
        elif outcome == "skipped":
            rec["skips"] += 1
    return len(tests_section)
def consistency(rec: TestRecord) -> float:
    """Return how consistently this test produces its dominant outcome.

    Consistency is the fraction of *judged* runs (passes + failures; skips
    are excluded) that match the dominant outcome:

    * always passes   → 1.0 (stable green)
    * always fails    → 1.0 (stable red — broken, but not flaky)
    * passes 9 of 10  → 0.9 (flaky)

    Skipped runs are not evidence of flakiness, so they do not dilute the
    rate. A test with no judged runs (only ever skipped, or never seen)
    returns 1.0 — there is nothing to judge.
    """
    passes = rec["passes"]
    failures = rec["failures"]
    judged = passes + failures
    if judged == 0:
        return 1.0
    return max(passes, failures) / judged


MIN_RUNS = 5  # need at least this many runs before flagging
def find_flaky_tests(
    history: HistoryFile,
    threshold: float = DEFAULT_THRESHOLD,
) -> list[tuple[str, TestRecord, float]]:
    """Return (node_id, record, consistency_rate) for all tests below threshold."""
    candidates = (
        (node_id, rec, consistency(rec))
        for node_id, rec in history["tests"].items()
        if rec["runs"] >= MIN_RUNS  # too few runs: not judged yet
    )
    flagged = [entry for entry in candidates if entry[2] < threshold]
    # Lowest consistency (worst offenders) first.
    return sorted(flagged, key=lambda entry: entry[2])
# ---------------------------------------------------------------------------
# Reporting
# ---------------------------------------------------------------------------
def print_report(
    flaky: list[tuple[str, TestRecord, float]],
    history: HistoryFile,
    threshold: float,
) -> None:
    """Print the flake summary for *history* to stdout."""
    bar = "=" * 70
    # Header block: suite-wide statistics.
    for line in (
        f"\n{bar}",
        " FLAKE DETECTOR REPORT",
        bar,
        f" Total suite runs tracked : {history.get('total_runs', 0)}",
        f" Total distinct tests : {len(history['tests'])}",
        f" Consistency threshold : {threshold:.0%}",
        f" Min runs before judging : {MIN_RUNS}",
        bar,
    ):
        print(line)
    if not flaky:
        print(" ✓ No flaky tests detected — all tests above consistency threshold.")
        print(f"{bar}\n")
        return
    print(f"{len(flaky)} FLAKY TEST(S) DETECTED:\n")
    for node_id, rec, rate in flaky:
        print(f" [{rate:.0%}] {node_id}")
        stats = (
            f" runs={rec['runs']} passes={rec['passes']} "
            f"failures={rec['failures']} skips={rec['skips']} "
            f"last={rec['last_outcome']}"
        )
        print(stats)
        print()
    # Remediation instructions for whoever reads the CI log.
    print(" ACTION REQUIRED:")
    print(" 1. Move each flaky test to tests/quarantine/")
    print(" 2. File a tracking issue with [FLAKY] in the title")
    print(" 3. Add @pytest.mark.quarantine(reason='#NNN') to the test")
    print(" See docs/QUARANTINE_PROCESS.md for full instructions.")
    print(f"{bar}\n")
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse command-line arguments.

    Parameters
    ----------
    argv:
        Argument list to parse, or ``None`` to use ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description="Detect flaky tests by analysing pytest JSON report history."
    )
    parser.add_argument(
        "--report",
        type=Path,
        default=DEFAULT_REPORT,
        help=f"Path to pytest JSON report file (default: {DEFAULT_REPORT})",
    )
    parser.add_argument(
        "--history",
        type=Path,
        default=DEFAULT_HISTORY,
        help=f"Path to rolling history JSON file (default: {DEFAULT_HISTORY})",
    )
    parser.add_argument(
        "--threshold",
        type=float,
        default=DEFAULT_THRESHOLD,
        # The threshold is a fraction in [0, 1]; the original help text
        # read "threshold 01" (a dash lost to Unicode mangling).
        help=f"Consistency threshold 0-1 (default: {DEFAULT_THRESHOLD})",
    )
    parser.add_argument(
        "--no-update",
        action="store_true",
        default=False,
        help="Print current statistics without ingesting a new report",
    )
    return parser.parse_args(argv)
def main(argv: list[str] | None = None) -> int:
    """CLI entry point.

    Ingests the report (unless ``--no-update`` or the report file is
    missing), saves the updated history, and prints the flake report.
    Returns 1 if any flaky tests were found, else 0.
    """
    args = parse_args(argv)
    history = load_history(args.history)
    if not args.no_update:
        if not args.report.exists():
            # Not a fatal error; just print current state. The original
            # message concatenated two literals with no separator, producing
            # "…at .test-report.jsonrun pytest…" — add a separator.
            print(
                f"[flake-detector] No report file at {args.report} — "
                "run pytest with --json-report first.",
                file=sys.stderr,
            )
        else:
            n = ingest_report(args.report, history)
            print(
                f"[flake-detector] Ingested {n} test results from {args.report}",
                file=sys.stderr,
            )
            save_history(history, args.history)
    flaky = find_flaky_tests(history, threshold=args.threshold)
    print_report(flaky, history, threshold=args.threshold)
    return 1 if flaky else 0


if __name__ == "__main__":
    sys.exit(main())