257 lines
8.2 KiB
Python
257 lines
8.2 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
"""Flake detector for the Nexus test suite.
|
|||
|
|
|
|||
|
|
Reads pytest JSON reports (produced by pytest-json-report) and maintains a
|
|||
|
|
rolling history file at .test-history.json. After each update it prints a
|
|||
|
|
report of any test whose pass rate has dropped below the 95 % consistency
|
|||
|
|
threshold and exits non-zero if any flaky tests are found.
|
|||
|
|
|
|||
|
|
Usage
|
|||
|
|
-----
|
|||
|
|
Install pytest-json-report once::
|
|||
|
|
|
|||
|
|
pip install pytest-json-report
|
|||
|
|
|
|||
|
|
Then run tests with JSON output::
|
|||
|
|
|
|||
|
|
pytest --json-report --json-report-file=.test-report.json
|
|||
|
|
|
|||
|
|
Then call this script::
|
|||
|
|
|
|||
|
|
python scripts/flake_detector.py # uses .test-report.json + .test-history.json
|
|||
|
|
python scripts/flake_detector.py --report path/to/report.json
|
|||
|
|
python scripts/flake_detector.py --history path/to/history.json
|
|||
|
|
python scripts/flake_detector.py --threshold 0.90 # lower threshold for local dev
|
|||
|
|
|
|||
|
|
The script is also safe to call with no report file — it will just print the
|
|||
|
|
current history statistics without updating anything.
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
from __future__ import annotations
|
|||
|
|
|
|||
|
|
import argparse
|
|||
|
|
import json
|
|||
|
|
import sys
|
|||
|
|
from pathlib import Path
|
|||
|
|
from typing import TypedDict
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ---------------------------------------------------------------------------
|
|||
|
|
# Types
|
|||
|
|
# ---------------------------------------------------------------------------
|
|||
|
|
|
|||
|
|
class TestRecord(TypedDict):
    """Per-test rolling history."""

    # Incremented once for every reported outcome of this test, whatever it is.
    runs: int
    # Runs whose outcome was "passed".
    passes: int
    # Runs whose outcome was "failed" or "error".
    failures: int
    # Runs whose outcome was "skipped".
    skips: int
    last_outcome: str  # "passed" | "failed" | "skipped" | "error"
|
|||
|
|
|
|||
|
|
|
|||
|
|
class HistoryFile(TypedDict):
    """Top-level schema of the rolling history JSON file."""

    # Number of suite runs (report ingestions) recorded so far.
    total_runs: int
    # Per-test records keyed by pytest nodeid.
    tests: dict[str, TestRecord]
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ---------------------------------------------------------------------------
|
|||
|
|
# Constants
|
|||
|
|
# ---------------------------------------------------------------------------
|
|||
|
|
|
|||
|
|
# Default file locations; these match the Usage examples in the module docstring.
DEFAULT_REPORT = Path(".test-report.json")  # produced by pytest --json-report
DEFAULT_HISTORY = Path(".test-history.json")  # rolling history maintained by this script
DEFAULT_THRESHOLD = 0.95  # 95 % consistency required
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ---------------------------------------------------------------------------
|
|||
|
|
# Core helpers
|
|||
|
|
# ---------------------------------------------------------------------------
|
|||
|
|
|
|||
|
|
def load_history(history_path: Path) -> HistoryFile:
    """Load the rolling history file, tolerating a missing or corrupt file.

    Parameters
    ----------
    history_path:
        Location of the JSON history file.

    Returns
    -------
    HistoryFile
        The parsed history, or a fresh empty history when the file does not
        exist, cannot be read, contains invalid JSON, or is not a JSON object.
        The ``total_runs`` and ``tests`` keys are guaranteed present, since
        later code (``ingest_report``, ``find_flaky_tests``) indexes them
        directly.
    """
    if history_path.exists():
        try:
            with history_path.open() as fh:
                data = json.load(fh)
        except (json.JSONDecodeError, OSError):
            # A truncated or hand-mangled history must not abort flake
            # detection; start a fresh history instead of crashing.
            print(
                f"[flake-detector] Could not parse {history_path}; starting fresh.",
                file=sys.stderr,
            )
            return {"total_runs": 0, "tests": {}}
        if isinstance(data, dict):
            # Normalize older/partial files so callers can index safely.
            data.setdefault("total_runs", 0)
            data.setdefault("tests", {})
            return data
    return {"total_runs": 0, "tests": {}}
|
|||
|
|
|
|||
|
|
|
|||
|
|
def save_history(history: HistoryFile, history_path: Path) -> None:
    """Persist *history* to *history_path* as stable, human-readable JSON.

    Keys are sorted so successive saves produce diff-friendly output.
    """
    payload = json.dumps(history, indent=2, sort_keys=True)
    history_path.write_text(payload)
    print(f"[flake-detector] History saved → {history_path}", file=sys.stderr)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def ingest_report(report_path: Path, history: HistoryFile) -> int:
    """Merge a pytest JSON report into *history* (mutated in place).

    Parameters
    ----------
    report_path:
        Path to a report produced by ``pytest --json-report``.
    history:
        Rolling history dict; ``total_runs`` is incremented and per-test
        records are created/updated.

    Returns
    -------
    int
        Number of test entries processed from the report.

    Raises
    ------
    OSError / json.JSONDecodeError
        If the report file is missing or malformed (caller checks existence).
    """
    with report_path.open() as fh:
        report = json.load(fh)

    history["total_runs"] = history.get("total_runs", 0) + 1
    # Tolerate a history dict that is missing "tests" (e.g. loaded from an
    # older or hand-edited file) — previously this raised KeyError while
    # "total_runs" above was accessed defensively.
    tests_by_id = history.setdefault("tests", {})
    tests_section = report.get("tests", [])

    for test in tests_section:
        node_id: str = test.get("nodeid", "unknown")
        outcome: str = test.get("outcome", "unknown")

        rec: TestRecord = tests_by_id.setdefault(
            node_id,
            {"runs": 0, "passes": 0, "failures": 0, "skips": 0, "last_outcome": ""},
        )
        rec["runs"] += 1
        rec["last_outcome"] = outcome
        if outcome == "passed":
            rec["passes"] += 1
        elif outcome in ("failed", "error"):
            # "error" (setup/teardown crash) counts as a failure for
            # consistency purposes.
            rec["failures"] += 1
        elif outcome == "skipped":
            rec["skips"] += 1
        # Any other outcome still counts toward "runs" but no bucket.

    return len(tests_section)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def consistency(rec: TestRecord) -> float:
    """Rate at which this test reproduces its dominant (pass/fail) outcome.

    1.0 means perfectly consistent: the test always passes — or always
    fails (a stable-red test is broken, not flaky, and also scores 1.0).
    Lower values indicate flakiness: 9 passes out of 10 runs → 0.9.
    A record with zero runs is treated as perfectly consistent.

    Callers apply MIN_RUNS before judging; this function does not.

    NOTE(review): skipped runs count toward ``runs`` but toward neither
    passes nor failures, so frequent skipping lowers the score — confirm
    that is intended.
    """
    total = rec["runs"]
    if not total:
        return 1.0
    return max(rec["passes"], rec["failures"]) / total
|
|||
|
|
|
|||
|
|
|
|||
|
|
MIN_RUNS = 5  # need at least this many recorded runs before a test can be flagged flaky
|
|||
|
|
|
|||
|
|
|
|||
|
|
def find_flaky_tests(
    history: HistoryFile,
    threshold: float = DEFAULT_THRESHOLD,
) -> list[tuple[str, TestRecord, float]]:
    """Collect ``(node_id, record, consistency_rate)`` for tests below *threshold*.

    Tests with fewer than MIN_RUNS recorded runs are never judged.
    Results are ordered worst first (lowest consistency rate leading).
    """
    judged = (
        (node_id, rec, consistency(rec))
        for node_id, rec in history["tests"].items()
        if rec["runs"] >= MIN_RUNS
    )
    below_threshold = [entry for entry in judged if entry[2] < threshold]
    return sorted(below_threshold, key=lambda entry: entry[2])
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ---------------------------------------------------------------------------
|
|||
|
|
# Reporting
|
|||
|
|
# ---------------------------------------------------------------------------
|
|||
|
|
|
|||
|
|
def print_report(
    flaky: list[tuple[str, TestRecord, float]],
    history: HistoryFile,
    threshold: float,
) -> None:
    """Print a human-readable flake report for *flaky* to stdout.

    Shows overall suite statistics, then either a success banner or the
    list of flaky tests with per-test counters and quarantine instructions.
    """
    rule = "=" * 70
    total_runs = history.get("total_runs", 0)

    # Header block with suite-wide statistics.
    print(f"\n{rule}")
    print(" FLAKE DETECTOR REPORT")
    print(rule)
    print(f" Total suite runs tracked : {total_runs}")
    print(f" Total distinct tests : {len(history['tests'])}")
    print(f" Consistency threshold : {threshold:.0%}")
    print(f" Min runs before judging : {MIN_RUNS}")
    print(rule)

    if not flaky:
        print(" ✓ No flaky tests detected — all tests above consistency threshold.")
        print(f"{rule}\n")
        return

    print(f" ✗ {len(flaky)} FLAKY TEST(S) DETECTED:\n")
    for node_id, rec, rate in flaky:
        print(f" [{rate:.0%}] {node_id}")
        details = (
            f" runs={rec['runs']} passes={rec['passes']} "
            f"failures={rec['failures']} skips={rec['skips']} "
            f"last={rec['last_outcome']}"
        )
        print(details)
    print()

    for line in (
        " ACTION REQUIRED:",
        " 1. Move each flaky test to tests/quarantine/",
        " 2. File a tracking issue with [FLAKY] in the title",
        " 3. Add @pytest.mark.quarantine(reason='#NNN') to the test",
        " See docs/QUARANTINE_PROCESS.md for full instructions.",
    ):
        print(line)
    print(f"{rule}\n")
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ---------------------------------------------------------------------------
|
|||
|
|
# CLI
|
|||
|
|
# ---------------------------------------------------------------------------
|
|||
|
|
|
|||
|
|
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Build the CLI parser and parse *argv* (defaults to ``sys.argv[1:]``)."""
    parser = argparse.ArgumentParser(
        description="Detect flaky tests by analysing pytest JSON report history."
    )

    # The two path options share the same shape; declare them data-driven.
    path_options = (
        ("--report", DEFAULT_REPORT, "Path to pytest JSON report file"),
        ("--history", DEFAULT_HISTORY, "Path to rolling history JSON file"),
    )
    for flag, default, blurb in path_options:
        parser.add_argument(
            flag,
            type=Path,
            default=default,
            help=f"{blurb} (default: {default})",
        )

    parser.add_argument(
        "--threshold",
        type=float,
        default=DEFAULT_THRESHOLD,
        help=f"Consistency threshold 0–1 (default: {DEFAULT_THRESHOLD})",
    )
    parser.add_argument(
        "--no-update",
        action="store_true",
        default=False,
        help="Print current statistics without ingesting a new report",
    )
    return parser.parse_args(argv)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main(argv: list[str] | None = None) -> int:
    """CLI entry point.

    Unless ``--no-update`` was given, ingests the pytest JSON report into
    the rolling history and saves it; then prints the flake report.

    Returns 1 when any flaky test is found (for CI gating), else 0.
    """
    args = parse_args(argv)
    history = load_history(args.history)

    should_ingest = not args.no_update
    if should_ingest and not args.report.exists():
        print(
            f"[flake-detector] No report file at {args.report} — "
            "run pytest with --json-report first.",
            file=sys.stderr,
        )
        # A missing report is non-fatal; fall through and show current state.
        should_ingest = False
    if should_ingest:
        ingested = ingest_report(args.report, history)
        print(
            f"[flake-detector] Ingested {ingested} test results from {args.report}",
            file=sys.stderr,
        )
        save_history(history, args.history)

    flaky = find_flaky_tests(history, threshold=args.threshold)
    print_report(flaky, history, threshold=args.threshold)

    return 1 if flaky else 0
|
|||
|
|
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
    # Propagate main()'s exit code to the shell for CI gating.
    raise SystemExit(main())
|