257 lines
8.2 KiB
Python
257 lines
8.2 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
"""Flake detector for the Nexus test suite.
|
|||
|
|
|
|||
|
|
Reads pytest JSON reports (produced by pytest-json-report) and maintains a
|
|||
|
|
rolling history file at .test-history.json. After each update it prints a
|
|||
|
|
report of any test whose pass rate has dropped below the 95 % consistency
|
|||
|
|
threshold and exits non-zero if any flaky tests are found.
|
|||
|
|
|
|||
|
|
Usage
|
|||
|
|
-----
|
|||
|
|
Install pytest-json-report once::
|
|||
|
|
|
|||
|
|
pip install pytest-json-report
|
|||
|
|
|
|||
|
|
Then run tests with JSON output::
|
|||
|
|
|
|||
|
|
pytest --json-report --json-report-file=.test-report.json
|
|||
|
|
|
|||
|
|
Then call this script::
|
|||
|
|
|
|||
|
|
python scripts/flake_detector.py # uses .test-report.json + .test-history.json
|
|||
|
|
python scripts/flake_detector.py --report path/to/report.json
|
|||
|
|
python scripts/flake_detector.py --history path/to/history.json
|
|||
|
|
python scripts/flake_detector.py --threshold 0.90 # lower threshold for local dev
|
|||
|
|
|
|||
|
|
The script is also safe to call with no report file — it will just print the
|
|||
|
|
current history statistics without updating anything.
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
from __future__ import annotations
|
|||
|
|
|
|||
|
|
import argparse
|
|||
|
|
import json
|
|||
|
|
import sys
|
|||
|
|
from pathlib import Path
|
|||
|
|
from typing import TypedDict
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ---------------------------------------------------------------------------
|
|||
|
|
# Types
|
|||
|
|
# ---------------------------------------------------------------------------
|
|||
|
|
|
|||
|
|
class TestRecord(TypedDict):
    """Per-test rolling history."""

    # Incremented once for every reported outcome of this test, whatever it is.
    runs: int
    # Runs whose outcome was "passed".
    passes: int
    # Runs whose outcome was "failed" or "error".
    failures: int
    # Runs whose outcome was "skipped".
    skips: int
    last_outcome: str  # "passed" | "failed" | "skipped" | "error"
|
|||
|
|
|
|||
|
|
|
|||
|
|
class HistoryFile(TypedDict):
    """Top-level schema of the rolling history JSON file."""

    # Number of suite runs (report ingestions) recorded so far.
    total_runs: int
    # Per-test records keyed by pytest nodeid.
    tests: dict[str, TestRecord]
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ---------------------------------------------------------------------------
|
|||
|
|
# Constants
|
|||
|
|
# ---------------------------------------------------------------------------
|
|||
|
|
|
|||
|
|
# Default file locations; these match the Usage examples in the module docstring.
DEFAULT_REPORT = Path(".test-report.json")  # produced by pytest --json-report
DEFAULT_HISTORY = Path(".test-history.json")  # rolling history maintained by this script
DEFAULT_THRESHOLD = 0.95  # 95 % consistency required
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ---------------------------------------------------------------------------
|
|||
|
|
# Core helpers
|
|||
|
|
# ---------------------------------------------------------------------------
|
|||
|
|
|
|||
|
|
def load_history(history_path: Path) -> HistoryFile:
    """Load the rolling history file, tolerating a missing or corrupt file.

    Parameters
    ----------
    history_path:
        Location of the JSON history file.

    Returns
    -------
    HistoryFile
        The parsed history, or a fresh empty history when the file does not
        exist, cannot be read, contains invalid JSON, or is not a JSON object.
        The ``total_runs`` and ``tests`` keys are guaranteed present, since
        later code (``ingest_report``, ``find_flaky_tests``) indexes them
        directly.
    """
    if history_path.exists():
        try:
            with history_path.open() as fh:
                data = json.load(fh)
        except (json.JSONDecodeError, OSError):
            # A truncated or hand-mangled history must not abort flake
            # detection; start a fresh history instead of crashing.
            print(
                f"[flake-detector] Could not parse {history_path}; starting fresh.",
                file=sys.stderr,
            )
            return {"total_runs": 0, "tests": {}}
        if isinstance(data, dict):
            # Normalize older/partial files so callers can index safely.
            data.setdefault("total_runs", 0)
            data.setdefault("tests", {})
            return data
    return {"total_runs": 0, "tests": {}}
|
|||
|
|
|
|||
|
|
|
|||
|
|
def save_history(history: HistoryFile, history_path: Path) -> None:
    """Persist *history* to *history_path* as stable, human-readable JSON.

    Keys are sorted so successive saves produce diff-friendly output.
    """
    payload = json.dumps(history, indent=2, sort_keys=True)
    history_path.write_text(payload)
    print(f"[flake-detector] History saved → {history_path}", file=sys.stderr)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def ingest_report(report_path: Path, history: HistoryFile) -> int:
    """Merge a pytest JSON report into *history* (mutated in place).

    Parameters
    ----------
    report_path:
        Path to a report produced by ``pytest --json-report``.
    history:
        Rolling history dict; ``total_runs`` is incremented and per-test
        records are created/updated.

    Returns
    -------
    int
        Number of test entries processed from the report.

    Raises
    ------
    OSError / json.JSONDecodeError
        If the report file is missing or malformed (caller checks existence).
    """
    with report_path.open() as fh:
        report = json.load(fh)

    history["total_runs"] = history.get("total_runs", 0) + 1
    # Tolerate a history dict that is missing "tests" (e.g. loaded from an
    # older or hand-edited file) — previously this raised KeyError while
    # "total_runs" above was accessed defensively.
    tests_by_id = history.setdefault("tests", {})
    tests_section = report.get("tests", [])

    for test in tests_section:
        node_id: str = test.get("nodeid", "unknown")
        outcome: str = test.get("outcome", "unknown")

        rec: TestRecord = tests_by_id.setdefault(
            node_id,
            {"runs": 0, "passes": 0, "failures": 0, "skips": 0, "last_outcome": ""},
        )
        rec["runs"] += 1
        rec["last_outcome"] = outcome
        if outcome == "passed":
            rec["passes"] += 1
        elif outcome in ("failed", "error"):
            # "error" (setup/teardown crash) counts as a failure for
            # consistency purposes.
            rec["failures"] += 1
        elif outcome == "skipped":
            rec["skips"] += 1
        # Any other outcome still counts toward "runs" but no bucket.

    return len(tests_section)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def consistency(rec: TestRecord) -> float:
    """Rate at which this test reproduces its dominant (pass/fail) outcome.

    1.0 means perfectly consistent: the test always passes — or always
    fails (a stable-red test is broken, not flaky, and also scores 1.0).
    Lower values indicate flakiness: 9 passes out of 10 runs → 0.9.
    A record with zero runs is treated as perfectly consistent.

    Callers apply MIN_RUNS before judging; this function does not.

    NOTE(review): skipped runs count toward ``runs`` but toward neither
    passes nor failures, so frequent skipping lowers the score — confirm
    that is intended.
    """
    total = rec["runs"]
    if not total:
        return 1.0
    return max(rec["passes"], rec["failures"]) / total
|
|||
|
|
|
|||
|
|
|
|||
|
|
MIN_RUNS = 5  # need at least this many recorded runs before a test can be flagged flaky
|
|||
|
|
|
|||
|
|
|
|||
|
|
def find_flaky_tests(
    history: HistoryFile,
    threshold: float = DEFAULT_THRESHOLD,
) -> list[tuple[str, TestRecord, float]]:
    """Collect ``(node_id, record, consistency_rate)`` for tests below *threshold*.

    Tests with fewer than MIN_RUNS recorded runs are never judged.
    Results are ordered worst first (lowest consistency rate leading).
    """
    judged = (
        (node_id, rec, consistency(rec))
        for node_id, rec in history["tests"].items()
        if rec["runs"] >= MIN_RUNS
    )
    below_threshold = [entry for entry in judged if entry[2] < threshold]
    return sorted(below_threshold, key=lambda entry: entry[2])
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ---------------------------------------------------------------------------
|
|||
|
|
# Reporting
|
|||
|
|
# ---------------------------------------------------------------------------
|
|||
|
|
|
|||
|
|
def print_report(
    flaky: list[tuple[str, TestRecord, float]],
    history: HistoryFile,
    threshold: float,
) -> None:
    """Print a human-readable flake report for *flaky* to stdout.

    Shows overall suite statistics, then either a success banner or the
    list of flaky tests with per-test counters and quarantine instructions.
    """
    rule = "=" * 70
    total_runs = history.get("total_runs", 0)

    # Header block with suite-wide statistics.
    print(f"\n{rule}")
    print(" FLAKE DETECTOR REPORT")
    print(rule)
    print(f" Total suite runs tracked : {total_runs}")
    print(f" Total distinct tests : {len(history['tests'])}")
    print(f" Consistency threshold : {threshold:.0%}")
    print(f" Min runs before judging : {MIN_RUNS}")
    print(rule)

    if not flaky:
        print(" ✓ No flaky tests detected — all tests above consistency threshold.")
        print(f"{rule}\n")
        return

    print(f" ✗ {len(flaky)} FLAKY TEST(S) DETECTED:\n")
    for node_id, rec, rate in flaky:
        print(f" [{rate:.0%}] {node_id}")
        details = (
            f" runs={rec['runs']} passes={rec['passes']} "
            f"failures={rec['failures']} skips={rec['skips']} "
            f"last={rec['last_outcome']}"
        )
        print(details)
    print()

    for line in (
        " ACTION REQUIRED:",
        " 1. Move each flaky test to tests/quarantine/",
        " 2. File a tracking issue with [FLAKY] in the title",
        " 3. Add @pytest.mark.quarantine(reason='#NNN') to the test",
        " See docs/QUARANTINE_PROCESS.md for full instructions.",
    ):
        print(line)
    print(f"{rule}\n")
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ---------------------------------------------------------------------------
|
|||
|
|
# CLI
|
|||
|
|
# ---------------------------------------------------------------------------
|
|||
|
|
|
|||
|
|
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Build the CLI parser and parse *argv* (defaults to ``sys.argv[1:]``)."""
    parser = argparse.ArgumentParser(
        description="Detect flaky tests by analysing pytest JSON report history."
    )

    # The two path options share the same shape; declare them data-driven.
    path_options = (
        ("--report", DEFAULT_REPORT, "Path to pytest JSON report file"),
        ("--history", DEFAULT_HISTORY, "Path to rolling history JSON file"),
    )
    for flag, default, blurb in path_options:
        parser.add_argument(
            flag,
            type=Path,
            default=default,
            help=f"{blurb} (default: {default})",
        )

    parser.add_argument(
        "--threshold",
        type=float,
        default=DEFAULT_THRESHOLD,
        help=f"Consistency threshold 0–1 (default: {DEFAULT_THRESHOLD})",
    )
    parser.add_argument(
        "--no-update",
        action="store_true",
        default=False,
        help="Print current statistics without ingesting a new report",
    )
    return parser.parse_args(argv)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main(argv: list[str] | None = None) -> int:
    """CLI entry point.

    Unless ``--no-update`` was given, ingests the pytest JSON report into
    the rolling history and saves it; then prints the flake report.

    Returns 1 when any flaky test is found (for CI gating), else 0.
    """
    args = parse_args(argv)
    history = load_history(args.history)

    should_ingest = not args.no_update
    if should_ingest and not args.report.exists():
        print(
            f"[flake-detector] No report file at {args.report} — "
            "run pytest with --json-report first.",
            file=sys.stderr,
        )
        # A missing report is non-fatal; fall through and show current state.
        should_ingest = False
    if should_ingest:
        ingested = ingest_report(args.report, history)
        print(
            f"[flake-detector] Ingested {ingested} test results from {args.report}",
            file=sys.stderr,
        )
        save_history(history, args.history)

    flaky = find_flaky_tests(history, threshold=args.threshold)
    print_report(flaky, history, threshold=args.threshold)

    return 1 if flaky else 0
|
|||
|
|
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
    # Propagate main()'s exit code to the shell for CI gating.
    raise SystemExit(main())
|