"""Comprehensive tests for spark.eidos module. Covers: - _get_conn (schema creation, WAL, busy timeout) - predict_task_outcome (baseline, with history, edge cases) - evaluate_prediction (correct, wrong, missing, double-eval) - _compute_accuracy (all components, edge cases) - get_predictions (filters: task_id, evaluated_only, limit) - get_accuracy_stats (empty, after evaluations) """ import pytest from spark.eidos import ( Prediction, _compute_accuracy, evaluate_prediction, get_accuracy_stats, get_predictions, predict_task_outcome, ) # ── Prediction dataclass ────────────────────────────────────────────────── class TestPredictionDataclass: def test_defaults(self): p = Prediction( id="1", task_id="t1", prediction_type="outcome", predicted_value="{}", actual_value=None, accuracy=None, created_at="2026-01-01", evaluated_at=None, ) assert p.actual_value is None assert p.accuracy is None # ── predict_task_outcome ────────────────────────────────────────────────── class TestPredictTaskOutcome: def test_baseline_no_history(self): result = predict_task_outcome("t-base", "Do stuff", ["a1", "a2"]) assert result["likely_winner"] == "a1" assert result["success_probability"] == 0.7 assert result["estimated_bid_range"] == [20, 80] assert "baseline" in result["reasoning"] assert "prediction_id" in result def test_empty_candidates(self): result = predict_task_outcome("t-empty", "Nothing", []) assert result["likely_winner"] is None def test_history_selects_best_agent(self): history = { "a1": {"success_rate": 0.3, "avg_winning_bid": 40}, "a2": {"success_rate": 0.95, "avg_winning_bid": 50}, } result = predict_task_outcome("t-hist", "Task", ["a1", "a2"], agent_history=history) assert result["likely_winner"] == "a2" assert result["success_probability"] > 0.7 def test_history_agent_not_in_candidates_ignored(self): history = { "a-outside": {"success_rate": 0.99, "avg_winning_bid": 10}, } result = predict_task_outcome("t-out", "Task", ["a1"], agent_history=history) # a-outside not in candidates, so falls back to baseline assert result["likely_winner"] == "a1" def test_history_adjusts_bid_range(self): history = { "a1": {"success_rate": 0.5, "avg_winning_bid": 100}, "a2": {"success_rate": 0.8, "avg_winning_bid": 200}, } result = predict_task_outcome("t-bid", "Task", ["a1", "a2"], agent_history=history) low, high = result["estimated_bid_range"] assert low == max(1, int(100 * 0.8)) assert high == int(200 * 1.2) def test_history_with_zero_avg_bid_skipped(self): history = { "a1": {"success_rate": 0.8, "avg_winning_bid": 0}, } result = predict_task_outcome("t-zero-bid", "Task", ["a1"], agent_history=history) # Zero avg_winning_bid should be skipped, keep default range assert result["estimated_bid_range"] == [20, 80] def test_prediction_stored_in_db(self): result = predict_task_outcome("t-db", "Store me", ["a1"]) preds = get_predictions(task_id="t-db") assert len(preds) == 1 assert preds[0].id == result["prediction_id"] assert preds[0].prediction_type == "outcome" def test_success_probability_clamped(self): history = { "a1": {"success_rate": 1.5, "avg_winning_bid": 50}, } result = predict_task_outcome("t-clamp", "Task", ["a1"], agent_history=history) assert result["success_probability"] <= 1.0 # ── evaluate_prediction ─────────────────────────────────────────────────── class TestEvaluatePrediction: def test_correct_prediction(self): predict_task_outcome("t-eval-ok", "Task", ["a1"]) result = evaluate_prediction("t-eval-ok", "a1", task_succeeded=True, winning_bid=30) assert result is not None assert 0.0 <= 
result["accuracy"] <= 1.0 assert result["actual"]["winner"] == "a1" assert result["actual"]["succeeded"] is True def test_wrong_prediction(self): predict_task_outcome("t-eval-wrong", "Task", ["a1"]) result = evaluate_prediction("t-eval-wrong", "a2", task_succeeded=False) assert result is not None assert result["accuracy"] < 1.0 def test_no_prediction_returns_none(self): result = evaluate_prediction("nonexistent", "a1", task_succeeded=True) assert result is None def test_double_evaluation_returns_none(self): predict_task_outcome("t-double", "Task", ["a1"]) evaluate_prediction("t-double", "a1", task_succeeded=True) result = evaluate_prediction("t-double", "a1", task_succeeded=True) assert result is None def test_evaluation_updates_db(self): predict_task_outcome("t-upd", "Task", ["a1"]) evaluate_prediction("t-upd", "a1", task_succeeded=True, winning_bid=50) preds = get_predictions(task_id="t-upd", evaluated_only=True) assert len(preds) == 1 assert preds[0].accuracy is not None assert preds[0].actual_value is not None assert preds[0].evaluated_at is not None def test_winning_bid_none(self): predict_task_outcome("t-nobid", "Task", ["a1"]) result = evaluate_prediction("t-nobid", "a1", task_succeeded=True) assert result is not None assert result["actual"]["winning_bid"] is None # ── _compute_accuracy ───────────────────────────────────────────────────── class TestComputeAccuracy: def test_perfect_match(self): predicted = { "likely_winner": "a1", "success_probability": 1.0, "estimated_bid_range": [20, 40], } actual = {"winner": "a1", "succeeded": True, "winning_bid": 30} assert _compute_accuracy(predicted, actual) == pytest.approx(1.0, abs=0.01) def test_all_wrong(self): predicted = { "likely_winner": "a1", "success_probability": 1.0, "estimated_bid_range": [10, 20], } actual = {"winner": "a2", "succeeded": False, "winning_bid": 100} assert _compute_accuracy(predicted, actual) < 0.3 def test_no_winner_in_predicted(self): predicted = {"success_probability": 0.5, "estimated_bid_range": [20, 40]} actual = {"winner": "a1", "succeeded": True, "winning_bid": 30} acc = _compute_accuracy(predicted, actual) # Winner component skipped, success + bid counted assert 0.0 <= acc <= 1.0 def test_no_winner_in_actual(self): predicted = {"likely_winner": "a1", "success_probability": 0.5} actual = {"succeeded": True} acc = _compute_accuracy(predicted, actual) assert 0.0 <= acc <= 1.0 def test_bid_outside_range_partial_credit(self): predicted = { "likely_winner": "a1", "success_probability": 1.0, "estimated_bid_range": [20, 40], } # Bid just outside range actual = {"winner": "a1", "succeeded": True, "winning_bid": 45} acc = _compute_accuracy(predicted, actual) assert 0.5 < acc < 1.0 def test_bid_far_outside_range(self): predicted = { "likely_winner": "a1", "success_probability": 1.0, "estimated_bid_range": [20, 40], } actual = {"winner": "a1", "succeeded": True, "winning_bid": 500} acc = _compute_accuracy(predicted, actual) assert acc < 1.0 def test_no_actual_bid(self): predicted = { "likely_winner": "a1", "success_probability": 0.7, "estimated_bid_range": [20, 40], } actual = {"winner": "a1", "succeeded": True, "winning_bid": None} acc = _compute_accuracy(predicted, actual) # Bid component skipped — only winner + success assert 0.0 <= acc <= 1.0 def test_failed_prediction_low_probability(self): predicted = {"success_probability": 0.1} actual = {"succeeded": False} acc = _compute_accuracy(predicted, actual) # Predicted low success and task failed → high accuracy assert acc > 0.8 # ── get_predictions 

# ── get_predictions ───────────────────────────────────────────────────────


class TestGetPredictions:
    def test_empty_db(self):
        assert get_predictions() == []

    def test_filter_by_task_id(self):
        predict_task_outcome("t-filter1", "A", ["a1"])
        predict_task_outcome("t-filter2", "B", ["a2"])
        preds = get_predictions(task_id="t-filter1")
        assert len(preds) == 1
        assert preds[0].task_id == "t-filter1"

    def test_evaluated_only(self):
        predict_task_outcome("t-eo1", "A", ["a1"])
        predict_task_outcome("t-eo2", "B", ["a1"])
        evaluate_prediction("t-eo1", "a1", task_succeeded=True)
        preds = get_predictions(evaluated_only=True)
        assert len(preds) == 1
        assert preds[0].task_id == "t-eo1"

    def test_limit(self):
        for i in range(10):
            predict_task_outcome(f"t-lim{i}", "X", ["a1"])
        preds = get_predictions(limit=3)
        assert len(preds) == 3

    def test_combined_filters(self):
        predict_task_outcome("t-combo", "A", ["a1"])
        evaluate_prediction("t-combo", "a1", task_succeeded=True)
        predict_task_outcome("t-combo2", "B", ["a1"])
        preds = get_predictions(task_id="t-combo", evaluated_only=True)
        assert len(preds) == 1

    def test_order_by_created_desc(self):
        for i in range(3):
            predict_task_outcome(f"t-ord{i}", f"Task {i}", ["a1"])
        preds = get_predictions()
        # Most recent first
        assert preds[0].task_id == "t-ord2"


# ── get_accuracy_stats ────────────────────────────────────────────────────


class TestGetAccuracyStats:
    def test_empty(self):
        stats = get_accuracy_stats()
        assert stats["total_predictions"] == 0
        assert stats["evaluated"] == 0
        assert stats["pending"] == 0
        assert stats["avg_accuracy"] == 0.0
        assert stats["min_accuracy"] == 0.0
        assert stats["max_accuracy"] == 0.0

    def test_with_unevaluated(self):
        predict_task_outcome("t-uneval", "X", ["a1"])
        stats = get_accuracy_stats()
        assert stats["total_predictions"] == 1
        assert stats["evaluated"] == 0
        assert stats["pending"] == 1

    def test_with_evaluations(self):
        for i in range(3):
            predict_task_outcome(f"t-stats{i}", "X", ["a1"])
            evaluate_prediction(f"t-stats{i}", "a1", task_succeeded=True, winning_bid=30)
        stats = get_accuracy_stats()
        assert stats["total_predictions"] == 3
        assert stats["evaluated"] == 3
        assert stats["pending"] == 0
        assert stats["avg_accuracy"] > 0.0
        assert stats["min_accuracy"] <= stats["avg_accuracy"] <= stats["max_accuracy"]
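
# Hedged end-to-end check (an addition beyond the docstring's list): with one
# on-target and one off-target evaluation, min and max accuracy should
# diverge. Relies only on behavior exercised above (baseline 0.7 probability,
# [20, 80] bid range, wrong predictions scoring below correct ones).


class TestAccuracySpread:
    def test_mixed_outcomes_spread_min_max(self):
        predict_task_outcome("t-mix-hit", "X", ["a1"])
        evaluate_prediction("t-mix-hit", "a1", task_succeeded=True, winning_bid=30)
        predict_task_outcome("t-mix-miss", "X", ["a1"])
        evaluate_prediction("t-mix-miss", "other", task_succeeded=False, winning_bid=999)
        stats = get_accuracy_stats()
        assert stats["evaluated"] == 2
        assert stats["min_accuracy"] < stats["max_accuracy"]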