Timmy-time-dashboard/tests/spark/test_eidos.py

"""Comprehensive tests for spark.eidos module.
Covers:
- _get_conn (schema creation, WAL, busy timeout)
- predict_task_outcome (baseline, with history, edge cases)
- evaluate_prediction (correct, wrong, missing, double-eval)
- _compute_accuracy (all components, edge cases)
- get_predictions (filters: task_id, evaluated_only, limit)
- get_accuracy_stats (empty, after evaluations)
"""
import pytest
from spark.eidos import (
    Prediction,
    _compute_accuracy,
    evaluate_prediction,
    get_accuracy_stats,
    get_predictions,
    predict_task_outcome,
)
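import spark.eidos as eidos  # used only by the isolation sketch below


# NOTE: The assertions below (e.g. TestGetPredictions.test_empty_db) assume each
# test starts from an empty eidos database, so some per-test isolation fixture
# presumably exists elsewhere in the suite. The fixture here is only a minimal
# sketch of how that could work; it assumes spark.eidos exposes a module-level
# DB_PATH attribute, which is an assumption about the module's internals rather
# than something this file confirms.
@pytest.fixture(autouse=True)
def _isolated_eidos_db(tmp_path, monkeypatch):
    # Point the (assumed) DB_PATH at a throwaway SQLite file for each test;
    # raising=False keeps the patch inert if no such attribute exists.
    monkeypatch.setattr(eidos, "DB_PATH", str(tmp_path / "eidos.sqlite"), raising=False)
    yield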


# ── Prediction dataclass ──────────────────────────────────────────────────
class TestPredictionDataclass:
    def test_defaults(self):
        p = Prediction(
            id="1",
            task_id="t1",
            prediction_type="outcome",
            predicted_value="{}",
            actual_value=None,
            accuracy=None,
            created_at="2026-01-01",
            evaluated_at=None,
        )
        assert p.actual_value is None
        assert p.accuracy is None


# ── predict_task_outcome ──────────────────────────────────────────────────
class TestPredictTaskOutcome:
    def test_baseline_no_history(self):
        result = predict_task_outcome("t-base", "Do stuff", ["a1", "a2"])
        assert result["likely_winner"] == "a1"
        assert result["success_probability"] == 0.7
        assert result["estimated_bid_range"] == [20, 80]
        assert "baseline" in result["reasoning"]
        assert "prediction_id" in result

    def test_empty_candidates(self):
        result = predict_task_outcome("t-empty", "Nothing", [])
        assert result["likely_winner"] is None

    def test_history_selects_best_agent(self):
        history = {
            "a1": {"success_rate": 0.3, "avg_winning_bid": 40},
            "a2": {"success_rate": 0.95, "avg_winning_bid": 50},
        }
        result = predict_task_outcome("t-hist", "Task", ["a1", "a2"], agent_history=history)
        assert result["likely_winner"] == "a2"
        assert result["success_probability"] > 0.7

    def test_history_agent_not_in_candidates_ignored(self):
        history = {
            "a-outside": {"success_rate": 0.99, "avg_winning_bid": 10},
        }
        result = predict_task_outcome("t-out", "Task", ["a1"], agent_history=history)
        # a-outside not in candidates, so falls back to baseline
        assert result["likely_winner"] == "a1"

    def test_history_adjusts_bid_range(self):
        history = {
            "a1": {"success_rate": 0.5, "avg_winning_bid": 100},
            "a2": {"success_rate": 0.8, "avg_winning_bid": 200},
        }
        result = predict_task_outcome("t-bid", "Task", ["a1", "a2"], agent_history=history)
        low, high = result["estimated_bid_range"]
        assert low == max(1, int(100 * 0.8))
        assert high == int(200 * 1.2)

    def test_history_with_zero_avg_bid_skipped(self):
        history = {
            "a1": {"success_rate": 0.8, "avg_winning_bid": 0},
        }
        result = predict_task_outcome("t-zero-bid", "Task", ["a1"], agent_history=history)
        # Zero avg_winning_bid should be skipped, keep default range
        assert result["estimated_bid_range"] == [20, 80]

    def test_prediction_stored_in_db(self):
        result = predict_task_outcome("t-db", "Store me", ["a1"])
        preds = get_predictions(task_id="t-db")
        assert len(preds) == 1
        assert preds[0].id == result["prediction_id"]
        assert preds[0].prediction_type == "outcome"

    def test_success_probability_clamped(self):
        history = {
            "a1": {"success_rate": 1.5, "avg_winning_bid": 50},
        }
        result = predict_task_outcome("t-clamp", "Task", ["a1"], agent_history=history)
        assert result["success_probability"] <= 1.0


# ── evaluate_prediction ───────────────────────────────────────────────────
class TestEvaluatePrediction:
    def test_correct_prediction(self):
        predict_task_outcome("t-eval-ok", "Task", ["a1"])
        result = evaluate_prediction("t-eval-ok", "a1", task_succeeded=True, winning_bid=30)
        assert result is not None
        assert 0.0 <= result["accuracy"] <= 1.0
        assert result["actual"]["winner"] == "a1"
        assert result["actual"]["succeeded"] is True

    def test_wrong_prediction(self):
        predict_task_outcome("t-eval-wrong", "Task", ["a1"])
        result = evaluate_prediction("t-eval-wrong", "a2", task_succeeded=False)
        assert result is not None
        assert result["accuracy"] < 1.0

    def test_no_prediction_returns_none(self):
        result = evaluate_prediction("nonexistent", "a1", task_succeeded=True)
        assert result is None

    def test_double_evaluation_returns_none(self):
        predict_task_outcome("t-double", "Task", ["a1"])
        evaluate_prediction("t-double", "a1", task_succeeded=True)
        result = evaluate_prediction("t-double", "a1", task_succeeded=True)
        assert result is None

    def test_evaluation_updates_db(self):
        predict_task_outcome("t-upd", "Task", ["a1"])
        evaluate_prediction("t-upd", "a1", task_succeeded=True, winning_bid=50)
        preds = get_predictions(task_id="t-upd", evaluated_only=True)
        assert len(preds) == 1
        assert preds[0].accuracy is not None
        assert preds[0].actual_value is not None
        assert preds[0].evaluated_at is not None

    def test_winning_bid_none(self):
        predict_task_outcome("t-nobid", "Task", ["a1"])
        result = evaluate_prediction("t-nobid", "a1", task_succeeded=True)
        assert result is not None
        assert result["actual"]["winning_bid"] is None


# ── _compute_accuracy ─────────────────────────────────────────────────────
class TestComputeAccuracy:
    def test_perfect_match(self):
        predicted = {
            "likely_winner": "a1",
            "success_probability": 1.0,
            "estimated_bid_range": [20, 40],
        }
        actual = {"winner": "a1", "succeeded": True, "winning_bid": 30}
        assert _compute_accuracy(predicted, actual) == pytest.approx(1.0, abs=0.01)

    def test_all_wrong(self):
        predicted = {
            "likely_winner": "a1",
            "success_probability": 1.0,
            "estimated_bid_range": [10, 20],
        }
        actual = {"winner": "a2", "succeeded": False, "winning_bid": 100}
        assert _compute_accuracy(predicted, actual) < 0.3

    def test_no_winner_in_predicted(self):
        predicted = {"success_probability": 0.5, "estimated_bid_range": [20, 40]}
        actual = {"winner": "a1", "succeeded": True, "winning_bid": 30}
        acc = _compute_accuracy(predicted, actual)
        # Winner component skipped, success + bid counted
        assert 0.0 <= acc <= 1.0

    def test_no_winner_in_actual(self):
        predicted = {"likely_winner": "a1", "success_probability": 0.5}
        actual = {"succeeded": True}
        acc = _compute_accuracy(predicted, actual)
        assert 0.0 <= acc <= 1.0

    def test_bid_outside_range_partial_credit(self):
        predicted = {
            "likely_winner": "a1",
            "success_probability": 1.0,
            "estimated_bid_range": [20, 40],
        }
        # Bid just outside range
        actual = {"winner": "a1", "succeeded": True, "winning_bid": 45}
        acc = _compute_accuracy(predicted, actual)
        assert 0.5 < acc < 1.0

    def test_bid_far_outside_range(self):
        predicted = {
            "likely_winner": "a1",
            "success_probability": 1.0,
            "estimated_bid_range": [20, 40],
        }
        actual = {"winner": "a1", "succeeded": True, "winning_bid": 500}
        acc = _compute_accuracy(predicted, actual)
        assert acc < 1.0

    def test_no_actual_bid(self):
        predicted = {
            "likely_winner": "a1",
            "success_probability": 0.7,
            "estimated_bid_range": [20, 40],
        }
        actual = {"winner": "a1", "succeeded": True, "winning_bid": None}
        acc = _compute_accuracy(predicted, actual)
        # Bid component skipped — only winner + success
        assert 0.0 <= acc <= 1.0

    def test_failed_prediction_low_probability(self):
        predicted = {"success_probability": 0.1}
        actual = {"succeeded": False}
        acc = _compute_accuracy(predicted, actual)
        # Predicted low success and task failed → high accuracy
        assert acc > 0.8


# ── get_predictions ───────────────────────────────────────────────────────
class TestGetPredictions:
    def test_empty_db(self):
        assert get_predictions() == []

    def test_filter_by_task_id(self):
        predict_task_outcome("t-filter1", "A", ["a1"])
        predict_task_outcome("t-filter2", "B", ["a2"])
        preds = get_predictions(task_id="t-filter1")
        assert len(preds) == 1
        assert preds[0].task_id == "t-filter1"

    def test_evaluated_only(self):
        predict_task_outcome("t-eo1", "A", ["a1"])
        predict_task_outcome("t-eo2", "B", ["a1"])
        evaluate_prediction("t-eo1", "a1", task_succeeded=True)
        preds = get_predictions(evaluated_only=True)
        assert len(preds) == 1
        assert preds[0].task_id == "t-eo1"

    def test_limit(self):
        for i in range(10):
            predict_task_outcome(f"t-lim{i}", "X", ["a1"])
        preds = get_predictions(limit=3)
        assert len(preds) == 3

    def test_combined_filters(self):
        predict_task_outcome("t-combo", "A", ["a1"])
        evaluate_prediction("t-combo", "a1", task_succeeded=True)
        predict_task_outcome("t-combo2", "B", ["a1"])
        preds = get_predictions(task_id="t-combo", evaluated_only=True)
        assert len(preds) == 1

    def test_order_by_created_desc(self):
        for i in range(3):
            predict_task_outcome(f"t-ord{i}", f"Task {i}", ["a1"])
        preds = get_predictions()
        # Most recent first
        assert preds[0].task_id == "t-ord2"


# ── get_accuracy_stats ────────────────────────────────────────────────────
class TestGetAccuracyStats:
    def test_empty(self):
        stats = get_accuracy_stats()
        assert stats["total_predictions"] == 0
        assert stats["evaluated"] == 0
        assert stats["pending"] == 0
        assert stats["avg_accuracy"] == 0.0
        assert stats["min_accuracy"] == 0.0
        assert stats["max_accuracy"] == 0.0

    def test_with_unevaluated(self):
        predict_task_outcome("t-uneval", "X", ["a1"])
        stats = get_accuracy_stats()
        assert stats["total_predictions"] == 1
        assert stats["evaluated"] == 0
        assert stats["pending"] == 1

    def test_with_evaluations(self):
        for i in range(3):
            predict_task_outcome(f"t-stats{i}", "X", ["a1"])
            evaluate_prediction(f"t-stats{i}", "a1", task_succeeded=True, winning_bid=30)
        stats = get_accuracy_stats()
        assert stats["total_predictions"] == 3
        assert stats["evaluated"] == 3
        assert stats["pending"] == 0
        assert stats["avg_accuracy"] > 0.0
        assert stats["min_accuracy"] <= stats["avg_accuracy"] <= stats["max_accuracy"]