"""Tests for tasks.py — the orchestration brain. tasks.py is 2,117 lines with zero test coverage. This suite covers the pure utility functions that every pipeline depends on: JSON parsing, data normalization, file I/O primitives, and prompt formatting. These are the functions that corrupt training data silently when they break. If a normalization function drops a field or misparses JSON from an LLM, the entire training pipeline produces garbage. No one notices until the next autolora run produces a worse model. Coverage priority is based on blast radius — a bug in extract_first_json_object() affects every @huey.task that processes LLM output, which is all of them. """ from __future__ import annotations import json import sys import tempfile from pathlib import Path import pytest # Import tasks.py without triggering Huey/GiteaClient side effects. # We mock the imports that have side effects to isolate the pure functions. from unittest.mock import MagicMock # Stub out modules with side effects before importing tasks sys.modules.setdefault("orchestration", MagicMock(huey=MagicMock())) sys.modules.setdefault("huey", MagicMock()) sys.modules.setdefault("gitea_client", MagicMock()) sys.modules.setdefault("metrics_helpers", MagicMock( build_local_metric_record=MagicMock(return_value={}) )) # Now we can import the functions we want to test REPO_ROOT = Path(__file__).parent.parent sys.path.insert(0, str(REPO_ROOT)) import importlib tasks = importlib.import_module("tasks") # Pull out the functions under test extract_first_json_object = tasks.extract_first_json_object parse_json_output = tasks.parse_json_output normalize_candidate_entry = tasks.normalize_candidate_entry normalize_training_examples = tasks.normalize_training_examples normalize_rubric_scores = tasks.normalize_rubric_scores archive_batch_id = tasks.archive_batch_id archive_profile_summary = tasks.archive_profile_summary format_tweets_for_prompt = tasks.format_tweets_for_prompt read_json = tasks.read_json 
write_json = tasks.write_json
load_jsonl = tasks.load_jsonl
write_jsonl = tasks.write_jsonl
append_jsonl = tasks.append_jsonl
write_text = tasks.write_text
count_jsonl_rows = tasks.count_jsonl_rows
newest_file = tasks.newest_file
latest_path = tasks.latest_path
archive_default_checkpoint = tasks.archive_default_checkpoint


# ═══════════════════════════════════════════════════════════════════════
# JSON EXTRACTION — the single most critical function in the pipeline
# ═══════════════════════════════════════════════════════════════════════


class TestExtractFirstJsonObject:
    """extract_first_json_object() parses JSON from noisy LLM output.

    Every @huey.task that processes model output depends on this.
    If this breaks, the entire training pipeline produces garbage.
    """

    def test_clean_json(self):
        """Parses valid JSON directly."""
        result = extract_first_json_object('{"key": "value"}')
        assert result == {"key": "value"}

    def test_json_with_markdown_fences(self):
        """Strips ```json fences that models love to add."""
        text = '```json\n{"hello": "world"}\n```'
        result = extract_first_json_object(text)
        assert result == {"hello": "world"}

    def test_json_after_prose(self):
        """Finds JSON buried after the model's explanation."""
        # The single-quoted {'key': 'value'} is NOT valid JSON and must be skipped.
        text = "Here is the analysis:\n\nI found that {'key': 'value'}\n\n{\"real\": true}"
        result = extract_first_json_object(text)
        assert result == {"real": True}

    def test_nested_json(self):
        """Handles nested objects correctly."""
        text = '{"outer": {"inner": [1, 2, 3]}}'
        result = extract_first_json_object(text)
        assert result == {"outer": {"inner": [1, 2, 3]}}

    def test_raises_on_no_json(self):
        """Raises ValueError when no JSON object is found."""
        with pytest.raises(ValueError, match="No JSON object found"):
            extract_first_json_object("No JSON here at all")

    def test_raises_on_json_array(self):
        """Raises ValueError for JSON arrays (only objects accepted)."""
        with pytest.raises(ValueError, match="No JSON object found"):
            extract_first_json_object("[1, 2, 3]")

    def test_skips_malformed_and_finds_valid(self):
        """Skips broken JSON fragments to find the real one."""
        text = '{broken {"valid": true}'
        result = extract_first_json_object(text)
        assert result == {"valid": True}

    def test_handles_whitespace_heavy_output(self):
        """Handles output with excessive whitespace."""
        text = ' \n\n {"spaced": "out"} \n\n '
        result = extract_first_json_object(text)
        assert result == {"spaced": "out"}

    def test_empty_string_raises(self):
        """Empty input raises ValueError."""
        with pytest.raises(ValueError):
            extract_first_json_object("")

    def test_unicode_content(self):
        """Handles Unicode characters in JSON values."""
        text = '{"emoji": "🔥", "jp": "日本語"}'
        result = extract_first_json_object(text)
        assert result["emoji"] == "🔥"


class TestParseJsonOutput:
    """parse_json_output() tries stdout then stderr for JSON."""

    def test_finds_json_in_stdout(self):
        result = parse_json_output(stdout='{"from": "stdout"}')
        assert result == {"from": "stdout"}

    def test_falls_back_to_stderr(self):
        result = parse_json_output(stdout="no json", stderr='{"from": "stderr"}')
        assert result == {"from": "stderr"}

    def test_empty_returns_empty_dict(self):
        result = parse_json_output(stdout="", stderr="")
        assert result == {}

    def test_none_inputs_handled(self):
        result = parse_json_output(stdout=None, stderr=None)
        assert result == {}


# ═══════════════════════════════════════════════════════════════════════
# DATA NORMALIZATION — training data quality depends on this
# ═══════════════════════════════════════════════════════════════════════


class TestNormalizeCandidateEntry:
    """normalize_candidate_entry() cleans LLM-generated knowledge candidates.

    A bug here silently corrupts the knowledge graph. Fields are coerced to
    correct types, clamped to valid ranges, and deduplicated.
    """

    def test_valid_candidate(self):
        """Normalizes a well-formed candidate."""
        candidate = {
            "category": "trait",
            "claim": "Alexander likes coffee",
            "evidence_tweet_ids": ["123", "456"],
            "evidence_quotes": ["I love coffee"],
            "confidence": 0.8,
            "status": "provisional",
        }
        result = normalize_candidate_entry(candidate, "batch_001", 1)
        assert result["id"] == "batch_001-candidate-01"
        assert result["category"] == "trait"
        assert result["claim"] == "Alexander likes coffee"
        assert result["confidence"] == 0.8
        assert result["status"] == "provisional"

    def test_empty_claim_returns_none(self):
        """Rejects candidates with empty claims."""
        result = normalize_candidate_entry({"claim": ""}, "b001", 0)
        assert result is None

    def test_missing_claim_returns_none(self):
        """Rejects candidates with no claim field."""
        result = normalize_candidate_entry({"category": "trait"}, "b001", 0)
        assert result is None

    def test_confidence_clamped_high(self):
        """Confidence above 1.0 is clamped to 1.0."""
        result = normalize_candidate_entry(
            {"claim": "test", "confidence": 5.0}, "b001", 1
        )
        assert result["confidence"] == 1.0

    def test_confidence_clamped_low(self):
        """Confidence below 0.0 is clamped to 0.0."""
        result = normalize_candidate_entry(
            {"claim": "test", "confidence": -0.5}, "b001", 1
        )
        assert result["confidence"] == 0.0

    def test_invalid_confidence_defaults(self):
        """Non-numeric confidence defaults to 0.5."""
        result = normalize_candidate_entry(
            {"claim": "test", "confidence": "high"}, "b001", 1
        )
        assert result["confidence"] == 0.5

    def test_invalid_status_defaults_to_provisional(self):
        """Unknown status values default to 'provisional'."""
        result = normalize_candidate_entry(
            {"claim": "test", "status": "banana"}, "b001", 1
        )
        assert result["status"] == "provisional"

    def test_duplicate_evidence_ids_deduped(self):
        """Duplicate tweet IDs are removed."""
        result = normalize_candidate_entry(
            {"claim": "test", "evidence_tweet_ids": ["1", "1", "2", "2"]},
            "b001",
            1,
        )
        assert result["evidence_tweet_ids"] == ["1", "2"]

    def test_duplicate_quotes_deduped(self):
        """Duplicate evidence quotes are removed."""
        result = normalize_candidate_entry(
            {"claim": "test", "evidence_quotes": ["same", "same", "new"]},
            "b001",
            1,
        )
        assert result["evidence_quotes"] == ["same", "new"]

    def test_evidence_truncated_to_5(self):
        """Evidence lists are capped at 5 items."""
        result = normalize_candidate_entry(
            {"claim": "test", "evidence_quotes": [f"q{i}" for i in range(10)]},
            "b001",
            1,
        )
        assert len(result["evidence_quotes"]) == 5

    def test_none_category_defaults(self):
        """None category defaults to 'recurring-theme'."""
        result = normalize_candidate_entry(
            {"claim": "test", "category": None}, "b001", 1
        )
        assert result["category"] == "recurring-theme"

    def test_valid_statuses_accepted(self):
        """All three valid statuses are preserved."""
        for status in ("provisional", "durable", "retracted"):
            result = normalize_candidate_entry(
                {"claim": "test", "status": status}, "b001", 1
            )
            assert result["status"] == status


class TestNormalizeTrainingExamples:
    """normalize_training_examples() cleans LLM-generated training pairs.

    This feeds directly into autolora. Bad data here means bad training.
    """

    def test_valid_examples_normalized(self):
        """Well-formed examples pass through with added metadata."""
        examples = [
            {"prompt": "Q1", "response": "A1", "task_type": "analysis"},
            {"prompt": "Q2", "response": "A2"},
        ]
        result = normalize_training_examples(
            examples, "b001", ["t1"], "fallback_p", "fallback_r"
        )
        assert len(result) == 2
        assert result[0]["example_id"] == "b001-example-01"
        assert result[0]["prompt"] == "Q1"
        # NOTE(review): asserts that a MISSING task_type defaults to "analysis" —
        # verify against normalize_training_examples; if the default differs,
        # this assertion only passes by coincidence with examples[0].
        assert result[1]["task_type"] == "analysis"  # defaults

    def test_empty_examples_get_fallback(self):
        """When no valid examples exist, fallback is used."""
        result = normalize_training_examples(
            [], "b001", ["t1"], "fallback prompt", "fallback response"
        )
        assert len(result) == 1
        assert result[0]["prompt"] == "fallback prompt"
        assert result[0]["response"] == "fallback response"

    def test_examples_with_empty_prompt_skipped(self):
        """Examples without prompts are filtered out."""
        examples = [
            {"prompt": "", "response": "A1"},
            {"prompt": "Q2", "response": "A2"},
        ]
        result = normalize_training_examples(
            examples, "b001", ["t1"], "fp", "fr"
        )
        assert len(result) == 1
        assert result[0]["prompt"] == "Q2"

    def test_examples_with_empty_response_skipped(self):
        """Examples without responses are filtered out."""
        examples = [
            {"prompt": "Q1", "response": ""},
        ]
        result = normalize_training_examples(
            examples, "b001", ["t1"], "fp", "fr"
        )
        # Falls to fallback
        assert len(result) == 1
        assert result[0]["prompt"] == "fp"

    def test_alternative_field_names_accepted(self):
        """Accepts 'instruction'/'answer' as field name alternatives."""
        examples = [
            {"instruction": "Q1", "answer": "A1"},
        ]
        result = normalize_training_examples(
            examples, "b001", ["t1"], "fp", "fr"
        )
        assert len(result) == 1
        assert result[0]["prompt"] == "Q1"
        assert result[0]["response"] == "A1"


class TestNormalizeRubricScores:
    """normalize_rubric_scores() cleans eval rubric output."""

    def test_valid_scores(self):
        scores = {"grounding": 8, "specificity": 7, "source_distinction": 9, "actionability": 6}
        result = normalize_rubric_scores(scores)
        # ints are coerced to floats
        assert result == {"grounding": 8.0, "specificity": 7.0, "source_distinction": 9.0, "actionability": 6.0}

    def test_missing_keys_default_to_zero(self):
        result = normalize_rubric_scores({})
        assert result == {"grounding": 0.0, "specificity": 0.0, "source_distinction": 0.0, "actionability": 0.0}

    def test_non_numeric_defaults_to_zero(self):
        result = normalize_rubric_scores({"grounding": "excellent"})
        assert result["grounding"] == 0.0


# ═══════════════════════════════════════════════════════════════════════
# FILE I/O PRIMITIVES — the foundation everything reads/writes through
# ═══════════════════════════════════════════════════════════════════════


class TestReadJson:
    """read_json() returns file contents or a (deep-copied) default."""

    def test_reads_valid_file(self, tmp_path):
        f = tmp_path / "test.json"
        f.write_text('{"key": "val"}')
        assert read_json(f, {}) == {"key": "val"}

    def test_missing_file_returns_default(self, tmp_path):
        assert read_json(tmp_path / "nope.json", {"default": True}) == {"default": True}

    def test_corrupt_file_returns_default(self, tmp_path):
        f = tmp_path / "bad.json"
        f.write_text("{corrupt json!!!}")
        assert read_json(f, {"safe": True}) == {"safe": True}

    def test_default_is_deep_copied(self, tmp_path):
        """Default is deep-copied, not shared between calls."""
        default = {"nested": {"key": "val"}}
        result1 = read_json(tmp_path / "a.json", default)
        result2 = read_json(tmp_path / "b.json", default)
        result1["nested"]["key"] = "mutated"
        assert result2["nested"]["key"] == "val"


class TestWriteJson:
    """write_json() writes indented, sorted JSON and creates parent dirs."""

    def test_creates_file_with_indent(self, tmp_path):
        f = tmp_path / "out.json"
        write_json(f, {"key": "val"})
        content = f.read_text()
        assert '"key": "val"' in content
        assert content.endswith("\n")

    def test_creates_parent_dirs(self, tmp_path):
        f = tmp_path / "deep" / "nested" / "out.json"
        write_json(f, {"ok": True})
        assert f.exists()

    def test_sorted_keys(self, tmp_path):
        f = tmp_path / "sorted.json"
        write_json(f, {"z": 1, "a": 2})
        content = f.read_text()
        assert content.index('"a"') < content.index('"z"')
class TestJsonlIO:
    """Round-trip coverage for the JSONL primitives: load, write, append, count."""

    def test_load_jsonl_valid(self, tmp_path):
        path = tmp_path / "data.jsonl"
        path.write_text('{"a":1}\n{"b":2}\n')
        rows = load_jsonl(path)
        assert rows[0] == {"a": 1}
        assert len(rows) == 2

    def test_load_jsonl_missing_file(self, tmp_path):
        missing = tmp_path / "nope.jsonl"
        assert load_jsonl(missing) == []

    def test_load_jsonl_skips_blank_lines(self, tmp_path):
        path = tmp_path / "data.jsonl"
        path.write_text('{"a":1}\n\n\n{"b":2}\n')
        assert len(load_jsonl(path)) == 2

    def test_write_jsonl(self, tmp_path):
        path = tmp_path / "out.jsonl"
        write_jsonl(path, [{"a": 1}, {"b": 2}])
        written = path.read_text().strip().split("\n")
        assert len(written) == 2
        assert json.loads(written[0]) == {"a": 1}

    def test_append_jsonl(self, tmp_path):
        path = tmp_path / "append.jsonl"
        path.write_text('{"existing":true}\n')
        append_jsonl(path, [{"new": True}])
        assert len(load_jsonl(path)) == 2

    def test_append_jsonl_empty_list_noop(self, tmp_path):
        """Appending an empty list must not even create the file."""
        path = tmp_path / "nope.jsonl"
        append_jsonl(path, [])
        assert not path.exists()

    def test_count_jsonl_rows(self, tmp_path):
        path = tmp_path / "count.jsonl"
        path.write_text('{"a":1}\n{"b":2}\n{"c":3}\n')
        assert count_jsonl_rows(path) == 3

    def test_count_jsonl_missing_file(self, tmp_path):
        assert count_jsonl_rows(tmp_path / "nope.jsonl") == 0

    def test_count_jsonl_skips_blank_lines(self, tmp_path):
        path = tmp_path / "sparse.jsonl"
        path.write_text('{"a":1}\n\n{"b":2}\n\n')
        assert count_jsonl_rows(path) == 2


class TestWriteText:
    """write_text() normalizes trailing whitespace and guarantees one newline."""

    def test_writes_with_trailing_newline(self, tmp_path):
        target = tmp_path / "text.md"
        write_text(target, "hello")
        assert target.read_text() == "hello\n"

    def test_strips_trailing_whitespace(self, tmp_path):
        target = tmp_path / "text.md"
        write_text(target, "hello \n\n\n")
        assert target.read_text() == "hello\n"

    def test_empty_content_writes_empty_file(self, tmp_path):
        target = tmp_path / "text.md"
        write_text(target, " ")
        assert target.read_text() == ""


# ═══════════════════════════════════════════════════════════════════════
# PATH UTILITIES
# ═══════════════════════════════════════════════════════════════════════
class TestPathUtilities:
    """newest_file() / latest_path() select the lexicographically last match."""

    def test_newest_file(self, tmp_path):
        for name in ("a.txt", "b.txt", "c.txt"):
            (tmp_path / name).write_text(name[0])
        picked = newest_file(tmp_path, "*.txt")
        assert picked.name == "c.txt"  # sorted, last = newest

    def test_newest_file_empty_dir(self, tmp_path):
        assert newest_file(tmp_path, "*.txt") is None

    def test_latest_path(self, tmp_path):
        for name in ("batch_001.json", "batch_002.json"):
            (tmp_path / name).write_text("{}")
        picked = latest_path(tmp_path, "batch_*.json")
        assert picked.name == "batch_002.json"

    def test_latest_path_no_matches(self, tmp_path):
        assert latest_path(tmp_path, "*.nope") is None


# ═══════════════════════════════════════════════════════════════════════
# FORMATTING & HELPERS
# ═══════════════════════════════════════════════════════════════════════


class TestFormatting:
    """Prompt rendering and the small archive helper functions."""

    def test_archive_batch_id(self):
        # IDs are zero-padded to three digits
        cases = ((1, "batch_001"), (42, "batch_042"), (100, "batch_100"))
        for number, expected in cases:
            assert archive_batch_id(number) == expected

    def test_archive_profile_summary(self):
        profile = {
            "claims": [
                {"status": "durable", "claim": "a"},
                {"status": "durable", "claim": "b"},
                {"status": "provisional", "claim": "c"},
                {"status": "retracted", "claim": "d"},
            ]
        }
        summary = archive_profile_summary(profile)
        # retracted claims appear in neither bucket
        assert len(summary["durable_claims"]) == 2
        assert len(summary["provisional_claims"]) == 1

    def test_archive_profile_summary_truncates(self):
        """Summaries are capped at 12 durable and 8 provisional claims."""
        claims = [{"status": "durable", "claim": f"d{i}"} for i in range(20)]
        claims += [{"status": "provisional", "claim": f"p{i}"} for i in range(15)]
        summary = archive_profile_summary({"claims": claims})
        assert len(summary["durable_claims"]) <= 12
        assert len(summary["provisional_claims"]) <= 8

    def test_archive_profile_summary_empty(self):
        expected = {
            "durable_claims": [],
            "provisional_claims": [],
        }
        assert archive_profile_summary({}) == expected

    def test_format_tweets_for_prompt(self):
        rows = [
            {"tweet_id": "123", "created_at": "2024-01-01", "full_text": "Hello world"},
            {"tweet_id": "456", "created_at": "2024-01-02", "full_text": "Goodbye world"},
        ]
        rendered = format_tweets_for_prompt(rows)
        assert "tweet_id=123" in rendered
        assert "Hello world" in rendered
        assert "2." in rendered  # entries are 1-indexed

    def test_archive_default_checkpoint(self):
        """Default checkpoint has all required fields."""
        checkpoint = archive_default_checkpoint()
        assert checkpoint["phase"] == "discovery"
        assert checkpoint["next_offset"] == 0
        assert checkpoint["batch_size"] == 50
        assert checkpoint["batches_completed"] == 0