From b28071bb7127b76f0786ada0bece114b9115af4f Mon Sep 17 00:00:00 2001
From: Alexander Whitestone
Date: Mon, 20 Apr 2026 23:15:48 +0000
Subject: [PATCH 1/2] fix(#687): Training data quality filter

- Score pairs on specificity, length ratio, code correctness
- Composite weighted score (0.5 spec + 0.2 length + 0.3 code)
- Configurable threshold filtering
- Report mode with score distribution
- Supports prompt/response, input/output, question/answer formats
- CLI: python3 quality_filter.py input.jsonl -o output.jsonl --report
---
 scripts/quality_filter.py | 276 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 276 insertions(+)
 create mode 100644 scripts/quality_filter.py

diff --git a/scripts/quality_filter.py b/scripts/quality_filter.py
new file mode 100644
index 00000000..0647fb17
--- /dev/null
+++ b/scripts/quality_filter.py
@@ -0,0 +1,276 @@
+#!/usr/bin/env python3
+"""
+Training Data Quality Filter — Score and remove low-quality training pairs.
+
+Scores each pair on:
+  1. Specificity: How concrete vs generic is the content?
+  2. Length ratio: Balanced input/output lengths?
+  3. Code correctness: If code is present, does it parse?
+
+Usage:
+    python3 quality_filter.py input.jsonl -o output.jsonl
+    python3 quality_filter.py input.jsonl --report
+    python3 quality_filter.py input.jsonl --threshold 0.4
+
+Accepts JSONL where each line has:
+    {"prompt": "...", "response": "..."} or {"input": "...", "output": "..."}
+"""
+
+import argparse
+import ast
+import json
+import re
+import sys
+from pathlib import Path
+
+
+# ---------------------------------------------------------------------------
+# SCORING
+# ---------------------------------------------------------------------------
+
+GENERIC_PHRASES = [
+    "i don't know", "it depends", "there are many ways",
+    "that's a good question", "let me think about", "in general",
+    "as an ai", "i cannot", "i'm sorry but", "unfortunately",
+    "that being said", "it's worth noting", "in conclusion",
+    "to summarize", "overall", "basically", "essentially",
+]
+
+SPECIFIC_MARKERS = [
+    r"(?:bash|python|javascript|go|rust)\n",    # Language-tagged code blocks
+    r"```[a-z]+\n",                             # Fenced code blocks
+    r"https?://\S+",                            # URLs
+    r"(?:file|path|dir|repo|branch|commit)\b",  # Concrete references
+    r"\d+\.\d+\.\d+",                           # Version numbers
+    r"(?:error|exception|traceback|stderr)",    # Error messages
+    r"(?:curl|git|apt|brew|pip|npm)\s",         # CLI commands
+    r"(?:GET|POST|PUT|DELETE|PATCH)\s",         # HTTP methods
+    r"(?:Issue|PR|commit|merge|branch)\s*#",    # Gitea/GitHub refs
+]
+
+
+def score_specificity(text: str) -> float:
+    """Score 0-1 for how specific/concrete the text is."""
+    text_lower = text.lower()
+    score = 0.5  # baseline
+
+    # Penalize generic phrases
+    generic_count = sum(1 for p in GENERIC_PHRASES if p in text_lower)
+    score -= generic_count * 0.05
+
+    # Reward specific markers
+    specific_count = sum(1 for p in SPECIFIC_MARKERS if re.search(p, text, re.IGNORECASE))
+    score += specific_count * 0.08
+
+    # Reward longer, detailed responses
+    word_count = len(text.split())
+    if word_count > 100:
+        score += 0.1
+    elif word_count > 50:
+        score += 0.05
+    elif word_count < 10:
+        score -= 0.15
+
+    return max(0.0, min(1.0, score))
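+
+# A rough feel for the scale (illustrative values, worked out by hand from the
+# weights above; they are not pinned down by the test suite):
+#   score_specificity("yes")
+#       -> ~0.35  (0.5 baseline - 0.15 penalty for fewer than 10 words)
+#   score_specificity("Upgrade to 2.1.3 and re-run: pip install -U requests")
+#       -> ~0.51  (version + CLI markers: 0.5 + 2 * 0.08 - 0.15 short penalty)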
+
+
+def score_length_ratio(prompt: str, response: str) -> float:
+    """Score 0-1 for balanced input/output lengths."""
+    p_len = len(prompt.split())
+    r_len = len(response.split())
+
+    if p_len == 0 or r_len == 0:
+        return 0.0
+
+    ratio = r_len / p_len
+
+    # Ideal: response is 1-10x the prompt length
+    if 1.0 <= ratio <= 10.0:
+        return 1.0
+    elif 0.5 <= ratio <= 20.0:
+        return 0.7
+    elif 0.2 <= ratio <= 50.0:
+        return 0.4
+    else:
+        return 0.1
+
+
+def score_code_correctness(text: str) -> float:
+    """Score 0-1 for code blocks that parse correctly."""
+    blocks = re.findall(r"```(\w*)\n?(.*?)```", text, re.DOTALL)
+    # Keep the language tag so tagged blocks get the matching parser, and
+    # drop empty blocks, which carry no signal either way.
+    blocks = [(lang.lower(), body.strip()) for lang, body in blocks if body.strip()]
+
+    if not blocks:
+        return 1.0  # No code = no code errors
+
+    valid = 0
+
+    for lang, body in blocks:
+        # Python parse (also the first guess for untagged blocks)
+        if lang in ("", "py", "python"):
+            try:
+                ast.parse(body)
+                valid += 1
+                continue
+            except SyntaxError:
+                if lang:
+                    # A tagged Python block that fails to parse is simply
+                    # invalid; don't let it pass the lenient shell heuristic.
+                    continue
+
+        # JSON parse
+        if lang in ("", "json"):
+            try:
+                json.loads(body)
+                valid += 1
+                continue
+            except (json.JSONDecodeError, ValueError):
+                if lang:
+                    continue
+
+        # Shell scripts and everything else: check for balanced braces/parens
+        open_count = body.count("{") + body.count("(") + body.count("[")
+        close_count = body.count("}") + body.count(")") + body.count("]")
+        if abs(open_count - close_count) <= 1:
+            valid += 1
+
+    return valid / len(blocks)
+
+
+def score_pair(pair: dict) -> dict:
+    """Score a single training pair. Returns component scores plus composite."""
+    prompt = str(pair.get("prompt") or pair.get("input") or pair.get("question") or "")
+    response = str(pair.get("response") or pair.get("output") or pair.get("answer") or pair.get("completion") or "")
+
+    if not prompt or not response:
+        return {"specificity": 0.0, "length_ratio": 0.0, "code_correctness": 0.0, "composite": 0.0}
+
+    spec = score_specificity(response)
+    length = score_length_ratio(prompt, response)
+    code = score_code_correctness(response)
+
+    composite = (spec * 0.5) + (length * 0.2) + (code * 0.3)
+
+    return {
+        "specificity": round(spec, 3),
+        "length_ratio": round(length, 3),
+        "code_correctness": round(code, 3),
+        "composite": round(composite, 3),
+    }
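+
+# Worked example: a code-free response scores code_correctness 1.0, and a
+# balanced pair scores length_ratio 1.0, so
+#     composite = spec * 0.5 + 1.0 * 0.2 + 1.0 * 0.3 = spec * 0.5 + 0.5
+# i.e. short generic pairs bottom out near 0.5. Thresholds meant to cut
+# generic-but-plausible pairs should therefore sit above that floor.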
+
+
+# ---------------------------------------------------------------------------
+# FILTER
+# ---------------------------------------------------------------------------
+
+def filter_pairs(input_path: str, output_path: str = None, threshold: float = 0.3,
+                 report: bool = False) -> dict:
+    """Filter JSONL training pairs by quality score."""
+
+    kept = []
+    removed = []
+    total = 0
+
+    with open(input_path, "r") as f:
+        for line_num, line in enumerate(f, 1):
+            line = line.strip()
+            if not line:
+                continue
+
+            # Count every non-empty line, including unparseable ones, so
+            # removal_rate can never exceed 100%.
+            total += 1
+            try:
+                pair = json.loads(line)
+            except json.JSONDecodeError:
+                removed.append({"line": line_num, "reason": "invalid JSON", "scores": {}})
+                continue
+
+            scores = score_pair(pair)
+            pair["_quality_scores"] = scores
+
+            if scores["composite"] >= threshold:
+                kept.append(pair)
+            else:
+                pair["_filter_reason"] = f"composite {scores['composite']} < {threshold}"
+                removed.append(pair)
+
+    # Write filtered output
+    if output_path and kept:
+        with open(output_path, "w") as f:
+            for pair in kept:
+                # Remove internal scoring metadata before writing
+                clean = {k: v for k, v in pair.items() if not k.startswith("_")}
+                f.write(json.dumps(clean, ensure_ascii=False) + "\n")
+
+    result = {
+        "total": total,
+        "kept": len(kept),
+        "removed": len(removed),
+        "threshold": threshold,
+        "removal_rate": round(len(removed) / total * 100, 1) if total > 0 else 0,
+    }
+
+    if report:
+        print("\n=== QUALITY FILTER REPORT ===")
+        print(f"Input: {input_path}")
+        if output_path:
+            print(f"Output: {output_path}")
+        print()
+        print(f"Total pairs: {result['total']}")
+        print(f"Kept: {result['kept']}")
+        print(f"Removed: {result['removed']} ({result['removal_rate']}%)")
+        print(f"Threshold: {result['threshold']}")
+        print()
+
+        # Score distribution
+        if kept:
+            composites = [p["_quality_scores"]["composite"] for p in kept]
+            print(f"Kept scores: min={min(composites):.3f} max={max(composites):.3f} avg={sum(composites)/len(composites):.3f}")
+
+        if removed:
+            reasons = {}
+            for r in removed:
+                reason = r.get("_filter_reason", r.get("reason", "unknown"))
+                reasons[reason] = reasons.get(reason, 0) + 1
+            print("\nRemoval reasons:")
+            for reason, count in sorted(reasons.items(), key=lambda x: -x[1]):
+                print(f"  {reason}: {count}")
+
+    return result
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Training data quality filter — score and remove low-quality pairs"
+    )
+    parser.add_argument("input", help="Input JSONL file")
+    parser.add_argument("-o", "--output", help="Output JSONL file (filtered)")
+    parser.add_argument("-t", "--threshold", type=float, default=0.3,
+                        help="Quality threshold (0.0-1.0, default: 0.3)")
+    parser.add_argument("--report", action="store_true",
+                        help="Print detailed report")
+    parser.add_argument("--dry-run", action="store_true",
+                        help="Score only, don't filter")
+
+    args = parser.parse_args()
+
+    if not Path(args.input).exists():
+        print(f"ERROR: Input file not found: {args.input}", file=sys.stderr)
+        sys.exit(1)
+
+    if args.dry_run and not args.output:
+        args.report = True
+
+    output = args.output
+    if args.dry_run:
+        output = None
+
+    result = filter_pairs(args.input, output, args.threshold, args.report)
+
+    if not args.report:
+        print(f"{result['kept']}/{result['total']} pairs kept (removed {result['removed']}, {result['removal_rate']}%)")
+
+
+if __name__ == "__main__":
+    main()

From a0266c83a424bb04e28ef6339d3c4d066b095778 Mon Sep 17 00:00:00 2001
From: Alexander Whitestone
Date: Mon, 20 Apr 2026 23:16:13 +0000
Subject: [PATCH 2/2] fix(#687): Add quality filter tests

---
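Note: score_pair has a composite floor near 0.5 for short, code-free pairs
(the length and code components both saturate at 1.0), so the end-to-end
filter test below pins threshold=0.7 rather than the CLI default of 0.3.
The suite runs standalone:

    python3 scripts/test_quality_filter.py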
 scripts/test_quality_filter.py | 136 +++++++++++++++++++++++++++++++++
 1 file changed, 136 insertions(+)
 create mode 100644 scripts/test_quality_filter.py

diff --git a/scripts/test_quality_filter.py b/scripts/test_quality_filter.py
new file mode 100644
index 00000000..7c6e3667
--- /dev/null
+++ b/scripts/test_quality_filter.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python3
+"""
+Tests for training data quality filter.
+"""
+
+import json
+import os
+import sys
+import tempfile
+import unittest
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from quality_filter import score_specificity, score_length_ratio, score_code_correctness, score_pair, filter_pairs
+
+
+class TestSpecificity(unittest.TestCase):
+
+    def test_generic_response_scores_low(self):
+        text = "I don't know. It depends on many factors. There are many ways to approach this."
+        score = score_specificity(text)
+        self.assertLess(score, 0.4)
+
+    def test_specific_response_scores_high(self):
+        text = 'Run: curl -s https://api.example.com/v1/repos | python3 -c "import sys,json; print(json.load(sys.stdin))"'
+        score = score_specificity(text)
+        self.assertGreater(score, 0.6)
+
+    def test_code_block_boosts_score(self):
+        text = """Here's the fix:
+```python
+def hello():
+    return "world"
+```"""
+        score = score_specificity(text)
+        self.assertGreater(score, 0.5)
+
+    def test_long_detailed_response(self):
+        text = " ".join(["word"] * 150) + " GET /api/v1/repos"
+        score = score_specificity(text)
+        self.assertGreater(score, 0.5)
+
+    def test_short_response_penalized(self):
+        score = score_specificity("yes")
+        self.assertLess(score, 0.4)
+
+
+class TestLengthRatio(unittest.TestCase):
+
+    def test_balanced_ratio(self):
+        score = score_length_ratio("short prompt", "This is a medium length response with some detail.")
+        self.assertEqual(score, 1.0)
+
+    def test_too_short_response(self):
+        score = score_length_ratio("A long prompt with many words here", "ok")
+        self.assertLess(score, 1.0)
+
+    def test_empty_returns_zero(self):
+        self.assertEqual(score_length_ratio("", "something"), 0.0)
+        self.assertEqual(score_length_ratio("something", ""), 0.0)
+
+
+class TestCodeCorrectness(unittest.TestCase):
+
+    def test_no_code_returns_one(self):
+        self.assertEqual(score_code_correctness("Just text, no code."), 1.0)
+
+    def test_valid_python(self):
+        text = '```python\ndef foo():\n    return 42\n```'
+        self.assertEqual(score_code_correctness(text), 1.0)
+
+    def test_valid_json(self):
+        text = '```json\n{"key": "value"}\n```'
+        self.assertEqual(score_code_correctness(text), 1.0)
+
+    def test_invalid_python(self):
+        text = '```python\ndef foo(\n    return broken\n```'
+        score = score_code_correctness(text)
+        self.assertLess(score, 1.0)
+
+
+class TestScorePair(unittest.TestCase):
+
+    def test_good_pair(self):
+        pair = {
+            "prompt": "How do I list files in Python?",
+            "response": 'Use `os.listdir()` or `pathlib.Path.iterdir()`. Example:\n```python\nfrom pathlib import Path\nfor f in Path(".").iterdir():\n    print(f)\n```'
+        }
+        scores = score_pair(pair)
+        self.assertGreater(scores["composite"], 0.4)
+
+    def test_bad_pair(self):
+        pair = {
+            "prompt": "How do I deploy?",
+            "response": "It depends. There are many ways. I don't know your setup."
+        }
+        scores = score_pair(pair)
+        # A generic no-code answer still gets full length and code scores, so
+        # its composite floor is ~0.5; specificity is the component that drops.
+        self.assertLess(scores["specificity"], 0.4)
+        self.assertLess(scores["composite"], 0.7)
+
+    def test_empty_pair_returns_zero(self):
+        scores = score_pair({})
+        self.assertEqual(scores["composite"], 0.0)
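+
+
+# End-to-end check: write a small JSONL fixture, filter it, and confirm both
+# keeps and removals. Threshold is 0.7 rather than the CLI default of 0.3:
+# short no-code pairs never score below ~0.5 composite, so 0.3 keeps them all.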
+class TestFilterPairs(unittest.TestCase):
+
+    def test_filter_removes_low_quality(self):
+        pairs = [
+            json.dumps({"prompt": "How?", "response": "Yes."}),
+            json.dumps({"prompt": "List files?", "response": 'Use os.listdir():\n```python\nimport os\nos.listdir(".")\n```'}),
+            json.dumps({"prompt": "Deploy?", "response": "It depends. I don't know."}),
+        ]
+
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
+            f.write("\n".join(pairs) + "\n")
+            input_path = f.name
+
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
+            output_path = f.name
+
+        try:
+            result = filter_pairs(input_path, output_path, threshold=0.7)
+            self.assertEqual(result["total"], 3)
+            self.assertGreater(result["kept"], 0)
+            self.assertGreater(result["removed"], 0)
+
+            # Verify output is valid JSONL
+            with open(output_path) as f:
+                for line in f:
+                    json.loads(line.strip())
+        finally:
+            os.unlink(input_path)
+            os.unlink(output_path)
+
+
+if __name__ == "__main__":
+    unittest.main()
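
Smoke test from the repo root (illustrative sample data; exact report numbers
depend on the pairs you feed in):

    printf '%s\n' \
      '{"prompt": "List files?", "response": "Use os.listdir() or pathlib; see https://docs.python.org/3/library/pathlib.html"}' \
      '{"prompt": "How do I deploy?", "response": "It depends."}' > /tmp/pairs.jsonl
    python3 scripts/quality_filter.py /tmp/pairs.jsonl -o /tmp/filtered.jsonl -t 0.7 --report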