feat: training data quality filter (#687)

Scores training pairs and removes low-quality entries.

Scoring criteria:
- Response length (too short = low quality)
- Prompt/response ratio (response should be substantive)
- Filler detection (sure, okay, i don't know)
- Placeholder detection (TODO, FIXME, PLACEHOLDER)
- Prompt=response detection (duplicates)
- Repetition detection (repeated bigrams)
- Prompt minimum length
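
For example, a one-word filler reply trips several checks at once: it loses
0.5 for being under 10 characters and another 0.3 for matching a filler
pattern, landing near 0.2, below the default 0.4 threshold. A quick sketch
against score_pair from the diff below (assumes the script is importable as
quality_filter):

  from quality_filter import score_pair
  score, reasons = score_pair({"prompt": "Explain gradient descent",
                               "response": "okay"})
  print(round(score, 2), reasons)  # 0.2 ['response_too_short', 'filler']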

Usage:
  python3 training/scripts/quality_filter.py --input data.jsonl --dry-run
  python3 training/scripts/quality_filter.py --input data.jsonl --threshold 0.5
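
Each input line is one JSON object. The script reads the "prompt" and
"response" keys, falling back to "terse"/"rich" or "scenario"/"content".
Illustrative lines (hypothetical values):

  {"prompt": "What is a learning rate?", "response": "The step size used ..."}
  {"scenario": "user asks for a refund", "content": "sure"}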

Closes #687
Alexander Whitestone
2026-04-16 00:45:50 -04:00
parent f5d456a5e8
commit 79d148ddd8

training/scripts/quality_filter.py

@@ -0,0 +1,92 @@
#!/usr/bin/env python3
"""Score training pairs and drop low-quality entries."""
import argparse
import json
import re
from pathlib import Path

FILLER_PATTERNS = [r"^(sure|okay|yes|no|maybe)\.?$",
                   r"^(i don.?t know|not sure|sorry)\.?$"]
PLACEHOLDERS = ["TODO", "FIXME", "PLACEHOLDER", "lorem ipsum", "TBD"]

def score_pair(entry):
    """Return (score, reasons): start at 1.0, subtract a penalty per issue."""
    reasons, score = [], 1.0
    # Accept alternate key names for the prompt/response fields.
    prompt = str(entry.get("prompt", entry.get("terse", entry.get("scenario", ""))))
    response = str(entry.get("response", entry.get("rich", entry.get("content", ""))))
    if not prompt or not response:
        return 0.0, ["empty"]
    rlen, plen = len(response), len(prompt)
    if rlen < 10:  # very short responses take the biggest penalty
        score -= 0.5
        reasons.append("response_too_short")
    elif rlen < plen * 0.5:
        score -= 0.2
        reasons.append("response_shorter")
    # Filler detection: non-answers like "sure" or "okay".
    for pattern in FILLER_PATTERNS:
        if re.match(pattern, response.strip().lower()):
            score -= 0.3
            reasons.append("filler")
            break
    if plen < 5:
        score -= 0.4
        reasons.append("prompt_too_short")
    if prompt.strip().lower() == response.strip().lower():
        score -= 0.5
        reasons.append("prompt_equals_response")
    for ph in PLACEHOLDERS:  # placeholder text anywhere in the pair
        if ph.lower() in (prompt + response).lower():
            score -= 0.3
            reasons.append(f"placeholder:{ph}")
            break
    # Repetition: few unique bigrams relative to total signals looping output.
    words = response.lower().split()
    if len(words) > 20:
        bigrams = [f"{words[i]} {words[i + 1]}" for i in range(len(words) - 1)]
        if bigrams and len(set(bigrams)) / len(bigrams) < 0.3:
            score -= 0.3
            reasons.append("repetitive")
    return max(0.0, score), reasons

def filter_file(inp, outp=None, threshold=0.4, dry_run=False):
    inp = Path(inp)
    if outp is None:
        outp = inp.parent / f"{inp.stem}_filtered{inp.suffix}"
    with open(inp) as f:
        entries = [json.loads(line) for line in f if line.strip()]
    if not entries:
        print(f"No entries in {inp}")
        return
    print(f"Input: {inp} ({len(entries)} entries) Threshold: {threshold}")
    kept, removed_reasons = [], {}
    for entry in entries:
        score, reasons = score_pair(entry)
        if score >= threshold:
            kept.append(entry)
        else:
            for reason in reasons:
                removed_reasons[reason] = removed_reasons.get(reason, 0) + 1
    print(f"Kept: {len(kept)} Removed: {len(entries) - len(kept)}")
    if removed_reasons:
        print("Reasons:")
        for reason, count in sorted(removed_reasons.items(), key=lambda x: -x[1]):
            print(f"  {reason}: {count}")
    if dry_run:  # dry run: report only, write nothing
        return
    with open(outp, "w") as out:
        out.writelines(json.dumps(e) + "\n" for e in kept)
    print(f"Output: {outp}")

def main():
    p = argparse.ArgumentParser(description="Filter low-quality training pairs.")
    p.add_argument("--input", required=True)
    p.add_argument("--output")
    p.add_argument("--threshold", type=float, default=0.4)
    p.add_argument("--dry-run", action="store_true")
    args = p.parse_args()
    filter_file(args.input, args.output, args.threshold, args.dry_run)

if __name__ == "__main__":
    main()
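
The filter can also be called from other scripts; a minimal sketch, assuming
quality_filter.py is on the import path:

  from quality_filter import filter_file

  # Preview: score and report, but write nothing.
  filter_file("data.jsonl", threshold=0.5, dry_run=True)
  # Real run: writes data_filtered.jsonl next to the input.
  filter_file("data.jsonl", threshold=0.5)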