feat: training data quality filter (#687)

Scores training pairs and removes low-quality entries.

Scoring criteria:
- Response length (too short = low quality)
- Prompt/response ratio (response should be substantive)
- Filler detection (sure, okay, i don't know)
- Placeholder detection (TODO, FIXME, PLACEHOLDER)
- Prompt=response detection (duplicates)
- Repetition detection (repeated bigrams)
- Prompt minimum length
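
For example, a one-word filler reply trips several checks at once: it loses
0.5 for being under 10 characters and another 0.3 for matching a filler
pattern, landing near 0.2, below the default 0.4 threshold. A quick sketch
against score_pair from the diff below (assumes the script is importable as
quality_filter):

  from quality_filter import score_pair
  score, reasons = score_pair({"prompt": "Explain gradient descent",
                               "response": "okay"})
  print(round(score, 2), reasons)  # 0.2 ['response_too_short', 'filler']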

Usage:
  python3 training/scripts/quality_filter.py --input data.jsonl --dry-run
  python3 training/scripts/quality_filter.py --input data.jsonl --threshold 0.5
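
Each input line is one JSON object. The script reads the "prompt" and
"response" keys, falling back to "terse"/"rich" or "scenario"/"content".
Illustrative lines (hypothetical values):

  {"prompt": "What is a learning rate?", "response": "The step size used ..."}
  {"scenario": "user asks for a refund", "content": "sure"}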

Closes #687
Alexander Whitestone
2026-04-16 00:45:50 -04:00
parent f5d456a5e8
commit 79d148ddd8

training/scripts/quality_filter.py

@@ -0,0 +1,92 @@
#!/usr/bin/env python3
"""Score training pairs and drop low-quality entries."""
import argparse
import json
import re
from pathlib import Path

FILLER_PATTERNS = [r"^(sure|okay|yes|no|maybe)\.?$",
                   r"^(i don.?t know|not sure|sorry)\.?$"]
PLACEHOLDERS = ["TODO", "FIXME", "PLACEHOLDER", "lorem ipsum", "TBD"]

def score_pair(entry):
    """Return (score, reasons): start at 1.0, subtract a penalty per issue."""
    reasons, score = [], 1.0
    # Accept alternate key names for the prompt/response fields.
    prompt = str(entry.get("prompt", entry.get("terse", entry.get("scenario", ""))))
    response = str(entry.get("response", entry.get("rich", entry.get("content", ""))))
    if not prompt or not response:
        return 0.0, ["empty"]
    rlen, plen = len(response), len(prompt)
    if rlen < 10:  # very short responses take the biggest penalty
        score -= 0.5
        reasons.append("response_too_short")
    elif rlen < plen * 0.5:
        score -= 0.2
        reasons.append("response_shorter")
    # Filler detection: non-answers like "sure" or "okay".
    for pattern in FILLER_PATTERNS:
        if re.match(pattern, response.strip().lower()):
            score -= 0.3
            reasons.append("filler")
            break
    if plen < 5:
        score -= 0.4
        reasons.append("prompt_too_short")
    if prompt.strip().lower() == response.strip().lower():
        score -= 0.5
        reasons.append("prompt_equals_response")
    for ph in PLACEHOLDERS:  # placeholder text anywhere in the pair
        if ph.lower() in (prompt + response).lower():
            score -= 0.3
            reasons.append(f"placeholder:{ph}")
            break
    # Repetition: few unique bigrams relative to total signals looping output.
    words = response.lower().split()
    if len(words) > 20:
        bigrams = [f"{words[i]} {words[i + 1]}" for i in range(len(words) - 1)]
        if bigrams and len(set(bigrams)) / len(bigrams) < 0.3:
            score -= 0.3
            reasons.append("repetitive")
    return max(0.0, score), reasons

def filter_file(inp, outp=None, threshold=0.4, dry_run=False):
    inp = Path(inp)
    if outp is None:
        outp = inp.parent / f"{inp.stem}_filtered{inp.suffix}"
    with open(inp) as f:
        entries = [json.loads(line) for line in f if line.strip()]
    if not entries:
        print(f"No entries in {inp}")
        return
    print(f"Input: {inp} ({len(entries)} entries) Threshold: {threshold}")
    kept, removed_reasons = [], {}
    for entry in entries:
        score, reasons = score_pair(entry)
        if score >= threshold:
            kept.append(entry)
        else:
            for reason in reasons:
                removed_reasons[reason] = removed_reasons.get(reason, 0) + 1
    print(f"Kept: {len(kept)} Removed: {len(entries) - len(kept)}")
    if removed_reasons:
        print("Reasons:")
        for reason, count in sorted(removed_reasons.items(), key=lambda x: -x[1]):
            print(f"  {reason}: {count}")
    if dry_run:  # dry run: report only, write nothing
        return
    with open(outp, "w") as out:
        out.writelines(json.dumps(e) + "\n" for e in kept)
    print(f"Output: {outp}")

def main():
    p = argparse.ArgumentParser(description="Filter low-quality training pairs.")
    p.add_argument("--input", required=True)
    p.add_argument("--output")
    p.add_argument("--threshold", type=float, default=0.4)
    p.add_argument("--dry-run", action="store_true")
    args = p.parse_args()
    filter_file(args.input, args.output, args.threshold, args.dry_run)

if __name__ == "__main__":
    main()
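
The filter can also be called from other scripts; a minimal sketch, assuming
quality_filter.py is on the import path:

  from quality_filter import filter_file

  # Preview: score and report, but write nothing.
  filter_file("data.jsonl", threshold=0.5, dry_run=True)
  # Real run: writes data_filtered.jsonl next to the input.
  filter_file("data.jsonl", threshold=0.5)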