#!/usr/bin/env python3
"""Heuristic quality filter for JSONL prompt/response training pairs.

Each non-blank input line is parsed as one JSON value.  Every entry is scored
by :func:`score_pair`; entries whose score reaches the threshold are written
to the output file, the rest are dropped and their reasons tallied.

Usage::

    quality_filter.py --input pairs.jsonl [--output out.jsonl]
                      [--threshold 0.4] [--dry-run]
"""
import argparse
import json
import os  # noqa: F401  (kept from the original module; currently unused)
import re
import sys  # noqa: F401  (kept from the original module; currently unused)
from collections import Counter
from collections.abc import Mapping
from pathlib import Path

# Keys probed, in order, for the prompt / response text of an entry.  The
# first key that is *present* wins, even when its value is empty.
_PROMPT_KEYS = ("prompt", "terse", "scenario")
_RESPONSE_KEYS = ("response", "rich", "content")

# A response that consists solely of one of these phrases is a "filler".
_FILLER_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r"^(sure|okay|yes|no|maybe)\.?$",
        r"^(i don.?t know|not sure|sorry)\.?$",
    )
)

# Markers of unfinished content, matched case-insensitively as substrings.
_PLACEHOLDERS = ("TODO", "FIXME", "PLACEHOLDER", "lorem ipsum", "TBD")


def _first_field(entry, keys):
    """Return the text of the first key in *keys* present in *entry*.

    A JSON ``null`` is treated as missing text (``""``) rather than being
    stringified to the literal ``"None"``.
    """
    for key in keys:
        if key in entry:
            value = entry[key]
            return "" if value is None else str(value)
    return ""


def score_pair(entry):
    """Score one prompt/response entry.

    Args:
        entry: A mapping holding the prompt under ``prompt``/``terse``/
            ``scenario`` and the response under ``response``/``rich``/
            ``content``.

    Returns:
        A ``(score, reasons)`` tuple.  ``score`` starts at 1.0, loses a fixed
        penalty for each triggered heuristic and is clamped at 0.0;
        ``reasons`` lists the triggered heuristics in evaluation order.
        Entries that are not mappings score 0.0 with ``["invalid_entry"]``;
        entries with a missing or blank prompt or response score 0.0 with
        ``["empty"]``.
    """
    if not isinstance(entry, Mapping):
        return 0.0, ["invalid_entry"]

    prompt = _first_field(entry, _PROMPT_KEYS)
    response = _first_field(entry, _RESPONSE_KEYS)
    prompt_norm = prompt.strip().lower()
    response_norm = response.strip().lower()
    # Whitespace-only text carries no content, so it counts as empty too.
    if not prompt_norm or not response_norm:
        return 0.0, ["empty"]

    score = 1.0
    reasons = []
    prompt_len = len(prompt)
    response_len = len(response)

    if response_len < 10:
        score -= 0.5
        reasons.append("response_too_short")
    elif response_len < prompt_len * 0.5:
        score -= 0.2
        reasons.append("response_shorter")

    if any(pattern.match(response_norm) for pattern in _FILLER_PATTERNS):
        score -= 0.3
        reasons.append("filler")

    if prompt_len < 5:
        score -= 0.4
        reasons.append("prompt_too_short")

    if prompt_norm == response_norm:
        score -= 0.5
        reasons.append("prompt_equals_response")

    haystack = (prompt + response).lower()
    for marker in _PLACEHOLDERS:
        if marker.lower() in haystack:
            # Only the first matching marker is penalised and reported.
            score -= 0.3
            reasons.append(f"placeholder:{marker}")
            break

    words = response.lower().split()
    if len(words) > 20:
        bigrams = list(zip(words, words[1:]))
        if len(set(bigrams)) / len(bigrams) < 0.3:
            score -= 0.3
            reasons.append("repetitive")

    # Penalties are multiples of 0.1; rounding removes binary float drift
    # (1.0 - 0.3 - 0.3 == 0.39999999999999997) so that a nominal score of 0.4
    # really compares equal to a 0.4 threshold.
    return max(0.0, round(score, 2)), reasons


def _load_entries(path):
    """Parse every non-blank line of *path* as JSON and return the values.

    Raises:
        json.JSONDecodeError: If a line is not valid JSON; the message names
            the file and the 1-based line number.
    """
    entries = []
    with open(path, encoding="utf-8") as handle:
        for lineno, line in enumerate(handle, start=1):
            if not line.strip():
                continue
            try:
                entries.append(json.loads(line))
            except json.JSONDecodeError as err:
                raise json.JSONDecodeError(
                    f"{err.msg} ({path}, line {lineno})", err.doc, err.pos
                ) from err
    return entries


def filter_file(inp, outp=None, threshold=0.4, dry_run=False):
    """Filter a JSONL file, keeping entries that score at least *threshold*.

    Args:
        inp: Path of the JSONL input file.
        outp: Path of the output file.  Defaults to ``<stem>_filtered<suffix>``
            next to the input.
        threshold: Minimum :func:`score_pair` score for an entry to be kept.
        dry_run: When true, only print the statistics; no file is written.

    Prints a summary (counts and removal reasons, most frequent first) to
    stdout.  Returns ``None``.  If the input has no entries, a notice is
    printed and no output file is created.
    """
    inp = Path(inp)
    if outp is None:
        outp = inp.parent / f"{inp.stem}_filtered{inp.suffix}"

    entries = _load_entries(inp)
    if not entries:
        print(f"No entries in {inp}")
        return

    print(f"Input: {inp} ({len(entries)} entries) Threshold: {threshold}")

    kept = []
    removed = 0
    removed_reasons = Counter()
    for entry in entries:
        score, reasons = score_pair(entry)
        if score >= threshold:
            kept.append(entry)
        else:
            removed += 1
            removed_reasons.update(reasons)

    # A dry run must leave the filesystem untouched.
    if not dry_run:
        with open(outp, "w", encoding="utf-8") as out:
            out.writelines(json.dumps(entry) + "\n" for entry in kept)

    print(f"Kept: {len(kept)} Removed: {removed}")
    if removed_reasons:
        print("Reasons:")
        for reason, count in removed_reasons.most_common():
            print(f" {reason}: {count}")
    if not dry_run:
        print(f"Output: {outp}")


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: Argument list to parse; defaults to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description="Filter low-quality prompt/response pairs from a JSONL file."
    )
    parser.add_argument("--input", required=True)
    parser.add_argument("--output")
    parser.add_argument("--threshold", type=float, default=0.4)
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args(argv)
    filter_file(args.input, args.output, args.threshold, args.dry_run)


if __name__ == "__main__":
    main()