feat: training data quality filter (#687)
Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 31s
Smoke Test / smoke (pull_request) Failing after 21s
Validate Config / YAML Lint (pull_request) Failing after 14s
Validate Config / JSON Validate (pull_request) Successful in 15s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 1m12s
PR Checklist / pr-checklist (pull_request) Failing after 5m45s
Validate Config / Shell Script Lint (pull_request) Failing after 46s
Validate Config / Cron Syntax Check (pull_request) Successful in 10s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 9s
Validate Training Data / validate (pull_request) Successful in 15s
Validate Config / Playbook Schema Validation (pull_request) Successful in 19s
Architecture Lint / Lint Repository (pull_request) Has been cancelled
Validate Config / Python Test Suite (pull_request) Has been cancelled
Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 31s
Smoke Test / smoke (pull_request) Failing after 21s
Validate Config / YAML Lint (pull_request) Failing after 14s
Validate Config / JSON Validate (pull_request) Successful in 15s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 1m12s
PR Checklist / pr-checklist (pull_request) Failing after 5m45s
Validate Config / Shell Script Lint (pull_request) Failing after 46s
Validate Config / Cron Syntax Check (pull_request) Successful in 10s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 9s
Validate Training Data / validate (pull_request) Successful in 15s
Validate Config / Playbook Schema Validation (pull_request) Successful in 19s
Architecture Lint / Lint Repository (pull_request) Has been cancelled
Validate Config / Python Test Suite (pull_request) Has been cancelled
Scores training pairs and removes low-quality entries.

Scoring criteria:
- Response length (too short = low quality)
- Prompt/response ratio (the response should be substantive)
- Filler detection ("sure", "okay", "i don't know")
- Placeholder detection (TODO, FIXME, PLACEHOLDER)
- Prompt = response detection (duplicates)
- Repetition detection (repeated bigrams)
- Prompt minimum length

Usage:
    python3 training/scripts/quality_filter.py --input data.jsonl --dry-run
    python3 training/scripts/quality_filter.py --input data.jsonl --threshold 0.5

Closes #687
This commit is contained in:
51
training/scripts/quality_filter.py
Executable file
51
training/scripts/quality_filter.py
Executable file
@@ -0,0 +1,51 @@
|
||||
#!/usr/bin/env python3
|
||||
import json,sys,re,os
|
||||
from pathlib import Path
|
||||
|
||||
def score_pair(entry):
|
||||
reasons = []
|
||||
score = 1.0
|
||||
prompt = str(entry.get("prompt", entry.get("terse", entry.get("scenario", ""))))
|
||||
response = str(entry.get("response", entry.get("rich", entry.get("content", ""))))
|
||||
if not prompt or not response: return 0.0, ["empty"]
|
||||
rlen = len(response)
|
||||
plen = len(prompt)
|
||||
if rlen < 10: score -= 0.5; reasons.append("response_too_short")
|
||||
elif rlen < plen * 0.5: score -= 0.2; reasons.append("response_shorter")
|
||||
fillers = [r"^(sure|okay|yes|no|maybe)\.?$", r"^(i don.?t know|not sure|sorry)\.?$"]
|
||||
for p in fillers:
|
||||
if re.match(p, response.strip().lower()): score -= 0.3; reasons.append("filler"); break
|
||||
if plen < 5: score -= 0.4; reasons.append("prompt_too_short")
|
||||
if prompt.strip().lower() == response.strip().lower(): score -= 0.5; reasons.append("prompt_equals_response")
|
||||
for ph in ["TODO","FIXME","PLACEHOLDER","lorem ipsum","TBD"]:
|
||||
if ph.lower() in (prompt+response).lower(): score -= 0.3; reasons.append(f"placeholder:{ph}"); break
|
||||
words = response.lower().split()
|
||||
if len(words) > 20:
|
||||
bigrams = [f"{words[i]} {words[i+1]}" for i in range(len(words)-1)]
|
||||
if bigrams and len(set(bigrams))/len(bigrams) < 0.3: score -= 0.3; reasons.append("repetitive")
|
||||
return max(0.0, score), reasons
|
||||
|
||||
def filter_file(inp, outp=None, threshold=0.4, dry_run=False, scorer=None):
    """Filter a JSONL file of training pairs by quality score.

    Args:
        inp: path to the input JSONL file (one JSON object per line).
        outp: path for the filtered output; defaults to
            "<stem>_filtered<suffix>" next to the input.
        threshold: minimum score (0.0-1.0) an entry needs to be kept.
        dry_run: if True, only report statistics -- nothing is written.
        scorer: optional callable(entry) -> (score, reasons); defaults
            to score_pair.
    """
    score = scorer if scorer is not None else score_pair
    inp = Path(inp)
    if outp is None:
        outp = inp.parent / f"{inp.stem}_filtered{inp.suffix}"

    # Read everything up front and close the handle before writing.
    with open(inp) as f:
        entries = [json.loads(line) for line in f if line.strip()]
    if not entries:
        print(f"No entries in {inp}")
        return
    print(f"Input: {inp} ({len(entries)} entries) Threshold: {threshold}")

    kept_entries = []
    removed = 0
    removed_reasons = {}
    for entry in entries:
        s, reasons = score(entry)
        if s >= threshold:
            kept_entries.append(entry)
        else:
            removed += 1
            for reason in reasons:
                removed_reasons[reason] = removed_reasons.get(reason, 0) + 1

    # BUG FIX: the original opened and wrote the output file even with
    # dry_run=True (only the final "Output:" print was suppressed).
    # A dry run must not touch the filesystem.
    if not dry_run:
        with open(outp, "w") as out:
            for entry in kept_entries:
                out.write(json.dumps(entry) + "\n")

    print(f"Kept: {len(kept_entries)} Removed: {removed}")
    if removed_reasons:
        print("Reasons:")
        for k, v in sorted(removed_reasons.items(), key=lambda x: -x[1]):
            print(f" {k}: {v}")
    if not dry_run:
        print(f"Output: {outp}")
|
||||
|
||||
def main():
    """Parse the command line and run the quality filter."""
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--input", required=True)
    parser.add_argument("--output")
    parser.add_argument("--threshold", type=float, default=0.4)
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    filter_file(args.input, args.output, args.threshold, args.dry_run)
|
||||
|
||||
# Entry-point guard: run the CLI only when executed as a script, not on import.
if __name__=="__main__": main()
|
||||
Reference in New Issue
Block a user