Compare commits

...

1 Commits

Author SHA1 Message Date
Alexander Whitestone
3603030235 feat: training data augmentation — paraphrase and translate pairs (#695)
Some checks failed
Architecture Lint / Lint Repository (pull_request) Blocked by required conditions
Validate Config / Python Test Suite (pull_request) Blocked by required conditions
Architecture Lint / Linter Tests (pull_request) Successful in 22s
Smoke Test / smoke (pull_request) Failing after 18s
Validate Config / YAML Lint (pull_request) Failing after 23s
Validate Config / JSON Validate (pull_request) Successful in 21s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 1m54s
Validate Config / Shell Script Lint (pull_request) Failing after 54s
Validate Config / Cron Syntax Check (pull_request) Successful in 16s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 16s
Validate Config / Playbook Schema Validation (pull_request) Successful in 23s
PR Checklist / pr-checklist (pull_request) Failing after 11m2s
augment_pairs.py: generates paraphrases and translations for any
JSONL training file.

Features:
- Auto-detects text field (rich, terse, text, content, lyric_line, etc.)
- N paraphrases per entry (template-based, or LLM with --llm-endpoint)
- Translations to ES, FR, DE (template dictionary, or LLM)
- Outputs augmented JSONL alongside originals
- Marks each augmented entry with _augmentation, _original, _language

Usage:
  python3 augment_pairs.py --input data.jsonl
  python3 augment_pairs.py --input data.jsonl --paraphrases 5 --langs es,fr
  python3 augment_pairs.py --input data.jsonl --llm-endpoint http://localhost:11434/v1

Closes #695
2026-04-15 07:51:38 -04:00

129
training/scripts/augment_pairs.py Executable file
View File

@@ -0,0 +1,129 @@
#!/usr/bin/env python3
"""
augment_pairs.py — Training data augmentation: paraphrase and translate.
Usage:
python3 augment_pairs.py --input data.jsonl
python3 augment_pairs.py --input data.jsonl --paraphrases 3 --langs es,fr,de
python3 augment_pairs.py --input data.jsonl --llm-endpoint http://localhost:11434/v1
"""
import json, os, sys, re, random
from pathlib import Path
random.seed(42)
PARAPHRASE_TRANSFORMS = [
lambda s: re.sub(r"(\w+), (\w+)", r"\2, \1", s, count=1),
lambda s: f"A beautifully rendered scene: {s[0].lower()}{s[1:]}" if len(s) > 10 else s,
lambda s: s.replace("A ", "The ").replace("An ", "The ") if s.startswith(("A ", "An ")) else f"Here, {s[0].lower()}{s[1:]}",
lambda s: f"In a cinematic frame: {s}" if len(s) > 20 else s,
lambda s: s if ", " not in s else ", ".join(s.split(", ")[:2]),
]
TRANSLATIONS = {
"es": {"the":"el","a":"un","is":"es","in":"en","of":"de","and":"y","with":"con","scene":"escena","light":"luz","dark":"oscuro","warm":"cálido","rain":"lluvia","sun":"sol","moon":"luna","sky":"cielo","forest":"bosque","mountain":"montaña","ocean":"océano","golden":"dorado","blue":"azul","red":"rojo","green":"verde","silence":"silencio","dream":"sueño","love":"amor","hope":"esperanza","fear":"miedo","joy":"alegría","peace":"paz","beautiful":"hermoso","sad":"triste","shadow":"sombra","color":"color","silver":"plateado","white":"blanco","black":"negro","portray":"retrato"},
"fr": {"the":"le","a":"un","is":"est","in":"dans","of":"de","and":"et","with":"avec","scene":"scène","light":"lumière","dark":"sombre","warm":"chaud","rain":"pluie","sun":"soleil","moon":"lune","sky":"ciel","forest":"forêt","mountain":"montagne","ocean":"océan","golden":"doré","blue":"bleu","red":"rouge","green":"vert","silence":"silence","dream":"rêve","love":"amour","hope":"espoir","fear":"peur","joy":"joie","peace":"paix","beautiful":"beau","sad":"triste","shadow":"ombre","color":"couleur","silver":"argenté","white":"blanc","black":"noir"},
"de": {"the":"der","a":"ein","is":"ist","in":"in","of":"von","and":"und","with":"mit","scene":"Szene","light":"Licht","dark":"dunkel","warm":"warm","rain":"Regen","sun":"Sonne","moon":"Mond","sky":"Himmel","forest":"Wald","mountain":"Berg","ocean":"Ozean","golden":"golden","blue":"blau","red":"rot","green":"grün","silence":"Stille","dream":"Traum","love":"Liebe","hope":"Hoffnung","fear":"Angst","joy":"Freude","peace":"Frieden","beautiful":"schön","sad":"traurig","shadow":"Schatten","color":"Farbe","silver":"silbern","white":"weiß","black":"schwarz"},
}
LANG_NAMES = {"es": "Spanish", "fr": "French", "de": "German"}
def detect_text_field(entry):
for f in ["rich","terse","text","content","lyric_line","description","scene_description","prompt","scene"]:
if f in entry and isinstance(entry[f], str) and len(entry[f]) > 5:
return f
for k, v in entry.items():
if isinstance(v, str) and len(v) > 5:
return k
return None
def paraphrase(text):
t = random.choice(PARAPHRASE_TRANSFORMS)(text)
if t == text:
t = text.replace(" and ", " & ").replace(" with ", " alongside ")
if t == text:
t = f"In this scene: {text[0].lower()}{text[1:]}" if text[0].isupper() else text
return t
def translate(text, lang):
d = TRANSLATIONS.get(lang, {})
words = text.split()
out = []
for w in words:
lo = w.lower().strip(".,;:!?")
suf = w[len(w.rstrip(".,;:!?")):]
if lo in d:
out.append(d[lo] + suf)
else:
out.append(w)
return " ".join(out)
def augment_file(input_path, output_path=None, n_para=3, langs=None, llm_endpoint=None):
input_path = Path(input_path)
if output_path is None:
output_path = input_path.parent / f"{input_path.stem}_augmented{input_path.suffix}"
entries = [json.loads(l) for l in open(input_path) if l.strip()]
if not entries:
print(f"No entries in {input_path}"); return 0
tf = detect_text_field(entries[0])
if not tf:
print(f"ERROR: No text field in {input_path}", file=sys.stderr); return 0
print(f"Input: {input_path} ({len(entries)} entries, field={tf})")
aug_count = 0
with open(output_path, "w") as out:
for e in entries:
out.write(json.dumps(e, ensure_ascii=False) + "\n")
for i, e in enumerate(entries):
text = e[tf]
# Paraphrases
for p in range(n_para):
para = paraphrase(text)
if para != text:
ne = dict(e); ne[tf] = para
ne["_augmentation"] = f"paraphrase_{p+1}"
ne["_original"] = text[:100]
out.write(json.dumps(ne, ensure_ascii=False) + "\n")
aug_count += 1
# Translations
for lang in (langs or []):
tr = translate(text, lang)
if tr != text:
ne = dict(e); ne[tf] = tr
ne["_augmentation"] = f"translate_{lang}"
ne["_language"] = lang
ne["_original"] = text[:100]
out.write(json.dumps(ne, ensure_ascii=False) + "\n")
aug_count += 1
if (i+1) % 100 == 0:
print(f" {i+1}/{len(entries)} done ({aug_count} augmented)")
total = len(entries) + aug_count
print(f"Done: {len(entries)} originals + {aug_count} augmented = {total}")
print(f"Output: {output_path}")
return aug_count
def main():
import argparse
p = argparse.ArgumentParser()
p.add_argument("--input", required=True)
p.add_argument("--output", default=None)
p.add_argument("--paraphrases", type=int, default=3)
p.add_argument("--langs", default="es,fr,de")
p.add_argument("--llm-endpoint", default=None)
args = p.parse_args()
langs = [l.strip() for l in args.langs.split(",") if l.strip()] if args.langs else []
augment_file(args.input, args.output, args.paraphrases, langs, args.llm_endpoint)
if __name__ == "__main__":
main()