Compare commits

..

7 Commits

Author SHA1 Message Date
817785d763 Merge pull request 'feat: training data augmentation — paraphrase and translate pairs (#695)' (#732) from fix/695 into main 2026-04-15 11:56:28 +00:00
Alexander Whitestone
3603030235 feat: training data augmentation — paraphrase and translate pairs (#695)
Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 22s
Smoke Test / smoke (pull_request) Failing after 18s
Validate Config / YAML Lint (pull_request) Failing after 23s
Validate Config / JSON Validate (pull_request) Successful in 21s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 1m54s
Validate Config / Shell Script Lint (pull_request) Failing after 54s
Validate Config / Cron Syntax Check (pull_request) Successful in 16s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 16s
Validate Config / Playbook Schema Validation (pull_request) Successful in 23s
PR Checklist / pr-checklist (pull_request) Failing after 11m2s
Architecture Lint / Lint Repository (pull_request) Has been cancelled
Validate Config / Python Test Suite (pull_request) Has been cancelled
augment_pairs.py: generates paraphrases and translations for any
JSONL training file.

Features:
- Auto-detects text field (rich, terse, text, content, lyric_line, etc.)
- N paraphrases per entry (template-based, or LLM with --llm-endpoint)
- Translations to ES, FR, DE (template dictionary, or LLM)
- Outputs augmented JSONL alongside originals
- Marks each augmented entry with _augmentation, _original, _language

Usage:
  python3 augment_pairs.py --input data.jsonl
  python3 augment_pairs.py --input data.jsonl --paraphrases 5 --langs es,fr
  python3 augment_pairs.py --input data.jsonl --llm-endpoint http://localhost:11434/v1

Closes #695
2026-04-15 07:51:38 -04:00
35a191f7b1 Merge PR #725: feat: Provider health monitor with auto-switch (#509) 2026-04-15 06:10:45 +00:00
e987e1b870 Merge PR #726: feat: Pre-flight provider check for session launch (#508) 2026-04-15 06:10:41 +00:00
19278513b4 Merge PR #727: feat: Three.js-specific glitch detection patterns (#543) 2026-04-15 06:10:38 +00:00
b6e3a647b0 feat: add pre-flight provider check script (#508)
Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 29s
PR Checklist / pr-checklist (pull_request) Failing after 7m23s
Smoke Test / smoke (pull_request) Failing after 20s
Validate Config / YAML Lint (pull_request) Failing after 14s
Validate Config / JSON Validate (pull_request) Successful in 15s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 1m1s
Validate Config / Shell Script Lint (pull_request) Failing after 46s
Validate Config / Cron Syntax Check (pull_request) Successful in 9s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 10s
Validate Config / Playbook Schema Validation (pull_request) Successful in 28s
Architecture Lint / Lint Repository (pull_request) Has been cancelled
Validate Config / Python Test Suite (pull_request) Has been cancelled
- Checks OpenRouter balance via /api/v1/auth/key
- Tests Nous and Anthropic API keys
- Verifies Ollama is running
- Pre-flight check before session launch
- Returns exit code for automation

Closes #508
2026-04-15 03:55:04 +00:00
e14158676d feat: add provider health monitor script (#509)
Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 44s
Smoke Test / smoke (pull_request) Failing after 36s
Validate Config / YAML Lint (pull_request) Failing after 21s
Validate Config / JSON Validate (pull_request) Successful in 28s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 2m36s
Validate Config / Shell Script Lint (pull_request) Failing after 1m3s
Validate Config / Cron Syntax Check (pull_request) Successful in 13s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 12s
PR Checklist / pr-checklist (pull_request) Failing after 6m15s
Validate Config / Playbook Schema Validation (pull_request) Successful in 28s
Architecture Lint / Lint Repository (pull_request) Has been cancelled
Validate Config / Python Test Suite (pull_request) Has been cancelled
- Tests all configured providers
- Maintains health map in tmux-state.json
- Auto-switches profiles to working providers
- Supports --daemon and --status modes

Closes #509
2026-04-15 03:48:37 +00:00
3 changed files with 811 additions and 0 deletions

View File

@@ -0,0 +1,271 @@
#!/usr/bin/env python3
"""
Pre-Flight Provider Check Script
Issue #508: [Robustness] Credential drain detection — provider health checks
Pre-flight check before session launch: verifies provider credentials and balance.
Usage:
python3 preflight-provider-check.py # Check all providers
python3 preflight-provider-check.py --launch # Check and return exit code
python3 preflight-provider-check.py --balance # Check OpenRouter balance
"""
import os, sys, json, yaml, urllib.request
from datetime import datetime, timezone
from pathlib import Path
# Configuration
HERMES_HOME = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes"))
LOG_DIR = Path.home() / ".local" / "timmy" / "fleet-health"
LOG_FILE = LOG_DIR / "preflight-check.log"
def log(msg):
"""Log message to file and optionally console."""
timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
log_entry = "[" + timestamp + "] " + msg
LOG_DIR.mkdir(parents=True, exist_ok=True)
with open(LOG_FILE, "a") as f:
f.write(log_entry + "\n")
if "--quiet" not in sys.argv:
print(log_entry)
def get_provider_api_key(provider):
"""Get API key for a provider from .env or environment."""
env_file = HERMES_HOME / ".env"
if env_file.exists():
with open(env_file) as f:
for line in f:
line = line.strip()
if line.startswith(provider.upper() + "_API_KEY="):
return line.split("=", 1)[1].strip().strip("'\"")
return os.environ.get(provider.upper() + "_API_KEY")
def check_openrouter_balance(api_key):
"""Check OpenRouter balance via /api/v1/auth/key."""
if not api_key:
return False, "No API key", 0
try:
req = urllib.request.Request(
"https://openrouter.ai/api/v1/auth/key",
headers={"Authorization": "Bearer " + api_key}
)
resp = urllib.request.urlopen(req, timeout=10)
data = json.loads(resp.read())
# Check for credits
credits = data.get("data", {}).get("limit", 0)
usage = data.get("data", {}).get("usage", 0)
remaining = credits - usage if credits else None
if remaining is not None and remaining <= 0:
return False, "No credits remaining", 0
elif remaining is not None:
return True, "Credits available", remaining
else:
return True, "Unlimited or unknown balance", None
except urllib.error.HTTPError as e:
if e.code == 401:
return False, "Invalid API key", 0
else:
return False, "HTTP " + str(e.code), 0
except Exception as e:
return False, str(e)[:100], 0
def check_nous_key(api_key):
"""Check Nous API key with minimal test call."""
if not api_key:
return False, "No API key"
try:
req = urllib.request.Request(
"https://inference.nousresearch.com/v1/models",
headers={"Authorization": "Bearer " + api_key}
)
resp = urllib.request.urlopen(req, timeout=10)
if resp.status == 200:
return True, "Valid key"
else:
return False, "HTTP " + str(resp.status)
except urllib.error.HTTPError as e:
if e.code == 401:
return False, "Invalid API key"
elif e.code == 403:
return False, "Forbidden"
else:
return False, "HTTP " + str(e.code)
except Exception as e:
return False, str(e)[:100]
def check_anthropic_key(api_key):
"""Check Anthropic API key with minimal test call."""
if not api_key:
return False, "No API key"
try:
req = urllib.request.Request(
"https://api.anthropic.com/v1/models",
headers={
"x-api-key": api_key,
"anthropic-version": "2023-06-01"
}
)
resp = urllib.request.urlopen(req, timeout=10)
if resp.status == 200:
return True, "Valid key"
else:
return False, "HTTP " + str(resp.status)
except urllib.error.HTTPError as e:
if e.code == 401:
return False, "Invalid API key"
elif e.code == 403:
return False, "Forbidden"
else:
return False, "HTTP " + str(e.code)
except Exception as e:
return False, str(e)[:100]
def check_ollama():
"""Check if Ollama is running."""
try:
req = urllib.request.Request("http://localhost:11434/api/tags")
resp = urllib.request.urlopen(req, timeout=5)
if resp.status == 200:
data = json.loads(resp.read())
models = data.get("models", [])
return True, str(len(models)) + " models loaded"
else:
return False, "HTTP " + str(resp.status)
except Exception as e:
return False, str(e)[:100]
def get_configured_provider():
"""Get the configured provider from global config."""
config_file = HERMES_HOME / "config.yaml"
if not config_file.exists():
return None
try:
with open(config_file) as f:
config = yaml.safe_load(f)
model_config = config.get("model", {})
if isinstance(model_config, dict):
return model_config.get("provider")
except:
pass
return None
def run_preflight_check():
"""Run pre-flight check on all providers."""
log("=== Pre-Flight Provider Check ===")
results = {}
# Check OpenRouter
or_key = get_provider_api_key("openrouter")
or_ok, or_msg, or_balance = check_openrouter_balance(or_key)
results["openrouter"] = {"healthy": or_ok, "message": or_msg, "balance": or_balance}
# Check Nous
nous_key = get_provider_api_key("nous")
nous_ok, nous_msg = check_nous_key(nous_key)
results["nous"] = {"healthy": nous_ok, "message": nous_msg}
# Check Anthropic
anthropic_key = get_provider_api_key("anthropic")
anthropic_ok, anthropic_msg = check_anthropic_key(anthropic_key)
results["anthropic"] = {"healthy": anthropic_ok, "message": anthropic_msg}
# Check Ollama
ollama_ok, ollama_msg = check_ollama()
results["ollama"] = {"healthy": ollama_ok, "message": ollama_msg}
# Get configured provider
configured = get_configured_provider()
# Summary
healthy_count = sum(1 for r in results.values() if r["healthy"])
total_count = len(results)
log("Results: " + str(healthy_count) + "/" + str(total_count) + " providers healthy")
for provider, result in results.items():
status = "HEALTHY" if result["healthy"] else "UNHEALTHY"
extra = ""
if provider == "openrouter" and result.get("balance") is not None:
extra = " (balance: " + str(result["balance"]) + ")"
log(" " + provider + ": " + status + " - " + result["message"] + extra)
if configured:
log("Configured provider: " + configured)
if configured in results and not results[configured]["healthy"]:
log("WARNING: Configured provider " + configured + " is UNHEALTHY!")
return results, configured
def check_launch_readiness():
"""Check if we're ready to launch sessions."""
results, configured = run_preflight_check()
# Check if configured provider is healthy
if configured and configured in results:
if not results[configured]["healthy"]:
log("LAUNCH BLOCKED: Configured provider " + configured + " is unhealthy")
return False, configured + " is unhealthy"
# Check if at least one provider is healthy
healthy_providers = [p for p, r in results.items() if r["healthy"]]
if not healthy_providers:
log("LAUNCH BLOCKED: No healthy providers available")
return False, "No healthy providers"
log("LAUNCH READY: " + str(len(healthy_providers)) + " healthy providers available")
return True, "Ready"
def show_balance():
"""Show OpenRouter balance."""
api_key = get_provider_api_key("openrouter")
if not api_key:
print("No OpenRouter API key found")
return
ok, msg, balance = check_openrouter_balance(api_key)
if ok:
if balance is not None:
print("OpenRouter balance: " + str(balance) + " credits")
else:
print("OpenRouter: " + msg)
else:
print("OpenRouter: " + msg)
def main():
if "--balance" in sys.argv:
show_balance()
elif "--launch" in sys.argv:
ready, message = check_launch_readiness()
if ready:
print("READY")
sys.exit(0)
else:
print("BLOCKED: " + message)
sys.exit(1)
else:
run_preflight_check()
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,411 @@
#!/usr/bin/env python3
"""
Provider Health Monitor Script
Issue #509: [Robustness] Provider-aware profile config — auto-switch on failure
Monitors provider health and automatically switches profiles to working providers.
Usage:
python3 provider-health-monitor.py # Run once
python3 provider-health-monitor.py --daemon # Run continuously
python3 provider-health-monitor.py --status # Show provider health
"""
import os, sys, json, yaml, urllib.request, time
from datetime import datetime, timezone
from pathlib import Path
# Configuration
HERMES_HOME = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes"))
PROFILES_DIR = HERMES_HOME / "profiles"
LOG_DIR = Path.home() / ".local" / "timmy" / "fleet-health"
STATE_FILE = LOG_DIR / "tmux-state.json"
LOG_FILE = LOG_DIR / "provider-health.log"
# Provider test endpoints
PROVIDER_TESTS = {
"openrouter": {
"url": "https://openrouter.ai/api/v1/models",
"method": "GET",
"headers": lambda api_key: {"Authorization": "Bearer " + api_key},
"timeout": 10
},
"anthropic": {
"url": "https://api.anthropic.com/v1/models",
"method": "GET",
"headers": lambda api_key: {"x-api-key": api_key, "anthropic-version": "2023-06-01"},
"timeout": 10
},
"nous": {
"url": "https://inference.nousresearch.com/v1/models",
"method": "GET",
"headers": lambda api_key: {"Authorization": "Bearer " + api_key},
"timeout": 10
},
"kimi-coding": {
"url": "https://api.kimi.com/coding/v1/models",
"method": "GET",
"headers": lambda api_key: {"x-api-key": api_key, "x-api-provider": "kimi-coding"},
"timeout": 10
},
"ollama": {
"url": "http://localhost:11434/api/tags",
"method": "GET",
"headers": lambda api_key: {},
"timeout": 5
}
}
def log(msg):
"""Log message to file and optionally console."""
timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
log_entry = "[" + timestamp + "] " + msg
LOG_DIR.mkdir(parents=True, exist_ok=True)
with open(LOG_FILE, "a") as f:
f.write(log_entry + "\n")
if "--quiet" not in sys.argv:
print(log_entry)
def get_provider_api_key(provider):
"""Get API key for a provider from .env or environment."""
env_file = HERMES_HOME / ".env"
if env_file.exists():
with open(env_file) as f:
for line in f:
line = line.strip()
if line.startswith(provider.upper() + "_API_KEY="):
return line.split("=", 1)[1].strip().strip("'\"")
return os.environ.get(provider.upper() + "_API_KEY")
def test_provider(provider, api_key=None):
"""Test if a provider is healthy."""
config = PROVIDER_TESTS.get(provider)
if not config:
return False, "Unknown provider: " + provider
headers = config["headers"](api_key or "")
try:
req = urllib.request.Request(
config["url"],
headers=headers,
method=config["method"]
)
resp = urllib.request.urlopen(req, timeout=config["timeout"])
if resp.status == 200:
return True, "Healthy"
else:
return False, "HTTP " + str(resp.status)
except urllib.error.HTTPError as e:
if e.code == 401:
return False, "Unauthorized (401)"
elif e.code == 403:
return False, "Forbidden (403)"
elif e.code == 429:
return True, "Rate limited but accessible"
else:
return False, "HTTP " + str(e.code)
except Exception as e:
return False, str(e)[:100]
def get_all_providers():
"""Get all providers from profiles and global config."""
providers = set()
# Global config
global_config = HERMES_HOME / "config.yaml"
if global_config.exists():
try:
with open(global_config) as f:
config = yaml.safe_load(f)
# Primary model provider
model_config = config.get("model", {})
if isinstance(model_config, dict):
provider = model_config.get("provider", "")
if provider:
providers.add(provider)
# Auxiliary providers
auxiliary = config.get("auxiliary", {})
for aux_config in auxiliary.values():
if isinstance(aux_config, dict):
provider = aux_config.get("provider", "")
if provider and provider != "auto":
providers.add(provider)
except:
pass
# Profile configs
if PROFILES_DIR.exists():
for profile_dir in PROFILES_DIR.iterdir():
if profile_dir.is_dir():
config_file = profile_dir / "config.yaml"
if config_file.exists():
try:
with open(config_file) as f:
config = yaml.safe_load(f)
model_config = config.get("model", {})
if isinstance(model_config, dict):
provider = model_config.get("provider", "")
if provider:
providers.add(provider)
auxiliary = config.get("auxiliary", {})
for aux_config in auxiliary.values():
if isinstance(aux_config, dict):
provider = aux_config.get("provider", "")
if provider and provider != "auto":
providers.add(provider)
except:
pass
# Add common providers even if not configured
providers.update(["openrouter", "nous", "ollama"])
return list(providers)
def build_health_map():
"""Build a health map of all providers."""
providers = get_all_providers()
health_map = {}
log("Testing " + str(len(providers)) + " providers...")
for provider in providers:
api_key = get_provider_api_key(provider)
healthy, message = test_provider(provider, api_key)
health_map[provider] = {
"healthy": healthy,
"message": message,
"last_test": datetime.now(timezone.utc).isoformat(),
"api_key_present": bool(api_key)
}
status = "HEALTHY" if healthy else "UNHEALTHY"
log(" " + provider + ": " + status + " - " + message)
return health_map
def get_fallback_providers(health_map):
"""Get list of healthy providers in priority order."""
# Priority order: nous, openrouter, ollama, others
priority_order = ["nous", "openrouter", "ollama", "anthropic", "kimi-coding"]
healthy = []
for provider in priority_order:
if provider in health_map and health_map[provider]["healthy"]:
healthy.append(provider)
# Add any other healthy providers not in priority list
for provider, info in health_map.items():
if info["healthy"] and provider not in healthy:
healthy.append(provider)
return healthy
def update_profile_config(profile_name, new_provider):
"""Update a profile's config to use a new provider."""
config_file = PROFILES_DIR / profile_name / "config.yaml"
if not config_file.exists():
return False, "Config file not found"
try:
with open(config_file) as f:
config = yaml.safe_load(f)
# Update model provider
if "model" not in config:
config["model"] = {}
old_provider = config["model"].get("provider", "unknown")
config["model"]["provider"] = new_provider
# Update auxiliary providers if they were using the old provider
auxiliary = config.get("auxiliary", {})
for aux_name, aux_config in auxiliary.items():
if isinstance(aux_config, dict) and aux_config.get("provider") == old_provider:
aux_config["provider"] = new_provider
# Write back
with open(config_file, "w") as f:
yaml.dump(config, f, default_flow_style=False)
log("Updated " + profile_name + ": " + old_provider + " -> " + new_provider)
return True, "Updated"
except Exception as e:
return False, str(e)
def check_profiles(health_map):
"""Check all profiles and update unhealthy providers."""
if not PROFILES_DIR.exists():
return
fallback_providers = get_fallback_providers(health_map)
if not fallback_providers:
log("CRITICAL: No healthy providers available!")
return
updated_profiles = []
for profile_dir in PROFILES_DIR.iterdir():
if not profile_dir.is_dir():
continue
profile_name = profile_dir.name
config_file = profile_dir / "config.yaml"
if not config_file.exists():
continue
try:
with open(config_file) as f:
config = yaml.safe_load(f)
model_config = config.get("model", {})
if not isinstance(model_config, dict):
continue
current_provider = model_config.get("provider", "")
if not current_provider:
continue
# Check if current provider is healthy
if current_provider in health_map and health_map[current_provider]["healthy"]:
continue # Provider is healthy, no action needed
# Find best fallback
best_fallback = None
for provider in fallback_providers:
if provider != current_provider:
best_fallback = provider
break
if not best_fallback:
log("No fallback for " + profile_name + " (current: " + current_provider + ")")
continue
# Update profile
success, message = update_profile_config(profile_name, best_fallback)
if success:
updated_profiles.append({
"profile": profile_name,
"old_provider": current_provider,
"new_provider": best_fallback
})
except Exception as e:
log("Error processing " + profile_name + ": " + str(e))
return updated_profiles
def load_state():
"""Load state from tmux-state.json."""
if STATE_FILE.exists():
try:
with open(STATE_FILE) as f:
return json.load(f)
except:
pass
return {}
def save_state(state):
"""Save state to tmux-state.json."""
LOG_DIR.mkdir(parents=True, exist_ok=True)
with open(STATE_FILE, "w") as f:
json.dump(state, f, indent=2)
def run_once():
"""Run provider health check once."""
log("=== Provider Health Check ===")
state = load_state()
# Build health map
health_map = build_health_map()
# Check profiles and update if needed
updated_profiles = check_profiles(health_map)
# Update state
state["provider_health"] = health_map
state["last_provider_check"] = datetime.now(timezone.utc).isoformat()
if updated_profiles:
state["last_profile_updates"] = updated_profiles
save_state(state)
# Summary
healthy_count = sum(1 for p in health_map.values() if p["healthy"])
total_count = len(health_map)
log("Health: " + str(healthy_count) + "/" + str(total_count) + " providers healthy")
if updated_profiles:
log("Updated " + str(len(updated_profiles)) + " profiles:")
for update in updated_profiles:
log(" " + update["profile"] + ": " + update["old_provider"] + " -> " + update["new_provider"])
def show_status():
"""Show provider health status."""
state = load_state()
health_map = state.get("provider_health", {})
if not health_map:
print("No provider health data available. Run without --status first.")
return
print("Provider Health (last updated: " + str(state.get("last_provider_check", "unknown")) + ")")
print("=" * 80)
for provider, info in sorted(health_map.items()):
status = "HEALTHY" if info["healthy"] else "UNHEALTHY"
message = info.get("message", "")
api_key = "yes" if info.get("api_key_present") else "no"
print(provider.ljust(20) + " " + status.ljust(10) + " API key: " + api_key + " - " + message)
# Show recent updates
updates = state.get("last_profile_updates", [])
if updates:
print()
print("Recent Profile Updates:")
for update in updates:
print(" " + update["profile"] + ": " + update["old_provider"] + " -> " + update["new_provider"])
def daemon_mode():
"""Run continuously."""
log("Starting provider health daemon (check every 300s)")
while True:
try:
run_once()
time.sleep(300) # Check every 5 minutes
except KeyboardInterrupt:
log("Daemon stopped by user")
break
except Exception as e:
log("Error: " + str(e))
time.sleep(60)
def main():
if "--status" in sys.argv:
show_status()
elif "--daemon" in sys.argv:
daemon_mode()
else:
run_once()
if __name__ == "__main__":
main()

129
training/scripts/augment_pairs.py Executable file
View File

@@ -0,0 +1,129 @@
#!/usr/bin/env python3
"""
augment_pairs.py — Training data augmentation: paraphrase and translate.
Usage:
python3 augment_pairs.py --input data.jsonl
python3 augment_pairs.py --input data.jsonl --paraphrases 3 --langs es,fr,de
python3 augment_pairs.py --input data.jsonl --llm-endpoint http://localhost:11434/v1
"""
import json, os, sys, re, random
from pathlib import Path
random.seed(42)
PARAPHRASE_TRANSFORMS = [
lambda s: re.sub(r"(\w+), (\w+)", r"\2, \1", s, count=1),
lambda s: f"A beautifully rendered scene: {s[0].lower()}{s[1:]}" if len(s) > 10 else s,
lambda s: s.replace("A ", "The ").replace("An ", "The ") if s.startswith(("A ", "An ")) else f"Here, {s[0].lower()}{s[1:]}",
lambda s: f"In a cinematic frame: {s}" if len(s) > 20 else s,
lambda s: s if ", " not in s else ", ".join(s.split(", ")[:2]),
]
TRANSLATIONS = {
"es": {"the":"el","a":"un","is":"es","in":"en","of":"de","and":"y","with":"con","scene":"escena","light":"luz","dark":"oscuro","warm":"cálido","rain":"lluvia","sun":"sol","moon":"luna","sky":"cielo","forest":"bosque","mountain":"montaña","ocean":"océano","golden":"dorado","blue":"azul","red":"rojo","green":"verde","silence":"silencio","dream":"sueño","love":"amor","hope":"esperanza","fear":"miedo","joy":"alegría","peace":"paz","beautiful":"hermoso","sad":"triste","shadow":"sombra","color":"color","silver":"plateado","white":"blanco","black":"negro","portray":"retrato"},
"fr": {"the":"le","a":"un","is":"est","in":"dans","of":"de","and":"et","with":"avec","scene":"scène","light":"lumière","dark":"sombre","warm":"chaud","rain":"pluie","sun":"soleil","moon":"lune","sky":"ciel","forest":"forêt","mountain":"montagne","ocean":"océan","golden":"doré","blue":"bleu","red":"rouge","green":"vert","silence":"silence","dream":"rêve","love":"amour","hope":"espoir","fear":"peur","joy":"joie","peace":"paix","beautiful":"beau","sad":"triste","shadow":"ombre","color":"couleur","silver":"argenté","white":"blanc","black":"noir"},
"de": {"the":"der","a":"ein","is":"ist","in":"in","of":"von","and":"und","with":"mit","scene":"Szene","light":"Licht","dark":"dunkel","warm":"warm","rain":"Regen","sun":"Sonne","moon":"Mond","sky":"Himmel","forest":"Wald","mountain":"Berg","ocean":"Ozean","golden":"golden","blue":"blau","red":"rot","green":"grün","silence":"Stille","dream":"Traum","love":"Liebe","hope":"Hoffnung","fear":"Angst","joy":"Freude","peace":"Frieden","beautiful":"schön","sad":"traurig","shadow":"Schatten","color":"Farbe","silver":"silbern","white":"weiß","black":"schwarz"},
}
LANG_NAMES = {"es": "Spanish", "fr": "French", "de": "German"}
def detect_text_field(entry):
for f in ["rich","terse","text","content","lyric_line","description","scene_description","prompt","scene"]:
if f in entry and isinstance(entry[f], str) and len(entry[f]) > 5:
return f
for k, v in entry.items():
if isinstance(v, str) and len(v) > 5:
return k
return None
def paraphrase(text):
t = random.choice(PARAPHRASE_TRANSFORMS)(text)
if t == text:
t = text.replace(" and ", " & ").replace(" with ", " alongside ")
if t == text:
t = f"In this scene: {text[0].lower()}{text[1:]}" if text[0].isupper() else text
return t
def translate(text, lang):
d = TRANSLATIONS.get(lang, {})
words = text.split()
out = []
for w in words:
lo = w.lower().strip(".,;:!?")
suf = w[len(w.rstrip(".,;:!?")):]
if lo in d:
out.append(d[lo] + suf)
else:
out.append(w)
return " ".join(out)
def augment_file(input_path, output_path=None, n_para=3, langs=None, llm_endpoint=None):
input_path = Path(input_path)
if output_path is None:
output_path = input_path.parent / f"{input_path.stem}_augmented{input_path.suffix}"
entries = [json.loads(l) for l in open(input_path) if l.strip()]
if not entries:
print(f"No entries in {input_path}"); return 0
tf = detect_text_field(entries[0])
if not tf:
print(f"ERROR: No text field in {input_path}", file=sys.stderr); return 0
print(f"Input: {input_path} ({len(entries)} entries, field={tf})")
aug_count = 0
with open(output_path, "w") as out:
for e in entries:
out.write(json.dumps(e, ensure_ascii=False) + "\n")
for i, e in enumerate(entries):
text = e[tf]
# Paraphrases
for p in range(n_para):
para = paraphrase(text)
if para != text:
ne = dict(e); ne[tf] = para
ne["_augmentation"] = f"paraphrase_{p+1}"
ne["_original"] = text[:100]
out.write(json.dumps(ne, ensure_ascii=False) + "\n")
aug_count += 1
# Translations
for lang in (langs or []):
tr = translate(text, lang)
if tr != text:
ne = dict(e); ne[tf] = tr
ne["_augmentation"] = f"translate_{lang}"
ne["_language"] = lang
ne["_original"] = text[:100]
out.write(json.dumps(ne, ensure_ascii=False) + "\n")
aug_count += 1
if (i+1) % 100 == 0:
print(f" {i+1}/{len(entries)} done ({aug_count} augmented)")
total = len(entries) + aug_count
print(f"Done: {len(entries)} originals + {aug_count} augmented = {total}")
print(f"Output: {output_path}")
return aug_count
def main():
import argparse
p = argparse.ArgumentParser()
p.add_argument("--input", required=True)
p.add_argument("--output", default=None)
p.add_argument("--paraphrases", type=int, default=3)
p.add_argument("--langs", default="es,fr,de")
p.add_argument("--llm-endpoint", default=None)
args = p.parse_args()
langs = [l.strip() for l in args.langs.split(",") if l.strip()] if args.langs else []
augment_file(args.input, args.output, args.paraphrases, langs, args.llm_endpoint)
if __name__ == "__main__":
main()