#!/usr/bin/env python3
"""Merge code pattern files to reach 5K total pairs for issue #573.

This merges:
  ~/.hermes/training-data/code-patterns.jsonl
      (4000 pairs, existing main file)
  + training-data/code-patterns-evennia-&-tower.jsonl
      (1000 pairs, repo's Evennia & Tower patterns)
  ---------------------------------------------------
  ~/.hermes/training-data/code-patterns.jsonl (5000 pairs, output)
"""
import json
import os
import sys

HOME = os.path.expanduser("~")
TARGET = os.path.join(HOME, ".hermes", "training-data", "code-patterns.jsonl")
SOURCE1 = TARGET  # existing
SOURCE2 = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "..", "training-data",
                 "code-patterns-evennia-&-tower.jsonl"))


def read_jsonl(path):
    """Return a list of JSON objects, one per non-blank line of *path*."""
    # Explicit UTF-8: the merged file is written with ensure_ascii=False,
    # so relying on the platform default encoding would be unsafe.
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def merge_patterns(existing, new_patterns):
    """Return *existing* plus de-duplicated *new_patterns*, renumbered.

    Patterns whose 'id' already appears in *existing* are dropped. Each
    survivor gets a fresh sequential id of the form "<domain>-NNNN" that
    continues after the highest numeric suffix found in *existing*, so the
    merged ids cannot collide. NOTE: mutates the surviving new-pattern
    dicts in place (same as building them fresh here would cost more).

    Raises ValueError if an existing id does not end in "-<int>".
    """
    existing_ids = {p['id'] for p in existing}
    unique_new = [p for p in new_patterns if p['id'] not in existing_ids]
    dupes = len(new_patterns) - len(unique_new)
    print(f"New patterns to add: {len(unique_new)} (duplicates filtered: {dupes})")

    # Continue numbering after the highest numeric suffix already in use;
    # default=0 covers an empty existing dataset.
    next_idx = max((int(p['id'].rsplit('-', 1)[-1]) for p in existing), default=0)
    for p in unique_new:
        next_idx += 1
        domain = p.get('domain', 'unknown')
        p['id'] = f"{domain.replace(' ', '-')}-{next_idx:04d}"
    return existing + unique_new


def main():
    """Merge SOURCE2 into SOURCE1 and atomically rewrite TARGET."""
    print(f"Reading existing file: {SOURCE1}")
    existing = read_jsonl(SOURCE1)

    print(f"Reading Evennia&Tower file: {SOURCE2}")
    new_patterns = read_jsonl(SOURCE2)

    merged = merge_patterns(existing, new_patterns)
    print(f"Total pairs after merge: {len(merged)}")

    # Write to a temp file first, then atomically replace the target so a
    # crash mid-write cannot leave a truncated dataset behind.
    tmp_path = TARGET + '.tmp'
    with open(tmp_path, 'w', encoding='utf-8') as f:
        for p in merged:
            f.write(json.dumps(p, ensure_ascii=False) + '\n')
    os.replace(tmp_path, TARGET)

    print(f"Wrote merged dataset to: {TARGET}")
    print(f"Issue #573 complete: {len(merged)} code pattern pairs available.")


if __name__ == '__main__':
    main()