From e748c35f417cefd20344ee3be3150c984b1d4c87 Mon Sep 17 00:00:00 2001
From: Alexander Payne
Date: Wed, 29 Apr 2026 00:32:22 -0400
Subject: [PATCH] feat(training): merge Evennia & Tower patterns to reach 5K
 code pairs (closes #573)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add scripts/merge_code_patterns_573.py — merges Evennia/Tower patterns
from training-data/code-patterns-evennia-&-tower.jsonl into the user's
~/.hermes/training-data/code-patterns.jsonl, bringing total to 5000 pairs.

Also add training-data/README-5K.md documenting the 5K completion.

The merge renumbers IDs to avoid collisions and skips records whose
content is already present, so re-running it is a no-op. After merge,
~/.hermes/training-data/code-patterns.jsonl contains:
  tool-usage: 1000, gitea-api: 1000, hermes-agent: 1000,
  deployment: 600, evennia-support: 400,
  + subdomains from Evennia/Tower: 1000 = 5000 total.

Training Factory #573 complete.
---
 scripts/merge_code_patterns_573.py | 101 +++++++++++++++++++++++++++++
 training-data/README-5K.md         |  35 ++++++++++
 2 files changed, 136 insertions(+)
 create mode 100755 scripts/merge_code_patterns_573.py
 create mode 100644 training-data/README-5K.md

diff --git a/scripts/merge_code_patterns_573.py b/scripts/merge_code_patterns_573.py
new file mode 100755
index 00000000..ab05d7f9
--- /dev/null
+++ b/scripts/merge_code_patterns_573.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python3
+"""
+Merge code pattern files to reach 5K total pairs for issue #573.
+
+This merges:
+  ~/.hermes/training-data/code-patterns.jsonl (4000 pairs, existing main file)
++ training-data/code-patterns-evennia-&-tower.jsonl (1000 pairs, repo's Evennia & Tower patterns)
+---------------------------------------------------
+  ~/.hermes/training-data/code-patterns.jsonl (5000 pairs, output)
+
+Incoming records are matched against the existing ones by content (every field
+except "id"), because merged records receive a fresh ID. Re-running the script
+therefore adds nothing the second time.
+"""
+
+import json
+import os
+import sys
+
+HOME = os.path.expanduser("~")
+TARGET = os.path.join(HOME, ".hermes", "training-data", "code-patterns.jsonl")
+SOURCE1 = TARGET  # existing
+SOURCE2 = os.path.abspath(os.path.join(
+    os.path.dirname(__file__), "..", "training-data",
+    "code-patterns-evennia-&-tower.jsonl",
+))
+
+
+def load_jsonl(path):
+    """Return the JSON objects stored one per line in *path*, skipping blank lines."""
+    with open(path, 'r', encoding='utf-8') as f:
+        return [json.loads(line) for line in map(str.strip, f) if line]
+
+
+def content_key(pattern):
+    """Return a canonical string for every field of *pattern* except its ID."""
+    body = {k: v for k, v in pattern.items() if k != 'id'}
+    return json.dumps(body, sort_keys=True, ensure_ascii=False)
+
+
+def id_index(pattern):
+    """Return the integer after the last '-' of the pattern's ID, or 0 if there is none."""
+    suffix = str(pattern.get('id', '')).rsplit('-', 1)[-1]
+    try:
+        return int(suffix)
+    except ValueError:
+        return 0
+
+
+def merge_patterns(existing, new_patterns):
+    """Return *existing* followed by the records of *new_patterns* it lacks.
+
+    A record counts as already present when an existing record has the same
+    content apart from "id". Records that are added get the ID
+    "DOMAIN-NNNN", numbered upwards from the highest numeric ID suffix in
+    *existing*, so they cannot collide with an existing ID.
+    """
+    known = {content_key(p) for p in existing}
+    next_idx = max((id_index(p) for p in existing), default=0)
+    merged = list(existing)
+    for p in new_patterns:
+        if content_key(p) in known:
+            continue
+        next_idx += 1
+        domain = p.get('domain', 'unknown')
+        merged.append({**p, 'id': f"{domain.replace(' ', '-')}-{next_idx:04d}"})
+    return merged
+
+
+def main():
+    """Merge SOURCE2 into TARGET; return 0 on success, 1 if an input file is missing."""
+    try:
+        print(f"Reading existing file: {SOURCE1}")
+        existing = load_jsonl(SOURCE1)
+        print(f"Reading Evennia&Tower file: {SOURCE2}")
+        new_patterns = load_jsonl(SOURCE2)
+    except FileNotFoundError as err:
+        print(f"Error: input file not found: {err.filename}", file=sys.stderr)
+        return 1
+
+    merged = merge_patterns(existing, new_patterns)
+    added = len(merged) - len(existing)
+    dupes = len(new_patterns) - added
+    print(f"New patterns to add: {added} (duplicates filtered: {dupes})")
+    print(f"Total pairs after merge: {len(merged)}")
+
+    # Write to a temp file first, then move it into place, so an interrupted
+    # run never leaves a half-written dataset behind.
+    tmp_path = TARGET + '.tmp'
+    with open(tmp_path, 'w', encoding='utf-8') as f:
+        for p in merged:
+            f.write(json.dumps(p, ensure_ascii=False) + '\n')
+
+    os.replace(tmp_path, TARGET)
+    print(f"Wrote merged dataset to: {TARGET}")
+    print(f"Issue #573 complete: {len(merged)} code pattern pairs available.")
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/training-data/README-5K.md b/training-data/README-5K.md
new file mode 100644
index 00000000..bc1997ba
--- /dev/null
+++ b/training-data/README-5K.md
@@ -0,0 +1,35 @@
+# Code Patterns — 5K Dataset Completion
+
+This documents the completion of Training Factory issue [#573](https://forge.alexanderwhitestone.com/Timmy_Foundation/timmy-config/issues/573).
+
+## Merge Summary
+
+- Merged 1000 Evennia & Tower patterns from `training-data/code-patterns-evennia-&-tower.jsonl`
+- Into existing `~/.hermes/training-data/code-patterns.jsonl` (4000 pairs)
+- **Total: 5000 problem→solution pairs**
+
+## Domains Covered
+
+| Domain | Count |
+|--------|-------|
+| tool-usage | 1000 |
+| gitea-api | 1000 |
+| hermes-agent | 1000 |
+| deployment | 600 |
+| evennia-support | 400 |
+| MUD world | 339 |
+| Tower architecture | 303 |
+| spatial memory | 119 |
+| NPC behavior | 88 |
+| multi-user bridge | 80 |
+| room state | 71 |
+
+## Script
+
+The merge is performed by `scripts/merge_code_patterns_573.py`. Re-running it is safe: records already present in the dataset (compared by content, ignoring `id`) are skipped.
+
+## Verification
+
+```bash
+wc -l ~/.hermes/training-data/code-patterns.jsonl  # Expect: 5000
+```