Compare commits
6 Commits
burn/659-1
...
fix/687-tr
| Author | SHA1 | Date | |
|---|---|---|---|
| | d8921630a5 | | |
| | d120526244 | | |
| | 8596ff761b | | |
| | 7553fd4f3e | | |
| | 71082fe06f | | |
| | 6d678e938e | | |
@@ -1,3 +1,4 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Full Nostr agent-to-agent communication demo - FINAL WORKING
|
||||
"""
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Soul Eval Gate — The Conscience of the Training Pipeline
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#!/usr/bin/env python3
|
||||
import json
|
||||
from hermes_tools import browser_navigate, browser_vision
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#!/usr/bin/env python3
|
||||
import json
|
||||
from hermes_tools import browser_navigate, browser_vision
|
||||
|
||||
|
||||
@@ -1,176 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""PR Triage Automation -- Categorize, deduplicate, report (#659)."""
|
||||
import argparse, json, os, re, sys, subprocess
|
||||
from collections import Counter, defaultdict
|
||||
from datetime import datetime
|
||||
from urllib.request import Request, urlopen
|
||||
from urllib.error import HTTPError
|
||||
|
||||
|
||||
def _token():
    """Return the Gitea API token, or "" when none is configured.

    The ``GITEA_TOKEN`` environment variable takes precedence; when it is
    unset or empty the token is read from ``~/.config/gitea/token`` if that
    file exists.
    """
    token = os.environ.get("GITEA_TOKEN", "")
    if token:
        return token
    path = os.path.expanduser("~/.config/gitea/token")
    if os.path.exists(path):
        # Context manager so the file handle is closed deterministically.
        with open(path) as fh:
            return fh.read().strip()
    return ""
|
||||
|
||||
|
||||
def _api(url, token, method="GET", data=None):
    """Call a JSON REST endpoint and return the decoded response.

    Args:
        url: Absolute URL of the endpoint.
        token: API token sent as ``Authorization: token <token>``.
        method: HTTP method name.
        data: Optional JSON-serializable payload; sent whenever it is not
            ``None`` (so an empty dict/list is still transmitted).

    Returns:
        The parsed JSON body, or ``None`` when the server answers with an
        HTTP error status or an empty body.
    """
    headers = {"Authorization": "token " + token, "Accept": "application/json"}
    body = None
    if data is not None:
        body = json.dumps(data).encode()
        headers["Content-Type"] = "application/json"
    req = Request(url, data=body, headers=headers, method=method)
    try:
        # Close the response explicitly instead of relying on GC.
        with urlopen(req, timeout=30) as resp:
            raw = resp.read()
    except HTTPError:
        return None
    # An empty body (e.g. 204 No Content) is not valid JSON.
    return json.loads(raw) if raw else None
|
||||
|
||||
|
||||
def fetch_prs(base, token, owner, repo):
    """Return all open pull requests of ``owner/repo`` as a list of dicts.

    Pages are requested until the API returns an empty (or failed) page.
    The loop deliberately does not stop on a "short" page: a server that
    caps the page size below the requested 50 would otherwise make the
    first page look like the last one and silently truncate the result.
    """
    prs, page = [], 1
    while True:
        url = (base + "/api/v1/repos/" + owner + "/" + repo
               + "/pulls?state=open&limit=50&page=" + str(page))
        batch = _api(url, token)
        if not batch:  # None on HTTP error, [] past the last page
            break
        prs.extend(batch)
        page += 1
    return prs
|
||||
|
||||
|
||||
def fetch_issues(base, token, owner, repo):
    """Return the open issues of ``owner/repo`` keyed by issue number.

    Entries of the issues listing that are pull requests are skipped.
    The ``pull_request`` field is tested by value rather than by presence,
    because a listing may carry the key with a null value for plain issues;
    a presence test would then discard every issue.

    Pages are requested until the API returns an empty (or failed) page so
    that a server-side page-size cap below 50 cannot truncate the result.
    """
    issues, page = {}, 1
    while True:
        url = (base + "/api/v1/repos/" + owner + "/" + repo
               + "/issues?state=open&limit=50&page=" + str(page))
        batch = _api(url, token)
        if not batch:  # None on HTTP error, [] past the last page
            break
        for item in batch:
            if not item.get("pull_request"):
                issues[item["number"]] = item
        page += 1
    return issues
|
||||
|
||||
|
||||
def categorize(pr):
    """Classify a pull request by keywords in its title, body and labels.

    The text is lower-cased and scanned for the keywords below in order;
    the first keyword found (as a plain substring) decides the category.
    Missing or null ``title``/``body``/``labels`` are treated as empty.

    Returns:
        One of "training-data", "bug-fix", "feature", "maintenance",
        "documentation", "testing", "infrastructure" or "other".
    """
    title = pr.get("title") or ""
    body = pr.get("body") or ""
    label_names = " ".join((lb.get("name") or "") for lb in (pr.get("labels") or []))
    text = (title + " " + body + " " + label_names).lower()

    # Order matters: earlier rules win when several keywords are present.
    rules = (
        ("training data", "training-data"), ("dpo", "training-data"), ("grpo", "training-data"),
        ("fix:", "bug-fix"), ("bug", "bug-fix"), ("hotfix", "bug-fix"),
        ("feat:", "feature"), ("feature", "feature"),
        ("refactor", "maintenance"), ("cleanup", "maintenance"),
        ("doc", "documentation"), ("test", "testing"), ("infra", "infrastructure"),
    )
    for keyword, category in rules:
        if keyword in text:
            return category
    return "other"
|
||||
|
||||
|
||||
def refs(pr):
    """Return the issue numbers referenced as ``#N`` in a PR's title and body.

    Numbers are returned in order of first appearance, each at most once,
    so a PR that mentions the same issue repeatedly is not counted twice.
    A missing or null title/body is treated as empty text.
    """
    text = (pr.get("title") or "") + " " + (pr.get("body") or "")
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(int(m) for m in re.findall(r"#(\d+)", text)))
|
||||
|
||||
|
||||
def find_duplicates(prs):
    """Group pull requests that reference the same issue number.

    Returns:
        A list of groups (lists of PR dicts); only issue numbers referenced
        by more than one PR produce a group.
    """
    by_issue = defaultdict(list)
    for pr in prs:
        # De-duplicate per PR so one PR mentioning an issue several times
        # is not reported as a duplicate of itself.
        for issue in dict.fromkeys(refs(pr)):
            by_issue[issue].append(pr)
    return [group for group in by_issue.values() if len(group) > 1]
|
||||
|
||||
|
||||
def health(pr, issues):
    """Build the triage record for one pull request.

    Args:
        pr: Pull-request dict; must provide ``number``, ``title``,
            ``head.ref``, ``created_at`` and ``updated_at`` (ISO 8601).
        issues: Mapping of open issue number -> issue, used to split the
            PR's references into "open" and "closed".

    Returns:
        A dict with the PR number, title, head branch, category, referenced
        issue numbers, age and staleness in days, mergeable flag and author.
        NOTE(review): "closed" holds every reference that is not an open
        issue, which presumably also covers references to other PRs.
    """
    referenced = refs(pr)
    # fromisoformat() on older Pythons rejects a trailing "Z".
    created = datetime.fromisoformat(pr["created_at"].replace("Z", "+00:00"))
    updated = datetime.fromisoformat(pr["updated_at"].replace("Z", "+00:00"))
    # Use the timestamp's own tzinfo so aware/naive values are never mixed.
    now = datetime.now(created.tzinfo)
    return {
        "pr": pr["number"], "title": pr["title"], "head": pr["head"]["ref"],
        "category": categorize(pr), "refs": referenced,
        "open": [n for n in referenced if n in issues],
        "closed": [n for n in referenced if n not in issues],
        "age": (now - created).days, "stale": (now - updated).days,
        "mergeable": pr.get("mergeable"),
        # "user" may be null (e.g. deleted account); treat it as no author.
        "author": (pr.get("user") or {}).get("login", ""),
    }
|
||||
|
||||
|
||||
def report(repo, checks, dups):
    """Render the triage result as a Markdown document.

    Args:
        repo: Repository name used in the heading.
        checks: List of per-PR records as produced by ``health``.
        dups: List of duplicate groups as produced by ``find_duplicates``.

    Returns:
        The report as a single newline-joined string.
    """
    lines = [
        "# PR Triage -- " + repo,
        "Generated: " + datetime.now().strftime("%Y-%m-%d %H:%M"),
        "Open PRs: " + str(len(checks)),
        "",
        "## Summary",
        "",
        "| Category | Count |",
        "|----------|-------|",
    ]
    for category, count in Counter(h["category"] for h in checks).most_common():
        lines.append("| " + category + " | " + str(count) + " |")

    stale = [h for h in checks if h["stale"] > 7]
    lines.extend(["", "Stale (>7d): " + str(len(stale)),
                  "Duplicate groups: " + str(len(dups)), ""])

    if dups:
        lines.append("## Duplicates")
        for group in dups:
            ref_sets = [set(refs(p)) for p in group]
            # Show the issue(s) the PRs actually share; the union would also
            # list issues that only one PR of the group mentions.
            shared = set.intersection(*ref_sets) if ref_sets else set()
            shown = shared or set().union(*ref_sets)
            lines.append("Issues " + ", ".join("#" + str(r) for r in sorted(shown)) + ":")
            for p in group:
                lines.append(" - #" + str(p["number"]) + ": " + p["title"])
            lines.append("")

    if stale:
        lines.append("## Stale (>7d)")
        for h in sorted(stale, key=lambda x: x["stale"], reverse=True):
            lines.append("- #" + str(h["pr"]) + ": " + h["title"] + " -- " + str(h["stale"]) + "d")
        lines.append("")

    lines.append("## All PRs")
    lines.append("| # | Title | Category | Age | Stale | Merge |")
    lines.append("|---|-------|----------|-----|-------|-------|")
    for h in sorted(checks, key=lambda x: x["pr"]):
        # mergeable is tri-state: True, False, or unknown (None/missing).
        merge = "Y" if h["mergeable"] else ("N" if h["mergeable"] is False else "?")
        stale_cell = str(h["stale"]) + "d" if h["stale"] > 7 else "-"
        # A raw "|" in the title would split the table cell.
        title = h["title"][:50].replace("|", "\\|")
        lines.append("| " + str(h["pr"]) + " | " + title + " | " + h["category"] +
                     " | " + str(h["age"]) + "d | " + stale_cell + " | " + merge + " |")
    return "\n".join(lines)
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: fetch open PRs/issues and print a triage report.

    When ``--repo`` is not given, owner and repository are derived from the
    ``origin`` remote of the current git checkout. The report (Markdown, or
    JSON with ``--json``) goes to stdout and, if ``--output`` is set, to
    that file as well; progress and errors go to stderr. Exits with status 1
    when no token or no repository can be determined.
    """
    parser = argparse.ArgumentParser(description="PR Triage Automation")
    parser.add_argument("--base-url", default="https://forge.alexanderwhitestone.com")
    parser.add_argument("--owner", default="Timmy_Foundation")
    parser.add_argument("--repo", default="")
    parser.add_argument("--json", action="store_true", dest="js")
    parser.add_argument("--output", default="")
    args = parser.parse_args()

    token = _token()
    if not token:
        print("No token", file=sys.stderr)
        sys.exit(1)

    repo = args.repo
    if not repo:
        try:
            remote = subprocess.check_output(
                ["git", "remote", "get-url", "origin"], text=True
            ).strip()
        except (OSError, subprocess.CalledProcessError):
            # git missing, not a checkout, or no "origin": fall through to
            # the "No repo specified" error below.
            remote = ""
        # Matches both https://host/owner/repo(.git) and git@host:owner/repo(.git);
        # dots are allowed so names such as "my.repo" are recognised.
        match = re.search(r"[/:]([\w.-]+)/([\w.-]+?)(?:\.git)?$", remote)
        if match:
            args.owner, repo = match.group(1), match.group(2)
    if not repo:
        print("No repo specified", file=sys.stderr)
        sys.exit(1)

    print("Triaging " + args.owner + "/" + repo + "...", file=sys.stderr)
    prs = fetch_prs(args.base_url, token, args.owner, repo)
    issues = fetch_issues(args.base_url, token, args.owner, repo)
    checks = [health(pr, issues) for pr in prs]
    dups = find_duplicates(prs)

    if args.js:
        rendered = json.dumps(
            {"repo": repo, "prs": checks,
             "duplicates": [[{"number": p["number"], "title": p["title"]} for p in g]
                            for g in dups]},
            indent=2)
    else:
        rendered = report(repo, checks, dups)
    print(rendered)
    if args.output:
        # Honoured for both output formats.
        with open(args.output, "w", encoding="utf-8") as fh:
            fh.write(rendered)
    print("\n" + str(len(checks)) + " PRs, " + str(len(dups)) + " duplicate groups",
          file=sys.stderr)


if __name__ == "__main__":
    main()
|
||||
286
scripts/training_data_quality_filter.py
Normal file
286
scripts/training_data_quality_filter.py
Normal file
@@ -0,0 +1,286 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Training Data Quality Filter
|
||||
|
||||
Scores and removes low-quality training pairs from JSONL datasets.
|
||||
Supports two formats:
|
||||
- ShareGPT session format: {"conversations": [...], ...}
|
||||
- Scene/pair format: {"terse": "...", "rich": "..."} or {"lyric_line": "...", "scene": {...}}
|
||||
|
||||
Scoring dimensions:
|
||||
- Specificity: penalizes vague/generic content
|
||||
- Length ratio: penalizes extreme input/output imbalances
|
||||
- Code correctness: validates code blocks have matching fences
|
||||
|
||||
Usage:
|
||||
python3 scripts/training_data_quality_filter.py input.jsonl [--threshold 0.4] [--output filtered.jsonl]
|
||||
python3 scripts/training_data_quality_filter.py --dir training-data/ [--threshold 0.4]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def score_specificity(text: str) -> float:
    """Score 0-1 based on how specific vs generic the text is.

    Starts from a 0.5 baseline, subtracts for a generic opener, very short
    text and repeated sentences, and adds for inline code, fenced code,
    numbered lists, substantial length and URLs. Empty text or text shorter
    than 10 characters (after stripping) scores 0.0.
    """
    if not text or len(text.strip()) < 10:
        return 0.0

    score = 0.5  # baseline

    # Penalize very generic starters
    generic_starters = (
        "sure,", "of course", "i can help", "here is", "here are",
        "certainly", "absolutely", "let me help", "great question",
        "that's a great", "interesting question",
    )
    if text.lower().strip().startswith(generic_starters):
        score -= 0.15

    word_count = len(text.split())

    # Reward specific content indicators
    if re.search(r"`[^`]+`", text):  # inline code
        score += 0.1
    if re.search(r"```[\s\S]*?```", text):  # code blocks
        score += 0.15
    if re.search(r"\d+\.\s", text):  # numbered lists
        score += 0.05
    if word_count > 50:  # substantial length
        score += 0.1
    if re.search(r"https?://", text):  # URLs/references
        score += 0.05

    # Penalize extremely short outputs
    if word_count < 5:
        score -= 0.2

    # Penalize repetition (same sentence repeated)
    sentences = [s.strip().lower() for s in re.split(r"[.!?]+", text) if s.strip()]
    if sentences and len(set(sentences)) / len(sentences) < 0.7:
        score -= 0.15

    return max(0.0, min(1.0, score))
|
||||
|
||||
|
||||
def score_length_ratio(input_text: str, output_text: str) -> float:
    """Score 0-1 based on input/output length balance.

    Lengths are whitespace-separated word counts. The ratio output/input
    maps to 1.0 within 0.5-5x, 0.6 within 0.2-10x, 0.3 within 0.1-20x and
    0.1 beyond that; an empty input or output scores 0.0.
    """
    in_len = len(input_text.split())
    out_len = len(output_text.split())
    if in_len == 0 or out_len == 0:
        return 0.0

    ratio = out_len / in_len

    # Ideal ratio: 0.5-5x (output can be shorter or longer, but not extreme)
    if 0.5 <= ratio <= 5.0:
        return 1.0
    if 0.2 <= ratio <= 10.0:
        return 0.6
    if 0.1 <= ratio <= 20.0:
        return 0.3
    return 0.1
|
||||
|
||||
|
||||
def score_code_correctness(text: str) -> float:
    """Score 0-1 based on code block correctness.

    Returns 0.2 when the text has an odd number of ``` fences (an unclosed
    block), 0.3 when a block contains "SyntaxError" or "Traceback", 0.4
    when a block is short and ends in "..." (truncated), otherwise 1.0.
    Text without any code block scores 1.0.
    """
    fence = "```"

    # Check balanced fences on the whole text: each regex match below always
    # contains exactly two fences, so a per-block count could never be odd.
    if text.count(fence) % 2 != 0:
        return 0.2

    code_blocks = re.findall(r"```[\s\S]*?```", text)
    if not code_blocks:
        return 1.0  # no code = no code errors

    for block in code_blocks:
        # Body = everything after the opening fence line, minus the closing
        # fence (otherwise the body would always end with the fence).
        content = block.split("\n", 1)[1] if "\n" in block else ""
        if content.endswith(fence):
            content = content[:-len(fence)]
        body = content.strip()

        # Check for common errors
        if "SyntaxError" in content or "Traceback" in content:
            return 0.3
        if body.endswith("...") and len(body) < 30:
            return 0.4  # truncated code

    return 1.0
|
||||
|
||||
|
||||
def score_pair(input_text: str, output_text: str) -> dict:
    """Score a training pair on all dimensions.

    Returns:
        A dict with the rounded "specificity", "length_ratio" and
        "code_correctness" scores plus their weighted "composite"
        (0.4 / 0.3 / 0.3). Specificity and code correctness are computed
        on the output text only.
    """
    spec = score_specificity(output_text)
    length = score_length_ratio(input_text, output_text)
    code = score_code_correctness(output_text)

    # Weighted composite
    composite = (spec * 0.4) + (length * 0.3) + (code * 0.3)

    return {
        "specificity": round(spec, 3),
        "length_ratio": round(length, 3),
        "code_correctness": round(code, 3),
        "composite": round(composite, 3),
    }
|
||||
|
||||
|
||||
def extract_pairs(obj: dict) -> list:
    """Extract (input, output) pairs from a JSONL object.

    Recognised shapes, checked in this order:
      - ShareGPT: {"conversations": [{"from": ..., "value": ...}, ...]};
        each "gpt"/"assistant" turn is paired with the nearest preceding
        "human"/"user" turn, and turns with an empty side are skipped.
      - {"terse": ..., "rich": ...}
      - {"lyric_line": ..., "scene": ...} (a dict scene is JSON-encoded)
      - {"prompt": ..., "response": ...}
      - {"input": ..., "output": ...}

    Returns:
        A list of (input, output) tuples; empty when no shape matches.
    """
    human_roles = ("human", "user")
    model_roles = ("gpt", "assistant")
    pairs = []

    # ShareGPT session format
    if "conversations" in obj:
        # A null "conversations" value is treated as an empty session.
        convs = [m for m in (obj["conversations"] or []) if isinstance(m, dict)]
        for i, msg in enumerate(convs):
            if msg.get("from") not in model_roles:
                continue
            # Find preceding human message
            input_text = ""
            for prev in reversed(convs[:i]):
                if prev.get("from") in human_roles:
                    input_text = prev.get("value", "")
                    break
            output_text = msg.get("value", "")
            if input_text and output_text:
                pairs.append((input_text, output_text))

    # Scene/pair format (terse/rich)
    elif "terse" in obj and "rich" in obj:
        pairs.append((obj["terse"], obj["rich"]))

    # Scene description format
    elif "lyric_line" in obj and "scene" in obj:
        scene = obj["scene"]
        scene_text = json.dumps(scene) if isinstance(scene, dict) else str(scene)
        pairs.append((obj["lyric_line"], scene_text))

    # Generic prompt/response
    elif "prompt" in obj and "response" in obj:
        pairs.append((obj["prompt"], obj["response"]))

    # Generic input/output
    elif "input" in obj and "output" in obj:
        pairs.append((obj["input"], obj["output"]))

    return pairs
|
||||
|
||||
|
||||
def filter_jsonl(input_path: str, threshold: float = 0.4, output_path: str = None) -> dict:
    """Filter a JSONL file, removing low-quality pairs.

    Every non-blank line counts towards "total". Lines that are not valid
    JSON are removed. Valid lines without extractable pairs (including
    non-object JSON values) are kept unchanged. All other lines are kept
    when the mean composite score of their pairs is >= ``threshold``.

    Args:
        input_path: Path of the JSONL file to read (UTF-8).
        threshold: Minimum mean composite score for a line to be kept.
        output_path: Where to write the kept lines; nothing is written
            when this is falsy.

    Returns:
        A stats dict ("file", "total", "kept", "removed", "removal_rate",
        "avg_score", "min_score", "max_score"), or {"error": ...} when the
        input file does not exist. Score fields are None when no line had
        scoreable pairs.
    """
    path = Path(input_path)
    if not path.exists():
        return {"error": f"File not found: {input_path}"}

    total = kept = removed = 0
    scores_list = []
    kept_lines = []

    # Split on "\n" only: str.splitlines() would also break on characters
    # such as U+2028 that may legally appear inside JSON strings.
    for raw in path.read_text(encoding="utf-8").split("\n"):
        line = raw.strip()
        if not line:
            continue

        # Count before parsing so removed never exceeds total.
        total += 1
        try:
            obj = json.loads(line)
        except json.JSONDecodeError:
            removed += 1
            continue

        pairs = extract_pairs(obj) if isinstance(obj, dict) else []
        if not pairs:
            # No extractable pairs — keep as-is (might be metadata)
            kept += 1
            kept_lines.append(line)
            continue

        # Score all pairs in this object
        pair_scores = [score_pair(inp, out) for inp, out in pairs]
        avg_composite = sum(s["composite"] for s in pair_scores) / len(pair_scores)
        scores_list.append(avg_composite)

        if avg_composite >= threshold:
            kept += 1
            kept_lines.append(line)
        else:
            removed += 1

    # Write output (an empty result yields an empty file, not a blank line)
    if output_path:
        Path(output_path).write_text(
            "".join(kept_line + "\n" for kept_line in kept_lines), encoding="utf-8"
        )

    return {
        "file": input_path,
        "total": total,
        "kept": kept,
        "removed": removed,
        "removal_rate": f"{removed}/{total}" if total > 0 else "0/0",
        "avg_score": round(sum(scores_list) / len(scores_list), 3) if scores_list else None,
        "min_score": round(min(scores_list), 3) if scores_list else None,
        "max_score": round(max(scores_list), 3) if scores_list else None,
    }
|
||||
|
||||
|
||||
def _filtered_path(path: str) -> str:
    """Return *path* with ``_filtered`` inserted before its file extension."""
    p = Path(path)
    return str(p.with_name(p.stem + "_filtered" + p.suffix))


def main():
    """Command-line entry point for the training-data quality filter.

    With ``--dir`` every ``*.jsonl`` file below the directory is filtered
    and a summary table is printed; files whose name already ends in
    ``_filtered`` are skipped so earlier outputs are not re-processed.
    With a positional input file that single file is filtered and its
    stats are printed as JSON. ``--dry-run`` scores without writing.
    Exits with status 1 on missing input, missing files or no arguments.
    """
    parser = argparse.ArgumentParser(description="Filter low-quality training data pairs")
    parser.add_argument("input", nargs="?", help="Input JSONL file")
    parser.add_argument("--threshold", type=float, default=0.4, help="Minimum quality score (0-1)")
    parser.add_argument("--output", "-o", help="Output file (default: input_filtered.jsonl)")
    parser.add_argument("--dir", help="Process all .jsonl files in directory")
    parser.add_argument("--dry-run", action="store_true", help="Score only, don't write output")
    args = parser.parse_args()

    if args.dir:
        jsonl_files = sorted(
            f for f in Path(args.dir).rglob("*.jsonl")
            if not f.stem.endswith("_filtered")
        )
        if not jsonl_files:
            print(f"No .jsonl files found in {args.dir}")
            sys.exit(1)

        print(f"Processing {len(jsonl_files)} files (threshold={args.threshold})\n")
        print(f"{'File':<50} {'Total':>6} {'Kept':>6} {'Removed':>8} {'Avg':>6}")
        print("-" * 82)

        grand_total = grand_kept = grand_removed = 0
        for fpath in jsonl_files:
            out = None if args.dry_run else _filtered_path(str(fpath))
            result = filter_jsonl(str(fpath), args.threshold, out)
            if "error" in result:
                print(f"{str(fpath):<50} ERROR: {result['error']}")
                continue
            # avg_score is None when the file had no scoreable pairs; a
            # float format spec would raise TypeError on it.
            avg = result["avg_score"]
            avg_cell = f"{avg:>6.3f}" if avg is not None else f"{'n/a':>6}"
            print(f"{fpath.name:<50} {result['total']:>6} {result['kept']:>6} "
                  f"{result['removed']:>8} {avg_cell}")
            grand_total += result["total"]
            grand_kept += result["kept"]
            grand_removed += result["removed"]

        print("-" * 82)
        print(f"{'TOTAL':<50} {grand_total:>6} {grand_kept:>6} {grand_removed:>8}")

    elif args.input:
        # Derive the default from the path components: a plain
        # str.replace(".jsonl", ...) leaves inputs without that extension
        # unchanged, which would overwrite the input file.
        out = None if args.dry_run else (args.output or _filtered_path(args.input))
        result = filter_jsonl(args.input, args.threshold, out)
        if "error" in result:
            print(f"Error: {result['error']}")
            sys.exit(1)
        print(json.dumps(result, indent=2))
        if out:
            print(f"\nFiltered output written to: {out}")

    else:
        parser.print_help()
        sys.exit(1)


if __name__ == "__main__":
    main()
|
||||
@@ -1,3 +1,4 @@
|
||||
#!/usr/bin/env python3
|
||||
import json
|
||||
from hermes_tools import browser_navigate, browser_vision
|
||||
|
||||
|
||||
@@ -1,45 +0,0 @@
|
||||
"""Tests for PR triage automation (#659)."""
|
||||
import pytest
|
||||
|
||||
class TestCategorize:
    """categorize() maps PR titles onto triage categories."""

    def _pr(self, title="", body=""):
        """Build a minimal PR dict with no labels."""
        return {"title": title, "body": body, "labels": []}

    def test_training(self):
        from scripts.pr_triage import categorize
        assert categorize(self._pr("Add DPO pairs")) == "training-data"

    def test_bug(self):
        from scripts.pr_triage import categorize
        assert categorize(self._pr("fix: crash")) == "bug-fix"

    def test_feature(self):
        from scripts.pr_triage import categorize
        assert categorize(self._pr("feat: dark mode")) == "feature"

    def test_other(self):
        from scripts.pr_triage import categorize
        assert categorize(self._pr("random")) == "other"
|
||||
|
||||
class TestRefs:
    """refs() extracts ``#N`` issue references from title and body."""

    def test_simple(self):
        from scripts.pr_triage import refs
        assert 123 in refs({"title": "Fix #123", "body": ""})

    def test_multiple(self):
        from scripts.pr_triage import refs
        r = refs({"title": "", "body": "Closes #100, Refs #200"})
        assert 100 in r and 200 in r
|
||||
|
||||
class TestDuplicates:
    """find_duplicates() groups PRs that reference the same issue."""

    def test_found(self):
        from scripts.pr_triage import find_duplicates
        prs = [{"title": "", "body": "Fix #1", "number": 1, "head": {"ref": "a"}, "created_at": "2026-01-01T00:00:00Z", "updated_at": "2026-01-01T00:00:00Z", "user": {}},
               {"title": "", "body": "Refs #1", "number": 2, "head": {"ref": "b"}, "created_at": "2026-01-01T00:00:00Z", "updated_at": "2026-01-01T00:00:00Z", "user": {}}]
        assert len(find_duplicates(prs)) == 1

    def test_none(self):
        from scripts.pr_triage import find_duplicates
        prs = [{"title": "", "body": "Fix #1", "number": 1, "head": {"ref": "a"}, "created_at": "2026-01-01T00:00:00Z", "updated_at": "2026-01-01T00:00:00Z", "user": {}},
               {"title": "", "body": "Fix #2", "number": 2, "head": {"ref": "b"}, "created_at": "2026-01-01T00:00:00Z", "updated_at": "2026-01-01T00:00:00Z", "user": {}}]
        assert find_duplicates(prs) == []
|
||||
Reference in New Issue
Block a user