Compare commits


1 Commit

Author SHA1 Message Date
55797c8a3e feat: add sampler.py — session value scorer (#17) 2026-04-15 03:02:12 +00:00
2 changed files with 353 additions and 249 deletions

scripts/dependency_graph.py

@@ -1,249 +0,0 @@
#!/usr/bin/env python3
"""
Cross-Repo Dependency Graph Builder
Scans repos for import/require/reference patterns and builds a directed
dependency graph. Detects circular dependencies. Outputs DOT and Mermaid.
Usage:
python3 scripts/dependency_graph.py /path/to/repos/
python3 scripts/dependency_graph.py --repos repo1,repo2,repo3 --format mermaid
python3 scripts/dependency_graph.py --repos-dir /path/to/ --format dot --output deps.dot
Patterns detected:
- Python: import X, from X import Y
- JavaScript: require("X"), import ... from "X"
- Go: import "X"
- Ansible: include_role, import_role
- Docker/Compose: image: X, depends_on
- Config references: repo-name in YAML/TOML/JSON
"""
import argparse
import json
import os
import re
import sys
from collections import defaultdict
from pathlib import Path
# Known repo names for matching
KNOWN_REPOS = [
"hermes-agent", "timmy-config", "timmy-home", "the-nexus", "the-door",
"the-beacon", "fleet-ops", "burn-fleet", "timmy-dispatch", "turboquant",
"compounding-intelligence", "the-playground", "second-son-of-timmy",
"ai-safety-review", "the-echo-pattern", "timmy-academy", "wolf",
"the-testament",
]
def normalize_repo_name(name: str) -> str:
"""Normalize a repo name for comparison."""
return name.lower().replace("_", "-").replace(".git", "").strip()
def scan_file_for_deps(filepath: str, content: str, own_repo: str) -> set:
"""Scan a file's content for references to other repos."""
deps = set()
own_norm = normalize_repo_name(own_repo)
for repo in KNOWN_REPOS:
repo_norm = normalize_repo_name(repo)
if repo_norm == own_norm:
continue
# Direct name references
patterns = [
repo, # exact name
repo.replace("-", "_"), # underscore variant
repo.replace("-", ""), # no separator
f"/{repo}/", # path reference
f'"{repo}"', # quoted
f"'{repo}'", # single quoted
f"Timmy_Foundation/{repo}", # full Gitea path
f"Timmy_Foundation.{repo}", # Python module path
]
for pattern in patterns:
if pattern in content:
deps.add(repo)
break
return deps
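# For example (hypothetical file contents): scanning a file inside hermes-agent
# whose text contains "from the_nexus import client" matches the underscore
# variant of "the-nexus", so scan_file_for_deps returns {"the-nexus"}.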
def scan_repo(repo_path: str, repo_name: str = None) -> dict:
"""Scan a repo directory for dependencies."""
path = Path(repo_path)
if not path.is_dir():
return {"error": f"Not a directory: {repo_path}"}
if not repo_name:
repo_name = path.name
deps = set()
files_scanned = 0
exts = {".py", ".js", ".ts", ".go", ".yaml", ".yml", ".toml", ".json",
".md", ".sh", ".bash", ".Dockerfile", ".tf", ".hcl"}
for fpath in path.rglob("*"):
if not fpath.is_file():
continue
if fpath.suffix not in exts:
continue
# Skip common non-source dirs
parts = fpath.parts
if any(p in (".git", "node_modules", "__pycache__", ".venv", "venv",
"vendor", "dist", "build", ".tox") for p in parts):
continue
try:
content = fpath.read_text(errors="ignore")
        except OSError:
continue
file_deps = scan_file_for_deps(str(fpath), content, repo_name)
deps.update(file_deps)
files_scanned += 1
return {
"repo": repo_name,
"dependencies": sorted(deps),
"files_scanned": files_scanned,
}
def detect_cycles(graph: dict) -> list:
"""Detect circular dependencies using DFS."""
cycles = []
visited = set()
rec_stack = set()
    def dfs(node, path):
        visited.add(node)
        rec_stack.add(node)
        try:
            for neighbor in graph.get(node, {}).get("dependencies", []):
                if neighbor not in visited:
                    result = dfs(neighbor, path + [neighbor])
                    if result:
                        return result
                elif neighbor in rec_stack:
                    cycle_start = path.index(neighbor)
                    return path[cycle_start:] + [neighbor]
            return None
        finally:
            # Ensure the node leaves the recursion stack even when a cycle is
            # returned early, so later DFS roots don't see stale entries.
            rec_stack.remove(node)
for node in graph:
if node not in visited:
cycle = dfs(node, [node])
if cycle:
cycles.append(cycle)
return cycles
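# Worked example (hypothetical scan results): with
#   graph = {"a": {"dependencies": ["b"]}, "b": {"dependencies": ["a"]}}
# the walk from "a" recurses into "b", finds "a" already on the recursion
# stack, and detect_cycles returns [["a", "b", "a"]].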
def to_dot(graph: dict) -> str:
"""Generate DOT format output."""
lines = ["digraph dependencies {"]
lines.append(" rankdir=LR;")
lines.append(" node [shape=box, style=filled, fillcolor="#1a1a2e", fontcolor="#e6edf3"];")
lines.append(" edge [color="#4a4a6a"];")
lines.append("")
for repo, data in sorted(graph.items()):
dep_count = len(data.get("dependencies", []))
fill = "#2d1b69" if dep_count > 2 else "#16213e"
lines.append(f' "{repo}" [fillcolor="{fill}"];')
for dep in data.get("dependencies", []):
lines.append(f' "{repo}" -> "{dep}";')
lines.append("}")
return "\n".join(lines)
def to_mermaid(graph: dict) -> str:
"""Generate Mermaid format output."""
lines = ["graph LR"]
for repo, data in sorted(graph.items()):
for dep in data.get("dependencies", []):
lines.append(f" {repo.replace('-','_')} --> {dep.replace('-','_')}")
# Add node labels
lines.append("")
for repo in sorted(graph.keys()):
lines.append(f" {repo.replace('-','_')}[{repo}]")
return "\n".join(lines)
def main():
parser = argparse.ArgumentParser(description="Build cross-repo dependency graph")
parser.add_argument("repos_dir", nargs="?", help="Directory containing repos")
parser.add_argument("--repos", help="Comma-separated list of repo paths")
parser.add_argument("--format", choices=["dot", "mermaid", "json"], default="json")
parser.add_argument("--output", "-o", help="Output file (default: stdout)")
parser.add_argument("--cycles-only", action="store_true", help="Only report cycles")
args = parser.parse_args()
results = {}
repo_paths = []
if args.repos:
repo_paths = [p.strip() for p in args.repos.split(",")]
elif args.repos_dir:
base = Path(args.repos_dir)
repo_paths = [str(p) for p in base.iterdir() if p.is_dir() and not p.name.startswith(".")]
else:
parser.print_help()
sys.exit(1)
for rpath in repo_paths:
name = Path(rpath).name
print(f"Scanning {name}...", file=sys.stderr)
result = scan_repo(rpath, name)
if "error" not in result:
results[name] = result
# Detect cycles
cycles = detect_cycles(results)
if args.cycles_only:
if cycles:
print("CIRCULAR DEPENDENCIES DETECTED:")
for cycle in cycles:
print(f" {' -> '.join(cycle)}")
sys.exit(1)
else:
print("No circular dependencies found.")
sys.exit(0)
# Output
output = {}
if args.format == "dot":
output = to_dot(results)
elif args.format == "mermaid":
output = to_mermaid(results)
else:
output = json.dumps({
"repos": results,
"cycles": cycles,
"summary": {
"total_repos": len(results),
"total_deps": sum(len(r["dependencies"]) for r in results.values()),
"cycles_found": len(cycles),
}
}, indent=2)
if args.output:
Path(args.output).write_text(output)
print(f"Written to {args.output}", file=sys.stderr)
else:
print(output)
if __name__ == "__main__":
main()

scripts/sampler.py Normal file

@@ -0,0 +1,353 @@
#!/usr/bin/env python3
"""
sampler.py — Score and rank sessions by harvest value.
With 20k+ sessions on disk, we can't harvest all at once. This script
scores each session by how likely it is to contain valuable knowledge,
so the harvester processes the best ones first.
Scoring strategy:
- Recency: last 7d=3pts, last 30d=2pts, older=1pt
- Length: >50 messages=3pts, >20=2pts, <20=1pt
- Repo uniqueness: first session for a repo=5pts, otherwise=1pt
- Outcome: failure=3pts (most to learn), success=2pts, unknown=1pt
- Tool calls: >10 tool invocations=2pts (complex sessions)
Usage:
python3 sampler.py --count 100 # Top 100 sessions
python3 sampler.py --repo the-nexus --count 20 # Top 20 for a repo
python3 sampler.py --since 2026-04-01 # All sessions since date
python3 sampler.py --count 50 --min-score 8 # Only high-value sessions
python3 sampler.py --count 100 --output sample.json # Save to file
"""
import argparse
import json
import os
import sys
import time
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Optional
# --- Fast session scanning (no full parse) ---
def scan_session_fast(path: str) -> dict:
"""Extract scoring metadata from a session without parsing the full JSONL.
Reads only: first line, last ~20 lines, and line count. This processes
20k sessions in seconds instead of minutes.
"""
meta = {
'path': path,
'message_count': 0,
'has_tool_calls': False,
'tool_call_count': 0,
'first_timestamp': '',
'last_timestamp': '',
'is_failure': False,
'repos_mentioned': set(),
'first_role': '',
'last_content_preview': '',
}
try:
file_size = os.path.getsize(path)
if file_size == 0:
return meta
with open(path, 'r', encoding='utf-8', errors='replace') as f:
# Read first line for timestamp + role
first_line = f.readline().strip()
if first_line:
try:
first_msg = json.loads(first_line)
meta['first_timestamp'] = first_msg.get('timestamp', '')
meta['first_role'] = first_msg.get('role', '')
except json.JSONDecodeError:
pass
# Fast line count + collect tail lines
# For the tail, seek to near end of file
tail_lines = []
line_count = 1 # already read first
if file_size > 8192:
# Seek to last 8KB for tail sampling
f.seek(max(0, file_size - 8192))
f.readline() # skip partial line
for line in f:
line = line.strip()
if line:
tail_lines.append(line)
line_count += 1
# We lost the exact count for big files — estimate from file size
# Average JSONL line is ~500 bytes
if line_count < 100:
line_count = max(line_count, file_size // 500)
else:
# Small file — read all
for line in f:
line = line.strip()
if line:
tail_lines.append(line)
line_count += 1
meta['message_count'] = line_count
# Parse tail lines for outcome, tool calls, repos
for line in tail_lines[-30:]: # last 30 non-empty lines
try:
msg = json.loads(line)
# Track last timestamp
ts = msg.get('timestamp', '')
if ts:
meta['last_timestamp'] = ts
# Count tool calls
if msg.get('tool_calls'):
meta['has_tool_calls'] = True
meta['tool_call_count'] += len(msg['tool_calls'])
# Detect failure signals in content
content = ''
if isinstance(msg.get('content'), str):
content = msg['content'].lower()
elif isinstance(msg.get('content'), list):
for part in msg['content']:
if isinstance(part, dict) and part.get('type') == 'text':
content += part.get('text', '').lower()
if content:
meta['last_content_preview'] = content[:200]
failure_signals = ['error', 'failed', 'cannot', 'unable',
'exception', 'traceback', 'rejected', 'denied']
if any(sig in content for sig in failure_signals):
meta['is_failure'] = True
# Extract repo references from tool call arguments
if msg.get('tool_calls'):
for tc in msg['tool_calls']:
args = tc.get('function', {}).get('arguments', '')
if isinstance(args, str):
# Look for repo patterns
for pattern in ['Timmy_Foundation/', 'Rockachopa/', 'compounding-intelligence', 'the-nexus', 'timmy-home', 'hermes-agent', 'the-beacon', 'the-door']:
if pattern in args:
repo = pattern.rstrip('/')
meta['repos_mentioned'].add(repo)
except json.JSONDecodeError:
continue
except (IOError, OSError):
pass
meta['repos_mentioned'] = list(meta['repos_mentioned'])
return meta
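# The scanner above assumes each JSONL line is one message object roughly of
# this shape (field names inferred from the parsing logic; content may also be
# a plain string, and real sessions may carry extra fields):
#
#   {"timestamp": "2026-04-13T12:34:56Z", "role": "assistant",
#    "content": [{"type": "text", "text": "..."}],
#    "tool_calls": [{"function": {"name": "...",
#                    "arguments": "{\"path\": \"Timmy_Foundation/the-nexus\"}"}}]}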
# --- Filename timestamp parsing ---
def parse_session_timestamp(filename: str) -> Optional[datetime]:
"""Parse timestamp from session filename.
Common formats:
session_20260413_123456_hash.jsonl
20260413_123456_hash.jsonl
"""
stem = Path(filename).stem
parts = stem.split('_')
# Try session_YYYYMMDD_HHMMSS format
for i, part in enumerate(parts):
if len(part) == 8 and part.isdigit():
date_part = part
time_part = parts[i + 1] if i + 1 < len(parts) and len(parts[i + 1]) == 6 else '000000'
try:
return datetime.strptime(f"{date_part}_{time_part}", '%Y%m%d_%H%M%S').replace(tzinfo=timezone.utc)
except ValueError:
continue
# Fallback: use file modification time
return None
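# For example, parse_session_timestamp("session_20260413_123456_abc.jsonl")
# returns 2026-04-13 12:34:56 UTC, while a name with no 8-digit date token
# returns None and the caller falls back to the file's mtime.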
# --- Scoring ---
def score_session(meta: dict, now: datetime, seen_repos: set) -> tuple[int, dict]:
"""Score a session for harvest value. Returns (score, breakdown)."""
score = 0
breakdown = {}
# 1. Recency
ts = parse_session_timestamp(os.path.basename(meta['path']))
if ts is None:
# Fallback to mtime
try:
ts = datetime.fromtimestamp(os.path.getmtime(meta['path']), tz=timezone.utc)
except OSError:
ts = now - timedelta(days=365)
age_days = (now - ts).days
if age_days <= 7:
recency = 3
elif age_days <= 30:
recency = 2
else:
recency = 1
score += recency
breakdown['recency'] = recency
# 2. Length
count = meta['message_count']
if count > 50:
length = 3
elif count > 20:
length = 2
else:
length = 1
score += length
breakdown['length'] = length
# 3. Repo uniqueness (first session mentioning a repo gets bonus)
repo_score = 0
for repo in meta.get('repos_mentioned', []):
if repo not in seen_repos:
seen_repos.add(repo)
repo_score = max(repo_score, 5)
else:
repo_score = max(repo_score, 1)
score += repo_score
breakdown['repo_unique'] = repo_score
# 4. Outcome
if meta.get('is_failure'):
outcome = 3
elif meta.get('last_content_preview', '').strip():
outcome = 2 # has some content = likely completed
else:
outcome = 1
score += outcome
breakdown['outcome'] = outcome
# 5. Tool calls
if meta.get('tool_call_count', 0) > 10:
tool = 2
else:
tool = 0
score += tool
breakdown['tool_calls'] = tool
return score, breakdown
# --- Main ---
def main():
parser = argparse.ArgumentParser(description="Score and rank sessions for harvesting")
parser.add_argument('--sessions-dir', default=os.path.expanduser('~/.hermes/sessions'),
help='Directory containing session files')
parser.add_argument('--count', type=int, default=100, help='Number of top sessions to return')
parser.add_argument('--repo', default='', help='Filter to sessions mentioning this repo')
parser.add_argument('--since', default='', help='Only score sessions after this date (YYYY-MM-DD)')
parser.add_argument('--min-score', type=int, default=0, help='Minimum score threshold')
parser.add_argument('--output', default='', help='Output file (JSON). Default: stdout')
parser.add_argument('--format', choices=['json', 'paths', 'table'], default='table',
help='Output format: json (full), paths (one per line), table (human)')
parser.add_argument('--top-percent', type=float, default=0, help='Return top N%% instead of --count')
args = parser.parse_args()
sessions_dir = Path(args.sessions_dir)
if not sessions_dir.is_dir():
print(f"ERROR: Sessions directory not found: {sessions_dir}", file=sys.stderr)
sys.exit(1)
# Find all JSONL files
print(f"Scanning {sessions_dir}...", file=sys.stderr)
t0 = time.time()
session_files = list(sessions_dir.glob('*.jsonl'))
total = len(session_files)
print(f"Found {total} session files", file=sys.stderr)
# Parse since date
since_dt = None
if args.since:
since_dt = datetime.strptime(args.since, '%Y-%m-%d').replace(tzinfo=timezone.utc)
# Score all sessions
now = datetime.now(timezone.utc)
seen_repos = set() # Track repos for uniqueness scoring
scored = []
for i, sf in enumerate(session_files):
# Date filter (fast path: check filename first)
if since_dt:
ts = parse_session_timestamp(sf.name)
if ts and ts < since_dt:
continue
meta = scan_session_fast(str(sf))
# Repo filter
if args.repo:
repos = meta.get('repos_mentioned', [])
if args.repo.lower() not in [r.lower() for r in repos]:
# Also check filename
if args.repo.lower() not in sf.name.lower():
continue
score, breakdown = score_session(meta, now, seen_repos)
if score >= args.min_score:
scored.append({
'path': str(sf),
'filename': sf.name,
'score': score,
'breakdown': breakdown,
'message_count': meta['message_count'],
'repos': meta['repos_mentioned'],
'is_failure': meta['is_failure'],
})
if (i + 1) % 5000 == 0:
elapsed = time.time() - t0
print(f" Scanned {i + 1}/{total} ({elapsed:.1f}s)", file=sys.stderr)
elapsed = time.time() - t0
print(f"Scored {len(scored)} sessions in {elapsed:.1f}s", file=sys.stderr)
# Sort by score descending
scored.sort(key=lambda x: x['score'], reverse=True)
# Apply count or percent
if args.top_percent > 0:
count = max(1, int(len(scored) * args.top_percent / 100))
else:
count = args.count
scored = scored[:count]
# Output
if args.output:
with open(args.output, 'w', encoding='utf-8') as f:
json.dump(scored, f, indent=2)
print(f"Wrote {len(scored)} sessions to {args.output}", file=sys.stderr)
elif args.format == 'json':
json.dump(scored, sys.stdout, indent=2)
elif args.format == 'paths':
for s in scored:
print(s['path'])
else: # table
print(f"{'SCORE':>5} {'MSGS':>5} {'REPOS':<25} {'FILE'}")
print(f"{'-'*5} {'-'*5} {'-'*25} {'-'*40}")
for s in scored:
repos = ', '.join(s['repos'][:2]) if s['repos'] else '-'
fail = ' FAIL' if s['is_failure'] else ''
print(f"{s['score']:>5} {s['message_count']:>5} {repos:<25} {s['filename'][:40]}{fail}")
if __name__ == '__main__':
main()
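# Illustrative run (rows invented to show the table format only), assuming a
# populated ~/.hermes/sessions directory:
#
#   $ python3 sampler.py --count 3 --since 2026-04-01
#   SCORE  MSGS REPOS                     FILE
#   ----- ----- ------------------------- ----------------------------------------
#      16    72 the-nexus, hermes-agent   session_20260412_091403_f3a1.jsonl FAIL
#      13    41 timmy-home                session_20260414_170212_8b2c.jsonl
#      11    55 -                         session_20260409_231005_77de.jsonl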