Compare commits

..

2 Commits

Author SHA1 Message Date
Step35
86eb1c9a50 feat: training data pipeline — knowledge entries → JSONL training pairs
Some checks failed
Test / pytest (pull_request) Failing after 7s
Add scripts/knowledge_to_training_pairs.py which reads quality-gated
knowledge entries from knowledge/index.json and emits terse→rich
training pairs in JSONL format.

Features:
- Derives terse queries from facts via category-aware heuristics
- Configurable quality filters: min-confidence, model-filter, date range
- Output includes domain, source_confidence, source_model
- Smoke tests added in tests/test_knowledge_to_training_pairs.py

Deliverables for #199:
1. Pipeline script: scripts/knowledge_to_training_pairs.py
2. End-to-end: knowledge/index.json → training_pairs.jsonl (or custom JSONL)
3. Config: min-confidence, model-filter, after/before date filters
4. Test: 9 smoke tests covering conversion, filtering, and end-to-end run

Closes #199
2026-04-26 13:03:06 -04:00
Rockachopa
4b5a675355 feat: add PR complexity scorer — estimate review effort

Implements issue #135: a script that analyzes open PRs and computes
a complexity score (1-10) based on files changed, lines added/removed,
dependency changes, and test coverage delta. Also estimates review time.

The scorer can be run with --dry-run to preview or --apply to post
score comments directly on PRs.

Output: metrics/pr_complexity.json with full analysis.

Closes #135
Some checks failed
Test / pytest (push) Failing after 10s
2026-04-26 09:34:57 -04:00
5 changed files with 950 additions and 477 deletions

View File

@@ -0,0 +1,255 @@
#!/usr/bin/env python3
"""
knowledge_to_training_pairs.py — Convert quality-gated knowledge entries into training pairs.
Reads knowledge/index.json (or a custom JSONL of entries), applies quality filters,
and emits terse→rich training pairs in JSONL format for model fine-tuning.
Usage:
python3 scripts/knowledge_to_training_pairs.py \
--input knowledge/index.json \
--output training_pairs.jsonl \
--min-confidence 0.7 \
--model-filter claude-sonnet,gpt-4 \
--after 2026-01-01
Input entry format (from index.json facts):
{
"id": "hermes-agent:pitfall:001",
"fact": "deploy-crons.py leaves jobs in mixed model format",
"category": "pitfall",
"domain": "hermes-agent",
"confidence": 0.95,
...
}
Output training pair format:
{
"terse": "How do I handle deploy-crons.py mixed model format?",
"rich": "deploy-crons.py leaves jobs in mixed model format.",
"domain": "hermes-agent",
"source_confidence": 0.95,
"source_model": "unknown"
}
"""
import argparse
import json
import os
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
def fact_to_terse(fact: str, category: str, domain: str) -> str:
    """
    Derive a short user query from a knowledge fact.

    The excerpt used in the query is the text before the first ". " when the
    fact contains one, otherwise the first 80 characters of the fact.

    Strategy by category:
    - "pitfall"    → "How do I avoid/fix/handle <excerpt>?" (verb picked from the wording)
    - "pattern"    → "What's the recommended way to <excerpt>?"
    - "tool-quirk" → "How does <first word of fact> behave in this context?"
    - "question"   → "What is the answer to: <excerpt>?"
    - anything else → "What should I know about <excerpt>?"

    Args:
        fact: The knowledge fact text.
        category: Entry category; selects the question template.
        domain: Entry domain; only used as the tool name for "tool-quirk"
            facts that contain no words.

    Returns:
        The generated query string.
    """
    fact_lower = fact.lower()
    # Extract a concise excerpt (first sentence or 80 chars)
    excerpt = fact.split('. ')[0] if '. ' in fact else fact[:80]

    if category == "pitfall":
        # Pick the verb from the fact wording; "handle" is the neutral default.
        if "trigger" in fact_lower or "cause" in fact_lower:
            verb = "avoid"
        elif "broken" in fact_lower or "fails" in fact_lower:
            verb = "fix"
        else:
            verb = "handle"
        return f"How do I {verb} {excerpt.rstrip('.')}?"

    if category == "pattern":
        return f"What's the recommended way to {excerpt.rstrip('.')}?"

    if category == "tool-quirk":
        # The first whitespace-separated word is taken as the tool name.
        words = fact.split()
        tool = words[0] if words else domain
        return f"How does {tool} behave in this context?"

    if category == "question":
        return f"What is the answer to: {excerpt}?"

    # "fact" or any unknown category
    return f"What should I know about {excerpt.rstrip('.')}?"
def parse_date(date_str: Optional[str]) -> Optional[datetime]:
    """
    Convert an ISO-8601 string into a datetime.

    A "Z" in the input is rewritten to "+00:00" before parsing. Returns None
    when the input is empty/None or cannot be parsed as an ISO date.
    """
    if date_str:
        normalized = date_str.replace("Z", "+00:00")
        try:
            parsed = datetime.fromisoformat(normalized)
        except ValueError:
            parsed = None
        return parsed
    return None
def load_knowledge_index(path: str) -> list[dict]:
    """
    Load knowledge entries from an index file.

    Accepted layouts:
    - index.json: a JSON object with a "facts" list
    - a plain JSON array of entries
    - JSONL: one JSON entry per line (blank lines are ignored)
    - a single JSON object that itself has a "fact" key (one-line JSONL)

    Args:
        path: Path of the file to read (decoded as UTF-8).

    Returns:
        The list of entry dicts.

    Exits the process with status 1 (after printing to stderr) when the file
    does not exist or its content matches none of the layouts above.
    """
    p = Path(path)
    if not p.exists():
        print(f"ERROR: Knowledge input not found: {path}", file=sys.stderr)
        sys.exit(1)

    text = p.read_text(encoding="utf-8")
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        # Not a single JSON document: fall back to line-delimited JSON.
        try:
            data = [json.loads(line) for line in text.splitlines() if line.strip()]
        except json.JSONDecodeError:
            print(f"ERROR: Unrecognized input format in {path}", file=sys.stderr)
            sys.exit(1)

    # index.json format: {"facts": [...], ...}
    if isinstance(data, dict) and "facts" in data:
        return data["facts"]
    # JSON array, or JSONL collected into a list above
    if isinstance(data, list):
        return data
    # A one-line JSONL file parses as a single entry object
    if isinstance(data, dict) and "fact" in data:
        return [data]

    print(f"ERROR: Unrecognized input format in {path}", file=sys.stderr)
    sys.exit(1)
def filter_entries(entries: list[dict],
                   min_confidence: float = 0.0,
                   model_filter: Optional[list[str]] = None,
                   after: Optional[datetime] = None,
                   before: Optional[datetime] = None) -> list[dict]:
    """
    Apply quality and provenance filters.

    Args:
        entries: Knowledge entry dicts.
        min_confidence: Entries whose "confidence" (default 0.0) is below
            this value are dropped.
        model_filter: When given, only entries whose model (top-level "model",
            else provenance["model"], else "unknown") is in this list are kept.
        after: Drop entries dated strictly before this moment.
        before: Drop entries dated strictly after this moment.

    Returns:
        The entries that pass every filter, in input order. Entries without a
        parseable date are never dropped by the date filters.

    Timezone-naive datetimes (both the bounds and the entry dates) are
    interpreted as UTC so that a bound such as "2026-01-01" can be compared
    with an entry timestamp such as "2026-04-20T12:00:00Z"; comparing a naive
    with an aware datetime would otherwise raise TypeError.
    """
    def _as_utc(dt: Optional[datetime]) -> Optional[datetime]:
        # Attach UTC to naive values; leave aware values and None untouched.
        if dt is not None and dt.utcoffset() is None:
            return dt.replace(tzinfo=timezone.utc)
        return dt

    after = _as_utc(after)
    before = _as_utc(before)

    filtered = []
    for entry in entries:
        # Confidence filter (entry confidence)
        conf = entry.get("confidence", 0.0)
        if conf < min_confidence:
            continue

        # Model filter: if specified, entry's model must be in the list
        if model_filter:
            entry_model = entry.get("model", entry.get("provenance", {}).get("model", "unknown"))
            if entry_model not in model_filter:
                continue

        # Date filter: first parseable of last_confirmed, first_seen, harvested_at
        entry_date = None
        for field in ("last_confirmed", "first_seen", "harvested_at"):
            if field in entry:
                entry_date = _as_utc(parse_date(entry[field]))
                if entry_date:
                    break
        if after and entry_date and entry_date < after:
            continue
        if before and entry_date and entry_date > before:
            continue

        filtered.append(entry)
    return filtered
def entry_to_pair(entry: dict) -> Optional[dict]:
    """
    Convert a knowledge entry into a training pair.

    Args:
        entry: Knowledge entry dict; reads "fact", "category" (default "fact"),
            "domain" (default "global"), "confidence" (default 0.0) and the
            model from "model" or provenance["model"] (default "unknown").

    Returns:
        A dict with keys "terse", "rich", "domain", "source_confidence" and
        "source_model", or None when the entry has no usable fact text
        (missing, not a string, or blank).
    """
    raw_fact = entry.get("fact")
    # A null / non-string fact is treated like a missing one instead of crashing.
    fact = raw_fact.strip() if isinstance(raw_fact, str) else ""
    if not fact:
        return None

    category = entry.get("category", "fact")
    domain = entry.get("domain", "global")

    return {
        "terse": fact_to_terse(fact, category, domain),
        "rich": fact,
        "domain": domain,
        "source_confidence": round(entry.get("confidence", 0.0), 4),
        "source_model": entry.get("model", entry.get("provenance", {}).get("model", "unknown")),
    }
def main():
    """
    CLI entry point: load entries, filter them, convert them to training
    pairs and write the pairs as JSONL.

    Statistics are printed to stderr as JSON. With --dry-run up to three
    sample pairs are printed to stderr and nothing is written. An --after or
    --before value that cannot be parsed as an ISO date aborts with a usage
    error instead of silently disabling that filter.
    """
    parser = argparse.ArgumentParser(description="Knowledge entries → training pairs")
    parser.add_argument("--input", "-i", default="knowledge/index.json",
                        help="Input knowledge index or JSONL (default: knowledge/index.json)")
    parser.add_argument("--output", "-o", default="training_pairs.jsonl",
                        help="Output JSONL file")
    parser.add_argument("--min-confidence", type=float, default=0.5,
                        help="Minimum entry confidence to include (0.0-1.0, default: 0.5)")
    parser.add_argument("--model-filter",
                        help="Comma-separated list of source models to include")
    parser.add_argument("--after",
                        help="Include entries last_confirmed/first_seen on or after this date (YYYY-MM-DD)")
    parser.add_argument("--before",
                        help="Include entries last_confirmed/first_seen on or before this date (YYYY-MM-DD)")
    parser.add_argument("--dry-run", action="store_true",
                        help="Print sample pairs and stats without writing")
    args = parser.parse_args()

    def _date_arg(flag, raw):
        # parse_date() returns None for bad input; surface that as a CLI error.
        if not raw:
            return None
        parsed = parse_date(raw)
        if parsed is None:
            parser.error(f"{flag}: invalid date {raw!r} (expected ISO format, e.g. YYYY-MM-DD)")
        return parsed

    # Parse filters
    after_dt = _date_arg("--after", args.after)
    before_dt = _date_arg("--before", args.before)
    model_list = None
    if args.model_filter:
        # Tolerate spaces around commas ("a, b") and ignore empty items.
        model_list = [m.strip() for m in args.model_filter.split(",") if m.strip()] or None

    # Load
    entries = load_knowledge_index(args.input)
    print(f"Loaded {len(entries)} entries from {args.input}", file=sys.stderr)

    # Filter
    kept = filter_entries(
        entries,
        min_confidence=args.min_confidence,
        model_filter=model_list,
        after=after_dt,
        before=before_dt,
    )
    print(f"After filtering: {len(kept)} / {len(entries)} entries", file=sys.stderr)

    # Convert; entry_to_pair() yields a falsy value for entries without a fact
    pairs = [pair for pair in map(entry_to_pair, kept) if pair]

    # Stats
    avg_conf = 0.0
    domains = {}
    models = {}
    if pairs:
        avg_conf = sum(p["source_confidence"] for p in pairs) / len(pairs)
        for p in pairs:
            domains[p["domain"]] = domains.get(p["domain"], 0) + 1
            models[p["source_model"]] = models.get(p["source_model"], 0) + 1

    stats = {
        "input_entries": len(entries),
        "after_filter": len(kept),
        "pairs_generated": len(pairs),
        "avg_confidence": round(avg_conf, 4),
        "domains": domains,
        "source_models": models,
    }
    print(json.dumps(stats, indent=2), file=sys.stderr)

    if args.dry_run:
        print("\nSample pairs:", file=sys.stderr)
        for p in pairs[:3]:
            print(json.dumps(p, ensure_ascii=False), file=sys.stderr)
        return

    # Write JSONL
    out_path = Path(args.output)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        for pair in pairs:
            f.write(json.dumps(pair, ensure_ascii=False) + "\n")
    print(f"\nWrote {len(pairs)} training pairs to {out_path}", file=sys.stderr)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,351 @@
#!/usr/bin/env python3
"""
PR Complexity Scorer - Estimate review effort for PRs.
"""
import argparse
import json
import os
import re
import sys
from dataclasses import dataclass, asdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
import urllib.request
import urllib.error
# Base URL of the Gitea REST API used by GiteaClient.
GITEA_BASE = "https://forge.alexanderwhitestone.com/api/v1"

# File names treated as dependency manifests / lock files.
DEPENDENCY_FILES = {
    "requirements.txt", "pyproject.toml", "setup.py", "setup.cfg",
    "Pipfile", "poetry.lock", "package.json", "yarn.lock", "Gemfile",
    "go.mod", "Cargo.toml", "pom.xml", "build.gradle"
}

# Regexes (applied with re.search) that mark a path as a test file.
# Directory and "test_" prefixes are anchored to the start of a path segment
# so that e.g. "contests/x.py" or "latest_release.py" are not misclassified.
TEST_PATTERNS = [
    r"(^|/)tests?/.*\.py$", r".*_test\.py$", r"(^|/)test_[^/]*\.py$",
    r"(^|/)spec/.*\.rb$", r".*_spec\.rb$",
    r"(^|/)__tests__/", r".*\.test\.(js|ts|jsx|tsx)$"
]

# Weights of the four components in score_pr()'s weighted sum.
WEIGHT_FILES = 0.25
WEIGHT_LINES = 0.25
WEIGHT_DEPS = 0.30
WEIGHT_TEST_COV = 0.20

# Inclusive upper bounds of the small / medium / large tiers for file count.
SMALL_FILES = 5
MEDIUM_FILES = 20
LARGE_FILES = 50

# Inclusive upper bounds of the small / medium / large tiers for changed lines.
SMALL_LINES = 100
MEDIUM_LINES = 500
LARGE_LINES = 2000

# Estimated review minutes for each final complexity score (1-10).
TIME_PER_POINT = {1: 5, 2: 10, 3: 15, 4: 20, 5: 25, 6: 30, 7: 45, 8: 60, 9: 90, 10: 120}
@dataclass
class PRComplexity:
    """Complexity analysis result for a single pull request."""

    pr_number: int
    title: str
    # Size of the change
    files_changed: int
    additions: int
    deletions: int
    has_dependency_changes: bool
    # Test files added minus test files removed; None when neither occurred
    test_coverage_delta: Optional[int]
    # Final 1-10 score and the review-time estimate derived from it
    score: int
    estimated_minutes: int
    # Human-readable explanations behind the score
    reasons: List[str]

    def to_dict(self) -> dict:
        """Return the fields of this result as a plain dict."""
        payload = asdict(self)
        return payload
class GiteaClient:
    """Minimal Gitea REST client built on urllib, authenticated with an API token."""

    # Safety cap so a server that ignores the `page` parameter cannot loop forever.
    _MAX_PAGES = 200

    def __init__(self, token: str):
        self.token = token
        self.base_url = GITEA_BASE.rstrip("/")

    def _build_url(self, path: str, params: Optional[Dict] = None) -> str:
        """Join base URL and path, appending URL-encoded params whose value is not None."""
        from urllib.parse import urlencode

        url = f"{self.base_url}{path}"
        query = {k: v for k, v in (params or {}).items() if v is not None}
        if query:
            url += "?" + urlencode(query)
        return url

    def _request(self, path: str, params: Optional[Dict] = None) -> Any:
        """
        GET `path` and return the decoded JSON body.

        Returns None (after printing the error to stderr) on HTTP or network
        errors.
        """
        req = urllib.request.Request(self._build_url(path, params))
        req.add_header("Authorization", f"token {self.token}")
        req.add_header("Content-Type", "application/json")
        try:
            with urllib.request.urlopen(req, timeout=30) as resp:
                return json.loads(resp.read().decode())
        except urllib.error.HTTPError as e:
            # HTTPError is a subclass of URLError, so it must be handled first.
            print(f"API error {e.code}: {e.read().decode()[:200]}", file=sys.stderr)
            return None
        except urllib.error.URLError as e:
            print(f"Network error: {e}", file=sys.stderr)
            return None

    def _paginate(self, path: str, page_size: int, extra: Optional[Dict] = None) -> List[Dict]:
        """
        Collect every page of a list endpoint.

        Paging continues until an empty (or failed) response rather than until
        a short page: the server may cap `limit` below the requested size, in
        which case a "short" first page does not mean the listing is complete.
        """
        items: List[Dict] = []
        for page in range(1, self._MAX_PAGES + 1):
            params = {"limit": page_size, "page": page}
            params.update(extra or {})
            batch = self._request(path, params)
            if not batch:
                break
            items.extend(batch)
        return items

    def get_open_prs(self, org: str, repo: str) -> List[Dict]:
        """Return all open pull requests of org/repo (empty list on API failure)."""
        return self._paginate(f"/repos/{org}/{repo}/pulls", 50, {"state": "open"})

    def get_pr_files(self, org: str, repo: str, pr_number: int) -> List[Dict]:
        """Return the changed-file records of a pull request (empty list on API failure)."""
        return self._paginate(f"/repos/{org}/{repo}/pulls/{pr_number}/files", 100)

    def post_comment(self, org: str, repo: str, pr_number: int, body: str) -> bool:
        """
        Post `body` as a comment on the pull request.

        Returns True when the server answers 200/201, False on any HTTP or
        network error.
        """
        data = json.dumps({"body": body}).encode("utf-8")
        req = urllib.request.Request(
            f"{self.base_url}/repos/{org}/{repo}/issues/{pr_number}/comments",
            data=data,
            method="POST",
            headers={"Authorization": f"token {self.token}", "Content-Type": "application/json"}
        )
        try:
            with urllib.request.urlopen(req, timeout=30) as resp:
                return resp.status in (200, 201)
        except urllib.error.URLError as e:
            # Covers HTTPError as well; report instead of failing silently.
            print(f"Failed to post comment on PR #{pr_number}: {e}", file=sys.stderr)
            return False
def is_dependency_file(filename: str) -> bool:
    """
    Return True when the final path component of `filename` is one of
    DEPENDENCY_FILES.

    The comparison uses the whole base name rather than a suffix match, so
    "src/requirements.txt" matches but "tests/test_setup.py" (which merely
    ends with "setup.py") does not.
    """
    basename = filename.replace("\\", "/").rsplit("/", 1)[-1]
    return basename in DEPENDENCY_FILES
def is_test_file(filename: str) -> bool:
    """Return True when `filename` matches at least one regex in TEST_PATTERNS."""
    for pattern in TEST_PATTERNS:
        if re.search(pattern, filename):
            return True
    return False
def score_pr(
    files_changed: int,
    additions: int,
    deletions: int,
    has_dependency_changes: bool,
    test_coverage_delta: Optional[int] = None
) -> tuple[int, int, List[str]]:
    """
    Score a pull request's review complexity.

    Four component scores (file count, changed lines, dependency changes,
    test-file delta) are combined as a weighted sum, multiplied by 3 and added
    to a base of 1; the result is rounded and clamped to 1..10.

    Returns:
        (score, estimated_minutes, reasons): the final score, the minutes
        looked up in TIME_PER_POINT (30 if absent), and the explanations
        collected along the way.
    """
    def _bucket(amount, tiers, overflow):
        # First tier whose inclusive ceiling covers `amount`, else the overflow tier.
        for ceiling, points, label in tiers:
            if amount <= ceiling:
                return points, label
        return overflow

    reasons: List[str] = []

    # Files changed
    fscore, file_reason = _bucket(
        files_changed,
        (
            (SMALL_FILES, 1.0, "small number of files changed"),
            (MEDIUM_FILES, 2.0, "moderate number of files changed"),
            (LARGE_FILES, 2.5, "large number of files changed"),
        ),
        (3.0, "very large PR spanning many files"),
    )
    reasons.append(file_reason)

    # Lines changed
    lscore, size_reason = _bucket(
        additions + deletions,
        (
            (SMALL_LINES, 1.0, "small change size"),
            (MEDIUM_LINES, 2.0, "moderate change size"),
            (LARGE_LINES, 3.0, "large change size"),
        ),
        (4.0, "very large change"),
    )
    reasons.append(size_reason)

    # Dependency changes
    dscore = 0.0
    if has_dependency_changes:
        dscore = 2.5
        reasons.append("dependency changes (architectural impact)")

    # Test coverage delta: added tests lower the score, removed tests raise it
    tscore = 0.0
    if test_coverage_delta is None:
        reasons.append("test coverage change not assessed")
    elif test_coverage_delta > 0:
        reasons.append(f"test additions (+{test_coverage_delta} test files)")
        tscore = -min(2.0, test_coverage_delta / 2.0)
    elif test_coverage_delta < 0:
        reasons.append(f"test removals ({abs(test_coverage_delta)} test files)")
        tscore = min(2.0, abs(test_coverage_delta) * 0.5)

    # Weighted sum, scaled by 3 to use full 1-10 range
    weighted = (fscore * WEIGHT_FILES) + (lscore * WEIGHT_LINES) + (dscore * WEIGHT_DEPS) + (tscore * WEIGHT_TEST_COV)
    raw_score = 1.0 + weighted * 3.0
    final_score = max(1, min(10, int(round(raw_score))))
    return final_score, TIME_PER_POINT.get(final_score, 30), reasons
def analyze_pr(client: GiteaClient, org: str, repo: str, pr_data: Dict) -> PRComplexity:
    """
    Fetch the changed files of a pull request and compute its complexity.

    Args:
        client: API client used to list the PR's files.
        org, repo: Repository coordinates.
        pr_data: Pull-request record; "number" is required, "title" optional.

    Returns:
        A PRComplexity with the collected counts, score, time estimate and
        scoring reasons.
    """
    pr_num = pr_data["number"]
    title = pr_data.get("title", "")
    files = client.get_pr_files(org, repo, pr_num)

    additions = sum(f.get("additions", 0) for f in files)
    deletions = sum(f.get("deletions", 0) for f in files)
    has_deps = any(is_dependency_file(f.get("filename", "")) for f in files)

    # NOTE(review): Gitea appears to report removed files with status
    # "deleted" (GitHub uses "removed"); accept both so removals are counted.
    removed_statuses = ("removed", "deleted")
    test_added = sum(
        1 for f in files
        if f.get("status") == "added" and is_test_file(f.get("filename", ""))
    )
    test_removed = sum(
        1 for f in files
        if f.get("status") in removed_statuses and is_test_file(f.get("filename", ""))
    )
    # None means "no test files added or removed", as opposed to a net delta of 0.
    test_delta = test_added - test_removed if (test_added or test_removed) else None

    score, est_min, reasons = score_pr(
        files_changed=len(files),
        additions=additions,
        deletions=deletions,
        has_dependency_changes=has_deps,
        test_coverage_delta=test_delta
    )
    return PRComplexity(
        pr_number=pr_num,
        title=title,
        files_changed=len(files),
        additions=additions,
        deletions=deletions,
        has_dependency_changes=has_deps,
        test_coverage_delta=test_delta,
        score=score,
        estimated_minutes=est_min,
        reasons=reasons
    )
def build_comment(complexity: PRComplexity) -> str:
    """
    Render the Markdown comment for a complexity result: a heading, a metrics
    table, the scoring reasons as bullets, optional dependency / test-file
    notes, and a footer linking to issue #135.
    """
    delta = complexity.test_coverage_delta
    parts = [
        "## 📊 PR Complexity Analysis\n\n",
        f"**PR #{complexity.pr_number}: {complexity.title}**\n\n",
        "| Metric | Value |\n|--------|-------|\n",
        f"| Changes | {complexity.files_changed} files, +{complexity.additions}/-{complexity.deletions} lines |\n",
        f"| Complexity Score | **{complexity.score}/10** |\n",
        f"| Estimated Review Time | ~{complexity.estimated_minutes} minutes |\n\n",
        "### Scoring rationale:",
    ]
    parts.extend(f"\n- {reason}" for reason in complexity.reasons)

    if complexity.has_dependency_changes:
        parts.append("\n- :warning: Dependency changes detected — architectural review recommended")

    if delta is not None:
        if delta > 0:
            parts.append(f"\n- :+1: {delta} test file(s) added")
        elif delta < 0:
            parts.append(f"\n- :warning: {abs(delta)} test file(s) removed")

    parts.append("\n\n---\n")
    parts.append("*Generated by PR Complexity Scorer — [issue #135](https://forge.alexanderwhitestone.com/Timmy_Foundation/compounding-intelligence/issues/135)*")
    return "".join(parts)
def _resolve_token(value: Optional[str], default_path: str) -> str:
    """
    Turn the --token value into an API token.

    `value` may be the token itself or the path of a file containing it.
    Returns "" when no token is available; in particular the default token
    path is never mistaken for a token when that file does not exist.
    """
    if not value:
        return ""
    if os.path.isfile(value):
        with open(value, encoding="utf-8") as f:
            return f.read().strip()
    if value == default_path:
        return ""
    return value


def main():
    """
    CLI entry point: score every open PR of a repository.

    Prints one line per PR, writes all results to the --output JSON file and,
    with --apply (and without --dry-run), posts the analysis as a PR comment.
    Exits with status 1 when no Gitea token can be determined.
    """
    default_token_path = os.path.expanduser("~/.config/gitea/token")

    parser = argparse.ArgumentParser(description="PR Complexity Scorer")
    parser.add_argument("--org", default="Timmy_Foundation")
    parser.add_argument("--repo", default="compounding-intelligence")
    parser.add_argument("--token", default=os.environ.get("GITEA_TOKEN") or default_token_path)
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--apply", action="store_true")
    parser.add_argument("--output", default="metrics/pr_complexity.json")
    args = parser.parse_args()

    token = _resolve_token(args.token, default_token_path)
    if not token:
        print("ERROR: No Gitea token provided", file=sys.stderr)
        sys.exit(1)

    client = GiteaClient(token)
    print(f"Fetching open PRs for {args.org}/{args.repo}...")
    prs = client.get_open_prs(args.org, args.repo)
    if not prs:
        print("No open PRs found.")
        sys.exit(0)
    print(f"Found {len(prs)} open PR(s). Analyzing...")

    results = []
    Path(args.output).parent.mkdir(parents=True, exist_ok=True)
    for pr in prs:
        pr_num = pr["number"]
        title = pr.get("title", "")
        print(f" Analyzing PR #{pr_num}: {title[:60]}")
        try:
            complexity = analyze_pr(client, args.org, args.repo, pr)
            results.append(complexity.to_dict())
            comment = build_comment(complexity)
            if args.dry_run:
                print(f" → Score: {complexity.score}/10, Est: {complexity.estimated_minutes}min [DRY-RUN]")
            elif args.apply:
                success = client.post_comment(args.org, args.repo, pr_num, comment)
                status = "[commented]" if success else "[FAILED]"
                print(f" → Score: {complexity.score}/10, Est: {complexity.estimated_minutes}min {status}")
            else:
                print(f" → Score: {complexity.score}/10, Est: {complexity.estimated_minutes}min [no action]")
        except Exception as e:
            # Top-level boundary: one failing PR must not abort the whole run.
            print(f" ERROR analyzing PR #{pr_num}: {e}", file=sys.stderr)

    with open(args.output, "w") as f:
        json.dump({
            "org": args.org,
            "repo": args.repo,
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "pr_count": len(results),
            "results": results
        }, f, indent=2)

    if results:
        scores = [r["score"] for r in results]
        print(f"\nResults saved to {args.output}")
        print(f"Summary: {len(results)} PRs, scores range {min(scores):.0f}-{max(scores):.0f}")
    else:
        print("\nNo results to save.")


if __name__ == "__main__":
    main()

View File

@@ -1,477 +0,0 @@
#!/usr/bin/env python3
"""
Progress Tracker — Pipeline 10.8
Track improvement metrics over time. Are we getting better?
Metrics tracked:
1. Test coverage — % of Python functions with associated tests (test:source file ratio + line coverage if available)
2. Doc coverage — % of Python callables with docstrings (AST-based)
3. Issue close rate — closed / (opened + closed) per week (Gitea API)
4. Dep freshness — % of requirements pinned vs outdated (pip list --outdated)
Output:
- metrics/snapshots/YYYY-MM-DD.json — one snapshot per run
- metrics/TRENDS.md — cumulative markdown table
- stdout summary
Usage:
python3 scripts/progress_tracker.py
python3 scripts/progress_tracker.py --json
python3 scripts/progress_tracker.py --output metrics/TRENDS.md
Weekly cron:
0 9 * * 1 cd /path/to/compounding-intelligence && python3 scripts/progress_tracker.py
"""
import argparse
import json
import os
import re
import subprocess
import sys
from collections import defaultdict
from datetime import datetime, timezone, timedelta
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
# ── Configuration ──────────────────────────────────────────────────────────
# Directory containing this script, and the repository root one level above it.
SCRIPT_DIR = Path(__file__).resolve().parent
REPO_ROOT = SCRIPT_DIR.parent
# Output locations: <repo>/metrics and <repo>/metrics/snapshots.
METRICS_DIR = REPO_ROOT / "metrics"
SNAPSHOTS_DIR = METRICS_DIR / "snapshots"
# File from which the Gitea API token is read.
TOKEN_PATH = Path.home() / ".config" / "gitea" / "token"
# Gitea REST API base URL and the organisation whose repository is queried.
GITEA_API_BASE = "https://forge.alexanderwhitestone.com/api/v1"
ORG = "Timmy_Foundation"
# Ensure paths exist (note: this runs at import time, not only when main() is called)
SNAPSHOTS_DIR.mkdir(parents=True, exist_ok=True)
# ── Helpers ─────────────────────────────────────────────────────────────────
def run_cmd(cmd: List[str], cwd: Path = REPO_ROOT) -> str:
    """
    Run a command (argument list, no shell) and return its stripped stdout.

    stderr is captured and discarded. Returns "" when the command exits with
    a non-zero status, cannot be started (e.g. executable not installed), or
    does not finish within 30 seconds.
    """
    try:
        result = subprocess.run(
            cmd, capture_output=True, text=True, cwd=cwd, timeout=30
        )
    except (OSError, subprocess.TimeoutExpired):
        # Callers treat "" as "no data"; a missing tool must not crash them.
        return ""
    if result.returncode != 0:
        return ""
    return result.stdout.strip()
def slugify_date(dt: datetime) -> str:
    """Format `dt` as YYYY-MM-DD."""
    return f"{dt:%Y-%m-%d}"
def snapshot_path(dt: datetime) -> Path:
    """Return the snapshot file path for the date of `dt`: SNAPSHOTS_DIR/<YYYY-MM-DD>.json."""
    file_name = slugify_date(dt) + ".json"
    return SNAPSHOTS_DIR / file_name
def load_snapshots() -> List[Dict[str, Any]]:
    """
    Read every *.json file in SNAPSHOTS_DIR, in sorted file-name order.

    Files that cannot be opened or parsed are skipped.
    """
    loaded = []
    for snapshot_file in sorted(SNAPSHOTS_DIR.glob("*.json")):
        try:
            with open(snapshot_file) as handle:
                data = json.load(handle)
        except Exception:
            continue
        loaded.append(data)
    return loaded
# ── Metric 1: Test Coverage ─────────────────────────────────────────────────
def collect_test_coverage() -> Dict[str, Any]:
    """
    Compute test coverage metrics.

    Counts test_*.py / *_test.py files against the remaining .py files under
    REPO_ROOT, skipping any path with a component that starts with "." or
    "__" or that is a known environment/cache directory. If a .coverage data
    file exists, the total line coverage is read via the coverage API, falling
    back to parsing `coverage report` output.

    Returns:
        Dict with test_files, source_files, test_to_source_ratio,
        coverage_tool and coverage_percent (the last two are None when no
        coverage figure could be obtained).
    """
    import io

    excluded_dirs = ('node_modules', 'venv', '.venv', 'env', '.pytest_cache')
    source_files = []
    test_files = []
    for p in REPO_ROOT.rglob("*.py"):
        try:
            rel_parts = p.relative_to(REPO_ROOT).parts
        except ValueError:
            continue
        # Skip hidden/cache/temp dirs (check only relative parts)
        if any(part.startswith(('.', '__')) for part in rel_parts):
            continue
        if any(part in excluded_dirs for part in rel_parts):
            continue
        if p.name.startswith("test_") or p.name.endswith("_test.py"):
            test_files.append(p)
        else:
            source_files.append(p)

    # Try to get line coverage from .coverage
    coverage_percent = None
    coverage_tool = None
    coverage_file = REPO_ROOT / ".coverage"
    if coverage_file.exists():
        try:
            import coverage  # type: ignore
            cov = coverage.Coverage(data_file=str(coverage_file))
            cov.load()
            # Send the textual report to a throwaway buffer: by default it is
            # printed to stdout, which would corrupt the script's --json output.
            total = cov.report(file=io.StringIO())
            coverage_percent = total if isinstance(total, float) else None
            coverage_tool = "coverage"
        except Exception:
            # Fallback: parse `coverage report` output. The CLI may be missing
            # too, so failures to launch it are treated as "no output".
            try:
                out = run_cmd(["coverage", "report", "--skip-empty"])
            except (OSError, subprocess.SubprocessError):
                out = ""
            for line in out.splitlines():
                if "TOTAL" not in line:
                    continue
                parts = line.split()
                if len(parts) < 2:
                    continue
                try:
                    coverage_percent = float(parts[-1].rstrip('%'))
                except ValueError:
                    continue
                coverage_tool = "coverage"
                break

    return {
        "test_files": len(test_files),
        "source_files": len(source_files),
        "test_to_source_ratio": round(len(test_files) / len(source_files), 4) if source_files else 0.0,
        "coverage_tool": coverage_tool,
        "coverage_percent": coverage_percent,
    }
# ── Metric 2: Doc Coverage ──────────────────────────────────────────────────
def collect_doc_coverage() -> Dict[str, Any]:
    """
    Measure docstring coverage by walking the AST of the repository's Python files.

    Every function, async function and class definition counts as one
    callable; it is documented when it has a non-blank docstring. Files that
    cannot be read or parsed are skipped.

    Returns: callables_total, callables_with_doc, doc_coverage_percent
    """
    import ast

    excluded_dirs = ('node_modules', 'venv', '.venv', 'env', '.pytest_cache')
    definition_nodes = (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)

    # Same path filter as the test-coverage metric; test and source files are
    # both included in the docstring count.
    candidates = []
    for path in REPO_ROOT.rglob("*.py"):
        try:
            segments = path.relative_to(REPO_ROOT).parts
        except ValueError:
            continue
        hidden = any(seg.startswith('.') or seg.startswith('__') for seg in segments)
        if hidden or any(seg in excluded_dirs for seg in segments):
            continue
        candidates.append(path)

    total = 0
    documented = 0
    for path in candidates:
        try:
            with open(path) as handle:
                tree = ast.parse(handle.read(), filename=str(path))
            for node in ast.walk(tree):
                if not isinstance(node, definition_nodes):
                    continue
                total += 1
                docstring = ast.get_docstring(node)
                if docstring and docstring.strip():
                    documented += 1
        except Exception:
            continue

    percent = (documented / total * 100) if total else 0.0
    return {
        "callables_total": total,
        "callables_with_doc": documented,
        "doc_coverage_percent": round(percent, 2),
    }
# ── Metric 3: Issue Close Rate ──────────────────────────────────────────────
def collect_issue_metrics() -> Dict[str, Any]:
    """
    Use the Gitea API to get issue open/close stats for the last 7 days.

    Returns:
        Dict with opened_last_7d, closed_last_7d, close_rate
        (closed / (opened + closed), 0.0 when both are zero) and total_open.
        When the token file is missing/empty or the API call fails, the four
        values are None and a "note" (plus "error" on failure) explains why.
    """
    from urllib.parse import urlencode
    from urllib.request import Request, urlopen

    token = TOKEN_PATH.read_text().strip() if TOKEN_PATH.exists() else ""
    if not token:
        return {
            "opened_last_7d": None,
            "closed_last_7d": None,
            "close_rate": None,
            "total_open": None,
            "note": "Gitea token not available"
        }

    now = datetime.now(timezone.utc)
    week_ago = now - timedelta(days=7)
    # NOTE(review): the Gitea API documents `since` as an RFC 3339 timestamp
    # and paginates with `page`/`limit` (not `per_page`) — confirm against the
    # deployed server version.
    since = week_ago.strftime("%Y-%m-%dT%H:%M:%SZ")
    headers = {"Authorization": f"token {token}"}
    base_url = f"{GITEA_API_BASE}/repos/{ORG}/compounding-intelligence/issues"
    page_size = 50
    max_pages = 100  # safety cap against a server that ignores `page`

    try:
        # Get issues updated in the last 7 days, following pagination until
        # the server returns an empty page.
        issues = []
        for page in range(1, max_pages + 1):
            query = urlencode({"state": "all", "since": since, "limit": page_size, "page": page})
            req = Request(f"{base_url}?{query}", headers=headers)
            with urlopen(req, timeout=15) as resp:
                batch = json.loads(resp.read())
            if not batch:
                break
            issues.extend(batch)

        opened = 0
        closed = 0
        for issue in issues:
            created = datetime.fromisoformat(issue["created_at"].replace("Z", "+00:00"))
            if created >= week_ago:
                opened += 1
            if issue.get("state") == "closed":
                closed_at_str = issue.get("closed_at")
                if closed_at_str:
                    closed_at = datetime.fromisoformat(closed_at_str.replace("Z", "+00:00"))
                    if closed_at >= week_ago:
                        closed += 1

        # Total open issues: only the X-Total-Count header is needed.
        req2 = Request(f"{base_url}?state=open&limit=1", headers=headers)
        with urlopen(req2, timeout=15) as resp:
            total_open = int(resp.headers.get("X-Total-Count", "0"))

        total = opened + closed
        close_rate = closed / total if total > 0 else 0.0
        return {
            "opened_last_7d": opened,
            "closed_last_7d": closed,
            "close_rate": round(close_rate, 4),
            "total_open": total_open,
        }
    except Exception as e:
        # Best-effort metric: report the failure in the snapshot instead of aborting.
        return {
            "opened_last_7d": None,
            "closed_last_7d": None,
            "close_rate": None,
            "total_open": None,
            "error": str(e)[:100],
            "note": "Gitea API unavailable"
        }
# ── Metric 4: Dependency Freshness ─────────────────────────────────────────
def collect_dep_freshness() -> Dict[str, Any]:
    """
    Check requirements.txt for outdated dependencies using pip list --outdated.

    Requirement names are taken from the start of each non-comment line;
    option lines (starting with "-", e.g. "-r other.txt", "-e .",
    "--index-url ...") are not packages and are ignored. Names are compared
    with pip's output after normalisation (lower case, runs of "-", "_", "."
    collapsed to "-") so that e.g. "Flask_Cors" matches "flask-cors".

    Returns:
        Dict with total_deps, outdated_deps, freshness_percent and
        outdated_list (names as written in requirements.txt); a "note" is
        added when requirements.txt does not exist.
    """
    def _normalize(name: str) -> str:
        return re.sub(r"[-_.]+", "-", name).lower()

    req_file = REPO_ROOT / "requirements.txt"
    if not req_file.exists():
        return {
            "total_deps": 0,
            "outdated_deps": 0,
            "freshness_percent": 100.0,
            "outdated_list": [],
            "note": "requirements.txt not found"
        }

    # Parse requirements (very simple: take name before comparison op)
    reqs = []
    with open(req_file) as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#") or line.startswith("-"):
                continue
            m = re.match(r"^([a-zA-Z0-9][a-zA-Z0-9_.-]*)", line)
            if m:
                reqs.append(m.group(1))
    if not reqs:
        return {"total_deps": 0, "outdated_deps": 0, "freshness_percent": 100.0, "outdated_list": []}

    # Query pip for outdated packages; any failure means "nothing known outdated"
    outdated_names = set()
    try:
        out = run_cmd(["pip", "list", "--outdated", "--format=json"])
        if out:
            outdated_names = {_normalize(item["name"]) for item in json.loads(out)}
    except Exception:
        pass

    outdated = [p for p in reqs if _normalize(p) in outdated_names]
    total = len(reqs)
    outdated_count = len(outdated)
    freshness = round(((total - outdated_count) / total * 100) if total else 100.0, 1)
    return {
        "total_deps": total,
        "outdated_deps": outdated_count,
        "freshness_percent": freshness,
        "outdated_list": outdated,
    }
# ── Snapshot & Trends ───────────────────────────────────────────────────────
def take_snapshot() -> Dict[str, Any]:
    """
    Run the four metric collectors and bundle their results.

    The result carries the UTC timestamp taken before collection starts, the
    matching YYYY-MM-DD date, and a "metrics" dict keyed by test_coverage,
    doc_coverage, issues and dependencies.
    """
    captured_at = datetime.now(timezone.utc)
    metrics = {
        "test_coverage": collect_test_coverage(),
        "doc_coverage": collect_doc_coverage(),
        "issues": collect_issue_metrics(),
        "dependencies": collect_dep_freshness(),
    }
    return {
        "timestamp": captured_at.isoformat(),
        "date": slugify_date(captured_at),
        "metrics": metrics,
    }
def save_snapshot(snapshot: Dict[str, Any]) -> Path:
    """
    Write `snapshot` as indented JSON to the file named after the date of its
    "timestamp" and return that path.
    """
    taken_at = datetime.fromisoformat(snapshot["timestamp"])
    destination = snapshot_path(taken_at)
    with open(destination, "w") as handle:
        json.dump(snapshot, handle, indent=2)
    return destination
def generate_trends(snapshots: List[Dict[str, Any]], output_path: Optional[Path] = None) -> str:
    """
    Generate the markdown trends report; optionally write it to a file.

    Args:
        snapshots: Snapshot dicts, oldest first; the last one is summarised
            as the "current" snapshot.
        output_path: When given, the report is also written there (UTF-8),
            creating parent directories as needed.

    Returns:
        The markdown text. The table lists snapshots newest first; metric
        values that are unavailable (None) are shown as "-" or "?".
    """
    def _emit(text: str) -> str:
        if output_path:
            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_path.write_text(text, encoding="utf-8")
        return text

    def _cell(value: Any, missing: str = "-") -> Any:
        # Failed collectors store None under the key, which .get() defaults do not cover.
        return missing if value is None else value

    if not snapshots:
        return _emit("# Progress Tracker — Trends\n\nNo snapshots yet. Run `progress_tracker.py` to create the first snapshot.")

    lines = [
        "# Progress Tracker — Trends",
        f"\nLast updated: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}",
        f"\nSnapshots: {len(snapshots)}\n",
        "| Date | Test Files → Source | Doc Coverage | Issues Closed/Opened (7d) | Dep Freshness |",
        "|------|---------------------|--------------|---------------------------|---------------|",
    ]
    for snap in reversed(snapshots):  # newest first
        date = snap["date"]
        m = snap["metrics"]
        tc = m["test_coverage"]
        test_str = f"{tc['test_files']}/{tc['source_files']} ({tc['test_to_source_ratio']:.2f})"
        doc_str = f"{m['doc_coverage']['doc_coverage_percent']:.1f}%"
        issues_str = f"{_cell(m['issues'].get('closed_last_7d'))}/{_cell(m['issues'].get('opened_last_7d'))}"
        dep_str = f"{_cell(m['dependencies'].get('freshness_percent'), '?')}%"
        lines.append(f"| {date} | {test_str} | {doc_str} | {issues_str} | {dep_str} |")

    # Current snapshot summary
    cur = snapshots[-1]
    cm = cur["metrics"]
    lines.append(f"\n## Current Snapshot ({cur['date']})\n")

    tc = cm["test_coverage"]
    # `is not None` so that a genuine 0.0% is reported rather than "not configured".
    if tc["coverage_percent"] is not None:
        lines.append(f"- Test coverage: {tc['coverage_percent']:.1f}% (via {tc['coverage_tool']})\n")
    else:
        lines.append("- Test coverage: (pytest-cov not configured)\n")
    lines.append(f"- Doc coverage: {cm['doc_coverage']['doc_coverage_percent']:.1f}%")

    im = cm["issues"]
    if im.get("close_rate") is not None:
        lines.append(f"- Issue close rate (7d): {im['close_rate']*100:.1f}% ({im['closed_last_7d']} closed, {im['opened_last_7d']} opened)")
    else:
        lines.append(f"- Issue metrics: {im.get('note','unavailable')}")

    dd = cm["dependencies"]
    # freshness_percent is the share of up-to-date deps, not the outdated share.
    lines.append(
        f"- Dep freshness: {_cell(dd.get('freshness_percent'), '?')}% up to date "
        f"({dd.get('outdated_deps',0)}/{dd.get('total_deps',0)} deps outdated)"
    )
    if dd.get('outdated_list'):
        lines.append(f" Outdated: {', '.join(dd['outdated_list'][:5])}")

    return _emit("\n".join(lines) + "\n")
# ── Main ─────────────────────────────────────────────────────────────────────
def main() -> int:
    """
    CLI entry point: take a snapshot, save it, and update the trends report.

    With --json only the snapshot is printed (as JSON) after saving it;
    otherwise the trends markdown is written to --output and a summary is
    printed. Returns the process exit status (always 0).
    """
    parser = argparse.ArgumentParser(description="Progress Tracker — 10.8")
    parser.add_argument("--json", action="store_true", help="Emit snapshot as JSON only")
    parser.add_argument("--output", type=Path, default=METRICS_DIR / "TRENDS.md",
                        help="Write trends markdown to this file")
    args = parser.parse_args()

    snapshot = take_snapshot()
    # Snapshots are stored one file per date, so a second run on the same day
    # overwrites the earlier file. Drop that earlier copy from the history,
    # otherwise today's date would appear twice in the trends table.
    previous = [
        s for s in load_snapshots()
        if not (isinstance(s, dict) and s.get("date") == snapshot["date"])
    ]
    path_written = save_snapshot(snapshot)

    if args.json:
        print(json.dumps(snapshot, indent=2))
        return 0

    generate_trends(previous + [snapshot], output_path=args.output)

    # Print current snapshot summary
    print(f"Snapshot saved: {path_written}\n")
    print(f"Progress Tracker — {snapshot['date']}")
    print("=" * 50)
    m = snapshot["metrics"]

    tc = m["test_coverage"]
    print(f"Test files: {tc['test_files']} | Source files: {tc['source_files']} | Ratio: {tc['test_to_source_ratio']:.3f}")
    if tc["coverage_percent"] is not None:
        print(f"Line coverage: {tc['coverage_percent']:.1f}% (via {tc['coverage_tool']})")
    else:
        print("Line coverage: (not available — run `pytest --cov`)")
    print()

    dc = m["doc_coverage"]
    print(f"Callables with docstrings: {dc['callables_with_doc']}/{dc['callables_total']} ({dc['doc_coverage_percent']:.1f}%)")
    print()

    im = m["issues"]
    if im.get("close_rate") is not None:
        print(f"Issues (7d): {im['closed_last_7d']} closed / {im['opened_last_7d']} opened → close rate: {im['close_rate']*100:.1f}%")
        print(f"Total open: {im['total_open']}")
    else:
        print(f"Issues: {im.get('note','unavailable')}")
    print()

    dd = m["dependencies"]
    print(f"Dependencies: {dd.get('total_deps',0)} total, {dd.get('outdated_deps',0)} outdated")
    if dd.get('outdated_list'):
        shown = dd['outdated_list'][:5]
        print(f"Outdated: {', '.join(shown)}" + ("..." if len(dd['outdated_list']) > 5 else ""))

    print(f"\nTrends written to: {args.output}")
    return 0


if __name__ == "__main__":
    sys.exit(main())

View File

@@ -0,0 +1,170 @@
#!/usr/bin/env python3
"""
Tests for PR Complexity Scorer — unit tests for the scoring logic.
"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent))
from pr_complexity_scorer import (
score_pr,
is_dependency_file,
is_test_file,
TIME_PER_POINT,
SMALL_FILES,
MEDIUM_FILES,
LARGE_FILES,
SMALL_LINES,
MEDIUM_LINES,
LARGE_LINES,
)
PASS = 0
FAIL = 0
def test(name):
    """Build a decorator that runs the decorated function once as a test case.

    The case executes immediately, at decoration time. Its outcome is printed
    and tallied in the module-level ``PASS`` / ``FAIL`` counters.

    Args:
        name: Label printed next to the PASS/FAIL marker.

    Returns:
        A decorator that runs ``fn`` and hands it back unchanged.
    """
    def decorator(fn):
        global PASS, FAIL
        try:
            fn()
        except AssertionError as e:
            # An assertion helper (or a bare assert) reported a mismatch.
            FAIL += 1
            print(f" [FAIL] {name}: {e}")
        except Exception as e:
            # Any other error is still counted as a failure, not a crash.
            FAIL += 1
            print(f" [FAIL] {name}: Unexpected error: {e}")
        else:
            PASS += 1
            print(f" [PASS] {name}")
        # Return the function so the decorated name is not rebound to None.
        return fn
    return decorator
def assert_eq(a, b, msg=""):
    """Raise AssertionError when ``a`` differs from the expected value ``b``.

    The error text is ``msg`` followed by the expected and actual reprs.
    """
    differs = a != b
    if differs:
        raise AssertionError(f"{msg} expected {b!r}, got {a!r}")
def assert_true(v, msg=""):
    """Raise AssertionError unless ``v`` is truthy; ``msg`` overrides the default text."""
    if v:
        return
    raise AssertionError(msg or "Expected True")
def assert_false(v, msg=""):
    """Raise AssertionError when ``v`` is truthy; ``msg`` overrides the default text."""
    if not v:
        return
    raise AssertionError(msg or "Expected False")
print("=== PR Complexity Scorer Tests ===\n")
print("-- File Classification --")
@test("dependency file detection — requirements.txt")
def _():
    """requirements.txt counts as a dependency file at any depth; a look-alike name does not."""
    for path in ("requirements.txt", "src/requirements.txt"):
        assert_true(is_dependency_file(path))
    assert_false(is_dependency_file("requirements_test.txt"))
@test("dependency file detection — pyproject.toml")
def _():
    """pyproject.toml is a dependency file; a similarly named .py module is not."""
    manifest_hit = is_dependency_file("pyproject.toml")
    assert_true(manifest_hit)
    module_hit = is_dependency_file("myproject.py")
    assert_false(module_hit)
@test("test file detection — pytest style")
def _():
    """Both the test_ prefix and the _test suffix naming styles are recognised."""
    pytest_style_paths = (
        "tests/test_api.py",
        "test_module.py",
        "src/module_test.py",
    )
    for path in pytest_style_paths:
        assert_true(is_test_file(path))
@test("test file detection — other frameworks")
def _():
    """Ruby spec and JS .test files are test files; a testfixtures helper is not."""
    for path in ("spec/feature_spec.rb", "__tests__/component.test.js"):
        assert_true(is_test_file(path))
    assert_false(is_test_file("testfixtures/helper.py"))
print("\n-- Scoring Logic --")
@test("small PR gets low score (1-3)")
def _():
    """A 3-file, 60-line PR without dependency changes scores 1-3 and under 20 minutes."""
    small_pr = dict(
        files_changed=3,
        additions=50,
        deletions=10,
        has_dependency_changes=False,
        test_coverage_delta=None,
    )
    score, minutes, _ = score_pr(**small_pr)
    assert_true(1 <= score <= 3, f"Score should be low, got {score}")
    assert_true(minutes < 20)
@test("medium PR gets medium score (4-6)")
def _():
    """A 15-file, 500-line PR without dependency changes scores 4-6 and 20-45 minutes."""
    medium_pr = dict(
        files_changed=15,
        additions=400,
        deletions=100,
        has_dependency_changes=False,
        test_coverage_delta=None,
    )
    score, minutes, _ = score_pr(**medium_pr)
    assert_true(4 <= score <= 6, f"Score should be medium, got {score}")
    assert_true(20 <= minutes <= 45)
@test("large PR gets high score (7-9)")
def _():
    """A 60-file, 4500-line PR with dependency changes scores 7-9 and at least 45 minutes."""
    large_pr = dict(
        files_changed=60,
        additions=3000,
        deletions=1500,
        has_dependency_changes=True,
        test_coverage_delta=None,
    )
    score, minutes, _ = score_pr(**large_pr)
    assert_true(7 <= score <= 9, f"Score should be high, got {score}")
    assert_true(minutes >= 45)
@test("dependency changes boost score")
def _():
    """The same PR must score strictly higher once it touches dependencies."""
    common = dict(
        files_changed=10,
        additions=200,
        deletions=50,
        test_coverage_delta=None,
    )
    base_score, _, _ = score_pr(has_dependency_changes=False, **common)
    dep_score, _, _ = score_pr(has_dependency_changes=True, **common)
    assert_true(dep_score > base_score, f"Deps: {base_score} -> {dep_score}")
@test("adding tests lowers complexity")
def _():
    """A positive test coverage delta (with 30 more added lines) must lower the score."""
    common = dict(
        files_changed=8,
        deletions=20,
        has_dependency_changes=False,
    )
    base_score, _, _ = score_pr(additions=150, test_coverage_delta=None, **common)
    better_score, _, _ = score_pr(additions=180, test_coverage_delta=3, **common)
    assert_true(better_score < base_score, f"Tests: {base_score} -> {better_score}")
@test("removing tests increases complexity")
def _():
    """A negative test coverage delta on an otherwise identical PR must raise the score."""
    common = dict(
        files_changed=8,
        additions=150,
        deletions=20,
        has_dependency_changes=False,
    )
    base_score, _, _ = score_pr(test_coverage_delta=None, **common)
    worse_score, _, _ = score_pr(test_coverage_delta=-2, **common)
    assert_true(worse_score > base_score, f"Remove tests: {base_score} -> {worse_score}")
@test("score bounded 1-10")
def _():
    """Both a tiny and a huge PR must produce a score inside the 1-10 range."""
    extremes = [(1, 10, 5), (100, 10000, 5000)]
    for n_files, added, removed in extremes:
        score, _, _ = score_pr(n_files, added, removed, False, None)
        in_range = 1 <= score <= 10
        assert_true(in_range, f"Score {score} out of range")
@test("estimated minutes exist for all scores")
def _():
    """TIME_PER_POINT must contain an entry for every score from 1 through 10."""
    for s in range(1, 11):
        has_estimate = s in TIME_PER_POINT
        assert_true(has_estimate, f"Missing time for score {s}")
# The tally is always printed. The interpreter is only terminated when this
# file is executed as a script: an unconditional sys.exit() here would also
# end any process that merely imports the module (e.g. a test collector).
print(f"\n=== Results: {PASS} passed, {FAIL} failed ===")
if __name__ == "__main__":
    sys.exit(0 if FAIL == 0 else 1)
elif FAIL:
    # On import, report failures as an ordinary exception rather than SystemExit.
    raise AssertionError(f"{FAIL} PR complexity scorer test(s) failed")

View File

@@ -0,0 +1,174 @@
#!/usr/bin/env python3
"""
Smoke tests for knowledge_to_training_pairs.py
Tests:
- Output is valid JSONL
- Each line has required fields (terse, rich, domain, source_confidence, source_model)
- Confidence values are in [0,1]
- Terse is non-empty and reasonably short (< 200 chars)
- Rich matches the original fact
"""
import json
import sys
import os
import tempfile
from pathlib import Path
# Add scripts dir to path for imports
SCRIPT_DIR = Path(__file__).parent.parent / "scripts"
sys.path.insert(0, str(SCRIPT_DIR))
from knowledge_to_training_pairs import (
fact_to_terse,
filter_entries,
entry_to_pair,
parse_date,
)
def test_fact_to_terse_pitfall():
    """A pitfall fact must become a "How do I" question shorter than 150 characters."""
    terse = fact_to_terse(
        "deploy-crons.py leaves jobs in mixed model format",
        "pitfall",
        "hermes-agent",
    )
    assert terse.startswith("How do I")
    assert "?" in terse
    assert len(terse) < 150
    print("PASS: test_fact_to_terse_pitfall")
def test_fact_to_terse_fact():
    """A plain fact must become a "What should I know about" question."""
    source_fact = "Python is a high-level programming language"
    terse = fact_to_terse(source_fact, "fact", "global")
    assert terse.startswith("What should I know about")
    assert "?" in terse
    print("PASS: test_fact_to_terse_fact")
def test_fact_to_terse_pattern():
    """A pattern fact must be phrased as a "recommended way" or "best way" query."""
    terse = fact_to_terse("Use sparse checkout for large repos", "pattern", "devops")
    accepted_phrasings = ("recommended way", "best way")
    assert accepted_phrasings[0] in terse or accepted_phrasings[1] in terse
    print("PASS: test_fact_to_terse_pattern")
def test_entry_to_pair_structure():
    """entry_to_pair must return every required field, with the fact carried over as ``rich``."""
    entry = {
        "id": "test:001",
        "fact": "Test fact text.",
        "category": "fact",
        "domain": "test-domain",
        "confidence": 0.85,
        "model": "test-model",
    }
    pair = entry_to_pair(entry)
    assert pair is not None

    # Every field the training format requires must be present.
    for field in ("terse", "rich", "domain", "source_confidence", "source_model"):
        assert field in pair

    # Values derived directly from the entry.
    assert pair["rich"] == "Test fact text."
    assert pair["domain"] == "test-domain"
    assert 0.0 <= pair["source_confidence"] <= 1.0
    print("PASS: test_entry_to_pair_structure")
def test_filter_by_confidence():
    """Only entries at or above min_confidence may survive filtering."""
    entries = [
        {"fact": fact, "confidence": confidence}
        for fact, confidence in (("A", 0.9), ("B", 0.4), ("C", 0.6))
    ]
    filtered = filter_entries(entries, min_confidence=0.5)
    assert len(filtered) == 2
    assert all(e["confidence"] >= 0.5 for e in filtered)
    print("PASS: test_filter_by_confidence")
def test_filter_by_model():
    """Only entries whose model is in model_filter may survive filtering."""
    allowed = ("claude-sonnet", "gpt-4")
    entries = [
        {"fact": fact, "model": model}
        for fact, model in (("A", "claude-sonnet"), ("B", "gpt-4"), ("C", "unknown"))
    ]
    filtered = filter_entries(entries, model_filter=list(allowed))
    assert len(filtered) == 2
    assert all(e["model"] in allowed for e in filtered)
    print("PASS: test_filter_by_model")
def test_filter_by_date():
    """With after=2026-04-01, two of the three dated entries must remain."""
    recent_confirmed = {"fact": "A", "last_confirmed": "2026-04-10"}
    old_confirmed = {"fact": "B", "last_confirmed": "2026-03-01"}
    recent_first_seen = {"fact": "C", "first_seen": "2026-04-15"}
    entries = [recent_confirmed, old_confirmed, recent_first_seen]
    filtered = filter_entries(entries, after=parse_date("2026-04-01"))
    assert len(filtered) == 2
    print("PASS: test_filter_by_date")
def test_end_to_end_jsonl_output():
    """Integration test: run the script with --dry-run and check its stats.

    The script is launched from the repository root. The first JSON object
    found in its stderr is parsed as the stats report, which must show at
    least one input entry and at least one generated pair.
    """
    import subprocess

    repo_dir = SCRIPT_DIR.parent
    # sys.executable keeps the child on the interpreter running this test
    # instead of whatever "python3" resolves to on PATH (if anything).
    result = subprocess.run(
        [sys.executable, "scripts/knowledge_to_training_pairs.py", "--dry-run"],
        capture_output=True, text=True, cwd=repo_dir,
    )
    stderr = result.stderr.strip()
    assert result.returncode == 0, f"script exited {result.returncode}: {stderr}"

    # Let the JSON parser find where the stats object ends; counting braces
    # by hand miscounts any brace that appears inside a JSON string.
    start = stderr.find('{')
    assert start >= 0, "Stats JSON not found in stderr"
    try:
        stats, _ = json.JSONDecoder().raw_decode(stderr, start)
    except json.JSONDecodeError as exc:
        raise AssertionError(f"Unterminated JSON in stderr: {exc}") from exc

    assert stats["input_entries"] > 0
    assert stats["pairs_generated"] > 0
    print("PASS: test_end_to_end_jsonl_output")
def test_terse_length_constraint():
    """Terse queries must stay under 200 characters for each sample fact."""
    # (fact, category, domain) samples
    samples = (
        ("deploy-crons.py leaves jobs in mixed model format", "pitfall", "hermes-agent"),
        ("Cron jobs with blank fallback_model fields trigger warnings", "pitfall", "hermes-agent"),
        ("Use the Gitea REST API when clone times out", "pattern", "devops"),
    )
    for sample in samples:
        terse = fact_to_terse(*sample)
        assert len(terse) < 200, f"Terse too long ({len(terse)}): {terse}"
    print("PASS: test_terse_length_constraint")
if __name__ == "__main__":
    # Run every smoke test in declaration order; the first failing assert
    # aborts the run before the success banner is printed.
    for smoke_test in (
        test_fact_to_terse_pitfall,
        test_fact_to_terse_fact,
        test_fact_to_terse_pattern,
        test_entry_to_pair_structure,
        test_filter_by_confidence,
        test_filter_by_model,
        test_filter_by_date,
        test_end_to_end_jsonl_output,
        test_terse_length_constraint,
    ):
        smoke_test()
    print("\nAll smoke tests passed.")