Compare commits


2 Commits

4 changed files with 324 additions and 416 deletions

View File

@@ -1,275 +0,0 @@
"""
Knowledge Gap Identifier — Pipeline 10.7
Cross-references code, docs, and tests to find gaps:
- Undocumented functions/classes
- Untested code paths
- Documented but missing implementations
- Test files without corresponding source
Produces a gap report with severity and suggestions.
"""
from __future__ import annotations
import ast
import os
import re
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from typing import Dict, List, Optional, Set
class GapSeverity(Enum):
INFO = "info"
WARNING = "warning"
ERROR = "error"
class GapType(Enum):
UNDOCUMENTED = "undocumented"
UNTESTED = "untested"
MISSING_IMPLEMENTATION = "missing_implementation"
ORPHAN_TEST = "orphan_test"
STALE_DOC = "stale_doc"
@dataclass
class Gap:
"""A single knowledge gap."""
gap_type: GapType
severity: GapSeverity
file: str
line: Optional[int]
name: str
description: str
suggestion: str
@dataclass
class GapReport:
"""Full gap analysis report."""
repo_path: str
gaps: List[Gap] = field(default_factory=list)
stats: Dict[str, int] = field(default_factory=dict)
def summary(self) -> str:
lines = [f"Gap Report for {self.repo_path}", "=" * 40]
by_type = {}
for g in self.gaps:
by_type.setdefault(g.gap_type.value, []).append(g)
for gtype, items in sorted(by_type.items()):
lines.append(f"\n{gtype.upper()} ({len(items)}):")
for g in items:
loc = f"{g.file}:{g.line}" if g.line else g.file
lines.append(f" [{g.severity.value}] {g.name} @ {loc}")
lines.append(f" {g.description}")
lines.append(f"\nTotal gaps: {len(self.gaps)}")
self.stats = {k: len(v) for k, v in by_type.items()}
return "\n".join(lines)
def to_dict(self) -> dict:
return {
"repo_path": self.repo_path,
"total_gaps": len(self.gaps),
"stats": {k: len(v) for k, v in
{gt: [g for g in self.gaps if g.gap_type == gt]
for gt in GapType}.items() if v},
"gaps": [
{
"type": g.gap_type.value,
"severity": g.severity.value,
"file": g.file,
"line": g.line,
"name": g.name,
"description": g.description,
"suggestion": g.suggestion,
}
for g in self.gaps
],
}
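# Illustrative to_dict() output (hypothetical values):
#   {"repo_path": "/repo", "total_gaps": 2,
#    "stats": {"undocumented": 1, "untested": 1},
#    "gaps": [{"type": "undocumented", "severity": "warning", "file": "src/app.py",
#              "line": None, "name": "MyService", "description": "...", "suggestion": "..."}]}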
def _collect_python_files(root: Path) -> List[Path]:
"""Collect .py files, excluding venv/node_modules/.git."""
skip = {".git", "venv", "env", ".venv", "node_modules", "__pycache__", ".tox", ".mypy_cache"}
files = []
for dirpath, dirnames, filenames in os.walk(root):
dirnames[:] = [d for d in dirnames if d not in skip]
for f in filenames:
if f.endswith(".py"):
files.append(Path(dirpath) / f)
return files
def _extract_python_symbols(filepath: Path) -> Set[str]:
"""Extract top-level function and class names from a Python file."""
symbols = set()
try:
source = filepath.read_text(encoding="utf-8", errors="replace")
tree = ast.parse(source, filename=str(filepath))
except (SyntaxError, UnicodeDecodeError):
return symbols
for node in ast.iter_child_nodes(tree):
if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
symbols.add(node.name)
return symbols
def _extract_doc_symbols(filepath: Path) -> Set[str]:
"""Extract function/class names mentioned in markdown docs."""
symbols = set()
try:
text = filepath.read_text(encoding="utf-8", errors="replace")
except (UnicodeDecodeError, OSError):
return symbols
# Match backtick-quoted identifiers: `ClassName`, `func_name`, `func()`
for m in re.finditer(r"`([A-Za-z_]\w+)(?:\(\))?`", text):
symbols.add(m.group(1))
# Match ## ClassName or ### func_name headings
for m in re.finditer(r"^#{1,4}\s+(\w+)", text, re.MULTILINE):
symbols.add(m.group(1))
return symbols
def _collect_test_files(root: Path) -> Dict[str, Path]:
"""Map test module names to their file paths."""
test_map = {}
for dirpath, dirnames, filenames in os.walk(root):
dirnames[:] = [d for d in dirnames if d not in {".git", "venv", "node_modules"}]
for f in filenames:
if f.startswith("test_") and f.endswith(".py"):
# test_foo.py -> foo
module_name = f[5:-3]
test_map[module_name] = Path(dirpath) / f
return test_map
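# e.g. tests/test_calculator.py -> {"calculator": Path("tests/test_calculator.py")}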
class KnowledgeGapIdentifier:
"""Analyzes a repo for knowledge gaps between code, docs, and tests."""
def analyze(self, repo_path: str) -> GapReport:
root = Path(repo_path).resolve()
report = GapReport(repo_path=str(root))
if not root.is_dir():
report.gaps.append(Gap(
gap_type=GapType.UNDOCUMENTED,
severity=GapSeverity.ERROR,
file=str(root),
line=None,
name="repo",
description="Path is not a directory",
suggestion="Provide a valid repo directory",
))
return report
# Collect artifacts
py_files = _collect_python_files(root)
doc_files = list(root.glob("docs/**/*.md")) + list(root.glob("*.md"))
test_map = _collect_test_files(root / "tests") if (root / "tests").is_dir() else {}
# Extract symbols from each source file
source_symbols: Dict[str, Set[str]] = {} # relative_path -> symbols
all_source_symbols: Set[str] = set()
for pf in py_files:
rel = str(pf.relative_to(root))
# Skip test files and setup/config
if "/tests/" in rel or rel.startswith("tests/") or rel.startswith("test_"):
continue
if pf.name in ("setup.py", "conftest.py", "conf.py"):
continue
syms = _extract_python_symbols(pf)
if syms:
source_symbols[rel] = syms
all_source_symbols.update(syms)
# Extract documented symbols
doc_symbols: Set[str] = set()
for df in doc_files:
doc_symbols.update(_extract_doc_symbols(df))
        # Modules that already have a corresponding test file
        tested_modules: Set[str] = set(test_map.keys())
# --- Find gaps ---
# 1. Undocumented: source symbols not in any doc
for rel_path, syms in source_symbols.items():
for sym in sorted(syms):
if sym.startswith("_") and not sym.startswith("__"):
continue # Skip private
if sym not in doc_symbols:
report.gaps.append(Gap(
gap_type=GapType.UNDOCUMENTED,
severity=GapSeverity.WARNING,
file=rel_path,
line=None,
name=sym,
description=f"{sym} defined in {rel_path} but not referenced in any docs",
suggestion=f"Add documentation for {sym} in a .md file",
))
# 2. Untested: source modules without a corresponding test file
for rel_path in source_symbols:
module_name = Path(rel_path).stem
if module_name not in tested_modules and module_name not in ("__init__", "main", "config"):
report.gaps.append(Gap(
gap_type=GapType.UNTESTED,
severity=GapSeverity.ERROR,
file=rel_path,
line=None,
name=module_name,
description=f"No test file found for {rel_path}",
suggestion=f"Create tests/test_{module_name}.py",
))
# 3. Missing implementation: doc references symbol not in any source
referenced_but_missing = doc_symbols - all_source_symbols
for sym in sorted(referenced_but_missing):
# Filter out common non-code terms
            if sym.lower() in {"todo", "fixme", "note", "example", "usage", "api",
                               "install", "setup", "config", "license", "contributing",
                               "changelog", "readme", "python", "bash", "json", "yaml",
                               "http", "url", "cli", "gui", "ui", "rest"}:
                continue
if len(sym) < 3:
continue
report.gaps.append(Gap(
gap_type=GapType.MISSING_IMPLEMENTATION,
severity=GapSeverity.INFO,
file="(docs)",
line=None,
name=sym,
description=f"{sym} referenced in docs but not found in source code",
suggestion=f"Verify if {sym} should be implemented or update docs",
))
        # 4. Orphan tests: test files without matching source
        source_stems = {Path(f).stem for f in source_symbols}
        for test_mod, test_path in test_map.items():
            if test_mod in source_stems or any(test_mod in stem for stem in source_stems):
                continue
            # Check if any source file partially matches after name normalization
            matches_source = any(
                test_mod.replace("_", "-") in f or test_mod.replace("_", "") in Path(f).stem
                for f in source_symbols
            )
            if not matches_source:
                rel = str(test_path.relative_to(root))
                report.gaps.append(Gap(
                    gap_type=GapType.ORPHAN_TEST,
                    severity=GapSeverity.WARNING,
                    file=rel,
                    line=None,
                    name=test_mod,
                    description=f"Test file {rel} exists but no matching source module found",
                    suggestion="Verify whether the source was renamed or removed",
                ))
return report

View File

@@ -0,0 +1,234 @@
#!/usr/bin/env python3
"""
Session Transcript → Training Pair Harvester
Scans Hermes session JSONL files for Q&A patterns and extracts
terse→rich training pairs. Outputs JSONL matching the timmy-config
training pairs spec.
Usage:
python3 scripts/session_pair_harvester.py ~/.hermes/sessions/
python3 scripts/session_pair_harvester.py session.jsonl --output pairs.jsonl
python3 scripts/session_pair_harvester.py --dir ~/.hermes/sessions/ --min-ratio 2.0
Output format:
{"terse": "user short prompt", "rich": "ai detailed response", "source": "session_id", "model": "..."}
"""
import argparse
import hashlib
import json
import sys
from pathlib import Path
def compute_hash(text: str) -> str:
"""Content hash for deduplication."""
return hashlib.sha256(text.encode()).hexdigest()[:16]
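# Note: truncating SHA-256 to 16 hex chars (64 bits) keeps dedup keys short;
# collision risk is negligible at the scale of a training-pair corpus.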
def extract_pairs_from_session(session_data: dict, min_ratio: float = 1.5,
min_response_words: int = 20) -> list:
"""Extract terse→rich pairs from a single session object."""
pairs = []
conversations = session_data.get("conversations", [])
session_id = session_data.get("id", "unknown")
model = session_data.get("model", "unknown")
seen_hashes = set()
for i, msg in enumerate(conversations):
# Look for assistant/gpt responses
if msg.get("from") not in ("gpt", "assistant"):
continue
response_text = msg.get("value", "")
if not response_text or len(response_text.split()) < min_response_words:
continue
# Find the preceding human message
prompt_text = ""
for j in range(i - 1, -1, -1):
if conversations[j].get("from") == "human":
prompt_text = conversations[j].get("value", "")
break
if not prompt_text:
continue
# Filter: skip tool results, system messages embedded as human
if prompt_text.startswith("{") and "output" in prompt_text[:100]:
continue # likely a tool result
if prompt_text.startswith("# SOUL.md") or prompt_text.startswith("You are"):
continue # system prompt leak
# Quality filters
prompt_words = len(prompt_text.split())
response_words = len(response_text.split())
# Must have meaningful length ratio
if prompt_words == 0 or response_words == 0:
continue
ratio = response_words / prompt_words
if ratio < min_ratio:
continue
# Skip responses that are mostly code
code_blocks = response_text.count("```")
if code_blocks >= 4 and len(response_text.replace("```", "").strip()) < 50:
continue
# Skip responses with tool call artifacts
if "tool_call" in response_text[:100] or "function_call" in response_text[:100]:
continue
# Deduplicate by content hash
content_hash = compute_hash(prompt_text + response_text[:200])
if content_hash in seen_hashes:
continue
seen_hashes.add(content_hash)
            # Response text is kept as-is for now; no markdown cleanup is applied
            clean_response = response_text
pairs.append({
"terse": prompt_text.strip(),
"rich": clean_response.strip(),
"source": session_id,
"model": model,
"prompt_words": prompt_words,
"response_words": response_words,
"ratio": round(ratio, 2),
})
return pairs
def extract_from_jsonl_file(filepath: str, **kwargs) -> list:
"""Extract pairs from a session JSONL file."""
pairs = []
path = Path(filepath)
if not path.exists():
print(f"Warning: {filepath} not found", file=sys.stderr)
return pairs
    content = path.read_text(encoding="utf-8", errors="replace")
lines = content.strip().split("\n")
for line in lines:
line = line.strip()
if not line:
continue
try:
session = json.loads(line)
except json.JSONDecodeError:
continue
session_pairs = extract_pairs_from_session(session, **kwargs)
pairs.extend(session_pairs)
return pairs
def deduplicate_pairs(pairs: list) -> list:
"""Remove duplicate pairs across files."""
seen = set()
unique = []
for pair in pairs:
key = compute_hash(pair["terse"] + pair["rich"][:200])
if key not in seen:
seen.add(key)
unique.append(pair)
return unique
def main():
parser = argparse.ArgumentParser(description="Harvest training pairs from session transcripts")
parser.add_argument("input", nargs="?", help="Session JSONL file or directory")
parser.add_argument("--dir", "-d", help="Directory to scan for session files")
parser.add_argument("--output", "-o", default="harvested_pairs.jsonl", help="Output file")
parser.add_argument("--min-ratio", type=float, default=1.5, help="Min response/prompt word ratio")
parser.add_argument("--min-words", type=int, default=20, help="Min response word count")
parser.add_argument("--dry-run", action="store_true", help="Print stats without writing")
args = parser.parse_args()
all_pairs = []
files_scanned = 0
scan_dir = args.dir or args.input
if not scan_dir:
parser.print_help()
sys.exit(1)
scan_path = Path(scan_dir)
if scan_path.is_dir():
jsonl_files = sorted(scan_path.rglob("*.jsonl"))
print(f"Scanning {len(jsonl_files)} files in {scan_dir}...", file=sys.stderr)
for fpath in jsonl_files:
pairs = extract_from_jsonl_file(
str(fpath),
min_ratio=args.min_ratio,
min_response_words=args.min_words
)
all_pairs.extend(pairs)
files_scanned += 1
else:
pairs = extract_from_jsonl_file(
str(scan_path),
min_ratio=args.min_ratio,
min_response_words=args.min_words
)
all_pairs.extend(pairs)
files_scanned = 1
# Deduplicate
unique_pairs = deduplicate_pairs(all_pairs)
# Stats
if unique_pairs:
avg_prompt = sum(p["prompt_words"] for p in unique_pairs) / len(unique_pairs)
avg_response = sum(p["response_words"] for p in unique_pairs) / len(unique_pairs)
avg_ratio = sum(p["ratio"] for p in unique_pairs) / len(unique_pairs)
else:
avg_prompt = avg_response = avg_ratio = 0
stats = {
"files_scanned": files_scanned,
"raw_pairs": len(all_pairs),
"unique_pairs": len(unique_pairs),
"duplicates_removed": len(all_pairs) - len(unique_pairs),
"avg_prompt_words": round(avg_prompt, 1),
"avg_response_words": round(avg_response, 1),
"avg_ratio": round(avg_ratio, 2),
}
print(json.dumps(stats, indent=2), file=sys.stderr)
if args.dry_run:
# Print sample pairs
for pair in unique_pairs[:3]:
print(f"\n--- Source: {pair['source']} (ratio: {pair['ratio']}) ---", file=sys.stderr)
print(f"TERSE: {pair['terse'][:100]}...", file=sys.stderr)
print(f"RICH: {pair['rich'][:150]}...", file=sys.stderr)
return
# Write output
output_path = Path(args.output)
with open(output_path, "w") as f:
for pair in unique_pairs:
# Strip internal fields for output
output = {
"terse": pair["terse"],
"rich": pair["rich"],
"source": pair["source"],
"model": pair["model"],
}
f.write(json.dumps(output) + "\n")
print(f"\nWrote {len(unique_pairs)} pairs to {output_path}", file=sys.stderr)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,90 @@
#!/usr/bin/env python3
"""Tests for session_pair_harvester."""
import os
import sys
sys.path.insert(0, os.path.dirname(__file__))
from session_pair_harvester import extract_pairs_from_session, deduplicate_pairs, compute_hash
def test_basic_extraction():
session = {
"id": "test_001",
"model": "test-model",
"conversations": [
{"from": "system", "value": "You are helpful."},
{"from": "human", "value": "What is Python?"},
{"from": "gpt", "value": "Python is a high-level programming language known for its readability and versatility. It supports multiple paradigms including procedural, object-oriented, and functional programming. Python is widely used in web development, data science, machine learning, and automation."},
]
}
pairs = extract_pairs_from_session(session, min_ratio=1.5, min_response_words=10)
assert len(pairs) == 1
assert pairs[0]["terse"] == "What is Python?"
assert "programming language" in pairs[0]["rich"]
assert pairs[0]["source"] == "test_001"
print("PASS: test_basic_extraction")
def test_filters_short_responses():
session = {
"id": "test_002",
"model": "test",
"conversations": [
{"from": "human", "value": "Hi"},
{"from": "gpt", "value": "Hello!"},
]
}
pairs = extract_pairs_from_session(session, min_ratio=1.5, min_response_words=20)
assert len(pairs) == 0
print("PASS: test_filters_short_responses")
def test_skips_tool_results():
session = {
"id": "test_003",
"model": "test",
"conversations": [
{"from": "human", "value": '{"output": "file content", "exit_code": 0}'},
{"from": "gpt", "value": "The file was read successfully. Now let me analyze the content and provide a detailed summary of what was found in the file system."},
]
}
pairs = extract_pairs_from_session(session, min_ratio=1.5, min_response_words=10)
assert len(pairs) == 0
print("PASS: test_skips_tool_results")
def test_deduplication():
pairs = [
{"terse": "What is X?", "rich": "X is Y.", "source": "s1", "model": "m"},
{"terse": "What is X?", "rich": "X is Y.", "source": "s2", "model": "m"},
{"terse": "What is Z?", "rich": "Z is W.", "source": "s1", "model": "m"},
]
unique = deduplicate_pairs(pairs)
assert len(unique) == 2
print("PASS: test_deduplication")
def test_ratio_filter():
session = {
"id": "test_005",
"model": "test",
"conversations": [
{"from": "human", "value": "Explain quantum computing in detail with examples and applications"},
{"from": "gpt", "value": "OK."},
]
}
pairs = extract_pairs_from_session(session, min_ratio=1.5, min_response_words=10)
assert len(pairs) == 0 # response too short relative to prompt
print("PASS: test_ratio_filter")
if __name__ == "__main__":
test_basic_extraction()
test_filters_short_responses()
test_skips_tool_results()
test_deduplication()
test_ratio_filter()
print("\nAll tests passed.")

View File

@@ -1,141 +0,0 @@
"""Tests for knowledge_gap_identifier module."""
import sys
import os
import tempfile
from pathlib import Path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'scripts'))
from knowledge_gap_identifier import KnowledgeGapIdentifier, GapType, GapSeverity
def _make_repo(tmpdir, structure):
"""Create a test repo from a dict of {path: content}."""
for rel_path, content in structure.items():
p = Path(tmpdir) / rel_path
p.parent.mkdir(parents=True, exist_ok=True)
p.write_text(content)
def test_undocumented_symbol():
with tempfile.TemporaryDirectory() as tmpdir:
_make_repo(tmpdir, {
"src/calculator.py": "def add(a, b):\n return a + b\n",
"README.md": "# Calculator\n",
})
report = KnowledgeGapIdentifier().analyze(tmpdir)
undocumented = [g for g in report.gaps if g.gap_type == GapType.UNDOCUMENTED]
assert any(g.name == "add" for g in undocumented), "add should be undocumented"
def test_documented_symbol_no_gap():
with tempfile.TemporaryDirectory() as tmpdir:
_make_repo(tmpdir, {
"src/calculator.py": "def add(a, b):\n return a + b\n",
"README.md": "# Calculator\nUse `add()` to add numbers.\n",
})
report = KnowledgeGapIdentifier().analyze(tmpdir)
undocumented = [g for g in report.gaps
if g.gap_type == GapType.UNDOCUMENTED and g.name == "add"]
assert len(undocumented) == 0, "add is documented, should not be flagged"
def test_untested_module():
with tempfile.TemporaryDirectory() as tmpdir:
_make_repo(tmpdir, {
"src/calculator.py": "def add(a, b):\n return a + b\n",
"src/helper.py": "def format(x):\n return str(x)\n",
"tests/test_calculator.py": "from src.calculator import add\nassert add(1,2) == 3\n",
})
report = KnowledgeGapIdentifier().analyze(tmpdir)
untested = [g for g in report.gaps if g.gap_type == GapType.UNTESTED]
assert any("helper" in g.name for g in untested), "helper should be untested"
def test_tested_module_no_gap():
with tempfile.TemporaryDirectory() as tmpdir:
_make_repo(tmpdir, {
"src/calculator.py": "def add(a, b):\n return a + b\n",
"tests/test_calculator.py": "def test_add():\n assert True\n",
})
report = KnowledgeGapIdentifier().analyze(tmpdir)
untested = [g for g in report.gaps
if g.gap_type == GapType.UNTESTED and "calculator" in g.name]
assert len(untested) == 0, "calculator has tests, should not be flagged"
def test_missing_implementation():
with tempfile.TemporaryDirectory() as tmpdir:
_make_repo(tmpdir, {
"src/app.py": "def run():\n pass\n",
"docs/api.md": "# API\nUse `NonExistentClass` to do things.\n",
})
report = KnowledgeGapIdentifier().analyze(tmpdir)
missing = [g for g in report.gaps if g.gap_type == GapType.MISSING_IMPLEMENTATION]
assert any(g.name == "NonExistentClass" for g in missing)
def test_private_symbols_skipped():
with tempfile.TemporaryDirectory() as tmpdir:
_make_repo(tmpdir, {
"src/app.py": "def _internal():\n pass\ndef public():\n pass\n",
"README.md": "# App\n",
})
report = KnowledgeGapIdentifier().analyze(tmpdir)
undocumented_names = [g.name for g in report.gaps if g.gap_type == GapType.UNDOCUMENTED]
assert "_internal" not in undocumented_names, "Private symbols should be skipped"
assert "public" in undocumented_names
def test_empty_repo():
with tempfile.TemporaryDirectory() as tmpdir:
report = KnowledgeGapIdentifier().analyze(tmpdir)
assert len(report.gaps) == 0
def test_invalid_path():
report = KnowledgeGapIdentifier().analyze("/nonexistent/path/xyz")
assert len(report.gaps) == 1
assert report.gaps[0].severity == GapSeverity.ERROR
def test_report_summary():
with tempfile.TemporaryDirectory() as tmpdir:
_make_repo(tmpdir, {
"src/app.py": "class MyService:\n def handle(self):\n pass\n",
"README.md": "# App\n",
})
report = KnowledgeGapIdentifier().analyze(tmpdir)
summary = report.summary()
assert "UNDOCUMENTED" in summary
assert "MyService" in summary
def test_report_to_dict():
with tempfile.TemporaryDirectory() as tmpdir:
_make_repo(tmpdir, {
"src/app.py": "def hello():\n pass\n",
"README.md": "# App\n",
})
report = KnowledgeGapIdentifier().analyze(tmpdir)
d = report.to_dict()
assert "total_gaps" in d
assert "gaps" in d
assert isinstance(d["gaps"], list)
assert d["total_gaps"] > 0
if __name__ == "__main__":
test_undocumented_symbol()
test_documented_symbol_no_gap()
test_untested_module()
test_tested_module_no_gap()
test_missing_implementation()
test_private_symbols_skipped()
test_empty_repo()
test_invalid_path()
test_report_summary()
test_report_to_dict()
print("All 10 tests passed.")