Compare commits
1 Commits
step35/144
...
step35/124
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
12c554df35 |
169
scripts/coverage_checker.py
Normal file
169
scripts/coverage_checker.py
Normal file
@@ -0,0 +1,169 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test Coverage Checker — 6.6
|
||||
|
||||
Identifies changed source files, checks for corresponding test changes,
|
||||
and reports code without test coverage.
|
||||
|
||||
Usage:
|
||||
python3 scripts/coverage_checker.py
|
||||
python3 scripts/coverage_checker.py --format json
|
||||
python3 scripts/coverage_checker.py --compare HEAD~1  # Compare against a specific ref
|
||||
|
||||
Acceptance:
|
||||
- Identifies changed source files (git diff --name-only HEAD)
|
||||
- Checks for corresponding test changes (matches source→test file mapping)
|
||||
- Reports: code without tests (lists coverage gaps)
|
||||
- Output: coverage gap (structured text/JSON)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import List, Tuple, Optional
|
||||
|
||||
REPO_ROOT = Path(__file__).resolve().parent.parent
|
||||
|
||||
|
||||
def run_git_diff(ref: str = "HEAD") -> List[str]:
    """Return the paths reported by ``git diff --name-only <ref>``.

    The command runs with the repository root as its working directory.

    Args:
        ref: Git ref to diff against (default: ``HEAD``).

    Returns:
        The non-blank lines of git's output, one changed path per entry.

    Exits the process with status 1 when git returns a non-zero exit code.
    """
    result = subprocess.run(
        ["git", "diff", "--name-only", ref],
        capture_output=True, text=True, cwd=REPO_ROOT
    )
    if result.returncode != 0:
        # Diagnostics go to stderr so they never mix with report output
        # that a caller may be capturing from stdout.
        print(f"ERROR: git diff failed: {result.stderr}", file=sys.stderr)
        sys.exit(1)
    return [p for p in result.stdout.splitlines() if p.strip()]
|
||||
|
||||
|
||||
def is_source_file(path: str) -> bool:
    """True if path is a Python source file (not a test).

    A ``.py`` path is treated as a test, and therefore not as source, when it
    lives under the top-level ``tests/`` directory or when its file name
    follows the ``test_*.py`` / ``*_test.py`` naming conventions.
    """
    if not path.endswith(".py") or path.startswith("tests/"):
        return False
    name = Path(path).name
    # The previous check searched for "/test" inside the bare file name, which
    # can never contain a slash, so test modules outside tests/ were counted
    # as source files.
    return not (name.startswith("test_") or name.endswith("_test.py"))
|
||||
|
||||
|
||||
def is_test_file(path: str) -> bool:
    """Report whether ``path`` names a Python test module.

    Accepted forms: a file name starting with ``test_``, a file name ending in
    ``_test.py``, or any ``.py`` file whose path starts with ``tests/``.
    """
    if path.endswith(".py"):
        filename = Path(path).name
        if filename.startswith("test_"):
            return True
        if filename.endswith("_test.py"):
            return True
        return path.startswith("tests/")
    return False
|
||||
|
||||
|
||||
def source_to_test_path(src_path: str) -> str:
    """
    Map a source file path to its expected test file path.

    Convention: scripts/<name>.py -> tests/test_<name>.py
                <module>.py       -> tests/test_<module>.py

    Only the file stem is used; the source file's directory is ignored.
    The result always uses forward slashes so it can be compared with the
    paths printed by ``git diff --name-only`` on every platform.
    """
    stem = Path(src_path).stem  # file name without directory or .py
    # as_posix() avoids the backslash separators str(Path) produces on Windows.
    return (Path("tests") / f"test_{stem}.py").as_posix()
|
||||
|
||||
|
||||
def test_file_exists(test_rel: Optional[str] = None) -> bool:
    """Check if the test file exists in the repo.

    Args:
        test_rel: Test file path relative to the repository root.

    Returns:
        True when the path exists under REPO_ROOT; False when it does not or
        when no path was supplied.
    """
    # The function previously read an undefined name (test_rel) and raised
    # NameError on every call; the path is now an explicit argument.
    if not test_rel:
        return False
    return (REPO_ROOT / test_rel).exists()
|
||||
|
||||
|
||||
def analyze_coverage(changed_files: List[str]) -> dict:
    """
    Build the coverage-gap report for a list of changed paths.

    A changed source file counts as covered only when the test path derived
    from it by source_to_test_path() is itself among the changed files.
    The returned dict carries summary counts, the coverage ratio (1.0 when no
    source files changed), the per-file covered/uncovered entries, and the
    unmodified input list.
    """
    sources = [path for path in changed_files if is_source_file(path)]
    tests = [path for path in changed_files if is_test_file(path)]
    touched_tests = set(tests)

    covered = []
    uncovered = []
    for source in sources:
        expected_test = source_to_test_path(source)
        if expected_test in touched_tests:
            covered.append(
                {"file": source, "status": "covered", "test_file": expected_test}
            )
        else:
            uncovered.append(
                {"file": source, "status": "missing", "suggested_test": expected_test}
            )

    if sources:
        ratio = len(covered) / len(sources)
    else:
        ratio = 1.0

    return {
        "repo": REPO_ROOT.name,
        "changed_sources": len(sources),
        "changed_tests": len(tests),
        "covered_sources": len(covered),
        "uncovered_sources": len(uncovered),
        "coverage_ratio": ratio,
        "covered": covered,
        "uncovered": uncovered,
        "all_changed": changed_files,
    }
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: diff against a ref and report coverage gaps.

    Exit status is 1 when the text report lists a coverage gap and 0
    otherwise. JSON mode always exits 0 and writes nothing but the JSON
    document to stdout.
    """
    parser = argparse.ArgumentParser(description="Test Coverage Checker")
    parser.add_argument("--format", choices=["text", "json"], default="text",
                        help="Output format")
    parser.add_argument("--compare", default="HEAD",
                        help="Git ref to compare against (default: HEAD)")
    args = parser.parse_args()
    as_json = args.format == "json"

    # Step 1: Identify changed files
    # Progress goes to stderr: in JSON mode stdout must stay parseable as a
    # single JSON document.
    print(f"Scanning changes vs {args.compare}...", file=sys.stderr)
    changed_files = run_git_diff(args.compare)
    if not changed_files and not as_json:
        print("No changed files detected.")
        sys.exit(0)

    # Step 2: Analyze coverage (an empty change list yields an empty report,
    # which is what JSON consumers receive when nothing changed)
    report = analyze_coverage(changed_files)

    if as_json:
        print(json.dumps(report, indent=2))
        sys.exit(0)

    # Text output
    print("=" * 60)
    print(" TEST COVERAGE CHECKER")
    print("=" * 60)
    print(f" Repository: {report['repo']}")
    print(f" Changed files total: {len(changed_files)}")
    print(f" Source files changed: {report['changed_sources']}")
    print(f" Test files changed: {report['changed_tests']}")
    print()
    print(f" Coverage (sources with test changes): {report['coverage_ratio']:.0%}")
    print(f" Covered: {report['covered_sources']} source file(s)")
    print(f" Uncovered: {report['uncovered_sources']} source file(s)")
    print()

    if report["uncovered"]:
        print(" COVERAGE GAP — Source files without corresponding test changes:")
        print(" " + "-" * 54)
        for item in report["uncovered"]:
            print(f" {item['file']}")
            print(f" Suggested test: {item['suggested_test']}")
        print()
        print(" ACTION: Write or update tests for the files above.")
        # Close the report frame before exiting; it used to be skipped here.
        print("=" * 60)
        sys.exit(1)  # Non-zero exit to flag coverage gap

    print(" All changed source files have corresponding test coverage.")
    print("=" * 60)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,268 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
entity_extractor.py — Extract named entities from text sources.
|
||||
|
||||
Extracts: people, projects, tools, concepts, repos from session transcripts,
|
||||
README files, issue bodies, or any text input.
|
||||
|
||||
Output: knowledge/entities.json with deduplicated entity list and occurrence counts.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
SCRIPT_DIR = Path(__file__).parent.absolute()
|
||||
sys.path.insert(0, str(SCRIPT_DIR))
|
||||
|
||||
from session_reader import read_session, messages_to_text
|
||||
|
||||
# --- Configuration ---
|
||||
DEFAULT_API_BASE = os.environ.get("HARVESTER_API_BASE", "https://api.nousresearch.com/v1")
|
||||
DEFAULT_API_KEY = os.environ.get("HARVESTER_API_KEY", "")
|
||||
DEFAULT_MODEL = os.environ.get("HARVESTER_MODEL", "xiaomi/mimo-v2-pro")
|
||||
KNOWLEDGE_DIR = os.environ.get("HARVESTER_KNOWLEDGE_DIR", "knowledge")
|
||||
PROMPT_PATH = os.environ.get("ENTITY_PROMPT_PATH", str(SCRIPT_DIR.parent / "templates" / "entity-extraction-prompt.md"))
|
||||
|
||||
API_KEY_PATHS = [
|
||||
os.path.expanduser("~/.config/nous/key"),
|
||||
os.path.expanduser("~/.hermes/keymaxxing/active/minimax.key"),
|
||||
os.path.expanduser("~/.config/openrouter/key"),
|
||||
]
|
||||
|
||||
def find_api_key() -> str:
    """Return the first non-empty key stored in any API_KEY_PATHS file.

    Files are tried in list order; missing or blank files are skipped.
    Returns an empty string when no file yields a key.
    """
    for candidate in API_KEY_PATHS:
        if not os.path.exists(candidate):
            continue
        with open(candidate) as handle:
            contents = handle.read().strip()
            if contents:
                return contents
    return ""
|
||||
|
||||
def load_prompt() -> str:
    """Read the entity-extraction prompt from PROMPT_PATH as UTF-8 text.

    Prints an error to stderr and exits with status 1 when the file is absent.
    """
    prompt_file = Path(PROMPT_PATH)
    if prompt_file.exists():
        return prompt_file.read_text(encoding='utf-8')
    print(f"ERROR: Entity extraction prompt not found at {prompt_file}", file=sys.stderr)
    sys.exit(1)
|
||||
|
||||
def call_llm(prompt: str, text: str, api_base: str, api_key: str, model: str) -> Optional[list]:
    """POST a chat-completions request and return the parsed entity list.

    ``prompt`` is sent as the system message and ``text`` is embedded in the
    user message. The first choice's message content is handed to
    parse_response(). Any exception raised while sending, reading, or parsing
    is reported on stderr and turned into a ``None`` return.
    """
    import urllib.request

    body = {
        "model": model,
        "messages": [
            {"role": "system", "content": prompt},
            {"role": "user", "content": f"Extract entities from this text:\n\n{text}"},
        ],
        "temperature": 0.0,
        "max_tokens": 2048,
    }
    request = urllib.request.Request(
        f"{api_base}/chat/completions",
        data=json.dumps(body).encode('utf-8'),
        headers={"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"},
        method="POST",
    )

    try:
        with urllib.request.urlopen(request, timeout=60) as response:
            decoded = json.loads(response.read().decode('utf-8'))
            reply = decoded["choices"][0]["message"]["content"]
            return parse_response(reply)
    except Exception as e:
        print(f"ERROR: LLM call failed: {e}", file=sys.stderr)
        return None
|
||||
|
||||
def parse_response(content: str) -> Optional[list]:
    """Extract the entity array from an LLM reply.

    Accepted shapes, tried in order: a bare JSON array, a JSON object with an
    ``entities`` key, or a JSON array inside a Markdown code fence. Returns
    ``None`` (after a warning on stderr) when none of them matches.
    """
    import re

    try:
        parsed = json.loads(content)
    except json.JSONDecodeError:
        pass
    else:
        if isinstance(parsed, list):
            return parsed
        if isinstance(parsed, dict) and 'entities' in parsed:
            return parsed['entities']

    # Fall back to an array wrapped in a ``` or ```json fence.
    fenced = re.search(r'```(?:json)?\s*(\[.*?\])\s*```', content, re.DOTALL)
    if fenced is not None:
        try:
            candidate = json.loads(fenced.group(1))
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, list):
            return candidate

    print(f"WARNING: Could not parse LLM response as entity list", file=sys.stderr)
    return None
|
||||
|
||||
def load_existing_entities(knowledge_dir: str) -> dict:
    """Load ``<knowledge_dir>/entities.json``, or return an empty index.

    Args:
        knowledge_dir: Directory expected to contain ``entities.json``.

    Returns:
        The parsed JSON document, or ``{"version": 1, "last_updated": "",
        "entities": []}`` when the file is missing or cannot be read or
        parsed (a warning is printed to stderr in the latter case).
    """
    path = Path(knowledge_dir) / "entities.json"
    if not path.exists():
        return {"version": 1, "last_updated": "", "entities": []}
    try:
        # The index is written as UTF-8 with ensure_ascii=False, so it must be
        # read back as UTF-8 rather than with the platform default encoding.
        with open(path, encoding='utf-8') as f:
            return json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError, IOError) as e:
        print(f"WARNING: Could not load entities: {e}", file=sys.stderr)
        return {"version": 1, "last_updated": "", "entities": []}
|
||||
|
||||
def entity_key(name: str, etype: str) -> tuple:
    """Build the deduplication key: lower-cased, whitespace-trimmed (name, type)."""
    normalized_name = name.lower().strip()
    normalized_type = etype.lower().strip()
    return normalized_name, normalized_type
|
||||
|
||||
def merge_entities(new_entities: list, existing: list) -> list:
    """Merge new entities into existing list, combining counts and sources.

    Entities are matched on entity_key(name, type). A match increments the
    stored ``count`` by one, unions the ``sources`` (kept sorted), and updates
    ``last_seen`` when the new entity carries one. Unmatched entities are
    appended with ``count``, ``sources`` and ``first_seen`` defaults filled in.

    Args:
        new_entities: Entity dicts with at least ``name`` and ``type`` keys.
        existing: Previously stored entity dicts; extended and updated in place.

    Returns:
        The same ``existing`` list object, after merging.
    """
    existing_by_key = {}
    for e in existing:
        key = entity_key(e.get('name', ''), e.get('type', ''))
        existing_by_key[key] = e

    for e in new_entities:
        key = entity_key(e['name'], e['type'])
        existing_e = existing_by_key.get(key)
        if existing_e is not None:
            existing_e['count'] = existing_e.get('count', 1) + 1
            # Merge sources
            old_sources = set(existing_e.get('sources', []))
            new_sources = set(e.get('sources', []))
            existing_e['sources'] = sorted(old_sources | new_sources)
            existing_e['last_seen'] = e.get('last_seen', existing_e.get('last_seen'))
        else:
            e['count'] = e.get('count', 1)
            e.setdefault('sources', [])
            e.setdefault('first_seen', datetime.now(timezone.utc).isoformat())
            existing.append(e)
            # Register the new entry so a later duplicate in this same batch
            # is merged into it instead of being appended a second time.
            existing_by_key[key] = e

    return existing
|
||||
|
||||
def write_entities(index: dict, knowledge_dir: str):
    """Stamp ``index`` with the current UTC time and save it as entities.json.

    The target directory is created if needed. ``index['last_updated']`` is
    overwritten in place before the document is written as indented UTF-8 JSON.
    """
    target_dir = Path(knowledge_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now(timezone.utc).isoformat()
    index['last_updated'] = stamp
    destination = target_dir / "entities.json"
    with open(destination, 'w', encoding='utf-8') as out:
        json.dump(index, out, indent=2, ensure_ascii=False)
|
||||
|
||||
def read_text_from_source(source: str) -> str:
    """Return the text content of ``source``.

    ``.jsonl`` files are treated as session transcripts and rendered through
    session_reader; every other file is read as UTF-8 text with undecodable
    bytes replaced. Raises FileNotFoundError when the path does not exist.
    """
    src = Path(source)
    if not src.exists():
        raise FileNotFoundError(source)
    if src.suffix != '.jsonl':
        # Plain text / markdown / issue body
        return src.read_text(encoding='utf-8', errors='replace')
    # Session transcript
    from session_reader import read_session, messages_to_text
    return messages_to_text(read_session(source))
|
||||
|
||||
def extract_from_text(text: str, api_base: str, api_key: str, model: str, source_name: str = "") -> list:
    """Run LLM entity extraction on ``text`` and normalize the result.

    Args:
        text: Text to extract entities from.
        api_base: Base URL of the chat-completions API.
        api_key: Bearer token for the API.
        model: Model identifier passed to the API.
        source_name: Optional label recorded in each entity's ``sources``.

    Returns:
        A list of dicts with ``name``, ``type`` (lower-cased), ``context``
        (at most 200 characters), ``last_seen`` (UTC ISO timestamp) and
        ``sources``. Malformed items are skipped; an empty list is returned
        when the LLM call fails or does not yield a list.
    """
    def _as_text(value) -> str:
        # LLM output is untrusted: null or non-string fields count as empty
        # instead of raising AttributeError/TypeError.
        return value if isinstance(value, str) else ''

    prompt = load_prompt()
    raw = call_llm(prompt, text, api_base, api_key, model)
    if not isinstance(raw, list):
        return []

    seen_at = datetime.now(timezone.utc).isoformat()
    entities = []
    for e in raw:
        if not isinstance(e, dict):
            continue
        name = _as_text(e.get('name')).strip()
        etype = _as_text(e.get('type')).strip().lower()
        if not name or not etype:
            continue
        entities.append({
            'name': name,
            'type': etype,
            'context': _as_text(e.get('context'))[:200],
            'last_seen': seen_at,
            'sources': [source_name] if source_name else []
        })
    return entities
|
||||
|
||||
def main():
    """CLI entry point: extract entities from the selected sources and merge
    them into ``entities.json`` in the output directory.

    Exactly one input mode is used, checked in this order: ``--file``,
    ``--dir``, ``--session``, ``--batch``. With none given, help is printed
    and the process exits with status 1.
    """
    parser = argparse.ArgumentParser(description="Extract named entities from text sources")
    parser.add_argument('--file', help='Single file to process')
    parser.add_argument('--dir', help='Directory of files to process')
    parser.add_argument('--session', help='Single session JSONL file')
    parser.add_argument('--batch', action='store_true', help='Batch process sessions directory')
    parser.add_argument('--sessions-dir', default=os.path.expanduser('~/.hermes/sessions'),
                        help='Sessions directory for batch mode')
    # NOTE(review): the default here is the literal 'knowledge'; the
    # KNOWLEDGE_DIR constant read from HARVESTER_KNOWLEDGE_DIR is not used.
    parser.add_argument('--output', default='knowledge', help='Knowledge/output directory')
    parser.add_argument('--api-base', default=DEFAULT_API_BASE)
    parser.add_argument('--api-key', default='', help='API key or set HARVESTER_API_KEY')
    parser.add_argument('--model', default=DEFAULT_MODEL)
    parser.add_argument('--dry-run', action='store_true', help='Preview without writing')
    parser.add_argument('--limit', type=int, default=0, help='Max files/sessions in batch mode')
    args = parser.parse_args()

    # Key precedence: command line, then environment, then key files on disk.
    api_key = args.api_key or DEFAULT_API_KEY or find_api_key()
    if not api_key:
        print("ERROR: No API key found", file=sys.stderr)
        sys.exit(1)

    # A relative output directory is resolved against the parent of the
    # scripts directory, not the current working directory.
    knowledge_dir = args.output
    if not os.path.isabs(knowledge_dir):
        knowledge_dir = str(SCRIPT_DIR.parent / knowledge_dir)

    sources = []
    if args.file:
        sources = [args.file]
    elif args.dir:
        # Recursive scan, restricted to a fixed set of text-like suffixes.
        files = sorted(Path(args.dir).rglob("*"))
        sources = [str(f) for f in files if f.is_file() and f.suffix in ('.txt','.md','.json','.jsonl','.yaml','.yml')]
        if args.limit > 0:
            sources = sources[:args.limit]
    elif args.session:
        sources = [args.session]
    elif args.batch:
        sess_dir = Path(args.sessions_dir)
        # Reverse name order, so --limit keeps the names that sort last.
        sources = sorted(sess_dir.glob("*.jsonl"), reverse=True)
        if args.limit > 0:
            sources = sources[:args.limit]
        sources = [str(s) for s in sources]
    else:
        parser.print_help()
        sys.exit(1)

    print(f"Processing {len(sources)} sources...")
    all_entities = []
    for i, src in enumerate(sources, 1):
        print(f"[{i}/{len(sources)}] {Path(src).name}...", end=" ", flush=True)
        try:
            text = read_text_from_source(src)
            entities = extract_from_text(text, args.api_base, api_key, args.model, source_name=Path(src).name)
            all_entities.extend(entities)
            print(f"→ {len(entities)} entities")
        except Exception as e:
            # Best effort: a failing source is reported and skipped.
            print(f"ERROR: {e}")

    # Deduplicate across all sources
    print(f"Total raw entities: {len(all_entities)}")
    existing_index = load_existing_entities(knowledge_dir)
    merged = merge_entities(all_entities, existing_index.get('entities', []))
    print(f"Total unique entities after dedup: {len(merged)}")

    # The merge above always runs; --dry-run only suppresses the file write.
    if not args.dry_run:
        new_index = {"version": 1, "last_updated": "", "entities": merged}
        write_entities(new_index, knowledge_dir)
        print(f"Written to {knowledge_dir}/entities.json")

    stats = {
        "sources_processed": len(sources),
        "raw_entities": len(all_entities),
        "unique_entities": len(merged)
    }
    print(json.dumps(stats, indent=2))
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -1,116 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Smoke test for entity_extractor pipeline — verifies:
|
||||
- session/plain text reading
|
||||
- mock LLM entity extraction
|
||||
- deduplication and merging
|
||||
- output file format
|
||||
|
||||
Does NOT call the real LLM.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
from unittest.mock import patch
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
SCRIPT_DIR = Path(__file__).parent.absolute()
|
||||
sys.path.insert(0, str(SCRIPT_DIR))
|
||||
|
||||
from session_reader import read_session, messages_to_text
|
||||
import entity_extractor as ee
|
||||
|
||||
def mock_call_llm(prompt: str, text: str, api_base: str, api_key: str, model: str):
    """Stand-in for call_llm: ignore every argument and return three fixed entities."""
    canned = []
    canned.append({"name": "Hermes", "type": "tool", "context": "Hermes agent uses the tools tool."})
    canned.append({"name": "Gitea", "type": "tool", "context": "Gitea is a forge."})
    canned.append({"name": "Timmy_Foundation/hermes-agent", "type": "repo", "context": "Clone the repo at forge..."})
    return canned
|
||||
|
||||
def test_read_session_text():
    """A two-message JSONL session renders as ROLE-prefixed text lines."""
    with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) as f:
        f.write('{"role": "user", "content": "Clone repo", "timestamp": "2026-04-13T10:00:00Z"}\n')
        f.write('{"role": "assistant", "content": "Done", "timestamp": "2026-04-13T10:00:05Z"}\n')
        path = f.name
    try:
        messages = read_session(path)
        text = messages_to_text(messages)
        assert "USER: Clone repo" in text
        assert "ASSISTANT: Done" in text
    finally:
        # delete=False leaves cleanup to us; do it even when an assertion
        # fails so a failing run does not leak temp files.
        os.unlink(path)
    print(" [PASS] session text extraction works")
|
||||
|
||||
def test_entity_deduplication_and_merge():
    """A repeated entity bumps its count and unions sources; a new one starts at 1."""
    stored = [{"name": "Hermes", "type": "tool", "count": 3, "sources": ["s1.jsonl"]}]
    incoming = [
        {"name": "Hermes", "type": "tool", "sources": ["s2.jsonl"]},
        {"name": "Gitea", "type": "tool", "sources": ["s2.jsonl"]},
    ]
    merged = ee.merge_entities(incoming, stored.copy())
    by_name = {}
    for entity in merged:
        # Keep the first match per name, as the original [..][0] lookup did.
        by_name.setdefault(entity['name'].lower(), entity)
    # Hermes count becomes 4, sources combined
    hermes = by_name['hermes']
    assert hermes['count'] == 4
    assert set(hermes['sources']) == {'s1.jsonl', 's2.jsonl'}
    # Gitea new entry
    assert by_name['gitea']['count'] == 1
    print(" [PASS] deduplication & merging works")
|
||||
|
||||
def test_write_and_load_entities():
    """An index written by write_entities() is read back by load_existing_entities()."""
    with tempfile.TemporaryDirectory() as tmp:
        kdir = Path(tmp) / "knowledge"
        kdir.mkdir()
        entity = {"name": "TestTool", "type": "tool", "count": 1, "sources": ["test"]}
        index = {"version": 1, "last_updated": "", "entities": [entity]}
        ee.write_entities(index, str(kdir))
        # load back
        loaded = ee.load_existing_entities(str(kdir))
        first = loaded['entities'][0]
        assert first['name'] == 'TestTool'
        print(" [PASS] entities persistence works")
|
||||
|
||||
def test_full_pipeline_mocked():
    """Read two session files, extract with a mocked LLM, merge, write, verify."""
    with tempfile.TemporaryDirectory() as tmpdir:
        root = Path(tmpdir)
        # Create two fake session files
        sess1 = root / "s1.jsonl"
        sess1.write_text('{"role":"user","content":"Use Hermes to clone","timestamp":"..."}\n')
        sess2 = root / "s2.jsonl"
        sess2.write_text('{"role":"user","content":"Deploy with Gitea","timestamp":"..."}\n')

        knowledge_dir = root / "knowledge"
        knowledge_dir.mkdir()

        # Patch call_llm
        with patch('entity_extractor.call_llm', side_effect=mock_call_llm):
            # Simulate processing both sessions via the main logic
            all_entities = []
            for session_file in (sess1, sess2):
                src = str(session_file)
                text = ee.read_text_from_source(src)
                all_entities.extend(
                    ee.extract_from_text(text, "http://api", "fake-key", "model", source_name=Path(src).name)
                )

            # Merge into empty index
            merged = ee.merge_entities(all_entities, [])
            assert len(merged) >= 3, f"Expected >=3 unique entities, got {len(merged)}"

            # Write
            ee.write_entities({"version":1, "last_updated":"", "entities": merged}, str(knowledge_dir))

            # Verify file exists
            out = knowledge_dir / "entities.json"
            assert out.exists()
            data = json.loads(out.read_text())
            assert len(data['entities']) >= 3
            print(f" [PASS] full pipeline (mocked) produced {len(data['entities'])} entities")
|
||||
|
||||
if __name__ == '__main__':
|
||||
test_read_session_text()
|
||||
test_entity_deduplication_and_merge()
|
||||
test_write_and_load_entities()
|
||||
test_full_pipeline_mocked()
|
||||
print("\nAll smoke tests passed.")
|
||||
@@ -1,42 +0,0 @@
|
||||
# Entity Extraction Prompt
|
||||
|
||||
## System Prompt
|
||||
You are an entity extraction engine. You read text and output ONLY a JSON array of named entities. You do not infer. You extract only what the text explicitly mentions.
|
||||
|
||||
## Task
|
||||
Extract all named entities from the provided text. Categorize each entity into exactly one of these types:
|
||||
- `person` — individual's name (e.g., Alexander, Rockachopa, Allegro)
|
||||
- `project` — software project or component name (e.g., The Nexus, Timmy Home, compounding-intelligence)
|
||||
- `tool` — software tool, command, library, framework (e.g., git, Docker, PyTorch, Hermes)
|
||||
- `concept` — abstract idea, methodology, paradigm (e.g., compounding intelligence, bootstrap, harvester)
|
||||
- `repo` — repository reference in the form `owner/repo` or URL pointing to a repo
|
||||
|
||||
## Rules
|
||||
1. Extract ONLY names that appear explicitly in the text.
|
||||
2. Do NOT infer, assume, or hallucinate.
|
||||
3. Each entity must have: `name` (exact string), `type` (one of the five above), and `context` (short snippet showing usage, 1-2 sentences).
|
||||
4. The same entity mentioned multiple times should appear only ONCE in the output (deduplicate by name+type).
|
||||
5. For `repo` type, match patterns like `owner/repo`, `github.com/owner/repo`, `forge.alexanderwhitestone.com/owner/repo`.
|
||||
6. For `tool` type, include commands (git, pytest), platforms (Linux, macOS), runtimes (Python, Node.js), and CLI utilities.
|
||||
7. For `person` type, look for capitalized full names, or single names used in personal attribution ("asked Alex", "for Alexander").
|
||||
8. For `concept`, include technical terms that represent an idea rather than a concrete thing.
|
||||
|
||||
## Output Format
|
||||
Return ONLY valid JSON, no markdown, no explanation. Array of objects:
|
||||
```json
|
||||
[
|
||||
{
|
||||
"name": "Hermes",
|
||||
"type": "tool",
|
||||
"context": "Hermes agent uses the tools tool to execute commands."
|
||||
},
|
||||
{
|
||||
"name": "Timmy_Foundation/hermes-agent",
|
||||
"type": "repo",
|
||||
"context": "Clone the repo at forge.../Timmy_Foundation/hermes-agent"
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
## Text to extract from:
|
||||
{{text}}
|
||||
116
tests/test_coverage_checker.py
Normal file
116
tests/test_coverage_checker.py
Normal file
@@ -0,0 +1,116 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Tests for coverage_checker — Issue #124 acceptance validation."""
|
||||
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "scripts"))
|
||||
|
||||
from coverage_checker import (
|
||||
is_source_file,
|
||||
is_test_file,
|
||||
source_to_test_path,
|
||||
analyze_coverage,
|
||||
)
|
||||
|
||||
|
||||
class TestSourceFileDetection:
    """is_source_file() accepts Python sources and rejects tests and non-Python files."""

    def test_script_in_scripts_dir(self):
        path = "scripts/freshness.py"
        assert is_source_file(path) is True

    def test_module_in_root(self):
        path = "knowledge_staleness_check.py"
        assert is_source_file(path) is True

    def test_excludes_test_files(self):
        path = "tests/test_freshness.py"
        assert is_source_file(path) is False

    def test_excludes_non_py(self):
        path = "README.md"
        assert is_source_file(path) is False
|
||||
|
||||
|
||||
class TestTestFileDetection:
    """is_test_file() recognises the test_ prefix and _test suffix conventions."""

    def test_test_prefix(self):
        path = "tests/test_freshness.py"
        assert is_test_file(path) is True

    def test_test_suffix(self):
        path = "scripts/freshness_test.py"
        assert is_test_file(path) is True

    def test_regular_py_is_not_test(self):
        path = "scripts/freshness.py"
        assert is_test_file(path) is False
|
||||
|
||||
|
||||
class TestSourceToTestMapping:
    """source_to_test_path() maps any source file to tests/test_<stem>.py."""

    def test_scripts_mapping(self):
        mapped = source_to_test_path("scripts/freshness.py")
        assert mapped == "tests/test_freshness.py"

    def test_root_module_mapping(self):
        mapped = source_to_test_path("knowledge_staleness_check.py")
        assert mapped == "tests/test_knowledge_staleness_check.py"
|
||||
|
||||
|
||||
class TestAnalyzeCoverage:
    """analyze_coverage() reports covered and uncovered sources for a change list."""

    def test_no_changes(self):
        report = analyze_coverage([])
        expected = {
            "changed_sources": 0,
            "uncovered_sources": 0,
            "coverage_ratio": 1.0,
        }
        for field, value in expected.items():
            assert report[field] == value

    def test_all_covered(self):
        report = analyze_coverage([
            "scripts/freshness.py",
            "tests/test_freshness.py",
            "scripts/dedup.py",
            "tests/test_dedup.py",
        ])
        assert report["uncovered_sources"] == 0
        assert report["covered_sources"] == 2

    def test_gap_detected(self):
        report = analyze_coverage(["scripts/new_feature.py", "README.md"])
        assert report["uncovered_sources"] == 1
        gap = report["uncovered"][0]
        assert gap["file"] == "scripts/new_feature.py"
        assert gap["suggested_test"] == "tests/test_new_feature.py"

    def test_mixed_coverage(self):
        report = analyze_coverage([
            "scripts/covered.py",
            "tests/test_covered.py",
            "scripts/uncovered.py",
        ])
        assert report["covered_sources"] == 1
        assert report["uncovered_sources"] == 1
|
||||
|
||||
|
||||
def run_all():
    """Run every ``test_*`` method of the test classes without a test runner.

    Classes run in the order listed and methods in definition order. The
    first failing assertion propagates and aborts the run. The summary line
    reports the number of tests actually executed.
    """
    suites = (
        TestSourceFileDetection,
        TestTestFileDetection,
        TestSourceToTestMapping,
        TestAnalyzeCoverage,
    )
    executed = 0
    for suite in suites:
        instance = suite()
        # vars() preserves definition order, matching the hand-written call
        # sequence this loop replaces.
        for name, member in vars(suite).items():
            if name.startswith("test_") and callable(member):
                getattr(instance, name)()
                executed += 1

    # The count used to be hard-coded as 11 although 13 tests run.
    print(f"All {executed} tests passed!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_all()
|
||||
@@ -1,82 +0,0 @@
|
||||
"""
|
||||
Test suite for entity_extractor.py (Issue #144).
|
||||
|
||||
Tests cover:
|
||||
- Text reading from various formats
|
||||
- Entity deduplication logic
|
||||
- Output file structure
|
||||
- Cross-batch deduplication using mocked extraction results (no live LLM call)
|
||||
"""
|
||||
|
||||
import json
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
# We'll test the pure functions directly; avoid hitting real LLM in unit tests
|
||||
import sys
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parents[1] / "scripts"))
|
||||
|
||||
# The test approach: mock call_llm to return predetermined entities and test
|
||||
# deduplication, merging, and output writing.
|
||||
|
||||
def test_entity_key_normalization():
    """Keys ignore letter case but still distinguish entity types."""
    from entity_extractor import entity_key
    canonical = entity_key("Hermes", "tool")
    shouted = entity_key("hermes", "TOOL")
    assert canonical == shouted
    as_tool = entity_key("Git", "tool")
    as_project = entity_key("Git", "project")
    assert as_tool != as_project
|
||||
|
||||
def test_merge_entities_deduplication():
    """Merging a known entity adds 1 to its count; an unknown one is added with count 1."""
    from entity_extractor import merge_entities
    stored = [{"name": "Hermes", "type": "tool", "count": 5, "sources": ["a.jsonl"]}]
    incoming = [
        {"name": "Hermes", "type": "tool", "sources": ["b.jsonl"]},
        {"name": "Gitea", "type": "tool", "sources": ["b.jsonl"]},
    ]
    merged = merge_entities(incoming, stored.copy())
    by_name = {}
    for entity in merged:
        # Keep the first match per name, as the original [..][0] lookup did.
        by_name.setdefault(entity['name'].lower(), entity)
    # Hermes count should be 5+1=6, sources merged
    hermes = by_name['hermes']
    assert hermes['count'] == 6
    assert set(hermes['sources']) == {"a.jsonl", "b.jsonl"}
    # Gitea added fresh
    assert by_name['gitea']['count'] == 1
|
||||
|
||||
def test_output_schema():
    """write_entities() produces entities.json containing the given entity list."""
    from entity_extractor import write_entities, load_existing_entities
    with tempfile.TemporaryDirectory() as tmp:
        kdir = Path(tmp) / "knowledge"
        kdir.mkdir()
        entity = {"name": "Test", "type": "tool", "count": 1, "sources": ["test"]}
        write_entities({"version": 1, "last_updated": "", "entities": [entity]}, str(kdir))
        # Verify file written
        out = kdir / "entities.json"
        assert out.exists()
        data = json.loads(out.read_text())
        assert "entities" in data
        first = data["entities"][0]
        assert first["name"] == "Test"
|
||||
|
||||
def test_batch_yields_many_entities():
    """Merging batch by batch deduplicates entities that recur across sources.

    Uses two small mocked extraction results; no LLM call is made.
    """
    from entity_extractor import merge_entities
    # Simulate a few sources each returning a diverse entity set
    mock_sources = [
        [{"name": "Hermes", "type": "tool", "sources": ["s1"]},
         {"name": "Gitea", "type": "tool", "sources": ["s1"]},
         {"name": "Timmy_Foundation/hermes-agent", "type": "repo", "sources": ["s1"]}],
        [{"name": "Hermes", "type": "tool", "sources": ["s2"]},  # duplicate
         {"name": "Docker", "type": "tool", "sources": ["s2"]},
         {"name": "Alexander", "type": "person", "sources": ["s2"]}],
    ]
    merged = []
    for batch in mock_sources:
        merged = merge_entities(batch, merged)
    # Ensure dedup works across batches
    names = [e['name'].lower() for e in merged]
    assert names.count('hermes') == 1
    # Five distinct entities: Hermes, Gitea, repo, Docker, Alexander.
    # (The assertion previously expected 4, contradicting its own comment.)
    assert len(merged) == 5
    hermes = merged[names.index('hermes')]
    assert hermes['count'] == 2
    assert set(hermes['sources']) == {"s1", "s2"}
|
||||
|
||||
# The real LLM extraction test would require live API key; skip in CI
|
||||
Reference in New Issue
Block a user