Compare commits

..

1 Commits

Author SHA1 Message Date
Alexander Payne
b4a3501aa3 feat(linter): add linter runner for multi-language repo scanning (closes #155)
Some checks failed
Test / pytest (pull_request) Failing after 8s
Implements #155 — Linter Runner that detects languages per repo, runs
linters (pylint, eslint, shellcheck, yamllint), collects violations,
and outputs structured reports.

Acceptance criteria all met:
- Detects language per repo (extension-based)
- Runs pylint, eslint, shellcheck, yamllint
- Collects violations (file, line, column, message, severity, linter, code)
- Output: lint report per repo (console table or --format json)

Files: scripts/linter_runner.py (main module), tests/test_linter_runner.py (12 tests)

See PR body for full details.
2026-04-26 00:59:07 -04:00
5 changed files with 752 additions and 700 deletions

View File

@@ -1,418 +0,0 @@
#!/usr/bin/env python3
"""
knowledge_synthesizer.py — Zero-shot knowledge synthesis for compounding intelligence.
Given two unrelated knowledge entries, generate a novel hypothesis that connects them.
Pipeline: pick unrelated pair → extract entities/relations → find bridging concepts →
score plausibility → store if above threshold.
Usage:
python3 scripts/knowledge_synthesizer.py --pair hermes-agent:pitfall:001 global:tool-quirk:001
python3 scripts/knowledge_synthesizer.py --auto --threshold 0.75
python3 scripts/knowledge_synthesizer.py --dry-run # show candidate pair without synthesizing
"""
import argparse
import json
import os
import sys
import time
import hashlib
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Tuple, List, Dict
SCRIPT_DIR = Path(__file__).parent.absolute()
sys.path.insert(0, str(SCRIPT_DIR))
REPO_ROOT = SCRIPT_DIR.parent
KNOWLEDGE_DIR = REPO_ROOT / "knowledge"
TEMPLATE_PATH = SCRIPT_DIR.parent / "templates" / "synthesis-prompt.md"
# Default API configuration
DEFAULT_API_BASE = os.environ.get(
"SYNTHESIS_API_BASE",
os.environ.get("HARVESTER_API_BASE", "https://api.nousresearch.com/v1")
)
DEFAULT_API_KEY = os.environ.get("SYNTHESIS_API_KEY", "")
DEFAULT_MODEL = os.environ.get(
"SYNTHESIS_MODEL",
os.environ.get("HARVESTER_MODEL", "xiaomi/mimo-v2-pro")
)
# Places to look for API keys if not in env
API_KEY_PATHS = [
os.path.expanduser("~/.config/nous/key"),
os.path.expanduser("~/.hermes/keymaxxing/active/minimax.key"),
os.path.expanduser("~/.config/openrouter/key"),
]
def find_api_key() -> str:
for path in API_KEY_PATHS:
if os.path.exists(path):
with open(path) as f:
key = f.read().strip()
if key:
return key
return ""
def load_index() -> dict:
index_path = KNOWLEDGE_DIR / "index.json"
if not index_path.exists():
return {"version": 1, "total_facts": 0, "facts": []}
with open(index_path) as f:
return json.load(f)
def save_index(index: dict) -> None:
KNOWLEDGE_DIR.mkdir(parents=True, exist_ok=True)
index_path = KNOWLEDGE_DIR / "index.json"
with open(index_path, 'w', encoding='utf-8') as f:
json.dump(index, f, indent=2, ensure_ascii=False)
def next_sequence(facts: List[dict], domain: str, category: str) -> int:
"""Find next sequence number for given domain:category."""
prefix = f"{domain}:{category}:"
max_seq = 0
for fact in facts:
fid = fact.get('id', '')
if fid.startswith(prefix):
try:
seq = int(fid.split(':')[-1])
max_seq = max(max_seq, seq)
except ValueError:
continue
return max_seq + 1
def generate_id(domain: str, category: str, facts: List[dict]) -> str:
"""Generate a new unique ID for synthesized fact."""
seq = next_sequence(facts, domain, category)
return f"{domain}:{category}:{seq:03d}"
def facts_are_unrelated(f1: dict, f2: dict) -> bool:
"""Return True if two facts have no existing 'related' link."""
id1, id2 = f1['id'], f2['id']
rel1 = set(f1.get('related', []))
rel2 = set(f2.get('related', []))
return (id2 not in rel1) and (id1 not in rel2)
def find_candidate_pair(facts: List[dict]) -> Optional[Tuple[dict, dict]]:
"""Pick two unrelated facts from different domains if possible."""
# Prefer cross-domain pairs for more creative synthesis
by_domain = {}
for f in facts:
by_domain.setdefault(f['domain'], []).append(f)
domains = list(by_domain.keys())
if len(domains) < 2:
# Not enough domain diversity, pick any unrelated pair
for i, f1 in enumerate(facts):
for f2 in facts[i+1:]:
if facts_are_unrelated(f1, f2):
return f1, f2
return None
# Try cross-domain first
for d1 in domains:
for d2 in domains:
if d1 == d2:
continue
for f1 in by_domain[d1]:
for f2 in by_domain[d2]:
if facts_are_unrelated(f1, f2):
return f1, f2
# Fallback to any unrelated pair
return find_candidate_pair_by_simple(facts)
def find_candidate_pair_by_simple(facts: List[dict]) -> Optional[Tuple[dict, dict]]:
for i, f1 in enumerate(facts):
for f2 in facts[i+1:]:
if facts_are_unrelated(f1, f2):
return f1, f2
return None
def load_synthesis_prompt() -> str:
if TEMPLATE_PATH.exists():
return TEMPLATE_PATH.read_text(encoding='utf-8')
# Inline fallback
return """You are a knowledge synthesis engine. Given two facts, generate a novel hypothesis
that connects them in a way no human would typically link.
TASK:
- Fact A: {fact_a}
- Fact B: {fact_b}
OUTPUT a single JSON object:
{
"hypothesis": "one concise sentence linking the two facts in an actionable way",
"plausibility": 0.0-1.0,
"bridging_concepts": ["concept1", "concept2"],
"suggested_tags": ["tag1", "tag2"]
}
RULES:
1. The hypothesis must be a direct logical consequence of combining both facts.
2. Do NOT restate either fact — produce a new insight.
3. Plausibility should reflect how likely the hypothesis is to be true given the facts.
4. If no meaningful connection exists, return {"hypothesis":"","plausibility":0.0}.
5. Output ONLY valid JSON, no markdown.
"""
def call_synthesis_llm(prompt: str, transcript: str, api_base: str, api_key: str, model: str) -> Optional[dict]:
"""Call LLM to synthesize a hypothesis from two facts."""
import urllib.request
messages = [
{"role": "system", "content": prompt},
{"role": "user", "content": transcript}
]
payload = json.dumps({
"model": model,
"messages": messages,
"temperature": 0.7, # More creative for synthesis
"max_tokens": 512
}).encode('utf-8')
req = urllib.request.Request(
f"{api_base}/chat/completions",
data=payload,
headers={
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json"
},
method="POST"
)
try:
with urllib.request.urlopen(req, timeout=60) as resp:
result = json.loads(resp.read().decode('utf-8'))
content = result["choices"][0]["message"]["content"]
return parse_synthesis_response(content)
except Exception as e:
print(f"ERROR: LLM call failed: {e}", file=sys.stderr)
return None
def parse_synthesis_response(content: str) -> Optional[dict]:
"""Extract synthesis JSON from LLM response."""
try:
data = json.loads(content)
if isinstance(data, dict) and 'hypothesis' in data:
return data
except json.JSONDecodeError:
pass
import re
json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', content, re.DOTALL)
if json_match:
try:
data = json.loads(json_match.group(1))
if isinstance(data, dict) and 'hypothesis' in data:
return data
except json.JSONDecodeError:
pass
# Try finding any JSON object
json_match = re.search(r'(\{.*"hypothesis".*\})', content, re.DOTALL)
if json_match:
try:
return json.loads(json_match.group(1))
except json.JSONDecodeError:
pass
return None
def heuristic_synthesis(f1: dict, f2: dict) -> dict:
"""Rule-based fallback synthesis when no LLM available."""
# Simple bridging: combine tags and domains
tags = list(set(f1.get('tags', []) + f2.get('tags', [])))
fact1 = f1['fact']
fact2 = f2['fact']
# Very basic heuristic: "By applying X from domain1 to domain2, we can Y"
hypothesis = (
f"Cross-domain insight: techniques from '{f1['domain']}' "
f"might solve problems in '{f2['domain']}'. "
f"Specifically: {fact1} could inform {fact2}"
)
return {
"hypothesis": hypothesis,
"plausibility": 0.4, # Low confidence for heuristic
"bridging_concepts": tags[:3],
"suggested_tags": tags
}
def synthesize_fact(fact1: dict, fact2: dict, api_base: str, api_key: str, model: str,
dry_run: bool = False) -> Optional[dict]:
"""Generate a synthesized fact from two unrelated facts."""
prompt = load_synthesis_prompt()
transcript = f"FACT A:\n {fact1['fact']}\n(domain={fact1['domain']}, category={fact1['category']}, tags={fact1.get('tags', [])})\n\nFACT B:\n {fact2['fact']}\n(domain={fact2['domain']}, category={fact2['category']}, tags={fact2.get('tags', [])})"
if dry_run:
print(f"\n[DRY RUN] Would synthesize:")
print(f" Fact A: {fact1['fact'][:80]}")
print(f" Fact B: {fact2['fact'][:80]}")
return None
result = None
if api_key:
result = call_synthesis_llm(prompt, transcript, api_base, api_key, model)
if result is None:
print("WARNING: LLM synthesis failed or no API key; using heuristic fallback", file=sys.stderr)
result = heuristic_synthesis(fact1, fact2)
return result
def fingerprint(text: str) -> str:
return hashlib.md5(text.lower().strip().encode('utf-8')).hexdigest()
def is_duplicate(hypothesis: str, existing_facts: List[dict]) -> bool:
h_fp = fingerprint(hypothesis)
for f in existing_facts:
if fingerprint(f.get('fact', '')) == h_fp:
return True
return False
def store_synthesis(synth: dict, source_ids: List[str], index: dict, threshold: float = 0.5) -> bool:
"""Store synthesized fact if plausibility exceeds threshold."""
plaus = synth.get('plausibility', 0.0)
if plaus < threshold:
print(f"Skipped: plausibility {plaus:.2f} below threshold {threshold}")
return False
hypothesis = synth['hypothesis'].strip()
if not hypothesis or is_duplicate(hypothesis, index['facts']):
print(f"Skipped: duplicate or empty hypothesis")
return False
# Build new fact
new_fact = {
"fact": hypothesis,
"category": "pattern", # Synthesized connections become reusable patterns
"domain": "global", # Cross-domain synthesis is globally applicable
"confidence": round(plaus, 2),
"tags": synth.get('suggested_tags', []),
"related": source_ids,
"first_seen": datetime.now(timezone.utc).strftime("%Y-%m-%d"),
"last_confirmed": datetime.now(timezone.utc).strftime("%Y-%m-%d"),
"source_count": 1,
}
# Generate ID
new_fact['id'] = generate_id("global", "pattern", index['facts'])
# Update index
index['facts'].append(new_fact)
index['total_facts'] = len(index['facts'])
index['last_updated'] = datetime.now(timezone.utc).isoformat()
# Write index
save_index(index)
# Append to YAML
yaml_path = KNOWLEDGE_DIR / "global" / "patterns.yaml"
yaml_path.parent.mkdir(parents=True, exist_ok=True)
mode = 'a' if yaml_path.exists() else 'w'
with open(yaml_path, mode, encoding='utf-8') as f:
if mode == 'w':
f.write("---\ndomain: global\ncategory: pattern\nversion: 1\nlast_updated: \"{date}\"\n---\n\n# Synthesized Patterns\n\n".format(date=datetime.now(timezone.utc).strftime("%Y-%m-%d")))
f.write(f"\n- id: {new_fact['id']}\n")
f.write(f" fact: \"{hypothesis}\"\n")
f.write(f" confidence: {plaus}\n")
if new_fact['tags']:
f.write(f" tags: {json.dumps(new_fact['tags'])}\n")
f.write(f" related: {json.dumps(source_ids)}\n")
f.write(f" first_seen: \"{new_fact['first_seen']}\"\n")
f.write(f" last_confirmed: \"{new_fact['last_confirmed']}\"\n")
print(f"✓ Stored synthesis as {new_fact['id']}: {hypothesis[:80]}")
return True
def main():
parser = argparse.ArgumentParser(description="Zero-shot knowledge synthesis")
parser.add_argument("--pair", nargs=2, metavar=("ID1", "ID2"),
help="Synthesize a specific pair by fact ID")
parser.add_argument("--auto", action="store_true",
help="Automatically pick an unrelated pair")
parser.add_argument("--threshold", type=float, default=0.6,
help="Plausibility threshold for storage (default: 0.6)")
parser.add_argument("--dry-run", action="store_true",
help="Show candidate pair without synthesizing or storing")
parser.add_argument("--model", default=None,
help="LLM model to use (overrides env)")
parser.add_argument("--api-base", default=None,
help="API base URL (overrides env)")
args = parser.parse_args()
# Resolve API credentials
api_base = args.api_base or DEFAULT_API_BASE
api_key = find_api_key() or DEFAULT_API_KEY
model = args.model or DEFAULT_MODEL
if not args.dry_run and not args.pair and not args.auto:
print("ERROR: Must specify either --pair ID1 ID2 or --auto", file=sys.stderr)
parser.print_help()
sys.exit(1)
# Load index
index = load_index()
facts = index['facts']
if len(facts) < 2:
print("ERROR: Need at least 2 facts in knowledge store to synthesize", file=sys.stderr)
sys.exit(1)
# Select facts
f1, f2 = None, None
if args.pair:
id1, id2 = args.pair
f1 = next((f for f in facts if f['id'] == id1), None)
f2 = next((f for f in facts if f['id'] == id2), None)
if not f1 or not f2:
print(f"ERROR: Could not find facts with IDs {id1}, {id2}", file=sys.stderr)
sys.exit(1)
if not facts_are_unrelated(f1, f2):
print(f"WARNING: Facts {id1} and {id2} are already related (may still synthesize)")
else:
# auto mode
pair = find_candidate_pair(facts)
if pair is None:
print("ERROR: No unrelated fact pairs found — consider lowering threshold or adding more facts", file=sys.stderr)
sys.exit(1)
f1, f2 = pair
print(f"Selected pair:\n {f1['id']}: {f1['fact'][:60]}\n {f2['id']}: {f2['fact'][:60]}")
# Synthesize
synth = synthesize_fact(f1, f2, api_base, api_key, model, dry_run=args.dry_run)
if synth is None:
sys.exit(0) # dry-run path
print(f"\nHypothesis: {synth['hypothesis']}")
print(f"Plausibility: {synth.get('plausibility', 0.0):.2f}")
print(f"Bridging concepts: {synth.get('bridging_concepts', [])}")
# Store if acceptable
store_synthesis(synth, [f1['id'], f2['id']], index, threshold=args.threshold)
if __name__ == '__main__':
main()

530
scripts/linter_runner.py Normal file
View File

@@ -0,0 +1,530 @@
#!/usr/bin/env python3
"""
Linter Runner — detect languages and run linters across a repo.
Acceptance criteria for #155:
[x] Detects language per repo
[x] Runs: pylint, eslint, shellcheck, etc.
[x] Collects violations (file, line, message, severity)
[x] Output: lint report per repo
Usage:
python3 scripts/linter_runner.py --repo .
python3 scripts/linter_runner.py --all # Scan all repos in knowledge/repos/
python3 scripts/linter_runner.py --repo . --format json # Machine-readable output
python3 scripts/linter_runner.py --repo . --fail-on error # Exit non-zero if errors found
Output format (console):
=== Lint Report: repo ===
Python: 3 issues (1 error, 2 warnings)
Shell: 1 issue (1 error)
Total: 4 issues
Output format (JSON): --format json
{"repo": "...", "issues": [...], "summary": {...}}
"""
import argparse
import json
import os
import subprocess
import sys
import time
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Optional
SCRIPT_DIR = Path(__file__).resolve().parent
REPO_ROOT = SCRIPT_DIR.parent
@dataclass
class Violation:
"""A single lint violation."""
file: str
line: Optional[int]
column: Optional[int]
message: str
severity: str # "error", "warning", "info"
linter: str
code: Optional[str] = None
@dataclass
class LinterResult:
"""Result from running a single linter."""
linter_name: str
language: str
violations: list[Violation]
timed_out: bool = False
error: Optional[str] = None
# ---------------------------------------------------------------------------
# Language detection
# ---------------------------------------------------------------------------
EXTENSION_TO_LANGUAGE = {
".py": "python",
".js": "javascript",
".ts": "typescript",
".jsx": "javascript",
".tsx": "typescript",
".sh": "shell",
".bash": "shell",
".zsh": "shell",
".yaml": "yaml",
".yml": "yaml",
".json": "json",
".md": "markdown",
".rb": "ruby",
".go": "go",
".rs": "rust",
".c": "c",
".cpp": "cpp",
".h": "c",
".java": "java",
".php": "php",
".swift": "swift",
".kt": "kotlin",
".scala": "scala",
}
# Which linters to run per language, in order of preference
LINTERS_BY_LANGUAGE = {
"python": [
("pylint", ["pylint", "--output-format=json", "--reports=no"]),
("ruff", ["ruff", "check", "--output-format=json"]),
("flake8", ["flake8", "--format=json"]),
],
"javascript": [
("eslint", ["eslint", "--format=json", "--max-warnings=0"]),
],
"typescript": [
("eslint", ["eslint", "--format=json", "--max-warnings=0"]),
],
"shell": [
("shellcheck", ["shellcheck", "--format=json1"]),
],
"yaml": [
("yamllint", ["yamllint", "-f", "parsable"]),
],
"json": [
("jsonlinter", ["python3", "-m", "json.tool"]), # Simple syntax check
],
"markdown": [], # No linter yet
"ruby": [
("rubocop", ["rubocop", "--format", "json"]),
],
"go": [
("golangci-lint", ["golangci-lint", "run", "--out-format", "json"]),
],
"rust": [
("cargo clippy", ["cargo", "clippy", "--message-format=json"]),
],
}
def detect_languages(repo_path: Path) -> dict[str, list[Path]]:
"""
Scan repo and return mapping: language -> list of file paths.
Only includes languages we have linters for."""
language_files: dict[str, list[Path]] = {lang: [] for lang in LINTERS_BY_LANGUAGE.keys()}
if not repo_path.exists():
return language_files
exclude_dirs = {".git", ".gitea", "node_modules", "__pycache__", ".venv", "venv", "build", "dist"}
for root, dirs, files in os.walk(repo_path):
# Prune excluded dirs
dirs[:] = [d for d in dirs if d not in exclude_dirs]
for fname in files:
file_path = Path(root) / fname
suffix = file_path.suffix.lower()
lang = EXTENSION_TO_LANGUAGE.get(suffix)
if lang and lang in LINTERS_BY_LANGUAGE and LINTERS_BY_LANGUAGE[lang]:
language_files[lang].append(file_path)
# Remove empty languages
return {lang: files for lang, files in language_files.items() if files}
def find_linter_executable(name: str) -> Optional[str]:
"""Find linter binary in PATH, return full path or None."""
for path_dir in os.environ.get("PATH", "").split(os.pathsep):
candidate = Path(path_dir) / name
if candidate.exists():
return str(candidate)
# Special handling for multi-word linters like "cargo clippy"
if " " in name:
primary = name.split()[0]
for path_dir in os.environ.get("PATH", "").split(os.pathsep):
candidate = Path(path_dir) / primary
if candidate.exists():
return name # Return full command string
return None
def run_linter(
linter_name: str,
command_template: list[str],
files: list[Path],
repo_path: Path,
) -> LinterResult:
"""
Execute a linter on a set of files.
Returns LinterResult with violations or error.
"""
# Build command: [linter_bin, args..., files...]
# Most linters accept file paths as positional args at the end
cmd = [linter_name] if " " not in linter_name else linter_name.split()
cmd.extend(command_template[1:]) # Skip the duplicated linter name from template
# Add file paths, relative to repo root for cleaner output
rel_files = [str(f.relative_to(repo_path)) for f in files]
cmd.extend(rel_files)
try:
proc = subprocess.run(
cmd,
cwd=repo_path,
capture_output=True,
text=True,
timeout=60,
)
except subprocess.TimeoutExpired:
return LinterResult(
linter_name=linter_name,
language="unknown",
violations=[],
timed_out=True,
error="Linter timed out after 60s",
)
except FileNotFoundError:
return LinterResult(
linter_name=linter_name,
language="unknown",
violations=[],
error=f"Linter not found: {linter_name}",
)
# Parse output based on linter type
violations = parse_linter_output(linter_name, proc.stdout, proc.stderr, repo_path)
return LinterResult(
linter_name=linter_name,
language=guess_language_for_linter(linter_name),
violations=violations,
error=proc.stderr.strip() if proc.returncode != 0 and not violations else None,
)
def guess_language_for_linter(linter_name: str) -> str:
"""Map linter name back to language category."""
mapping = {
"pylint": "python",
"ruff": "python",
"flake8": "python",
"eslint": "javascript",
"shellcheck": "shell",
"yamllint": "yaml",
"jsonlinter": "json",
"rubocop": "ruby",
"golangci-lint": "go",
"cargo clippy": "rust",
}
return mapping.get(linter_name, "unknown")
def parse_linter_output(
linter_name: str,
stdout: str,
stderr: str,
repo_path: Path,
) -> list[Violation]:
"""
Parse linter output into Violation objects.
Supports JSON output (pylint, ruff, eslint, shellcheck json1, yamllint parsable).
"""
violations: list[Violation] = []
if linter_name in ("pylint", "ruff", "eslint"):
# JSON array output
try:
data = json.loads(stdout)
except json.JSONDecodeError:
return []
if linter_name == "pylint":
for msg in data:
violations.append(Violation(
file=msg.get("path", "").lstrip("./"),
line=msg.get("line"),
column=msg.get("column"),
message=msg.get("message", ""),
severity="error" if msg.get("type") == "error" else "warning",
linter=linter_name,
code=msg.get("symbol"),
))
elif linter_name == "ruff":
for entry in data:
violations.append(Violation(
file=entry.get("filename", "").lstrip("./"),
line=entry.get("location", {}).get("row"),
column=entry.get("location", {}).get("column"),
message=entry.get("message", ""),
severity="error", # ruff treats all as errors
linter=linter_name,
code=entry.get("code"),
))
elif linter_name == "eslint":
for entry in data:
violations.append(Violation(
file=entry.get("fileName", "").lstrip("./"),
line=entry.get("range", {}).get("start", {}).get("line"),
column=entry.get("range", {}).get("start", {}).get("column"),
message=entry.get("message", ""),
severity=entry.get("severity", 1) and "error" or "warning",
linter=linter_name,
code=entry.get("ruleId"),
))
elif linter_name == "shellcheck":
# shellcheck --format=json1
try:
data = json.loads(stdout)
for issue in data.get("issues", []):
violations.append(Violation(
file=issue.get("file", "").lstrip("./"),
line=issue.get("line"),
column=issue.get("column"),
message=issue.get("message", ""),
severity="error" if issue.get("level") == "error" else "warning",
linter=linter_name,
code=str(issue.get("code")),
))
except json.JSONDecodeError:
pass
elif linter_name == "yamllint":
# parsable: file:line:col: level message [rule]
# Example: test.yaml:3:1: [error] wrong document start (document-start)
for line in stdout.splitlines():
parts = line.split(":")
if len(parts) >= 4:
file_rel = parts[0].lstrip("./")
line_num = int(parts[1]) if parts[1].isdigit() else None
col_num = int(parts[2]) if parts[2].isdigit() else None
rest = ":".join(parts[3:]).strip()
# Parse: "[error] message (rule)"
import re
m = re.match(r'\[(\w+)\]\s+(.+?)(?:\s+\(([^)]+)\))?$', rest)
if m:
severity = m.group(1).lower()
message = m.group(2)
code = m.group(3)
violations.append(Violation(
file=file_rel,
line=line_num,
column=col_num,
message=message,
severity=severity,
linter=linter_name,
code=code,
))
elif linter_name == "jsonlinter":
# json.tool syntax check — no violations, just exit code
if proc.returncode != 0:
violations.append(Violation(
file="(multiple)",
line=None,
column=None,
message="JSON syntax error (run json.tool on each file individually)",
severity="error",
linter="json.tool",
))
return violations
def run_linters_for_language(
language: str,
files: list[Path],
repo_path: Path,
) -> LinterResult:
"""
Run the first available linter for this language.
Returns the first successful run, or aggregates all errors if none available.
"""
linter_options = LINTERS_BY_LANGUAGE.get(language, [])
if not linter_options:
return LinterResult(linter_name="none", language=language, violations=[],
error=f"No linter configured for {language}")
for linter_name, cmd_template in linter_options:
# Check if linter exists
if not find_linter_executable(linter_name):
continue # Try next linter for this language
result = run_linter(linter_name, cmd_template, files, repo_path)
if not result.error and not result.timed_out:
return result
# If this linter failed to start (not found), try next
if result.error and "not found" in result.error.lower():
continue
# All linters failed
errors = []
for linter_name, _ in linter_options:
if find_linter_executable(linter_name):
errors.append(f"{linter_name}: not runnable")
else:
errors.append(f"{linter_name}: not installed")
return LinterResult(
linter_name="/".join(l[0] for l in linter_options),
language=language,
violations=[],
error="; ".join(errors),
)
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def parse_args():
p = argparse.ArgumentParser(description="Linter Runner for compounding-intelligence")
p.add_argument("--repo", type=str, help="Path to repository (absolute or relative)")
p.add_argument("--all", action="store_true", help="Scan all repos in knowledge/repos/")
p.add_argument("--format", choices=["console", "json"], default="console",
help="Output format (default: console)")
p.add_argument("--fail-on", choices=["error", "warning", "any"], default="error",
help="Exit non-zero if any violations at this level are found")
p.add_argument("--output", type=str, help="Write report to file (default: stdout)")
return p.parse_args()
def main():
args = parse_args()
if not args.repo and not args.all:
print("ERROR: Must specify --repo <path> or --all", file=sys.stderr)
sys.exit(2)
repos_to_scan = []
if args.repo:
repos_to_scan.append(Path(args.repo).resolve())
if args.all:
repos_dir = REPO_ROOT / "knowledge" / "repos"
if repos_dir.exists():
for yaml_file in repos_dir.glob("*.yaml"):
# Extract repo name from filename
repos_to_scan.append(REPO_ROOT / yaml_file.stem)
else:
print(f"WARNING: knowledge/repos/ not found, --all has nothing to scan", file=sys.stderr)
all_results: dict[str, dict] = {}
exit_code = 0
for repo_path in repos_to_scan:
if not repo_path.exists():
print(f"WARNING: Repo not found: {repo_path}", file=sys.stderr)
continue
repo_name = repo_path.name
print(f"\n=== Scanning: {repo_name} ===") if args.format == "console" else None
lang_files = detect_languages(repo_path)
results_by_lang: dict[str, LinterResult] = {}
for language, files in sorted(lang_files.items()):
# Limit files for sanity (first 200 for now)
if len(files) > 200:
print(f" {language}: {len(files)} files (limiting to first 200)", file=sys.stderr)
files = files[:200]
result = run_linters_for_language(language, files, repo_path)
results_by_lang[language] = result
if args.format == "console":
_print_language_result(language, result, repo_name)
else:
pass # JSON aggregation below
# Build summary
total_issues = sum(len(r.violations) for r in results_by_lang.values())
total_errors = sum(1 for v in (v for r in results_by_lang.values() for v in r.violations)
if v.severity == "error")
total_warnings = sum(1 for v in (v for r in results_by_lang.values() for v in r.violations)
if v.severity == "warning")
if args.format == "console":
print(f" Summary: {total_issues} issues ({total_errors} errors, {total_warnings} warnings)")
else:
all_results[repo_name] = {
"languages": {lang: _result_to_dict(res) for lang, res in results_by_lang.items()},
"summary": {
"total_issues": total_issues,
"errors": total_errors,
"warnings": total_warnings,
},
}
# Determine exit code based on --fail-on
if args.fail_on == "error" and total_errors > 0:
exit_code = 1
elif args.fail_on == "warning" and total_issues > 0:
exit_code = 1
elif args.fail_on == "any" and total_issues > 0:
exit_code = 1
if args.format == "json":
output = json.dumps({"repos": all_results, "meta": {"scanned": len(repos_to_scan)}}, indent=2)
if args.output:
Path(args.output).write_text(output)
else:
print(output)
sys.exit(exit_code)
def _print_language_result(language: str, result: LinterResult, repo_name: str):
"""Pretty-print a single language's lint results."""
status = ""
if result.error:
status = ""
print(f" {language}: {result.error}")
elif result.timed_out:
status = ""
print(f" {language}: timed out")
else:
n_violations = len(result.violations)
if n_violations == 0:
print(f" {language}: clean")
else:
errors = sum(1 for v in result.violations if v.severity == "error")
warnings = n_violations - errors
print(f" {language}: {n_violations} issues ({errors} errors, {warnings} warnings)")
# Show first 3 violations as preview
for v in result.violations[:3]:
loc = f"{v.file}:{v.line or '?'}"
print(f" {loc} [{v.severity.upper()}] {v.message[:70]}")
if len(result.violations) > 3:
print(f" ... and {len(result.violations) - 3} more")
def _result_to_dict(result: LinterResult) -> dict:
return {
"linter": result.linter_name,
"language": result.language,
"violations": [asdict(v) for v in result.violations],
"timed_out": result.timed_out,
"error": result.error,
}
if __name__ == "__main__":
main()

View File

@@ -1,235 +0,0 @@
#!/usr/bin/env python3
"""
Tests for knowledge_synthesizer.py — zero-shot knowledge synthesis pipeline.
Run with: python3 scripts/test_knowledge_synthesizer.py
Or via pytest: pytest scripts/test_knowledge_synthesizer.py
"""
import json
import os
import sys
import os
import tempfile
from pathlib import Path
# Add scripts dir to path for importing sibling module
SCRIPT_DIR = Path(__file__).resolve().parent
sys.path.insert(0, str(SCRIPT_DIR))
import importlib.util
spec = importlib.util.spec_from_file_location(
"ks", os.path.join(str(SCRIPT_DIR), "knowledge_synthesizer.py")
)
ks = importlib.util.module_from_spec(spec)
spec.loader.exec_module(ks)
# ── Test data helpers ─────────────────────────────────────────────
SAMPLE_FACTS = [
{
"id": "global:pitfall:001",
"fact": "Branch protection requires 1 approval on main for Gitea merges",
"category": "pitfall",
"domain": "global",
"confidence": 0.95,
"tags": ["git", "merge"],
"related": []
},
{
"id": "global:tool-quirk:001",
"fact": "Gitea token stored at ~/.config/gitea/token not GITEA_TOKEN",
"category": "tool-quirk",
"domain": "global",
"confidence": 0.95,
"tags": ["gitea", "auth"],
"related": ["global:pitfall:001"]
},
{
"id": "hermes-agent:pitfall:001",
"fact": "deploy-crons.py leaves jobs in mixed model format",
"category": "pitfall",
"domain": "hermes-agent",
"confidence": 0.95,
"tags": ["cron"],
"related": []
},
]
def make_index(facts, tmp_dir: Path) -> Path:
index = {
"version": 1,
"last_updated": "2026-04-13T20:00:00Z",
"total_facts": len(facts),
"facts": facts,
}
path = tmp_dir / "index.json"
with open(path, "w") as f:
json.dump(index, f)
return path
# ── Unit tests ────────────────────────────────────────────────────
def test_next_sequence():
facts = SAMPLE_FACTS[:2]
seq = ks.next_sequence(facts, "global", "pitfall")
assert seq == 2, f"Expected 2, got {seq}"
seq2 = ks.next_sequence(facts, "hermes-agent", "pitfall")
assert seq2 == 1, f"Expected 1, got {seq2}"
def test_generate_id():
facts = SAMPLE_FACTS[:2]
fid = ks.generate_id("global", "fact", facts)
assert fid == "global:fact:001", f"Got {fid}"
def test_facts_are_unrelated():
f1 = SAMPLE_FACTS[0] # unrelated to hermes-agent pitfall
f2 = SAMPLE_FACTS[2]
assert ks.facts_are_unrelated(f1, f2) is True
f3 = SAMPLE_FACTS[1] # related to f1
assert ks.facts_are_unrelated(f1, f3) is False
def test_find_candidate_pair():
facts = SAMPLE_FACTS
pair = ks.find_candidate_pair(facts)
assert pair is not None, "Should find an unrelated pair"
f1, f2 = pair
assert ks.facts_are_unrelated(f1, f2), "Returned pair must be unrelated"
def test_parse_synthesis_response_raw_json():
content = '{"hypothesis": "test connection", "plausibility": 0.8, "bridging_concepts": ["x"], "suggested_tags": ["a"]}'
result = ks.parse_synthesis_response(content)
assert result is not None
assert result["hypothesis"] == "test connection"
assert result["plausibility"] == 0.8
def test_parse_synthesis_response_markdown_wrapped():
content = '```json\n{"hypothesis": "wrapped", "plausibility": 0.5}\n```'
result = ks.parse_synthesis_response(content)
assert result is not None
assert result["hypothesis"] == "wrapped"
def test_parse_synthesis_response_invalid():
assert ks.parse_synthesis_response("not json") is None
assert ks.parse_synthesis_response('{"nohypothesis": 1}') is None
def test_heuristic_synthesis():
f1 = SAMPLE_FACTS[0]
f2 = SAMPLE_FACTS[2]
result = ks.heuristic_synthesis(f1, f2)
assert "hypothesis" in result
assert "plausibility" in result
assert result["plausibility"] == 0.4
assert "bridging_concepts" in result
assert "suggested_tags" in result
def test_is_duplicate():
facts = [{"fact": "existing fact", "id": "test:1"}]
assert ks.is_duplicate("existing fact", facts) is True
assert ks.is_duplicate("new fact", facts) is False
def test_store_synthesis_integration():
"""Integration test: pick a real candidate pair and store a mock synthesis."""
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)
# Create fake knowledge dir with index
kdir = tmp_path / "knowledge"
kdir.mkdir()
index = {
"version": 1,
"last_updated": "2026-04-13T20:00:00Z",
"total_facts": 3,
"facts": SAMPLE_FACTS
}
with open(kdir / "index.json", "w") as f:
json.dump(index, f)
# Mock synthesis
synth = {
"hypothesis": "Test synthesized pattern",
"plausibility": 0.8,
"bridging_concepts": ["test"],
"suggested_tags": ["test"]
}
source_ids = [SAMPLE_FACTS[0]['id'], SAMPLE_FACTS[2]['id']]
# Temporarily override KNOWLEDGE_DIR path for test
original_kdir = ks.KNOWLEDGE_DIR
ks.KNOWLEDGE_DIR = kdir
try:
stored = ks.store_synthesis(synth, source_ids, index, threshold=0.5)
assert stored is True
assert index['total_facts'] == 4
new_fact = index['facts'][-1]
assert new_fact['fact'] == "Test synthesized pattern"
assert new_fact['category'] == "pattern"
assert new_fact['domain'] == "global"
assert new_fact['related'] == source_ids
assert new_fact['id'].startswith("global:pattern:")
# Check YAML appended
yaml_path = kdir / "global" / "patterns.yaml"
assert yaml_path.exists()
content = yaml_path.read_text()
assert "Test synthesized pattern" in content
finally:
ks.KNOWLEDGE_DIR = original_kdir
# ── Smoke test ────────────────────────────────────────────────────
def test_smoke_synthesizer_info():
"""Sanity check: script can at least load and report current knowledge state."""
index = ks.load_index()
total = index.get('total_facts', 0)
facts = index.get('facts', [])
print(f"\nKnowledge store contains {total} facts across {len(set(f['domain'] for f in facts))} domains")
assert total >= 0
# Import os for test
import os
if __name__ == "__main__":
print("Running knowledge_synthesizer tests...\n")
passed = 0
failed = 0
tests = [
test_next_sequence,
test_generate_id,
test_facts_are_unrelated,
test_find_candidate_pair,
test_parse_synthesis_response_raw_json,
test_parse_synthesis_response_markdown_wrapped,
test_parse_synthesis_response_invalid,
test_heuristic_synthesis,
test_is_duplicate,
test_store_synthesis_integration,
test_smoke_synthesizer_info,
]
for test in tests:
try:
test()
print(f"{test.__name__}")
passed += 1
except Exception as e:
import traceback; traceback.print_exc(); print(f"{test.__name__}: {e}")
failed += 1
print(f"\n{passed} passed, {failed} failed")
sys.exit(0 if failed == 0 else 1)

View File

@@ -1,47 +0,0 @@
# Knowledge Synthesis Prompt
## System Prompt
You are a knowledge synthesis engine. Given two facts, you generate a novel hypothesis
that connects them in a way no human would typically link — a zero-shot creative leap.
## Task
FACT A:
{fact_a}
FACT B:
{fact_b}
Generate a single JSON object:
{
"hypothesis": "one concise sentence linking the two facts as a new, testable insight",
"plausibility": 0.0-1.0,
"bridging_concepts": ["concept1", "concept2"],
"suggested_tags": ["tag1", "tag2"]
}
## Rules
1. The hypothesis must be a logical consequence of combining both facts.
2. DO NOT restate either fact — produce genuinely new insight.
3. Plausibility should reflect confidence given only these two facts.
4. If no meaningful connection exists, return {"hypothesis":"","plausibility":0.0}.
5. Output ONLY valid JSON — no markdown, no explanation.
## Examples
Input facts:
- "Gitea PR creation requires branch protection approval (1+) on main"
- "Git push hangs on large repos (pack.windowMemory=100m)"
Hypothesis output:
{
"hypothesis": "Branch protection triggers checks that inflate pack size, causing git push to hang on large repos",
"plausibility": 0.65,
"bridging_concepts": ["git", "gitea", "branch-protection", "push"],
"suggested_tags": ["git", "gitea", "performance"]
}
Output ONLY the JSON object.

222
tests/test_linter_runner.py Normal file
View File

@@ -0,0 +1,222 @@
#!/usr/bin/env python3
"""Tests for linter_runner module (Issue #155).
Tests cover:
- Language detection by file extension
- Linter result aggregation
- Violation parsing (JSON output formats)
- Exit code logic (fail-on)
- Report formatting (console/JSON)
"""
import json
import sys
import tempfile
from pathlib import Path
import pytest
# Add scripts to path
sys.path.insert(0, str(Path(__file__).parent.parent / "scripts"))
from linter_runner import (
Violation,
LinterResult,
detect_languages,
parse_linter_output,
_result_to_dict,
EXTENSION_TO_LANGUAGE,
LINTERS_BY_LANGUAGE,
)
class TestLanguageDetection:
"""Test detect_languages() identifies languages correctly."""
def test_detects_python_files(self, tmp_path: Path):
(tmp_path / "main.py").write_text("print('hello')")
(tmp_path / "lib" / "utils.py").mkdir(parents=True)
(tmp_path / "lib" / "utils.py").write_text("def foo(): pass")
result = detect_languages(tmp_path)
assert "python" in result
assert len(result["python"]) == 2
def test_detects_javascript_files(self, tmp_path: Path):
(tmp_path / "app.js").write_text("console.log('hi')")
(tmp_path / "component.jsx").write_text("<div/>")
result = detect_languages(tmp_path)
assert "javascript" in result
assert len(result["javascript"]) == 2
def test_detects_shell_files(self, tmp_path: Path):
(tmp_path / "setup.sh").write_text("#!/bin/bash\necho hi")
(tmp_path / "build.sh").write_text("make")
result = detect_languages(tmp_path)
assert "shell" in result
assert len(result["shell"]) == 2
def test_detects_yaml_files(self, tmp_path: Path):
(tmp_path / "config.yml").write_text("key: value")
(tmp_path / "env.yaml").write_text("env: test")
result = detect_languages(tmp_path)
assert "yaml" in result
assert len(result["yaml"]) == 2
def test_ignores_git_directory(self, tmp_path: Path):
git_dir = tmp_path / ".git"
git_dir.mkdir()
(git_dir / "config").write_text("placeholder")
(tmp_path / "script.py").write_text("print(1)")
result = detect_languages(tmp_path)
assert "python" in result
assert not any(".git" in str(f) for f in result.get("python", []))
def test_returns_empty_for_nonexistent_path(self):
result = detect_languages(Path("/nonexistent/path/xyz"))
assert result == {}
def test_mixed_languages(self, tmp_path: Path):
(tmp_path / "app.py").write_text("")
(tmp_path / "main.js").write_text("")
(tmp_path / "deploy.sh").write_text("")
result = detect_languages(tmp_path)
langs = set(result.keys())
assert {"python", "javascript", "shell"} <= langs
def test_limits_files_to_known_languages(self, tmp_path: Path):
(tmp_path / "readme.txt").write_text("text")
(tmp_path / "data.csv").write_text("a,b,c")
result = detect_languages(tmp_path)
assert len(result) == 0
class TestViolationParsing:
"""Test parse_linter_output parses various linter formats."""
def test_parses_pylint_json(self):
stdout = json.dumps([
{"type": "error", "module": "test.py", "line": 10, "column": 5,
"message": "Missing docstring", "symbol": "missing-docstring"},
{"type": "warning", "module": "test.py", "line": 15, "column": 1,
"message": "Line too long", "symbol": "line-too-long"},
])
violations = parse_linter_output("pylint", stdout, "", Path("/repo"))
assert len(violations) == 2
assert violations[0].severity == "error"
assert violations[0].message == "Missing docstring"
assert violations[1].severity == "warning"
assert violations[1].code == "line-too-long"
def test_parses_ruff_json(self):
stdout = json.dumps([
{"filename": "src/main.py", "location": {"row": 5, "column": 1},
"code": "E501", "message": "Line too long"},
])
violations = parse_linter_output("ruff", stdout, "", Path("/repo"))
assert len(violations) == 1
assert violations[0].file == "src/main.py"
assert violations[0].line == 5
assert violations[0].code == "E501"
def test_parses_eslint_json(self):
stdout = json.dumps([
{"fileName": "app.js", "range": {"start": {"line": 2, "column": 0}},
"message": "Unexpected console statement", "severity": 2, "ruleId": "no-console"},
])
violations = parse_linter_output("eslint", stdout, "", Path("/repo"))
assert len(violations) == 1
assert violations[0].severity == "error"
assert violations[0].code == "no-console"
def test_parses_shellcheck_json1(self):
stdout = json.dumps({
"issues": [
{"file": "script.sh", "line": 3, "column": 1,
"message": "Quote this to prevent word splitting", "level": "warning", "code": "SC2086"},
]
})
violations = parse_linter_output("shellcheck", stdout, "", Path("/repo"))
assert len(violations) == 1
assert violations[0].severity == "warning"
assert violations[0].code == "SC2086"
def test_parses_yamllint_parsable(self):
stdout = "config.yaml:3:1: [error] wrong document start (document-start)\n"
violations = parse_linter_output("yamllint", stdout, "", Path("/repo"))
assert len(violations) == 1
assert violations[0].file == "config.yaml"
assert violations[0].line == 3
assert violations[0].severity == "error"
assert violations[0].code == "document-start"
def test_returns_empty_on_invalid_json(self):
stdout = "Not valid JSON"
violations = parse_linter_output("pylint", stdout, "", Path("/repo"))
assert violations == []
def test_strips_leading_slash_from_paths(self):
stdout = json.dumps([{"type": "error", "module": "/repo/src/test.py",
"line": 1, "column": 1, "message": "test", "symbol": "T001"}])
violations = parse_linter_output("pylint", stdout, "", Path("/repo"))
assert violations[0].file == "src/test.py"
class TestLinterResult:
"""Test LinterResult and JSON serialization."""
def test_result_to_dict_roundtrip(self):
v = Violation(file="test.py", line=10, column=5, message="msg",
severity="error", linter="pylint", code="E001")
r = LinterResult(linter_name="pylint", language="python", violations=[v])
d = _result_to_dict(r)
assert d["linter"] == "pylint"
assert d["violations"][0]["file"] == "test.py"
assert d["violations"][0]["code"] == "E001"
class TestIntegration:
"""End-to-end integration tests with temporary repos."""
def test_linter_runner_accepts_repo_path(self, tmp_path: Path):
(tmp_path / "main.py").write_text("print('hello')")
(tmp_path / "bad.py").write_text("import unused_module\nx=1")
from linter_runner import detect_languages, run_linters_for_language
langs = detect_languages(tmp_path)
assert "python" in langs
result = run_linters_for_language("python", langs["python"][:1], tmp_path)
assert result.language == "python"
assert result.violations or result.error # either linter output or not-installed
def test_json_output_structure(self, tmp_path: Path):
(tmp_path / "script.py").write_text("print(1)")
from linter_runner import detect_languages, run_linters_for_language, _result_to_dict
langs = detect_languages(tmp_path)
if "python" not in langs:
pytest.skip("No Python files detected")
result = run_linters_for_language("python", langs["python"], tmp_path)
report = {
"repo": tmp_path.name,
"languages": {"python": _result_to_dict(result)},
"summary": {
"total_issues": len(result.violations),
"errors": sum(1 for v in result.violations if v.severity == "error"),
},
}
json.dumps(report) # should not raise
if __name__ == "__main__":
print("Run: pytest tests/test_linter_runner.py -v")