Compare commits
1 Commit

step35/162...step35/140

| Author | SHA1 | Date |
|---|---|---|
|  | c75bd5094f |  |
16 knowledge/global/citations.yaml (Normal file)
@@ -0,0 +1,16 @@
# Key Papers to Track
# Configuration for citation_tracker.py
# Each paper needs a Semantic Scholar ID (s2_id) and title

papers:
  - s2_id: "CorpusId:215715652"
    title: "Attention Is All You Need"
    notes: "Foundational transformer paper by Vaswani et al. (2017)"

  - s2_id: "CorpusId:643390714"
    title: "Language Models are Few-Shot Learners"
    notes: "GPT-3 paper by Brown et al. (2020)"

  - s2_id: "arXiv:2303.18247"
    title: "Sovereign Intelligence: Local-First AI Agents"
    notes: "Timmy architecture paper (placeholder - update when published)"
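A quick way to sanity-check this config before running the tracker (a minimal sketch, assuming PyYAML is installed and it is run from the repo root; the checks mirror what load_key_papers() in citation_tracker.py below actually reads):

import yaml

# Validate that every entry carries the fields citation_tracker.py reads
with open("knowledge/global/citations.yaml") as f:
    data = yaml.safe_load(f)

for entry in data.get("papers", []):
    # s2_id is required (load_key_papers indexes it directly); title has a fallback
    assert "s2_id" in entry, f"missing s2_id: {entry}"
    if "title" not in entry:
        print(f"note: no title for {entry['s2_id']}, tracker will show 'Unknown'")

print(f"OK: {len(data.get('papers', []))} papers configured")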
235 scripts/citation_tracker.py (Executable file)
@@ -0,0 +1,235 @@
#!/usr/bin/env python3
"""
Citation Tracker — Monitor citations of key papers.
Tracks citation counts, identifies recent citing papers, and generates monthly reports.

Issue: #140 (7.8)
Categories: fact, pattern
"""

import argparse
import json
import re
import sys
import urllib.request
import urllib.error
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Any, Optional

SCRIPT_DIR = Path(__file__).parent.absolute()
KNOWLEDGE_DIR = SCRIPT_DIR.parent / "knowledge"
METRICS_DIR = SCRIPT_DIR.parent / "metrics"
INDEX_PATH = KNOWLEDGE_DIR / "index.json"

# Semantic Scholar API (free, no key required for basic lookups)
S2_API_BASE = "https://api.semanticscholar.org/graph/v1"


def fetch_paper(s2_id: str) -> Optional[Dict]:
    """Fetch paper metadata from Semantic Scholar."""
    url = f"{S2_API_BASE}/paper/{s2_id}?fields=title,year,citationCount,externalIds,publicationVenue,publicationTypes"
    try:
        with urllib.request.urlopen(url, timeout=10) as resp:
            return json.loads(resp.read())
    except (urllib.error.HTTPError, urllib.error.URLError) as e:
        print(f"Warning: Failed to fetch {s2_id}: {e}", file=sys.stderr)
        return None


def fetch_citations(s2_id: str, limit: int = 50) -> List[Dict]:
    """Fetch recent citing papers from Semantic Scholar."""
    url = f"{S2_API_BASE}/paper/{s2_id}/citations?fields=title,year,authors,publicationVenue,publicationTypes&limit={limit}"
    try:
        with urllib.request.urlopen(url, timeout=15) as resp:
            data = json.loads(resp.read())
            return [c["citingPaper"] for c in data.get("data", [])]
    except (urllib.error.HTTPError, urllib.error.URLError) as e:
        print(f"Warning: Failed to fetch citations for {s2_id}: {e}", file=sys.stderr)
        return []


def load_key_papers() -> List[Dict]:
    """Load key papers list from citations.yaml."""
    config_path = KNOWLEDGE_DIR / "global" / "citations.yaml"
    if not config_path.exists():
        print(f"Error: {config_path} not found. Create it with key papers list.", file=sys.stderr)
        sys.exit(1)

    import yaml
    with open(config_path) as f:
        data = yaml.safe_load(f)

    papers = []
    for entry in data.get("papers", []):
        papers.append({
            "id": entry["s2_id"],
            "title": entry.get("title", "Unknown"),
            "notes": entry.get("notes", "")
        })
    return papers


def load_index() -> Dict:
    """Load or initialize knowledge index."""
    if INDEX_PATH.exists():
        with open(INDEX_PATH) as f:
            return json.load(f)
    return {"version": 1, "last_updated": "", "total_facts": 0, "facts": []}


def save_index(index: Dict) -> None:
    """Save knowledge index."""
    KNOWLEDGE_DIR.mkdir(parents=True, exist_ok=True)
    with open(INDEX_PATH, "w") as f:
        json.dump(index, f, indent=2)


def add_citation_fact(index: Dict, fact: str, repo: str, confidence: float,
                      tags: List[str], source_count: int = 1) -> None:
    """Add a new citation fact to the index."""
    # Determine next sequence number for citation facts in this domain
    domain = "global"
    category = "fact"
    prefix = f"{domain}:{category}:"
    seq_nums = []
    for f in index["facts"]:
        if f["id"].startswith(prefix):
            try:
                seq = int(f["id"].split(":")[-1])
                seq_nums.append(seq)
            except ValueError:
                continue
    next_seq = max(seq_nums, default=0) + 1
    new_id = f"{domain}:{category}:{next_seq:03d}"

    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    fact_entry = {
        "id": new_id,
        "fact": fact,
        "category": category,
        "domain": domain,
        "confidence": confidence,
        "tags": tags,
        "source_count": source_count,
        "first_seen": today,
        "last_confirmed": today
    }
    index["facts"].append(fact_entry)
    index["total_facts"] = len(index["facts"])
    index["last_updated"] = datetime.now(timezone.utc).isoformat()


def update_citation_data() -> None:
    """Update citation counts and facts for all key papers."""
    papers = load_key_papers()
    index = load_index()
    updated = 0

    for paper in papers:
        s2_id = paper["id"]
        title = paper["title"]

        # Fetch current paper data
        data = fetch_paper(s2_id)
        if not data:
            continue

        citation_count = data.get("citationCount", 0)
        external_ids = data.get("externalIds", {})
        arxiv_id = external_ids.get("ArXiv") if external_ids else None

        # Add citation count fact (high confidence - directly from API)
        count_fact = f"Paper '{title}' (S2:{s2_id}) has {citation_count} citations as of {datetime.now(timezone.utc).strftime('%Y-%m-%d')}"
        if arxiv_id:
            count_fact += f" [arXiv:{arxiv_id}]"

        add_citation_fact(
            index=index,
            fact=count_fact,
            repo="compounding-intelligence",
            confidence=0.95,
            tags=["citation", "tracking", "paper", s2_id],
            source_count=1
        )
        updated += 1

        # Fetch recent citations (limited batch)
        citations = fetch_citations(s2_id, limit=20)
        for citation in citations:
            citing_title = citation.get("title", "Unknown")
            citing_year = citation.get("year", "Unknown year")
            authors = citation.get("authors", [])
            author_names = [a.get("name", "") for a in authors[:3]]
            if len(authors) > 3:
                author_names.append("et al.")

            cite_fact = f"Paper '{citing_title}' ({', '.join(author_names)}, {citing_year}) cites '{title}'"
            add_citation_fact(
                index=index,
                fact=cite_fact,
                repo="compounding-intelligence",
                confidence=0.8,
                tags=["citation", "citing-paper", s2_id],
                source_count=1
            )

        print(f"Updated: {title} — {citation_count} citations, {len(citations)} recent")

    save_index(index)
    print(f"\nUpdated {updated} papers. Total facts in index: {index['total_facts']}")


def generate_monthly_report(month: Optional[str] = None) -> str:
    """Generate a monthly citation report."""
    target_month = month or datetime.now(timezone.utc).strftime("%Y-%m")
    year, mon = map(int, target_month.split("-"))

    index = load_index()
    monthly_facts = []

    for fact in index["facts"]:
        last_confirmed = fact.get("last_confirmed", "")
        if last_confirmed.startswith(f"{year}-{mon:02d}"):
            monthly_facts.append(fact)

    # Build report
    lines = []
    lines.append(f"# Citation Tracker Monthly Report — {target_month}")
    lines.append("")
    lines.append(f"Generated: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}")
    lines.append(f"Total citation facts this month: {len(monthly_facts)}")
    lines.append("")

    # Group by paper
    from collections import defaultdict
    by_paper = defaultdict(list)
    for fact in monthly_facts:
        # Extract the tracked paper's title from the fact text: for citing-paper
        # facts it is the quoted title after "cites"; otherwise the first quoted title
        text = fact["fact"]
        m = re.search(r"cites '([^']+)'", text) or re.search(r"'([^']+)'", text)
        key = m.group(1) if m else text
        by_paper[key].append(fact)

    for paper_title, facts in by_paper.items():
        lines.append(f"## {paper_title}")
        for f in facts:
            lines.append(f"- {f['fact']} (confidence: {f['confidence']})")
        lines.append("")

    report = "\n".join(lines)

    # Save report
    METRICS_DIR.mkdir(parents=True, exist_ok=True)
    report_path = METRICS_DIR / f"citation_report_{target_month}.md"
    with open(report_path, "w") as f:
        f.write(report)

    print(f"Monthly report saved to: {report_path}")
    return report


def main() -> None:
    parser = argparse.ArgumentParser(description="Citation Tracker — Monitor key paper citations")
    parser.add_argument("--update", action="store_true", help="Fetch latest citation data")
    parser.add_argument("--report", action="store_true", help="Generate monthly report")
    parser.add_argument("--month", type=str, help="Month for report (YYYY-MM), defaults to current")
    args = parser.parse_args()

    if args.update:
        update_citation_data()
    elif args.report:
        generate_monthly_report(args.month)
    else:
        parser.print_help()


if __name__ == "__main__":
    main()
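A quick sanity check of the ID sequencing in add_citation_fact() above: fact IDs take the form domain:category:NNN, and the next sequence number is one past the current maximum rather than the fact count, so deletions never cause ID reuse. A minimal sketch with a hypothetical in-memory index:

def next_fact_id(index, domain="global", category="fact"):
    """Compute the next domain:category:NNN id, as add_citation_fact() does."""
    prefix = f"{domain}:{category}:"
    seqs = [int(f["id"].split(":")[-1])
            for f in index["facts"] if f["id"].startswith(prefix)]
    return f"{prefix}{max(seqs, default=0) + 1:03d}"

# Hypothetical index mirroring load_index()'s empty shape
index = {"version": 1, "last_updated": "", "total_facts": 0, "facts": []}
print(next_fact_id(index))   # global:fact:001 on an empty index

index["facts"].append({"id": "global:fact:007"})
print(next_fact_id(index))   # global:fact:008 (max + 1, not count + 1)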
366 scripts/code_duplication_detector.py (Deleted file)
@@ -1,366 +0,0 @@
#!/usr/bin/env python3
"""
Code Duplication Detector — Issue #162

Finds duplicate functions and code blocks across Python source files.
Reports duplication percentage and outputs a duplication report.

Usage:
    python3 scripts/code_duplication_detector.py --output reports/code_duplication.json
    python3 scripts/code_duplication_detector.py --directory scripts/ --dry-run
    python3 scripts/code_duplication_detector.py --test  # Run built-in test
"""

import argparse
import ast
import hashlib
import json
import os
import re
import sys
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Tuple, Optional


# ── AST helpers ────────────────────────────────────────────────────────────

def normalize_code(text: str) -> str:
    """Normalize code for comparison: strip comments, normalize whitespace."""
    # Remove comments (both # and docstring triple-quote strings)
    text = re.sub(r'#.*$', '', text, flags=re.MULTILINE)
    text = re.sub(r'""".*?"""', '', text, flags=re.DOTALL)
    text = re.sub(r"'''.*?'''", '', text, flags=re.DOTALL)
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()


def code_hash(text: str) -> str:
    """SHA256 hash of normalized code for exact duplicate detection."""
    normalized = normalize_code(text)
    return hashlib.sha256(normalized.encode('utf-8')).hexdigest()


# ── Function extraction via AST ────────────────────────────────────────────

class FunctionExtractor:
    """Extract function and method definitions with their full source bodies."""

    def __init__(self, source: str, filepath: str):
        self.source = source
        self.filepath = filepath
        self.lines = source.splitlines()
        self.functions: List[Dict] = []

    def _get_source_segment(self, start_lineno: int, end_lineno: int) -> str:
        """Get source code from start to end line (1-indexed, inclusive)."""
        # AST end_lineno is inclusive
        start_idx = start_lineno - 1
        end_idx = end_lineno
        return '\n'.join(self.lines[start_idx:end_idx])

    def visit(self, tree):
        """Collect all function and async function definitions."""
        for node in ast.walk(tree):
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                # Get the full source for this function
                start = node.lineno
                end = node.end_lineno
                body_source = self._get_source_segment(start, end)

                # Also collect parent class name if this is a method
                class_name = None
                parent = node.parent if hasattr(node, 'parent') else None
                if parent and isinstance(parent, ast.ClassDef):
                    class_name = parent.name

                self.functions.append({
                    'name': node.name,
                    'file': self.filepath,
                    'start_line': start,
                    'end_line': end,
                    'body': body_source,
                    'class_name': class_name,
                    'is_method': class_name is not None,
                })


class ParentNodeVisitor(ast.NodeVisitor):
    """Annotate nodes with parent references.

    Note: extract_functions_from_file() also annotates parents inline;
    this visitor is an equivalent alternative.
    """

    def generic_visit(self, node):
        for child in ast.iter_child_nodes(node):
            child.parent = node
        super().generic_visit(node)


def extract_functions_from_file(filepath: str) -> List[Dict]:
    """Extract all function definitions from a Python file."""
    try:
        with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
            source = f.read()
        tree = ast.parse(source, filename=str(filepath))

        # Annotate with parent references
        for node in ast.walk(tree):
            for child in ast.iter_child_nodes(node):
                child.parent = node

        extractor = FunctionExtractor(source, str(filepath))
        extractor.visit(tree)
        return extractor.functions
    except (SyntaxError, UnicodeDecodeError, OSError):
        return []


def scan_directory(directory: str, extensions: Tuple[str, ...] = ('.py',)) -> List[Dict]:
    """Scan directory for Python files and extract all functions."""
    all_functions = []
    path = Path(directory)

    for filepath in path.rglob('*'):
        if filepath.is_file() and filepath.suffix in extensions:
            # Skip common non-source dirs
            parts = filepath.parts
            if any(ex in parts for ex in ('__pycache__', 'node_modules', '.git', 'venv', '.venv', 'dist', 'build')):
                continue
            if filepath.name.startswith('.'):
                continue

            functions = extract_functions_from_file(str(filepath))
            all_functions.extend(functions)

    return all_functions


# ── Duplicate detection ─────────────────────────────────────────────────────

def find_duplicates(functions: List[Dict], similarity_threshold: float = 0.95) -> Dict:
    """
    Find duplicate and near-duplicate functions.

    Returns dict with:
    - exact_duplicates: {hash: [function_info, ...]}
    - near_duplicates: [[function_info, ...], ...]
    - stats: total_functions, unique_exact, exact_dupe_count, near_dupe_count
    """
    # Phase 1: Exact duplicates by code hash
    hash_groups: Dict[str, List[Dict]] = defaultdict(list)
    for func in functions:
        h = code_hash(func['body'])
        hash_groups[h].append(func)

    exact_duplicates = {h: group for h, group in hash_groups.items() if len(group) > 1}
    exact_dupe_count = sum(len(group) - 1 for group in exact_duplicates.values())

    # Phase 2: Near-duplicates (among the unique-by-hash set)
    # We compare token overlap for functions that have different hashes
    unique_by_hash = [funcs[0] for funcs in hash_groups.values()]
    near_duplicate_groups = []

    # Simple token-based similarity (Jaccard index over identifier tokens)
    def tokenize(code: str) -> set:
        return set(re.findall(r'[a-zA-Z_][a-zA-Z0-9_]*', code.lower()))

    i = 0
    while i < len(unique_by_hash):
        group = [unique_by_hash[i]]
        tokens_i = tokenize(unique_by_hash[i]['body'])
        j = i + 1
        while j < len(unique_by_hash):
            tokens_j = tokenize(unique_by_hash[j]['body'])
            if not tokens_i or not tokens_j:
                j += 1
                continue
            intersection = tokens_i & tokens_j
            union = tokens_i | tokens_j
            similarity = len(intersection) / len(union) if union else 0.0

            if similarity >= similarity_threshold:
                group.append(unique_by_hash[j])
                unique_by_hash.pop(j)
            else:
                j += 1

        if len(group) > 1:
            near_duplicate_groups.append(group)
        i += 1

    near_dupe_count = sum(len(g) - 1 for g in near_duplicate_groups)

    stats = {
        'total_functions': len(functions),
        'unique_exact': len(hash_groups),
        'exact_dupe_count': exact_dupe_count,
        'near_dupe_count': near_dupe_count,
        'total_duplicates': exact_dupe_count + near_dupe_count,
    }

    # Calculate duplication percentage based on lines
    total_lines = sum(f['end_line'] - f['start_line'] + 1 for f in functions)
    dupe_lines = 0
    for group in exact_duplicates.values():
        # Count all but one as duplicates
        for f in group[1:]:
            dupe_lines += f['end_line'] - f['start_line'] + 1
    for group in near_duplicate_groups:
        for f in group[1:]:
            dupe_lines += f['end_line'] - f['start_line'] + 1

    stats['total_lines'] = total_lines
    stats['duplicate_lines'] = dupe_lines
    stats['duplication_percentage'] = round((dupe_lines / total_lines * 100) if total_lines else 0, 2)

    return {
        'exact_duplicates': exact_duplicates,
        'near_duplicates': near_duplicate_groups,
        'stats': stats,
    }


# ── Report generation ────────────────────────────────────────────────────────

def generate_report(results: Dict, output_format: str = 'json') -> str:
    """Generate human-readable report from detection results."""
    stats = results['stats']

    if output_format == 'json':
        return json.dumps(results, indent=2, default=str)

    # Text report
    lines = [
        "=" * 60,
        " CODE DUPLICATION REPORT",
        "=" * 60,
        f" Total functions scanned: {stats['total_functions']}",
        f" Unique functions: {stats['unique_exact']}",
        f" Exact duplicates: {stats['exact_dupe_count']}",
        f" Near-duplicates: {stats['near_dupe_count']}",
        f" Total lines: {stats['total_lines']}",
        f" Duplicate lines: {stats['duplicate_lines']}",
        f" Duplication %: {stats['duplication_percentage']}%",
        "",
    ]

    if results['exact_duplicates']:
        lines.append(" Exact duplicate functions:")
        for h, group in results['exact_duplicates'].items():
            first = group[0]
            lines.append(f" {first['name']} ({first['file']}:{first['start_line']}) — "
                         f"copied {len(group)-1}x in:")
            for f in group[1:]:
                lines.append(f" → {f['file']}:{f['start_line']}")
        lines.append("")

    if results['near_duplicates']:
        lines.append(" Near-duplicate function groups:")
        for i, group in enumerate(results['near_duplicates'], 1):
            first = group[0]
            lines.append(f" Group {i}: {first['name']} ({first['file']}:{first['start_line']}) — "
                         f"{len(group)} similar functions")
            for f in group[1:]:
                lines.append(f" → {f['file']}:{f['start_line']}")
            lines.append("")

    lines.append("=" * 60)
    return '\n'.join(lines)


# ── CLI ─────────────────────────────────────────────────────────────────────

def main():
    parser = argparse.ArgumentParser(description="Code Duplication Detector")
    parser.add_argument('--directory', default='.',
                        help='Directory to scan (default: current directory)')
    parser.add_argument('--output', help='Output file for JSON report')
    parser.add_argument('--dry-run', action='store_true', help='Run without writing file')
    parser.add_argument('--threshold', type=float, default=0.95,
                        help='Similarity threshold for near-dupes (default: 0.95)')
    parser.add_argument('--json', action='store_true', help='JSON output to stdout')
    parser.add_argument('--test', action='store_true', help='Run built-in test')
    args = parser.parse_args()

    if args.test:
        _run_test()
        return

    # Scan
    functions = scan_directory(args.directory)

    # Detect duplicates
    results = find_duplicates(functions, similarity_threshold=args.threshold)
    stats = results['stats']

    # Output
    if args.json:
        print(json.dumps(results, indent=2, default=str))
    else:
        print(generate_report(results, output_format='text'))

    # Write file if requested
    if args.output and not args.dry_run:
        os.makedirs(os.path.dirname(args.output) or '.', exist_ok=True)
        with open(args.output, 'w') as f:
            json.dump(results, f, indent=2, default=str)
        print(f"\nReport written to: {args.output}")

    # Summary for burn protocol
    print(f"\n✓ Detection complete: {stats['exact_dupe_count']} exact + "
          f"{stats['near_dupe_count']} near duplicates found "
          f"({stats['duplication_percentage']}% duplication)")


def _run_test():
    """Built-in smoke test."""
    import tempfile

    with tempfile.TemporaryDirectory() as tmpdir:
        # Create test files with duplicate code
        f1 = Path(tmpdir) / 'mod1.py'
        f1.write_text('''
def hello():
    print("hello world")

def duplicated_function():
    x = 1
    y = 2
    return x + y

def unique_func():
    return 42
''')

        f2 = Path(tmpdir) / 'mod2.py'
        f2.write_text('''
def duplicated_function():
    x = 1
    y = 2
    return x + y

def another_unique():
    return "different"
''')

        functions = scan_directory(tmpdir)
        results = find_duplicates(functions)

        stats = results['stats']
        assert stats['exact_dupe_count'] >= 1, "Should find at least 1 exact duplicate"
        assert stats['total_functions'] >= 4, "Should find at least 4 functions"

        # Check duplication percentage is calculated
        assert 'duplication_percentage' in stats
        print(f"\n✓ Test passed: {stats['total_functions']} functions, "
              f"{stats['exact_dupe_count']} exact duplicates, "
              f"{stats['duplication_percentage']}% duplication")


if __name__ == '__main__':
    main()
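For intuition on the near-duplicate phase in the deleted detector above: two function bodies are scored by the Jaccard index over their identifier tokens, and grouped when the score reaches the threshold (0.95 by default). A small worked example with hypothetical inputs, reusing the same tokenizer logic:

import re

def tokenize(code: str) -> set:
    """Identifier tokens, lowercased (same regex as the detector)."""
    return set(re.findall(r'[a-zA-Z_][a-zA-Z0-9_]*', code.lower()))

a = tokenize("def add(x, y): return x + y")   # {def, add, x, y, return}
b = tokenize("def add(p, q): return p + q")   # {def, add, p, q, return}

# Jaccard index: |A ∩ B| / |A ∪ B| = 3 / 7 ≈ 0.43, far below 0.95,
# so renaming every variable is enough to defeat this simple measure
print(round(len(a & b) / len(a | b), 2))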
31 scripts/test_citation_tracker.py (Executable file)
@@ -0,0 +1,31 @@
#!/usr/bin/env python3
import sys
sys.path.insert(0, "/Users/apayne/burn-clone/STEP35-compounding-intelligence-140/scripts")
import yaml
from pathlib import Path

KNOWLEDGE_DIR = Path("/Users/apayne/burn-clone/STEP35-compounding-intelligence-140/knowledge")
config_path = KNOWLEDGE_DIR / "global" / "citations.yaml"

with open(config_path) as f:
    data = yaml.safe_load(f)

papers = data.get("papers", [])
print(f"Loaded {len(papers)} key papers:")
for p in papers:
    print(f" - {p['s2_id']}: {p['title']}")

# Test that citation_tracker module loads
import importlib.util
spec = importlib.util.spec_from_file_location(
    "citation_tracker",
    "/Users/apayne/burn-clone/STEP35-compounding-intelligence-140/scripts/citation_tracker.py")
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)
print("Module loaded successfully")

# Test fetch functions (with mock/real API)
result = mod.fetch_paper("CorpusId:215715652")  # Attention Is All You Need
if result:
    print(f"Paper fetched: {result.get('title')} — {result.get('citationCount')} citations")
else:
    print("Paper fetch failed (may be network issue)")
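The hardcoded /Users/apayne/... paths tie this test to one checkout. A minimal relative-path variant, assuming the test stays in scripts/ next to citation_tracker.py (the same convention SCRIPT_DIR uses inside the tracker itself):

import sys
from pathlib import Path

# Resolve everything relative to this file instead of an absolute checkout path
SCRIPT_DIR = Path(__file__).parent.absolute()
sys.path.insert(0, str(SCRIPT_DIR))
KNOWLEDGE_DIR = SCRIPT_DIR.parent / "knowledge"
config_path = KNOWLEDGE_DIR / "global" / "citations.yaml"

import citation_tracker  # plain import works once scripts/ is on sys.path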
@@ -1,168 +0,0 @@
#!/usr/bin/env python3
"""
Smoke test for code duplication detector — verifies:
- Function extraction from Python files
- Exact duplicate detection
- Near-duplicate detection (token similarity)
- Report generation and stats
- JSON output format
"""

import json
import sys
import tempfile
from pathlib import Path

SCRIPT_DIR = Path(__file__).parent.absolute()
sys.path.insert(0, str(SCRIPT_DIR))

from code_duplication_detector import (
    extract_functions_from_file,
    scan_directory,
    find_duplicates,
    generate_report,
)


def test_extract_functions():
    """Test that function extraction works."""
    with tempfile.TemporaryDirectory() as tmpdir:
        test_file = Path(tmpdir) / 'sample.py'
        test_file.write_text('''
def foo():
    return 1

def bar():
    return 2

class MyClass:
    def method(self):
        return 3
''')
        functions = extract_functions_from_file(str(test_file))
        assert len(functions) == 3, f"Expected 3 functions, got {len(functions)}"
        names = {f['name'] for f in functions}
        assert names == {'foo', 'bar', 'method'}, f"Names mismatch: {names}"
        print(" [PASS] function extraction works")


def test_exact_duplicate_detection():
    """Test that identical functions are flagged as duplicates."""
    with tempfile.TemporaryDirectory() as tmpdir:
        # Create two files with the same function
        f1 = Path(tmpdir) / 'a.py'
        f1.write_text('''
def duplicated():
    x = 1
    y = 2
    return x + y
''')
        f2 = Path(tmpdir) / 'b.py'
        f2.write_text('''
def duplicated():
    x = 1
    y = 2
    return x + y
''')
        functions = scan_directory(tmpdir)
        results = find_duplicates(functions)
        stats = results['stats']
        assert stats['exact_dupe_count'] >= 1, f"Expected exact duplicate, got count={stats['exact_dupe_count']}"
        assert len(results['exact_duplicates']) >= 1, "Should have at least one duplicate group"
        print(" [PASS] exact duplicate detection works")


def test_unique_functions_not_flagged():
    """Test that different functions are not flagged as duplicates."""
    with tempfile.TemporaryDirectory() as tmpdir:
        f1 = Path(tmpdir) / 'a.py'
        f1.write_text('def func_a(): return 1')
        f2 = Path(tmpdir) / 'b.py'
        f2.write_text('def func_b(): return 2')
        functions = scan_directory(tmpdir)
        results = find_duplicates(functions)
        assert results['stats']['exact_dupe_count'] == 0
        assert len(results['exact_duplicates']) == 0
        print(" [PASS] unique functions not flagged as duplicates")


def test_duplication_percentage_calculated():
    """Test that duplication percentage is computed."""
    with tempfile.TemporaryDirectory() as tmpdir:
        # Create files with mostly duplicated content
        f1 = Path(tmpdir) / 'a.py'
        f1.write_text('''
def common():
    x = 1
    y = 2
    return x + y

def unique1():
    return 100
''')
        f2 = Path(tmpdir) / 'b.py'
        f2.write_text('''
def common():
    x = 1
    y = 2
    return x + y

def unique2():
    return 200
''')
        functions = scan_directory(tmpdir)
        results = find_duplicates(functions)
        stats = results['stats']
        assert 'duplication_percentage' in stats
        # common() spans 4 lines in each file, each unique function 2 lines = 12 total
        # Duplicate lines = 4 (one copy of common marked duplicate) → ~33%
        assert stats['duplication_percentage'] > 0
        print(f" [PASS] duplication percentage computed: {stats['duplication_percentage']}%")


def test_report_output_format():
    """Test that report output is valid."""
    with tempfile.TemporaryDirectory() as tmpdir:
        f1 = Path(tmpdir) / 'a.py'
        f1.write_text('def dup(): return 1')
        f2 = Path(tmpdir) / 'b.py'
        f2.write_text('def dup(): return 1')
        functions = scan_directory(tmpdir)
        results = find_duplicates(functions)

        # Text report
        text = generate_report(results, output_format='text')
        assert 'CODE DUPLICATION REPORT' in text
        assert 'Total functions' in text
        print(" [PASS] text report format valid")

        # JSON report
        json_out = generate_report(results, output_format='json')
        data = json.loads(json_out)
        assert 'stats' in data
        assert 'exact_duplicates' in data
        print(" [PASS] JSON report format valid")


def test_scan_directory_recursive():
    """Test that nested directories are scanned."""
    with tempfile.TemporaryDirectory() as tmpdir:
        subdir = Path(tmpdir) / 'sub'
        subdir.mkdir()
        (subdir / 'nested.py').write_text('def nested(): pass')
        (Path(tmpdir) / 'root.py').write_text('def root(): pass')
        functions = scan_directory(tmpdir)
        names = {f['name'] for f in functions}
        assert 'nested' in names and 'root' in names
        print(" [PASS] recursive directory scanning works")


if __name__ == '__main__':
    print("Running code duplication detector smoke tests...")
    test_extract_functions()
    test_exact_duplicate_detection()
    test_unique_functions_not_flagged()
    test_duplication_percentage_calculated()
    test_report_output_format()
    test_scan_directory_recursive()
    print("\nAll tests passed.")