Compare commits

..

2 Commits

3 changed files with 240 additions and 221 deletions

View File

@@ -0,0 +1,131 @@
#!/usr/bin/env python3
"""
Gitea Issue Body Parser — Extract structured data from markdown issue bodies.
Usage:
cat issue_body.txt | python3 scripts/gitea_issue_parser.py --stdin --pretty
python3 scripts/gitea_issue_parser.py --url https://forge.../api/v1/repos/.../issues/123 --pretty
python3 scripts/gitea_issue_parser.py body.txt --title "Fix thing (#42)" --labels pipeline extraction
"""
import argparse
import json
import re
import sys
from typing import Dict, List, Any, Optional
def parse_issue_body(body: str, title: str = "", labels: Optional[List[str]] = None) -> Dict[str, Any]:
    """Parse a Gitea issue markdown body into structured JSON.

    Extracted fields:
    - title: Issue title (passed through unchanged)
    - context: Text before the first ``##`` heading; when the body opens with
      a heading, the first paragraph of that first section; when there are
      no headings, the first paragraph of the body. Capped at 500 chars.
    - criteria[]: Acceptance criteria as ``{"text", "checked"}`` dicts, taken
      from task-list checkboxes anywhere in the body, or (if there are none)
      from a numbered list in the "Acceptance Criteria" / "Criteria" section
    - labels[]: Issue labels
    - epic_ref: Parent/epic issue number, from "Closes #N"-style keywords or,
      failing that, the first "#N" in the title or body
    - sections{}: All ## sections, keyed by lower-cased, underscored heading

    Args:
        body: Markdown issue body; ``None`` or empty is treated as no body.
        title: Issue title; also searched for the epic reference.
        labels: Label names to copy into the result.

    Returns:
        A dict with the keys listed above.
    """
    # Normalise CRLF/CR so the line-anchored patterns and the blank-line
    # paragraph split below behave the same for any newline style.
    body = (body or "").replace("\r\n", "\n").replace("\r", "\n")
    result: Dict[str, Any] = {
        "title": title,
        "context": "",
        "criteria": [],
        "labels": labels or [],
        "epic_ref": None,
        "sections": {},
    }

    # Epic reference: done before the empty-body return so that a reference
    # carried only by the title is still picked up.
    haystack = f"{title or ''} {body}"
    epic_patterns = (
        r"\b(?:closes|fixes|addresses|refs?)\s+#(\d+)",  # explicit keyword wins
        r"#(\d+)",  # otherwise the first bare issue number
    )
    for pattern in epic_patterns:
        match = re.search(pattern, haystack, re.IGNORECASE)
        if match:
            result["epic_ref"] = int(match.group(1))
            break

    if not body:
        return result

    # Parse ## sections. The newline after the heading is optional so that a
    # heading on the very last line still registers (with empty content).
    section_pattern = r"^##[ \t]+(.+?)$\n?((?:^(?!##[ \t]).*$\n?)*)"
    for match in re.finditer(section_pattern, body, re.MULTILINE):
        section_name = match.group(1).strip().lower().replace(" ", "_")
        result["sections"][section_name] = match.group(2).strip()

    # Acceptance criteria from task-list checkboxes. Only horizontal
    # whitespace is allowed between tokens so that an empty checkbox cannot
    # swallow the following line as its text.
    checkbox_pattern = r"^[ \t]*[-*+][ \t]*\[([ xX])\][ \t]*(.+)$"
    for match in re.finditer(checkbox_pattern, body, re.MULTILINE):
        text = match.group(2).strip()
        if text:
            result["criteria"].append(
                {"text": text, "checked": match.group(1).lower() == "x"}
            )

    # No checkboxes: fall back to a numbered list in the criteria section.
    # Section keys are underscored, so only underscored names can match.
    if not result["criteria"]:
        for section_name in ("acceptance_criteria", "criteria"):
            if section_name in result["sections"]:
                numbered = r"^\s*\d+[.)]\s*(.+)$"
                for match in re.finditer(numbered, result["sections"][section_name], re.MULTILINE):
                    result["criteria"].append({"text": match.group(1).strip(), "checked": False})
                break

    # Context extraction. Headings are located at line starts only, so a
    # "## " inside a line or the tail of a "### " marker is never mistaken
    # for the boundary.
    heading_re = re.compile(r"^#{2,6}[ \t]", re.MULTILINE)
    first = heading_re.search(body)
    lead = body[: first.start()] if first else body
    keep_all_paragraphs = first is not None
    if first and not lead.strip():
        # Body opens with a heading: use the text of that first section.
        after = body[first.start():].partition("\n")[2]
        following = heading_re.search(after)
        lead = after[: following.start()] if following else after
        keep_all_paragraphs = False
    # Drop one leading heading line (e.g. a "# Title") in its entirety.
    lead = re.sub(r"\A\s*#{1,6}[ \t]+[^\n]*\n?", "", lead).strip()
    if not keep_all_paragraphs:
        lead = lead.split("\n\n")[0].strip()
    result["context"] = lead[:500]  # Cap at 500 chars
    return result
def fetch_issue_from_url(url: str, timeout: float = 30.0) -> Dict[str, Any]:
    """Fetch an issue from a Gitea API URL and parse it.

    Args:
        url: URL returning the issue as a JSON object; the "body", "title"
            and "labels" keys are read from it.
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive host cannot hang the caller indefinitely.

    Returns:
        The structured dict produced by ``parse_issue_body``.

    Raises:
        urllib.error.URLError: If the request fails.
        json.JSONDecodeError: If the response is not valid JSON.
    """
    import urllib.request

    # NOTE(review): urlopen accepts any scheme it has a handler for (including
    # file://); restrict upstream if the URL can come from an untrusted source.
    req = urllib.request.Request(url, headers={"Accept": "application/json"})
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        data = json.loads(resp.read())

    # A key that is present but null would otherwise reach the parser (or the
    # label loop) as None, so fall back to empty values explicitly.
    label_names = [label["name"] for label in data.get("labels") or []]
    return parse_issue_body(
        body=data.get("body") or "",
        title=data.get("title") or "",
        labels=label_names,
    )
def main():
    """Command-line entry point: parse an issue and print it as JSON.

    The issue source is chosen in this order: ``--url``, then ``--stdin``,
    then the positional file. With none of them the help text is printed and
    the process exits with status 1. ``--title`` and ``--labels`` apply only
    to the stdin and file sources.
    """
    parser = argparse.ArgumentParser(description="Parse Gitea issue markdown into structured JSON")
    parser.add_argument("file", nargs="?", help="Issue body file (or use --stdin)")
    parser.add_argument("--stdin", action="store_true", help="Read from stdin")
    parser.add_argument("--url", help="Gitea API URL to fetch issue from")
    parser.add_argument("--title", default="", help="Issue title")
    parser.add_argument("--labels", nargs="*", default=[], help="Issue labels")
    parser.add_argument("--pretty", action="store_true", help="Pretty-print JSON output")
    args = parser.parse_args()

    if args.url:
        result = fetch_issue_from_url(args.url)
    elif args.stdin:
        result = parse_issue_body(sys.stdin.read(), args.title, args.labels)
    elif args.file:
        # Read as UTF-8 explicitly rather than relying on the locale default,
        # which differs between platforms.
        with open(args.file, encoding="utf-8") as f:
            body = f.read()
        result = parse_issue_body(body, args.title, args.labels)
    else:
        parser.print_help()
        sys.exit(1)

    print(json.dumps(result, indent=2 if args.pretty else None))


if __name__ == "__main__":
    main()

View File

@@ -1,221 +0,0 @@
#!/usr/bin/env python3
"""
Knowledge Store Staleness Detector
Checks knowledge entries against their source files to detect staleness.
An entry is stale when its source file has been modified since extraction.
Usage:
python3 scripts/knowledge_staleness_check.py knowledge/index.json
python3 scripts/knowledge_staleness_check.py --repo /path/to/repo --index knowledge/index.json
python3 scripts/knowledge_staleness_check.py --index knowledge/index.json --fix
Expected index.json format:
{
"version": 1,
"facts": [
{
"fact": "...",
"category": "fact|pitfall|pattern|tool-quirk",
"repo": "repo-name",
"confidence": 0.8,
"source_file": "path/to/file.py",
"source_hash": "sha256:abcdef...",
"extracted_at": "2026-04-13T20:00:00Z"
}
]
}
"""
import argparse
import hashlib
import json
import sys
from pathlib import Path
from typing import Optional
def compute_file_hash(filepath: str) -> Optional[str]:
    """Return a short SHA-256 fingerprint of a file's contents.

    Args:
        filepath: Path of the file to hash.

    Returns:
        The first 16 hex characters of the SHA-256 digest, or ``None`` when
        the path does not refer to a readable regular file (missing, or a
        directory).
    """
    path = Path(filepath)
    if not path.is_file():
        return None

    digest = hashlib.sha256()
    try:
        with path.open("rb") as handle:
            # Hash in fixed-size chunks so large files are never held in
            # memory in full.
            for chunk in iter(lambda: handle.read(65536), b""):
                digest.update(chunk)
    except FileNotFoundError:
        # The file vanished between the check above and the open.
        return None
    return digest.hexdigest()[:16]
def check_staleness(index_path: str, repo_root: Optional[str] = None) -> dict:
    """Check all entries in the knowledge index for staleness.

    Args:
        index_path: Path to the index JSON file (an object with a "facts" list).
        repo_root: Directory that "source_file" paths are relative to.
            Defaults to the grandparent of the index file
            (knowledge/index.json -> repo root).

    Returns:
        ``{"error": ...}`` when the index file does not exist; otherwise a
        dict with the counters "total", "stale", "fresh", "no_source" and
        "missing_files", plus "stale_entries" describing every entry that is
        stale ("no_hash", "hash_mismatch") or whose file is gone
        ("file_missing"). Missing files are listed but counted separately
        from "stale".
    """
    index = Path(index_path)
    if not index.exists():
        return {"error": f"Index not found: {index_path}"}

    data = json.loads(index.read_text(encoding="utf-8"))
    facts = data.get("facts", [])
    results = {
        "total": len(facts),
        "stale": 0,
        "fresh": 0,
        "no_source": 0,
        "missing_files": 0,
        "stale_entries": [],
    }
    if not facts:
        return results

    root = Path(repo_root) if repo_root else index.parent.parent
    prefix = "sha256:"
    hash_cache = {}  # source_file -> current hash; many facts can share a file

    for i, entry in enumerate(facts):
        source_file = entry.get("source_file")
        stored_hash = entry.get("source_hash")
        if not source_file:
            results["no_source"] += 1
            continue

        summary = {
            "index": i,
            "fact": (entry.get("fact") or "")[:100],
            "source_file": source_file,
        }
        if not stored_hash:
            # Entry has source file but no hash — consider stale
            results["stale"] += 1
            results["stale_entries"].append({**summary, "reason": "no_hash"})
            continue

        if source_file not in hash_cache:
            hash_cache[source_file] = compute_file_hash(str(root / source_file))
        current_hash = hash_cache[source_file]

        if current_hash is None:
            results["missing_files"] += 1
            results["stale_entries"].append({**summary, "reason": "file_missing"})
            continue

        # Stored hashes may carry a "sha256:" prefix and/or the full digest,
        # whereas the computed value is a truncated, unprefixed digest.
        # Compare only the part both sides have in common.
        expected = str(stored_hash).lower()
        if expected.startswith(prefix):
            expected = expected[len(prefix):]
        if expected[: len(current_hash)] == current_hash:
            results["fresh"] += 1
        else:
            results["stale"] += 1
            results["stale_entries"].append({
                **summary,
                "stored_hash": stored_hash,
                "current_hash": current_hash,
                "reason": "hash_mismatch",
            })
    return results
def add_hashes_to_index(index_path: str, repo_root: Optional[str] = None) -> dict:
    """Add source hashes to entries that are missing them.

    The index file is rewritten (2-space indented JSON plus a trailing
    newline) only when at least one entry was updated.

    Args:
        index_path: Path to the index JSON file.
        repo_root: Directory that "source_file" paths are relative to.
            Defaults to the grandparent of the index file.

    Returns:
        A dict with "updated" (hashes added), "skipped" (no source file, or a
        hash was already present), "missing" (source file could not be
        hashed) and "total", so that updated + skipped + missing == total.

    Raises:
        FileNotFoundError: If the index file does not exist.
    """
    index = Path(index_path)
    data = json.loads(index.read_text(encoding="utf-8"))
    facts = data.get("facts", [])
    root = Path(repo_root) if repo_root else index.parent.parent

    updated = 0
    skipped = 0
    missing = 0
    for entry in facts:
        source_file = entry.get("source_file")
        if not source_file or entry.get("source_hash"):
            skipped += 1
            continue
        file_hash = compute_file_hash(str(root / source_file))
        if file_hash:
            entry["source_hash"] = file_hash
            updated += 1
        else:
            # Previously these entries were counted nowhere.
            missing += 1

    if updated > 0:
        index.write_text(json.dumps(data, indent=2) + "\n", encoding="utf-8")
    return {"updated": updated, "skipped": skipped, "missing": missing, "total": len(facts)}
def report_staleness(results: dict) -> str:
"""Format staleness check results as a plain-text report.

Reads the "total", "fresh", "stale", "no_source", "missing_files" and
"stale_entries" keys of ``results`` and returns the report as a single
newline-joined string. The closing staleness rate is stale / total; it is
replaced by a "No entries to check." line when total is 0.
"""
lines = []
# Banner and summary counters.
lines.append("=" * 50)
lines.append("KNOWLEDGE STORE STALENESS REPORT")
lines.append("=" * 50)
lines.append(f"Total entries: {results['total']}")
lines.append(f"Fresh: {results['fresh']}")
lines.append(f"Stale: {results['stale']}")
lines.append(f"No source: {results['no_source']}")
lines.append(f"Missing files: {results['missing_files']}")
lines.append("")
# Per-entry detail: reason and source file, then the fact text.
if results["stale_entries"]:
lines.append("STALE ENTRIES:")
lines.append("-" * 50)
for entry in results["stale_entries"]:
lines.append(f" [{entry['reason']}] {entry['source_file']}")
lines.append(f" {entry['fact']}")
# Hashes are shown only when the entry carries both of them.
if entry.get("stored_hash") and entry.get("current_hash"):
lines.append(f" stored: {entry['stored_hash']}")
lines.append(f" current: {entry['current_hash']}")
# NOTE(review): indentation is not visible here; this blank separator is
# presumably appended once per entry — confirm against the original file.
lines.append("")
# The rate uses the "stale" counter only, not "missing_files".
if results["total"] > 0:
staleness_pct = results["stale"] / results["total"] * 100
lines.append(f"Staleness rate: {staleness_pct:.1f}%")
else:
lines.append("No entries to check.")
return "\n".join(lines)
def main():
    """Command-line entry point for the staleness checker.

    The index path may be given positionally or via ``--index``; the
    positional form takes precedence when both are supplied. With ``--fix``
    missing hashes are written back to the index; otherwise the index is
    checked and a report (or JSON with ``--json``) is printed. Exits with
    status 1 when the index file cannot be found.
    """
    parser = argparse.ArgumentParser(description="Check knowledge store for stale entries")
    # The positional form matches the usage shown in the module docstring.
    parser.add_argument("index_file", nargs="?", help="Path to index.json (alternative to --index)")
    parser.add_argument("--index", default="knowledge/index.json", help="Path to index.json")
    parser.add_argument("--repo", help="Repository root (default: auto-detect from index path)")
    parser.add_argument("--fix", action="store_true", help="Add missing hashes to index")
    parser.add_argument("--json", action="store_true", help="Output JSON instead of report")
    args = parser.parse_args()
    index_path = args.index_file or args.index

    if args.fix:
        try:
            result = add_hashes_to_index(index_path, args.repo)
        except FileNotFoundError:
            # Same message and exit status as the check path below.
            print(f"Error: Index not found: {index_path}", file=sys.stderr)
            sys.exit(1)
        if args.json:
            print(json.dumps(result, indent=2))
        else:
            print(f"Updated {result['updated']} entries with source hashes.")
            print(f"Skipped {result['skipped']} (already had hashes or no source file).")
        return

    results = check_staleness(index_path, args.repo)
    if "error" in results:
        print(f"Error: {results['error']}", file=sys.stderr)
        sys.exit(1)
    if args.json:
        print(json.dumps(results, indent=2))
    else:
        print(report_staleness(results))


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,109 @@
#!/usr/bin/env python3
"""Tests for scripts/gitea_issue_parser.py"""
import sys
import os
# Load the parser directly from the file that sits next to this test module,
# then expose the function under test at module level.
import importlib.util

_here = os.path.dirname(__file__) or "."
sys.path.insert(0, _here)
_parser_path = os.path.join(_here, "gitea_issue_parser.py")
spec = importlib.util.spec_from_file_location("parser", _parser_path)
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)
parse_issue_body = mod.parse_issue_body
def test_basic_parsing():
    """Title, labels, epic ref, checkbox criteria and sections are extracted."""
    body = """## Context
This is the background info.
## Acceptance Criteria
- [ ] First criterion
- [x] Second criterion (done)
## What to build
Some description.
"""
    result = parse_issue_body(body, title="Test (#42)", labels=["bug"])
    assert result["title"] == "Test (#42)"
    assert result["labels"] == ["bug"]
    assert result["epic_ref"] == 42  # taken from the "(#42)" in the title
    assert len(result["criteria"]) == 2
    assert result["criteria"][0]["text"] == "First criterion"
    # Identity checks: "checked" must be a real bool, not merely falsy/truthy.
    assert result["criteria"][0]["checked"] is False
    assert result["criteria"][1]["checked"] is True
    assert "context" in result["sections"]
    print("PASS: test_basic_parsing")
def test_numbered_criteria():
    """A numbered list under "Acceptance Criteria" yields one criterion per item."""
    markdown = """## Acceptance Criteria
1. First item
2. Second item
3. Third item
"""
    criteria = parse_issue_body(markdown)["criteria"]
    assert len(criteria) == 3
    assert criteria[0]["text"] == "First item"
    print("PASS: test_numbered_criteria")
def test_epic_ref_from_body():
    """A "Closes #N" line in the body sets the epic reference."""
    parsed = parse_issue_body("Closes #123\n\nSome description.")
    assert parsed["epic_ref"] == 123
    print("PASS: test_epic_ref_from_body")
def test_empty_body():
    """An empty body produces empty criteria, context and sections."""
    parsed = parse_issue_body("")
    expected_empty = (("criteria", []), ("context", ""), ("sections", {}))
    for key, empty_value in expected_empty:
        assert parsed[key] == empty_value
    print("PASS: test_empty_body")
def test_no_sections():
    """A body without headings is returned whole as the context."""
    plain_text = "Just a plain issue body with no headings."
    parsed = parse_issue_body(plain_text)
    assert parsed["context"] == "Just a plain issue body with no headings."
    print("PASS: test_no_sections")
def test_multiple_sections():
    """Every ## heading becomes a lower-cased key in "sections"."""
    markdown = """## Problem
Something is broken.
## Fix
Do this instead.
## Notes
Additional info.
"""
    sections = parse_issue_body(markdown)["sections"]
    for heading in ("problem", "fix", "notes"):
        assert heading in sections
    assert "Something is broken" in sections["problem"]
    print("PASS: test_multiple_sections")
def run_all():
    """Run every test in order; the first failing assertion aborts the run."""
    suite = (
        test_basic_parsing,
        test_numbered_criteria,
        test_epic_ref_from_body,
        test_empty_body,
        test_no_sections,
        test_multiple_sections,
    )
    for case in suite:
        case()
    print("\nAll 6 tests passed!")


if __name__ == "__main__":
    run_all()