#!/usr/bin/env python3
"""
Gitea Issue Body Parser — Extract structured data from markdown issue bodies.

Usage:
    cat issue_body.txt | python3 scripts/gitea_issue_parser.py --stdin --pretty
    python3 scripts/gitea_issue_parser.py --url https://forge.../api/v1/repos/.../issues/123 --pretty
    python3 scripts/gitea_issue_parser.py body.txt --title "Fix thing (#42)" --labels pipeline extraction
"""
import argparse
import json
import re
import sys
from typing import Any, Dict, List, Optional


def parse_issue_body(body: str, title: str = "", labels: Optional[List[str]] = None) -> Dict[str, Any]:
    """Parse a Gitea issue markdown body into structured JSON.

    Args:
        body: Raw markdown body of the issue.
        title: Issue title; also scanned for an epic reference such as "#42".
        labels: Issue labels; defaults to an empty list.

    Returns:
        Dict with keys:
        - title: Issue title (echoed back).
        - context: Background text before the first "## " heading (or the first
          paragraph), capped at 500 characters.
        - criteria: List of {"text", "checked"} dicts from checkboxes, or from a
          numbered list in an acceptance-criteria section.
        - labels: Issue labels.
        - epic_ref: Parent/epic issue number (from "Closes #N" etc.), or None.
        - sections: Every "## Name" section keyed by lowercased snake_case name.
    """
    result: Dict[str, Any] = {
        "title": title,
        "context": "",
        "criteria": [],
        "labels": labels or [],
        "epic_ref": None,
        "sections": {},
    }

    if not body:
        return result

    # Epic reference: prefer an explicit "closes/fixes/addresses/ref(s) #N",
    # fall back to the first bare "#N" anywhere in the title or body.
    haystack = (title + " " + body).lower()  # hoisted: invariant across patterns
    epic_patterns = [
        r"(?:closes|fixes|addresses|refs?)\s+#(\d+)",
        r"#(\d+)",
    ]
    for pattern in epic_patterns:
        match = re.search(pattern, haystack)
        if match:
            result["epic_ref"] = int(match.group(1))
            break

    # Parse "## Section" blocks: capture everything up to the next "## " heading.
    section_pattern = r"^##\s+(.+?)$\n((?:^(?!##\s).*$\n?)*)"
    for match in re.finditer(section_pattern, body, re.MULTILINE):
        section_name = match.group(1).strip().lower().replace(" ", "_")
        section_content = match.group(2).strip()
        result["sections"][section_name] = section_content

    # Acceptance criteria from markdown checkboxes anywhere in the body.
    checkbox_pattern = r"^\s*-\s*\[([ xX])\]\s*(.+)$"
    for match in re.finditer(checkbox_pattern, body, re.MULTILINE):
        checked = match.group(1).lower() == "x"
        text = match.group(2).strip()
        result["criteria"].append({"text": text, "checked": checked})

    # Fallback: numbered list inside an "Acceptance Criteria"/"Criteria" section.
    # Section keys were snake_cased above, so only snake_case keys can match
    # (the previous spaced "acceptance criteria" entry was dead code — removed).
    if not result["criteria"]:
        for section_name in ("acceptance_criteria", "criteria"):
            if section_name in result["sections"]:
                numbered = r"^\s*\d+\.\s*(.+)$"
                for match in re.finditer(numbered, result["sections"][section_name], re.MULTILINE):
                    result["criteria"].append({"text": match.group(1).strip(), "checked": False})
                break

    # Context: everything before the first "## " heading, or — when the body
    # opens with a heading (or has none) — the first paragraph.
    first_heading = body.find("## ")
    if first_heading > 0:
        context_text = body[:first_heading].strip()
    else:
        context_text = body.split("\n\n")[0].strip()
    # Strip a leading "## Context"/"## Acceptance Criteria" heading line entirely.
    # (The old r"^#+\s*\w+\s*\n?" pattern only removed the first word of
    # multi-word headings, leaving e.g. "Criteria" glued onto the context.)
    context_text = re.sub(r"^#+\s*[^\n]*\n?", "", context_text).strip()
    result["context"] = context_text[:500]  # Cap at 500 chars

    return result


def fetch_issue_from_url(url: str, timeout: Optional[float] = None) -> Dict[str, Any]:
    """Fetch an issue from a Gitea API URL and parse its markdown body.

    Args:
        url: Gitea API issue endpoint, e.g. .../api/v1/repos/owner/repo/issues/123.
        timeout: Optional socket timeout in seconds. The default (None) keeps the
            previous behavior of using the global socket default (no explicit
            timeout); pass a value to avoid hanging on an unresponsive server.

    Returns:
        Structured dict as produced by parse_issue_body().
    """
    import urllib.request

    req = urllib.request.Request(url, headers={"Accept": "application/json"})
    # Only forward timeout when given, so timeout=None preserves urlopen's
    # default (socket global timeout) rather than forcing "no timeout".
    kwargs = {} if timeout is None else {"timeout": timeout}
    with urllib.request.urlopen(req, **kwargs) as resp:
        data = json.loads(resp.read())

    return parse_issue_body(
        body=data.get("body", ""),
        title=data.get("title", ""),
        labels=[label["name"] for label in data.get("labels", [])],
    )


def main():
    """CLI entry point: read an issue body (file/stdin/API URL), print parsed JSON."""
    arg_parser = argparse.ArgumentParser(description="Parse Gitea issue markdown into structured JSON")
    arg_parser.add_argument("file", nargs="?", help="Issue body file (or use --stdin)")
    arg_parser.add_argument("--stdin", action="store_true", help="Read from stdin")
    arg_parser.add_argument("--url", help="Gitea API URL to fetch issue from")
    arg_parser.add_argument("--title", default="", help="Issue title")
    arg_parser.add_argument("--labels", nargs="*", default=[], help="Issue labels")
    arg_parser.add_argument("--pretty", action="store_true", help="Pretty-print JSON output")

    args = arg_parser.parse_args()

    # Input source precedence: --url, then --stdin, then a positional file.
    if args.url:
        result = fetch_issue_from_url(args.url)
    elif args.stdin:
        result = parse_issue_body(sys.stdin.read(), args.title, args.labels)
    elif args.file:
        with open(args.file) as fh:
            result = parse_issue_body(fh.read(), args.title, args.labels)
    else:
        # No input source given: show usage and signal failure.
        arg_parser.print_help()
        sys.exit(1)

    print(json.dumps(result, indent=2 if args.pretty else None))


# Script entry point guard: run the CLI only when executed directly.
if __name__ == "__main__":
    main()