#!/usr/bin/env python3
"""
Gitea Issue Body Parser — Extract structured data from markdown issue bodies.

Usage:
    cat issue_body.txt | python3 scripts/gitea_issue_parser.py --stdin --pretty
    python3 scripts/gitea_issue_parser.py --url https://forge.../api/v1/repos/.../issues/123 --pretty
    python3 scripts/gitea_issue_parser.py body.txt --title "Fix thing (#42)" --labels pipeline extraction
"""

import argparse
import json
import re
import sys
import urllib.request
from typing import Any, Dict, List, Optional, Tuple

# Tried in order: an explicit closing keyword wins over a bare "#N" mention.
_EPIC_PATTERNS = (
    re.compile(r"\b(?:closes|fixes|addresses|refs?)\s+#(\d+)", re.IGNORECASE),
    re.compile(r"#(\d+)"),
)
# A level-2 heading occupying a whole line ("### Sub" is deliberately excluded).
_SECTION_HEADING = re.compile(r"^##[ \t]+(\S.*?)[ \t]*$")
# Task-list item. [ \t] (not \s) keeps a match from spilling onto the next line.
_CHECKBOX = re.compile(r"^[ \t]*[-*+][ \t]*\[([ xX])\][ \t]*(\S.*)$", re.MULTILINE)
_NUMBERED_ITEM = re.compile(r"^[ \t]*\d+\.[ \t]+(\S.*)$", re.MULTILINE)
# A markdown heading line at the very start of a text fragment.
_LEADING_HEADING = re.compile(r"^#{1,6}[ \t]+[^\n]*\n?")
_PARAGRAPH_BREAK = re.compile(r"\n[ \t]*\n")
# Normalised section names searched for numbered criteria, in priority order.
_CRITERIA_SECTIONS = ("acceptance_criteria", "criteria")
_CONTEXT_MAX_CHARS = 500


def _find_epic_ref(text: str) -> Optional[int]:
    """Return the first issue number referenced in *text*, or None."""
    for pattern in _EPIC_PATTERNS:
        match = pattern.search(text)
        if match:
            return int(match.group(1))
    return None


def _split_sections(body: str) -> Tuple[str, List[Tuple[str, str]]]:
    """Split *body* into (preamble, [(heading, content), ...]) on ## headings."""
    preamble: List[str] = []
    sections: List[Tuple[str, List[str]]] = []
    current = preamble
    for line in body.split("\n"):
        heading = _SECTION_HEADING.match(line)
        if heading:
            current = []
            sections.append((heading.group(1), current))
        else:
            current.append(line)
    return (
        "\n".join(preamble).strip(),
        [(name, "\n".join(lines).strip()) for name, lines in sections],
    )


def _first_paragraph(text: str) -> str:
    """Return the text up to the first blank line, stripped."""
    return _PARAGRAPH_BREAK.split(text, maxsplit=1)[0].strip()


def _extract_context(preamble: str, sections: List[Tuple[str, str]]) -> str:
    """Pick the issue's background text, capped at _CONTEXT_MAX_CHARS.

    Preference order: everything before the first ## heading; otherwise the
    first paragraph of the first section (bodies that open with "## Context").
    With no ## headings at all, only the first paragraph of the body is used.
    """
    text = _LEADING_HEADING.sub("", preamble, count=1).strip()
    if text:
        if sections:
            # The whole preamble is the description when sections follow it.
            return text[:_CONTEXT_MAX_CHARS]
    elif sections:
        text = _LEADING_HEADING.sub("", sections[0][1], count=1).strip()
    return _first_paragraph(text)[:_CONTEXT_MAX_CHARS]


def parse_issue_body(
    body: str, title: str = "", labels: Optional[List[str]] = None
) -> Dict[str, Any]:
    """Parse a Gitea issue markdown body into structured JSON.

    Args:
        body: Raw markdown body; None/empty is allowed, CRLF line endings are
            normalised.
        title: Issue title; also searched for an issue reference.
        labels: Issue label names; copied, never mutated.

    Returns:
        A dict with the fields:
        - title: Issue title
        - context: Background/description text (max 500 chars)
        - criteria[]: Acceptance criteria as {"text", "checked"} — checkboxes
          anywhere in the body, else numbered items from an
          "Acceptance Criteria" / "Criteria" section
        - labels[]: Issue labels
        - epic_ref: Parent/epic issue number (from "Closes #N" or any "#N" in
          the title or body), or None
        - sections{}: All ## sections keyed by lower_snake_case heading; a
          repeated heading keeps the last occurrence
    """
    # Web forms submit CRLF; normalise so paragraph and line logic sees "\n".
    body = (body or "").replace("\r\n", "\n").replace("\r", "\n")

    result: Dict[str, Any] = {
        "title": title,
        "context": "",
        "criteria": [],
        "labels": list(labels) if labels else [],
        # Computed before the empty-body return so a title-only "#N" is kept.
        "epic_ref": _find_epic_ref(f"{title or ''} {body}"),
        "sections": {},
    }

    if not body:
        return result

    preamble, sections = _split_sections(body)
    for heading, content in sections:
        key = heading.strip().lower().replace(" ", "_")
        result["sections"][key] = content

    criteria = [
        {"text": match.group(2).strip(), "checked": match.group(1).lower() == "x"}
        for match in _CHECKBOX.finditer(body)
    ]

    # If no checkboxes, fall back to numbered lists in a criteria section.
    if not criteria:
        for key in _CRITERIA_SECTIONS:
            if key in result["sections"]:
                criteria = [
                    {"text": match.group(1).strip(), "checked": False}
                    for match in _NUMBERED_ITEM.finditer(result["sections"][key])
                ]
                break
    result["criteria"] = criteria

    result["context"] = _extract_context(preamble, sections)
    return result


def fetch_issue_from_url(url: str, timeout: float = 30.0) -> Dict[str, Any]:
    """Fetch an issue from a Gitea API URL and parse it.

    Args:
        url: Issue endpoint (e.g. .../api/v1/repos/OWNER/REPO/issues/N).
        timeout: Seconds to wait on the connection before giving up.

    Raises:
        urllib.error.URLError: On network or HTTP failure.
        ValueError: If the URL is malformed or the response is not valid JSON.
    """
    # NOTE(review): urlopen also honours non-HTTP schemes such as file://;
    # restrict the scheme here if URLs can come from an untrusted source.
    req = urllib.request.Request(url, headers={"Accept": "application/json"})
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        data = json.loads(resp.read())

    # "or" rather than a .get() default: the fields may be present but null.
    return parse_issue_body(
        body=data.get("body") or "",
        title=data.get("title") or "",
        labels=[label["name"] for label in data.get("labels") or []],
    )


def main(argv: Optional[List[str]] = None) -> None:
    """CLI entry point: read an issue from a URL, stdin or file; print JSON.

    Args:
        argv: Argument list to parse; defaults to sys.argv[1:].

    Exits with status 1 when no input source is given or the input cannot be
    read or fetched.
    """
    parser = argparse.ArgumentParser(description="Parse Gitea issue markdown into structured JSON")
    parser.add_argument("file", nargs="?", help="Issue body file (or use --stdin)")
    parser.add_argument("--stdin", action="store_true", help="Read from stdin")
    parser.add_argument("--url", help="Gitea API URL to fetch issue from")
    parser.add_argument("--title", default="", help="Issue title")
    parser.add_argument("--labels", nargs="*", default=[], help="Issue labels")
    parser.add_argument("--pretty", action="store_true", help="Pretty-print JSON output")
    args = parser.parse_args(argv)

    try:
        if args.url:
            result = fetch_issue_from_url(args.url)
        elif args.stdin:
            result = parse_issue_body(sys.stdin.read(), args.title, args.labels)
        elif args.file:
            with open(args.file, encoding="utf-8") as f:
                body = f.read()
            result = parse_issue_body(body, args.title, args.labels)
        else:
            parser.print_help()
            sys.exit(1)
    except (OSError, ValueError) as exc:
        # Unreadable file, network failure, bad URL or non-JSON response:
        # report on stderr instead of dumping a traceback.
        sys.exit(f"error: {exc}")

    indent = 2 if args.pretty else None
    print(json.dumps(result, indent=indent))


if __name__ == "__main__":
    main()