#!/usr/bin/env python3
"""
Gitea Issue Body Parser — Extract structured data from markdown issue bodies.

Usage:
    cat issue_body.txt | python3 scripts/gitea_issue_parser.py --stdin --pretty
    python3 scripts/gitea_issue_parser.py --url https://forge.../api/v1/repos/.../issues/123 --pretty
    python3 scripts/gitea_issue_parser.py body.txt --title "Fix thing (#42)" --labels pipeline extraction
"""

import argparse
import json
import re
import sys
from typing import Any, Dict, List, Optional


def parse_issue_body(body: str, title: str = "", labels: Optional[List[str]] = None) -> Dict[str, Any]:
    """Parse a Gitea issue markdown body into structured JSON.

    Extracted fields:
      - title: Issue title
      - context: Background/description text (capped at 500 chars)
      - criteria[]: Acceptance criteria (checkboxes or numbered lists)
      - labels[]: Issue labels
      - epic_ref: Parent/epic issue reference (from "Closes #N" or title)
      - sections{}: All ## sections as key-value pairs

    Args:
        body: Raw markdown issue body (may be empty or None-ish).
        title: Issue title; also scanned for an epic reference.
        labels: Issue labels; defaults to an empty list.

    Returns:
        Dict with the fields listed above.
    """
    result: Dict[str, Any] = {
        "title": title,
        "context": "",
        "criteria": [],
        "labels": labels or [],
        "epic_ref": None,
        "sections": {},
    }

    if not body:
        return result

    # Epic reference: prefer an explicit "closes/fixes/addresses/ref(s) #N",
    # then fall back to the first bare "#N" anywhere in title or body.
    haystack = (title + " " + body).lower()
    epic_patterns = [
        r"(?:closes|fixes|addresses|refs?)\s+#(\d+)",
        r"#(\d+)",
    ]
    for pattern in epic_patterns:
        match = re.search(pattern, haystack)
        if match:
            result["epic_ref"] = int(match.group(1))
            break

    # "## Section" blocks: a section's content runs until the next "## " heading
    # (deeper "###" headings stay inside the section). Keys are normalized to
    # lowercase with spaces replaced by underscores.
    section_pattern = r"^##\s+(.+?)$\n((?:^(?!##\s).*$\n?)*)"
    for match in re.finditer(section_pattern, body, re.MULTILINE):
        section_name = match.group(1).strip().lower().replace(" ", "_")
        result["sections"][section_name] = match.group(2).strip()

    # Acceptance criteria from "- [ ]" / "- [x]" checkboxes anywhere in the body.
    checkbox_pattern = r"^\s*-\s*\[([ xX])\]\s*(.+)$"
    for match in re.finditer(checkbox_pattern, body, re.MULTILINE):
        result["criteria"].append({
            "text": match.group(2).strip(),
            "checked": match.group(1).lower() == "x",
        })

    # No checkboxes: fall back to a numbered list inside a criteria section.
    # Section keys are already underscore-normalized, so only those forms can
    # match (the original also probed "acceptance criteria", which was dead).
    if not result["criteria"]:
        numbered = r"^\s*\d+\.\s*(.+)$"
        for section_name in ("acceptance_criteria", "criteria"):
            if section_name in result["sections"]:
                for match in re.finditer(numbered, result["sections"][section_name], re.MULTILINE):
                    result["criteria"].append({"text": match.group(1).strip(), "checked": False})
                break

    # Context: everything before the first "## " heading; when the body starts
    # with a heading (or has none), take the first paragraph instead.
    first_heading = body.find("## ")
    if first_heading > 0:
        context_text = body[:first_heading].strip()
    else:
        context_text = body.split("\n\n")[0].strip()
    # Strip a leading heading line ("## Context", "## Problem Statement", ...).
    # Fix: the previous r"^#+\s*\w+\s*\n?" only removed single-word headings,
    # leaving e.g. "Statement" glued onto the context for multi-word titles.
    context_text = re.sub(r"^#+[^\n]*\n?", "", context_text).strip()
    result["context"] = context_text[:500]  # Cap at 500 chars

    return result


def fetch_issue_from_url(url: str) -> Dict[str, Any]:
    """Fetch an issue from a Gitea API URL and parse its body.

    Args:
        url: Full Gitea API issue URL (…/api/v1/repos/{owner}/{repo}/issues/{n}).

    Returns:
        Parsed issue dict (see parse_issue_body).

    Raises:
        urllib.error.URLError: On network failure or timeout.
    """
    import urllib.request
    req = urllib.request.Request(url, headers={"Accept": "application/json"})
    # Fix: bound the request — the original urlopen had no timeout and could
    # hang the script indefinitely on a stalled server.
    with urllib.request.urlopen(req, timeout=30) as resp:
        data = json.loads(resp.read())

    return parse_issue_body(
        body=data.get("body", ""),
        title=data.get("title", ""),
        labels=[label["name"] for label in data.get("labels", [])],
    )


def main() -> None:
    """CLI entry point: read an issue body (file/stdin/URL) and print JSON."""
    parser = argparse.ArgumentParser(description="Parse Gitea issue markdown into structured JSON")
    parser.add_argument("file", nargs="?", help="Issue body file (or use --stdin)")
    parser.add_argument("--stdin", action="store_true", help="Read from stdin")
    parser.add_argument("--url", help="Gitea API URL to fetch issue from")
    parser.add_argument("--title", default="", help="Issue title")
    parser.add_argument("--labels", nargs="*", default=[], help="Issue labels")
    parser.add_argument("--pretty", action="store_true", help="Pretty-print JSON output")

    args = parser.parse_args()

    # Input precedence: --url, then --stdin, then positional file.
    if args.url:
        result = fetch_issue_from_url(args.url)
    elif args.stdin:
        result = parse_issue_body(sys.stdin.read(), args.title, args.labels)
    elif args.file:
        with open(args.file) as f:
            result = parse_issue_body(f.read(), args.title, args.labels)
    else:
        parser.print_help()
        sys.exit(1)

    indent = 2 if args.pretty else None
    print(json.dumps(result, indent=indent))


if __name__ == "__main__":
    main()