#!/usr/bin/env python3
"""
Gitea Issue Body Parser — Extract structured data from markdown issue bodies.

Usage:
    cat issue_body.txt | python3 scripts/gitea_issue_parser.py --stdin --pretty
    python3 scripts/gitea_issue_parser.py --url https://forge.../api/v1/repos/.../issues/123 --pretty
    python3 scripts/gitea_issue_parser.py body.txt --title "Fix thing (#42)" --labels pipeline extraction
    python3 scripts/gitea_issue_parser.py --self-test
"""

import argparse
import json
import re
import sys
from typing import Any, Dict, List, Optional, Tuple

# "Closes #12"-style keywords take priority over a bare "#12" mention.
_EPIC_PATTERNS = (
    re.compile(r"\b(?:closes|fixes|addresses|refs?)\s+#(\d+)", re.IGNORECASE),
    re.compile(r"#(\d+)"),
)
# All of the patterns below are applied to ONE line at a time, so ``\s`` can
# never silently run across a line break.
_HEADING_LINE_RE = re.compile(r"##\s+(\S.*)")
_CHECKBOX_RE = re.compile(r"\s*[-*+]\s*\[([ xX])\]\s*(.+)")
_NUMBERED_RE = re.compile(r"\s*\d+\.\s*(.+)")
# A whole leading ATX heading line ("# Title"), not just its first word.
_LEADING_TITLE_RE = re.compile(r"\A#+[ \t]+[^\n]*\n?")

# Sections that may hold numbered acceptance criteria, in lookup order.
_CRITERIA_SECTION_NAMES = ("acceptance_criteria", "criteria")
# Sections preferred as "context" when the body has no text before its first heading.
_CONTEXT_SECTION_NAMES = ("context", "background", "problem", "description", "summary")
_CONTEXT_MAX_CHARS = 500


def _split_sections(body: str) -> Tuple[str, Dict[str, str]]:
    """Split *body* into the text before the first ``##`` heading and the sections."""
    preamble: List[str] = []
    collected: Dict[str, List[str]] = {}
    current: Optional[List[str]] = None
    for line in body.split("\n"):
        heading = _HEADING_LINE_RE.match(line)
        if heading:
            name = heading.group(1).strip().lower().replace(" ", "_")
            current = []
            collected[name] = current  # a repeated heading name: last one wins
        elif current is None:
            preamble.append(line)
        else:
            current.append(line)
    sections = {name: "\n".join(lines).strip() for name, lines in collected.items()}
    return "\n".join(preamble), sections


def _strip_leading_title(text: str) -> str:
    """Return *text* stripped, without a leading ``# Title`` line if it has one."""
    return _LEADING_TITLE_RE.sub("", text.strip(), count=1).strip()


def _find_epic_ref(text: str) -> Optional[int]:
    """Return the first issue number referenced in *text*, keyword forms first."""
    for pattern in _EPIC_PATTERNS:
        found = pattern.search(text)
        if found:
            return int(found.group(1))
    return None


def _extract_criteria(body: str, sections: Dict[str, str]) -> List[Dict[str, Any]]:
    """Collect task-list checkboxes, falling back to a numbered criteria section."""
    criteria: List[Dict[str, Any]] = []
    for line in body.split("\n"):
        box = _CHECKBOX_RE.match(line)
        if box:
            criteria.append({
                "text": box.group(2).strip(),
                "checked": box.group(1).lower() == "x",
            })
    if criteria:
        return criteria

    for name in _CRITERIA_SECTION_NAMES:
        if name in sections:
            for line in sections[name].split("\n"):
                item = _NUMBERED_RE.match(line)
                if item:
                    criteria.append({"text": item.group(1).strip(), "checked": False})
            break  # only the first matching section is consulted
    return criteria


def _extract_context(body: str, preamble: str, sections: Dict[str, str]) -> str:
    """Pick the background text of the issue (uncapped)."""
    if not sections:
        # No headings at all: the first paragraph is the context.
        return _strip_leading_title(body).split("\n\n")[0].strip()

    context = _strip_leading_title(preamble)
    if context:
        return context

    # The body opens directly with a heading: use a context-like section,
    # otherwise whatever the first section contains.
    for name in _CONTEXT_SECTION_NAMES:
        if sections.get(name):
            return sections[name]
    return next(iter(sections.values()))


def parse_issue_body(
    body: str, title: str = "", labels: Optional[List[str]] = None
) -> Dict[str, Any]:
    """Parse a Gitea issue markdown body into structured JSON.

    Args:
        body: Raw markdown body. ``None`` or an empty string yields an empty
            result. CRLF / CR line endings are normalized before parsing.
        title: Issue title; also searched for the epic reference.
        labels: Issue labels. The list is copied, never stored by reference.

    Returns:
        A dict with the following fields:
        - title: Issue title
        - context: Text before the first ``##`` heading; if there is none, a
          context-like section (or the first section); for bodies without
          headings, the first paragraph. Capped at 500 characters.
        - criteria[]: ``{"text", "checked"}`` items from task-list checkboxes
          anywhere in the body, or (if there are none) from a numbered list in
          the "Acceptance Criteria" / "Criteria" section
        - labels[]: Issue labels
        - epic_ref: First referenced issue number ("Closes #N" style first,
          then any "#N" in title or body), or None
        - sections{}: All ``##`` sections keyed by lower_snake_case heading
    """
    title = title or ""
    result: Dict[str, Any] = {
        "title": title,
        "context": "",
        "criteria": [],
        "labels": list(labels) if labels else [],
        "epic_ref": None,
        "sections": {},
    }

    if not body:
        return result

    # Bodies submitted through the web UI commonly arrive with CRLF endings.
    body = body.replace("\r\n", "\n").replace("\r", "\n")

    preamble, sections = _split_sections(body)
    result["epic_ref"] = _find_epic_ref(f"{title} {body}")
    result["sections"] = sections
    result["criteria"] = _extract_criteria(body, sections)
    result["context"] = _extract_context(body, preamble, sections)[:_CONTEXT_MAX_CHARS]
    return result


def fetch_issue_from_url(url: str, timeout: float = 30.0) -> Dict[str, Any]:
    """Fetch an issue from a Gitea API URL and parse it.

    Args:
        url: ``http``/``https`` URL of a single-issue API endpoint.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``parse_issue_body`` result for the fetched issue.

    Raises:
        ValueError: If the URL scheme is not http(s), or the response is not a
            JSON object.
        urllib.error.URLError: On network or HTTP failures.
    """
    import urllib.parse
    import urllib.request

    # urlopen() would happily read file:// and other schemes; refuse them.
    scheme = urllib.parse.urlsplit(url).scheme.lower()
    if scheme not in ("http", "https"):
        raise ValueError(f"Unsupported URL scheme {scheme!r}; expected http or https")

    request = urllib.request.Request(url, headers={"Accept": "application/json"})
    with urllib.request.urlopen(request, timeout=timeout) as resp:
        data = json.loads(resp.read())

    if not isinstance(data, dict):
        raise ValueError("Expected a JSON object describing a single issue")

    # "body" and "labels" may be present but null, so .get() defaults are not enough.
    label_names = [
        label["name"]
        for label in data.get("labels") or []
        if isinstance(label, dict) and "name" in label
    ]
    return parse_issue_body(
        body=data.get("body") or "",
        title=data.get("title") or "",
        labels=label_names,
    )


def main(argv: Optional[List[str]] = None) -> None:
    """Command-line entry point.

    Input precedence is ``--url``, then ``--stdin``, then the positional file.
    ``--title`` and ``--labels`` are ignored with ``--url`` because the API
    response supplies them. Exits with status 1 when no input is given or the
    input cannot be read or fetched.

    Args:
        argv: Argument list to parse; defaults to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="Parse Gitea issue markdown into structured JSON")
    parser.add_argument("file", nargs="?", help="Issue body file (or use --stdin)")
    parser.add_argument("--stdin", action="store_true", help="Read from stdin")
    parser.add_argument("--url", help="Gitea API URL to fetch issue from")
    parser.add_argument("--title", default="", help="Issue title")
    parser.add_argument("--labels", nargs="*", default=[], help="Issue labels")
    parser.add_argument("--pretty", action="store_true", help="Pretty-print JSON output")
    parser.add_argument("--self-test", action="store_true", help="Run the built-in tests and exit")

    args = parser.parse_args(argv)

    if args.self_test:
        run_all()
        return

    try:
        if args.url:
            result = fetch_issue_from_url(args.url)
        elif args.stdin:
            result = parse_issue_body(sys.stdin.read(), args.title, args.labels)
        elif args.file:
            with open(args.file, encoding="utf-8") as f:
                result = parse_issue_body(f.read(), args.title, args.labels)
        else:
            parser.print_help()
            sys.exit(1)
    except (OSError, ValueError) as exc:
        # Covers unreadable files, network/HTTP failures and malformed JSON.
        print(f"error: {exc}", file=sys.stderr)
        sys.exit(1)

    indent = 2 if args.pretty else None
    print(json.dumps(result, indent=indent))


# ---------------------------------------------------------------------------
# Tests for parse_issue_body (run with pytest or ``--self-test``).
# ---------------------------------------------------------------------------

def test_basic_parsing():
    body = """## Context

This is the background info.

## Acceptance Criteria

- [ ] First criterion
- [x] Second criterion (done)

## What to build

Some description.
"""
    result = parse_issue_body(body, title="Test (#42)", labels=["bug"])
    assert result["title"] == "Test (#42)"
    assert result["labels"] == ["bug"]
    assert result["epic_ref"] == 42
    assert len(result["criteria"]) == 2
    assert result["criteria"][0]["text"] == "First criterion"
    assert result["criteria"][0]["checked"] is False
    assert result["criteria"][1]["checked"] is True
    assert "context" in result["sections"]
    assert result["context"] == "This is the background info."
    print("PASS: test_basic_parsing")


def test_numbered_criteria():
    body = """## Acceptance Criteria

1. First item
2. Second item
3. Third item
"""
    result = parse_issue_body(body)
    assert len(result["criteria"]) == 3
    assert result["criteria"][0]["text"] == "First item"
    print("PASS: test_numbered_criteria")


def test_epic_ref_from_body():
    body = "Closes #123\n\nSome description."
    result = parse_issue_body(body)
    assert result["epic_ref"] == 123
    print("PASS: test_epic_ref_from_body")


def test_empty_body():
    result = parse_issue_body("")
    assert result["criteria"] == []
    assert result["context"] == ""
    assert result["sections"] == {}
    print("PASS: test_empty_body")


def test_no_sections():
    body = "Just a plain issue body with no headings."
    result = parse_issue_body(body)
    assert result["context"] == "Just a plain issue body with no headings."
    print("PASS: test_no_sections")


def test_multiple_sections():
    body = """## Problem

Something is broken.

## Fix

Do this instead.

## Notes

Additional info.
"""
    result = parse_issue_body(body)
    assert "problem" in result["sections"]
    assert "fix" in result["sections"]
    assert "notes" in result["sections"]
    assert "Something is broken" in result["sections"]["problem"]
    print("PASS: test_multiple_sections")


def test_context_from_first_section():
    body = "## What to build\n\nA new exporter.\n"
    result = parse_issue_body(body)
    assert result["context"] == "A new exporter."
    print("PASS: test_context_from_first_section")


def test_crlf_line_endings():
    body = "Intro line.\r\n\r\n## Acceptance Criteria\r\n\r\n1. One\r\n2. Two\r\n"
    result = parse_issue_body(body)
    assert result["context"] == "Intro line."
    assert [c["text"] for c in result["criteria"]] == ["One", "Two"]
    print("PASS: test_crlf_line_endings")


def run_all():
    """Run every test in order and report how many passed."""
    tests = (
        test_basic_parsing,
        test_numbered_criteria,
        test_epic_ref_from_body,
        test_empty_body,
        test_no_sections,
        test_multiple_sections,
        test_context_from_first_section,
        test_crlf_line_endings,
    )
    for case in tests:
        case()
    print(f"\nAll {len(tests)} tests passed!")


if __name__ == "__main__":
    main()