#!/usr/bin/env python3
# Reconstructed from a two-commit patch series by Alexander Whitestone
# (Wed, 15 Apr 2026) whose line breaks had been collapsed:
#   [PATCH 1/2] b614220eff0a2b85ee9f4f3333e3e04927a732c0
#       feat: Gitea issue body parser for structured task extraction (#90)
#       -> scripts/gitea_issue_parser.py
#   [PATCH 2/2] 2f38c52691dcc5af72f25d36d72b9532bc89a44a
#       test: add tests for issue body parser (#90)
#       -> scripts/test_gitea_issue_parser.py
#
# NOTE(review): the import block, the tail of the usage text and the
# ``def parse_issue_body(...)`` header were missing from the mangled text.
# They are reconstructed from the call sites below (one, two and three
# positional arguments; ``labels or []``) -- confirm against the repository.
"""
Gitea Issue Body Parser

Extracts structured data from Gitea issue markdown bodies:
- Title
- Context section
- Acceptance criteria (checkboxes)
- Labels
- Epic/parent references

Usage:
    python3 scripts/gitea_issue_parser.py issue_body.txt --pretty
    python3 scripts/gitea_issue_parser.py --stdin --title "feat: x (#1)" < issue_body.txt
    python3 scripts/gitea_issue_parser.py --url API_URL --token TOKEN
    python3 scripts/gitea_issue_parser.py --self-test
"""

import argparse
import json
import os
import re
import sys
import urllib.request
from typing import Optional

# Any "#123"-style issue reference.
_ISSUE_REF_RE = re.compile(r"#(\d+)")

# Closing keywords in all their inflections (close/closes/closed, fix/fixes/
# fixed, resolve/resolves/resolved).  The leading \b keeps words such as
# "prefixes" from being read as "fixes"; the optional colon accepts "Fixes: #3".
_CLOSE_REF_RE = re.compile(
    r"\b(?:close[sd]?|fix(?:e[sd])?|resolve[sd]?):?\s*#(\d+)", re.IGNORECASE
)

# Parent/epic style references, with or without a colon ("Epic: #12").
_PARENT_REF_RE = re.compile(
    r"\b(?:part of|epic|parent|blocks?):?\s*#(\d+)", re.IGNORECASE
)

# A level-2 markdown header with a non-blank title ("## Context").
_SECTION_HEADER_RE = re.compile(r"^##\s+(\S.*)$")

# Task-list items using any markdown bullet: "- [ ] x", "* [x] y", "+ [X] z".
_CHECKBOX_RE = re.compile(
    r"^[ \t]*[-*+][ \t]*\[[ xX]?\][ \t]*(.+)$", re.MULTILINE
)

# Plain numbered ("1." / "1)") or bulleted list items.
_LIST_ITEM_RE = re.compile(
    r"^[ \t]*(?:\d+[.)]|[-*+])[ \t]+(.+)$", re.MULTILINE
)

# Section names (lower-cased) accepted as the issue context, in priority order.
_CONTEXT_KEYS = (
    "context",
    "background",
    "description",
    "problem",
    "summary",
    "overview",
    "changes",
)

# Section names (lower-cased) accepted as acceptance criteria, in priority order.
_CRITERIA_KEYS = (
    "acceptance criteria",
    "acceptance_criteria",
    "criteria",
    "requirements",
    "definition of done",
)

# Only bare "#N" references this close to the top of the body are trusted.
_EARLY_BODY_WINDOW = 200


def _find_epic_ref(title: str, body: str) -> Optional[str]:
    """Return the most specific "#N" reference found, or None.

    Priority: closing keyword in the body, parent/epic keyword in the body,
    first reference in the title, first bare reference near the top of the body.
    """
    keyword_match = _CLOSE_REF_RE.search(body) or _PARENT_REF_RE.search(body)
    if keyword_match:
        return f"#{keyword_match.group(1)}"

    bare_refs = _ISSUE_REF_RE.findall(title) or _ISSUE_REF_RE.findall(
        body[:_EARLY_BODY_WINDOW]
    )
    return f"#{bare_refs[0]}" if bare_refs else None


def _split_sections(body: str):
    """Split *body* on "## " headers.

    Returns ``(preamble, sections)`` where *preamble* is the stripped text that
    precedes the first header and *sections* maps each lower-cased header title
    to its stripped content.  A repeated header keeps its first position and
    takes the content of its last occurrence.
    """
    preamble_lines = []
    section_lines = {}
    bucket = preamble_lines

    # splitlines() also copes with CRLF bodies coming from web forms.
    for line in body.splitlines():
        header = _SECTION_HEADER_RE.match(line)
        if header:
            bucket = []
            section_lines[header.group(1).strip().lower()] = bucket
        else:
            bucket.append(line)

    sections = {
        name: "\n".join(lines).strip() for name, lines in section_lines.items()
    }
    return "\n".join(preamble_lines).strip(), sections


def _captured_items(pattern, text: str) -> list:
    """Return the stripped, non-empty first groups of every *pattern* match."""
    stripped = (match.group(1).strip() for match in pattern.finditer(text))
    return [item for item in stripped if item]


def _extract_criteria(sections: dict, body: str) -> list:
    """Collect acceptance criteria from the criteria section or the whole body."""
    section_text = next(
        (sections[key] for key in _CRITERIA_KEYS if key in sections), None
    )

    criteria = []
    if section_text:
        # Checkboxes win; plain list items are only a fallback inside the
        # dedicated section, where any list is assumed to be criteria.
        criteria = _captured_items(_CHECKBOX_RE, section_text) or _captured_items(
            _LIST_ITEM_RE, section_text
        )

    # No usable section: any checkbox anywhere in the body counts.
    return criteria or _captured_items(_CHECKBOX_RE, body)


def parse_issue_body(body: str, title: str = "", labels: Optional[list] = None) -> dict:
    """Parse a Gitea issue body into structured JSON.

    Args:
        body: Raw markdown body of the issue; empty/None yields an empty result.
        title: Issue title, used as a fallback source for the epic reference.
        labels: Label names to carry through into the result.

    Returns:
        A dict with the keys ``title``, ``context`` (str), ``criteria``
        (list of str), ``labels`` (list), ``epic_ref`` ("#N" or None) and
        ``sections`` (lower-cased "## " header -> stripped content).
    """
    result = {
        "title": title,
        "context": "",
        "criteria": [],
        "labels": list(labels) if labels else [],
        "epic_ref": None,
        "sections": {},
    }

    if not body:
        return result

    # A JSON null title must not crash the regex search.
    result["epic_ref"] = _find_epic_ref(title or "", body)

    preamble, sections = _split_sections(body)
    result["sections"] = sections

    # First non-empty context-like section; otherwise the text above the first
    # header, so header-less issues still get a context.
    result["context"] = next(
        (sections[key] for key in _CONTEXT_KEYS if sections.get(key)), preamble
    )

    result["criteria"] = _extract_criteria(sections, body)
    return result


def parse_from_url(api_url: str, token: Optional[str] = None) -> dict:
    """Parse an issue from a Gitea API URL.

    Args:
        api_url: URL whose response is the issue as a JSON object.
        token: Optional API token, sent as ``Authorization: token <token>``.

    Returns:
        The result of :func:`parse_issue_body` for the fetched issue.

    Raises:
        urllib.error.URLError: If the request fails.
        json.JSONDecodeError: If the response is not valid JSON.
    """
    headers = {}
    if token:
        headers["Authorization"] = f"token {token}"

    request = urllib.request.Request(api_url, headers=headers)
    # The context manager closes the connection even if decoding fails.
    with urllib.request.urlopen(request, timeout=30) as response:
        issue = json.loads(response.read())

    # "or" guards against explicit JSON nulls, not just missing keys.
    title = issue.get("title") or ""
    body = issue.get("body") or ""
    labels = [label["name"] for label in issue.get("labels") or []]

    return parse_issue_body(body, title, labels)


def main():
    """Command-line entry point: read an issue, print the parsed JSON."""
    parser = argparse.ArgumentParser(description="Parse Gitea issue body into structured JSON")
    parser.add_argument("input", nargs="?", help="Issue body file (or - for stdin)")
    parser.add_argument("--url", help="Gitea API URL for the issue")
    parser.add_argument("--stdin", action="store_true", help="Read from stdin")
    parser.add_argument("--token", help="Gitea API token (or set GITEA_TOKEN env var)")
    parser.add_argument("--title", default="", help="Issue title (for epic ref extraction)")
    parser.add_argument("--labels", nargs="*", default=[], help="Issue labels")
    parser.add_argument("--pretty", action="store_true", help="Pretty-print JSON output")
    parser.add_argument("--self-test", action="store_true", help="Run the bundled parser tests and exit")
    args = parser.parse_args()

    if args.self_test:
        _run_tests()
        return

    token = args.token or os.environ.get("GITEA_TOKEN")

    if args.url:
        result = parse_from_url(args.url, token)
    elif args.stdin or args.input == "-":
        result = parse_issue_body(sys.stdin.read(), args.title, args.labels)
    elif args.input:
        # Issue bodies are UTF-8; do not depend on the locale default.
        with open(args.input, encoding="utf-8") as f:
            body = f.read()
        result = parse_issue_body(body, args.title, args.labels)
    else:
        parser.print_help()
        sys.exit(1)

    indent = 2 if args.pretty else None
    print(json.dumps(result, indent=indent))


# ---------------------------------------------------------------------------
# Tests for gitea_issue_parser (originally scripts/test_gitea_issue_parser.py).
# The sys.path tweak and the import of parse_issue_body are unnecessary here
# because the parser lives in this same module.
# ---------------------------------------------------------------------------


def test_basic_structure():
    """Context, checkbox criteria, title and labels are all extracted."""
    body = """## Context
This is the background.

## Acceptance Criteria
- [ ] First criterion
- [x] Second criterion (already done)
- [ ] Third criterion

## Labels
`pipeline`, `extraction`
"""
    result = parse_issue_body(body, "Test Issue", ["pipeline", "extraction"])
    assert result["title"] == "Test Issue"
    assert "background" in result["context"].lower()
    assert len(result["criteria"]) == 3
    assert "First criterion" in result["criteria"]
    assert result["labels"] == ["pipeline", "extraction"]
    print("PASS: test_basic_structure")


def test_epic_ref():
    """A closing keyword in the body outranks a reference in the title."""
    body = "Closes #645\n\nSome description."
    result = parse_issue_body(body, "feat: thing (#688)")
    assert result["epic_ref"] == "#645"
    print("PASS: test_epic_ref")


def test_epic_ref_from_title():
    """Without body keywords the title reference is used."""
    body = "Some description without close ref."
    result = parse_issue_body(body, "feat: scene descriptions (#645)")
    assert result["epic_ref"] == "#645"
    print("PASS: test_epic_ref_from_title")


def test_no_checkboxes():
    """A numbered list inside a criteria section counts as criteria."""
    body = """## Requirements
1. First thing
2. Second thing
3. Third thing
"""
    result = parse_issue_body(body)
    assert len(result["criteria"]) == 3
    print("PASS: test_no_checkboxes")


def test_empty_body():
    """An empty body yields the empty result skeleton."""
    result = parse_issue_body("", "Empty Issue")
    assert result["title"] == "Empty Issue"
    assert result["criteria"] == []
    assert result["context"] == ""
    print("PASS: test_empty_body")


def test_real_issue_format():
    """A Changes/Verification/Impact issue still produces a context."""
    body = """Closes #681

## Changes

Add `#!/usr/bin/env python3` shebang to 6 Python scripts.

## Verification

All 6 files confirmed missing shebangs before fix.

## Impact

Scripts can now be executed directly.
"""
    result = parse_issue_body(body, "fix: add python3 shebangs (#685)")
    assert result["epic_ref"] == "#681"
    assert "shebang" in result["context"].lower()
    print("PASS: test_real_issue_format")


def test_all_sections_captured():
    """Every "## " header becomes a lower-cased key in ``sections``."""
    body = """## Context
Background info.

## Acceptance Criteria
- [ ] Do thing

## Labels
`test`
"""
    result = parse_issue_body(body)
    assert "context" in result["sections"]
    assert "acceptance criteria" in result["sections"]
    print("PASS: test_all_sections_captured")


def _run_tests():
    """Run every bundled test in order and report success."""
    test_basic_structure()
    test_epic_ref()
    test_epic_ref_from_title()
    test_no_checkboxes()
    test_empty_body()
    test_real_issue_format()
    test_all_sections_captured()
    print("\nAll tests passed.")


if __name__ == "__main__":
    main()