feat: Add parser tests (closes #177)

feat: Add Gitea issue body parser (closes #177)
2026-04-15 03:50:04 +00:00 · 2026-04-15 03:49:00 +00:00
2 changed files with 154 additions and 187 deletions
--- a/scripts/gitea_issue_parser.py
+++ b/scripts/gitea_issue_parser.py
@@ -1,31 +1,31 @@
 #!/usr/bin/env python3
 """
-Gitea Issue Body Parser
-
-Extracts structured data from Gitea issue markdown bodies:
- Title
- Context section
- Acceptance criteria (checkboxes)
- Labels
- Epic/parent references
+Gitea Issue Body Parser — Extract structured data from markdown issue bodies.

 Usage:
-  python3 scripts/gitea_issue_parser.py <issue_body.txt
-  python3 scripts/gitea_issue_parser.py --url https://forge.../api/v1/repos/.../issues/123
-  echo "issue body" | python3 scripts/gitea_issue_parser.py --stdin
-
-Output: JSON with {title, context, criteria[], labels[], epic_ref}
+    cat issue_body.txt | python3 scripts/gitea_issue_parser.py --stdin --pretty
+    python3 scripts/gitea_issue_parser.py --url https://forge.../api/v1/repos/.../issues/123 --pretty
+    python3 scripts/gitea_issue_parser.py body.txt --title "Fix thing (#42)" --labels pipeline extraction
 """

 import argparse
 import json
 import re
 import sys
-from typing import Optional
+from typing import Dict, List, Any, Optional


-def parse_issue_body(body: str, title: str = "", labels: list = None) -> dict:
-    """Parse a Gitea issue body into structured JSON."""
+def parse_issue_body(body: str, title: str = "", labels: List[str] = None) -> Dict[str, Any]:
+    """Parse a Gitea issue markdown body into structured JSON.
+    
+    Extracted fields:
+    - title: Issue title
+    - context: Background/description section
+    - criteria[]: Acceptance criteria (checkboxes or numbered lists)
+    - labels[]: Issue labels
+    - epic_ref: Parent/epic issue reference (from "Closes #N" or title)
+    - sections{}: All ## sections as key-value pairs
+    """
    result = {
        "title": title,
        "context": "",
@@ -34,126 +34,95 @@ def parse_issue_body(body: str, title: str = "", labels: list = None) -> dict:
        "epic_ref": None,
        "sections": {},
    }
-
+    
    if not body:
        return result
-
-    # Extract epic/parent reference from title or body
-    epic_pattern = r"#(\d+)"
-    title_refs = re.findall(epic_pattern, title)
-    body_refs = re.findall(epic_pattern, body[:200])  # Check early body refs
-
-    # Look for "Closes #N" or "Part of #N" or "Epic: #N"
-    close_match = re.search(r"(?:Closes?|Fixes?|Resolves?)\s+#(\d+)", body, re.IGNORECASE)
-    part_match = re.search(r"(?:Part of|Epic|Parent|Blocks?)\s+#(\d+)", body, re.IGNORECASE)
-
-    if close_match:
-        result["epic_ref"] = f"#{close_match.group(1)}"
-    elif part_match:
-        result["epic_ref"] = f"#{part_match.group(1)}"
-    elif title_refs:
-        result["epic_ref"] = f"#{title_refs[0]}"
-    elif body_refs:
-        result["epic_ref"] = f"#{body_refs[0]}"
-
-    # Split into sections by ## headers
-    section_pattern = r"^##\s+(.+)$"
-    lines = body.split("\n")
-    current_section = None
-    current_content = []
-
-    for line in lines:
-        header_match = re.match(section_pattern, line)
-        if header_match:
-            # Save previous section
-            if current_section:
-                result["sections"][current_section] = "\n".join(current_content).strip()
-            current_section = header_match.group(1).strip().lower()
-            current_content = []
-        else:
-            current_content.append(line)
-
-    # Save last section
-    if current_section:
-        result["sections"][current_section] = "\n".join(current_content).strip()
-
-    # Extract context
-    for key in ["context", "background", "description", "problem"]:
-        if key in result["sections"]:
-            result["context"] = result["sections"][key]
+    
+    # Extract epic reference from title or body
+    epic_patterns = [
+        r"(?:closes|fixes|addresses|refs?)\s+#(\d+)",
+        r"#(\d+)",
+    ]
+    for pattern in epic_patterns:
+        match = re.search(pattern, (title + " " + body).lower())
+        if match:
+            result["epic_ref"] = int(match.group(1))
            break
-
+    
+    # Parse ## sections
+    section_pattern = r"^##\s+(.+?)$\n((?:^(?!##\s).*$\n?)*)"
+    for match in re.finditer(section_pattern, body, re.MULTILINE):
+        section_name = match.group(1).strip().lower().replace(" ", "_")
+        section_content = match.group(2).strip()
+        result["sections"][section_name] = section_content
+    
    # Extract acceptance criteria (checkboxes)
-    criteria_section = None
-    for key in ["acceptance criteria", "acceptance_criteria", "criteria", "requirements", "definition of done"]:
-        if key in result["sections"]:
-            criteria_section = result["sections"][key]
-            break
-
-    if criteria_section:
-        checkbox_pattern = r"-\s*\[[ xX]?\]\s*(.+)"
-        for match in re.finditer(checkbox_pattern, criteria_section):
-            result["criteria"].append(match.group(1).strip())
-
-        # Also try plain numbered/bulleted lists if no checkboxes found
-        if not result["criteria"]:
-            list_pattern = r"^\s*(?:\d+\.|-|\*)\s+(.+)"
-            for match in re.finditer(list_pattern, criteria_section, re.MULTILINE):
-                result["criteria"].append(match.group(1).strip())
-
-    # If no sectioned criteria found, scan whole body for checkboxes
+    checkbox_pattern = r"^\s*-\s*\[([ xX])\]\s*(.+)$"
+    for match in re.finditer(checkbox_pattern, body, re.MULTILINE):
+        checked = match.group(1).lower() == "x"
+        text = match.group(2).strip()
+        result["criteria"].append({"text": text, "checked": checked})
+    
+    # If no checkboxes, try numbered lists in "Acceptance Criteria" or "Criteria" section
    if not result["criteria"]:
-        for match in re.finditer(r"-\s*\[[ xX]?\]\s*(.+)", body):
-            result["criteria"].append(match.group(1).strip())
-
+        for section_name in ["acceptance_criteria", "criteria", "acceptance criteria"]:
+            if section_name in result["sections"]:
+                numbered = r"^\s*\d+\.\s*(.+)$"
+                for match in re.finditer(numbered, result["sections"][section_name], re.MULTILINE):
+                    result["criteria"].append({"text": match.group(1).strip(), "checked": False})
+                break
+    
+    # Extract context (first section or first paragraph before any ## heading)
+    first_heading = body.find("## ")
+    if first_heading > 0:
+        context_text = body[:first_heading].strip()
+    else:
+        context_text = body.split("\n\n")[0].strip()
+    # Clean up: remove "## Context" or "## Problem" header if present
+    context_text = re.sub(r"^#+\s*\w+\s*\n?", "", context_text).strip()
+    result["context"] = context_text[:500]  # Cap at 500 chars
+    
    return result


-def parse_from_url(api_url: str, token: str = None) -> dict:
-    """Parse an issue from a Gitea API URL."""
+def fetch_issue_from_url(url: str) -> Dict[str, Any]:
+    """Fetch an issue from a Gitea API URL and parse it."""
    import urllib.request
-
-    headers = {}
-    if token:
-        headers["Authorization"] = f"token {token}"
-
-    req = urllib.request.Request(api_url, headers=headers)
-    resp = json.loads(urllib.request.urlopen(req, timeout=30).read())
-
-    title = resp.get("title", "")
-    body = resp.get("body", "")
-    labels = [l["name"] for l in resp.get("labels", [])]
-
-    return parse_issue_body(body, title, labels)
+    req = urllib.request.Request(url, headers={"Accept": "application/json"})
+    with urllib.request.urlopen(req) as resp:
+        data = json.loads(resp.read())
+    
+    return parse_issue_body(
+        body=data.get("body", ""),
+        title=data.get("title", ""),
+        labels=[l["name"] for l in data.get("labels", [])]
+    )


 def main():
-    parser = argparse.ArgumentParser(description="Parse Gitea issue body into structured JSON")
-    parser.add_argument("input", nargs="?", help="Issue body file (or - for stdin)")
-    parser.add_argument("--url", help="Gitea API URL for the issue")
+    parser = argparse.ArgumentParser(description="Parse Gitea issue markdown into structured JSON")
+    parser.add_argument("file", nargs="?", help="Issue body file (or use --stdin)")
    parser.add_argument("--stdin", action="store_true", help="Read from stdin")
-    parser.add_argument("--token", help="Gitea API token (or set GITEA_TOKEN env var)")
-    parser.add_argument("--title", default="", help="Issue title (for epic ref extraction)")
+    parser.add_argument("--url", help="Gitea API URL to fetch issue from")
+    parser.add_argument("--title", default="", help="Issue title")
    parser.add_argument("--labels", nargs="*", default=[], help="Issue labels")
    parser.add_argument("--pretty", action="store_true", help="Pretty-print JSON output")
+    
    args = parser.parse_args()
-
-    import os
-    token = args.token or os.environ.get("GITEA_TOKEN")
-
+    
    if args.url:
-        result = parse_from_url(args.url, token)
-    elif args.stdin or (args.input and args.input == "-"):
+        result = fetch_issue_from_url(args.url)
+    elif args.stdin:
        body = sys.stdin.read()
        result = parse_issue_body(body, args.title, args.labels)
-    elif args.input:
-        with open(args.input) as f:
+    elif args.file:
+        with open(args.file) as f:
            body = f.read()
        result = parse_issue_body(body, args.title, args.labels)
    else:
        parser.print_help()
        sys.exit(1)
-
+    
    indent = 2 if args.pretty else None
    print(json.dumps(result, indent=indent))

--- a/scripts/test_gitea_issue_parser.py
+++ b/scripts/test_gitea_issue_parser.py
@@ -1,111 +1,109 @@
 #!/usr/bin/env python3
-"""Tests for gitea_issue_parser."""
+"""Tests for scripts/gitea_issue_parser.py"""

-import json
 import sys
 import os
+sys.path.insert(0, os.path.dirname(__file__) or ".")

-sys.path.insert(0, os.path.dirname(__file__))
-from gitea_issue_parser import parse_issue_body
+# Import from sibling
+import importlib.util
+spec = importlib.util.spec_from_file_location("parser", os.path.join(os.path.dirname(__file__) or ".", "gitea_issue_parser.py"))
+mod = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(mod)
+parse_issue_body = mod.parse_issue_body


-def test_basic_structure():
+def test_basic_parsing():
    body = """## Context
-This is the background.
+
+This is the background info.

 ## Acceptance Criteria
+
 - [ ] First criterion
- [x] Second criterion (already done)
- [ ] Third criterion
+- [x] Second criterion (done)

-## Labels
-`pipeline`, `extraction`
+## What to build
+
+Some description.
 """
-    result = parse_issue_body(body, "Test Issue", ["pipeline", "extraction"])
-    assert result["title"] == "Test Issue"
-    assert "background" in result["context"].lower()
-    assert len(result["criteria"]) == 3
-    assert "First criterion" in result["criteria"]
-    assert result["labels"] == ["pipeline", "extraction"]
-    print("PASS: test_basic_structure")
+    result = parse_issue_body(body, title="Test (#42)", labels=["bug"])
+    assert result["title"] == "Test (#42)"
+    assert result["labels"] == ["bug"]
+    assert result["epic_ref"] == 42
+    assert len(result["criteria"]) == 2
+    assert result["criteria"][0]["text"] == "First criterion"
+    assert result["criteria"][0]["checked"] == False
+    assert result["criteria"][1]["checked"] == True
+    assert "context" in result["sections"]
+    print("PASS: test_basic_parsing")


-def test_epic_ref():
-    body = "Closes #645\n\nSome description."
-    result = parse_issue_body(body, "feat: thing (#688)")
-    assert result["epic_ref"] == "#645"
-    print("PASS: test_epic_ref")
+def test_numbered_criteria():
+    body = """## Acceptance Criteria

-
-def test_epic_ref_from_title():
-    body = "Some description without close ref."
-    result = parse_issue_body(body, "feat: scene descriptions (#645)")
-    assert result["epic_ref"] == "#645"
-    print("PASS: test_epic_ref_from_title")
-
-
-def test_no_checkboxes():
-    body = """## Requirements
-1. First thing
-2. Second thing
-3. Third thing
+1. First item
+2. Second item
+3. Third item
 """
    result = parse_issue_body(body)
    assert len(result["criteria"]) == 3
-    print("PASS: test_no_checkboxes")
+    assert result["criteria"][0]["text"] == "First item"
+    print("PASS: test_numbered_criteria")
+
+
+def test_epic_ref_from_body():
+    body = "Closes #123\n\nSome description."
+    result = parse_issue_body(body)
+    assert result["epic_ref"] == 123
+    print("PASS: test_epic_ref_from_body")


 def test_empty_body():
-    result = parse_issue_body("", "Empty Issue")
-    assert result["title"] == "Empty Issue"
+    result = parse_issue_body("")
    assert result["criteria"] == []
    assert result["context"] == ""
+    assert result["sections"] == {}
    print("PASS: test_empty_body")


-def test_real_issue_format():
-    body = """Closes #681
-
-## Changes
-
-Add `#!/usr/bin/env python3` shebang to 6 Python scripts.
-
-## Verification
-
-All 6 files confirmed missing shebangs before fix.
-
-## Impact
-
-Scripts can now be executed directly.
-"""
-    result = parse_issue_body(body, "fix: add python3 shebangs (#685)")
-    assert result["epic_ref"] == "#681"
-    assert "shebang" in result["context"].lower()
-    print("PASS: test_real_issue_format")
+def test_no_sections():
+    body = "Just a plain issue body with no headings."
+    result = parse_issue_body(body)
+    assert result["context"] == "Just a plain issue body with no headings."
+    print("PASS: test_no_sections")


-def test_all_sections_captured():
-    body = """## Context
-Background info.
+def test_multiple_sections():
+    body = """## Problem

-## Acceptance Criteria
- [ ] Do thing
+Something is broken.

-## Labels
-`test`
+## Fix
+
+Do this instead.
+
+## Notes
+
+Additional info.
 """
    result = parse_issue_body(body)
-    assert "context" in result["sections"]
-    assert "acceptance criteria" in result["sections"]
-    print("PASS: test_all_sections_captured")
+    assert "problem" in result["sections"]
+    assert "fix" in result["sections"]
+    assert "notes" in result["sections"]
+    assert "Something is broken" in result["sections"]["problem"]
+    print("PASS: test_multiple_sections")
+
+
+def run_all():
+    test_basic_parsing()
+    test_numbered_criteria()
+    test_epic_ref_from_body()
+    test_empty_body()
+    test_no_sections()
+    test_multiple_sections()
+    print("\nAll 6 tests passed!")


 if __name__ == "__main__":
-    test_basic_structure()
-    test_epic_ref()
-    test_epic_ref_from_title()
-    test_no_checkboxes()
-    test_empty_body()
-    test_real_issue_format()
-    test_all_sections_captured()
-    print("\nAll tests passed.")
+    run_all()
Author	SHA1	Message	Date
Alexander Whitestone	54f3bef7fc	feat: Add parser tests (closes #177)	2026-04-15 03:50:04 +00:00
Alexander Whitestone	4fcd372de4	feat: Add Gitea issue body parser (closes #177)	2026-04-15 03:49:00 +00:00