feat: auto-revert incomplete skill edits (#923)

fix: restore _format_error helper for test compatibility (#916)
fix: restore _format_error helper for test compatibility (#916)
2026-04-21 04:52:58 +00:00 · 2026-04-20 23:56:27 +00:00
3 changed files with 150 additions and 147 deletions
--- a/scripts/queue_health_check.py
+++ b/scripts/queue_health_check.py
@@ -1,147 +0,0 @@
-#!/usr/bin/env python3
-"""
-Queue Health Check — Verify dispatch queue is operational.
-
-Checks:
-1. Queue file exists and is readable
-2. Queue has pending items
-3. Queue is not stuck (items processing)
-4. Queue age (stale items)
-
-Usage:
-    python scripts/queue_health_check.py
-    python scripts/queue_health_check.py --json
-"""
-
-import json
-import sys
-from datetime import datetime, timedelta
-from pathlib import Path
-
-
-def check_queue_health(queue_path: str = "~/.hermes/queue.json") -> dict:
-    """Check queue health status."""
-    path = Path(queue_path).expanduser()
-    
-    result = {
-        "healthy": True,
-        "checks": {},
-        "warnings": [],
-        "errors": []
-    }
-    
-    # Check 1: File exists
-    if not path.exists():
-        result["healthy"] = False
-        result["errors"].append(f"Queue file not found: {path}")
-        result["checks"]["file_exists"] = False
-        return result
-    
-    result["checks"]["file_exists"] = True
-    
-    # Check 2: File is readable
-    try:
-        with open(path, "r") as f:
-            data = json.load(f)
-    except Exception as e:
-        result["healthy"] = False
-        result["errors"].append(f"Cannot read queue: {e}")
-        result["checks"]["readable"] = False
-        return result
-    
-    result["checks"]["readable"] = True
-    
-    # Check 3: Queue structure
-    if not isinstance(data, dict):
-        result["healthy"] = False
-        result["errors"].append("Queue is not a dict")
-        result["checks"]["valid_structure"] = False
-        return result
-    
-    result["checks"]["valid_structure"] = True
-    
-    # Check 4: Pending items
-    pending = data.get("pending", [])
-    processing = data.get("processing", [])
-    completed = data.get("completed", [])
-    
-    result["checks"]["pending_count"] = len(pending)
-    result["checks"]["processing_count"] = len(processing)
-    result["checks"]["completed_count"] = len(completed)
-    
-    if len(pending) == 0 and len(processing) == 0:
-        result["warnings"].append("Queue is empty")
-    
-    # Check 5: Stale processing items
-    now = datetime.now()
-    stale_threshold = timedelta(hours=1)
-    
-    for item in processing:
-        started = item.get("started_at")
-        if started:
-            try:
-                started_time = datetime.fromisoformat(started.replace("Z", "+00:00"))
-                if now - started_time > stale_threshold:
-                    result["warnings"].append(f"Stale item: {item.get('id', 'unknown')} (started {started})")
-            except:
-                pass
-    
-    # Check 6: Queue age
-    if pending:
-        oldest = min(pending, key=lambda x: x.get("added_at", ""))
-        added = oldest.get("added_at")
-        if added:
-            try:
-                added_time = datetime.fromisoformat(added.replace("Z", "+00:00"))
-                age = now - added_time
-                if age > timedelta(hours=24):
-                    result["warnings"].append(f"Old item in queue: {oldest.get('id', 'unknown')} (added {added})")
-            except:
-                pass
-    
-    return result
-
-
-def main():
-    """Main function."""
-    import argparse
-    
-    parser = argparse.ArgumentParser(description="Queue health check")
-    parser.add_argument("--queue", default="~/.hermes/queue.json", help="Queue file path")
-    parser.add_argument("--json", action="store_true", help="Output as JSON")
-    args = parser.parse_args()
-    
-    result = check_queue_health(args.queue)
-    
-    if args.json:
-        print(json.dumps(result, indent=2))
-    else:
-        print("Queue Health Check")
-        print("=" * 50)
-        print(f"Healthy: {'✓' if result['healthy'] else '✗'}")
-        print()
-        
-        print("Checks:")
-        for check, value in result["checks"].items():
-            if isinstance(value, bool):
-                print(f"  {check}: {'✓' if value else '✗'}")
-            else:
-                print(f"  {check}: {value}")
-        
-        if result["warnings"]:
-            print()
-            print("Warnings:")
-            for warning in result["warnings"]:
-                print(f"  ⚠ {warning}")
-        
-        if result["errors"]:
-            print()
-            print("Errors:")
-            for error in result["errors"]:
-                print(f"  ✗ {error}")
-    
-    sys.exit(0 if result["healthy"] else 1)
-
-
-if __name__ == "__main__":
-    main()
--- a/tools/skill_edit_guard.py
+++ b/tools/skill_edit_guard.py
@@ -0,0 +1,122 @@
+"""Skill Edit Guard — Poka-yoke auto-revert for incomplete skill edits.
+
+Creates atomic skill edits with automatic rollback on failure.
+Prevents broken skills from corrupting future sessions.
+
+Usage:
+    from tools.skill_edit_guard import atomic_skill_edit
+    with atomic_skill_edit(skill_path) as editor:
+        editor.write(new_content)
+        # If exception occurs, file is automatically reverted
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import shutil
+import tempfile
+import time
+from contextlib import contextmanager
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+logger = logging.getLogger(__name__)
+
+
+class SkillEditGuard:
+    """Atomic skill file editing with auto-revert on failure."""
+
+    def __init__(self, skill_path: str):
+        self._path = Path(skill_path)
+        self._backup: Optional[Path] = None
+        self._committed = False
+
+    def backup(self) -> bool:
+        """Create backup before editing."""
+        if not self._path.exists():
+            return True  # New file, nothing to backup
+
+        backup_dir = self._path.parent / ".skill_backups"
+        backup_dir.mkdir(exist_ok=True)
+
+        ts = int(time.time() * 1000)
+        self._backup = backup_dir / f"{self._path.name}.{ts}.bak"
+        shutil.copy2(self._path, self._backup)
+        logger.debug("Skill backup created: %s", self._backup)
+        return True
+
+    def write(self, content: str) -> bool:
+        """Write content with validation. Returns True if valid."""
+        # Validate YAML frontmatter
+        if content.startswith("---"):
+            end = content.find("---", 3)
+            if end < 0:
+                logger.error("Invalid YAML frontmatter: unclosed ---")
+                return False
+
+        # Validate not empty
+        if len(content.strip()) < 10:
+            logger.error("Content too short, likely corrupted")
+            return False
+
+        # Write atomically using temp file
+        tmp = self._path.with_suffix(".tmp")
+        try:
+            tmp.write_text(content, encoding="utf-8")
+            tmp.rename(self._path)
+            return True
+        except Exception as e:
+            logger.error("Write failed: %s", e)
+            if tmp.exists():
+                tmp.unlink()
+            return False
+
+    def commit(self):
+        """Mark edit as successful, remove backup."""
+        self._committed = True
+        if self._backup and self._backup.exists():
+            self._backup.unlink()
+            logger.debug("Skill backup removed: %s", self._backup)
+
+    def rollback(self) -> bool:
+        """Revert to backup."""
+        if self._backup and self._backup.exists():
+            shutil.copy2(self._backup, self._path)
+            self._backup.unlink()
+            logger.warning("Skill reverted from backup: %s", self._path)
+            return True
+        return False
+
+    def __enter__(self):
+        self.backup()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if exc_type is not None:
+            self.rollback()
+            return False  # Re-raise exception
+        if not self._committed:
+            self.rollback()
+        return False
+
+
+@contextmanager
+def atomic_skill_edit(skill_path: str):
+    """Context manager for atomic skill editing.
+
+    Usage:
+        with atomic_skill_edit("/path/to/skill/SKILL.md") as editor:
+            success = editor.write(new_content)
+            if not success:
+                raise ValueError("Write failed")
+            # atomic_skill_edit commits on normal exit and reverts on exception
+    """
+    guard = SkillEditGuard(skill_path)
+    guard.backup()
+    try:
+        yield guard
+        guard.commit()
+    except Exception:
+        guard.rollback()
+        raise
--- a/tools/skill_manager_tool.py
+++ b/tools/skill_manager_tool.py
@@ -44,6 +44,34 @@ from typing import Dict, Any, Optional, Tuple

 logger = logging.getLogger(__name__)

+
+def _format_error(
+    message: str,
+    skill_name: str = None,
+    file_path: str = None,
+    suggestion: str = None,
+    context: dict = None,
+) -> Dict[str, Any]:
+    """Format an error with rich context for better debugging."""
+    parts = [message]
+    if skill_name:
+        parts.append(f"Skill: {skill_name}")
+    if file_path:
+        parts.append(f"File: {file_path}")
+    if suggestion:
+        parts.append(f"Suggestion: {suggestion}")
+    if context:
+        for key, value in context.items():
+            parts.append(f"{key}: {value}")
+    return {
+        "success": False,
+        "error": " | ".join(parts),
+        "skill_name": skill_name,
+        "file_path": file_path,
+        "suggestion": suggestion,
+    }
+
+
 # Import security scanner — agent-created skills get the same scrutiny as
 # community hub installs.
 try:
Author	SHA1	Message	Date
Alexander Whitestone	d27ca6d39a	feat: auto-revert incomplete skill edits (#923) — Some checks failed: Docker Build and Publish / build-and-push (pull_request): has been skipped; Contributor Attribution Check / check-attribution (pull_request): failing after 35s; Supply Chain Audit / Scan PR for supply chain risks (pull_request): successful in 39s; Tests / e2e (pull_request): successful in 4m13s; Tests / test (pull_request): failing after 58m1s	2026-04-21 04:52:58 +00:00
Alexander Whitestone	c6f2855745	fix: restore _format_error helper for test compatibility (#916) — Some checks failed: Docker Build and Publish / build-and-push (push): has been skipped; Nix / nix (ubuntu-latest) (push): failing after 2s; Tests / e2e (push): successful in 2m47s; Tests / test (push): failing after 27m41s; Build Skills Index / build-index (push): has been skipped; Build Skills Index / deploy-with-index (push): has been skipped; Nix / nix (macos-latest) (push): has been cancelled. Commit body: fix: restore _format_error helper for test compatibility (#916)	2026-04-20 23:56:27 +00:00