#!/usr/bin/env python3
"""
validate_soul.py — SOUL.md validator

Checks that a SOUL.md file conforms to the framework defined in
docs/soul/SOUL_TEMPLATE.md and docs/soul/AUTHORING_GUIDE.md.

Usage:
    python scripts/validate_soul.py <path> [<path> ...]
    python scripts/validate_soul.py docs/soul/extensions/seer.md
    python scripts/validate_soul.py memory/self/soul.md

Exit codes:
    0 — valid
    1 — validation errors found (also returned when no path is given)
"""

from __future__ import annotations

import re
import sys
from dataclasses import dataclass, field
from pathlib import Path

# ---------------------------------------------------------------------------
# Required sections (H2 headings that must be present)
# ---------------------------------------------------------------------------

REQUIRED_SECTIONS = [
    "Identity",
    "Prime Directive",
    "Values",
    "Audience Awareness",
    "Constraints",
    "Changelog",
]

# Sections required only for sub-agents (those with 'extends' in frontmatter)
EXTENSION_ONLY_SECTIONS = [
    "Role Extension",
]

# ---------------------------------------------------------------------------
# Contradiction detection — pairs of phrases that are likely contradictory
# if both appear in the same document.
# ---------------------------------------------------------------------------

CONTRADICTION_PAIRS: list[tuple[str, str]] = [
    # honesty vs deception
    (r"\bnever deceive\b", r"\bdeceive the user\b"),
    (r"\bnever fabricate\b", r"\bfabricate\b.*\bwhen needed\b"),
    # refusal patterns
    (r"\bnever refuse\b", r"\bwill not\b"),
    # data handling
    (r"\bnever store.*credentials\b", r"\bstore.*credentials\b.*\bwhen\b"),
    (r"\bnever exfiltrate\b", r"\bexfiltrate.*\bif authorized\b"),
    # autonomy
    (r"\bask.*before.*executing\b", r"\bexecute.*without.*asking\b"),
]

# ---------------------------------------------------------------------------
# Semver pattern
# ---------------------------------------------------------------------------

SEMVER_PATTERN = re.compile(r"^\d+\.\d+\.\d+$")

# ---------------------------------------------------------------------------
# Frontmatter fields that must be present and non-empty
# ---------------------------------------------------------------------------

REQUIRED_FRONTMATTER_FIELDS = [
    "soul_version",
    "agent_name",
    "created",
    "updated",
]

# Markdown table separator row, e.g. "|---|---|" or "| :--- | ---: |".
_TABLE_SEPARATOR_RE = re.compile(r"^\|?\s*:?-+:?\s*(\|\s*:?-+:?\s*)*\|?$")


# ---------------------------------------------------------------------------
# Data structures
# ---------------------------------------------------------------------------


@dataclass
class ValidationResult:
    """Errors and warnings collected while validating one file."""

    path: Path
    errors: list[str] = field(default_factory=list)
    warnings: list[str] = field(default_factory=list)

    @property
    def is_valid(self) -> bool:
        """True when no errors were recorded (warnings do not count)."""
        return not self.errors

    def error(self, msg: str) -> None:
        """Record a validation error."""
        self.errors.append(msg)

    def warn(self, msg: str) -> None:
        """Record a non-fatal warning."""
        self.warnings.append(msg)


# ---------------------------------------------------------------------------
# Parsing helpers
# ---------------------------------------------------------------------------


def _extract_frontmatter(text: str) -> dict[str, str]:
    """Extract YAML-style frontmatter between --- delimiters.

    Only flat ``key: value`` lines are understood. Comment lines (starting
    with ``#``) are skipped, and surrounding single or double quotes are
    removed from values. Returns an empty dict when no frontmatter block
    opens the document.
    """
    match = re.match(r"^---\n(.*?)\n---", text, re.DOTALL)
    if not match:
        return {}
    fm: dict[str, str] = {}
    for line in match.group(1).splitlines():
        if line.lstrip().startswith("#") or ":" not in line:
            continue
        key, _, value = line.partition(":")
        # Accept both quoting styles so '1.0.0' and "1.0.0" compare equal.
        fm[key.strip()] = value.strip().strip("\"'")
    return fm


def _extract_sections(text: str) -> set[str]:
    """Return the set of H2 section names found in the document."""
    return {
        m.group(1).strip()
        for m in re.finditer(r"^## (.+)$", text, re.MULTILINE)
    }


def _body_text(text: str) -> str:
    """Return document text without frontmatter block."""
    return re.sub(r"^---\n.*?\n---\n?", "", text, flags=re.DOTALL)


def _section_body(text: str, heading: str) -> str | None:
    """Return the text under the H2 *heading*, or None if it is absent.

    The heading must sit at the start of a line (so ``### Values`` is not
    mistaken for ``## Values``) and may carry trailing whitespace, matching
    what ``_extract_sections`` accepts. The body ends at the next H2 heading
    or at the end of the document.
    """
    pattern = (
        rf"^## [ \t]*{re.escape(heading)}[ \t]*(?:\n|\Z)"
        r"(.*?)(?=^## |\Z)"
    )
    match = re.search(pattern, _body_text(text), re.DOTALL | re.MULTILINE)
    return match.group(1) if match else None


def _has_independent_match(pattern_a: str, pattern_b: str, text: str) -> bool:
    """Return True if both patterns match and some B match is outside every A match.

    Several pairs share wording ("never deceive the user" satisfies both
    "never deceive" and "deceive the user"), so a B match that merely
    overlaps an A match is the same sentence, not a contradiction.
    """
    spans_a = [m.span() for m in re.finditer(pattern_a, text)]
    if not spans_a:
        return False
    compiled_b = re.compile(pattern_b)
    pos = 0
    while True:
        match_b = compiled_b.search(text, pos)
        if match_b is None:
            return False
        b_start, b_end = match_b.span()
        if all(b_end <= a_start or b_start >= a_end for a_start, a_end in spans_a):
            return True
        # Retry from the next character so that a later occurrence swallowed
        # by a greedy ".*" in this match is still considered on its own.
        pos = b_start + 1


# ---------------------------------------------------------------------------
# Validation steps
# ---------------------------------------------------------------------------


def _check_frontmatter(text: str, result: ValidationResult) -> dict[str, str]:
    """Validate required frontmatter fields and the semver version.

    Returns the parsed frontmatter (empty when none was found) so later
    checks can inspect it.
    """
    fm = _extract_frontmatter(text)
    if not fm:
        result.error("No frontmatter found. Add a --- block at the top.")
        return fm

    for field_name in REQUIRED_FRONTMATTER_FIELDS:
        if field_name not in fm:
            result.error(f"Frontmatter missing required field: {field_name!r}")
        elif fm[field_name] in ("", "YYYY-MM-DD"):
            result.error(
                f"Frontmatter field {field_name!r} is empty or still a placeholder."
            )

    version = fm.get("soul_version", "")
    if version and not SEMVER_PATTERN.match(version):
        result.error(
            f"soul_version {version!r} is not valid semver (expected MAJOR.MINOR.PATCH)."
        )

    return fm


def _check_required_sections(
    text: str, fm: dict[str, str], result: ValidationResult
) -> None:
    """Report missing required sections; warn on missing extension sections."""
    sections = _extract_sections(text)

    for section in REQUIRED_SECTIONS:
        if section not in sections:
            result.error(f"Required section missing: ## {section}")

    # A soul with an 'extends' key in its frontmatter is a sub-agent.
    if "extends" in fm:
        for section in EXTENSION_ONLY_SECTIONS:
            if section not in sections:
                result.warn(
                    f"Sub-agent soul is missing recommended section: ## {section}"
                )


def _check_values_section(text: str, result: ValidationResult) -> None:
    """Check that values section contains at least 3 numbered items."""
    values_text = _section_body(text, "Values")
    if values_text is None:
        return  # Already reported as missing section

    count = len(re.findall(r"^\d+\.", values_text, re.MULTILINE))

    if count < 3:
        result.error(
            f"Values section has {count} item(s); minimum is 3. "
            "Values must be numbered (1. 2. 3. ...)"
        )
    if count > 8:
        result.warn(
            f"Values section has {count} items; recommended maximum is 8. "
            "Consider consolidating."
        )


def _check_constraints_section(text: str, result: ValidationResult) -> None:
    """Check that constraints section contains at least 3 bullet points."""
    constraints_text = _section_body(text, "Constraints")
    if constraints_text is None:
        return  # Already reported as missing section

    bullets = re.findall(r"^- \*\*Never\*\*", constraints_text, re.MULTILINE)
    if len(bullets) < 3:
        result.error(
            f"Constraints section has {len(bullets)} 'Never' constraint(s); "
            "minimum is 3. Constraints must start with '- **Never**'."
        )


def _check_changelog(text: str, result: ValidationResult) -> None:
    """Check that changelog has at least one entry row."""
    changelog_text = _section_body(text, "Changelog")
    if changelog_text is None:
        return  # Already reported as missing section

    entry_count = 0
    for raw_line in changelog_text.splitlines():
        line = raw_line.strip()
        # A four-column row (version | date | author | summary) has at least
        # three "|" delimiters, with or without the outer pipes.
        if line.count("|") < 3:
            continue
        if _TABLE_SEPARATOR_RE.match(line):
            continue
        # Only the first cell identifies the header row; "Version" elsewhere
        # (e.g. in a summary) must not hide a real entry.
        first_cell = line.strip("|").split("|")[0].strip().lower()
        if "version" in first_cell:
            continue
        entry_count += 1

    if not entry_count:
        result.error("Changelog table has no entries. Add at least one row.")


def _check_contradictions(text: str, result: ValidationResult) -> None:
    """Heuristic check for contradictory directive pairs."""
    lower = text.lower()
    for pattern_a, pattern_b in CONTRADICTION_PAIRS:
        if _has_independent_match(pattern_a, pattern_b, lower):
            result.warn(
                f"Possible contradiction detected: "
                f"'{pattern_a}' and '{pattern_b}' both appear in the document. "
                "Review for conflicting directives."
            )


def _check_placeholders(text: str, result: ValidationResult) -> None:
    """Check for unfilled template placeholders."""
    placeholders = re.findall(r"<[A-Z][A-Za-z ]+>", text)
    # Sorted so repeated runs report placeholders in the same order.
    for ph in sorted(set(placeholders)):
        result.error(f"Unfilled placeholder found: {ph}")


# ---------------------------------------------------------------------------
# Main validator
# ---------------------------------------------------------------------------


def validate(path: Path) -> ValidationResult:
    """Run every check against *path* and return the collected result.

    A missing path, a path that is not a regular file, or a file that cannot
    be read or decoded is reported as a single error instead of raising.
    """
    result = ValidationResult(path=path)

    if not path.exists():
        result.error(f"File not found: {path}")
        return result
    if not path.is_file():
        result.error(f"Not a file: {path}")
        return result

    try:
        # utf-8-sig drops a leading BOM, which would otherwise stop the
        # frontmatter from being recognised at the start of the text.
        text = path.read_text(encoding="utf-8-sig")
    except (OSError, UnicodeDecodeError) as exc:
        result.error(f"Could not read file: {exc}")
        return result

    fm = _check_frontmatter(text, result)
    _check_required_sections(text, fm, result)
    _check_values_section(text, result)
    _check_constraints_section(text, result)
    _check_changelog(text, result)
    _check_contradictions(text, result)
    _check_placeholders(text, result)

    return result


def _print_result(result: ValidationResult) -> None:
    """Print a PASS/WARN/FAIL line followed by any errors and warnings."""
    path_str = str(result.path)
    if not result.is_valid:
        status = "FAIL"
    elif result.warnings:
        status = "WARN"
    else:
        status = "PASS"
    print(f"[{status}] {path_str}")

    for err in result.errors:
        print(f"  ERROR: {err}")
    for warn in result.warnings:
        print(f"  WARN:  {warn}")


# ---------------------------------------------------------------------------
# CLI entry point
# ---------------------------------------------------------------------------


def main() -> int:
    """Validate every path on the command line; return the process exit code."""
    if len(sys.argv) < 2:
        print("Usage: python scripts/validate_soul.py <path> [<path> ...]")
        print()
        print("Examples:")
        print("  python scripts/validate_soul.py memory/self/soul.md")
        print("  python scripts/validate_soul.py docs/soul/extensions/seer.md")
        print("  python scripts/validate_soul.py docs/soul/extensions/*.md")
        return 1

    results = [validate(Path(arg)) for arg in sys.argv[1:]]

    for r in results:
        _print_result(r)

    if len(results) > 1:
        passed = sum(1 for r in results if r.is_valid)
        print(f"\n{passed}/{len(results)} soul files passed validation.")

    return 1 if any(not r.is_valid for r in results) else 0


if __name__ == "__main__":
    sys.exit(main())