feat: cross-repo dependency graph builder (#93)

2026-04-15 03:44:12 +00:00
2 changed files with 249 additions and 276 deletions
--- a/scripts/dependency_graph.py
+++ b/scripts/dependency_graph.py
@@ -0,0 +1,249 @@
 #!/usr/bin/env python3
 """
 Cross-Repo Dependency Graph Builder
 Scans repos for import/require/reference patterns and builds a directed
 dependency graph. Detects circular dependencies. Outputs DOT and Mermaid.
 Usage:
  python3 scripts/dependency_graph.py /path/to/repos/
  python3 scripts/dependency_graph.py --repos repo1,repo2,repo3 --format mermaid
  python3 scripts/dependency_graph.py /path/to/ --format dot --output deps.dot
 Reference contexts covered (found by matching known repo names in file text):
  - Python: import X, from X import Y
  - JavaScript: require("X"), import ... from "X"
  - Go: import "X"
  - Ansible: include_role, import_role
  - Docker/Compose: image: X, depends_on
  - Config references: repo-name in YAML/TOML/JSON
 """
 import argparse
 import json
 import os
 import re
 import sys
 from collections import defaultdict
 from pathlib import Path
 # Repositories that count as dependency targets. A scanned file "depends on"
 # one of these when its text mentions the name (or a separator variant of it).
 KNOWN_REPOS = [
    "hermes-agent",
    "timmy-config",
    "timmy-home",
    "the-nexus",
    "the-door",
    "the-beacon",
    "fleet-ops",
    "burn-fleet",
    "timmy-dispatch",
    "turboquant",
    "compounding-intelligence",
    "the-playground",
    "second-son-of-timmy",
    "ai-safety-review",
    "the-echo-pattern",
    "timmy-academy",
    "wolf",
    "the-testament",
 ]
 def normalize_repo_name(name: str) -> str:
    """Normalize a repo name for comparison.

    Surrounding whitespace is removed, the name is lower-cased, underscores
    become hyphens and a trailing ".git" suffix is dropped.

    Args:
        name: Repository name, directory name or clone URL tail.

    Returns:
        The canonical, comparison-ready form of the name.
    """
    normalized = name.strip().lower().replace("_", "-")
    # Only a *trailing* ".git" is a clone-URL artefact; removing the substring
    # anywhere would corrupt names such as "my.github-pages".
    if normalized.endswith(".git"):
        normalized = normalized[: -len(".git")]
    return normalized.strip()
 def scan_file_for_deps(filepath: str, content: str, own_repo: str) -> set:
    """Scan a file's content for references to other known repos.

    A repo counts as referenced when its name, its underscore variant
    ("hermes_agent") or its separator-free variant ("hermesagent") appears in
    the text and is not embedded in a longer alphanumeric word. Path, quoted
    and "Timmy_Foundation/<repo>" style references all contain the bare name,
    so they are covered by the same check.

    Args:
        filepath: Path of the file being scanned (not used for matching).
        content: Full text of the file.
        own_repo: Name of the repo the file belongs to; never reported.

    Returns:
        Set of entries from KNOWN_REPOS that the content refers to.
    """
    deps = set()
    own_norm = normalize_repo_name(own_repo)
    for repo in KNOWN_REPOS:
        if normalize_repo_name(repo) == own_norm:
            continue
        variants = {repo, repo.replace("-", "_"), repo.replace("-", "")}
        # Longest variant first so the alternation prefers the fullest match.
        alternation = "|".join(
            re.escape(variant)
            for variant in sorted(variants, key=len, reverse=True)
        )
        # Alphanumeric boundaries stop short names such as "wolf" from
        # matching inside unrelated words ("wolfram", "werewolf").
        pattern = rf"(?<![A-Za-z0-9])(?:{alternation})(?![A-Za-z0-9])"
        if re.search(pattern, content):
            deps.add(repo)
    return deps
 def scan_repo(repo_path: str, repo_name: str = None) -> dict:
    """Scan a repo directory for references to other known repos.

    Args:
        repo_path: Path to the repository checkout.
        repo_name: Name to report for the repo; defaults to the directory name.

    Returns:
        {"repo", "dependencies" (sorted list), "files_scanned"} on success, or
        {"error": message} when repo_path is not a directory.
    """
    path = Path(repo_path)
    if not path.is_dir():
        return {"error": f"Not a directory: {repo_path}"}
    if not repo_name:
        repo_name = path.name
    exts = {".py", ".js", ".ts", ".go", ".yaml", ".yml", ".toml", ".json",
            ".md", ".sh", ".bash", ".Dockerfile", ".tf", ".hcl"}
    skip_dirs = {".git", "node_modules", "__pycache__", ".venv", "venv",
                 "vendor", "dist", "build", ".tox"}
    deps = set()
    files_scanned = 0
    for root, dirnames, filenames in os.walk(path):
        # Prune in place so os.walk never descends into non-source dirs. Only
        # directories *inside* the repo are considered, so a checkout that
        # lives under e.g. ".../build/" is still scanned.
        dirnames[:] = [d for d in dirnames if d not in skip_dirs]
        for filename in filenames:
            fpath = Path(root) / filename
            # A plain "Dockerfile" has an empty suffix, so match it by name.
            if fpath.suffix not in exts and filename != "Dockerfile":
                continue
            if not fpath.is_file():
                continue
            try:
                content = fpath.read_text(encoding="utf-8", errors="ignore")
            except OSError:
                # Unreadable file (permissions, vanished mid-scan): skip it.
                continue
            deps.update(scan_file_for_deps(str(fpath), content, repo_name))
            files_scanned += 1
    return {
        "repo": repo_name,
        "dependencies": sorted(deps),
        "files_scanned": files_scanned,
    }
 def detect_cycles(graph: dict) -> list:
    """Detect circular dependencies using depth-first search.

    Args:
        graph: Mapping of repo name -> dict with a "dependencies" list.
            Dependencies that are not keys of the graph are treated as leaves.

    Returns:
        A list of cycles, one per back edge found. Each cycle is a list of
        repo names that starts and ends with the same name, e.g.
        ["a", "b", "a"]. Empty when the graph is acyclic.
    """
    cycles = []
    visited = set()
    on_path = set()  # nodes on the current DFS path, for O(1) membership
    path = []        # the current DFS path, in order
    def dfs(node):
        visited.add(node)
        on_path.add(node)
        path.append(node)
        for neighbor in graph.get(node, {}).get("dependencies", []):
            if neighbor in on_path:
                # Back edge: the cycle is the path from neighbor to here.
                cycles.append(path[path.index(neighbor):] + [neighbor])
            elif neighbor not in visited:
                dfs(neighbor)
        # Always unwind, even after a cycle was recorded, so stale entries
        # can never be mistaken for the active path by a later root.
        path.pop()
        on_path.discard(node)
    for node in graph:
        if node not in visited:
            dfs(node)
    return cycles
 def to_dot(graph: dict) -> str:
    """Generate Graphviz DOT output for the dependency graph.

    Args:
        graph: Mapping of repo name -> dict with a "dependencies" list.

    Returns:
        DOT source for a left-to-right digraph. Repos with more than two
        dependencies get a distinct fill colour.
    """
    # Single-quoted Python literals: the DOT attribute values need embedded
    # double quotes, and "#" must stay inside the string.
    lines = [
        "digraph dependencies {",
        "  rankdir=LR;",
        '  node [shape=box, style=filled, fillcolor="#1a1a2e", fontcolor="#e6edf3"];',
        '  edge [color="#4a4a6a"];',
        "",
    ]
    for repo, data in sorted(graph.items()):
        deps = data.get("dependencies", [])
        fill = "#2d1b69" if len(deps) > 2 else "#16213e"
        lines.append(f'  "{repo}" [fillcolor="{fill}"];')
        for dep in deps:
            lines.append(f'  "{repo}" -> "{dep}";')
    lines.append("}")
    return "\n".join(lines)
 def to_mermaid(graph: dict) -> str:
    """Generate Mermaid flowchart output for the dependency graph.

    Node ids replace "-" with "_"; every node gets a label carrying the real
    repo name.

    Args:
        graph: Mapping of repo name -> dict with a "dependencies" list.

    Returns:
        Mermaid "graph LR" source: edges first, a blank line, then labels.
    """
    lines = ["graph LR"]
    nodes = set(graph)
    for repo, data in sorted(graph.items()):
        for dep in data.get("dependencies", []):
            lines.append(f"    {repo.replace('-', '_')} --> {dep.replace('-', '_')}")
            # Dependencies that were not scanned themselves still need a
            # label, otherwise they render under their mangled id.
            nodes.add(dep)
    # Add node labels
    lines.append("")
    for name in sorted(nodes):
        lines.append(f"    {name.replace('-', '_')}[{name}]")
    return "\n".join(lines)
 def main():
    """CLI entry point: scan repos, detect cycles and emit the graph.

    Repos come from --repos (comma-separated paths) or, failing that, from
    the sub-directories of the positional repos_dir. With --cycles-only the
    process exits 1 when a cycle exists and 0 otherwise. Otherwise the graph
    is written as JSON, DOT or Mermaid to --output or stdout.
    """
    parser = argparse.ArgumentParser(description="Build cross-repo dependency graph")
    parser.add_argument("repos_dir", nargs="?", help="Directory containing repos")
    parser.add_argument("--repos", help="Comma-separated list of repo paths")
    parser.add_argument("--format", choices=["dot", "mermaid", "json"], default="json")
    parser.add_argument("--output", "-o", help="Output file (default: stdout)")
    parser.add_argument("--cycles-only", action="store_true", help="Only report cycles")
    args = parser.parse_args()
    if args.repos:
        # Drop empty entries ("a,,b" or a trailing comma): an empty path
        # would otherwise resolve to the current directory.
        repo_paths = [p.strip() for p in args.repos.split(",") if p.strip()]
    elif args.repos_dir:
        base = Path(args.repos_dir)
        if not base.is_dir():
            parser.error(f"Not a directory: {args.repos_dir}")
        # Sorted so the output order does not depend on the filesystem.
        repo_paths = sorted(
            str(p) for p in base.iterdir()
            if p.is_dir() and not p.name.startswith(".")
        )
    else:
        parser.print_help()
        sys.exit(1)
    results = {}
    for rpath in repo_paths:
        # Path(".").name is empty; fall back to the resolved directory name.
        name = Path(rpath).name or Path(rpath).resolve().name
        print(f"Scanning {name}...", file=sys.stderr)
        result = scan_repo(rpath, name)
        if "error" in result:
            print(f"Skipping {rpath}: {result['error']}", file=sys.stderr)
            continue
        results[name] = result
    # Detect cycles
    cycles = detect_cycles(results)
    if args.cycles_only:
        if cycles:
            print("CIRCULAR DEPENDENCIES DETECTED:")
            for cycle in cycles:
                print(f"  {' -> '.join(cycle)}")
            sys.exit(1)
        else:
            print("No circular dependencies found.")
            sys.exit(0)
    # Output
    if args.format == "dot":
        output = to_dot(results)
    elif args.format == "mermaid":
        output = to_mermaid(results)
    else:
        output = json.dumps({
            "repos": results,
            "cycles": cycles,
            "summary": {
                "total_repos": len(results),
                "total_deps": sum(len(r["dependencies"]) for r in results.values()),
                "cycles_found": len(cycles),
            }
        }, indent=2)
    if args.output:
        Path(args.output).write_text(output, encoding="utf-8")
        print(f"Written to {args.output}", file=sys.stderr)
    else:
        print(output)
 # Run the CLI only when this file is executed as a script, not on import.
 if __name__ == "__main__":
    main()
--- a/scripts/session_metadata.py
+++ b/scripts/session_metadata.py
@@ -1,276 +0,0 @@
 #!/usr/bin/env python3
 """
 session_metadata.py - Extract structured metadata from Hermes session transcripts.
 Works alongside session_reader.py to provide higher-level session analysis.
 """
 import json
 import re
 import sys
 from dataclasses import dataclass, asdict
 from datetime import datetime
 from pathlib import Path
 from typing import Dict, List, Optional, Any
 # Import from session_reader (the canonical reader)
 from session_reader import read_session
@dataclass
class SessionSummary:
    """Structured summary of a Hermes session transcript."""
    # Identifier of the session (the transcript file's stem when built by
    # extract_session_metadata).
    session_id: str
    # Model name taken from assistant messages, or "unknown".
    model: str
    # Repository the session refers to, or "unknown".
    repo: str
    # One of "success", "partial", "failure" or "unknown".
    outcome: str
    # Total number of transcript entries.
    message_count: int
    # Total number of tool calls across all entries.
    tool_calls: int
    # Human-readable duration such as "12m", or "unknown".
    duration_estimate: str
    # Distinct tool/function names invoked during the session.
    key_actions: List[str]
    # Distinct error messages reported by tool outputs.
    errors_encountered: List[str]
    # First and last timestamps seen in the transcript, as found.
    start_time: Optional[str] = None
    end_time: Optional[str] = None
    # Rough size estimate based on whitespace-separated word counts.
    total_tokens_estimate: int = 0
    # Per-role entry counts.
    user_messages: int = 0
    assistant_messages: int = 0
    tool_outputs: int = 0
 def _parse_session_timestamp(value) -> Optional[datetime]:
    """Parse a transcript timestamp, trying each supported format in turn.

    Returns None when the value is not a string or matches no format.
    """
    if not isinstance(value, str):
        return None
    for fmt in ("%Y-%m-%dT%H:%M:%S.%fZ", "%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%d %H:%M:%S"):
        try:
            return datetime.strptime(value, fmt)
        except ValueError:
            continue
    return None
 def extract_session_metadata(file_path: str) -> SessionSummary:
    """
    Extract structured metadata from a Hermes session JSONL transcript.
    Uses session_reader.read_session() for file reading.

    Args:
        file_path: Path to the transcript; its stem becomes the session id.

    Returns:
        A SessionSummary. When the file does not exist, a summary with
        outcome "failure" and a single "File not found" error is returned
        instead of raising.
    """
    session_id = Path(file_path).stem
    try:
        # Use the canonical reader from session_reader.py
        messages = read_session(file_path)
    except FileNotFoundError:
        return SessionSummary(
            session_id=session_id,
            model="unknown",
            repo="unknown",
            outcome="failure",
            message_count=0,
            tool_calls=0,
            duration_estimate="0m",
            key_actions=[],
            errors_encountered=[f"File not found: {file_path}"]
        )
    # Common repo patterns to look for, compiled once. "\s" must be a single
    # backslash in these raw strings: "\\s" inside a character class excludes
    # a backslash and the letter "s", which cut captured names short.
    repo_patterns = [
        re.compile(pattern, re.IGNORECASE)
        for pattern in (
            r"(?:the-nexus|compounding-intelligence|timmy-config|hermes-agent)",
            r"(?:forge\.alexanderwhitestone\.com/([^/]+/[^/\s]+))",
            r"(?:github\.com/([^/]+/[^/\s]+))",
            r"(?:Timmy_Foundation/([^/\s]+))",
        )
    ]
    model = "unknown"
    repo = "unknown"
    tool_calls_count = 0
    key_actions = []
    errors = []
    fatal_error_seen = False
    start_time = None
    end_time = None
    total_tokens = 0
    # Process messages for metadata
    for entry in messages:
        role = entry.get("role")
        # Extract model from assistant messages (last one wins)
        if role == "assistant" and entry.get("model"):
            model = entry["model"]
        # Track first and last timestamps
        ts = entry.get("timestamp")
        if ts:
            if start_time is None:
                start_time = ts
            end_time = ts
        # Count tool calls and remember the distinct function names
        tool_calls = entry.get("tool_calls")
        if tool_calls:
            tool_calls_count += len(tool_calls)
            for tc in tool_calls:
                name = tc.get("function", {}).get("name")
                if name:
                    action = f"{name}"
                    if action not in key_actions:
                        key_actions.append(action)
        # Estimate tokens from whitespace-separated word counts
        content = entry.get("content", "")
        if isinstance(content, str):
            total_tokens += len(content.split())
        elif isinstance(content, list):
            for item in content:
                if isinstance(item, dict) and isinstance(item.get("text"), str):
                    total_tokens += len(item["text"].split())
        # Look for repo mentions in content (first matching pattern wins,
        # later messages override earlier ones)
        if content:
            content_str = str(content)
            for pattern in repo_patterns:
                match = pattern.search(content_str)
                if match:
                    repo = match.group(1) if match.groups() else match.group(0)
                    break
        # Record tool errors. Long messages are truncated rather than
        # dropped, so a verbose failure still affects the outcome.
        if role == "tool" and entry.get("is_error"):
            error_msg = entry.get("content", "Unknown error")
            if isinstance(error_msg, str):
                lowered = error_msg.lower()
                if "405" in error_msg or "permission" in lowered or "authentication" in lowered:
                    fatal_error_seen = True
                errors.append(error_msg[:200])
    # Count message types
    user_messages = sum(1 for m in messages if m.get("role") == "user")
    assistant_messages = sum(1 for m in messages if m.get("role") == "assistant")
    tool_outputs = sum(1 for m in messages if m.get("role") == "tool")
    # Calculate duration estimate; each endpoint is parsed independently so
    # differing formats for start and end are both accepted.
    duration_estimate = "unknown"
    if start_time and end_time:
        start_dt = _parse_session_timestamp(start_time)
        end_dt = _parse_session_timestamp(end_time)
        if start_dt is not None and end_dt is not None:
            minutes = (end_dt - start_dt).total_seconds() / 60
            duration_estimate = f"{minutes:.0f}m"
    # Classify outcome
    outcome = "unknown"
    if errors:
        outcome = "failure" if fatal_error_seen else "partial"
    elif messages:
        # Check last message for success indicators
        last_msg = messages[-1]
        if last_msg.get("role") == "assistant":
            last_content = last_msg.get("content", "")
            if isinstance(last_content, str):
                lowered = last_content.lower()
                success_indicators = ["done", "completed", "success", "merged", "pushed"]
                if any(indicator in lowered for indicator in success_indicators):
                    outcome = "success"
    # key_actions is already unique; keep the first 10
    unique_actions = key_actions[:10]
    # Deduplicate errors preserving order; keep the first 5
    unique_errors = list(dict.fromkeys(errors))[:5]
    return SessionSummary(
        session_id=session_id,
        model=model,
        repo=repo,
        outcome=outcome,
        message_count=len(messages),
        tool_calls=tool_calls_count,
        duration_estimate=duration_estimate,
        key_actions=unique_actions,
        errors_encountered=unique_errors,
        start_time=start_time,
        end_time=end_time,
        total_tokens_estimate=total_tokens,
        user_messages=user_messages,
        assistant_messages=assistant_messages,
        tool_outputs=tool_outputs
    )
 def process_session_directory(directory_path: str, output_file: Optional[str] = None) -> List[SessionSummary]:
    """
    Summarise every ``*.jsonl`` transcript directly inside a directory.

    Files are handled in sorted order, with progress reported on stderr.
    When ``output_file`` is given, the summaries are also written there as
    a JSON array. Returns the list of summaries, or an empty list when the
    directory is missing or holds no JSONL files.
    """
    root = Path(directory_path)
    if not root.exists():
        print(f"Error: Directory {directory_path} does not exist", file=sys.stderr)
        return []
    transcripts = sorted(root.glob("*.jsonl"))
    if not transcripts:
        print(f"Warning: No JSONL files found in {directory_path}", file=sys.stderr)
        return []
    summaries = []
    for transcript in transcripts:
        print(f"Processing {transcript.name}...", file=sys.stderr)
        summaries.append(extract_session_metadata(str(transcript)))
    if not output_file:
        return summaries
    with open(output_file, 'w', encoding='utf-8') as handle:
        json.dump([asdict(item) for item in summaries], handle, indent=2)
    print(f"Wrote {len(summaries)} summaries to {output_file}", file=sys.stderr)
    return summaries
 def main():
    """Command-line driver: summarise one transcript file or a directory of them."""
    import argparse
    cli = argparse.ArgumentParser(description="Extract metadata from Hermes session JSONL transcripts")
    cli.add_argument("path", help="Path to JSONL file or directory of session files")
    cli.add_argument("-o", "--output", help="Output JSON file (default: stdout)")
    cli.add_argument("-v", "--verbose", action="store_true", help="Verbose output")
    args = cli.parse_args()
    target = Path(args.path)
    # Neither a file nor a directory: report and bail out early.
    if not (target.is_file() or target.is_dir()):
        print(f"Error: {args.path} is not a file or directory", file=sys.stderr)
        sys.exit(1)
    if target.is_file():
        summary = extract_session_metadata(str(target))
        if not args.output:
            print(json.dumps(asdict(summary), indent=2))
            return
        with open(args.output, 'w') as handle:
            json.dump(asdict(summary), handle, indent=2)
        print(f"Wrote summary to {args.output}", file=sys.stderr)
        return
    # Directory: the helper writes the output file itself when one is given.
    summaries = process_session_directory(str(target), args.output)
    if not args.output:
        print(json.dumps([asdict(item) for item in summaries], indent=2))
 # Run the CLI only when this file is executed as a script, not on import.
 if __name__ == "__main__":
    main()