# timmy-home/scripts/sherlock_osint.py

#!/usr/bin/env python3
"""
Sherlock username OSINT wrapper — evaluation spike for timmy-home.

This script provides an opt-in, local-first integration of the Sherlock
username enumeration tool (https://github.com/sherlock-project/sherlock).

Design constraints (from SOUL.md + issue #873):
- Sovereignty: runs locally, no cloud dependencies
- Opt-in: requires explicit --confirm flag; never runs silently
- Honesty: clearly reports Sherlock availability and result confidence
- Output: normalized JSON to stdout for downstream consumption

Usage:
  python3 scripts/sherlock_osint.py <username> --confirm
  python3 scripts/sherlock_osint.py <username> --confirm --sherlock-path ~/sherlock
  python3 scripts/sherlock_osint.py <username> --confirm --dry-run

Arguments:
  username        Target username to query (public-handle only; see ethics)

Flags:
  --confirm       Explicit opt-in acknowledgement (required)
  --sherlock-path Path to cloned sherlock repo (default: ~/sherlock)
  --dry-run       Validate setup without making network requests
  (no --json)     Output is always JSON on stdout; the parser defines no --json flag
  --timeout       Request timeout in seconds (default: 10)

Output (stdout):
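  JSON object: {"username": str, "timestamp": float, "dry_run": bool,
                 "sherlock_available": bool, "error": Optional[str],
                 "results": [{"site": str, "url": str, "status": "found"|"not_found"|"error"}],
                 "meta": {"sites_available": int, "results_returned": int}}
  "meta" is present only on successful and dry-run output; dry-run output
  additionally carries "note": "simulated" inside "meta".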

Exit codes:
  0  Success (even if zero results found)
  1  Sherlock module not available or missing --confirm
  2  Runtime error during search
"""

import argparse
import inspect
import json
import sys
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple


def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse the wrapper's command-line arguments.

    Args:
        argv: Argument strings to parse, excluding the program name. When
            ``None`` (the default) argparse reads ``sys.argv[1:]``, so
            existing ``parse_args()`` callers behave exactly as before.

    Returns:
        Namespace with ``username``, ``confirm``, ``sherlock_path``,
        ``dry_run`` and ``timeout`` attributes.

    Raises:
        SystemExit: Raised by argparse for ``--help`` or invalid arguments,
            including a ``--timeout`` that is not a positive integer.
    """

    def positive_int(raw: str) -> int:
        """Convert *raw* to an int, rejecting values below 1."""
        try:
            value = int(raw)
        except ValueError:
            raise argparse.ArgumentTypeError(f"invalid int value: {raw!r}") from None
        if value < 1:
            # A zero or negative timeout can never produce a usable request,
            # so fail at parse time instead of on every site.
            raise argparse.ArgumentTypeError(
                f"timeout must be a positive integer, got {value}"
            )
        return value

    parser = argparse.ArgumentParser(
        description="Sherlock username OSINT wrapper — evaluation spike",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Ethics boundary:
  Use only for legitimate research on publicly available usernames.
  Do not use for harassment, stalking, or doxxing. Results require
  manual human verification — a "found" result means a public profile
  with that username exists, not that it belongs to your target.
        """
    )
    parser.add_argument("username", help="Public username to search (public-handle only)")
    parser.add_argument("--confirm", action="store_true",
                        help="Explicit opt-in: acknowledge ethical boundaries and network impact")
    parser.add_argument("--sherlock-path", type=Path, default=Path.home() / "sherlock",
                        help="Path to cloned sherlock-project/sherlock repo")
    parser.add_argument("--dry-run", action="store_true",
                        help="Validate setup without making network requests")
    parser.add_argument("--timeout", type=positive_int, default=10,
                        help="HTTP request timeout in seconds")
    return parser.parse_args(argv)


def import_sherlock(sherlock_path: Path) -> Tuple[Optional[Any], Optional[str]]:
    """Load ``sherlock_project/sherlock.py`` from a local Sherlock clone.

    On success the clone root is left on ``sys.path`` and the module is
    registered as ``sys.modules["sherlock"]``. On failure both changes are
    rolled back so a half-initialised module is never left behind.

    Args:
        sherlock_path: Root directory of the cloned Sherlock repository.

    Returns:
        ``(module, None)`` on success, or ``(None, error_message)`` when the
        file is missing or the import raises.
    """
    import importlib.util

    # Expected location: sherlock_path/sherlock_project/sherlock.py
    candidate = sherlock_path / "sherlock_project" / "sherlock.py"
    if not candidate.exists():
        return None, f"sherlock.py not found at {candidate}"

    # The search step imports `sherlock_project.notify` by package name, so
    # the clone root has to be importable. Presumably sherlock.py itself also
    # uses absolute `sherlock_project.*` imports — verify against the clone.
    repo_root = str(sherlock_path)
    added_to_path = repo_root not in sys.path
    if added_to_path:
        sys.path.insert(0, repo_root)
    previous = sys.modules.get("sherlock")

    try:
        spec = importlib.util.spec_from_file_location("sherlock", candidate)
        if spec is None or spec.loader is None:
            raise ImportError(f"no import spec could be built for {candidate}")
        sherlock_mod = importlib.util.module_from_spec(spec)
        sys.modules["sherlock"] = sherlock_mod
        spec.loader.exec_module(sherlock_mod)
    except Exception as e:
        # Boundary handler: importing third-party code can raise anything.
        # Undo the interpreter-state changes before reporting the failure.
        if previous is None:
            sys.modules.pop("sherlock", None)
        else:
            sys.modules["sherlock"] = previous
        if added_to_path and repo_root in sys.path:
            sys.path.remove(repo_root)
        return None, f"Failed to import sherlock module: {e}"
    return sherlock_mod, None


def load_site_data(sherlock_path: Path) -> Tuple[Optional[Dict[str, Any]], Optional[str]]:
    """Load Sherlock's site definitions from ``data.json``.

    Args:
        sherlock_path: Root directory of the cloned Sherlock repository.

    Returns:
        ``(site_data, None)`` on success, where ``site_data`` maps each site
        name to its definition dict, or ``(None, error_message)`` when the
        file is missing, unreadable, not valid JSON, or not a JSON object.
    """
    data_path = sherlock_path / "sherlock_project" / "resources" / "data.json"
    if not data_path.exists():
        return None, f"data.json not found at {data_path}"
    try:
        # Explicit UTF-8 so the result does not depend on the platform's
        # default locale encoding.
        with open(data_path, encoding="utf-8") as f:
            raw = json.load(f)
    except json.JSONDecodeError as e:
        return None, f"Invalid JSON in data.json: {e}"
    except (OSError, UnicodeDecodeError) as e:
        return None, f"Could not read data.json: {e}"

    if not isinstance(raw, dict):
        return None, f"data.json must contain a JSON object, got {type(raw).__name__}"

    # Keep only entries shaped like site definitions. Upstream data.json is
    # believed to carry a top-level "$schema" string, which is metadata and
    # not a site — TODO confirm against the cloned version.
    return {name: info for name, info in raw.items() if isinstance(info, dict)}, None


# Status labels (lower-cased) that count as a positive hit or as a failed check.
# "claimed", "unknown" and "waf" are believed to be Sherlock's QueryStatus
# values — TODO confirm against the cloned version.
_FOUND_LABELS = frozenset({"found", "claimed"})
_ERROR_LABELS = frozenset({"error", "unknown", "waf"})


def _supported_kwargs(func: Any, candidates: Dict[str, Any]) -> Dict[str, Any]:
    """Return the subset of *candidates* that *func* accepts as keywords."""
    try:
        params = list(inspect.signature(func).parameters.values())
    except (TypeError, ValueError):
        # No introspectable signature: pass everything and let the call decide.
        return dict(candidates)
    if any(p.kind is inspect.Parameter.VAR_KEYWORD for p in params):
        return dict(candidates)
    keyword_kinds = (
        inspect.Parameter.POSITIONAL_OR_KEYWORD,
        inspect.Parameter.KEYWORD_ONLY,
    )
    accepted = {p.name for p in params if p.kind in keyword_kinds}
    return {key: value for key, value in candidates.items() if key in accepted}


def _status_label(raw_status: Any) -> str:
    """Reduce a status given as a string, enum, or result object to lower-case text."""
    inner = getattr(raw_status, "status", raw_status)  # result object -> its status
    value = getattr(inner, "value", inner)             # enum member -> its value
    return "" if value is None else str(value).strip().lower()


def _normalize_entry(site_name: str, info: Dict[str, Any]) -> Dict[str, str]:
    """Convert one raw per-site result into the wrapper's output schema."""
    label = _status_label(info.get("status"))
    if info.get("error") or label in _ERROR_LABELS:
        status = "error"
    elif label in _FOUND_LABELS:
        status = "found"
    else:
        status = "not_found"
    # Accept the "url" key as well as "url_user" (presumably what Sherlock emits).
    url = info.get("url") or info.get("url_user") or ""
    return {"site": site_name, "url": str(url), "status": status}


def run_sherlock_search(
    sherlock_mod: Any,
    username: str,
    site_data: Dict,
    timeout: int,
    dry_run: bool
) -> List[Dict[str, str]]:
    """Execute a Sherlock search and return normalized results.

    Normalized result schema per entry:
      {"site": <site_name>, "url": <profile_url_or_empty>, "status": "found|not_found|error"}

    Args:
        sherlock_mod: Loaded Sherlock module exposing a ``sherlock`` callable.
            Not used when ``dry_run`` is true.
        username: Username to look up.
        site_data: Mapping of site name to site definition.
        timeout: Per-request timeout in seconds; forwarded only when the
            ``sherlock`` callable accepts a ``timeout`` keyword.
        dry_run: When true, return a fixed two-entry sample and skip both the
            ``sherlock_project`` import and the search.

    Returns:
        One normalized dict per site reported by Sherlock.

    Raises:
        ImportError: If ``sherlock_project.notify`` cannot be imported.
        Exception: Anything raised by the ``sherlock`` callable propagates.
    """
    if dry_run:
        # Simulate a minimal result set without network
        return [
            {"site": "GitHub", "url": f"https://github.com/{username}", "status": "found"},
            {"site": "Reddit", "url": "", "status": "not_found"},
        ]

    # Real execution path
    from sherlock_project import notify  # sherlock's QueryNotify class expected

    # Optional keywords differ between Sherlock versions. Check the signature
    # up front rather than catching TypeError and repeating the whole network
    # scan, which would also hide TypeErrors raised inside the search.
    query_notify = notify.QueryNotify(**_supported_kwargs(
        notify.QueryNotify,
        {"username": username, "print_found_only": False, "verbose": False},
    ))
    optional = _supported_kwargs(
        sherlock_mod.sherlock,
        {"verbose": False, "timeout": timeout},
    )
    raw_results = sherlock_mod.sherlock(
        username=username,
        site_data=site_data,
        query_notify=query_notify,
        **optional,
    )

    # Normalize dict-of-results into list
    return [_normalize_entry(site_name, info) for site_name, info in raw_results.items()]


def _emit_payload(
    args: argparse.Namespace,
    sherlock_available: bool,
    error: Optional[str] = None,
    results: Optional[List[Dict[str, str]]] = None,
    meta: Optional[Dict[str, Any]] = None,
) -> None:
    """Print one result payload as indented JSON on stdout.

    ``meta`` is included only when given, so error payloads keep the same
    key set they had before.
    """
    payload: Dict[str, Any] = {
        "username": args.username,
        "timestamp": time.time(),
        "dry_run": args.dry_run,
        "sherlock_available": sherlock_available,
        "error": error,
        "results": results if results is not None else [],
    }
    if meta is not None:
        payload["meta"] = meta
    print(json.dumps(payload, indent=2))


def main() -> int:
    """Run the wrapper end to end and return the process exit code.

    Diagnostics go to stderr; exactly one JSON payload goes to stdout on
    every path except a missing ``--confirm``.

    Returns:
        0 on success (including dry-run and zero hits), 1 when ``--confirm``
        is missing or the Sherlock module cannot be loaded, 2 when the site
        data cannot be loaded or the search itself fails.
    """
    args = parse_args()

    if not args.confirm:
        print("ERROR: --confirm flag is required for opt-in.", file=sys.stderr)
        print("This script makes network requests to 400+ sites.", file=sys.stderr)
        print("Re-run with --confirm to acknowledge ethical boundaries and network impact.", file=sys.stderr)
        return 1

    print("--- Sherlock OSINT wrapper (evaluation) ---", file=sys.stderr)
    print(f"Target username: {args.username}", file=sys.stderr)
    print(f"Sherlock path: {args.sherlock_path}", file=sys.stderr)
    print(f"Dry run: {args.dry_run}", file=sys.stderr)

    # Dry-run: simulate without needing sherlock installed
    if args.dry_run:
        # Reuse the search function's simulated result set so the sample
        # data is defined in one place; its dry-run branch never touches
        # the module or the site data.
        simulated = run_sherlock_search(
            sherlock_mod=None,
            username=args.username,
            site_data={},
            timeout=args.timeout,
            dry_run=True,
        )
        _emit_payload(
            args,
            sherlock_available=True,  # pretend available in dry-run demo
            results=simulated,
            meta={"sites_available": 478, "results_returned": len(simulated), "note": "simulated"},
        )
        return 0

    # Check sherlock availability
    sherlock_mod, err = import_sherlock(args.sherlock_path)
    if err:
        _emit_payload(args, sherlock_available=False, error=err)
        print(f"NOTE: To enable, clone https://github.com/sherlock-project/sherlock to {args.sherlock_path}", file=sys.stderr)
        # Documented contract: exit code 1 means "Sherlock module not available".
        return 1

    site_data, err = load_site_data(args.sherlock_path)
    if err:
        _emit_payload(args, sherlock_available=True, error=err)
        return 2

    # Run search
    try:
        results = run_sherlock_search(
            sherlock_mod=sherlock_mod,
            username=args.username,
            site_data=site_data,
            timeout=args.timeout,
            dry_run=args.dry_run,
        )
    except Exception as e:
        # Top-level boundary: report any search failure as JSON, not a traceback.
        _emit_payload(args, sherlock_available=True, error=f"Search failed: {e}")
        return 2

    # Success payload
    _emit_payload(
        args,
        sherlock_available=True,
        results=results,
        meta={
            "sites_available": len(site_data),
            "results_returned": len(results),
        },
    )
    return 0


# Script entry point: hand main()'s return value to the interpreter as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())