#!/usr/bin/env python3
"""
reset_pipeline_state.py — Daily reset for pipeline_state.json

Cleans stale pipeline entries so yesterday's "complete" doesn't block
today's runs. Called by cron at midnight or by the scheduler before
each cycle.

Usage:
    python3 scripts/reset_pipeline_state.py
    python3 scripts/reset_pipeline_state.py --state-file /path/to/pipeline_state.json
    python3 scripts/reset_pipeline_state.py --dry-run
    python3 scripts/reset_pipeline_state.py --complete-max-age-hours 12
"""

import argparse
import json
import os
import sys
from datetime import datetime, timezone, timedelta
from typing import Dict, Any, List, Tuple

# Default age thresholds
DEFAULT_COMPLETE_MAX_AGE_HOURS = 24
DEFAULT_FAILED_MAX_AGE_HOURS = 24
DEFAULT_RUNNING_MAX_AGE_HOURS = 6

DEFAULT_STATE_FILE = os.path.expanduser("~/.hermes/pipeline_state.json")


def parse_timestamp(ts: str) -> datetime:
    """Parse an ISO timestamp, handling a 'Z' suffix and missing tzinfo.

    Always returns a timezone-aware datetime: naive inputs are assumed
    to be UTC, while an explicit offset (Z or +HH:MM / %z) is preserved.

    Raises:
        ValueError: if the string matches none of the known formats.
    """
    ts = ts.replace("Z", "+00:00")
    try:
        parsed = datetime.fromisoformat(ts)
    except ValueError:
        # Fallback for non-standard formats
        for fmt in ("%Y-%m-%dT%H:%M:%S%z",
                    "%Y-%m-%dT%H:%M:%S",
                    "%Y-%m-%d %H:%M:%S"):
            try:
                parsed = datetime.strptime(ts, fmt)
                break
            except ValueError:
                continue
        else:
            raise ValueError(f"Cannot parse timestamp: {ts}")
    # BUG FIX: the original did .replace(tzinfo=utc) unconditionally on the
    # strptime result, which clobbered a real offset parsed by %z; it also
    # let fromisoformat() return a *naive* datetime, which made the aware
    # "now - updated" subtraction in classify_stale() raise TypeError.
    # Only assume UTC when the timestamp carried no zone information.
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed


def classify_stale(
    entry: Dict[str, Any],
    now: datetime,
    complete_max_age: timedelta,
    failed_max_age: timedelta,
    running_max_age: timedelta,
) -> Tuple[bool, str]:
    """Check if a pipeline entry is stale.

    An entry is stale when it has no/invalid timestamp, or when its age
    (strictly) exceeds the threshold for its state. Unknown states are
    never considered stale here.

    Returns (is_stale, reason); reason is "" when not stale.
    """
    state = entry.get("state", "not_started")
    updated_str = entry.get("updated", "")
    if not updated_str:
        return True, "no_timestamp"
    try:
        updated = parse_timestamp(updated_str)
    except ValueError:
        # Narrowed from the original's redundant (ValueError, Exception):
        # parse_timestamp only raises ValueError for bad input.
        return True, "invalid_timestamp"

    age = now - updated
    age_hours = int(age.total_seconds() / 3600)
    if state == "complete" and age > complete_max_age:
        return True, f"complete_{age_hours}h_ago"
    if state == "failed" and age > failed_max_age:
        return True, f"failed_{age_hours}h_ago"
    if state == "running" and age > running_max_age:
        return True, f"running_stuck_{age_hours}h_ago"
    return False, ""


def reset_pipeline_state(
    state_file: str,
    complete_max_age_hours: int = DEFAULT_COMPLETE_MAX_AGE_HOURS,
    failed_max_age_hours: int = DEFAULT_FAILED_MAX_AGE_HOURS,
    running_max_age_hours: int = DEFAULT_RUNNING_MAX_AGE_HOURS,
    dry_run: bool = False,
) -> Tuple[Dict[str, Any], List[str]]:
    """Reset stale pipeline entries in *state_file*.

    Args:
        state_file: path to pipeline_state.json.
        complete_max_age_hours: max age for "complete" entries.
        failed_max_age_hours: max age for "failed" entries.
        running_max_age_hours: max age for stuck "running" entries.
        dry_run: when True, never write the file back.

    Returns (cleaned_state_dict, list_of_removed_names); removed names
    are formatted as "name(reason)".
    """
    now = datetime.now(timezone.utc)
    complete_max_age = timedelta(hours=complete_max_age_hours)
    failed_max_age = timedelta(hours=failed_max_age_hours)
    running_max_age = timedelta(hours=running_max_age_hours)

    # Missing file — nothing to reset.
    if not os.path.exists(state_file):
        return {}, []

    # BUG FIX: the original crashed with json.JSONDecodeError on a file
    # containing invalid JSON — precisely the corruption a reset tool must
    # survive. Treat unreadable JSON the same as a non-dict payload.
    try:
        with open(state_file) as f:
            state = json.load(f)
    except (json.JSONDecodeError, OSError):
        state = None

    if not isinstance(state, dict):
        # Corrupted file — reset entirely. (The original's
        # `list(state.keys()) if isinstance(state, dict) else ["corrupted"]`
        # was dead code: state is never a dict on this branch.)
        if not dry_run:
            with open(state_file, "w") as f:
                json.dump({}, f)
        return {}, ["corrupted"]

    # Classify entries; malformed entries are removed unconditionally.
    to_remove: List[Tuple[str, str]] = []
    for name, entry in list(state.items()):
        if not isinstance(entry, dict):
            to_remove.append((name, "not_dict"))
            continue
        if "state" not in entry:
            to_remove.append((name, "no_state_field"))
            continue
        stale, reason = classify_stale(
            entry, now, complete_max_age, failed_max_age, running_max_age
        )
        if stale:
            to_remove.append((name, reason))

    # Remove stale entries.
    removed_names = []
    for name, reason in to_remove:
        del state[name]
        removed_names.append(f"{name}({reason})")

    # Write back only when something changed and we're not in dry-run.
    if removed_names and not dry_run:
        os.makedirs(os.path.dirname(state_file) or ".", exist_ok=True)
        with open(state_file, "w") as f:
            json.dump(state, f, indent=2)

    return state, removed_names


def main():
    """CLI entry point: parse flags, run the reset, report the result."""
    parser = argparse.ArgumentParser(description="Reset stale pipeline state entries")
    parser.add_argument("--state-file", default=DEFAULT_STATE_FILE,
                        help="Path to pipeline_state.json")
    parser.add_argument("--complete-max-age-hours", type=int,
                        default=DEFAULT_COMPLETE_MAX_AGE_HOURS,
                        help="Max age in hours for complete entries (default: 24)")
    parser.add_argument("--failed-max-age-hours", type=int,
                        default=DEFAULT_FAILED_MAX_AGE_HOURS,
                        help="Max age in hours for failed entries (default: 24)")
    parser.add_argument("--running-max-age-hours", type=int,
                        default=DEFAULT_RUNNING_MAX_AGE_HOURS,
                        help="Max age in hours for stuck running entries (default: 6)")
    parser.add_argument("--dry-run", action="store_true",
                        help="Show what would be removed without modifying")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    args = parser.parse_args()

    state, removed = reset_pipeline_state(
        state_file=args.state_file,
        complete_max_age_hours=args.complete_max_age_hours,
        failed_max_age_hours=args.failed_max_age_hours,
        running_max_age_hours=args.running_max_age_hours,
        dry_run=args.dry_run,
    )

    if args.json:
        result = {
            "state_file": args.state_file,
            "removed": removed,
            "remaining": list(state.keys()),
            "dry_run": args.dry_run,
        }
        print(json.dumps(result, indent=2))
    else:
        prefix = "[DRY RUN] " if args.dry_run else ""
        if removed:
            print(f"{prefix}Reset {len(removed)} stale pipeline(s): {', '.join(removed)}")
        else:
            print(f"{prefix}No stale pipelines found. {len(state)} active.")
        if state:
            for name, entry in state.items():
                age_h = ""
                try:
                    updated = parse_timestamp(entry["updated"])
                    age = datetime.now(timezone.utc) - updated
                    age_h = f" ({int(age.total_seconds() / 3600)}h ago)"
                except Exception:
                    # Best-effort display only: a missing/bad "updated" just
                    # omits the age suffix rather than aborting the report.
                    pass
                print(f"  {name}: {entry['state']}{age_h}")


if __name__ == "__main__":
    main()