# File: timmy-config/scripts/reset_pipeline_state.py (192 lines, 6.5 KiB, Python)
#!/usr/bin/env python3
"""
reset_pipeline_state.py — Daily reset for pipeline_state.json
Cleans stale pipeline entries so yesterday's "complete" doesn't block
today's runs. Called by cron at midnight or by the scheduler before
each cycle.
Usage:
python3 scripts/reset_pipeline_state.py
python3 scripts/reset_pipeline_state.py --state-file /path/to/pipeline_state.json
python3 scripts/reset_pipeline_state.py --dry-run
python3 scripts/reset_pipeline_state.py --max-age-hours 12
"""
import argparse
import json
import os
import sys
from datetime import datetime, timezone, timedelta
from typing import Dict, Any, List, Tuple
# Default age thresholds: how old an entry may be, per state, before it is
# considered stale and removed by the daily reset.
DEFAULT_COMPLETE_MAX_AGE_HOURS = 24  # "complete" entries older than this are cleared
DEFAULT_FAILED_MAX_AGE_HOURS = 24  # "failed" entries older than this are cleared
DEFAULT_RUNNING_MAX_AGE_HOURS = 6  # "running" entries older than this are assumed stuck
# Default on-disk location of the pipeline state file (per-user home dir).
DEFAULT_STATE_FILE = os.path.expanduser("~/.hermes/pipeline_state.json")
def parse_timestamp(ts: str) -> datetime:
    """Parse an ISO-ish timestamp string into an aware datetime.

    Handles the 'Z' suffix (rejected by fromisoformat before Python 3.11)
    and a few legacy fallback formats. The result is always timezone-aware:
    naive values are assumed to be UTC so callers can safely subtract them
    from datetime.now(timezone.utc).

    Args:
        ts: timestamp string, e.g. "2024-01-02T03:04:05Z".

    Returns:
        An aware datetime.

    Raises:
        ValueError: if no known format matches.
    """
    ts = ts.replace("Z", "+00:00")
    parsed = None
    try:
        parsed = datetime.fromisoformat(ts)
    except ValueError:
        # Fallback for non-standard formats.
        for fmt in ("%Y-%m-%dT%H:%M:%S%z", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S"):
            try:
                parsed = datetime.strptime(ts, fmt)
                break
            except ValueError:
                continue
    if parsed is None:
        raise ValueError(f"Cannot parse timestamp: {ts}")
    # BUG FIX: only attach UTC when the parse produced a NAIVE datetime.
    # The old code unconditionally called .replace(tzinfo=utc) on strptime
    # results, clobbering a real offset parsed via %z, and left naive
    # fromisoformat results naive (breaking aware-datetime arithmetic later).
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed
def classify_stale(
    entry: Dict[str, Any],
    now: datetime,
    complete_max_age: timedelta,
    failed_max_age: timedelta,
    running_max_age: timedelta,
) -> Tuple[bool, str]:
    """Check if a pipeline entry is stale.

    An entry is stale when its 'updated' timestamp is missing/unparseable,
    or when its age exceeds the threshold for its 'state' value
    ("complete", "failed", or "running"). Other states never go stale here.

    Args:
        entry: one pipeline entry dict (reads 'state' and 'updated' keys).
        now: current aware UTC datetime to measure age against.
        complete_max_age/failed_max_age/running_max_age: per-state thresholds.

    Returns:
        (is_stale, reason) — reason is "" when not stale.
    """
    state = entry.get("state", "not_started")
    updated_str = entry.get("updated", "")
    if not updated_str:
        return True, "no_timestamp"
    # Broad catch is deliberate: a malformed or non-string 'updated' value
    # marks the entry stale rather than crashing the reset. (The original
    # `except (ValueError, Exception)` tuple was redundant — Exception
    # already covers ValueError.)
    try:
        updated = parse_timestamp(updated_str)
    except Exception:
        return True, "invalid_timestamp"
    # BUG FIX: a naive timestamp made `now - updated` raise TypeError
    # outside any try block; assume UTC for naive values instead.
    if updated.tzinfo is None:
        updated = updated.replace(tzinfo=timezone.utc)
    age = now - updated
    age_hours = int(age.total_seconds() / 3600)
    if state == "complete" and age > complete_max_age:
        return True, f"complete_{age_hours}h_ago"
    if state == "failed" and age > failed_max_age:
        return True, f"failed_{age_hours}h_ago"
    if state == "running" and age > running_max_age:
        return True, f"running_stuck_{age_hours}h_ago"
    return False, ""
def reset_pipeline_state(
    state_file: str,
    complete_max_age_hours: int = DEFAULT_COMPLETE_MAX_AGE_HOURS,
    failed_max_age_hours: int = DEFAULT_FAILED_MAX_AGE_HOURS,
    running_max_age_hours: int = DEFAULT_RUNNING_MAX_AGE_HOURS,
    dry_run: bool = False,
) -> Tuple[Dict[str, Any], List[str]]:
    """Reset stale pipeline entries in *state_file*.

    Entries are removed when they are not dicts, lack a 'state' field, or
    exceed the per-state age threshold (see classify_stale). A missing file
    yields ({}, []); a corrupted or non-dict file is reset to {}.

    Args:
        state_file: path to pipeline_state.json.
        complete_max_age_hours/failed_max_age_hours/running_max_age_hours:
            staleness thresholds per entry state.
        dry_run: when True, report removals without writing anything.

    Returns:
        (cleaned_state_dict, removed_names) where each removed name is
        formatted "name(reason)".
    """
    now = datetime.now(timezone.utc)
    complete_max_age = timedelta(hours=complete_max_age_hours)
    failed_max_age = timedelta(hours=failed_max_age_hours)
    running_max_age = timedelta(hours=running_max_age_hours)
    if not os.path.exists(state_file):
        return {}, []
    # Load state. BUG FIX: a truncated/garbage file used to crash with
    # JSONDecodeError despite the "reset entirely" intent below; treat it
    # like any other corrupted state. (JSONDecodeError subclasses ValueError.)
    try:
        with open(state_file) as f:
            state = json.load(f)
    except ValueError:
        state = None
    if not isinstance(state, dict):
        # Corrupted file — reset entirely. The original
        # `list(state.keys()) if isinstance(state, dict) else ["corrupted"]`
        # ternary was dead code on this branch (isinstance is always False).
        if not dry_run:
            with open(state_file, "w") as f:
                json.dump({}, f)
        return {}, ["corrupted"]
    # Classify entries.
    to_remove: List[Tuple[str, str]] = []
    for name, entry in state.items():
        if not isinstance(entry, dict):
            to_remove.append((name, "not_dict"))
        elif "state" not in entry:
            to_remove.append((name, "no_state_field"))
        else:
            stale, reason = classify_stale(
                entry, now, complete_max_age, failed_max_age, running_max_age
            )
            if stale:
                to_remove.append((name, reason))
    # Remove stale entries (after iteration, so mutation is safe).
    removed_names = []
    for name, reason in to_remove:
        del state[name]
        removed_names.append(f"{name}({reason})")
    # Write back only when something changed and we're not in dry-run.
    if removed_names and not dry_run:
        os.makedirs(os.path.dirname(state_file) or ".", exist_ok=True)
        with open(state_file, "w") as f:
            json.dump(state, f, indent=2)
    return state, removed_names
def main():
    """CLI entry point: parse flags, run the reset, and report the outcome."""
    parser = argparse.ArgumentParser(description="Reset stale pipeline state entries")
    parser.add_argument("--state-file", default=DEFAULT_STATE_FILE,
                        help="Path to pipeline_state.json")
    parser.add_argument("--complete-max-age-hours", type=int, default=DEFAULT_COMPLETE_MAX_AGE_HOURS,
                        help="Max age in hours for complete entries (default: 24)")
    parser.add_argument("--failed-max-age-hours", type=int, default=DEFAULT_FAILED_MAX_AGE_HOURS,
                        help="Max age in hours for failed entries (default: 24)")
    parser.add_argument("--running-max-age-hours", type=int, default=DEFAULT_RUNNING_MAX_AGE_HOURS,
                        help="Max age in hours for stuck running entries (default: 6)")
    parser.add_argument("--dry-run", action="store_true",
                        help="Show what would be removed without modifying")
    parser.add_argument("--json", action="store_true",
                        help="Output as JSON")
    args = parser.parse_args()

    remaining, removed = reset_pipeline_state(
        state_file=args.state_file,
        complete_max_age_hours=args.complete_max_age_hours,
        failed_max_age_hours=args.failed_max_age_hours,
        running_max_age_hours=args.running_max_age_hours,
        dry_run=args.dry_run,
    )

    if args.json:
        # Machine-readable summary for callers that scrape the output.
        print(json.dumps(
            {
                "state_file": args.state_file,
                "removed": removed,
                "remaining": list(remaining.keys()),
                "dry_run": args.dry_run,
            },
            indent=2,
        ))
        return

    prefix = "[DRY RUN] " if args.dry_run else ""
    if removed:
        print(f"{prefix}Reset {len(removed)} stale pipeline(s): {', '.join(removed)}")
    else:
        print(f"{prefix}No stale pipelines found. {len(remaining)} active.")
    # Human-readable list of what's still tracked, with best-effort ages.
    for name, entry in remaining.items():
        suffix = ""
        try:
            delta = datetime.now(timezone.utc) - parse_timestamp(entry["updated"])
            suffix = f" ({int(delta.total_seconds() / 3600)}h ago)"
        except Exception:
            # Age is cosmetic; never let a bad timestamp break the listing.
            pass
        print(f" {name}: {entry['state']}{suffix}")


if __name__ == "__main__":
    main()