#!/usr/bin/env python3
"""
|
|
reset_pipeline_state.py — Daily reset for pipeline_state.json
|
|
|
|
Cleans stale pipeline entries so yesterday's "complete" doesn't block
|
|
today's runs. Called by cron at midnight or by the scheduler before
|
|
each cycle.
|
|
|
|
Usage:
|
|
python3 scripts/reset_pipeline_state.py
|
|
python3 scripts/reset_pipeline_state.py --state-file /path/to/pipeline_state.json
|
|
python3 scripts/reset_pipeline_state.py --dry-run
|
|
python3 scripts/reset_pipeline_state.py --max-age-hours 12
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import sys
|
|
from datetime import datetime, timezone, timedelta
|
|
from typing import Dict, Any, List, Tuple
|
|
|
|
|
|
# Staleness thresholds in hours, one per pipeline state. They are the default
# values of the matching keyword arguments and command-line options.
DEFAULT_RUNNING_MAX_AGE_HOURS = 6
DEFAULT_FAILED_MAX_AGE_HOURS = 24
DEFAULT_COMPLETE_MAX_AGE_HOURS = 24

# State file location used when no --state-file option is supplied.
DEFAULT_STATE_FILE = os.path.expanduser("~/.hermes/pipeline_state.json")
|
|
|
|
|
|
def parse_timestamp(ts: str) -> datetime:
    """Parse an ISO-style timestamp into a timezone-aware datetime.

    A trailing ``Z`` is read as UTC. An explicit UTC offset in the input is
    preserved. A timestamp that carries no offset is assumed to be UTC, so
    the result can always be subtracted from an aware datetime such as
    ``datetime.now(timezone.utc)``.

    Args:
        ts: Timestamp text, e.g. ``2024-01-01T12:00:00Z``.

    Returns:
        A timezone-aware ``datetime``.

    Raises:
        ValueError: If ``ts`` is not a string or matches none of the
            supported formats.
    """
    if not isinstance(ts, str):
        raise ValueError(f"Cannot parse timestamp: {ts!r}")

    text = ts.strip()
    # Only a *trailing* Z is a UTC designator; older fromisoformat()
    # implementations reject it, so spell it as an explicit offset.
    if text.endswith("Z"):
        text = text[:-1] + "+00:00"

    parsed = None
    try:
        parsed = datetime.fromisoformat(text)
    except ValueError:
        # Fallback for formats fromisoformat() does not accept on every
        # Python version (e.g. an offset written without a colon).
        for fmt in ("%Y-%m-%dT%H:%M:%S%z", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S"):
            try:
                parsed = datetime.strptime(text, fmt)
                break
            except ValueError:
                continue

    if parsed is None:
        raise ValueError(f"Cannot parse timestamp: {ts}")

    # Attach UTC only when the input had no offset; overwriting a parsed
    # offset would silently shift the instant.
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed
|
|
|
|
|
|
def classify_stale(
    entry: Dict[str, Any],
    now: datetime,
    complete_max_age: timedelta,
    failed_max_age: timedelta,
    running_max_age: timedelta,
) -> Tuple[bool, str]:
    """Decide whether a single pipeline entry is stale.

    Args:
        entry: Pipeline entry; its ``"state"`` and ``"updated"`` keys are read.
        now: Reference time. Must be timezone-aware.
        complete_max_age: Age beyond which a ``"complete"`` entry is stale.
        failed_max_age: Age beyond which a ``"failed"`` entry is stale.
        running_max_age: Age beyond which a ``"running"`` entry counts as stuck.

    Returns:
        ``(is_stale, reason)``. ``reason`` is ``""`` when the entry is not
        stale. An entry with a missing/empty or unparseable ``"updated"``
        value is always stale. Entries in any other state never expire.
    """
    state = entry.get("state", "not_started")
    updated_str = entry.get("updated", "")

    if not updated_str:
        return True, "no_timestamp"
    # A non-string value (number, list, ...) can never be a valid timestamp.
    if not isinstance(updated_str, str):
        return True, "invalid_timestamp"

    try:
        updated = parse_timestamp(updated_str)
    except ValueError:
        return True, "invalid_timestamp"

    # A timestamp without an offset may come back naive; treat it as UTC so
    # the subtraction from the aware ``now`` cannot raise TypeError.
    if updated.tzinfo is None:
        updated = updated.replace(tzinfo=timezone.utc)

    age = now - updated
    age_hours = int(age.total_seconds() / 3600)

    if state == "complete" and age > complete_max_age:
        return True, f"complete_{age_hours}h_ago"
    if state == "failed" and age > failed_max_age:
        return True, f"failed_{age_hours}h_ago"
    if state == "running" and age > running_max_age:
        return True, f"running_stuck_{age_hours}h_ago"

    return False, ""
|
|
|
|
|
|
def reset_pipeline_state(
    state_file: str,
    complete_max_age_hours: int = DEFAULT_COMPLETE_MAX_AGE_HOURS,
    failed_max_age_hours: int = DEFAULT_FAILED_MAX_AGE_HOURS,
    running_max_age_hours: int = DEFAULT_RUNNING_MAX_AGE_HOURS,
    dry_run: bool = False,
) -> Tuple[Dict[str, Any], List[str]]:
    """Remove stale entries from the pipeline state file.

    Args:
        state_file: Path to the JSON state file.
        complete_max_age_hours: Max age in hours for ``"complete"`` entries.
        failed_max_age_hours: Max age in hours for ``"failed"`` entries.
        running_max_age_hours: Max age in hours for ``"running"`` entries.
        dry_run: When true, nothing is written to disk.

    Returns:
        ``(cleaned_state, removed)``. ``removed`` holds one
        ``"name(reason)"`` string per dropped entry. The returned state has
        the stale entries removed even in a dry run. A missing file yields
        ``({}, [])``. A file that is not valid JSON, or whose top level is
        not an object, yields ``({}, ["corrupted"])`` and is rewritten as
        ``{}`` unless ``dry_run`` is set.
    """
    now = datetime.now(timezone.utc)

    complete_max_age = timedelta(hours=complete_max_age_hours)
    failed_max_age = timedelta(hours=failed_max_age_hours)
    running_max_age = timedelta(hours=running_max_age_hours)

    # Load state. Opening directly avoids a race between an existence check
    # and the read.
    try:
        with open(state_file, encoding="utf-8") as f:
            state = json.load(f)
    except FileNotFoundError:
        return {}, []
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors:
        # the file is unreadable as JSON, so handle it as corrupted below.
        state = None

    if not isinstance(state, dict):
        # Corrupted file — reset entirely
        if not dry_run:
            with open(state_file, "w", encoding="utf-8") as f:
                json.dump({}, f)
        return {}, ["corrupted"]

    # Classify entries first; deletion happens afterwards so the dict is
    # never mutated while being iterated.
    to_remove: List[Tuple[str, str]] = []
    for name, entry in state.items():
        if not isinstance(entry, dict):
            to_remove.append((name, "not_dict"))
            continue
        if "state" not in entry:
            to_remove.append((name, "no_state_field"))
            continue

        stale, reason = classify_stale(
            entry, now, complete_max_age, failed_max_age, running_max_age
        )
        if stale:
            to_remove.append((name, reason))

    # Remove stale entries
    removed_names = []
    for name, reason in to_remove:
        del state[name]
        removed_names.append(f"{name}({reason})")

    # Write back only when something actually changed.
    if removed_names and not dry_run:
        with open(state_file, "w", encoding="utf-8") as f:
            json.dump(state, f, indent=2)

    return state, removed_names
|
|
|
|
|
|
def main():
    """Command-line entry point: parse options, reset the state file, report.

    With ``--json`` a JSON summary is printed; otherwise a one-line summary
    is printed, followed by one line per remaining entry.
    """
    parser = argparse.ArgumentParser(description="Reset stale pipeline state entries")
    parser.add_argument("--state-file", default=DEFAULT_STATE_FILE,
                        help="Path to pipeline_state.json")

    # The three threshold options differ only in flag, default and help text.
    threshold_options = (
        ("--complete-max-age-hours", DEFAULT_COMPLETE_MAX_AGE_HOURS,
         "Max age in hours for complete entries (default: 24)"),
        ("--failed-max-age-hours", DEFAULT_FAILED_MAX_AGE_HOURS,
         "Max age in hours for failed entries (default: 24)"),
        ("--running-max-age-hours", DEFAULT_RUNNING_MAX_AGE_HOURS,
         "Max age in hours for stuck running entries (default: 6)"),
    )
    for flag, default_hours, help_text in threshold_options:
        parser.add_argument(flag, type=int, default=default_hours, help=help_text)

    parser.add_argument("--dry-run", action="store_true",
                        help="Show what would be removed without modifying")
    parser.add_argument("--json", action="store_true",
                        help="Output as JSON")

    args = parser.parse_args()

    state, removed = reset_pipeline_state(
        state_file=args.state_file,
        complete_max_age_hours=args.complete_max_age_hours,
        failed_max_age_hours=args.failed_max_age_hours,
        running_max_age_hours=args.running_max_age_hours,
        dry_run=args.dry_run,
    )

    # Machine-readable output: emit the summary and stop.
    if args.json:
        summary = {
            "state_file": args.state_file,
            "removed": removed,
            "remaining": list(state.keys()),
            "dry_run": args.dry_run,
        }
        print(json.dumps(summary, indent=2))
        return

    prefix = "[DRY RUN] " if args.dry_run else ""
    if removed:
        print(f"{prefix}Reset {len(removed)} stale pipeline(s): {', '.join(removed)}")
    else:
        print(f"{prefix}No stale pipelines found. {len(state)} active.")

    # List what is left; the age suffix is best-effort and omitted when the
    # entry's timestamp cannot be read.
    for name, entry in state.items():
        suffix = ""
        try:
            updated = parse_timestamp(entry["updated"])
            elapsed = datetime.now(timezone.utc) - updated
            suffix = f" ({int(elapsed.total_seconds() / 3600)}h ago)"
        except Exception:
            pass
        print(f" {name}: {entry['state']}{suffix}")
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|