fix: prevent data loss in skills sync on copy/update failure

Two bugs in sync_skills():

1. Failed copytree poisons manifest: when shutil.copytree fails (disk
   full, permission error), the skill is still recorded in the manifest.
   On the next sync, the skill appears as "in manifest but not on disk"
   which is interpreted as "user deliberately deleted it" — the skill
   is never retried.  Fix: only write to manifest on successful copy.

2. Failed update destroys user copy: rmtree deletes the existing skill
   directory before copytree runs. If copytree then fails, the user's
   skill is gone with no way to recover.  Fix: move to .bak before
   copying, restore from backup if copytree fails.

Both bugs are proven by new regression tests that fail on the old code
and pass on the fix.
This commit is contained in:
0xbyt4
2026-03-07 03:58:32 +03:00
parent b98301677a
commit 211b55815e
2 changed files with 89 additions and 7 deletions

View File

@@ -376,6 +376,76 @@ class TestSyncSkills:
"user_modified": [], "cleaned": [], "total_bundled": 0,
}
def test_failed_copy_does_not_poison_manifest(self, tmp_path):
"""If copytree fails, the skill must NOT be added to the manifest.
Otherwise the next sync treats it as 'user deleted' and never retries.
"""
bundled = self._setup_bundled(tmp_path)
skills_dir = tmp_path / "user_skills"
manifest_file = skills_dir / ".bundled_manifest"
with self._patches(bundled, skills_dir, manifest_file):
# Patch copytree to fail for new-skill
original_copytree = __import__("shutil").copytree
def failing_copytree(src, dst, *a, **kw):
if "new-skill" in str(src):
raise OSError("Simulated disk full")
return original_copytree(src, dst, *a, **kw)
with patch("shutil.copytree", side_effect=failing_copytree):
result = sync_skills(quiet=True)
# new-skill should NOT be in copied (it failed)
assert "new-skill" not in result["copied"]
# Critical: new-skill must NOT be in the manifest
manifest = _read_manifest()
assert "new-skill" not in manifest, (
"Failed copy was recorded in manifest — next sync will "
"treat it as 'user deleted' and never retry"
)
# Now run sync again (copytree works this time) — it should retry
result2 = sync_skills(quiet=True)
assert "new-skill" in result2["copied"]
assert (skills_dir / "category" / "new-skill" / "SKILL.md").exists()
def test_failed_update_does_not_destroy_user_copy(self, tmp_path):
"""If copytree fails during update, the user's existing copy must survive."""
bundled = self._setup_bundled(tmp_path)
skills_dir = tmp_path / "user_skills"
manifest_file = skills_dir / ".bundled_manifest"
# Start with old synced version
user_skill = skills_dir / "old-skill"
user_skill.mkdir(parents=True)
(user_skill / "SKILL.md").write_text("# Old v1")
old_hash = _dir_hash(user_skill)
manifest_file.write_text(f"old-skill:{old_hash}\n")
with self._patches(bundled, skills_dir, manifest_file):
# Patch copytree to fail (rmtree succeeds, copytree fails)
original_copytree = __import__("shutil").copytree
def failing_copytree(src, dst, *a, **kw):
if "old-skill" in str(src):
raise OSError("Simulated write failure")
return original_copytree(src, dst, *a, **kw)
with patch("shutil.copytree", side_effect=failing_copytree):
result = sync_skills(quiet=True)
# old-skill should NOT be in updated (it failed)
assert "old-skill" not in result.get("updated", [])
# The skill directory should still exist (rmtree destroyed it
# but copytree failed to replace it — this is data loss)
assert user_skill.exists(), (
"Update failure destroyed user's skill copy without replacing it"
)
def test_update_records_new_origin_hash(self, tmp_path):
"""After updating a skill, the manifest should record the new bundled hash."""
bundled = self._setup_bundled(tmp_path)

View File

@@ -153,16 +153,18 @@ def sync_skills(quiet: bool = False) -> dict:
if dest.exists():
# User already has a skill with the same name — don't overwrite
skipped += 1
manifest[skill_name] = bundled_hash
else:
dest.parent.mkdir(parents=True, exist_ok=True)
shutil.copytree(skill_src, dest)
copied.append(skill_name)
manifest[skill_name] = bundled_hash
if not quiet:
print(f" + {skill_name}")
except (OSError, IOError) as e:
if not quiet:
print(f" ! Failed to copy {skill_name}: {e}")
manifest[skill_name] = bundled_hash
# Do NOT add to manifest — next sync should retry
elif dest.exists():
# ── Existing skill — in manifest AND on disk ──
@@ -190,12 +192,22 @@ def sync_skills(quiet: bool = False) -> dict:
# User copy matches origin — check if bundled has a newer version
if bundled_hash != origin_hash:
try:
shutil.rmtree(dest)
shutil.copytree(skill_src, dest)
manifest[skill_name] = bundled_hash
updated.append(skill_name)
if not quiet:
print(f"{skill_name} (updated)")
# Move old copy to a backup so we can restore on failure
backup = dest.with_suffix(".bak")
shutil.move(str(dest), str(backup))
try:
shutil.copytree(skill_src, dest)
manifest[skill_name] = bundled_hash
updated.append(skill_name)
if not quiet:
print(f"{skill_name} (updated)")
# Remove backup after successful copy
shutil.rmtree(backup, ignore_errors=True)
except (OSError, IOError):
# Restore from backup
if backup.exists() and not dest.exists():
shutil.move(str(backup), str(dest))
raise
except (OSError, IOError) as e:
if not quiet:
print(f" ! Failed to update {skill_name}: {e}")