From 37752ff1ac5e3bcda47b10e5eba164affb38f13e Mon Sep 17 00:00:00 2001 From: teknium1 Date: Sun, 8 Mar 2026 19:41:17 -0700 Subject: [PATCH 1/4] =?UTF-8?q?feat:=20bell=5Fon=5Fcomplete=20=E2=80=94=20?= =?UTF-8?q?terminal=20bell=20when=20agent=20finishes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a simple config option to play the terminal bell (\a) when the agent finishes a response. Useful for long-running tasks — switch to another window and your terminal will ding when done. Works over SSH since the bell character propagates through the connection. Most terminal emulators can be configured to flash the taskbar, play a sound, or show a visual indicator on bell. Config (default: off): display: bell_on_complete: true Closes #318 --- cli-config.yaml.example | 5 +++++ cli.py | 8 ++++++++ hermes_cli/config.py | 1 + website/docs/user-guide/configuration.md | 1 + 4 files changed, 15 insertions(+) diff --git a/cli-config.yaml.example b/cli-config.yaml.example index 6b1cf97c0..ec7ccb620 100644 --- a/cli-config.yaml.example +++ b/cli-config.yaml.example @@ -635,3 +635,8 @@ display: # verbose: Full args, results, and debug logs (same as /verbose) # Toggle at runtime with /verbose in the CLI tool_progress: all + + # Play terminal bell when agent finishes a response. + # Useful for long-running tasks — your terminal will ding when the agent is done. + # Works over SSH. Most terminals can be configured to flash the taskbar or play a sound. 
+ bell_on_complete: false diff --git a/cli.py b/cli.py index 8e67492b8..828d820fc 100755 --- a/cli.py +++ b/cli.py @@ -1035,6 +1035,8 @@ class HermesCLI: self.tool_progress_mode = CLI_CONFIG["display"].get("tool_progress", "all") # resume_display: "full" (show history) | "minimal" (one-liner only) self.resume_display = CLI_CONFIG["display"].get("resume_display", "full") + # bell_on_complete: play terminal bell (\a) when agent finishes a response + self.bell_on_complete = CLI_CONFIG["display"].get("bell_on_complete", False) self.verbose = verbose if verbose is not None else (self.tool_progress_mode == "verbose") # Configuration - priority: CLI args > env vars > config file @@ -3120,6 +3122,12 @@ class HermesCLI: # nothing can interleave between the box borders. _cprint(f"\n{top}\n{response}\n\n{bot}") + # Play terminal bell when agent finishes (if enabled). + # Works over SSH — the bell propagates to the user's terminal. + if self.bell_on_complete: + sys.stdout.write("\a") + sys.stdout.flush() + # Combine all interrupt messages (user may have typed multiple while waiting) # and re-queue as one prompt for process_loop if pending_message and hasattr(self, '_pending_input'): diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 208b95cb7..9b58115f3 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -107,6 +107,7 @@ DEFAULT_CONFIG = { "compact": False, "personality": "kawaii", "resume_display": "full", # "full" (show previous messages) | "minimal" (one-liner only) + "bell_on_complete": False, # Play terminal bell (\a) when agent finishes a response }, # Text-to-speech configuration diff --git a/website/docs/user-guide/configuration.md b/website/docs/user-guide/configuration.md index f2abd16ca..7fd88fb32 100644 --- a/website/docs/user-guide/configuration.md +++ b/website/docs/user-guide/configuration.md @@ -581,6 +581,7 @@ display: personality: "kawaii" # Default personality for the CLI compact: false # Compact output mode (less whitespace) 
resume_display: full # full (show previous messages on resume) | minimal (one-liner only) + bell_on_complete: false # Play terminal bell when agent finishes (great for long tasks) ``` | Mode | What you see | From 763c6d104d020db989b839f7f9ffa0ff6255d118 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Sun, 8 Mar 2026 20:08:02 -0700 Subject: [PATCH 2/4] fix: unify gateway session hygiene with agent compression config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The gateway had a SEPARATE compression system ('session hygiene') with hardcoded thresholds (100k tokens / 200 messages) that were completely disconnected from the model's context length and the user's compression config in config.yaml. This caused premature auto-compression on Telegram/Discord — triggering at ~60k tokens (from the 200-message threshold) or inconsistent token counts. Changes: - Gateway hygiene now reads model name from config.yaml and uses get_model_context_length() to derive the actual context limit - Compression threshold comes from compression.threshold in config.yaml (default 0.85), same as the agent's ContextCompressor - Removed the message-count-based trigger (was redundant and caused false positives in tool-heavy sessions) - Removed the undocumented session_hygiene config section — the standard compression.* config now controls everything - Env var overrides (CONTEXT_COMPRESSION_THRESHOLD, CONTEXT_COMPRESSION_ENABLED) are respected - Warn threshold is now 95% of model context (was hardcoded 200k) - Updated tests to verify model-aware thresholds, scaling across models, and that message count alone no longer triggers compression For claude-opus-4.6 (200k context) at 85% threshold: gateway hygiene now triggers at 170k tokens instead of the old 100k. 
--- gateway/run.py | 278 ++++++++++++++------------ tests/gateway/test_session_hygiene.py | 155 +++++++++----- 2 files changed, 253 insertions(+), 180 deletions(-) diff --git a/gateway/run.py b/gateway/run.py index cd5b478b5..f75cee77c 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -870,159 +870,187 @@ class GatewayRunner: # every new message rehydrates an oversized transcript, causing # repeated truncation/context failures. Detect this early and # compress proactively — before the agent even starts. (#628) + # + # Thresholds are derived from the SAME compression config the + # agent uses (compression.threshold × model context length) so + # CLI and messaging platforms behave identically. # ----------------------------------------------------------------- if history and len(history) >= 4: - from agent.model_metadata import estimate_messages_tokens_rough + from agent.model_metadata import ( + estimate_messages_tokens_rough, + get_model_context_length, + ) - # Read thresholds from config.yaml → session_hygiene section - _hygiene_cfg = {} + # Read model + compression config from config.yaml — same + # source of truth the agent itself uses. 
+ _hyg_model = "anthropic/claude-sonnet-4.6" + _hyg_threshold_pct = 0.85 + _hyg_compression_enabled = True try: _hyg_cfg_path = _hermes_home / "config.yaml" if _hyg_cfg_path.exists(): import yaml as _hyg_yaml with open(_hyg_cfg_path) as _hyg_f: _hyg_data = _hyg_yaml.safe_load(_hyg_f) or {} - _hygiene_cfg = _hyg_data.get("session_hygiene", {}) - if not isinstance(_hygiene_cfg, dict): - _hygiene_cfg = {} + + # Resolve model name (same logic as run_sync) + _model_cfg = _hyg_data.get("model", {}) + if isinstance(_model_cfg, str): + _hyg_model = _model_cfg + elif isinstance(_model_cfg, dict): + _hyg_model = _model_cfg.get("default", _hyg_model) + + # Read compression settings + _comp_cfg = _hyg_data.get("compression", {}) + if isinstance(_comp_cfg, dict): + _hyg_threshold_pct = float( + _comp_cfg.get("threshold", _hyg_threshold_pct) + ) + _hyg_compression_enabled = str( + _comp_cfg.get("enabled", True) + ).lower() in ("true", "1", "yes") except Exception: pass - _compress_token_threshold = int( - _hygiene_cfg.get("auto_compress_tokens", 100_000) - ) - _compress_msg_threshold = int( - _hygiene_cfg.get("auto_compress_messages", 200) - ) - _warn_token_threshold = int( - _hygiene_cfg.get("warn_tokens", 200_000) + # Also check env overrides (same as run_agent.py) + _hyg_threshold_pct = float( + os.getenv("CONTEXT_COMPRESSION_THRESHOLD", str(_hyg_threshold_pct)) ) + if os.getenv("CONTEXT_COMPRESSION_ENABLED", "").lower() in ("false", "0", "no"): + _hyg_compression_enabled = False - _msg_count = len(history) - _approx_tokens = estimate_messages_tokens_rough(history) - - _needs_compress = ( - _approx_tokens >= _compress_token_threshold - or _msg_count >= _compress_msg_threshold - ) - - if _needs_compress: - logger.info( - "Session hygiene: %s messages, ~%s tokens — auto-compressing " - "(thresholds: %s msgs / %s tokens)", - _msg_count, f"{_approx_tokens:,}", - _compress_msg_threshold, f"{_compress_token_threshold:,}", + if _hyg_compression_enabled: + _hyg_context_length = 
get_model_context_length(_hyg_model) + _compress_token_threshold = int( + _hyg_context_length * _hyg_threshold_pct ) + # Warn if still huge after compression (95% of context) + _warn_token_threshold = int(_hyg_context_length * 0.95) + + _msg_count = len(history) + _approx_tokens = estimate_messages_tokens_rough(history) + + _needs_compress = _approx_tokens >= _compress_token_threshold + + if _needs_compress: + logger.info( + "Session hygiene: %s messages, ~%s tokens — auto-compressing " + "(threshold: %s%% of %s = %s tokens)", + _msg_count, f"{_approx_tokens:,}", + int(_hyg_threshold_pct * 100), + f"{_hyg_context_length:,}", + f"{_compress_token_threshold:,}", + ) + + _hyg_adapter = self.adapters.get(source.platform) + if _hyg_adapter: + try: + await _hyg_adapter.send( + source.chat_id, + f"🗜️ Session is large ({_msg_count} messages, " + f"~{_approx_tokens:,} tokens). Auto-compressing..." + ) + except Exception: + pass - _hyg_adapter = self.adapters.get(source.platform) - if _hyg_adapter: try: - await _hyg_adapter.send( - source.chat_id, - f"🗜️ Session is large ({_msg_count} messages, " - f"~{_approx_tokens:,} tokens). Auto-compressing..." 
- ) - except Exception: - pass + from run_agent import AIAgent - try: - from run_agent import AIAgent + _hyg_runtime = _resolve_runtime_agent_kwargs() + if _hyg_runtime.get("api_key"): + _hyg_msgs = [ + {"role": m.get("role"), "content": m.get("content")} + for m in history + if m.get("role") in ("user", "assistant") + and m.get("content") + ] - _hyg_runtime = _resolve_runtime_agent_kwargs() - if _hyg_runtime.get("api_key"): - _hyg_msgs = [ - {"role": m.get("role"), "content": m.get("content")} - for m in history - if m.get("role") in ("user", "assistant") - and m.get("content") - ] - - if len(_hyg_msgs) >= 4: - _hyg_agent = AIAgent( - **_hyg_runtime, - max_iterations=4, - quiet_mode=True, - enabled_toolsets=["memory"], - session_id=session_entry.session_id, - ) - - loop = asyncio.get_event_loop() - _compressed, _ = await loop.run_in_executor( - None, - lambda: _hyg_agent._compress_context( - _hyg_msgs, "", - approx_tokens=_approx_tokens, - ), - ) - - self.session_store.rewrite_transcript( - session_entry.session_id, _compressed - ) - history = _compressed - _new_count = len(_compressed) - _new_tokens = estimate_messages_tokens_rough( - _compressed - ) - - logger.info( - "Session hygiene: compressed %s → %s msgs, " - "~%s → ~%s tokens", - _msg_count, _new_count, - f"{_approx_tokens:,}", f"{_new_tokens:,}", - ) - - if _hyg_adapter: - try: - await _hyg_adapter.send( - source.chat_id, - f"🗜️ Compressed: {_msg_count} → " - f"{_new_count} messages, " - f"~{_approx_tokens:,} → " - f"~{_new_tokens:,} tokens" - ) - except Exception: - pass - - # Still too large after compression — warn user - if _new_tokens >= _warn_token_threshold: - logger.warning( - "Session hygiene: still ~%s tokens after " - "compression — suggesting /reset", - f"{_new_tokens:,}", + if len(_hyg_msgs) >= 4: + _hyg_agent = AIAgent( + **_hyg_runtime, + max_iterations=4, + quiet_mode=True, + enabled_toolsets=["memory"], + session_id=session_entry.session_id, ) + + loop = asyncio.get_event_loop() + 
_compressed, _ = await loop.run_in_executor( + None, + lambda: _hyg_agent._compress_context( + _hyg_msgs, "", + approx_tokens=_approx_tokens, + ), + ) + + self.session_store.rewrite_transcript( + session_entry.session_id, _compressed + ) + history = _compressed + _new_count = len(_compressed) + _new_tokens = estimate_messages_tokens_rough( + _compressed + ) + + logger.info( + "Session hygiene: compressed %s → %s msgs, " + "~%s → ~%s tokens", + _msg_count, _new_count, + f"{_approx_tokens:,}", f"{_new_tokens:,}", + ) + if _hyg_adapter: try: await _hyg_adapter.send( source.chat_id, - "⚠️ Session is still very large " - "after compression " - f"(~{_new_tokens:,} tokens). " - "Consider using /reset to start " - "fresh if you experience issues." + f"🗜️ Compressed: {_msg_count} → " + f"{_new_count} messages, " + f"~{_approx_tokens:,} → " + f"~{_new_tokens:,} tokens" ) except Exception: pass - except Exception as e: - logger.warning( - "Session hygiene auto-compress failed: %s", e - ) - # Compression failed and session is dangerously large - if _approx_tokens >= _warn_token_threshold: - _hyg_adapter = self.adapters.get(source.platform) - if _hyg_adapter: - try: - await _hyg_adapter.send( - source.chat_id, - f"⚠️ Session is very large " - f"({_msg_count} messages, " - f"~{_approx_tokens:,} tokens) and " - "auto-compression failed. Consider " - "using /compress or /reset to avoid " - "issues." - ) - except Exception: - pass + # Still too large after compression — warn user + if _new_tokens >= _warn_token_threshold: + logger.warning( + "Session hygiene: still ~%s tokens after " + "compression — suggesting /reset", + f"{_new_tokens:,}", + ) + if _hyg_adapter: + try: + await _hyg_adapter.send( + source.chat_id, + "⚠️ Session is still very large " + "after compression " + f"(~{_new_tokens:,} tokens). " + "Consider using /reset to start " + "fresh if you experience issues." 
+ ) + except Exception: + pass + + except Exception as e: + logger.warning( + "Session hygiene auto-compress failed: %s", e + ) + # Compression failed and session is dangerously large + if _approx_tokens >= _warn_token_threshold: + _hyg_adapter = self.adapters.get(source.platform) + if _hyg_adapter: + try: + await _hyg_adapter.send( + source.chat_id, + f"⚠️ Session is very large " + f"({_msg_count} messages, " + f"~{_approx_tokens:,} tokens) and " + "auto-compression failed. Consider " + "using /compress or /reset to avoid " + "issues." + ) + except Exception: + pass # First-message onboarding -- only on the very first interaction ever if not history and not self.session_store.has_any_sessions(): diff --git a/tests/gateway/test_session_hygiene.py b/tests/gateway/test_session_hygiene.py index b357d5861..9ac7b8029 100644 --- a/tests/gateway/test_session_hygiene.py +++ b/tests/gateway/test_session_hygiene.py @@ -2,6 +2,10 @@ Verifies that the gateway detects pathologically large transcripts and triggers auto-compression before running the agent. (#628) + +The hygiene system uses the SAME compression config as the agent: + compression.threshold × model context length +so CLI and messaging platforms behave identically. """ import pytest @@ -38,75 +42,113 @@ def _make_large_history_tokens(target_tokens: int) -> list: # --------------------------------------------------------------------------- -# Detection threshold tests +# Detection threshold tests (model-aware, unified with compression config) # --------------------------------------------------------------------------- class TestSessionHygieneThresholds: - """Test that the threshold logic correctly identifies large sessions.""" + """Test that the threshold logic correctly identifies large sessions. + + Thresholds are derived from model context length × compression threshold, + matching what the agent's ContextCompressor uses. 
+ """ def test_small_session_below_thresholds(self): """A 10-message session should not trigger compression.""" history = _make_history(10) - msg_count = len(history) approx_tokens = estimate_messages_tokens_rough(history) - compress_token_threshold = 100_000 - compress_msg_threshold = 200 + # For a 200k-context model at 85% threshold = 170k + context_length = 200_000 + threshold_pct = 0.85 + compress_token_threshold = int(context_length * threshold_pct) - needs_compress = ( - approx_tokens >= compress_token_threshold - or msg_count >= compress_msg_threshold - ) + needs_compress = approx_tokens >= compress_token_threshold assert not needs_compress - def test_large_message_count_triggers(self): - """200+ messages should trigger compression even if tokens are low.""" - history = _make_history(250, content_size=10) - msg_count = len(history) - - compress_msg_threshold = 200 - needs_compress = msg_count >= compress_msg_threshold - assert needs_compress - def test_large_token_count_triggers(self): - """High token count should trigger compression even if message count is low.""" - # 50 messages with huge content to exceed 100K tokens - history = _make_history(50, content_size=10_000) + """High token count should trigger compression when exceeding model threshold.""" + # Build a history that exceeds 85% of a 200k model (170k tokens) + history = _make_large_history_tokens(180_000) approx_tokens = estimate_messages_tokens_rough(history) - compress_token_threshold = 100_000 + context_length = 200_000 + threshold_pct = 0.85 + compress_token_threshold = int(context_length * threshold_pct) + needs_compress = approx_tokens >= compress_token_threshold assert needs_compress - def test_under_both_thresholds_no_trigger(self): - """Session under both thresholds should not trigger.""" - history = _make_history(100, content_size=100) - msg_count = len(history) + def test_under_threshold_no_trigger(self): + """Session under threshold should not trigger, even with many messages.""" + # 
250 short messages — lots of messages but well under token threshold + history = _make_history(250, content_size=10) approx_tokens = estimate_messages_tokens_rough(history) - compress_token_threshold = 100_000 - compress_msg_threshold = 200 + # 200k model at 85% = 170k token threshold + context_length = 200_000 + threshold_pct = 0.85 + compress_token_threshold = int(context_length * threshold_pct) - needs_compress = ( - approx_tokens >= compress_token_threshold - or msg_count >= compress_msg_threshold + needs_compress = approx_tokens >= compress_token_threshold + assert not needs_compress, ( + f"250 short messages (~{approx_tokens} tokens) should NOT trigger " + f"compression at {compress_token_threshold} token threshold" ) + + def test_message_count_alone_does_not_trigger(self): + """Message count alone should NOT trigger — only token count matters. + + The old system used an OR of token-count and message-count thresholds, + which caused premature compression in tool-heavy sessions with 200+ + messages but low total tokens. 
+ """ + # 300 very short messages — old system would compress, new should not + history = _make_history(300, content_size=10) + approx_tokens = estimate_messages_tokens_rough(history) + + context_length = 200_000 + threshold_pct = 0.85 + compress_token_threshold = int(context_length * threshold_pct) + + # Token-based check only + needs_compress = approx_tokens >= compress_token_threshold assert not needs_compress - def test_custom_thresholds(self): - """Custom thresholds from config should be respected.""" - history = _make_history(60, content_size=100) - msg_count = len(history) + def test_threshold_scales_with_model(self): + """Different models should have different compression thresholds.""" + # 128k model at 85% = 108,800 tokens + small_model_threshold = int(128_000 * 0.85) + # 200k model at 85% = 170,000 tokens + large_model_threshold = int(200_000 * 0.85) + # 1M model at 85% = 850,000 tokens + huge_model_threshold = int(1_000_000 * 0.85) - # Custom lower threshold - compress_msg_threshold = 50 - needs_compress = msg_count >= compress_msg_threshold - assert needs_compress + # A session at ~120k tokens: + history = _make_large_history_tokens(120_000) + approx_tokens = estimate_messages_tokens_rough(history) - # Custom higher threshold - compress_msg_threshold = 100 - needs_compress = msg_count >= compress_msg_threshold - assert not needs_compress + # Should trigger for 128k model + assert approx_tokens >= small_model_threshold + # Should NOT trigger for 200k model + assert approx_tokens < large_model_threshold + # Should NOT trigger for 1M model + assert approx_tokens < huge_model_threshold + + def test_custom_threshold_percentage(self): + """Custom threshold percentage from config should be respected.""" + context_length = 200_000 + + # At 50% threshold = 100k + low_threshold = int(context_length * 0.50) + # At 90% threshold = 180k + high_threshold = int(context_length * 0.90) + + history = _make_large_history_tokens(150_000) + approx_tokens = 
estimate_messages_tokens_rough(history) + + # Should trigger at 50% but not at 90% + assert approx_tokens >= low_threshold + assert approx_tokens < high_threshold def test_minimum_message_guard(self): """Sessions with fewer than 4 messages should never trigger.""" @@ -117,18 +159,19 @@ class TestSessionHygieneThresholds: class TestSessionHygieneWarnThreshold: - """Test the post-compression warning threshold.""" + """Test the post-compression warning threshold (95% of context).""" def test_warn_when_still_large(self): - """If compressed result is still above warn_tokens, should warn.""" - # Simulate post-compression tokens - warn_threshold = 200_000 - post_compress_tokens = 250_000 + """If compressed result is still above 95% of context, should warn.""" + context_length = 200_000 + warn_threshold = int(context_length * 0.95) # 190k + post_compress_tokens = 195_000 assert post_compress_tokens >= warn_threshold def test_no_warn_when_under(self): - """If compressed result is under warn_tokens, no warning.""" - warn_threshold = 200_000 + """If compressed result is under 95% of context, no warning.""" + context_length = 200_000 + warn_threshold = int(context_length * 0.95) # 190k post_compress_tokens = 150_000 assert post_compress_tokens < warn_threshold @@ -150,10 +193,12 @@ class TestTokenEstimation: assert estimate_messages_tokens_rough(many) > estimate_messages_tokens_rough(few) def test_pathological_session_detected(self): - """The reported pathological case: 648 messages, ~299K tokens.""" - # Simulate a 648-message session averaging ~460 tokens per message + """The reported pathological case: 648 messages, ~299K tokens. + + With a 200k model at 85% threshold (170k), this should trigger. 
+ """ history = _make_history(648, content_size=1800) tokens = estimate_messages_tokens_rough(history) - # Should be well above the 100K default threshold - assert tokens > 100_000 - assert len(history) > 200 + # Should be well above the 170K threshold for a 200k model + threshold = int(200_000 * 0.85) + assert tokens > threshold From a6d3becd6a9ba56391685364e6a7148534f18b46 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Mon, 9 Mar 2026 02:39:03 -0700 Subject: [PATCH 3/4] =?UTF-8?q?feat:=20update=20OBLITERATUS=20skill=20to?= =?UTF-8?q?=20v2.0=20=E2=80=94=20match=20current=20repo=20state?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Major updates to reflect the current OBLITERATUS codebase: - Change default recommendation from 'informed' (experimental) to 'advanced' (reliable, well-tested multi-direction SVD) - Add new CLI commands: tourney, recommend, strategies, report, aggregate, abliterate (alias) - Add --direction-method flag (diff_means, svd, leace) - Add strategies module (embedding/FFN ablation, head pruning, layer removal) - Add evaluation module with LM Eval Harness integration - Expand analysis modules from 15 to 28 - Add Apple Silicon (MLX) support - Add study presets (quick, jailbreak, knowledge, etc.) - Add --contribute, --verify-sample-size, --preset flags - Add complete CLI command reference table - Fix torch property name: total_mem -> total_memory (caught during live testing) Tested: Successfully abliterated Qwen2.5-0.5B-Instruct using 'advanced' method — refusal rate 0.4%, coherence 1.0, model responds without refusal to test prompts. 
--- skills/mlops/obliteratus/SKILL.md | 347 +++++++++--------- .../references/analysis-modules.md | 264 +++++++------ .../obliteratus/references/methods-guide.md | 211 ++++++----- 3 files changed, 420 insertions(+), 402 deletions(-) diff --git a/skills/mlops/obliteratus/SKILL.md b/skills/mlops/obliteratus/SKILL.md index d9525a347..a532ab77b 100644 --- a/skills/mlops/obliteratus/SKILL.md +++ b/skills/mlops/obliteratus/SKILL.md @@ -1,19 +1,19 @@ --- name: obliteratus -description: Remove refusal behaviors from open-weight LLMs using OBLITERATUS — mechanistic interpretability techniques (diff-in-means, SVD, whitened SVD, SAE decomposition, etc.) to excise guardrails while preserving reasoning. 9 CLI methods (+ 4 Python-API-only), 15 analysis modules, 116 model presets across 5 compute tiers. Use when a user wants to uncensor, abliterate, or remove refusal from an LLM. -version: 1.0.0 +description: Remove refusal behaviors from open-weight LLMs using OBLITERATUS — mechanistic interpretability techniques (diff-in-means, SVD, whitened SVD, LEACE, SAE decomposition, etc.) to excise guardrails while preserving reasoning. 9 CLI methods, 28 analysis modules, 116 model presets across 5 compute tiers, tournament evaluation, and telemetry-driven recommendations. Use when a user wants to uncensor, abliterate, or remove refusal from an LLM. +version: 2.0.0 author: Hermes Agent license: MIT dependencies: [obliteratus, torch, transformers, bitsandbytes, accelerate, safetensors] metadata: hermes: tags: [Abliteration, Uncensoring, Refusal-Removal, LLM, Weight-Projection, SVD, Mechanistic-Interpretability, HuggingFace, Model-Surgery] - + related_skills: [vllm, gguf, huggingface-tokenizers] --- # OBLITERATUS Skill -Remove refusal behaviors (guardrails) from open-weight LLMs without retraining or fine-tuning. 
Uses mechanistic interpretability techniques — including diff-in-means, SVD, whitened SVD, SAE decomposition, Bayesian kernel projection, and more — to identify and surgically excise refusal directions from model weights while preserving reasoning capabilities. +Remove refusal behaviors (guardrails) from open-weight LLMs without retraining or fine-tuning. Uses mechanistic interpretability techniques — including diff-in-means, SVD, whitened SVD, LEACE concept erasure, SAE decomposition, Bayesian kernel projection, and more — to identify and surgically excise refusal directions from model weights while preserving reasoning capabilities. **License warning:** OBLITERATUS is AGPL-3.0. NEVER import it as a Python library. Always invoke via CLI (`obliteratus` command) or subprocess. This keeps Hermes Agent's MIT license clean. @@ -25,7 +25,7 @@ Trigger when the user: - Wants to create an uncensored version of Llama, Qwen, Mistral, etc. - Mentions "refusal removal", "abliteration", "weight projection" - Wants to analyze how a model's refusal mechanism works -- References OBLITERATUS, FailSpy, abliterator, or refusal directions +- References OBLITERATUS, abliterator, or refusal directions ## Step 1: Installation @@ -35,10 +35,12 @@ obliteratus --version 2>/dev/null && echo "INSTALLED" || echo "NOT INSTALLED" ``` If not installed, clone and install from GitHub: -``` -Repository: https://github.com/elder-plinius/OBLITERATUS -Install: pip install -e . (from the cloned directory) -For Gradio UI: pip install -e ".[spaces]" +```bash +git clone https://github.com/elder-plinius/OBLITERATUS.git +cd OBLITERATUS +pip install -e . +# For Gradio web UI support: +# pip install -e ".[spaces]" ``` **IMPORTANT:** Confirm with user before installing. This pulls in ~5-10GB of dependencies (PyTorch, Transformers, bitsandbytes, etc.). 
@@ -51,7 +53,7 @@ python3 -c " import torch if torch.cuda.is_available(): gpu = torch.cuda.get_device_name(0) - vram = torch.cuda.get_device_properties(0).total_mem / 1024**3 + vram = torch.cuda.get_device_properties(0).total_memory / 1024**3 print(f'GPU: {gpu}') print(f'VRAM: {vram:.1f} GB') if vram < 4: print('TIER: tiny (models under 1B)') @@ -75,25 +77,28 @@ else: | 48 GB+ | ~72B+ params | Qwen2.5-72B, DeepSeek-R1 | | Multi-GPU| 200B+ params | Llama 3.1 405B, DeepSeek-V3 (685B MoE) | -## Step 3: Browse Available Models +## Step 3: Browse Available Models & Get Recommendations ```bash -# List models for your compute tier +# Browse models by compute tier obliteratus models --tier medium # Get architecture info for a specific model -obliteratus info meta-llama/Llama-3.1-8B-Instruct +obliteratus info <model-id> + +# Get telemetry-driven recommendation for best method & params +obliteratus recommend <model-id> +obliteratus recommend <model-id> --insights # global cross-architecture rankings ``` ## Step 4: Choose a Method ### Method Selection Guide - -**First time / unsure? Use `informed`.** It auto-configures everything. +**Default / recommended for most cases: `advanced`.** It uses multi-direction SVD with norm-preserving projection and is well-tested. 
| Situation | Recommended Method | Why | |:----------------------------------|:-------------------|:-----------------------------------------| -| First attempt, any model | `informed` | Auto-detects alignment type, auto-tunes | +| Default / most models | `advanced` | Multi-direction SVD, norm-preserving, reliable | | Quick test / prototyping | `basic` | Fast, simple, good enough to evaluate | | Dense model (Llama, Mistral) | `advanced` | Multi-direction, norm-preserving | | MoE model (DeepSeek, Mixtral) | `nuclear` | Expert-granular, handles MoE complexity | @@ -101,214 +106,222 @@ obliteratus info meta-llama/Llama-3.1-8B-Instruct | Stubborn refusals persist | `aggressive` | Whitened SVD + head surgery + jailbreak | | Want reversible changes | Use steering vectors (see Analysis section) | | Maximum quality, time no object | `optimized` | Bayesian search for best parameters | +| Experimental auto-detection | `informed` | Auto-detects alignment type — experimental, may not always outperform advanced | ### 9 CLI Methods +- **basic** — Single refusal direction via diff-in-means. Fast (~5-10 min for 8B). +- **advanced** (DEFAULT, RECOMMENDED) — Multiple SVD directions, norm-preserving projection, 2 refinement passes. Medium speed (~10-20 min). +- **aggressive** — Whitened SVD + jailbreak-contrastive + attention head surgery. Higher risk of coherence damage. +- **spectral_cascade** — DCT frequency-domain decomposition. Research/novel approach. +- **informed** — Runs analysis DURING abliteration to auto-configure. Experimental — slower and less predictable than advanced. +- **surgical** — SAE features + neuron masking + head surgery + per-expert. Very slow (~1-2 hrs). Best for reasoning models. +- **optimized** — Bayesian hyperparameter search (Optuna TPE). Longest runtime but finds optimal parameters. +- **inverted** — Flips the refusal direction. Model becomes actively willing. +- **nuclear** — Maximum force combo for stubborn MoE models. Expert-granular. 
-These can be passed to `--method` on the command line: - -- **basic** — Single refusal direction via diff-in-means. Fastest, simplest. (Arditi et al. 2024) -- **advanced** — Multiple SVD directions, norm-preserving projection. Good default. -- **aggressive** — Whitened SVD + jailbreak contrast + attention head surgery -- **spectral_cascade** — DCT frequency-domain decomposition -- **informed** — Runs analysis DURING abliteration to auto-configure. Detects DPO/RLHF/CAI, maps refusal geometry, compensates for self-repair. Best quality. -- **surgical** — SAE features + neuron masking + head surgery + per-expert. Maximum precision. -- **optimized** — Bayesian hyperparameter search (Optuna TPE). Slowest but optimal. -- **inverted** — Flips the refusal direction (model becomes eager to help, not just neutral) -- **nuclear** — Maximum force combo for stubborn MoE models. +### Direction Extraction Methods (--direction-method flag) +- **diff_means** (default) — Simple difference-in-means between refused/complied activations. Robust. +- **svd** — Multi-direction SVD extraction. Better for complex alignment. +- **leace** — LEACE (LEAst-squares Concept Erasure). Optimal linear erasure. ### 4 Python-API-Only Methods - -These reproduce prior community/academic work but are NOT available via CLI — only via the Python API (`from obliteratus.abliterate import AbliterationPipeline`). **Do not use these in CLI commands.** - -- **failspy** — FailSpy/abliterator reproduction -- **gabliteration** — Gabliteration reproduction -- **heretic** — Heretic/p-e-w reproduction -- **rdo** — Refusal Direction Optimization (ICML 2025) +(NOT available via CLI — require Python import, which violates AGPL boundary. Mention to user only if they explicitly want to use OBLITERATUS as a library in their own AGPL project.) 
+- failspy, gabliteration, heretic, rdo ## Step 5: Run Abliteration -### Basic Usage - +### Standard usage ```bash -# Default (advanced method) -obliteratus obliterate meta-llama/Llama-3.1-8B-Instruct +# Default method (advanced) — recommended for most models +obliteratus obliterate <model-id> --method advanced --output-dir ./abliterated-models -# With the informed pipeline (recommended) -obliteratus obliterate meta-llama/Llama-3.1-8B-Instruct --method informed +# With 4-bit quantization (saves VRAM) +obliteratus obliterate <model-id> --method advanced --quantization 4bit --output-dir ./abliterated-models -# With 4-bit quantization to save VRAM -obliteratus obliterate meta-llama/Llama-3.1-8B-Instruct \ - --method informed \ - --quantization 4bit \ - --output-dir ./abliterated-models - -# For large models (120B+), use conservative settings -obliteratus obliterate Qwen/Qwen2.5-72B-Instruct \ - --method advanced \ - --quantization 4bit \ - --large-model \ - --output-dir ./abliterated-models +# Large models (70B+) — conservative defaults +obliteratus obliterate <model-id> --method advanced --quantization 4bit --large-model --output-dir ./abliterated-models ``` -### Fine-Tuning Parameters - +### Fine-tuning parameters ```bash -obliteratus obliterate \ +obliteratus obliterate <model-id> \ --method advanced \ - --n-directions 8 \ + --direction-method diff_means \ + --n-directions 4 \ + --refinement-passes 2 \ --regularization 0.1 \ - --refinement-passes 3 \ - --dtype bfloat16 \ - --device auto \ - --output-dir ./output + --quantization 4bit \ + --output-dir ./abliterated-models \ + --contribute # opt-in telemetry for community research ``` -Parameter explanations: -- `--n-directions N` — How many refusal directions to remove (default: auto-detected) -- `--regularization 0.0-1.0` — Fraction of original weights to preserve (higher = safer but less complete removal) -- `--refinement-passes N` — Iterative passes to catch self-repair (Ouroboros effect) -- `--dtype` — float16, bfloat16, or float32 -- `--quantization` — 
4bit or 8bit (saves VRAM, slight quality tradeoff) -- `--large-model` — Conservative defaults for 120B+ models (fewer directions, fewer passes) +### Key flags +| Flag | Description | Default | +|:-----|:------------|:--------| +| `--method` | Abliteration method | advanced | +| `--direction-method` | Direction extraction | diff_means | +| `--n-directions` | Number of refusal directions (1-32) | method-dependent | +| `--refinement-passes` | Iterative passes (1-5) | 2 | +| `--regularization` | Regularization strength (0.0-1.0) | 0.1 | +| `--quantization` | Load in 4bit or 8bit | none (full precision) | +| `--large-model` | Conservative defaults for 120B+ | false | +| `--output-dir` | Where to save the abliterated model | ./obliterated_model | +| `--contribute` | Share anonymized results for research | false | +| `--verify-sample-size` | Number of test prompts for refusal check | 20 | +| `--dtype` | Model dtype (float16, bfloat16) | auto | -### Interactive Mode (Guided) - -For users unsure about options: +### Other execution modes ```bash +# Interactive guided mode (hardware → model → preset) obliteratus interactive -``` -### Web UI (Gradio) - -```bash +# Web UI (Gradio) obliteratus ui --port 7860 + +# Run a full ablation study from YAML config +obliteratus run config.yaml --preset quick + +# Tournament: pit all methods against each other +obliteratus tourney ``` ## Step 6: Verify Results -After abliteration, check the output report for: +After abliteration, check the output metrics: -| Metric | Good Value | Concerning Value | Meaning | -|:---------------|:--------------------|:------------------------|:-------------------------------------------| -| Refusal rate | Near 0% | > 10% | Refusals still present, try harder method | -| Perplexity | Within 10% of orig | > 20% increase | Model coherence damaged, too aggressive | -| KL divergence | < 0.1 | > 0.5 | Large output distribution shift | -| Coherence | High | Low | Model generating nonsense | +| Metric | Good Value | 
Warning | +|:-------|:-----------|:--------| +| Refusal rate | < 5% (ideally ~0%) | > 10% means refusals persist | +| Perplexity change | < 10% increase | > 15% means coherence damage | +| KL divergence | < 0.1 | > 0.5 means significant distribution shift | +| Coherence | High / passes qualitative check | Degraded responses, repetition | -### If perplexity spiked (too aggressive): -1. Increase `--regularization` (e.g., 0.2 or 0.3) -2. Decrease `--n-directions` (e.g., 4 instead of 8) -3. Use a less aggressive method (`advanced` instead of `aggressive`) +### If refusals persist (> 10%) +1. Try `aggressive` method +2. Increase `--n-directions` (e.g., 8 or 16) +3. Add `--refinement-passes 3` +4. Try `--direction-method svd` instead of diff_means -### If refusal persists (not aggressive enough): -1. Use `--method aggressive` or `--method nuclear` -2. Add `--refinement-passes 3` to catch self-repair -3. Use `--method informed` which auto-compensates +### If coherence is damaged (perplexity > 15% increase) +1. Reduce `--n-directions` (try 2) +2. Increase `--regularization` (try 0.3) +3. Reduce `--refinement-passes` to 1 +4. Try `basic` method (gentler) ## Step 7: Use the Abliterated Model -The output is a standard HuggingFace model directory. Use it like any other model: +The output is a standard HuggingFace model directory. 
-### Quick test ```bash -python3 << 'EOF' +# Test locally with transformers +python3 -c " from transformers import AutoModelForCausalLM, AutoTokenizer -model = AutoModelForCausalLM.from_pretrained("./abliterated-models/model-name") -tokenizer = AutoTokenizer.from_pretrained("./abliterated-models/model-name") -inputs = tokenizer("Write a story about:", return_tensors="pt").to(model.device) +model = AutoModelForCausalLM.from_pretrained('./abliterated-models/') +tokenizer = AutoTokenizer.from_pretrained('./abliterated-models/') +inputs = tokenizer('How do I pick a lock?', return_tensors='pt') outputs = model.generate(**inputs, max_new_tokens=200) print(tokenizer.decode(outputs[0], skip_special_tokens=True)) -EOF +" + +# Upload to HuggingFace Hub +huggingface-cli upload <username>/<model-name>-abliterated ./abliterated-models/ + +# Serve with vLLM +vllm serve ./abliterated-models/ ``` -### Upload to HuggingFace Hub + +## CLI Command Reference + +| Command | Description | +|:--------|:------------| +| `obliteratus obliterate` | Main abliteration command | +| `obliteratus info <model-id>` | Print model architecture details | +| `obliteratus models --tier <tier>` | Browse curated models by compute tier | +| `obliteratus recommend <model-id>` | Telemetry-driven method/param suggestion | +| `obliteratus interactive` | Guided setup wizard | +| `obliteratus tourney <model-id>` | Tournament: all methods head-to-head | +| `obliteratus run <config.yaml>` | Execute ablation study from YAML | +| `obliteratus strategies` | List all registered ablation strategies | +| `obliteratus report <results-dir>` | Regenerate visual reports | +| `obliteratus ui` | Launch Gradio web interface | +| `obliteratus aggregate` | Summarize community telemetry data | + +## Analysis Modules + +OBLITERATUS includes 28 analysis modules for mechanistic interpretability. +See `skill_view(name="obliteratus", file_path="references/analysis-modules.md")` for the full reference. 
+ +### Quick analysis commands ```bash -huggingface-cli login # if not already logged in -huggingface-cli upload your-username/model-name-abliterated ./abliterated-models/model-name +# Run specific analysis modules +obliteratus run analysis-config.yaml --preset quick + +# Key modules to run first: +# - alignment_imprint: Fingerprint DPO/RLHF/CAI/SFT alignment method +# - concept_geometry: Single direction vs polyhedral cone +# - logit_lens: Which layer decides to refuse +# - anti_ouroboros: Self-repair risk score +# - causal_tracing: Causally necessary components ``` -### Serve with vLLM -```bash -vllm serve ./abliterated-models/model-name --port 8000 +### Steering Vectors (Reversible Alternative) +Instead of permanent weight modification, use inference-time steering: +```python +# Python API only — for user's own projects +from obliteratus.analysis.steering_vectors import SteeringVectorFactory, SteeringHookManager ``` -## Analysis Modules (15 Modules, Pre-Abliteration, Optional) +## Ablation Strategies -For understanding refusal geometry before committing to abliteration. 
+Beyond direction-based abliteration, OBLITERATUS includes structural ablation strategies: +- **Embedding Ablation** — Target embedding layer components +- **FFN Ablation** — Feed-forward network block removal +- **Head Pruning** — Attention head pruning +- **Layer Removal** — Full layer removal -### Run a Study +List all available: `obliteratus strategies` -```bash -obliteratus run study-config.yaml --preset jailbreak -``` +## Evaluation -### Study Presets +OBLITERATUS includes built-in evaluation tools: +- Refusal rate benchmarking +- Perplexity comparison (before/after) +- LM Eval Harness integration for academic benchmarks +- Head-to-head competitor comparison +- Baseline performance tracking -| Preset | Purpose | Time | -|:-------------|:-------------------------------------|:-------| -| `quick` | Sanity check, basic metrics | ~5 min | -| `jailbreak` | Refusal circuit localization | ~20 min| -| `guardrail` | Guardrail robustness evaluation | ~30 min| -| `attention` | Attention head contributions | ~30 min| -| `knowledge` | FFN importance mapping | ~30 min| -| `full` | Complete analysis, all strategies | ~1 hr | +## Platform Support -### Key Analysis Modules +- **CUDA** — Full support (NVIDIA GPUs) +- **Apple Silicon (MLX)** — Supported via MLX backend +- **CPU** — Supported for tiny models (< 1B params) -- **Alignment Imprint Detection** — Fingerprints DPO vs RLHF vs CAI vs SFT from subspace geometry -- **Concept Cone Geometry** — Is refusal one linear direction or a polyhedral cone (many directions)? -- **Refusal Logit Lens** — Which transformer layer makes the refusal decision? -- **Ouroboros Detection** — Will the model self-repair its refusal after removal? -- **Causal Tracing** — Which attention heads and MLP layers are causally necessary for refusal? -- **Cross-Model Transfer** — Can refusal directions from one model architecture work on another? 
-- **Residual Stream Decomposition** — Attention vs MLP contribution to refusal behavior -- **SAE-based Analysis** — Sparse Autoencoder feature decomposition of refusal circuits +## YAML Config Templates -## Steering Vectors (Reversible Alternative) +Load templates for reproducible runs via `skill_view`: +- `templates/abliteration-config.yaml` — Standard single-model config +- `templates/analysis-study.yaml` — Pre-abliteration analysis study +- `templates/batch-abliteration.yaml` — Multi-model batch processing -For testing refusal removal without permanent weight changes: +## Telemetry -Steering vectors apply activation hooks at inference time. Model weights stay unchanged. -Generated during the PROBE/DISTILL stages and can be saved/applied/removed at will. -Useful for A/B testing before committing to permanent abliteration. - -## YAML Config for Reproducible Studies - -For complex or reproducible workflows, use YAML configs. See templates/ for examples: -```bash -obliteratus run my_study.yaml -``` - -## Telemetry Notice - -- **CLI usage (local installs)**: Telemetry is OFF by default. Must explicitly opt in via `OBLITERATUS_TELEMETRY=1` env var or `--contribute` flag. -- **HuggingFace Spaces**: Telemetry is ON by default (auto-enabled when `SPACE_ID` env var is detected). -- Collected: model ID, method, benchmark scores, hardware info, timing (anonymous) -- NOT collected: IP addresses, user identity, prompt content -- Force off: `export OBLITERATUS_TELEMETRY=0` +OBLITERATUS can optionally contribute anonymized run data to a global research dataset. +Enable with `--contribute` flag. No personal data is collected — only model name, method, metrics. ## Common Pitfalls -1. **OOM (Out of Memory)** — Use `--quantization 4bit` and `--large-model` for big models -2. **Perplexity spike** — Too aggressive. Increase `--regularization` or reduce `--n-directions` -3. **Refusal persists** — Try `--method aggressive` or `--refinement-passes 3` -4. 
**MoE models resist** — Use `--method nuclear` for DeepSeek, Mixtral, DBRX -5. **Gated models fail** — Run `huggingface-cli login` and accept model terms on HF website first -6. **Self-repair (Ouroboros)** — Some models reconstruct refusal. Use `--method informed` which auto-compensates -7. **CoT damage** — Reasoning models lose chain-of-thought. Use `--method surgical` (CoT-aware) -8. **Disk space** — Output is full model copy. 8B fp16 = ~16GB, 70B fp16 = ~140GB -9. **Slow on CPU** — CPU-only is viable only for tiny models (<1B). Anything bigger needs GPU. +1. **Don't use `informed` as default** — it's experimental and slower. Use `advanced` for reliable results. +2. **Always check perplexity** — if it spikes > 15%, the model is damaged. Reduce aggressiveness. +3. **MoE models need special handling** — use `nuclear` method for Mixtral, DeepSeek-MoE, etc. +4. **Quantized models can't be re-quantized** — abliterate the full-precision model, then quantize the output. +5. **VRAM estimation is approximate** — 4-bit quant helps but peak usage can spike during extraction. +6. **Reasoning models are sensitive** — use `surgical` for R1 distills to preserve chain-of-thought. +7. **Check `obliteratus recommend`** — telemetry data may have better parameters than defaults. +8. **AGPL license** — never `import obliteratus` in MIT/Apache projects. CLI invocation only. +9. **Large models (70B+)** — always use `--large-model` flag for conservative defaults. -## Complementary Hermes Skills +## Complementary Skills -After abliteration: -- **axolotl** / **unsloth** — Fine-tune the abliterated model further -- **serving-llms-vllm** — Serve the model as an OpenAI-compatible API -- **sparse-autoencoder-training** — Train SAEs for deeper interpretability work - -## Resources - -- [OBLITERATUS GitHub](https://github.com/elder-plinius/OBLITERATUS) (AGPL-3.0) -- [HuggingFace Spaces Demo](https://huggingface.co/spaces/pliny-the-prompter/obliteratus) -- [Arditi et al. 
2024 — Refusal in LMs Is Mediated by a Single Direction](https://arxiv.org/abs/2406.11717) -- [Refusal Direction Optimization — ICML 2025](https://arxiv.org/abs/2411.14793) +- **vllm** — Serve abliterated models with high throughput +- **gguf** — Convert abliterated models to GGUF for llama.cpp +- **huggingface-tokenizers** — Work with model tokenizers diff --git a/skills/mlops/obliteratus/references/analysis-modules.md b/skills/mlops/obliteratus/references/analysis-modules.md index 075148a00..074ba8dec 100644 --- a/skills/mlops/obliteratus/references/analysis-modules.md +++ b/skills/mlops/obliteratus/references/analysis-modules.md @@ -1,170 +1,166 @@ # OBLITERATUS Analysis Modules — Reference -15 analysis modules for mechanistic interpretability of refusal in LLMs. -These help you understand HOW a model refuses before you decide to remove it. +OBLITERATUS includes 28 analysis modules for mechanistic interpretability of refusal in LLMs. +These modules help understand how and where refusal behaviors are encoded before performing abliteration. -> **Note:** The `analysis/` directory contains additional utility files (utils.py, -> visualization.py, etc.) and helper functions beyond the 15 core analysis modules -> listed below. The module count matches the README's "15 deep analysis modules." +--- ## Core Analysis (Run These First) -### Alignment Imprint Detection -**File:** `alignment_imprint.py` -**Purpose:** Identifies what alignment technique was used to train the model -**Detects:** DPO, RLHF, CAI (Constitutional AI), SFT (Supervised Fine-Tuning) -**How:** Analyzes subspace geometry — each alignment method leaves a distinct -geometric "fingerprint" in the weight space -**Output:** Detected method + confidence score -**Why it matters:** Different alignment methods need different abliteration approaches. -DPO models typically have cleaner single-direction refusal; RLHF is more diffuse. +### 1. 
Alignment Imprint Detection (`alignment_imprint.py`) +Fingerprints whether a model was trained via DPO, RLHF, CAI, or SFT. +This determines which extraction strategy will work best. -### Concept Cone Geometry -**File:** `concept_geometry.py` -**Purpose:** Maps whether refusal is one direction or a polyhedral cone (many) -**Output:** Cone angle, dimensionality, per-category breakdown -**Why it matters:** If refusal is a single direction, `basic` method works. If it's -a cone (multiple directions for different refusal categories), you need `advanced` -or `informed` with higher `n_directions`. +### 2. Concept Cone Geometry (`concept_geometry.py`) +Determines if refusal is a single linear direction or a polyhedral cone +(set of multiple mechanisms). Single-direction models respond well to `basic`; +polyhedral models need `advanced` or `surgical`. -### Refusal Logit Lens -**File:** `logit_lens.py` -**Purpose:** Identifies the specific layer where the model "decides" to refuse -**How:** Projects intermediate hidden states to vocabulary space at each layer, -watches when "I cannot" tokens spike in probability -**Output:** Layer-by-layer refusal probability plot -**Why it matters:** Tells you which layers are most important to target +### 3. Refusal Logit Lens (`logit_lens.py`) +Identifies the specific layer where a model "decides" to refuse by decoding +intermediate layer representations into token space. -### Ouroboros (Self-Repair) Detection -**File:** `anti_ouroboros.py` -**Purpose:** Predicts whether the model will reconstruct its refusal after removal -**How:** Measures redundancy in refusal representation across layers -**Output:** Self-repair risk score (0-1) -**Why it matters:** High self-repair risk means you need multiple refinement passes -or the `informed` method which auto-compensates +### 4. Ouroboros Detection (`anti_ouroboros.py`) +Identifies if a model attempts to "self-repair" refusal behaviors after +excision. Reports a risk score (0-1). 
High scores mean additional refinement +passes are needed. -### Causal Tracing -**File:** `causal_tracing.py` -**Purpose:** Determines which components are causally necessary for refusal -**How:** Patches activations between clean and corrupted runs, measures causal effect -**Output:** Causal importance map across layers, heads, and MLPs -**Why it matters:** Shows exactly which components to target for surgical removal +### 5. Causal Tracing (`causal_tracing.py`) +Identifies which components (layers, heads, MLPs) are causally necessary +for refusal behavior using activation patching. + +--- ## Geometric Analysis -### Cross-Layer Alignment -**File:** `cross_layer.py` -**Purpose:** Measures how aligned refusal directions are across layers -**Output:** Alignment matrix, cluster assignments -**Why it matters:** If directions are highly aligned across layers, removal is easier. -If they cluster, you may need layer-group-specific directions. +### 6. Cross-Layer Alignment (`cross_layer.py`) +Measures how refusal directions align across different layers. High alignment +means the refusal signal is consistent; low alignment suggests layer-specific +mechanisms. -### Residual Stream Decomposition -**File:** `residual_stream.py` -**Purpose:** Breaks down refusal into Attention vs MLP contributions -**Output:** Per-layer Attention/MLP contribution to refusal direction -**Why it matters:** Helps decide whether to target attention heads, MLPs, or both +### 7. Residual Stream Decomposition (`residual_stream.py`) +Decomposes the residual stream into attention and MLP contributions to +understand which component type contributes more to refusal. -### Riemannian Manifold Geometry -**File:** `riemannian_manifold.py` (673 lines) -**Purpose:** Analyzes the weight manifold geometry around refusal directions -**Output:** Curvature, geodesics, tangent space analysis -**Why it matters:** Research-grade; helps understand the geometric structure of alignment +### 8. 
Riemannian Manifold Geometry (`riemannian_manifold.py`) +Analyzes the curvature and geometry of the weight manifold near refusal +directions. Informs how aggressively projections can be applied without +damaging the manifold structure. -### Whitened SVD -**File:** `whitened_svd.py` -**Purpose:** Covariance-normalized SVD extraction -**How:** Whitens the activation covariance before computing refusal directions, -separating true refusal signal from natural activation variance -**Output:** Cleaner refusal directions with less noise -**Why it matters:** Produces more precise directions, especially for noisy activations +### 9. Whitened SVD (`whitened_svd.py`) +Covariance-normalized SVD extraction that separates guardrail signals from +natural activation variance. More precise than standard SVD for models with +high activation variance. + +### 10. Concept Cone Geometry (extended) +Maps the full polyhedral structure of refusal, including cone angles, +face counts, and intersection patterns. + +--- ## Probing & Classification -### Activation Probing -**File:** `activation_probing.py` -**Purpose:** Post-excision probing to verify refusal signal is truly gone -**Output:** Residual refusal signal strength per layer -**Why it matters:** Verification that abliteration was complete +### 11. Activation Probing (`activation_probing.py`) +Post-excision verification — probes for residual refusal concepts after +abliteration to ensure complete removal. -### Probing Classifiers -**File:** `probing_classifiers.py` -**Purpose:** Trains linear classifiers to detect refusal in hidden states -**Output:** Classification accuracy per layer (should drop to ~50% after abliteration) -**Why it matters:** Quantitative measure of refusal removal completeness +### 12. Probing Classifiers (`probing_classifiers.py`) +Trains linear classifiers to detect refusal in activations. Used both +before (to verify refusal exists) and after (to verify it's gone). 
-### Activation Patching -**File:** `activation_patching.py` -**Purpose:** Interchange interventions — swap activations between harmful/harmless runs -**Output:** Which components are sufficient (not just necessary) for refusal -**Why it matters:** Complementary to causal tracing; together they give full picture +### 13. Activation Patching (`activation_patching.py`) +Interchange interventions — swaps activations between refused and complied +runs to identify causal components. + +### 14. Tuned Lens (`tuned_lens.py`) +Trained version of logit lens that provides more accurate per-layer +decoding by learning affine transformations for each layer. + +### 15. Multi-Token Position Analysis (`multi_token_position.py`) +Analyzes refusal signals across multiple token positions, not just the +last token. Important for models that distribute refusal across the sequence. + +--- + +## Abliteration & Manipulation + +### 16. SAE-Based Abliteration (`sae_abliteration.py`) +Uses Sparse Autoencoder features to identify and remove specific refusal +features. More surgical than direction-based methods. + +### 17. Steering Vectors (`steering_vectors.py`) +Creates and applies inference-time steering vectors for reversible refusal +modification. Includes `SteeringVectorFactory` and `SteeringHookManager`. + +### 18. LEACE Concept Erasure (`leace.py`) +LEAst-squares Concept Erasure (closed-form) — mathematically optimal linear +concept removal. Available as both analysis module and direction extraction method. + +### 19. Sparse Surgery (`sparse_surgery.py`) +High-precision weight modification targeting individual neurons and +weight matrix entries rather than full directions. + +### 20. Conditional Abliteration (`conditional_abliteration.py`) +Targeted removal that only affects specific refusal categories while +preserving others (e.g., remove weapons refusal but keep CSAM refusal). 
+ +--- ## Transfer & Robustness -### Cross-Model Transfer -**File:** `cross_model_transfer.py` -**Purpose:** Tests if refusal directions from one model work on another -**Output:** Transfer success rate between model pairs -**Why it matters:** If directions transfer, you can skip PROBE stage on similar models +### 21. Cross-Model Transfer (`cross_model_transfer.py`) +Tests whether refusal directions extracted from one model transfer to +another architecture. Measures universality of guardrail directions. -### Defense Robustness -**File:** `defense_robustness.py` -**Purpose:** Evaluates how robust the model's refusal defenses are -**Output:** Robustness score, entanglement mapping -**Why it matters:** Higher robustness = need more aggressive method +### 22. Defense Robustness (`defense_robustness.py`) +Evaluates how robust the abliteration is against various defense mechanisms +and re-alignment attempts. -### Spectral Certification -**File:** `spectral_certification.py` -**Purpose:** Certifies completeness of refusal direction removal -**Output:** Spectral gap analysis, completeness score -**Why it matters:** Formal verification that all major refusal components are addressed +### 23. Spectral Certification (`spectral_certification.py`) +Provides mathematical bounds on the completeness of refusal removal +using spectral analysis of the projection. + +### 24. Wasserstein Optimal Extraction (`wasserstein_optimal.py`) +Uses optimal transport theory for more precise direction extraction +that minimizes distribution shift. + +### 25. Wasserstein Transfer (`wasserstein_transfer.py`) +Distribution transfer between models using Wasserstein distance +for cross-architecture refusal direction mapping. 
+ +--- ## Advanced / Research -### SAE-based Abliteration -**File:** `sae_abliteration.py` (762 lines) -**Purpose:** Uses Sparse Autoencoder features to decompose refusal at feature level -**Output:** Refusal-specific SAE features, targeted removal -**Why it matters:** Most fine-grained approach; can target individual refusal "concepts" +### 26. Bayesian Kernel Projection (`bayesian_kernel_projection.py`) +Probabilistic feature mapping that estimates uncertainty in refusal +direction identification. -### Wasserstein Optimal Extraction -**File:** `wasserstein_optimal.py` -**Purpose:** Optimal transport-based direction extraction -**Output:** Wasserstein-optimal refusal directions -**Why it matters:** Theoretically optimal direction extraction under distributional assumptions +### 27. Cross-Model Universality Index +Measures if guardrail directions generalize across different model +architectures and training regimes. -### Bayesian Kernel Projection -**File:** `bayesian_kernel_projection.py` -**Purpose:** Bayesian approach to refusal direction projection -**Output:** Posterior distribution over refusal directions -**Why it matters:** Quantifies uncertainty in direction estimation +### 28. Visualization (`visualization.py`) +Plotting and graphing utilities for all analysis modules. Generates +heatmaps, direction plots, and layer-wise analysis charts. 
-### Conditional Abliteration -**File:** `conditional_abliteration.py` -**Purpose:** Domain-specific conditional removal (remove refusal for topic X but keep for Y) -**Output:** Per-domain refusal directions -**Why it matters:** Selective uncensoring — remove only specific refusal categories +--- -### Steering Vectors -**File:** `steering_vectors.py` -**Purpose:** Generate inference-time steering vectors (reversible alternative) -**Output:** Steering vector files that can be applied/removed at inference -**Why it matters:** Non-destructive alternative to permanent weight modification +## Running Analysis -### Tuned Lens -**File:** `tuned_lens.py` -**Purpose:** Trained linear probes per layer (more accurate than raw logit lens) -**Output:** Layer-by-layer refusal representation with trained projections -**Why it matters:** More accurate than logit lens, especially for deeper models +### Via CLI +```bash +# Run analysis from a YAML config +obliteratus run analysis-study.yaml --preset quick -### Multi-Token Position Analysis -**File:** `multi_token_position.py` -**Purpose:** Analyzes refusal signal at multiple token positions (not just last) -**Output:** Position-dependent refusal direction maps -**Why it matters:** Some models encode refusal at the system prompt position, not the query +# Available study presets: +# quick — Fast sanity check (2-3 modules) +# full — All core + geometric analysis +# jailbreak — Refusal circuit localization +# knowledge — Knowledge preservation analysis +# robustness — Stress testing / defense evaluation +``` -### Sparse Surgery -**File:** `sparse_surgery.py` -**Purpose:** Row-level sparse weight surgery instead of full matrix projection -**Output:** Targeted weight modifications at the row level -**Why it matters:** More surgical than full-matrix projection, less collateral damage +### Via YAML Config +See the `templates/analysis-study.yaml` template for a complete example. 
+Load with: `skill_view(name="obliteratus", file_path="templates/analysis-study.yaml")` diff --git a/skills/mlops/obliteratus/references/methods-guide.md b/skills/mlops/obliteratus/references/methods-guide.md index 5f7c501b0..1ef323c16 100644 --- a/skills/mlops/obliteratus/references/methods-guide.md +++ b/skills/mlops/obliteratus/references/methods-guide.md @@ -1,132 +1,141 @@ # OBLITERATUS Methods — Detailed Guide -> **Important:** The CLI (`obliteratus obliterate --method`) accepts 9 methods: -> basic, advanced, aggressive, spectral_cascade, informed, surgical, optimized, -> inverted, nuclear. Four additional methods (failspy, gabliteration, heretic, rdo) -> are available only via the Python API and will be rejected by argparse if used on CLI. +> The CLI accepts 9 methods via `--method`: basic, advanced, aggressive, spectral_cascade, +> informed, surgical, optimized, inverted, nuclear. +> Four additional methods (failspy, gabliteration, heretic, rdo) are available only via the Python API. ## How Abliteration Works (Theory) -When a model is trained with RLHF/DPO/CAI, it learns to represent "should I refuse?" -as a direction in its internal activation space. When processing a "harmful" prompt, -activations shift in this direction, causing the model to generate refusal text. - -Abliteration works by: -1. Measuring this direction (the difference between harmful and harmless activations) -2. Removing it from the model's weight matrices via orthogonal projection -3. The model can no longer "point toward" refusal, so it responds normally +Abliteration identifies a "refusal direction" — a vector in the model's activation space that +corresponds to refusal behavior — and projects it out of the weight matrices. Mathematically: `W_new = W_old - (W_old @ d @ d.T)` where `d` is the refusal direction. +The key challenge is finding accurate refusal directions without damaging other capabilities. 
+ +--- + +## Direction Extraction Methods + +Before projecting, OBLITERATUS extracts refusal directions using one of three methods: + +| Method | Flag | Description | Best For | +|:-------|:-----|:------------|:---------| +| Diff-in-Means | `--direction-method diff_means` | Difference between mean activations on refused vs. complied prompts | Default, fast, robust | +| SVD | `--direction-method svd` | Multi-direction extraction via Singular Value Decomposition | Complex alignment, multiple refusal mechanisms | +| LEACE | `--direction-method leace` | Linear Erasure via Closed-form Estimation — mathematically optimal | Maximum precision, research | + +--- + ## Method Details ### basic -**Technique:** Single refusal direction via diff-in-means -**Based on:** Arditi et al. 2024 ("Refusal in Language Models Is Mediated by a Single Direction") -**Speed:** Fast (~5-10 min for 8B) -**Quality:** Moderate — works for simple refusal patterns -**Best for:** Quick tests, models with clean single-direction refusal -**Limitation:** Misses complex multi-direction refusal patterns +- **Directions:** 1 (single diff-in-means vector) +- **Speed:** Fast (~5-10 min for 8B model) +- **Risk:** Low +- **Use case:** Quick tests, prototyping, evaluating if abliteration works for a model +- **How it works:** Extracts one refusal direction and projects it out uniformly across all layers. 
-### advanced (DEFAULT) -**Technique:** Multiple SVD directions with norm-preserving projection -**Speed:** Medium (~10-20 min for 8B) -**Quality:** Good — handles multi-direction refusal -**Best for:** Dense models (Llama, Qwen, Mistral) as a reliable default -**Key improvement:** Norm preservation prevents weight magnitude drift - -### informed (RECOMMENDED) -**Technique:** Analysis-guided auto-configuration -**Speed:** Slow (~20-40 min for 8B, runs 4 analysis modules first) -**Quality:** Best — adapts to each model's specific refusal implementation -**Best for:** Any model when quality matters more than speed - -The informed pipeline runs these analysis modules during abliteration: -1. **AlignmentImprintDetector** — Detects DPO/RLHF/CAI/SFT → sets regularization -2. **ConceptConeAnalyzer** — Polyhedral vs linear refusal → sets n_directions -3. **CrossLayerAlignmentAnalyzer** — Cluster-aware → selects target layers -4. **DefenseRobustnessEvaluator** — Self-repair risk → sets refinement passes -5. **Ouroboros loop** — Re-probes after excision, re-excises if refusal persists +### advanced (DEFAULT — RECOMMENDED) +- **Directions:** 4 (multi-direction SVD) +- **Speed:** Medium (~10-20 min for 8B model) +- **Risk:** Low-Medium +- **Refinement passes:** 2 +- **Use case:** Default for most models. Well-tested and reliable. +- **How it works:** Extracts multiple refusal directions via SVD, applies norm-preserving bi-projection to maintain weight matrix norms. Two refinement passes catch residual refusal. 
### aggressive -**Technique:** Whitened SVD + jailbreak-contrastive activations + attention head surgery -**Speed:** Slow (~30-60 min for 8B) -**Quality:** High but higher risk of coherence damage -**Best for:** Models that resist gentler methods -**Key feature:** Whitened SVD separates refusal signal from natural activation variance - -### surgical -**Technique:** SAE features + neuron masking + head surgery + per-expert directions -**Speed:** Very slow (~1-2 hrs for 8B, needs SAE) -**Quality:** Highest precision -**Best for:** Reasoning models (R1 distills) where you must preserve CoT -**Key feature:** CoT-Aware — explicitly protects reasoning-critical directions - -### nuclear -**Technique:** Everything combined — expert transplant + steering + per-expert directions -**Speed:** Very slow -**Quality:** Most thorough removal, highest risk of side effects -**Best for:** Stubborn MoE models (DeepSeek, Mixtral, DBRX) that resist other methods -**Key feature:** Expert-granular abliteration decomposes signals per MoE expert - -### optimized -**Technique:** Bayesian hyperparameter search via Optuna TPE -**Speed:** Very slow (runs many trials) -**Quality:** Finds optimal configuration automatically -**Best for:** Research, when you want the mathematically best parameters -**Requires:** optuna package +- **Directions:** 8+ (whitened SVD + jailbreak-contrastive) +- **Speed:** Medium-Slow +- **Risk:** Medium-High (may damage coherence) +- **Use case:** When `advanced` leaves > 10% refusals. Stubborn models. +- **How it works:** Uses whitened SVD for covariance-normalized extraction, adds jailbreak-contrastive directions, performs attention head surgery on the most refusal-active heads. 
### spectral_cascade -**Technique:** DCT frequency-domain decomposition of refusal signal -**Speed:** Medium-slow -**Quality:** Novel approach, less battle-tested -**Best for:** Research, exploring alternative decomposition strategies +- **Speed:** Medium +- **Risk:** Medium +- **Use case:** Research, novel approaches +- **How it works:** DCT (Discrete Cosine Transform) frequency-domain decomposition of refusal signals. Separates high-frequency (surface-level) from low-frequency (deep) refusal patterns. + +### informed (EXPERIMENTAL) +- **Speed:** Slow (~20-40 min for 8B model) +- **Risk:** Variable — results depend on analysis quality +- **Use case:** When you want auto-configuration, but be aware this is experimental and may not outperform `advanced`. +- **How it works:** Runs 4 analysis modules first (alignment imprint, concept geometry, logit lens, ouroboros detection), then auto-configures extraction strategy. Includes an "Ouroboros loop" that detects and counteracts self-repair. +- **Note:** The auto-detection can sometimes misconfigure. If results are poor, fall back to `advanced`. + +### surgical +- **Speed:** Very slow (~1-2 hrs for 8B model) +- **Risk:** Low (very precise) +- **Use case:** Reasoning models (R1 distills, QwQ, etc.) where chain-of-thought must be preserved. +- **How it works:** Uses SAE (Sparse Autoencoder) features + individual neuron masking + attention head surgery + per-expert decomposition (for MoE). CoT-aware — identifies and protects reasoning-critical directions before projecting. + +### optimized +- **Speed:** Very slow (hours — runs many trials) +- **Risk:** Low (finds optimal parameters) +- **Use case:** When quality matters more than speed. Production models. +- **How it works:** Bayesian hyperparameter search via Optuna TPE sampler. Optimizes n_directions, regularization, refinement passes, and layer selection jointly. Evaluates each configuration on refusal rate + perplexity. 
### inverted -**Technique:** Reflects (inverts) the refusal direction instead of removing it -**Speed:** Fast (same as basic) -**Quality:** Aggressive — model becomes actively willing, not just neutral -**Best for:** When you want the model to be maximally helpful -**Warning:** Can make the model too eager; may reduce safety-adjacent reasoning +- **Speed:** Fast +- **Risk:** High (model behavior changes dramatically) +- **Use case:** Research, studying refusal mechanisms +- **How it works:** Instead of projecting out the refusal direction, reflects it. The model actively complies rather than passively not-refusing. Useful for understanding the geometry of alignment. -### failspy / gabliteration / heretic / rdo (PYTHON API ONLY) -**Technique:** Faithful reproductions of prior community/academic work -**Speed:** Varies -**Quality:** Known baselines -**Best for:** Reproducing published results, comparing methods -**⚠️ NOT available via CLI** — these methods are only accessible via the Python API. -Do not use `--method failspy` etc. in CLI commands; argparse will reject them. +### nuclear +- **Speed:** Slow +- **Risk:** Medium-High +- **Use case:** Stubborn MoE models (DeepSeek-MoE, Mixtral, etc.) +- **How it works:** Combines expert-granular abliteration (EGA), steering vector injection, attention head pruning, and multi-pass refinement. Decomposes refusal signals into per-expert components for MoE architectures. + +--- ## Method Selection Flowchart ``` Is this a quick test? -├─ YES → basic -└─ NO → Is the model MoE (DeepSeek, Mixtral)? - ├─ YES → nuclear - └─ NO → Is it a reasoning model (R1 distill)? - ├─ YES → surgical - └─ NO → Do you care about speed? - ├─ YES → advanced - └─ NO → informed + → YES: basic + → NO: continue + +Is it an MoE model (Mixtral, DeepSeek-MoE)? + → YES: nuclear + → NO: continue + +Is it a reasoning model (R1, QwQ, CoT-focused)? + → YES: surgical + → NO: continue + +Do you need the absolute best quality and have time? 
+ → YES: optimized + → NO: advanced (recommended default) + +Did advanced leave > 10% refusals? + → YES: aggressive + → Still refusing: nuclear ``` +--- + ## Key Parameters -| Parameter | Range | Default | Effect | -|:--------------------|:---------|:--------|:--------------------------------------------| -| n_directions | 1-32 | auto | More = more thorough but riskier | -| regularization | 0.0-1.0 | 0.0 | Higher preserves more original behavior | -| refinement_passes | 1-5 | 1 | More catches self-repair (Ouroboros effect) | -| quantization | 4/8 bit | none | Saves VRAM, slight quality tradeoff | +| Parameter | Range | Default | Effect | +|:----------|:------|:--------|:-------| +| `--n-directions` | 1-32 | method-dependent | More directions = more complete removal, but higher damage risk | +| `--regularization` | 0.0-1.0 | 0.1 | Higher = more conservative (less removal, less damage) | +| `--refinement-passes` | 1-5 | 2 | More passes catch residual refusal, but diminishing returns | +| `--quantization` | 4bit, 8bit | none | Reduces VRAM usage; quality impact minimal for extraction | +| `--verify-sample-size` | 10-200 | 20 | More samples = more accurate refusal rate estimate | + +--- ## Troubleshooting -| Problem | Solution | -|:---------------------------|:--------------------------------------------------| -| Refusal rate still > 10% | Try aggressive/nuclear, add refinement passes | -| Perplexity up > 20% | Reduce n_directions, increase regularization | -| Model generates nonsense | Regularization too low, try 0.2-0.3 | -| OOM on GPU | Use 4-bit quantization, or try smaller model | -| MoE model barely changes | Use nuclear method (expert-granular) | -| CoT reasoning broken | Use surgical method (CoT-aware) | +| Problem | Likely Cause | Fix | +|:--------|:-------------|:----| +| Refusal rate > 20% | Too few directions | Increase `--n-directions`, try `aggressive` | +| Refusal rate 5-20% | Residual refusal | Add `--refinement-passes 3`, try `--direction-method svd` 
| +| Perplexity spike > 20% | Over-aggressive removal | Reduce `--n-directions`, increase `--regularization` | +| Repetitive output | Weight matrix damage | Use `basic` with fewer directions, check norm preservation | +| MoE model still refuses | Non-expert-aware method | Switch to `nuclear` | +| Reasoning degraded | CoT directions damaged | Use `surgical` method | +| OOM during extraction | Insufficient VRAM | Add `--quantization 4bit` and/or `--large-model` | From d6c710706f1b8f5779e54727b419a1ecc0fb93b2 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Mon, 9 Mar 2026 02:52:54 -0700 Subject: [PATCH 4/4] docs: add real-world testing findings to OBLITERATUS skill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added pitfalls discovered during live abliteration testing: - Models < 1B have fragmented refusal, respond poorly (0.5B: 60%→20%) - Models 3B+ work much better (3B: 75%→0% with advanced defaults) - aggressive method can backfire on small models (made it worse) - Spectral certification RED is common even when refusal rate is 0% - Fixed torch property: total_mem → total_memory --- skills/mlops/obliteratus/SKILL.md | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/skills/mlops/obliteratus/SKILL.md b/skills/mlops/obliteratus/SKILL.md index a532ab77b..598b99795 100644 --- a/skills/mlops/obliteratus/SKILL.md +++ b/skills/mlops/obliteratus/SKILL.md @@ -311,14 +311,17 @@ Enable with `--contribute` flag. No personal data is collected — only model na ## Common Pitfalls 1. **Don't use `informed` as default** — it's experimental and slower. Use `advanced` for reliable results. -2. **Always check perplexity** — if it spikes > 15%, the model is damaged. Reduce aggressiveness. -3. **MoE models need special handling** — use `nuclear` method for Mixtral, DeepSeek-MoE, etc. -4. **Quantized models can't be re-quantized** — abliterate the full-precision model, then quantize the output. -5. 
**VRAM estimation is approximate** — 4-bit quant helps but peak usage can spike during extraction. -6. **Reasoning models are sensitive** — use `surgical` for R1 distills to preserve chain-of-thought. -7. **Check `obliteratus recommend`** — telemetry data may have better parameters than defaults. -8. **AGPL license** — never `import obliteratus` in MIT/Apache projects. CLI invocation only. -9. **Large models (70B+)** — always use `--large-model` flag for conservative defaults. +2. **Models under ~1B respond poorly to abliteration** — their refusal behaviors are shallow and fragmented, making clean direction extraction difficult. Expect partial results (20-40% remaining refusal). Models 3B+ have cleaner refusal directions and respond much better (often 0% refusal with `advanced`). +3. **`aggressive` can make things worse** — on small models it can damage coherence and actually increase refusal rate. Only use it if `advanced` leaves > 10% refusals on a 3B+ model. +4. **Always check perplexity** — if it spikes > 15%, the model is damaged. Reduce aggressiveness. +5. **MoE models need special handling** — use `nuclear` method for Mixtral, DeepSeek-MoE, etc. +6. **Quantized models can't be re-quantized** — abliterate the full-precision model, then quantize the output. +7. **VRAM estimation is approximate** — 4-bit quant helps but peak usage can spike during extraction. +8. **Reasoning models are sensitive** — use `surgical` for R1 distills to preserve chain-of-thought. +9. **Check `obliteratus recommend`** — telemetry data may have better parameters than defaults. +10. **AGPL license** — never `import obliteratus` in MIT/Apache projects. CLI invocation only. +11. **Large models (70B+)** — always use `--large-model` flag for conservative defaults. +12. **Spectral certification RED is common** — the spectral check often flags "incomplete" even when practical refusal rate is 0%. Check actual refusal rate rather than relying on spectral certification alone. 
## Complementary Skills