1
0

Compare commits

...

1 Commits

Author SHA1 Message Date
kimi
75a6a498b4 fix: use word-boundary regex for sensitive pattern matching to avoid false positives on max_tokens
The _SENSITIVE_PATTERNS list used simple substring matching, so "token"
matched "max_tokens", causing the distillation pipeline to block facts
about max_tokens parameters. Replaced with compiled regexes using
lookaround assertions so compound terms like max_tokens and num_tokens
are no longer falsely flagged.

Fixes #625

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 16:35:37 -04:00
2 changed files with 55 additions and 14 deletions

View File

@@ -39,19 +39,21 @@ _DEFAULT_DB = Path("data/thoughts.db")
# qwen3 and other reasoning models wrap chain-of-thought in <think> tags
# Non-greedy match strips each <think>...</think> span plus any trailing
# whitespace; DOTALL lets the reasoning body span multiple lines.
_THINK_TAG_RE = re.compile(r"<think>.*?</think>\s*", re.DOTALL)
# Sensitive patterns that must never be stored as facts
_SENSITIVE_PATTERNS = [
"token",
"password",
"secret",
"api_key",
"apikey",
"credential",
".config/",
"/token",
"access_token",
"private_key",
"ssh_key",
# Sensitive patterns that must never be stored as facts.
# Uses compiled regexes with word boundaries so that compound technical
# terms like "max_tokens" or "num_tokens" are NOT falsely flagged.
_SENSITIVE_RE = [
re.compile(r"(?<![a-z_])token(?![a-z_])", re.IGNORECASE), # "token" but not "max_tokens"
re.compile(r"password", re.IGNORECASE),
re.compile(r"secret", re.IGNORECASE),
re.compile(r"api_key", re.IGNORECASE),
re.compile(r"apikey", re.IGNORECASE),
re.compile(r"credential", re.IGNORECASE),
re.compile(r"\.config/"),
re.compile(r"/token\b"),
re.compile(r"access_token", re.IGNORECASE),
re.compile(r"private_key", re.IGNORECASE),
re.compile(r"ssh_key", re.IGNORECASE),
]
# Meta-observation phrases to filter out from distilled facts
@@ -548,7 +550,7 @@ class ThinkingEngine:
fact_lower = fact.lower()
# Block sensitive information
if any(pat in fact_lower for pat in _SENSITIVE_PATTERNS):
if any(pat.search(fact) for pat in _SENSITIVE_RE):
logger.warning("Distill: blocked sensitive fact: %s", fact[:60])
continue

View File

@@ -1188,3 +1188,42 @@ def test_references_real_files_blocks_mixed(tmp_path):
# Mix of real and fake files — should fail because of the fake one
text = "Fix src/timmy/thinking.py and also src/timmy/nonexistent_module.py for the memory leak."
assert ThinkingEngine._references_real_files(text) is False
# ---------------------------------------------------------------------------
# Sensitive-pattern regression: max_tokens must NOT be flagged (#625)
# ---------------------------------------------------------------------------
def test_sensitive_patterns_allow_max_tokens():
    """_SENSITIVE_RE should not flag 'max_tokens' as sensitive (#625).

    Regression guard: the old substring match flagged any fact containing
    "token", which blocked harmless facts about max_tokens parameters.
    """
    from timmy.thinking import _SENSITIVE_RE

    safe_facts = [
        "The cascade router passes max_tokens to Ollama provider.",
        "max_tokens=request.max_tokens in the completion call.",
        "num_tokens defaults to 2048.",
        "total_prompt_tokens is tracked in stats.",
        # "token" as a prefix of a longer word must also pass
        "The tokenizer splits text into subwords.",
    ]
    for fact in safe_facts:
        assert not any(pat.search(fact) for pat in _SENSITIVE_RE), (
            f"False positive: {fact!r} was flagged as sensitive"
        )
def test_sensitive_patterns_still_block_real_secrets():
    """_SENSITIVE_RE should still block actual secrets."""
    from timmy.thinking import _SENSITIVE_RE

    dangerous_facts = [
        "The token is abc123def456.",
        "Set password to hunter2.",
        "api_key = sk-live-xyz",
        "Found credential in .env file.",
        "access_token expired yesterday.",
        "private_key stored in vault.",
    ]
    for fact in dangerous_facts:
        # Each dangerous fact must be hit by at least one compiled pattern.
        hits = [pat for pat in _SENSITIVE_RE if pat.search(fact)]
        assert hits, f"Missed secret: {fact!r} was NOT flagged as sensitive"