From 429ae0095b65a37276098ee314fc9f97dc361e1e Mon Sep 17 00:00:00 2001 From: SwiftWing21 Date: Tue, 28 Apr 2026 10:01:11 -0700 Subject: [PATCH] fix(proxy): guard CCR tool injection against frozen prefix to preserve cache The Anthropic handler's CCR injector path applied a frozen_message_count guard to system instruction injection but not to tool injection. When Kompress fired for the first time in a session, the tools array was mutated unconditionally, invalidating Anthropic's prefix cache and dropping cache_read_input_tokens to zero on calls where ~48K tokens were previously being cached. Mirror the existing inject_system_instructions guard for inject_tool: when frozen_message_count > 0, defer tool injection so the warm prefix stays intact. Adds test_ccr_tool_injection_disabled_when_prefix_frozen as a direct companion to the existing test_ccr_system_instruction_injection_disabled_when_prefix_frozen. Fixes #294 Co-Authored-By: Claude Sonnet 4.6 --- headroom/proxy/handlers/anthropic.py | 9 ++- tests/test_proxy_anthropic_cache_stability.py | 64 +++++++++++++++++++ 2 files changed, 72 insertions(+), 1 deletion(-) diff --git a/headroom/proxy/handlers/anthropic.py b/headroom/proxy/handlers/anthropic.py index 1c024f9a..f1531442 100644 --- a/headroom/proxy/handlers/anthropic.py +++ b/headroom/proxy/handlers/anthropic.py @@ -966,10 +966,17 @@ async def _finalize_pre_upstream() -> None: f"(frozen prefix={frozen_message_count}) to preserve cache" ) inject_system_instructions = False + inject_tool = self.config.ccr_inject_tool + if inject_tool and frozen_message_count > 0: + logger.info( + f"[{request_id}] CCR: deferring tool injection " + f"(frozen prefix={frozen_message_count}) to preserve cache" + ) + inject_tool = False # Create fresh injector to avoid state leakage between requests injector = CCRToolInjector( provider="anthropic", - inject_tool=self.config.ccr_inject_tool, + inject_tool=inject_tool, inject_system_instructions=inject_system_instructions, ) 
optimized_messages, tools, was_injected = injector.process_request( diff --git a/tests/test_proxy_anthropic_cache_stability.py b/tests/test_proxy_anthropic_cache_stability.py index 534ba7f0..ce5d6e7e 100644 --- a/tests/test_proxy_anthropic_cache_stability.py +++ b/tests/test_proxy_anthropic_cache_stability.py @@ -494,6 +494,70 @@ async def _fake_retry(method, url, headers, body, stream=False): # noqa: ANN001 assert captured["inject_system"] is False +def test_ccr_tool_injection_disabled_when_prefix_frozen(monkeypatch) -> None: + captured = {"inject_tool": None} + with _make_proxy_client() as client: + proxy = client.app.state.proxy + proxy.config.optimize = False + proxy.config.image_optimize = False + proxy.config.ccr_inject_tool = True + proxy.config.ccr_inject_system_instructions = False + + fake_tracker = _FakePrefixTracker(frozen_count=1) + proxy.session_tracker_store.compute_session_id = lambda request, model, messages: ( + "stable-session" + ) + proxy.session_tracker_store.get_or_create = lambda session_id, provider: fake_tracker + + class _FakeInjector: + def __init__( + self, + provider, # noqa: ANN001 + inject_tool, # noqa: ANN001 + inject_system_instructions, # noqa: ANN001 + ): + captured["inject_tool"] = inject_tool + self.has_compressed_content = False + self.detected_hashes = [] + + def process_request(self, messages, tools): # noqa: ANN001 + return messages, tools, False + + monkeypatch.setattr("headroom.ccr.CCRToolInjector", _FakeInjector) + + async def _fake_retry(method, url, headers, body, stream=False): # noqa: ANN001 + return httpx.Response( + 200, + json={ + "id": "msg_ccr_tool_1", + "type": "message", + "role": "assistant", + "content": [{"type": "text", "text": "ok"}], + "usage": { + "input_tokens": 20, + "output_tokens": 3, + "cache_read_input_tokens": 0, + "cache_creation_input_tokens": 0, + }, + }, + ) + + proxy._retry_request = _fake_retry + + response = client.post( + "/v1/messages", + headers={"x-api-key": "test-key", 
"anthropic-version": "2023-06-01"}, + json={ + "model": "claude-sonnet-4-6", + "max_tokens": 64, + "messages": [{"role": "user", "content": "hello"}], + }, + ) + + assert response.status_code == 200 + assert captured["inject_tool"] is False + + def test_previous_turns_always_frozen_only_final_turn_mutable() -> None: captured = {} with _make_proxy_client() as client: