Skip to content

Commit 1ee1bb2

Browse files
authored
Merge pull request #302 from chopratejas/rust-stage-3d-compression-metrics
chore(proxy): per-strategy compression observability
2 parents a1f1683 + 2a6ab38 commit 1ee1bb2

6 files changed

Lines changed: 563 additions & 19 deletions

File tree

headroom/proxy/prometheus_metrics.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,17 @@ def __init__(
8181
self.tokens_output_total = 0
8282
self.tokens_saved_total = 0
8383

84+
# Per-strategy compression counters. Populated lazily as we see
85+
# each strategy tag — no hardcoded list of strategies; the keys
86+
# come from ContentRouter's `CompressionStrategy.value` and
87+
# SmartCrusher's literal `"smart_crusher"`. The forcing
88+
# function for catching strategy-level silent regressions:
89+
# if SmartCrusher events drop to zero in production, the
90+
# `headroom_compressions_total{strategy="smart_crusher"}`
91+
# counter shows it on day 1, not week 3.
92+
self.compressions_by_strategy: dict[str, int] = defaultdict(int)
93+
self.tokens_saved_by_strategy: dict[str, int] = defaultdict(int)
94+
8495
self.latency_sum_ms = 0.0
8596
self.latency_min_ms = float("inf")
8697
self.latency_max_ms = 0.0
@@ -242,6 +253,36 @@ def record_stack(self, stack: str | None) -> None:
242253
return
243254
self.requests_by_stack[slug] += 1
244255

256+
def record_compression(
    self,
    strategy: str,
    original_tokens: int,
    compressed_tokens: int,
) -> None:
    """Implements `headroom.transforms.observability.CompressionObserver`.

    Invoked once per real compression event by the configured
    transforms (ContentRouter at routing-decision granularity;
    SmartCrusher at message granularity on the legacy direct-pipeline
    path). Bumps the lazily-populated per-strategy counters that are
    exported as labelled Prometheus metrics, so a silent regression in
    any single strategy becomes visible in the scrape.

    Synchronous and lock-free: the `defaultdict(int)` writes are atomic
    under the GIL for these key types, and contention is one dict write
    per routing decision.

    Savings are clamped at zero — the observer never records "negative
    savings" even if a compressor emits more tokens than it received.
    """
    self.compressions_by_strategy[strategy] += 1
    saved = max(original_tokens - compressed_tokens, 0)
    if saved:
        self.tokens_saved_by_strategy[strategy] += saved
285+
245286
async def record_request(
246287
self,
247288
provider: str,
@@ -523,6 +564,15 @@ async def export(self) -> str:
523564
help_text="Tokens saved by optimization",
524565
value=self.tokens_saved_total,
525566
)
567+
# NOTE: per-strategy compression breakdown is tracked
568+
# internally on `self.compressions_by_strategy` and
569+
# `self.tokens_saved_by_strategy` (populated by
570+
# `record_compression`) but **deliberately not exported
571+
# here**. The proxy's metric→Supabase pipeline treats
572+
# each metric name as a column, and we cannot add new
573+
# columns. The state is still observable for tests +
574+
# programmatic introspection; if/when a non-column-
575+
# adding export path exists, surface it there.
526576
_append_metric(
527577
lines,
528578
name="headroom_latency_ms_sum",

headroom/proxy/server.py

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,22 @@ def __init__(self, config: ProxyConfig):
239239
self.anthropic_provider = self.provider_runtime.pipeline_provider("anthropic")
240240
self.openai_provider = self.provider_runtime.pipeline_provider("openai")
241241

242+
# `metrics` is hoisted ahead of transform construction so the
243+
# transforms can receive `self.metrics` as their compression
244+
# observer at __init__ time. The forcing function for catching
245+
# silent strategy regressions: per-strategy counters increment
246+
# only when wired up here, so the wiring is mandatory, not
247+
# something we patch in later. (See `RUST_DEV.md` audit notes.)
248+
self.cost_tracker = (
249+
CostTracker(
250+
budget_limit_usd=config.budget_limit_usd,
251+
budget_period=config.budget_period,
252+
)
253+
if config.cost_tracking_enabled
254+
else None
255+
)
256+
self.metrics = PrometheusMetrics(cost_tracker=self.cost_tracker)
257+
242258
# Initialize transforms based on routing mode
243259
# Choose context manager: IntelligentContextManager (smart) or RollingWindow (legacy)
244260
context_manager: Transform # Can be either IntelligentContextManager or RollingWindow
@@ -280,7 +296,7 @@ def __init__(self, config: ProxyConfig):
280296
router_config.protect_recent_reads_fraction = 0.3
281297
transforms = [
282298
CacheAligner(CacheAlignerConfig(enabled=False)),
283-
ContentRouter(router_config),
299+
ContentRouter(router_config, observer=self.metrics),
284300
context_manager,
285301
]
286302
self._code_aware_status = "lazy" if config.code_aware_enabled else "disabled"
@@ -298,6 +314,7 @@ def __init__(self, config: ProxyConfig):
298314
enabled=config.ccr_inject_tool,
299315
inject_retrieval_marker=config.ccr_inject_tool, # Add CCR markers
300316
),
317+
observer=self.metrics,
301318
),
302319
context_manager,
303320
]
@@ -332,16 +349,9 @@ def __init__(self, config: ProxyConfig):
332349
else None
333350
)
334351

335-
self.cost_tracker = (
336-
CostTracker(
337-
budget_limit_usd=config.budget_limit_usd,
338-
budget_period=config.budget_period,
339-
)
340-
if config.cost_tracking_enabled
341-
else None
342-
)
343-
344-
self.metrics = PrometheusMetrics(cost_tracker=self.cost_tracker)
352+
# `cost_tracker` and `metrics` were hoisted to before transforms so
353+
# ContentRouter / SmartCrusher could take `self.metrics` as their
354+
# compression observer at __init__ time.
345355

346356
# Prefix cache tracking: freeze already-cached messages to avoid
347357
# invalidating the provider's prefix cache with our transforms

headroom/transforms/content_router.py

Lines changed: 47 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -643,13 +643,26 @@ class ContentRouter(Transform):
643643

644644
name: str = "content_router"
645645

646-
def __init__(self, config: ContentRouterConfig | None = None):
646+
def __init__(
647+
self,
648+
config: ContentRouterConfig | None = None,
649+
observer: Any = None,
650+
):
647651
"""Initialize content router.
648652
649653
Args:
650654
config: Router configuration. Uses defaults if None.
655+
observer: Optional `CompressionObserver` (see
656+
`headroom.transforms.observability`) called once per
657+
routing decision after `compress()` finishes. The
658+
proxy's `PrometheusMetrics` is the production
659+
implementation — it increments per-strategy counters
660+
so silent regressions become visible. `None` disables
661+
observation; pick one explicitly per the no-fallback
662+
rule in the audit doc.
651663
"""
652664
self.config = config or ContentRouterConfig()
665+
self._observer = observer
653666

654667
# Lazy-loaded compressors
655668
self._code_compressor: Any = None
@@ -766,20 +779,46 @@ def compress(
766779
RouterCompressionResult with compressed content and routing metadata.
767780
"""
768781
if not content or not content.strip():
769-
return RouterCompressionResult(
782+
result = RouterCompressionResult(
770783
compressed=content,
771784
original=content,
772785
strategy_used=CompressionStrategy.PASSTHROUGH,
773786
routing_log=[],
774787
)
788+
else:
789+
# Determine strategy from content analysis
790+
strategy = self._determine_strategy(content)
775791

776-
# Determine strategy from content analysis
777-
strategy = self._determine_strategy(content)
792+
if strategy == CompressionStrategy.MIXED:
793+
result = self._compress_mixed(content, context, question, bias=bias)
794+
else:
795+
result = self._compress_pure(content, strategy, context, question, bias=bias)
778796

779-
if strategy == CompressionStrategy.MIXED:
780-
return self._compress_mixed(content, context, question, bias=bias)
781-
else:
782-
return self._compress_pure(content, strategy, context, question, bias=bias)
797+
# One observer call per routing decision; the observer is the
798+
# forcing function for catching strategy-level regressions.
799+
# Empty routing_log (passthrough fast path) → no calls.
800+
self._observe(result)
801+
return result
802+
803+
def _observe(self, result: RouterCompressionResult) -> None:
804+
"""Forward each `RoutingDecision` in `result.routing_log` to the
805+
configured `CompressionObserver`. No-op when no observer is set.
806+
807+
Observers MUST NOT raise per the protocol contract; if one does
808+
anyway, swallow at debug level. Compression already succeeded;
809+
a buggy observer must not turn a 200 into a 500.
810+
"""
811+
if self._observer is None:
812+
return
813+
for d in result.routing_log:
814+
try:
815+
self._observer.record_compression(
816+
strategy=d.strategy.value,
817+
original_tokens=d.original_tokens,
818+
compressed_tokens=d.compressed_tokens,
819+
)
820+
except Exception as e: # pragma: no cover - defensive
821+
logger.debug("CompressionObserver raised (non-fatal): %s", e)
783822

784823
def _determine_strategy(self, content: str) -> CompressionStrategy:
785824
"""Determine the compression strategy from content analysis.
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
"""Observability protocol for compression events.
2+
3+
A single `CompressionObserver` interface that any transform can call
4+
after a real compression event. Concrete observers — Prometheus, OTel,
5+
structured logs — implement this; transforms only see the protocol.
6+
7+
The motivating regression: `ContentRouter._record_to_toin` skipped
8+
SmartCrusher on the assumption SmartCrusher recorded its own TOIN
9+
events (it did when SmartCrusher was Python; it stopped when the Rust
10+
port took over). The disconnect was invisible for three weeks because
11+
no metric distinguished compression events by strategy. This module
12+
exists so the next regression of that shape alerts on day 1: if
13+
SmartCrusher events drop to zero in production, the Prometheus
14+
counter shows it immediately.
15+
16+
Design choices, called out for posterity:
17+
18+
- **No fallback observer.** Callers pass `None` or pass a real
19+
observer. There is no "default no-op" instance — that would let a
20+
caller silently disable observability by forgetting to pass one,
21+
and we just spent a PR fixing exactly that class of bug. Be
22+
explicit.
23+
- **No observer registry.** A single observer per transform instance.
24+
If you need multi-fanout, compose at the call site (one wrapper
25+
observer that forwards to N children) — but the trivial pattern
26+
doesn't need a registry baked in.
27+
- **No batching.** Each compression event is one call. Volume is
28+
bounded by the number of routing decisions per request — small.
29+
Batching would only matter if observers had to round-trip to a
30+
remote system; production observers (Prometheus) are in-process
31+
counter increments, which are cheaper than the protocol dispatch.
32+
- **Strategy as a string.** The router and crusher both already
33+
serialize their strategy as the enum's `.value` tag. Passing the
34+
string keeps observers from importing `CompressionStrategy` and
35+
lets non-router callers (e.g. SmartCrusher in legacy mode) emit
36+
the same shape without round-tripping through the enum.
37+
"""
38+
39+
from __future__ import annotations
40+
41+
from typing import Protocol, runtime_checkable
42+
43+
44+
@runtime_checkable
class CompressionObserver(Protocol):
    """Receives one notification per real compression event.

    Implementations must be cheap — this sits on the proxy hot path,
    one call per routing decision per request. A Prometheus-counter
    increment is the right order of magnitude.

    Args:
        strategy: Lowercase tag naming the compression strategy that
            ran. Matches `CompressionStrategy.<NAME>.value` for
            ContentRouter; SmartCrusher's legacy direct-call path
            passes the literal `"smart_crusher"`.
        original_tokens: Token count of the input the strategy
            received.
        compressed_tokens: Token count of the output the strategy
            produced. Equal to `original_tokens` for passthrough;
            less when compression saved tokens.

    Implementations MUST NOT raise. Any fail-over (misconfigured
    Prometheus client, offline OTel exporter) is handled internally —
    an exception bubbling out of an observer would break the
    compression that just succeeded, which is the opposite of what
    observability should do. (See the audit in `RUST_DEV.md`: any
    silent regression is bad, but a noisy observer that breaks
    compression is worse.)
    """

    def record_compression(
        self,
        strategy: str,
        original_tokens: int,
        compressed_tokens: int,
    ) -> None: ...

headroom/transforms/smart_crusher.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,7 @@ def __init__(
152152
scorer: Any = None,
153153
ccr_config: CCRConfig | None = None,
154154
with_compaction: bool = True,
155+
observer: Any = None,
155156
):
156157
# Hard import — no Python fallback. If the wheel is missing the
157158
# caller must build it (scripts/build_rust_extension.sh) or
@@ -167,6 +168,12 @@ def __init__(
167168
cfg = config or SmartCrusherConfig()
168169
self.config = cfg
169170
self._with_compaction = with_compaction
171+
# `observer`: see `headroom.transforms.observability`. The
172+
# legacy proxy pipeline uses SmartCrusher.apply() directly
173+
# (no ContentRouter); without an observer here, those
174+
# compressions would be invisible to per-strategy metrics —
175+
# exactly the silent-regression class we're guarding against.
176+
self._observer = observer
170177

171178
# CCR config is preserved on `self` for callers that read it
172179
# back (`headroom.proxy.server` does). Storage-side semantics
@@ -473,6 +480,24 @@ def _extract_context_from_messages(self, messages: list[dict[str, Any]]) -> str:
473480

474481
return " ".join(context_parts)
475482

483+
def _notify_observer(self, original_tokens: int, compressed_tokens: int) -> None:
484+
"""Forward a compression event to the configured
485+
`CompressionObserver` (see `headroom.transforms.observability`).
486+
No-op when no observer is set; swallows observer exceptions at
487+
debug level so a buggy metrics impl doesn't break the
488+
compression that just succeeded.
489+
"""
490+
if self._observer is None:
491+
return
492+
try:
493+
self._observer.record_compression(
494+
strategy="smart_crusher",
495+
original_tokens=original_tokens,
496+
compressed_tokens=compressed_tokens,
497+
)
498+
except Exception as e: # pragma: no cover - defensive
499+
logger.debug("CompressionObserver raised (non-fatal): %s", e)
500+
476501
def apply(
477502
self,
478503
messages: list[dict[str, Any]],
@@ -516,6 +541,7 @@ def apply(
516541
markers_inserted.append(marker)
517542
if info:
518543
transforms_applied.append(f"smart:{info}")
544+
self._notify_observer(tokens, tokenizer.count_text(crushed))
519545

520546
# Anthropic-style: content is a list of blocks; each tool_result
521547
# block has a string content field of its own.
@@ -541,6 +567,7 @@ def apply(
541567
markers_inserted.append(marker)
542568
if info:
543569
transforms_applied.append(f"smart:{info}")
570+
self._notify_observer(tokens, tokenizer.count_text(crushed))
544571

545572
if crushed_count > 0:
546573
transforms_applied.insert(0, f"smart_crush:{crushed_count}")

0 commit comments

Comments
 (0)