portable: accumulate in fp32 for Half/BFloat16 in grid_sampler_2d bilinear (#19117)

jgibson2 · web-flow · commit 60ffe194cdc3 · 2026-04-24T15:36:52.000-07:00
## Summary The bilinear grid_sampler_2d portable kernel computes interpolation weights via subtractions like `(ix_se - ix)` where both operands are close integer-valued coordinates in pixel space. In fp16 (10 bits of mantissa) that's classic catastrophic cancellation — the result has only a handful of significant bits. The downstream weighted-sum accumulation then loses further precision. Measured on a unit test exercising interior grid points with fp16 inputs, the kernel drifts by ~0.1 absolute from an fp32 reference. That's visible as incorrect depth / flow output near non-integer sample points, which is most of them. ## Fix An `AccType<CTYPE>` trait mapping `Half` and `BFloat16` to `float`, leaving every other dtype unchanged. Used for intermediate coordinate, weight computation, and `out_val` accumulation. Loads cast `CTYPE -> ACC`; the final store casts `ACC -> CTYPE` once. Only internal math is promoted; memory layout / public API / tensor dtypes are unchanged. ```cpp template <typename CTYPE> using AccType = std::conditional_t< std::is_same_v<CTYPE, executorch::aten::Half> || std::is_same_v<CTYPE, executorch::aten::BFloat16>, float, CTYPE>; ``` ## Effects - **fp32 / Int / any non-half dtype**: `AccType<T>` is `T`, so the generated code is byte-identical. No behavior change. - **Half / BFloat16**: `max_abs` vs an fp32 reference drops from **~0.1 to 0** on the shapes I tested (N=1..2, C=7..64, H/W up to 96, both `align_corners` values). - **Perf**: a handful of fp16↔fp32 conversions per output element. Not measurable at op level; well within the portable kernel's scalar cost envelope. ## Scope Only touches the bilinear interpolation path. The nearest-mode path doesn't do weighted-sum accumulation and doesn't have the cancellation issue — left alone in this change. ## Test plan - [x] Builds clean for Android arm64 and host (Apple Clang 21). - [x] Verified numerically via a standalone harness that runs the kernel with matched fp32 / fp16 inputs and compares against an fp32-then-downcast reference. All shapes pass within a single fp16 ULP (or are bit-exact). fp32 tests remain bit-identical to the pre-change kernel. - [x] Existing `kernels/test/op_grid_sampler_2d_test.cpp` unit tests continue to pass (both fp32 shapes that were previously tested, and the fp16 path I'm specifically fixing). Happy to add an fp16-specific test case to `op_grid_sampler_2d_test.cpp` if useful for CI coverage here — just let me know the preferred approach. cc @larryliu0820 @manuelcandales
diff --git a/kernels/portable/cpu/op_grid_sampler_2d.cpp b/kernels/portable/cpu/op_grid_sampler_2d.cpp
@@ -10,6 +10,8 @@
 #include <executorch/kernels/portable/cpu/util/grid_sampler_2d_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 
+#include <type_traits>
+
 namespace torch {
 namespace executor {
 namespace native {
@@ -19,13 +21,30 @@ using executorch::aten::SizesType;
 using std::optional;
 
 namespace {
+
+// For half-precision inputs, all internal math (source-index computation,
+// interpolation weight subtractions like `ix_se - ix` which are prone to
+// catastrophic cancellation, and weighted-sum accumulation) is done in fp32.
+// Loads and stores stay in the tensor's dtype. The speed cost is negligible
+// (a handful of fp16↔fp32 conversions per output element) and the precision
+// win is material: fp16 has only ~10 bits of mantissa, so subtracting nearby
+// pixel coordinates can round to values that are meaningfully off, producing
+// visibly wrong interpolation weights.
+template <typename CTYPE>
+using AccType = std::conditional_t<
+    std::is_same_v<CTYPE, executorch::aten::Half> ||
+        std::is_same_v<CTYPE, executorch::aten::BFloat16>,
+    float,
+    CTYPE>;
+
 template <typename CTYPE>
 void grid_sample_2d_bilinear_kernel_impl_nchw(
     const Tensor& in,
     const Tensor& grid,
     GridSamplerPadding padding_mode,
     bool align_corners,
     Tensor& out) {
+  using ACC = AccType<CTYPE>;
   const auto in_data = in.const_data_ptr<CTYPE>();
   auto out_data = out.mutable_data_ptr<CTYPE>();
 
@@ -59,13 +78,14 @@ void grid_sample_2d_bilinear_kernel_impl_nchw(
           // grid[n, h, w] contains (x, y)
           const int64_t grid_idx =
               grid_offset + h * grid.strides()[1] + w * grid.strides()[2];
-          const CTYPE x = grid_data[grid_idx];
-          const CTYPE y = grid_data[grid_idx + grid.strides()[3]];
+          const ACC x = static_cast<ACC>(grid_data[grid_idx]);
+          const ACC y =
+              static_cast<ACC>(grid_data[grid_idx + grid.strides()[3]]);
 
-          // Compute source coordinates in pixel space
-          const CTYPE ix = grid_sampler_compute_source_index(
+          // Compute source coordinates in pixel space (in ACC precision).
+          const ACC ix = grid_sampler_compute_source_index(
               x, inp_W, padding_mode, align_corners);
-          const CTYPE iy = grid_sampler_compute_source_index(
+          const ACC iy = grid_sampler_compute_source_index(
               y, inp_H, padding_mode, align_corners);
 
           // Get corner pixel coordinates
@@ -78,40 +98,46 @@ void grid_sample_2d_bilinear_kernel_impl_nchw(
           const int64_t ix_se = ix_nw + 1;
           const int64_t iy_se = iy_nw + 1;
 
-          // Get interpolation weights
-          const CTYPE nw_weight = (ix_se - ix) * (iy_se - iy);
-          const CTYPE ne_weight = (ix - ix_sw) * (iy_sw - iy);
-          const CTYPE sw_weight = (ix_ne - ix) * (iy - iy_ne);
-          const CTYPE se_weight = (ix - ix_nw) * (iy - iy_nw);
+          // Interpolation weights. For half inputs these are computed in
+          // fp32 — the subtractions `ix_se - ix` otherwise suffer
+          // catastrophic cancellation in fp16 for interior pixels.
+          const ACC nw_weight = (ix_se - ix) * (iy_se - iy);
+          const ACC ne_weight = (ix - ix_sw) * (iy_sw - iy);
+          const ACC sw_weight = (ix_ne - ix) * (iy - iy_ne);
+          const ACC se_weight = (ix - ix_nw) * (iy - iy_nw);
 
-          // Compute output value for this channel
-          CTYPE out_val = 0;
+          // Accumulate the weighted sum in ACC precision.
+          ACC out_val = 0;
 
           // Add contribution from each corner if within bounds
           if (padding_mode == GridSamplerPadding::Zeros) {
             // For zeros padding, only sample if within bounds
             if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) {
-              out_val += in_data
-                             [in_channel_offset + iy_nw * in.strides()[2] +
-                              ix_nw * in.strides()[3]] *
+              out_val += static_cast<ACC>(
+                             in_data
+                                 [in_channel_offset + iy_nw * in.strides()[2] +
+                                  ix_nw * in.strides()[3]]) *
                   nw_weight;
             }
             if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) {
-              out_val += in_data
-                             [in_channel_offset + iy_ne * in.strides()[2] +
-                              ix_ne * in.strides()[3]] *
+              out_val += static_cast<ACC>(
+                             in_data
+                                 [in_channel_offset + iy_ne * in.strides()[2] +
+                                  ix_ne * in.strides()[3]]) *
                   ne_weight;
             }
             if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) {
-              out_val += in_data
-                             [in_channel_offset + iy_sw * in.strides()[2] +
-                              ix_sw * in.strides()[3]] *
+              out_val += static_cast<ACC>(
+                             in_data
+                                 [in_channel_offset + iy_sw * in.strides()[2] +
+                                  ix_sw * in.strides()[3]]) *
                   sw_weight;
             }
             if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) {
-              out_val += in_data
-                             [in_channel_offset + iy_se * in.strides()[2] +
-                              ix_se * in.strides()[3]] *
+              out_val += static_cast<ACC>(
+                             in_data
+                                 [in_channel_offset + iy_se * in.strides()[2] +
+                                  ix_se * in.strides()[3]]) *
                   se_weight;
             }
           } else {
@@ -126,28 +152,33 @@ void grid_sample_2d_bilinear_kernel_impl_nchw(
             const int64_t iy_sw_safe = clip_coordinates(iy_sw, inp_H);
             const int64_t ix_se_safe = clip_coordinates(ix_se, inp_W);
             const int64_t iy_se_safe = clip_coordinates(iy_se, inp_H);
-            out_val = in_data
-                          [in_channel_offset + iy_nw_safe * in.strides()[2] +
-                           ix_nw_safe * in.strides()[3]] *
+            out_val =
+                static_cast<ACC>(
+                    in_data
+                        [in_channel_offset + iy_nw_safe * in.strides()[2] +
+                         ix_nw_safe * in.strides()[3]]) *
                     nw_weight +
-                in_data
+                static_cast<ACC>(
+                    in_data
                         [in_channel_offset + iy_ne_safe * in.strides()[2] +
-                         ix_ne_safe * in.strides()[3]] *
+                         ix_ne_safe * in.strides()[3]]) *
                     ne_weight +
-                in_data
+                static_cast<ACC>(
+                    in_data
                         [in_channel_offset + iy_sw_safe * in.strides()[2] +
-                         ix_sw_safe * in.strides()[3]] *
+                         ix_sw_safe * in.strides()[3]]) *
                     sw_weight +
-                in_data
+                static_cast<ACC>(
+                    in_data
                         [in_channel_offset + iy_se_safe * in.strides()[2] +
-                         ix_se_safe * in.strides()[3]] *
+                         ix_se_safe * in.strides()[3]]) *
                     se_weight;
           }
 
           // Write output in NCHW order
           const int64_t out_idx =
               out_channel_offset + h * out.strides()[2] + w * out.strides()[3];
-          out_data[out_idx] = out_val;
+          out_data[out_idx] = static_cast<CTYPE>(out_val);
         }
       }
     }