Skip to content

[FEA]: Consider TopK optimization for small K #8274

@gevtushenko

Description

@gevtushenko

Is this a duplicate?

Area

CUB

Is your feature request related to a problem? Please describe.

The current TopK implementation is generic and doesn't make assumptions about K. At GTC, there was interest in a speed-of-light (SOL) TopK implementation for very small K (under 20). After #7495 is closed, we'll be able to provide a different implementation for this case.

Describe the solution you'd like

In the GTC24 "Mastering CUDA C++: Modern Best Practices with the CUDA C++ Core Libraries" talk, we presented an atomic-based TopK implementation that looked something like this:

// Order-preserving bit-twiddle for KeyT: maps keys to/from an unsigned
// representation in which unsigned integer comparison matches key order
// (NOTE(review): assumes the `false` template argument selects ascending
// twiddling — confirm against cub::RadixSortTwiddle).
template <typename T>
using twiddle_t = cub::RadixSortTwiddle<false, T>;

// Unsigned integer type of the same width as T; the bit-ordered form that
// the atomic max operations below run on natively.
template <typename T>
using bit_ordered_t = typename cub::Traits<T>::UnsignedBits;

// Competes `val` against each slot of top[0..n) with an atomic max at the
// given thread scope. When `val` wins a slot, the displaced (smaller) old
// value keeps competing in the remaining slots. Returns the value that falls
// off the end, i.e. the smallest of {val, previous slot contents}.
template <typename U, cuda::thread_scope Scope>
__host__ __device__ U top_k(U val, U *top, unsigned n) {
  unsigned slot = 0;
  while (slot < n) {
    cuda::atomic_ref<U, Scope> slot_ref(top[slot]);
    const U displaced = slot_ref.fetch_max(val, cuda::memory_order_relaxed);
    // Carry the displaced value down only if it was actually smaller.
    val = (displaced < val) ? displaced : val;
    ++slot;
  }
  return val;
}

constexpr unsigned custom_block_size = 1024;

// Kernel operates on bit-ordered unsigned integers for native atomicMax support.
// Input is read as KeyT, twiddled to unsigned, atomics run on unsigned, output twiddled back.
//
// Launch contract:
//   - blockDim.x must equal custom_block_size (all loops stride by it).
//   - Dynamic shared memory: n * sizeof(bit_ordered_t<KeyT>) bytes.
//   - global_top must already hold n sentinel values in bit-ordered form
//     (see fill_sentinel_kernel); on exit it holds the n largest keys,
//     still bit-ordered (see untwiddle_kernel).
template <typename KeyT>
__global__ __launch_bounds__(custom_block_size)
void custom_topk_kernel(const KeyT* in, KeyT* global_top, unsigned elements, unsigned n) {
  using Twiddle = twiddle_t<KeyT>;
  using U       = bit_ordered_t<KeyT>;

  // Per-block candidate list of the n largest values seen by this block.
  extern __shared__ __align__(sizeof(U)) unsigned char shared_mem[];
  U* shared_top = reinterpret_cast<U*>(shared_mem);

  // Seed every slot with the sentinel so any real value wins its first
  // fetch_max (NOTE(review): assumes DefaultKey() is the minimum bit-ordered
  // key — confirm against cub::RadixSortTwiddle).
  const U sentinel = Twiddle::DefaultKey();
  for (unsigned i = threadIdx.x; i < n; i += custom_block_size) {
    shared_top[i] = sentinel;
  }
  __syncthreads();

  // Grid-stride scan of the input. `min` caches the last value this thread
  // saw fall out of the candidate list; anything not exceeding it cannot
  // enter the list, so the atomic chain is skipped for such values.
  U min = sentinel;
  for (unsigned i = threadIdx.x + blockIdx.x * custom_block_size; i < elements; i += custom_block_size * gridDim.x) {
    const U val = Twiddle::In(reinterpret_cast<const U*>(in)[i]);
    if (val > min) {
      min = top_k<U, cuda::thread_scope_block>(val, shared_top, n);
    }
  }
  __syncthreads();

  // Merge this block's candidates into the global result with device-scope
  // atomics. The displacement chain in top_k keeps shared_top ordered
  // largest-first, so element i only needs to compete in global slots
  // i..n-1 — hence the shrinking length `n - i`.
  U* global_top_u = reinterpret_cast<U*>(global_top);
  for (unsigned i = threadIdx.x; i < n; i += custom_block_size) {
    top_k<U, cuda::thread_scope_device>(shared_top[i], global_top_u + i, n - i);
  }
}

// Seeds the first n slots of `out` with the bit-ordered sentinel key so that
// any real value can later displace them via atomic max.
// Launch: 1-D grid covering at least n threads; extra threads exit early.
template <typename KeyT>
__global__ void fill_sentinel_kernel(KeyT* out, unsigned n) {
  using Twiddle = twiddle_t<KeyT>;
  using U       = bit_ordered_t<KeyT>;
  const unsigned idx = threadIdx.x + blockDim.x * blockIdx.x;
  if (idx >= n) {
    return;
  }
  reinterpret_cast<U*>(out)[idx] = Twiddle::DefaultKey();
}

// Converts each of the first n elements of `data` in place from the
// bit-ordered unsigned representation back to the original KeyT bit pattern.
// Launch: 1-D grid covering at least n threads; extra threads exit early.
template <typename KeyT>
__global__ void untwiddle_kernel(KeyT* data, unsigned n) {
  using Twiddle = twiddle_t<KeyT>;
  using U       = bit_ordered_t<KeyT>;
  const unsigned idx = threadIdx.x + blockDim.x * blockIdx.x;
  if (idx >= n) {
    return;
  }
  U* slot = reinterpret_cast<U*>(data) + idx;
  *slot = Twiddle::Out(*slot);
}

// Computes the n largest keys of in[0..elements) into out[0..n), enqueuing
// three stream-ordered launches on `stream`: (1) seed `out` with sentinels,
// (2) the main top-k kernel with a grid sized from the occupancy of the
// current device, (3) convert `out` from bit-ordered form back to KeyT.
//
// Preconditions: in/out are device pointers on the current device; out has
// room for n keys. No-op when n == 0.
template <typename KeyT>
void custom_topk_launch(const KeyT* in, KeyT* out, unsigned elements, unsigned n, cudaStream_t stream) {
  if (n == 0) {
    return;  // a zero-block grid is an invalid launch configuration
  }

  const unsigned fill_threads = 256;
  const unsigned fill_blocks  = (n + fill_threads - 1) / fill_threads;  // ceil-div
  fill_sentinel_kernel<KeyT><<<fill_blocks, fill_threads, 0, stream>>>(out, n);

  // Dynamic shared memory needed by custom_topk_kernel (hoisted: used twice).
  const size_t smem_bytes = static_cast<size_t>(n) * sizeof(bit_ordered_t<KeyT>);

  // Query the *current* device rather than hard-coding device 0, which is
  // wrong in multi-GPU processes where another device is active.
  int device = 0;
  cudaGetDevice(&device);

  int sm_count = 0;
  cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device);

  int max_sm_occupancy = 0;
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(
      &max_sm_occupancy, custom_topk_kernel<KeyT>, custom_block_size, smem_bytes);

  // Oversubscribe 2x to hide tail effects; never launch zero blocks (the
  // occupancy query can return 0, e.g. if smem_bytes exceeds the limit).
  int max_blocks = max_sm_occupancy * sm_count * 2;
  if (max_blocks < 1) {
    max_blocks = 1;
  }

  custom_topk_kernel<KeyT><<<max_blocks, custom_block_size, smem_bytes, stream>>>(in, out, elements, n);

  // Convert output from bit-ordered back to KeyT.
  untwiddle_kernel<KeyT><<<fill_blocks, fill_threads, 0, stream>>>(out, n);
}

Comparing this implementation with

auto requirements = cuda::execution::require(
  cuda::execution::determinism::not_guaranteed,
  cuda::execution::output_ordering::unsorted);
auto env = cuda::std::execution::env{cuda::stream_ref{launch.get_stream()}, requirements};
cub::DeviceTopK::MaxKeys(d_temp, temp_size, d_keys_in, d_keys_out, num_items, k, env);

leads to:

[0] NVIDIA RTX PRO 6000 Blackwell Workstation Edition

KeyT K Ref Time Ref Noise Cmp Time Cmp Noise Diff %Diff Status
F32 2^1 1.372 ms 0.28% 763.775 us 0.30% -608.279 us -44.33% FAST
F32 2^2 1.369 ms 0.25% 765.919 us 0.25% -602.707 us -44.04% FAST
F32 2^3 1.369 ms 0.18% 778.858 us 0.29% -590.291 us -43.11% FAST
F32 2^4 1.369 ms 0.25% 801.651 us 0.25% -567.243 us -41.44% FAST
F32 2^5 1.369 ms 0.20% 867.482 us 0.21% -501.602 us -36.64% FAST
F32 2^6 1.369 ms 0.23% 1.043 ms 0.20% -325.652 us -23.79% FAST
F64 2^1 3.996 ms 0.08% 1.374 ms 0.20% -2621.534 us -65.61% FAST
F64 2^2 3.995 ms 0.09% 1.384 ms 0.17% -2611.404 us -65.36% FAST
F64 2^3 3.996 ms 0.08% 1.394 ms 0.19% -2602.054 us -65.12% FAST
F64 2^4 3.998 ms 0.09% 1.440 ms 0.15% -2558.523 us -63.99% FAST
F64 2^5 3.999 ms 0.08% 1.630 ms 0.17% -2368.790 us -59.24% FAST
F64 2^6 3.999 ms 0.08% 2.231 ms 0.19% -1767.678 us -44.21% FAST

We should consider:

  • whether this implementation is sound, and
  • whether the speedup is preserved on datacenter GPUs

Describe alternatives you've considered

No response

Additional context

No response

Metadata

Metadata

Assignees

Labels

No labels
No labels

Type

No type

Projects

Status

Todo

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions