Is this a duplicate?
Area
CUB
Is your feature request related to a problem? Please describe.
The current TopK implementation is generic and doesn't make assumptions about K. At GTC, there was interest in a speed-of-light (SOL) TopK implementation for very small K (under 20). After #7495 is closed, we'll be able to provide a different implementation for this case.
Describe the solution you'd like
In the GTC24 "Mastering CUDA C++: Modern Best Practices with the CUDA C++ Core Libraries" talk, we presented an atomic-based TopK implementation that looked something like this:
// Bit-twiddling helper (cub::RadixSortTwiddle) that maps keys to/from an
// unsigned bit-ordered representation whose integer ordering matches the
// key ordering — this is what lets the kernels use integer atomic max on
// float/double keys.
template <typename T>
using twiddle_t = cub::RadixSortTwiddle<false, T>;
// Unsigned-integer type used as the bit-ordered representation of T
// (cub::Traits<T>::UnsignedBits).
template <typename T>
using bit_ordered_t = typename cub::Traits<T>::UnsignedBits;
// Folds `val` into the n candidate slots of `top` at thread scope `Scope`:
// each slot is atomically max-combined with the carried value, and the
// displaced (smaller) value is carried on to the next slot. Returns the
// value squeezed out after the last slot.
template <typename U, cuda::thread_scope Scope>
__host__ __device__ U top_k(U val, U *top, unsigned n) {
  for (unsigned slot = 0; slot < n; ++slot) {
    cuda::atomic_ref<U, Scope> slot_ref(top[slot]);
    const U displaced = slot_ref.fetch_max(val, cuda::memory_order_relaxed);
    // Carry whichever value lost the max-exchange into the next slot.
    val = displaced < val ? displaced : val;
  }
  return val;
}
// Threads per block for custom_topk_kernel; must match its __launch_bounds__
// and the block size used at launch (the kernel strides by this constant).
constexpr unsigned custom_block_size = 1024;
// Kernel operates on bit-ordered unsigned integers for native atomicMax support.
// Input is read as KeyT, twiddled to unsigned, atomics run on unsigned, output twiddled back.
// Computes a block-local top-n in shared memory, then merges it into the
// device-wide top-n in `global_top`. Requires blockDim.x == custom_block_size
// and dynamic shared memory of n * sizeof(bit_ordered_t<KeyT>) bytes;
// `global_top` must be pre-filled with sentinel keys (fill_sentinel_kernel)
// and is left in bit-ordered form (untwiddle_kernel converts it back).
template <typename KeyT>
__global__ __launch_bounds__(custom_block_size)
void custom_topk_kernel(const KeyT* in, KeyT* global_top, unsigned elements, unsigned n) {
using Twiddle = twiddle_t<KeyT>;
using U = bit_ordered_t<KeyT>;
// Dynamic shared memory holds the block's n bit-ordered candidates.
extern __shared__ __align__(sizeof(U)) unsigned char shared_mem[];
U* shared_top = reinterpret_cast<U*>(shared_mem);
// Padding key used to initialize all slots; the `val > min` guard below
// presumably relies on it comparing below every real key — TODO confirm.
const U sentinel = Twiddle::DefaultKey();
// Cooperatively initialize the block-local candidate array.
for (unsigned i = threadIdx.x; i < n; i += custom_block_size) {
shared_top[i] = sentinel;
}
__syncthreads();
// Per-thread lower bound on the shared array's minimum; values at or below
// it cannot enter the top-n, so the atomic cascade is skipped for them.
U min = sentinel;
// Grid-stride loop over the input, read as bit-ordered unsigned keys.
for (unsigned i = threadIdx.x + blockIdx.x * custom_block_size; i < elements; i += custom_block_size * gridDim.x) {
const U val = Twiddle::In(reinterpret_cast<const U*>(in)[i]);
if (val > min) {
min = top_k<U, cuda::thread_scope_block>(val, shared_top, n);
}
}
__syncthreads();
// Merge this block's candidates into the global top-n with device-scope
// atomics. NOTE(review): inserting shared_top[i] starting at global slot i
// with length n - i assumes shared_top ended up in descending order, so
// element i can rank no higher than position i globally — confirm this
// invariant holds for the concurrent atomic insertion scheme.
U* global_top_u = reinterpret_cast<U*>(global_top);
for (unsigned i = threadIdx.x; i < n; i += custom_block_size) {
top_k<U, cuda::thread_scope_device>(shared_top[i], global_top_u + i, n - i);
}
}
// Seeds each of the n output slots with the bit-ordered sentinel key
// (Twiddle::DefaultKey()) so the top-k kernel can merge into them.
template <typename KeyT>
__global__ void fill_sentinel_kernel(KeyT* out, unsigned n) {
  using Twiddle = twiddle_t<KeyT>;
  using U = bit_ordered_t<KeyT>;
  const unsigned idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx >= n) {
    return;  // grid tail: nothing to write
  }
  reinterpret_cast<U*>(out)[idx] = Twiddle::DefaultKey();
}
// Twiddle-convert output back from bit-ordered to KeyT
// Converts the n output keys in place from their bit-ordered unsigned
// representation back to the native KeyT encoding.
template <typename KeyT>
__global__ void untwiddle_kernel(KeyT* data, unsigned n) {
  using Twiddle = twiddle_t<KeyT>;
  using U = bit_ordered_t<KeyT>;
  const unsigned idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx >= n) {
    return;  // grid tail: nothing to convert
  }
  U* slot = reinterpret_cast<U*>(data) + idx;
  *slot = Twiddle::Out(*slot);
}
// Host launcher: stream-ordered top-n of `in` (length `elements`) into `out`.
// Fills `out` with sentinels, runs the atomic top-k kernel, then converts the
// result back from bit-ordered form. Does not synchronize the stream.
//
// Fixes over the original: guards n == 0 (a zero-block launch is a CUDA
// error), queries occupancy/SM count for the *current* device instead of
// hard-coded device 0, and checks API return codes so a failed query cannot
// produce a garbage or zero grid size.
template <typename KeyT>
void custom_topk_launch(const KeyT* in, KeyT* out, unsigned elements, unsigned n, cudaStream_t stream) {
  if (n == 0) {
    return;  // nothing to compute; avoids <<<0, ...>>> launches below
  }
  const unsigned fill_threads = 256;
  const unsigned fill_blocks = (n + fill_threads - 1) / fill_threads;
  // Seed the output with sentinel keys so the merge kernel has valid slots.
  fill_sentinel_kernel<KeyT><<<fill_blocks, fill_threads, 0, stream>>>(out, n);
  // Size the grid from the occupancy of the current device (the stream's
  // device under the usual one-device-per-thread usage).
  int device = 0;
  cudaError_t err = cudaGetDevice(&device);
  int sm_count = 1;
  if (err == cudaSuccess) {
    err = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device);
  }
  int max_sm_occupancy = 1;
  if (err == cudaSuccess) {
    err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_sm_occupancy, custom_topk_kernel<KeyT>, custom_block_size,
        n * sizeof(bit_ordered_t<KeyT>));
  }
  // Oversubscribe 2x to hide latency; never launch fewer than one block,
  // even if an occupancy query failed or reported zero.
  int max_blocks = max_sm_occupancy * sm_count * 2;
  if (err != cudaSuccess || max_blocks < 1) {
    max_blocks = 1;
  }
  custom_topk_kernel<KeyT><<<max_blocks, custom_block_size, n * sizeof(bit_ordered_t<KeyT>), stream>>>(in, out, elements, n);
  // Convert output from bit-ordered back to KeyT.
  untwiddle_kernel<KeyT><<<fill_blocks, fill_threads, 0, stream>>>(out, n);
}
Comparing this implementation with
auto requirements = cuda::execution::require(
cuda::execution::determinism::not_guaranteed,
cuda::execution::output_ordering::unsorted);
auto env = cuda::std::execution::env{cuda::stream_ref{launch.get_stream()}, requirements};
cub::DeviceTopK::MaxKeys(d_temp, temp_size, d_keys_in, d_keys_out, num_items, k, env);
leads to:
[0] NVIDIA RTX PRO 6000 Blackwell Workstation Edition
| KeyT | K   | Ref Time | Ref Noise | Cmp Time   | Cmp Noise | Diff         | %Diff   | Status |
|------|-----|----------|-----------|------------|-----------|--------------|---------|--------|
| F32  | 2^1 | 1.372 ms | 0.28%     | 763.775 us | 0.30%     | -608.279 us  | -44.33% | FAST   |
| F32  | 2^2 | 1.369 ms | 0.25%     | 765.919 us | 0.25%     | -602.707 us  | -44.04% | FAST   |
| F32  | 2^3 | 1.369 ms | 0.18%     | 778.858 us | 0.29%     | -590.291 us  | -43.11% | FAST   |
| F32  | 2^4 | 1.369 ms | 0.25%     | 801.651 us | 0.25%     | -567.243 us  | -41.44% | FAST   |
| F32  | 2^5 | 1.369 ms | 0.20%     | 867.482 us | 0.21%     | -501.602 us  | -36.64% | FAST   |
| F32  | 2^6 | 1.369 ms | 0.23%     | 1.043 ms   | 0.20%     | -325.652 us  | -23.79% | FAST   |
| F64  | 2^1 | 3.996 ms | 0.08%     | 1.374 ms   | 0.20%     | -2621.534 us | -65.61% | FAST   |
| F64  | 2^2 | 3.995 ms | 0.09%     | 1.384 ms   | 0.17%     | -2611.404 us | -65.36% | FAST   |
| F64  | 2^3 | 3.996 ms | 0.08%     | 1.394 ms   | 0.19%     | -2602.054 us | -65.12% | FAST   |
| F64  | 2^4 | 3.998 ms | 0.09%     | 1.440 ms   | 0.15%     | -2558.523 us | -63.99% | FAST   |
| F64  | 2^5 | 3.999 ms | 0.08%     | 1.630 ms   | 0.17%     | -2368.790 us | -59.24% | FAST   |
| F64  | 2^6 | 3.999 ms | 0.08%     | 2.231 ms   | 0.19%     | -1767.678 us | -44.21% | FAST   |
We should consider:
- if this implementation is sound, and
- if speedup is preserved on datacenter GPUs
Describe alternatives you've considered
No response
Additional context
No response
Is this a duplicate?
Area
CUB
Is your feature request related to a problem? Please describe.
The current TopK implementation is generic and doesn't make assumptions about K. At GTC, there was interest in a speed-of-light (SOL) TopK implementation for very small K (under 20). After #7495 is closed, we'll be able to provide a different implementation for this case.
Describe the solution you'd like
In the GTC24 "Mastering CUDA C++: Modern Best Practices with the CUDA C++ Core Libraries" talk, we presented an atomic-based TopK implementation that looked something like this:
Comparing this implementation with
leads to:
[0] NVIDIA RTX PRO 6000 Blackwell Workstation Edition
We should consider:
Describe alternatives you've considered
No response
Additional context
No response