23#include <cooperative_groups.h>
25#include <cuda/std/bit>
26#include <cuda/std/optional>
27#include <cuda_runtime.h>
35#ifdef GPURHH_BENCHMARK_COUNTERS
36#define IF_GPURHH_BENCHMARK_COUNTERS(...) __VA_ARGS__
38#define IF_GPURHH_BENCHMARK_COUNTERS(...)
70 static_assert(std::is_unsigned_v<Key>,
71 "default_empty_key only provides a default for unsigned "
72 "integral types; supply an explicit EmptyKey for signed "
73 "or other key types");
76 static constexpr Key
key =
static_cast<Key
>(~Key{});
92__host__ __device__
inline std::uint32_t
fmix32(std::uint32_t h)
noexcept {
101__host__ __device__
inline std::uint64_t
fmix64(std::uint64_t k)
noexcept {
103 k *= 0xff51afd7ed558ccdULL;
105 k *= 0xc4ceb9fe1a85ec53ULL;
111constexpr inline std::size_t
next_pow2(std::size_t n)
noexcept {
112 if (n <= 1)
return 1;
113 return std::size_t{1} << std::bit_width(n - 1);
120template <
class Key, std::
size_t = sizeof(Key)>
125 __device__
inline std::uint32_t
operator()(Key key)
const noexcept {
132 __device__
inline std::uint64_t
operator()(Key key)
const noexcept {
154 __device__ T
operator()(T existing, T incoming)
const noexcept {
155 return existing + incoming;
176 int CacheLineBytes = 128,
178 int MaxProbeBuckets = 8
192 struct alignas(sizeof(Key) + sizeof(Value))
Slot {
209 "slot must be 8 or 16 bytes (a 64-bit or 128-bit packed "
210 "key+value pair). The 16-byte case uses 128-bit CAS via "
211 "cuda::atomic_ref — single-instruction on sm_90+, "
212 "emulated by libcu++ on earlier architectures.");
214 "slot does not fit in a cache line");
215 static_assert(CacheLineBytes %
slot_bytes == 0,
216 "cache line must be a whole multiple of slot size");
221 "tile size must be positive");
223 "tile size must be a power of two "
224 "(required by cooperative_groups::tiled_partition)");
226 "tile size must not exceed warp size");
228 "warp size must be a whole multiple of bucket size");
238 static_assert(
sizeof(
Bucket) == CacheLineBytes,
239 "bucket must occupy exactly one cache line");
240 static_assert(
alignof(
Bucket) == CacheLineBytes,
241 "bucket alignment must equal cache line size");
257 static constexpr int max_probe_buckets = MaxProbeBuckets;
258 static_assert(max_probe_buckets > 0,
259 "max_probe_buckets must be positive");
264 static constexpr unsigned char empty_key_byte =
265 std::bit_cast<std::array<
unsigned char,
sizeof(Key)>>(empty_key)[0];
269 const auto bytes = std::bit_cast<std::array<
unsigned char,
sizeof(Key)>>(empty_key);
270 for (std::size_t i = 1; i <
sizeof(Key); ++i) {
271 if (bytes[i] != bytes[0])
return false;
275 "EmptyKey must be a single byte pattern repeated across sizeof(Key) "
276 "bytes so the slot array can be initialized via cudaMemset. Use the "
277 "default for unsigned keys, or provide an EmptyKey of the form 0xBB..BB.");
314 template <
class Tile>
315 __device__ cuda::std::optional<Slot>
insert(
316 const Tile& tile, Key key, Value value
326 template <
class Tile>
327 __device__ cuda::std::optional<Value>
get(
328 const Tile& tile, Key key
332 __host__ __device__ std::size_t
capacity() const noexcept {
return capacity_; }
337 Bucket* buckets_ =
nullptr;
338 std::size_t capacity_ = 0;
339 std::size_t capacity_mask_ = 0;
353 explicit HashTable(std::size_t min_capacity);
363 View view() const noexcept;
366 std::
size_t capacity() const noexcept {
return capacity_; }
372 void clear(cudaStream_t stream = default_stream);
384 Bucket* buckets_ =
nullptr;
385 std::size_t capacity_ = 0;
397template <
class K,
class V,
class H, K E,
class R,
int CL,
int WS,
int MPB>
404 const std::size_t num_buckets = capacity_ /
bucket_size;
409 cudaMalloc(&buckets_, num_buckets *
sizeof(
Bucket));
417template <
class K,
class V,
class H, K E,
class R,
int CL,
int WS,
int MPB>
423template <
class K,
class V,
class H, K E,
class R,
int CL,
int WS,
int MPB>
425 : buckets_(other.buckets_), capacity_(other.capacity_)
427 other.buckets_ =
nullptr;
431template <
class K,
class V,
class H, K E,
class R,
int CL,
int WS,
int MPB>
434 if (
this != &other) {
439 buckets_ = other.buckets_;
440 capacity_ = other.capacity_;
441 other.buckets_ =
nullptr;
447template <
class K,
class V,
class H, K E,
class R,
int CL,
int WS,
int MPB>
451 v.buckets_ = buckets_;
452 v.capacity_ = capacity_;
457 v.capacity_mask_ = capacity_ > 0 ? capacity_ - 1 : 0;
463template <
class K,
class V,
class H, K E,
class R,
int CL,
int WS,
int MPB>
465 const std::size_t num_buckets = capacity_ /
bucket_size;
467 num_buckets *
sizeof(
Bucket), stream);
470template <
class K,
class V,
class H, K E,
class R,
int CL,
int WS,
int MPB>
473 const Tile& tile, K key, V value
476 -> cuda::std::optional<Slot>
487 const std::size_t num_buckets = capacity_ /
bucket_size;
488 const std::size_t bucket_mask = num_buckets > 0 ? num_buckets - 1 : 0;
493 std::size_t cur_home = (hash_(cur_key) & capacity_mask_) /
bucket_size;
494 std::size_t probe = 0;
496 while (probe < probe_bound) {
497 const std::size_t bucket_idx = (cur_home + probe) & bucket_mask;
506 const Slot slot = buckets_[bucket_idx].slots[tile.thread_rank()];
514 auto try_cas = [&](
int target_lane,
bool apply_reduction) ->
bool {
523 if (tile.thread_rank() == target_lane) {
524 const V to_store = apply_reduction
525 ? reduce_(slot.value, cur_value)
527 Slot expected = slot;
528 const Slot desired{cur_key, to_store};
529 cuda::atomic_ref<Slot, cuda::thread_scope_device> atomic_slot(
530 buckets_[bucket_idx].slots[target_lane]);
531 cas_ok = atomic_slot.compare_exchange_strong(expected, desired);
533 return tile.shfl(cas_ok, target_lane);
538 const auto match_mask = tile.ballot(slot.key == cur_key);
539 if (match_mask != 0) {
540 if (try_cas(__ffs(match_mask) - 1,
true))
541 return cuda::std::nullopt;
546 const auto empty_mask = tile.ballot(slot.key ==
empty_key);
547 if (empty_mask != 0) {
548 if (try_cas(__ffs(empty_mask) - 1,
false))
549 return cuda::std::nullopt;
557 const std::size_t resident_home =
559 const std::size_t resident_probe =
560 (bucket_idx - resident_home) & bucket_mask;
561 const auto displaceable_mask = tile.ballot(resident_probe < probe);
562 if (displaceable_mask != 0) {
563 const int target_lane = __ffs(displaceable_mask) - 1;
564 if (try_cas(target_lane,
false)) {
568 cur_key = tile.shfl(slot.key, target_lane);
569 cur_value = tile.shfl(slot.value, target_lane);
570 cur_home = tile.shfl(resident_home, target_lane);
571 probe = tile.shfl(resident_probe, target_lane) + 1;
586 return Slot{cur_key, cur_value};
589template <
class K,
class V,
class H, K E,
class R,
int CL,
int WS,
int MPB>
591__device__ cuda::std::optional<V>
593 const Tile& tile, K key
602 const auto h = hash_(key);
603 const std::size_t home_slot = h & capacity_mask_;
604 const std::size_t home_bucket = home_slot /
bucket_size;
605 const std::size_t num_buckets = capacity_ /
bucket_size;
606 const std::size_t bucket_mask = num_buckets > 0 ? num_buckets - 1 : 0;
614 for (std::size_t probe = 0; probe < probe_bound; ++probe) {
615 const std::size_t bucket_idx = (home_bucket + probe) & bucket_mask;
622 const Slot slot = buckets_[bucket_idx].slots[tile.thread_rank()];
627 const auto match_mask = tile.ballot(slot.key == key);
628 if (match_mask != 0) {
629 const int matching_lane = __ffs(match_mask) - 1;
630 return cuda::std::optional<V>{tile.shfl(slot.value, matching_lane)};
635 if (tile.ballot(slot.key ==
empty_key) != 0)
return cuda::std::nullopt;
641 const std::size_t resident_home =
643 const std::size_t resident_probe =
644 (bucket_idx - resident_home) & bucket_mask;
645 if (tile.ballot(resident_probe < probe) != 0)
return cuda::std::nullopt;
656 return cuda::std::nullopt;
663#undef IF_GPURHH_BENCHMARK_COUNTERS
Definition hash_table.cuh:283
cuda::std::optional< Slot > insert(const Tile &tile, Key key, Value value)
std::size_t capacity() const noexcept
Definition hash_table.cuh:332
cuda::std::optional< Value > get(const Tile &tile, Key key) const
Definition hash_table.cuh:180
static constexpr int cache_line_bytes
Definition hash_table.cuh:201
static constexpr unsigned char empty_key_byte
Definition hash_table.cuh:264
Bucket * data() noexcept
Definition hash_table.cuh:380
View view() const noexcept
Definition hash_table.cuh:449
static constexpr int max_probe_buckets
Definition hash_table.cuh:257
HashTable(std::size_t min_capacity)
Definition hash_table.cuh:398
~HashTable()
Definition hash_table.cuh:418
Value value_type
Definition hash_table.cuh:183
static constexpr int warp_size
Definition hash_table.cuh:202
Hash hasher
Definition hash_table.cuh:184
static constexpr int tile_size
Definition hash_table.cuh:204
static constexpr int tiles_per_warp
Definition hash_table.cuh:205
Key key_type
Definition hash_table.cuh:182
const Bucket * data() const noexcept
Definition hash_table.cuh:381
static constexpr Key empty_key
Definition hash_table.cuh:199
HashTable(const HashTable &)=delete
static constexpr int slot_bytes
Definition hash_table.cuh:200
static constexpr int bucket_size
Definition hash_table.cuh:203
void clear(cudaStream_t stream=default_stream)
Definition hash_table.cuh:464
HashTable & operator=(const HashTable &)=delete
#define IF_GPURHH_BENCHMARK_COUNTERS(...)
Definition hash_table.cuh:38
constexpr std::size_t next_pow2(std::size_t n) noexcept
Definition hash_table.cuh:111
std::uint64_t fmix64(std::uint64_t k) noexcept
Definition hash_table.cuh:101
std::uint32_t fmix32(std::uint32_t h) noexcept
Definition hash_table.cuh:92
Definition hash_table.cuh:41
constexpr cudaStream_t default_stream
Definition hash_table.cuh:164
Definition hash_table.cuh:234
Definition hash_table.cuh:192
Key key
Definition hash_table.cuh:193
Value value
Definition hash_table.cuh:194
Definition hash_table.cuh:69
static constexpr Key key
Definition hash_table.cuh:76
static constexpr unsigned char memset_byte
Definition hash_table.cuh:75
std::uint32_t operator()(Key key) const noexcept
Definition hash_table.cuh:125
std::uint64_t operator()(Key key) const noexcept
Definition hash_table.cuh:132
Definition hash_table.cuh:121
Definition hash_table.cuh:145
T operator()(T, T incoming) const noexcept
Definition hash_table.cuh:147
Definition hash_table.cuh:152
T operator()(T existing, T incoming) const noexcept
Definition hash_table.cuh:154