gpurhh
GPU Robin Hood Hashing — header-only CUDA library
Loading...
Searching...
No Matches
hash_table.cuh
Go to the documentation of this file.
1#pragma once
2
3// gpurhh — GPU Robin Hood Hashing.
4//
5// Header-only CUDA library implementing a parallel hash table for NVIDIA GPUs,
6// using Robin Hood open-addressing with bucketed, sub-warp-cooperative probing.
7//
8// See the docs/ directory for the design rationale behind this layout.
9//
10// The hash table itself lives in device memory. Host code constructs and
11// destructs a HashTable, which allocates and zero-initializes the slot array
12// on the device. All operations on the contents — insert, get — are device
13// code, exposed through the View handle which is meant to be passed by value
14// into user kernels. Data movement between host and device (e.g. building the
15// input arrays, reading back results) is the caller's responsibility.
16
17#include <array>
18#include <bit>
19#include <cstddef>
20#include <cstdint>
21#include <type_traits>
22
23#include <cooperative_groups.h>
24#include <cuda/atomic>
25#include <cuda/std/bit>
26#include <cuda/std/optional>
27#include <cuda_runtime.h>
28
29// IF_GPURHH_BENCHMARK_COUNTERS(stmt): expands to `stmt` when the
30// GPURHH_BENCHMARK_COUNTERS macro is defined, and to nothing otherwise.
31// Used inline at every site where the probe counter would otherwise sit
32// behind an `#ifdef`. Keeps the conditional logic to one line each, and
33// keeps the rest of the header free of scattered preprocessor blocks.
34// `#undef`'d at the bottom of this header so it doesn't leak into TUs.
35#ifdef GPURHH_BENCHMARK_COUNTERS
36#define IF_GPURHH_BENCHMARK_COUNTERS(...) __VA_ARGS__
37#else
38#define IF_GPURHH_BENCHMARK_COUNTERS(...)
39#endif
40
41namespace gpurhh {
42
43// -----------------------------------------------------------------------------
44// Defaults: empty-key sentinel and hash function.
45// -----------------------------------------------------------------------------
46
47// Reserved value signaling an empty slot. Users may not insert this key.
48//
49// Default is the all-bits-set representation of `Key` — every byte is 0xFF,
50// so `value` is the maximum representable value (e.g. 0xFFFFFFFFu for
51// uint32_t). The all-ones bit pattern lets `HashTable` initialize its
52// slot array with a single cudaMemset(0xFF, ...) at construction.
53//
54// Compiles for: unsigned integral types only. For signed integers,
55// floating-point types, scoped enums, or user-defined structs, supply an
56// explicit `EmptyKey` template argument to `HashTable` (and likely your
57// own `Hash` functor too). We deliberately don't provide a default for
58// signed types because no single repeated byte pattern gives a "good"
59// signed sentinel: 0xFF gives -1 (a common legitimate key); 0x80 gives
60// type-dependent values that are only INT_MIN for int8_t; INT_MIN itself
61// isn't a repeated-byte pattern. Leaving the choice to the user avoids
62// committing to any of those compromises.
63//
64// The trait exposes both `key` (the empty-key value itself) and
65// `memset_byte` (the byte that, repeated, produces that value).
66// `HashTable`'s constructor uses `memset_byte` to init the slot array
67// with one cudaMemset call.
68template <class Key>
70 static_assert(std::is_unsigned_v<Key>,
71 "default_empty_key only provides a default for unsigned "
72 "integral types; supply an explicit EmptyKey for signed "
73 "or other key types");
74
75 static constexpr unsigned char memset_byte = 0xFFu;
76 static constexpr Key key = static_cast<Key>(~Key{});
77};
78
79// Compile-time verification for the supported integer types.
80static_assert(default_empty_key<std::uint8_t >::key == 0xFFu);
81static_assert(default_empty_key<std::uint16_t>::key == 0xFFFFu);
82static_assert(default_empty_key<std::uint32_t>::key == 0xFFFFFFFFu);
83static_assert(default_empty_key<std::uint64_t>::key == 0xFFFFFFFFFFFFFFFFull);
84
85// Default hash: MurmurHash3 32-bit finalizer (fmix32) for 4-byte keys; an
86// 8-byte specialization is reserved for when 128-bit slot support is added.
87// Both are cheap on the GPU, branch-free, and have strong avalanche. Users
88// can supply their own via the Hash template parameter.
89
90namespace detail {
91
92__host__ __device__ inline std::uint32_t fmix32(std::uint32_t h) noexcept {
93 h ^= h >> 16;
94 h *= 0x85ebca6bu;
95 h ^= h >> 13;
96 h *= 0xc2b2ae35u;
97 h ^= h >> 16;
98 return h;
99}
100
101__host__ __device__ inline std::uint64_t fmix64(std::uint64_t k) noexcept {
102 k ^= k >> 33;
103 k *= 0xff51afd7ed558ccdULL;
104 k ^= k >> 33;
105 k *= 0xc4ceb9fe1a85ec53ULL;
106 k ^= k >> 33;
107 return k;
108}
109
110// Round `n` up to the next power of two. Returns 1 for n == 0 or n == 1.
111constexpr inline std::size_t next_pow2(std::size_t n) noexcept {
112 if (n <= 1) return 1;
113 return std::size_t{1} << std::bit_width(n - 1);
114}
115
116} // namespace detail
117
118// Primary template: dispatch on sizeof(Key). Defined only for 4- and 8-byte
119// keys, matching the supported slot widths.
120template <class Key, std::size_t = sizeof(Key)>
122
123template <class Key>
124struct default_hash<Key, 4> {
125 __device__ inline std::uint32_t operator()(Key key) const noexcept {
126 return detail::fmix32(cuda::std::bit_cast<std::uint32_t>(key));
127 }
128};
129
130template <class Key>
131struct default_hash<Key, 8> {
132 __device__ inline std::uint64_t operator()(Key key) const noexcept {
133 return detail::fmix64(cuda::std::bit_cast<std::uint64_t>(key));
134 }
135};
136
137// -----------------------------------------------------------------------------
138// Reduction operators applied by View::insert when an existing key is hit.
139// -----------------------------------------------------------------------------
140//
141// Called as `reduce(existing_value, incoming_value)`; the returned value is
142// stored in the slot via the same atomic CAS that the insertion would do.
143// Stateless and trivially copyable so they sit inside a View by value.
144
146 template <class T>
147 __device__ T operator()(T /*existing*/, T incoming) const noexcept {
148 return incoming;
149 }
150};
151
152struct sum_op {
153 template <class T>
154 __device__ T operator()(T existing, T incoming) const noexcept {
155 return existing + incoming;
156 }
157};
158
159// Named constant for the CUDA runtime's default stream, used as the
160// default argument for host-side methods that issue async work. Stream
161// 0 is the legacy/default stream, equivalent to nullptr for the
162// `cudaStream_t` pointer type; the named alias makes call sites
163// self-documenting at the declaration of e.g. `clear`.
164inline constexpr cudaStream_t default_stream = 0;
165
166// -----------------------------------------------------------------------------
167// HashTable: host-side owner of the device-resident slot array.
168// -----------------------------------------------------------------------------
169
170template <
171 class Key,
172 class Value,
173 class Hash = default_hash<Key>,
174 Key EmptyKey = default_empty_key<Key>::key,
175 class Reduction = replace_op,
176 int CacheLineBytes = 128,
177 int WarpSize = 32,
178 int MaxProbeBuckets = 8
179>
181public:
182 using key_type = Key;
183 using value_type = Value;
184 using hasher = Hash;
185
186 // Packed (key, value) slot. One Slot is the unit of an atomic CAS.
187 // The alignas forces alignof(Slot) == sizeof(Slot), which is required
188 // by cuda::atomic_ref<Slot> (it needs the storage aligned to the
189 // atomic op width). For the supported Key/Value pairs (same-size
190 // integral types) sizeof(Key) + sizeof(Value) == sizeof(Slot) with no
191 // internal padding.
192 struct alignas(sizeof(Key) + sizeof(Value)) Slot {
193 Key key;
194 Value value;
195 };
196
197 // ---- Architecture-derived constants ----
198
199 static constexpr Key empty_key = EmptyKey;
200 static constexpr int slot_bytes = sizeof(Slot);
201 static constexpr int cache_line_bytes = CacheLineBytes;
202 static constexpr int warp_size = WarpSize;
203 static constexpr int bucket_size = CacheLineBytes / slot_bytes;
204 static constexpr int tile_size = bucket_size;
205 static constexpr int tiles_per_warp = WarpSize / bucket_size;
206
207 // Slot / cache-line invariants.
208 static_assert(slot_bytes == 8 || slot_bytes == 16,
209 "slot must be 8 or 16 bytes (a 64-bit or 128-bit packed "
210 "key+value pair). The 16-byte case uses 128-bit CAS via "
211 "cuda::atomic_ref — single-instruction on sm_90+, "
212 "emulated by libcu++ on earlier architectures.");
213 static_assert(slot_bytes <= CacheLineBytes,
214 "slot does not fit in a cache line");
215 static_assert(CacheLineBytes % slot_bytes == 0,
216 "cache line must be a whole multiple of slot size");
217
218 // Tile / bucket invariants. tile_size == bucket_size by construction; the
219 // cooperative_groups::thread_block_tile requirements drive these.
220 static_assert(tile_size > 0,
221 "tile size must be positive");
222 static_assert((tile_size & (tile_size - 1)) == 0,
223 "tile size must be a power of two "
224 "(required by cooperative_groups::tiled_partition)");
225 static_assert(tile_size <= WarpSize,
226 "tile size must not exceed warp size");
227 static_assert(WarpSize % bucket_size == 0,
228 "warp size must be a whole multiple of bucket size");
229
230 // A bucket is a contiguous run of `bucket_size` slots aligned to one
231 // cache line. The alignas guarantees that loading a bucket via a
232 // cooperative tile produces a single coalesced cache-line transaction
233 // (the bucket can never straddle two cache lines).
234 struct alignas(CacheLineBytes) Bucket {
235 Slot slots[bucket_size];
236 };
237
238 static_assert(sizeof(Bucket) == CacheLineBytes,
239 "bucket must occupy exactly one cache line");
240 static_assert(alignof(Bucket) == CacheLineBytes,
241 "bucket alignment must equal cache line size");
242
243 // Probe-length cap. An insert that would place a key further than this
244 // many buckets from its home bucket fails (the caller is expected to
245 // rehash into a larger table). Get takes advantage of the same cap: if
246 // it probes this many buckets without finding the key, the key cannot
247 // be in the table, because insert would have failed before placing it
248 // beyond the cap.
249 //
250 // The default of 8 comfortably handles load factors up to ~0.9 with
251 // bucket_size 16, where the expected longest probe is well under the
252 // cap. The cap exists to bound the worst case for adversarial inputs
253 // and very high load factors. Users targeting load factors above ~0.9
254 // can override via the `MaxProbeBuckets` template parameter; lowering
255 // it tightens the time bound at the cost of higher insert-failure
256 // probability.
257 static constexpr int max_probe_buckets = MaxProbeBuckets;
258 static_assert(max_probe_buckets > 0,
259 "max_probe_buckets must be positive");
260
261 // Byte value used to initialize the slot array via cudaMemset. Derived
262 // from `empty_key`: every byte of `empty_key` must equal this byte for
263 // the cudaMemset-based init path to produce a correctly-empty slot array.
264 static constexpr unsigned char empty_key_byte =
265 std::bit_cast<std::array<unsigned char, sizeof(Key)>>(empty_key)[0];
266
267 // EmptyKey memsettability invariant.
268 static_assert([]() {
269 const auto bytes = std::bit_cast<std::array<unsigned char, sizeof(Key)>>(empty_key);
270 for (std::size_t i = 1; i < sizeof(Key); ++i) {
271 if (bytes[i] != bytes[0]) return false;
272 }
273 return true;
274 }(),
275 "EmptyKey must be a single byte pattern repeated across sizeof(Key) "
276 "bytes so the slot array can be initialized via cudaMemset. Use the "
277 "default for unsigned keys, or provide an EmptyKey of the form 0xBB..BB.");
278
279 // -------------------------------------------------------------------------
280 // View: lightweight device-side handle. Trivially copyable; pass by value
281 // into user kernels. Holds only a device pointer and a couple of integers.
282 // -------------------------------------------------------------------------
283 class View {
284 public:
285 // Cooperative single-key insert.
286 // Called by a tile of exactly tile_size threads in lock-step. Returns
287 // nullopt on success (the in-flight pair was placed in an empty slot
288 // or merged into an existing slot via the Reduction operator).
289 //
290 // On probe-cap failure returns the leftover Slot that could not be
291 // placed. Note this is not necessarily the (key, value) originally
292 // passed in: if the insertion triggered Robin Hood displacements
293 // before failing, the returned Slot is the most-recently-evicted
294 // victim — the originally-passed-in pair is already in the table,
295 // and it is the victim that is now unreachable. Callers that need
296 // to avoid data loss should buffer the returned Slot and retry it
297 // elsewhere (typically by rebuilding a larger table).
298 //
299 // When GPURHH_BENCHMARK_COUNTERS is defined, takes an additional
300 // trailing reference parameter `tile_counter`. Lane 0 of the
301 // tile increments it on every cache-line-sized memory event:
302 // - once per probe-loop iteration (the cooperative 128-byte
303 // bucket load at the top of each iteration, including
304 // CAS-retry re-reads of the same bucket), AND
305 // - once per attempted CAS, whether or not it succeeds. Each
306 // CAS attempt requires the cache line in an exclusive state
307 // — that's an additional memory transaction beyond the
308 // bucket load, otherwise invisible to a probes-only counter.
309 // The caller is expected to pass a per-tile register accumulator
310 // and flush it to a per-tile global slot at end of kernel — no
311 // atomic is involved in the increment. The metric is a lower-
312 // bound estimate of cache-line transfers attributable to the
313 // insert; downstream bandwidth = counter × sizeof(Bucket) / time.
314 template <class Tile>
315 __device__ cuda::std::optional<Slot> insert(
316 const Tile& tile, Key key, Value value
317 IF_GPURHH_BENCHMARK_COUNTERS(, std::uint32_t& tile_counter)
318 );
319
320 // Cooperative single-key lookup. Returns the stored value if the
321 // key is present, or an empty optional otherwise. Every lane in the
322 // tile receives the same result.
323 //
324 // The `tile_counter` parameter under GPURHH_BENCHMARK_COUNTERS
325 // behaves the same way as in `insert`.
326 template <class Tile>
327 __device__ cuda::std::optional<Value> get(
328 const Tile& tile, Key key
329 IF_GPURHH_BENCHMARK_COUNTERS(, std::uint32_t& tile_counter)
330 ) const;
331
332 __host__ __device__ std::size_t capacity() const noexcept { return capacity_; }
333
334 private:
335 friend class HashTable;
336
337 Bucket* buckets_ = nullptr;
338 std::size_t capacity_ = 0; // power of two, in slots
339 std::size_t capacity_mask_ = 0; // capacity_ - 1, for modulo via AND
340 Hash hash_{};
341 Reduction reduce_{};
342 };
343
344 // -------------------------------------------------------------------------
345 // Host-side resource management.
346 // -------------------------------------------------------------------------
347
348 // Allocates a slot array on the current device sized for at least
349 // `min_capacity` slots, rounded up to the next power of two, and
350 // initializes every slot to (empty_key, _). Synchronous: cudaMalloc
351 // already blocks the host, so there is nothing to be gained from
352 // routing the slot-init memset through a stream.
353 explicit HashTable(std::size_t min_capacity);
354
355 ~HashTable();
356
357 HashTable(const HashTable&) = delete;
358 HashTable& operator=(const HashTable&) = delete;
359 HashTable(HashTable&&) noexcept;
360 HashTable& operator=(HashTable&&) noexcept;
361
362 // Returns a device-side handle for use inside user kernels.
363 View view() const noexcept;
364
365 // Actual capacity (input capacity rounded up to a power of two).
366 std::size_t capacity() const noexcept { return capacity_; }
367
368 // Empty the table — every slot is reset to (empty_key, _). Async on
369 // the given CUDA stream; the caller is responsible for synchronizing
370 // before launching anything that depends on the cleared state.
371 // `default_stream` (= 0) selects the runtime's default stream.
372 void clear(cudaStream_t stream = default_stream);
373
374 // Direct access to the underlying device-resident bucket array, of
375 // length `capacity() / bucket_size`. Provided for tests, diagnostics,
376 // and benchmark instrumentation that need to inspect or seed table
377 // state directly via cudaMemcpy or custom kernels. Bypasses the
378 // table's concurrency contract — there is no synchronization against
379 // concurrent insert / get from other kernels.
380 Bucket* data() noexcept { return buckets_; }
381 const Bucket* data() const noexcept { return buckets_; }
382
383private:
384 Bucket* buckets_ = nullptr;
385 std::size_t capacity_ = 0; // in slots
386};
387
388// -----------------------------------------------------------------------------
389// Definitions.
390// -----------------------------------------------------------------------------
391//
392// Function bodies live in the header because the class is a template and
393// instantiations need full definitions visible at the call site. Host-side
394// methods (constructor, destructor, move ops, view) are implemented; the
395// device-side View::insert and View::get bodies are still TODO.
396
397template <class K, class V, class H, K E, class R, int CL, int WS, int MPB>
399 // Round up to the next power of two, but never below a single bucket —
400 // the bucketed probing logic requires at least bucket_size slots.
401 const std::size_t rounded = detail::next_pow2(min_capacity);
402 capacity_ = rounded < bucket_size ? std::size_t{bucket_size} : rounded;
403
404 const std::size_t num_buckets = capacity_ / bucket_size;
405
406 // Allocate the bucket array on the current device. cudaMalloc leaves
407 // buckets_ as nullptr on failure; callers can check via
408 // cudaPeekAtLastError().
409 cudaMalloc(&buckets_, num_buckets * sizeof(Bucket));
410
411 // Initialize every byte to empty_key_byte. For the supported default
412 // (unsigned EmptyKey = 0xFF..FF) this writes the empty sentinel into
413 // each slot's key field in a single bandwidth-bound DMA.
414 cudaMemset(buckets_, empty_key_byte, num_buckets * sizeof(Bucket));
415}
416
417template <class K, class V, class H, K E, class R, int CL, int WS, int MPB>
419 // cudaFree(nullptr) is a documented no-op, so no need to guard.
420 cudaFree(buckets_);
421}
422
423template <class K, class V, class H, K E, class R, int CL, int WS, int MPB>
425 : buckets_(other.buckets_), capacity_(other.capacity_)
426{
427 other.buckets_ = nullptr;
428 other.capacity_ = 0;
429}
430
431template <class K, class V, class H, K E, class R, int CL, int WS, int MPB>
434 if (this != &other) {
435 // Free the current allocation before taking ownership of the new one.
436 // cudaFree(nullptr) is a no-op, so this is safe even if *this is in
437 // a moved-from / empty state.
438 cudaFree(buckets_);
439 buckets_ = other.buckets_;
440 capacity_ = other.capacity_;
441 other.buckets_ = nullptr;
442 other.capacity_ = 0;
443 }
444 return *this;
445}
446
447template <class K, class V, class H, K E, class R, int CL, int WS, int MPB>
450 View v;
451 v.buckets_ = buckets_;
452 v.capacity_ = capacity_;
453 // For a usable table capacity_ is a power of two and the mask is
454 // capacity_ - 1. For a moved-from / empty table capacity_ is 0; we set
455 // the mask to 0 explicitly to avoid a wrap-around to ~0 which would be
456 // misleading even though the View is unusable in that state anyway.
457 v.capacity_mask_ = capacity_ > 0 ? capacity_ - 1 : 0;
458 // v.hash_ and v.reduce_ are default-constructed by View's in-class
459 // default initializers.
460 return v;
461}
462
463template <class K, class V, class H, K E, class R, int CL, int WS, int MPB>
465 const std::size_t num_buckets = capacity_ / bucket_size;
466 cudaMemsetAsync(buckets_, empty_key_byte,
467 num_buckets * sizeof(Bucket), stream);
468}
469
470template <class K, class V, class H, K E, class R, int CL, int WS, int MPB>
471template <class Tile>
473 const Tile& tile, K key, V value
474 IF_GPURHH_BENCHMARK_COUNTERS(, std::uint32_t& tile_counter)
475)
476 -> cuda::std::optional<Slot>
477{
478 // Bucketed Robin Hood insertion with lock-free CAS. The tile carries an
479 // in-flight pair (cur_key, cur_value) that may change due to Robin Hood
480 // displacement — when we evict a "richer" resident, that resident
481 // becomes our new in-flight pair and we keep probing from the next
482 // bucket.
483
484 K cur_key = key;
485 V cur_value = value;
486
487 const std::size_t num_buckets = capacity_ / bucket_size;
488 const std::size_t bucket_mask = num_buckets > 0 ? num_buckets - 1 : 0;
489 const std::size_t probe_bound = num_buckets < max_probe_buckets
490 ? num_buckets
491 : static_cast<std::size_t>(max_probe_buckets);
492
493 std::size_t cur_home = (hash_(cur_key) & capacity_mask_) / bucket_size;
494 std::size_t probe = 0;
495
496 while (probe < probe_bound) {
497 const std::size_t bucket_idx = (cur_home + probe) & bucket_mask;
498
499 // Count one bucket-load per probe iteration. CAS-retry `continue`s
500 // re-enter the loop and re-read the same bucket — those count too,
501 // because they are real DRAM transactions.
502 IF_GPURHH_BENCHMARK_COUNTERS(if (tile.thread_rank() == 0) ++tile_counter;)
503
504 // Cooperative load: one coalesced cache-line transaction. Each
505 // lane reads one slot of the bucket (lane i reads slots[i]).
506 const Slot slot = buckets_[bucket_idx].slots[tile.thread_rank()];
507
508 // Helper: `target_lane` performs an atomic CAS to replace its slot
509 // (whose current contents we have in `slot`) with the in-flight
510 // pair. On the match case the new value is computed as
511 // `reduce_(existing, incoming)`; on the empty / displaceable cases
512 // the new value is just the incoming value. Returns the broadcast
513 // result of the CAS to every lane.
514 auto try_cas = [&](int target_lane, bool apply_reduction) -> bool {
515 // Count this CAS attempt as one cache-line memory transaction
516 // (the atomic op requires the line in an exclusive state,
517 // beyond the cooperative bucket load above). Bumps the same
518 // tile_counter as the probe-loop top, so the metric stays a
519 // single combined cache-line-transfer count.
520 IF_GPURHH_BENCHMARK_COUNTERS(if (tile.thread_rank() == 0) ++tile_counter;)
521
522 bool cas_ok = false;
523 if (tile.thread_rank() == target_lane) {
524 const V to_store = apply_reduction
525 ? reduce_(slot.value, cur_value)
526 : cur_value;
527 Slot expected = slot;
528 const Slot desired{cur_key, to_store};
529 cuda::atomic_ref<Slot, cuda::thread_scope_device> atomic_slot(
530 buckets_[bucket_idx].slots[target_lane]);
531 cas_ok = atomic_slot.compare_exchange_strong(expected, desired);
532 }
533 return tile.shfl(cas_ok, target_lane);
534 };
535
536 // (1) Key match — combine the in-flight value with the existing one
537 // via the Reduction operator (default: replace).
538 const auto match_mask = tile.ballot(slot.key == cur_key);
539 if (match_mask != 0) {
540 if (try_cas(__ffs(match_mask) - 1, /*apply_reduction=*/true))
541 return cuda::std::nullopt;
542 continue; // CAS lost the race — retry this bucket.
543 }
544
545 // (2) Empty slot — claim it for our in-flight pair.
546 const auto empty_mask = tile.ballot(slot.key == empty_key);
547 if (empty_mask != 0) {
548 if (try_cas(__ffs(empty_mask) - 1, /*apply_reduction=*/false))
549 return cuda::std::nullopt;
550 continue; // Someone else took the slot — retry.
551 }
552
553 // (3) Displaceable slot — Robin Hood swap with a richer resident.
554 // Compute each resident's probe distance. (For empty slots this
555 // gives nonsense, but we already eliminated the empty case above,
556 // so the next ballot is meaningful.)
557 const std::size_t resident_home =
558 (hash_(slot.key) & capacity_mask_) / bucket_size;
559 const std::size_t resident_probe =
560 (bucket_idx - resident_home) & bucket_mask;
561 const auto displaceable_mask = tile.ballot(resident_probe < probe);
562 if (displaceable_mask != 0) {
563 const int target_lane = __ffs(displaceable_mask) - 1;
564 if (try_cas(target_lane, /*apply_reduction=*/false)) {
565 // Adopt the evicted pair as our new in-flight pair. We
566 // continue from the next bucket; the evicted pair's probe
567 // distance there is `resident_probe + 1`.
568 cur_key = tile.shfl(slot.key, target_lane);
569 cur_value = tile.shfl(slot.value, target_lane);
570 cur_home = tile.shfl(resident_home, target_lane);
571 probe = tile.shfl(resident_probe, target_lane) + 1;
572 continue;
573 }
574 continue; // CAS lost the race — retry.
575 }
576
577 // (4) Nothing applies — advance one bucket.
578 ++probe;
579 }
580
581 // Probe budget exhausted. Hand the in-flight pair back so the caller
582 // can buffer it and rehash into a larger table. This is the original
583 // (key, value) if no displacement happened, or the most-recently-
584 // evicted victim otherwise — either way, returning it is what keeps
585 // the operation lossless.
586 return Slot{cur_key, cur_value};
587}
588
589template <class K, class V, class H, K E, class R, int CL, int WS, int MPB>
590template <class Tile>
591__device__ cuda::std::optional<V>
593 const Tile& tile, K key
594 IF_GPURHH_BENCHMARK_COUNTERS(, std::uint32_t& tile_counter)
595) const
596{
597 // Bucket-level Robin Hood lookup. Probe home_bucket, then home_bucket+1,
598 // etc., wrapping modulo num_buckets. Each iteration is one coalesced
599 // cache-line load (one slot per tile lane) and a few tile-wide ballots
600 // to identify the terminating condition.
601
602 const auto h = hash_(key);
603 const std::size_t home_slot = h & capacity_mask_;
604 const std::size_t home_bucket = home_slot / bucket_size;
605 const std::size_t num_buckets = capacity_ / bucket_size;
606 const std::size_t bucket_mask = num_buckets > 0 ? num_buckets - 1 : 0;
607
608 // Probe at most max_probe_buckets buckets, or every bucket in the table
609 // if it's smaller than the cap.
610 const std::size_t probe_bound = num_buckets < max_probe_buckets
611 ? num_buckets
612 : static_cast<std::size_t>(max_probe_buckets);
613
614 for (std::size_t probe = 0; probe < probe_bound; ++probe) {
615 const std::size_t bucket_idx = (home_bucket + probe) & bucket_mask;
616
617 // One bucket-load per probe iteration.
618 IF_GPURHH_BENCHMARK_COUNTERS(if (tile.thread_rank() == 0) ++tile_counter;)
619
620 // Each lane reads one slot — one coalesced cache-line transaction
621 // for the whole tile.
622 const Slot slot = buckets_[bucket_idx].slots[tile.thread_rank()];
623
624 // (1) Key match. The matching lane broadcasts its value to the rest
625 // of the tile via shfl; each lane wraps the broadcast result in
626 // an optional and returns it (every lane returns the same one).
627 const auto match_mask = tile.ballot(slot.key == key);
628 if (match_mask != 0) {
629 const int matching_lane = __ffs(match_mask) - 1;
630 return cuda::std::optional<V>{tile.shfl(slot.value, matching_lane)};
631 }
632
633 // (2) Empty slot. The key isn't in the table — Robin Hood would have
634 // placed it earlier in the probe sequence.
635 if (tile.ballot(slot.key == empty_key) != 0) return cuda::std::nullopt;
636
637 // (3) Robin Hood early termination. If any resident is "richer"
638 // (closer to its home bucket than we are to ours), our key would
639 // have evicted it on insertion, so it can't be later in the
640 // probe sequence.
641 const std::size_t resident_home =
642 (hash_(slot.key) & capacity_mask_) / bucket_size;
643 const std::size_t resident_probe =
644 (bucket_idx - resident_home) & bucket_mask;
645 if (tile.ballot(resident_probe < probe) != 0) return cuda::std::nullopt;
646
647 // No match, no empty slot, no richer resident — every slot here is
648 // occupied by a resident at least as displaced as we would be.
649 // Advance one bucket.
650 }
651
652 // Either hit the probe cap, or wrapped fully around in a small table,
653 // without finding the key, an empty slot, or a richer resident. The
654 // insert invariant guarantees the key cannot be at probe distance
655 // greater than max_probe_buckets, so this is a definite "not present".
656 return cuda::std::nullopt;
657}
658
659} // namespace gpurhh
660
661// Helper macro is purely an in-header convenience — undef so it doesn't
662// leak into translation units that include <gpurhh/hash_table.cuh>.
663#undef IF_GPURHH_BENCHMARK_COUNTERS
Definition hash_table.cuh:283
cuda::std::optional< Slot > insert(const Tile &tile, Key key, Value value)
std::size_t capacity() const noexcept
Definition hash_table.cuh:332
cuda::std::optional< Value > get(const Tile &tile, Key key) const
Definition hash_table.cuh:180
static constexpr int cache_line_bytes
Definition hash_table.cuh:201
static constexpr unsigned char empty_key_byte
Definition hash_table.cuh:264
Bucket * data() noexcept
Definition hash_table.cuh:380
View view() const noexcept
Definition hash_table.cuh:449
static constexpr int max_probe_buckets
Definition hash_table.cuh:257
HashTable(std::size_t min_capacity)
Definition hash_table.cuh:398
~HashTable()
Definition hash_table.cuh:418
Value value_type
Definition hash_table.cuh:183
static constexpr int warp_size
Definition hash_table.cuh:202
Hash hasher
Definition hash_table.cuh:184
static constexpr int tile_size
Definition hash_table.cuh:204
static constexpr int tiles_per_warp
Definition hash_table.cuh:205
Key key_type
Definition hash_table.cuh:182
const Bucket * data() const noexcept
Definition hash_table.cuh:381
static constexpr Key empty_key
Definition hash_table.cuh:199
HashTable(const HashTable &)=delete
static constexpr int slot_bytes
Definition hash_table.cuh:200
static constexpr int bucket_size
Definition hash_table.cuh:203
void clear(cudaStream_t stream=default_stream)
Definition hash_table.cuh:464
HashTable & operator=(const HashTable &)=delete
#define IF_GPURHH_BENCHMARK_COUNTERS(...)
Definition hash_table.cuh:38
constexpr std::size_t next_pow2(std::size_t n) noexcept
Definition hash_table.cuh:111
std::uint64_t fmix64(std::uint64_t k) noexcept
Definition hash_table.cuh:101
std::uint32_t fmix32(std::uint32_t h) noexcept
Definition hash_table.cuh:92
Definition hash_table.cuh:41
constexpr cudaStream_t default_stream
Definition hash_table.cuh:164
Definition hash_table.cuh:234
Definition hash_table.cuh:192
Key key
Definition hash_table.cuh:193
Value value
Definition hash_table.cuh:194
Definition hash_table.cuh:69
static constexpr Key key
Definition hash_table.cuh:76
static constexpr unsigned char memset_byte
Definition hash_table.cuh:75
std::uint32_t operator()(Key key) const noexcept
Definition hash_table.cuh:125
std::uint64_t operator()(Key key) const noexcept
Definition hash_table.cuh:132
Definition hash_table.cuh:121
Definition hash_table.cuh:145
T operator()(T, T incoming) const noexcept
Definition hash_table.cuh:147
Definition hash_table.cuh:152
T operator()(T existing, T incoming) const noexcept
Definition hash_table.cuh:154