OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
 
Loading...
Searching...
No Matches
block_exchange.cuh
Go to the documentation of this file.
1/******************************************************************************
2 * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of the NVIDIA CORPORATION nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *
27 ******************************************************************************/
28
34#pragma once
35
36#include "../util_ptx.cuh"
37#include "../util_arch.cuh"
38#include "../util_macro.cuh"
39#include "../util_type.cuh"
40#include "../util_namespace.cuh"
41
43CUB_NS_PREFIX
44
46namespace cub {
47
// BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ...>: collective
// rearrangement of items partitioned across a CUDA thread block.
// NOTE(review): the 'class BlockExchange' declarator line (Doxygen line 116)
// is missing from this extracted listing; the template header is followed
// directly by the class body's opening brace.
108template <
109 typename InputT,
110 int BLOCK_DIM_X,
111 int ITEMS_PER_THREAD,
112 bool WARP_TIME_SLICING = false,
113 int BLOCK_DIM_Y = 1,
114 int BLOCK_DIM_Z = 1,
115 int PTX_ARCH = CUB_PTX_ARCH>
117{
118private:
119
120 /******************************************************************************
121 * Constants
122 ******************************************************************************/
123
125 enum
126 {
// Total threads in the (possibly 3-D) thread block.
128 BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
129
130 LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH),
131 WARP_THREADS = 1 << LOG_WARP_THREADS,
// Ceiling division: number of (possibly partial) warps in the block.
132 WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
133
134 LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH),
135 SMEM_BANKS = 1 << LOG_SMEM_BANKS,
136
137 TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
138
// When time-slicing, shared storage is sized for one warp and reused
// by each warp in turn; otherwise one slice covers the whole block.
139 TIME_SLICES = (WARP_TIME_SLICING) ? WARPS : 1,
140
141 TIME_SLICED_THREADS = (WARP_TIME_SLICING) ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS,
142 TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ITEMS_PER_THREAD,
143
144 WARP_TIME_SLICED_THREADS = CUB_MIN(BLOCK_THREADS, WARP_THREADS),
145 WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD,
146
147 // Insert padding to avoid bank conflicts during raking when items per thread is a power of two and > 4 (otherwise we can typically use 128b loads)
148 INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (PowerOfTwo<ITEMS_PER_THREAD>::VALUE),
149 PADDING_ITEMS = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0,
150 };
151
152 /******************************************************************************
153 * Type definitions
154 ******************************************************************************/
155
// Shared memory layout: one (optionally padded) buffer of exchanged items.
157 struct __align__(16) _TempStorage
158 {
159 InputT buff[TIME_SLICED_ITEMS + PADDING_ITEMS];
160 };
161
162public:
163
// Public, union-aliasable wrapper for the shared-memory layout.
165 struct TempStorage : Uninitialized<_TempStorage> {};
166
167private:
168
169
170 /******************************************************************************
171 * Thread fields
172 ******************************************************************************/
173
// Reference to the block's shared-memory exchange buffer.
175 _TempStorage &temp_storage;
176
// Per-thread identity: flattened thread id, lane within warp, warp id,
// and this warp's base offset into the (non-sliced) shared buffer.
178 unsigned int linear_tid;
179 unsigned int lane_id;
180 unsigned int warp_id;
181 unsigned int warp_offset;
182
183
184 /******************************************************************************
185 * Utility methods
186 ******************************************************************************/
187
// Fallback allocator: a function-scope __shared__ instance used when the
// caller did not supply TempStorage.
189 __device__ __forceinline__ _TempStorage& PrivateStorage()
190 {
191 __shared__ _TempStorage private_storage;
192 return private_storage;
193 }
194
195
// Blocked -> striped transpose, non-time-sliced specialization:
// each thread writes its blocked run, the block synchronizes, then each
// thread gathers one item per strip of BLOCK_THREADS.
199 template <typename OutputT>
200 __device__ __forceinline__ void BlockedToStriped(
201 InputT input_items[ITEMS_PER_THREAD],
202 OutputT output_items[ITEMS_PER_THREAD],
203 Int2Type<false> /*time_slicing*/)
204 {
205 #pragma unroll
206 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
207 {
208 int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
// Optional padding skew avoids shared-memory bank conflicts.
209 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
210 temp_storage.buff[item_offset] = input_items[ITEM];
211 }
212
// Barrier between the write and read phases of the shared buffer.
213 CTA_SYNC();
214
215 #pragma unroll
216 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
217 {
218 int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
219 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
220 output_items[ITEM] = temp_storage.buff[item_offset];
221 }
222 }
223
224
// Blocked -> striped transpose, warp-time-sliced specialization: the
// shared buffer holds only one warp's worth of items, so warps take turns
// (one SLICE per warp) writing, while all threads read back whatever part
// of their striped result overlaps the current slice.
228 template <typename OutputT>
229 __device__ __forceinline__ void BlockedToStriped(
230 InputT input_items[ITEMS_PER_THREAD],
231 OutputT output_items[ITEMS_PER_THREAD],
232 Int2Type<true> /*time_slicing*/)
233 {
// Staging registers; filled incrementally across slices.
234 InputT temp_items[ITEMS_PER_THREAD];
235
236 #pragma unroll
237 for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
238 {
239 const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS;
240 const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS;
241
242 CTA_SYNC();
243
// Only the warp owning this slice deposits its blocked items.
244 if (warp_id == SLICE)
245 {
246 #pragma unroll
247 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
248 {
249 int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
250 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
251 temp_storage.buff[item_offset] = input_items[ITEM];
252 }
253 }
254
255 CTA_SYNC();
256
257 #pragma unroll
258 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
259 {
260 // Read a strip of items
261 const int STRIP_OFFSET = ITEM * BLOCK_THREADS;
262 const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS;
263
// Skip strips that do not intersect the current slice.
264 if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
265 {
266 int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
267 if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
268 {
269 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
270 temp_items[ITEM] = temp_storage.buff[item_offset];
271 }
272 }
273 }
274 }
275
276 // Copy
277 #pragma unroll
278 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
279 {
280 output_items[ITEM] = temp_items[ITEM];
281 }
282 }
283
284
// Blocked -> warp-striped transpose, non-time-sliced: exchange happens
// entirely within each warp's private region of the shared buffer, so a
// warp-level sync suffices (no CTA-wide barrier).
288 template <typename OutputT>
289 __device__ __forceinline__ void BlockedToWarpStriped(
290 InputT input_items[ITEMS_PER_THREAD],
291 OutputT output_items[ITEMS_PER_THREAD],
292 Int2Type<false> /*time_slicing*/)
293 {
294 #pragma unroll
295 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
296 {
297 int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);
298 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
299 temp_storage.buff[item_offset] = input_items[ITEM];
300 }
301
// Full-warp participation mask; only intra-warp ordering is required.
302 WARP_SYNC(0xffffffff);
303
304 #pragma unroll
305 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
306 {
307 int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
308 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
309 output_items[ITEM] = temp_storage.buff[item_offset];
310 }
311 }
312
// Blocked -> warp-striped transpose, warp-time-sliced: warps share one
// warp-sized buffer and take turns. Warp 0 goes first without a CTA
// barrier; each subsequent warp waits on CTA_SYNC before reusing the
// buffer. Each warp's write/read pair is ordered by WARP_SYNC.
316 template <typename OutputT>
317 __device__ __forceinline__ void BlockedToWarpStriped(
318 InputT input_items[ITEMS_PER_THREAD],
319 OutputT output_items[ITEMS_PER_THREAD],
320 Int2Type<true> /*time_slicing*/)
321 {
322 if (warp_id == 0)
323 {
324 #pragma unroll
325 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
326 {
327 int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
328 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
329 temp_storage.buff[item_offset] = input_items[ITEM];
330 }
331
332 WARP_SYNC(0xffffffff);
333
334 #pragma unroll
335 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
336 {
337 int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
338 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
339 output_items[ITEM] = temp_storage.buff[item_offset];
340 }
341 }
342
// Remaining warps proceed one slice at a time behind a CTA barrier.
343 #pragma unroll
344 for (unsigned int SLICE = 1; SLICE < TIME_SLICES; ++SLICE)
345 {
346 CTA_SYNC();
347
348 if (warp_id == SLICE)
349 {
350 #pragma unroll
351 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
352 {
353 int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
354 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
355 temp_storage.buff[item_offset] = input_items[ITEM];
356 }
357
358 WARP_SYNC(0xffffffff);
359
360 #pragma unroll
361 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
362 {
363 int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
364 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
365 output_items[ITEM] = temp_storage.buff[item_offset];
366 }
367 }
368 }
369 }
370
371
// Striped -> blocked transpose, non-time-sliced (mirror of
// BlockedToStriped: write striped, barrier, read blocked).
375 template <typename OutputT>
376 __device__ __forceinline__ void StripedToBlocked(
377 InputT input_items[ITEMS_PER_THREAD],
378 OutputT output_items[ITEMS_PER_THREAD],
379 Int2Type<false> /*time_slicing*/)
380 {
381 #pragma unroll
382 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
383 {
384 int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
385 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
386 temp_storage.buff[item_offset] = input_items[ITEM];
387 }
388
389 CTA_SYNC();
390
391 // No timeslicing
392 #pragma unroll
393 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
394 {
395 int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
396 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
397 output_items[ITEM] = temp_storage.buff[item_offset];
398 }
399 }
400
401
// Striped -> blocked transpose, warp-time-sliced: all threads write the
// portion of their strips that falls inside the current slice, then the
// slice-owning warp reads its blocked run; repeated per warp.
405 template <typename OutputT>
406 __device__ __forceinline__ void StripedToBlocked(
407 InputT input_items[ITEMS_PER_THREAD],
408 OutputT output_items[ITEMS_PER_THREAD],
409 Int2Type<true> /*time_slicing*/)
410 {
411 // Warp time-slicing
412 InputT temp_items[ITEMS_PER_THREAD];
413
414 #pragma unroll
415 for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
416 {
417 const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS;
418 const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS;
419
420 CTA_SYNC();
421
422 #pragma unroll
423 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
424 {
425 // Write a strip of items
426 const int STRIP_OFFSET = ITEM * BLOCK_THREADS;
427 const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS;
428
429 if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
430 {
431 int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
432 if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
433 {
434 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
435 temp_storage.buff[item_offset] = input_items[ITEM];
436 }
437 }
438 }
439
440 CTA_SYNC();
441
// Only the warp owning this slice gathers its blocked result.
442 if (warp_id == SLICE)
443 {
444 #pragma unroll
445 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
446 {
447 int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
448 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
449 temp_items[ITEM] = temp_storage.buff[item_offset];
450 }
451 }
452 }
453
454 // Copy
455 #pragma unroll
456 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
457 {
458 output_items[ITEM] = temp_items[ITEM];
459 }
460 }
461
462
// Warp-striped -> blocked transpose, non-time-sliced: purely intra-warp
// exchange within each warp's region; WARP_SYNC suffices.
466 template <typename OutputT>
467 __device__ __forceinline__ void WarpStripedToBlocked(
468 InputT input_items[ITEMS_PER_THREAD],
469 OutputT output_items[ITEMS_PER_THREAD],
470 Int2Type<false> /*time_slicing*/)
471 {
472 #pragma unroll
473 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
474 {
475 int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
476 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
477 temp_storage.buff[item_offset] = input_items[ITEM];
478 }
479
480 WARP_SYNC(0xffffffff);
481
482 #pragma unroll
483 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
484 {
485 int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);
486 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
487 output_items[ITEM] = temp_storage.buff[item_offset];
488 }
489 }
490
491
// Warp-striped -> blocked transpose, warp-time-sliced: warps serialize
// over one warp-sized buffer; each warp's write/read pair is ordered by
// WARP_SYNC, and CTA_SYNC hands the buffer to the next warp.
495 template <typename OutputT>
496 __device__ __forceinline__ void WarpStripedToBlocked(
497 InputT input_items[ITEMS_PER_THREAD],
498 OutputT output_items[ITEMS_PER_THREAD],
499 Int2Type<true> /*time_slicing*/)
500 {
501 #pragma unroll
502 for (unsigned int SLICE = 0; SLICE < TIME_SLICES; ++SLICE)
503 {
504 CTA_SYNC();
505
506 if (warp_id == SLICE)
507 {
508 #pragma unroll
509 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
510 {
511 int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
512 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
513 temp_storage.buff[item_offset] = input_items[ITEM];
514 }
515
516 WARP_SYNC(0xffffffff);
517
518 #pragma unroll
519 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
520 {
521 int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
522 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
523 output_items[ITEM] = temp_storage.buff[item_offset];
524 }
525 }
526 }
527 }
528
529
// Rank-directed scatter into blocked arrangement, non-time-sliced:
// scatter by rank, barrier, gather blocked. Padding uses SHR_ADD
// ((x >> shift) + addend) for the bank-conflict skew.
533 template <typename OutputT, typename OffsetT>
534 __device__ __forceinline__ void ScatterToBlocked(
535 InputT input_items[ITEMS_PER_THREAD],
536 OutputT output_items[ITEMS_PER_THREAD],
537 OffsetT ranks[ITEMS_PER_THREAD],
538 Int2Type<false> /*time_slicing*/)
539 {
540 #pragma unroll
541 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
542 {
543 int item_offset = ranks[ITEM];
544 if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
545 temp_storage.buff[item_offset] = input_items[ITEM];
546 }
547
548 CTA_SYNC();
549
550 #pragma unroll
551 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
552 {
553 int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
554 if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
555 output_items[ITEM] = temp_storage.buff[item_offset];
556 }
557 }
558
// Rank-directed scatter into blocked arrangement, warp-time-sliced:
// per slice, every thread scatters the subset of its items whose ranks
// land in the slice window; the slice-owning warp then gathers.
562 template <typename OutputT, typename OffsetT>
563 __device__ __forceinline__ void ScatterToBlocked(
564 InputT input_items[ITEMS_PER_THREAD],
565 OutputT output_items[ITEMS_PER_THREAD],
566 OffsetT ranks[ITEMS_PER_THREAD],
567 Int2Type<true> /*time_slicing*/)
568 {
569 InputT temp_items[ITEMS_PER_THREAD];
570
571 #pragma unroll
572 for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
573 {
574 CTA_SYNC();
575
576 const int SLICE_OFFSET = TIME_SLICED_ITEMS * SLICE;
577
578 #pragma unroll
579 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
580 {
// Rebase the rank to the slice window and bounds-check it.
581 int item_offset = ranks[ITEM] - SLICE_OFFSET;
582 if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
583 {
584 if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
585 temp_storage.buff[item_offset] = input_items[ITEM];
586 }
587 }
588
589 CTA_SYNC();
590
591 if (warp_id == SLICE)
592 {
593 #pragma unroll
594 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
595 {
596 int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
597 if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
598 temp_items[ITEM] = temp_storage.buff[item_offset];
599 }
600 }
601 }
602
603 // Copy
604 #pragma unroll
605 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
606 {
607 output_items[ITEM] = temp_items[ITEM];
608 }
609 }
610
611
// Rank-directed scatter into striped arrangement, non-time-sliced:
// scatter by rank, barrier, gather striped.
615 template <typename OutputT, typename OffsetT>
616 __device__ __forceinline__ void ScatterToStriped(
617 InputT input_items[ITEMS_PER_THREAD],
618 OutputT output_items[ITEMS_PER_THREAD],
619 OffsetT ranks[ITEMS_PER_THREAD],
620 Int2Type<false> /*time_slicing*/)
621 {
622 #pragma unroll
623 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
624 {
625 int item_offset = ranks[ITEM];
626 if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
627 temp_storage.buff[item_offset] = input_items[ITEM];
628 }
629
630 CTA_SYNC();
631
632 #pragma unroll
633 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
634 {
635 int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
636 if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
637 output_items[ITEM] = temp_storage.buff[item_offset];
638 }
639 }
640
641
// Rank-directed scatter into striped arrangement, warp-time-sliced:
// per slice, scatter in-window ranks, then all threads read back the
// overlap of their strips with the slice.
645 template <typename OutputT, typename OffsetT>
646 __device__ __forceinline__ void ScatterToStriped(
647 InputT input_items[ITEMS_PER_THREAD],
648 OutputT output_items[ITEMS_PER_THREAD],
649 OffsetT ranks[ITEMS_PER_THREAD],
650 Int2Type<true> /*time_slicing*/)
651 {
652 InputT temp_items[ITEMS_PER_THREAD];
653
654 #pragma unroll
655 for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
656 {
657 const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS;
658 const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS;
659
660 CTA_SYNC();
661
662 #pragma unroll
663 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
664 {
665 int item_offset = ranks[ITEM] - SLICE_OFFSET;
666 if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
667 {
668 if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
669 temp_storage.buff[item_offset] = input_items[ITEM];
670 }
671 }
672
673 CTA_SYNC();
674
675 #pragma unroll
676 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
677 {
678 // Read a strip of items
679 const int STRIP_OFFSET = ITEM * BLOCK_THREADS;
680 const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS;
681
682 if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
683 {
684 int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
685 if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
686 {
687 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
688 temp_items[ITEM] = temp_storage.buff[item_offset];
689 }
690 }
691 }
692 }
693
694 // Copy
695 #pragma unroll
696 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
697 {
698 output_items[ITEM] = temp_items[ITEM];
699 }
700 }
701
702
703public:
704
705 /******************************************************************/
709
// Default collective constructor: uses a private static shared-memory
// allocation as temporary storage.
// NOTE(review): Doxygen line 715 — the 'temp_storage(PrivateStorage()),'
// initializer for the reference member — is missing from this extracted
// listing; as shown, the reference would be uninitialized. Presumed an
// extraction gap; verify against the original header.
713 __device__ __forceinline__ BlockExchange()
714 :
716 linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
717 warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
718 lane_id(LaneId()),
719 warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)
720 {}
721
722
// Collective constructor using caller-provided TempStorage.
// NOTE(review): Doxygen line 727 — the 'TempStorage &temp_storage)'
// parameter line — is missing from this extracted listing.
// Members are initialized in declaration order (temp_storage, linear_tid,
// lane_id, warp_id, warp_offset), so warp_id is set before warp_offset
// reads it, regardless of the order listed here.
726 __device__ __forceinline__ BlockExchange(
728 :
729 temp_storage(temp_storage.Alias()),
730 linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
731 lane_id(LaneId()),
732 warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
733 warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)
734 {}
735
736
738 /******************************************************************/
742
// Public entry point: dispatches to the time-sliced or direct
// specialization via the compile-time WARP_TIME_SLICING tag.
779 template <typename OutputT>
780 __device__ __forceinline__ void StripedToBlocked(
781 InputT input_items[ITEMS_PER_THREAD],
782 OutputT output_items[ITEMS_PER_THREAD])
783 {
784 StripedToBlocked(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
785 }
786
787
// Public entry point: compile-time dispatch on WARP_TIME_SLICING.
828 template <typename OutputT>
829 __device__ __forceinline__ void BlockedToStriped(
830 InputT input_items[ITEMS_PER_THREAD],
831 OutputT output_items[ITEMS_PER_THREAD])
832 {
833 BlockedToStriped(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
834 }
835
836
837
// Public entry point: compile-time dispatch on WARP_TIME_SLICING.
876 template <typename OutputT>
877 __device__ __forceinline__ void WarpStripedToBlocked(
878 InputT input_items[ITEMS_PER_THREAD],
879 OutputT output_items[ITEMS_PER_THREAD])
880 {
881 WarpStripedToBlocked(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
882 }
883
884
885
// Public entry point: compile-time dispatch on WARP_TIME_SLICING.
927 template <typename OutputT>
928 __device__ __forceinline__ void BlockedToWarpStriped(
929 InputT input_items[ITEMS_PER_THREAD],
930 OutputT output_items[ITEMS_PER_THREAD])
931 {
932 BlockedToWarpStriped(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
933 }
934
935
936
938 /******************************************************************/
942
943
// Public entry point: compile-time dispatch on WARP_TIME_SLICING.
952 template <typename OutputT, typename OffsetT>
953 __device__ __forceinline__ void ScatterToBlocked(
954 InputT input_items[ITEMS_PER_THREAD],
955 OutputT output_items[ITEMS_PER_THREAD],
956 OffsetT ranks[ITEMS_PER_THREAD])
957 {
958 ScatterToBlocked(input_items, output_items, ranks, Int2Type<WARP_TIME_SLICING>());
959 }
960
961
962
// Public entry point: compile-time dispatch on WARP_TIME_SLICING.
971 template <typename OutputT, typename OffsetT>
972 __device__ __forceinline__ void ScatterToStriped(
973 InputT input_items[ITEMS_PER_THREAD],
974 OutputT output_items[ITEMS_PER_THREAD],
975 OffsetT ranks[ITEMS_PER_THREAD])
976 {
977 ScatterToStriped(input_items, output_items, ranks, Int2Type<WARP_TIME_SLICING>());
978 }
979
980
981
// Guarded scatter into striped arrangement: items with a negative rank
// are not written (their slot keeps whatever was previously in the
// shared buffer). No time-sliced variant exists for this method.
990 template <typename OutputT, typename OffsetT>
991 __device__ __forceinline__ void ScatterToStripedGuarded(
992 InputT input_items[ITEMS_PER_THREAD],
993 OutputT output_items[ITEMS_PER_THREAD],
994 OffsetT ranks[ITEMS_PER_THREAD])
995 {
996 #pragma unroll
997 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
998 {
999 int item_offset = ranks[ITEM];
1000 if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
// Guard: rank < 0 marks an item that must not be exchanged.
1001 if (ranks[ITEM] >= 0)
1002 temp_storage.buff[item_offset] = input_items[ITEM];
1003 }
1004
1005 CTA_SYNC();
1006
1007 #pragma unroll
1008 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
1009 {
1010 int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
1011 if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
1012 output_items[ITEM] = temp_storage.buff[item_offset];
1013 }
1014 }
1015
1016
1017
1018
// Flagged scatter into striped arrangement: only items whose is_valid
// flag is truthy are written to shared memory; every thread still reads
// back a full striped set.
1028 template <typename OutputT, typename OffsetT, typename ValidFlag>
1029 __device__ __forceinline__ void ScatterToStripedFlagged(
1030 InputT input_items[ITEMS_PER_THREAD],
1031 OutputT output_items[ITEMS_PER_THREAD],
1032 OffsetT ranks[ITEMS_PER_THREAD],
1033 ValidFlag is_valid[ITEMS_PER_THREAD])
1034 {
1035 #pragma unroll
1036 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
1037 {
1038 int item_offset = ranks[ITEM];
1039 if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
1040 if (is_valid[ITEM])
1041 temp_storage.buff[item_offset] = input_items[ITEM];
1042 }
1043
1044 CTA_SYNC();
1045
1046 #pragma unroll
1047 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
1048 {
1049 int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
1050 if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
1051 output_items[ITEM] = temp_storage.buff[item_offset];
1052 }
1053 }
1054
1055
1057
1058
1059
1060#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
1061
1062
// In-place convenience overloads: each forwards to the corresponding
// two-array method using the same array as input and output.
1063 __device__ __forceinline__ void StripedToBlocked(
1064 InputT items[ITEMS_PER_THREAD])
1065 {
1066 StripedToBlocked(items, items);
1067 }
1068
1069 __device__ __forceinline__ void BlockedToStriped(
1070 InputT items[ITEMS_PER_THREAD])
1071 {
1072 BlockedToStriped(items, items);
1073 }
1074
1075 __device__ __forceinline__ void WarpStripedToBlocked(
1076 InputT items[ITEMS_PER_THREAD])
1077 {
1078 WarpStripedToBlocked(items, items);
1079 }
1080
1081 __device__ __forceinline__ void BlockedToWarpStriped(
1082 InputT items[ITEMS_PER_THREAD])
1083 {
1084 BlockedToWarpStriped(items, items);
1085 }
1086
1087 template <typename OffsetT>
1088 __device__ __forceinline__ void ScatterToBlocked(
1089 InputT items[ITEMS_PER_THREAD],
1090 OffsetT ranks[ITEMS_PER_THREAD])
1091 {
1092 ScatterToBlocked(items, items, ranks);
1093 }
1094
1095 template <typename OffsetT>
1096 __device__ __forceinline__ void ScatterToStriped(
1097 InputT items[ITEMS_PER_THREAD],
1098 OffsetT ranks[ITEMS_PER_THREAD])
1099 {
1100 ScatterToStriped(items, items, ranks);
1101 }
1102
1103 template <typename OffsetT>
1104 __device__ __forceinline__ void ScatterToStripedGuarded(
1105 InputT items[ITEMS_PER_THREAD],
1106 OffsetT ranks[ITEMS_PER_THREAD])
1107 {
1108 ScatterToStripedGuarded(items, items, ranks);
1109 }
1110
/**
 * In-place convenience overload: exchanges valid data items annotated by
 * rank into striped arrangement. items[] serves as both input and output;
 * is_valid[ITEM] gates whether the corresponding item is written.
 *
 * Bug fix: the original body forwarded to
 * ScatterToStriped(items, items, ranks, is_valid), but no ScatterToStriped
 * overload accepts a flag array as its fourth argument (the four-argument
 * overloads take an Int2Type<> dispatch tag), so the flags could never
 * reach the flagged exchange. Forward to the two-array
 * ScatterToStripedFlagged overload instead, matching the other in-place
 * wrappers above.
 */
template <typename OffsetT, typename ValidFlag>
__device__ __forceinline__ void ScatterToStripedFlagged(
    InputT      items[ITEMS_PER_THREAD],
    OffsetT     ranks[ITEMS_PER_THREAD],
    ValidFlag   is_valid[ITEMS_PER_THREAD])
{
    ScatterToStripedFlagged(items, items, ranks, is_valid);
}
1119
1120#endif // DOXYGEN_SHOULD_SKIP_THIS
1121
1122
1123};
1124
1125
1126#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
1127
1128
// WarpExchange<T, ITEMS_PER_THREAD, LOGICAL_WARP_THREADS>: rank-directed
// exchange within a single (logical) warp.
// NOTE(review): the 'class WarpExchange' declarator line (Doxygen line
// 1134) and the 'struct _TempStorage' declarator line (Doxygen line 1163)
// are missing from this extracted listing.
1129template <
1130 typename T,
1131 int ITEMS_PER_THREAD,
1132 int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS,
1133 int PTX_ARCH = CUB_PTX_ARCH>
1135{
1136private:
1137
1138 /******************************************************************************
1139 * Constants
1140 ******************************************************************************/
1141
1143 enum
1144 {
1145 // Whether the logical warp size and the PTX warp size coincide
1146 IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
1147
// Buffer capacity for one logical warp (the +1 slot is part of the
// declared layout; see the array dimension below).
1148 WARP_ITEMS = (ITEMS_PER_THREAD * LOGICAL_WARP_THREADS) + 1,
1149
1150 LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH),
1151 SMEM_BANKS = 1 << LOG_SMEM_BANKS,
1152
1153 // Insert padding if the number of items per thread is a power of two and > 4 (otherwise we can typically use 128b loads)
1154 INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (PowerOfTwo<ITEMS_PER_THREAD>::VALUE),
1155 PADDING_ITEMS = (INSERT_PADDING) ? (WARP_ITEMS >> LOG_SMEM_BANKS) : 0,
1156 };
1157
1158 /******************************************************************************
1159 * Type definitions
1160 ******************************************************************************/
1161
1164 {
1165 T buff[WARP_ITEMS + PADDING_ITEMS];
1166 };
1167
1168public:
1169
// Public, union-aliasable wrapper for the shared-memory layout.
1171 struct TempStorage : Uninitialized<_TempStorage> {};
1172
1173private:
1174
1175
1176 /******************************************************************************
1177 * Thread fields
1178 ******************************************************************************/
1179
1180 _TempStorage &temp_storage;
1181 int lane_id;
1182
1183public:
1184
1185 /******************************************************************************
1186 * Construction
1187 ******************************************************************************/
1188
// Constructor: binds caller-provided storage and derives the lane id,
// folding the hardware lane into the logical warp when they differ.
1190 __device__ __forceinline__ WarpExchange(
1191 TempStorage &temp_storage)
1192 :
1193 temp_storage(temp_storage.Alias()),
1194 lane_id(IS_ARCH_WARP ?
1195 LaneId() :
1196 LaneId() % LOGICAL_WARP_THREADS)
1197 {}
1198
1199
1200 /******************************************************************************
1201 * Interface
1202 ******************************************************************************/
1203
// In-place rank-directed exchange into striped arrangement across the
// logical warp: scatter by rank, warp barrier, gather striped.
// NOTE(review): when INSERT_PADDING is enabled, the skew is written back
// into ranks[] — array parameters decay to pointers, so this mutates the
// caller's ranks array; confirm callers do not reuse ranks afterwards.
1212 template <typename OffsetT>
1213 __device__ __forceinline__ void ScatterToStriped(
1214 T items[ITEMS_PER_THREAD],
1215 OffsetT ranks[ITEMS_PER_THREAD])
1216 {
1217 #pragma unroll
1218 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
1219 {
1220 if (INSERT_PADDING) ranks[ITEM] = SHR_ADD(ranks[ITEM], LOG_SMEM_BANKS, ranks[ITEM]);
1221 temp_storage.buff[ranks[ITEM]] = items[ITEM];
1222 }
1223
// Order the scatter before the gather within the warp.
1224 WARP_SYNC(0xffffffff);
1225
1226 #pragma unroll
1227 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
1228 {
1229 int item_offset = (ITEM * LOGICAL_WARP_THREADS) + lane_id;
1230 if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
1231 items[ITEM] = temp_storage.buff[item_offset];
1232 }
1233 }
1234
1235};
1236
1237
1238
1239
1240#endif // DOXYGEN_SHOULD_SKIP_THIS
1241
1242
1243
1244
1245
1246} // CUB namespace
1247CUB_NS_POSTFIX // Optional outer namespace(s)
1248
The BlockExchange class provides collective methods for rearranging data partitioned across a CUDA thread block.
__device__ __forceinline__ void StripedToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD])
Transposes data items from striped arrangement to blocked arrangement.
__device__ __forceinline__ BlockExchange(TempStorage &temp_storage)
Collective constructor using the specified memory allocation as temporary storage.
__device__ __forceinline__ void BlockedToStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD])
Transposes data items from blocked arrangement to striped arrangement.
__device__ __forceinline__ void ScatterToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], Int2Type< false >)
__device__ __forceinline__ void ScatterToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD])
Exchanges data items annotated by rank into blocked arrangement.
__device__ __forceinline__ BlockExchange()
Collective constructor using a private static allocation of shared memory as temporary storage.
__device__ __forceinline__ void StripedToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< false >)
__device__ __forceinline__ void ScatterToStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], Int2Type< true >)
__device__ __forceinline__ void StripedToBlocked(InputT items[ITEMS_PER_THREAD])
__device__ __forceinline__ void BlockedToWarpStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< false >)
__device__ __forceinline__ void ScatterToStripedGuarded(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD])
Exchanges data items annotated by rank into striped arrangement. Items with rank -1 are not exchanged...
@ BLOCK_THREADS
The thread block size in threads.
__device__ __forceinline__ _TempStorage & PrivateStorage()
Internal storage allocator.
__device__ __forceinline__ void BlockedToWarpStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< true >)
_TempStorage & temp_storage
Shared storage reference.
__device__ __forceinline__ void BlockedToStriped(InputT items[ITEMS_PER_THREAD])
__device__ __forceinline__ void ScatterToStripedGuarded(InputT items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD])
__device__ __forceinline__ void BlockedToWarpStriped(InputT items[ITEMS_PER_THREAD])
__device__ __forceinline__ void ScatterToStriped(InputT items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD])
__device__ __forceinline__ void WarpStripedToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD])
Transposes data items from warp-striped arrangement to blocked arrangement.
struct __align__(16) _TempStorage
Shared memory storage layout type.
__device__ __forceinline__ void ScatterToStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD])
Exchanges data items annotated by rank into striped arrangement.
__device__ __forceinline__ void WarpStripedToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< false >)
__device__ __forceinline__ void ScatterToStripedFlagged(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], ValidFlag is_valid[ITEMS_PER_THREAD])
Exchanges valid data items annotated by rank into striped arrangement.
__device__ __forceinline__ void StripedToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< true >)
__device__ __forceinline__ void BlockedToStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< true >)
__device__ __forceinline__ void ScatterToStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], Int2Type< false >)
__device__ __forceinline__ void ScatterToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], Int2Type< true >)
__device__ __forceinline__ void WarpStripedToBlocked(InputT items[ITEMS_PER_THREAD])
__device__ __forceinline__ void WarpStripedToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< true >)
unsigned int linear_tid
Linear thread-id.
__device__ __forceinline__ void BlockedToStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< false >)
__device__ __forceinline__ void ScatterToBlocked(InputT items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD])
__device__ __forceinline__ void BlockedToWarpStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD])
Transposes data items from blocked arrangement to warp-striped arrangement.
__device__ __forceinline__ void ScatterToStripedFlagged(InputT items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], ValidFlag is_valid[ITEMS_PER_THREAD])
__device__ __forceinline__ WarpExchange(TempStorage &temp_storage)
Constructor.
__device__ __forceinline__ void ScatterToStriped(T items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD])
Exchanges valid data items annotated by rank into striped arrangement.
#define CUB_MIN(a, b)
Select minimum(a, b)
__device__ __forceinline__ void WARP_SYNC(unsigned int member_mask)
Definition util_ptx.cuh:273
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
Definition util_ptx.cuh:409
__device__ __forceinline__ unsigned int SHR_ADD(unsigned int x, unsigned int shift, unsigned int addend)
Shift-right then add. Returns (x >> shift) + addend.
Definition util_ptx.cuh:87
__device__ __forceinline__ unsigned int LaneId()
Returns the warp lane ID of the calling thread.
Definition util_ptx.cuh:420
CTA_SYNC()
Definition util_ptx.cuh:255
Optional outer namespace(s)
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT int
[in] The number of segments that comprise the sorting data
OffsetT OffsetT
[in] Total number of input data items
\smemstorage{BlockExchange}
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static c...
Statically determine if N is a power-of-two.
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
\smemstorage{WarpExchange}
Shared memory storage layout type.
#define CUB_LOG_WARP_THREADS(arch)
Number of threads per warp.
Definition util_arch.cuh:73
#define CUB_LOG_SMEM_BANKS(arch)
Number of smem banks.
Definition util_arch.cuh:85
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host p...
Definition util_arch.cuh:53