36#include "../util_ptx.cuh"
37#include "../util_arch.cuh"
38#include "../util_macro.cuh"
39#include "../util_type.cuh"
40#include "../util_namespace.cuh"
111 int ITEMS_PER_THREAD,
112 bool WARP_TIME_SLICING =
false,
131 WARP_THREADS = 1 << LOG_WARP_THREADS,
135 SMEM_BANKS = 1 << LOG_SMEM_BANKS,
139 TIME_SLICES = (WARP_TIME_SLICING) ? WARPS : 1,
142 TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ITEMS_PER_THREAD,
145 WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD,
148 INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (
PowerOfTwo<ITEMS_PER_THREAD>::VALUE),
149 PADDING_ITEMS = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0,
159 InputT buff[TIME_SLICED_ITEMS + PADDING_ITEMS];
179 unsigned int lane_id;
180 unsigned int warp_id;
181 unsigned int warp_offset;
191 __shared__ _TempStorage private_storage;
192 return private_storage;
199 template <
typename OutputT>
201 InputT input_items[ITEMS_PER_THREAD],
202 OutputT output_items[ITEMS_PER_THREAD],
206 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
208 int item_offset = (
linear_tid * ITEMS_PER_THREAD) + ITEM;
209 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
216 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
219 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
228 template <
typename OutputT>
230 InputT input_items[ITEMS_PER_THREAD],
231 OutputT output_items[ITEMS_PER_THREAD],
234 InputT temp_items[ITEMS_PER_THREAD];
237 for (
int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
239 const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS;
240 const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS;
244 if (warp_id == SLICE)
247 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
249 int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
250 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
258 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
264 if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
266 int item_offset = STRIP_OFFSET +
linear_tid - SLICE_OFFSET;
267 if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
269 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
278 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
280 output_items[ITEM] = temp_items[ITEM];
288 template <
typename OutputT>
290 InputT input_items[ITEMS_PER_THREAD],
291 OutputT output_items[ITEMS_PER_THREAD],
295 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
297 int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);
298 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
305 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
307 int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
308 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
316 template <
typename OutputT>
318 InputT input_items[ITEMS_PER_THREAD],
319 OutputT output_items[ITEMS_PER_THREAD],
325 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
327 int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
328 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
335 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
337 int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
338 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
344 for (
unsigned int SLICE = 1; SLICE < TIME_SLICES; ++SLICE)
348 if (warp_id == SLICE)
351 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
353 int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
354 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
361 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
363 int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
364 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
375 template <
typename OutputT>
377 InputT input_items[ITEMS_PER_THREAD],
378 OutputT output_items[ITEMS_PER_THREAD],
382 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
385 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
393 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
395 int item_offset = (
linear_tid * ITEMS_PER_THREAD) + ITEM;
396 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
405 template <
typename OutputT>
407 InputT input_items[ITEMS_PER_THREAD],
408 OutputT output_items[ITEMS_PER_THREAD],
412 InputT temp_items[ITEMS_PER_THREAD];
415 for (
int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
417 const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS;
418 const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS;
423 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
429 if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
431 int item_offset = STRIP_OFFSET +
linear_tid - SLICE_OFFSET;
432 if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
434 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
442 if (warp_id == SLICE)
445 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
447 int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
448 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
456 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
458 output_items[ITEM] = temp_items[ITEM];
466 template <
typename OutputT>
468 InputT input_items[ITEMS_PER_THREAD],
469 OutputT output_items[ITEMS_PER_THREAD],
473 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
475 int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
476 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
483 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
485 int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);
486 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
495 template <
typename OutputT>
497 InputT input_items[ITEMS_PER_THREAD],
498 OutputT output_items[ITEMS_PER_THREAD],
502 for (
unsigned int SLICE = 0; SLICE < TIME_SLICES; ++SLICE)
506 if (warp_id == SLICE)
509 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
511 int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
512 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
519 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
521 int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
522 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
533 template <
typename OutputT,
typename OffsetT>
535 InputT input_items[ITEMS_PER_THREAD],
536 OutputT output_items[ITEMS_PER_THREAD],
537 OffsetT ranks[ITEMS_PER_THREAD],
541 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
543 int item_offset = ranks[ITEM];
544 if (INSERT_PADDING) item_offset =
SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
551 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
553 int item_offset = (
linear_tid * ITEMS_PER_THREAD) + ITEM;
554 if (INSERT_PADDING) item_offset =
SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
562 template <
typename OutputT,
typename OffsetT>
564 InputT input_items[ITEMS_PER_THREAD],
565 OutputT output_items[ITEMS_PER_THREAD],
566 OffsetT ranks[ITEMS_PER_THREAD],
569 InputT temp_items[ITEMS_PER_THREAD];
572 for (
int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
576 const int SLICE_OFFSET = TIME_SLICED_ITEMS * SLICE;
579 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
581 int item_offset = ranks[ITEM] - SLICE_OFFSET;
582 if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
584 if (INSERT_PADDING) item_offset =
SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
591 if (warp_id == SLICE)
594 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
596 int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
597 if (INSERT_PADDING) item_offset =
SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
605 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
607 output_items[ITEM] = temp_items[ITEM];
615 template <
typename OutputT,
typename OffsetT>
617 InputT input_items[ITEMS_PER_THREAD],
618 OutputT output_items[ITEMS_PER_THREAD],
619 OffsetT ranks[ITEMS_PER_THREAD],
623 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
625 int item_offset = ranks[ITEM];
626 if (INSERT_PADDING) item_offset =
SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
633 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
636 if (INSERT_PADDING) item_offset =
SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
645 template <
typename OutputT,
typename OffsetT>
647 InputT input_items[ITEMS_PER_THREAD],
648 OutputT output_items[ITEMS_PER_THREAD],
649 OffsetT ranks[ITEMS_PER_THREAD],
652 InputT temp_items[ITEMS_PER_THREAD];
655 for (
int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
657 const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS;
658 const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS;
663 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
665 int item_offset = ranks[ITEM] - SLICE_OFFSET;
666 if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
668 if (INSERT_PADDING) item_offset =
SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
676 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
682 if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
684 int item_offset = STRIP_OFFSET +
linear_tid - SLICE_OFFSET;
685 if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
687 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
696 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
698 output_items[ITEM] = temp_items[ITEM];
717 warp_id((WARPS == 1) ? 0 :
linear_tid / WARP_THREADS),
719 warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)
732 warp_id((WARPS == 1) ? 0 :
linear_tid / WARP_THREADS),
733 warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)
779 template <
typename OutputT>
781 InputT input_items[ITEMS_PER_THREAD],
782 OutputT output_items[ITEMS_PER_THREAD])
828 template <
typename OutputT>
830 InputT input_items[ITEMS_PER_THREAD],
831 OutputT output_items[ITEMS_PER_THREAD])
876 template <
typename OutputT>
878 InputT input_items[ITEMS_PER_THREAD],
879 OutputT output_items[ITEMS_PER_THREAD])
927 template <
typename OutputT>
929 InputT input_items[ITEMS_PER_THREAD],
930 OutputT output_items[ITEMS_PER_THREAD])
952 template <
typename OutputT,
typename OffsetT>
954 InputT input_items[ITEMS_PER_THREAD],
955 OutputT output_items[ITEMS_PER_THREAD],
956 OffsetT ranks[ITEMS_PER_THREAD])
971 template <
typename OutputT,
typename OffsetT>
973 InputT input_items[ITEMS_PER_THREAD],
974 OutputT output_items[ITEMS_PER_THREAD],
975 OffsetT ranks[ITEMS_PER_THREAD])
990 template <
typename OutputT,
typename OffsetT>
992 InputT input_items[ITEMS_PER_THREAD],
993 OutputT output_items[ITEMS_PER_THREAD],
994 OffsetT ranks[ITEMS_PER_THREAD])
997 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
999 int item_offset = ranks[ITEM];
1000 if (INSERT_PADDING) item_offset =
SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
1001 if (ranks[ITEM] >= 0)
1008 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
1011 if (INSERT_PADDING) item_offset =
SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
1028 template <
typename OutputT,
typename OffsetT,
typename Val
idFlag>
1030 InputT input_items[ITEMS_PER_THREAD],
1031 OutputT output_items[ITEMS_PER_THREAD],
1032 OffsetT ranks[ITEMS_PER_THREAD],
1033 ValidFlag is_valid[ITEMS_PER_THREAD])
1036 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
1038 int item_offset = ranks[ITEM];
1039 if (INSERT_PADDING) item_offset =
SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
1047 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
1050 if (INSERT_PADDING) item_offset =
SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
1060#ifndef DOXYGEN_SHOULD_SKIP_THIS
1064 InputT items[ITEMS_PER_THREAD])
1070 InputT items[ITEMS_PER_THREAD])
1076 InputT items[ITEMS_PER_THREAD])
1082 InputT items[ITEMS_PER_THREAD])
1087 template <
typename OffsetT>
1089 InputT items[ITEMS_PER_THREAD],
1090 OffsetT ranks[ITEMS_PER_THREAD])
1095 template <
typename OffsetT>
1097 InputT items[ITEMS_PER_THREAD],
1098 OffsetT ranks[ITEMS_PER_THREAD])
1103 template <
typename OffsetT>
1105 InputT items[ITEMS_PER_THREAD],
1106 OffsetT ranks[ITEMS_PER_THREAD])
1111 template <
typename OffsetT,
typename Val
idFlag>
1113 InputT items[ITEMS_PER_THREAD],
1114 OffsetT ranks[ITEMS_PER_THREAD],
1115 ValidFlag is_valid[ITEMS_PER_THREAD])
1126#ifndef DOXYGEN_SHOULD_SKIP_THIS
1131 int ITEMS_PER_THREAD,
1132 int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS,
1146 IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
1148 WARP_ITEMS = (ITEMS_PER_THREAD * LOGICAL_WARP_THREADS) + 1,
1151 SMEM_BANKS = 1 << LOG_SMEM_BANKS,
1155 PADDING_ITEMS = (INSERT_PADDING) ? (WARP_ITEMS >> LOG_SMEM_BANKS) : 0,
1165 T buff[WARP_ITEMS + PADDING_ITEMS];
1193 temp_storage(temp_storage.Alias()),
1194 lane_id(IS_ARCH_WARP ?
1196 LaneId() % LOGICAL_WARP_THREADS)
1212 template <
typename OffsetT>
1214 T items[ITEMS_PER_THREAD],
1215 OffsetT ranks[ITEMS_PER_THREAD])
1218 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
1220 if (INSERT_PADDING) ranks[ITEM] =
SHR_ADD(ranks[ITEM], LOG_SMEM_BANKS, ranks[ITEM]);
1221 temp_storage.buff[ranks[ITEM]] = items[ITEM];
1227 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
1229 int item_offset = (ITEM * LOGICAL_WARP_THREADS) + lane_id;
1230 if (INSERT_PADDING) item_offset =
SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
1231 items[ITEM] = temp_storage.buff[item_offset];
The BlockExchange class provides collective methods for rearranging data partitioned across a CUDA th...
__device__ __forceinline__ void StripedToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD])
Transposes data items from striped arrangement to blocked arrangement.
__device__ __forceinline__ BlockExchange(TempStorage &temp_storage)
Collective constructor using the specified memory allocation as temporary storage.
__device__ __forceinline__ void BlockedToStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD])
Transposes data items from blocked arrangement to striped arrangement.
__device__ __forceinline__ void ScatterToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], Int2Type< false >)
__device__ __forceinline__ void ScatterToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD])
Exchanges data items annotated by rank into blocked arrangement.
__device__ __forceinline__ BlockExchange()
Collective constructor using a private static allocation of shared memory as temporary storage.
__device__ __forceinline__ void StripedToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< false >)
__device__ __forceinline__ void ScatterToStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], Int2Type< true >)
__device__ __forceinline__ void StripedToBlocked(InputT items[ITEMS_PER_THREAD])
__device__ __forceinline__ void BlockedToWarpStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< false >)
__device__ __forceinline__ void ScatterToStripedGuarded(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD])
Exchanges data items annotated by rank into striped arrangement. Items with rank -1 are not exchanged...
@ BLOCK_THREADS
The thread block size in threads.
__device__ __forceinline__ _TempStorage & PrivateStorage()
Internal storage allocator.
__device__ __forceinline__ void BlockedToWarpStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< true >)
_TempStorage & temp_storage
Shared storage reference.
__device__ __forceinline__ void BlockedToStriped(InputT items[ITEMS_PER_THREAD])
__device__ __forceinline__ void ScatterToStripedGuarded(InputT items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD])
__device__ __forceinline__ void BlockedToWarpStriped(InputT items[ITEMS_PER_THREAD])
__device__ __forceinline__ void ScatterToStriped(InputT items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD])
__device__ __forceinline__ void WarpStripedToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD])
Transposes data items from warp-striped arrangement to blocked arrangement.
struct __align__(16) _TempStorage
Shared memory storage layout type.
__device__ __forceinline__ void ScatterToStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD])
Exchanges data items annotated by rank into striped arrangement.
__device__ __forceinline__ void WarpStripedToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< false >)
__device__ __forceinline__ void ScatterToStripedFlagged(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], ValidFlag is_valid[ITEMS_PER_THREAD])
Exchanges valid data items annotated by rank into striped arrangement.
__device__ __forceinline__ void StripedToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< true >)
__device__ __forceinline__ void BlockedToStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< true >)
__device__ __forceinline__ void ScatterToStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], Int2Type< false >)
__device__ __forceinline__ void ScatterToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], Int2Type< true >)
__device__ __forceinline__ void WarpStripedToBlocked(InputT items[ITEMS_PER_THREAD])
__device__ __forceinline__ void WarpStripedToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< true >)
unsigned int linear_tid
Linear thread-id.
__device__ __forceinline__ void BlockedToStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< false >)
__device__ __forceinline__ void ScatterToBlocked(InputT items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD])
__device__ __forceinline__ void BlockedToWarpStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD])
Transposes data items from blocked arrangement to warp-striped arrangement.
__device__ __forceinline__ void ScatterToStripedFlagged(InputT items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], ValidFlag is_valid[ITEMS_PER_THREAD])
__device__ __forceinline__ WarpExchange(TempStorage &temp_storage)
Constructor.
__device__ __forceinline__ void ScatterToStriped(T items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD])
Exchanges valid data items annotated by rank into striped arrangement.
#define CUB_MIN(a, b)
Select minimum(a, b)
__device__ __forceinline__ void WARP_SYNC(unsigned int member_mask)
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
__device__ __forceinline__ unsigned int SHR_ADD(unsigned int x, unsigned int shift, unsigned int addend)
Shift-right then add. Returns (x >> shift) + addend.
__device__ __forceinline__ unsigned int LaneId()
Returns the warp lane ID of the calling thread.
Optional outer namespace(s)
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT int
[in] The number of segments that comprise the sorting data
OffsetT OffsetT
[in] Total number of input data items
\smemstorage{BlockExchange}
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static c...
Statically determine if N is a power-of-two.
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
\smemstorage{WarpExchange}
Shared memory storage layout type.
#define CUB_LOG_WARP_THREADS(arch)
Number of threads per warp.
#define CUB_LOG_SMEM_BANKS(arch)
Number of smem banks.
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host p...