36 #include "../util_type.cuh" 37 #include "../util_ptx.cuh" 38 #include "../util_namespace.cuh" 140 return private_storage;
145 template <typename FlagOp, bool HAS_PARAM = BinaryOpHasIdxParam<T, FlagOp>::HAS_PARAM>
149 static __device__ __forceinline__
bool FlagT(FlagOp flag_op,
const T &a,
const T &b,
int idx)
151 return flag_op(a, b, idx);
156 template <
typename FlagOp>
160 static __device__ __forceinline__
bool FlagT(FlagOp flag_op,
const T &a,
const T &b,
int )
162 return flag_op(a, b);
167 template <
int ITERATION,
int MAX_ITERATIONS>
172 int ITEMS_PER_THREAD,
177 FlagT (&flags)[ITEMS_PER_THREAD],
178 T (&input)[ITEMS_PER_THREAD],
179 T (&preds)[ITEMS_PER_THREAD],
182 preds[ITERATION] = input[ITERATION - 1];
195 int ITEMS_PER_THREAD,
200 FlagT (&flags)[ITEMS_PER_THREAD],
201 T (&input)[ITEMS_PER_THREAD],
207 input[ITERATION + 1],
208 (
linear_tid * ITEMS_PER_THREAD) + ITERATION + 1);
216 template <
int MAX_ITERATIONS>
217 struct Iterate<MAX_ITERATIONS, MAX_ITERATIONS>
221 int ITEMS_PER_THREAD,
224 static __device__ __forceinline__
void FlagHeads(
226 FlagT (&)[ITEMS_PER_THREAD],
227 T (&)[ITEMS_PER_THREAD],
228 T (&)[ITEMS_PER_THREAD],
234 int ITEMS_PER_THREAD,
237 static __device__ __forceinline__
void FlagTails(
239 FlagT (&)[ITEMS_PER_THREAD],
240 T (&)[ITEMS_PER_THREAD],
296 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 299 int ITEMS_PER_THREAD,
303 FlagT (&head_flags)[ITEMS_PER_THREAD],
304 T (&input)[ITEMS_PER_THREAD],
305 T (&preds)[ITEMS_PER_THREAD],
329 int ITEMS_PER_THREAD,
333 FlagT (&head_flags)[ITEMS_PER_THREAD],
334 T (&input)[ITEMS_PER_THREAD],
335 T (&preds)[ITEMS_PER_THREAD],
337 T tile_predecessor_item)
346 tile_predecessor_item :
355 #endif // DOXYGEN_SHOULD_SKIP_THIS 408 int ITEMS_PER_THREAD,
412 FlagT (&head_flags)[ITEMS_PER_THREAD],
413 T (&input)[ITEMS_PER_THREAD],
416 T preds[ITEMS_PER_THREAD];
417 FlagHeads(head_flags, input, preds, flag_op);
477 int ITEMS_PER_THREAD,
481 FlagT (&head_flags)[ITEMS_PER_THREAD],
482 T (&input)[ITEMS_PER_THREAD],
484 T tile_predecessor_item)
486 T preds[ITEMS_PER_THREAD];
487 FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item);
550 int ITEMS_PER_THREAD,
554 FlagT (&tail_flags)[ITEMS_PER_THREAD],
555 T (&input)[ITEMS_PER_THREAD],
568 input[ITEMS_PER_THREAD - 1],
570 (
linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
634 int ITEMS_PER_THREAD,
638 FlagT (&tail_flags)[ITEMS_PER_THREAD],
639 T (&input)[ITEMS_PER_THREAD],
641 T tile_successor_item)
650 tile_successor_item :
655 input[ITEMS_PER_THREAD - 1],
657 (
linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
732 int ITEMS_PER_THREAD,
736 FlagT (&head_flags)[ITEMS_PER_THREAD],
737 FlagT (&tail_flags)[ITEMS_PER_THREAD],
738 T (&input)[ITEMS_PER_THREAD],
747 T preds[ITEMS_PER_THREAD];
770 input[ITEMS_PER_THREAD - 1],
772 (
linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
848 int ITEMS_PER_THREAD,
852 FlagT (&head_flags)[ITEMS_PER_THREAD],
853 FlagT (&tail_flags)[ITEMS_PER_THREAD],
854 T tile_successor_item,
855 T (&input)[ITEMS_PER_THREAD],
864 T preds[ITEMS_PER_THREAD];
883 tile_successor_item :
888 input[ITEMS_PER_THREAD - 1],
890 (
linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
972 int ITEMS_PER_THREAD,
976 FlagT (&head_flags)[ITEMS_PER_THREAD],
977 T tile_predecessor_item,
978 FlagT (&tail_flags)[ITEMS_PER_THREAD],
979 T (&input)[ITEMS_PER_THREAD],
988 T preds[ITEMS_PER_THREAD];
992 tile_predecessor_item :
1006 input[ITEMS_PER_THREAD - 1],
1008 (
linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
1091 int ITEMS_PER_THREAD,
1095 FlagT (&head_flags)[ITEMS_PER_THREAD],
1096 T tile_predecessor_item,
1097 FlagT (&tail_flags)[ITEMS_PER_THREAD],
1098 T tile_successor_item,
1099 T (&input)[ITEMS_PER_THREAD],
1108 T preds[ITEMS_PER_THREAD];
1112 tile_predecessor_item :
1123 tile_successor_item :
1128 input[ITEMS_PER_THREAD - 1],
1130 (
linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
\smemstorage{BlockDiscontinuity}
Shared memory storage layout type (last element from each thread's input)
__device__ __forceinline__ void FlagHeads(FlagT(&head_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], FlagOp flag_op, T tile_predecessor_item)
Sets head flags indicating discontinuities between items partitioned across the thread block.
Optional outer namespace(s)
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host p...
_TempStorage & temp_storage
Shared storage reference.
__device__ __forceinline__ _TempStorage & PrivateStorage()
Internal storage allocator.
__device__ __forceinline__ void FlagTails(FlagT(&tail_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], FlagOp flag_op, T tile_successor_item)
Sets tail flags indicating discontinuities between items partitioned across the thread block.
__device__ __forceinline__ void FlagTails(FlagT(&tail_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], FlagOp flag_op)
Sets tail flags indicating discontinuities between items partitioned across the thread block,...
The BlockDiscontinuity class provides collective methods for flagging discontinuities within an order...
Specialization for when FlagOp has third index param.
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
static __device__ __forceinline__ void FlagHeads(int linear_tid, FlagT(&flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], T(&preds)[ITEMS_PER_THREAD], FlagOp flag_op)
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
__device__ __forceinline__ void FlagHeads(FlagT(&head_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], T(&preds)[ITEMS_PER_THREAD], FlagOp flag_op)
__device__ __forceinline__ BlockDiscontinuity()
Collective constructor using a private static allocation of shared memory as temporary storage.
static __device__ __forceinline__ void FlagTails(int linear_tid, FlagT(&flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], FlagOp flag_op)
unsigned int linear_tid
Linear thread-id.
__device__ __forceinline__ BlockDiscontinuity(TempStorage &temp_storage)
Collective constructor using the specified memory allocation as temporary storage.
The thread block size in threads.
__device__ __forceinline__ void FlagHeadsAndTails(FlagT(&head_flags)[ITEMS_PER_THREAD], T tile_predecessor_item, FlagT(&tail_flags)[ITEMS_PER_THREAD], T tile_successor_item, T(&input)[ITEMS_PER_THREAD], FlagOp flag_op)
Sets both head and tail flags indicating discontinuities between items partitioned across the thread ...
__device__ __forceinline__ void FlagHeadsAndTails(FlagT(&head_flags)[ITEMS_PER_THREAD], FlagT(&tail_flags)[ITEMS_PER_THREAD], T tile_successor_item, T(&input)[ITEMS_PER_THREAD], FlagOp flag_op)
Sets both head and tail flags indicating discontinuities between items partitioned across the thread ...
__device__ __forceinline__ void FlagHeadsAndTails(FlagT(&head_flags)[ITEMS_PER_THREAD], FlagT(&tail_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], FlagOp flag_op)
Sets both head and tail flags indicating discontinuities between items partitioned across the thread ...
__device__ __forceinline__ void FlagHeads(FlagT(&head_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], T(&preds)[ITEMS_PER_THREAD], FlagOp flag_op, T tile_predecessor_item)
Templated unrolling of item comparison (inductive case)
__device__ __forceinline__ void FlagHeadsAndTails(FlagT(&head_flags)[ITEMS_PER_THREAD], T tile_predecessor_item, FlagT(&tail_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], FlagOp flag_op)
Sets both head and tail flags indicating discontinuities between items partitioned across the thread ...
__device__ __forceinline__ void FlagHeads(FlagT(&head_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], FlagOp flag_op)
Sets head flags indicating discontinuities between items partitioned across the thread block,...