36#include "../util_type.cuh"
37#include "../util_ptx.cuh"
38#include "../util_namespace.cuh"
84 return private_storage;
89 template <typename FlagOp, bool HAS_PARAM = BinaryOpHasIdxParam<T, FlagOp>::HAS_PARAM>
93 static __device__ __forceinline__ T FlagT(FlagOp flag_op,
const T &a,
const T &b,
int idx)
95 return flag_op(b, a, idx);
100 template <
typename FlagOp>
104 static __device__ __forceinline__ T FlagT(FlagOp flag_op,
const T &a,
const T &b,
int )
106 return flag_op(b, a);
111 template <
int ITERATION,
int MAX_ITERATIONS>
116 int ITEMS_PER_THREAD,
121 FlagT (&flags)[ITEMS_PER_THREAD],
122 T (&input)[ITEMS_PER_THREAD],
123 T (&preds)[ITEMS_PER_THREAD],
126 preds[ITERATION] = input[ITERATION - 1];
139 int ITEMS_PER_THREAD,
144 FlagT (&flags)[ITEMS_PER_THREAD],
145 T (&input)[ITEMS_PER_THREAD],
151 input[ITERATION + 1],
152 (
linear_tid * ITEMS_PER_THREAD) + ITERATION + 1);
160 template <
int MAX_ITERATIONS>
161 struct Iterate<MAX_ITERATIONS, MAX_ITERATIONS>
165 int ITEMS_PER_THREAD,
168 static __device__ __forceinline__
void FlagHeads(
170 FlagT (&)[ITEMS_PER_THREAD],
171 T (&)[ITEMS_PER_THREAD],
172 T (&)[ITEMS_PER_THREAD],
178 int ITEMS_PER_THREAD,
181 static __device__ __forceinline__
void FlagTails(
183 FlagT (&)[ITEMS_PER_THREAD],
184 T (&)[ITEMS_PER_THREAD],
240#ifndef DOXYGEN_SHOULD_SKIP_THIS
243 int ITEMS_PER_THREAD,
247 FlagT (&head_flags)[ITEMS_PER_THREAD],
248 T (&input)[ITEMS_PER_THREAD],
249 T (&preds)[ITEMS_PER_THREAD],
273 int ITEMS_PER_THREAD,
277 FlagT (&head_flags)[ITEMS_PER_THREAD],
278 T (&input)[ITEMS_PER_THREAD],
279 T (&preds)[ITEMS_PER_THREAD],
281 T tile_predecessor_item)
290 tile_predecessor_item :
303 int ITEMS_PER_THREAD,
307 FlagT (&head_flags)[ITEMS_PER_THREAD],
308 T (&input)[ITEMS_PER_THREAD],
311 T preds[ITEMS_PER_THREAD];
312 FlagHeads(head_flags, input, preds, flag_op);
317 int ITEMS_PER_THREAD,
321 FlagT (&head_flags)[ITEMS_PER_THREAD],
322 T (&input)[ITEMS_PER_THREAD],
324 T tile_predecessor_item)
326 T preds[ITEMS_PER_THREAD];
327 FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item);
333 int ITEMS_PER_THREAD,
337 FlagT (&tail_flags)[ITEMS_PER_THREAD],
338 T (&input)[ITEMS_PER_THREAD],
351 input[ITEMS_PER_THREAD - 1],
353 (
linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
361 int ITEMS_PER_THREAD,
365 FlagT (&tail_flags)[ITEMS_PER_THREAD],
366 T (&input)[ITEMS_PER_THREAD],
368 T tile_successor_item)
377 tile_successor_item :
382 input[ITEMS_PER_THREAD - 1],
384 (
linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
392 int ITEMS_PER_THREAD,
396 FlagT (&head_flags)[ITEMS_PER_THREAD],
397 FlagT (&tail_flags)[ITEMS_PER_THREAD],
398 T (&input)[ITEMS_PER_THREAD],
407 T preds[ITEMS_PER_THREAD];
430 input[ITEMS_PER_THREAD - 1],
432 (
linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
443 int ITEMS_PER_THREAD,
447 FlagT (&head_flags)[ITEMS_PER_THREAD],
448 FlagT (&tail_flags)[ITEMS_PER_THREAD],
449 T tile_successor_item,
450 T (&input)[ITEMS_PER_THREAD],
459 T preds[ITEMS_PER_THREAD];
478 tile_successor_item :
483 input[ITEMS_PER_THREAD - 1],
485 (
linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
495 int ITEMS_PER_THREAD,
499 FlagT (&head_flags)[ITEMS_PER_THREAD],
500 T tile_predecessor_item,
501 FlagT (&tail_flags)[ITEMS_PER_THREAD],
502 T (&input)[ITEMS_PER_THREAD],
511 T preds[ITEMS_PER_THREAD];
515 tile_predecessor_item :
529 input[ITEMS_PER_THREAD - 1],
531 (
linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
542 int ITEMS_PER_THREAD,
546 FlagT (&head_flags)[ITEMS_PER_THREAD],
547 T tile_predecessor_item,
548 FlagT (&tail_flags)[ITEMS_PER_THREAD],
549 T tile_successor_item,
550 T (&input)[ITEMS_PER_THREAD],
559 T preds[ITEMS_PER_THREAD];
563 tile_predecessor_item :
574 tile_successor_item :
579 input[ITEMS_PER_THREAD - 1],
581 (
linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
@ BLOCK_THREADS
The thread block size in threads.
__device__ __forceinline__ _TempStorage & PrivateStorage()
Internal storage allocator.
__device__ __forceinline__ void FlagHeads(FlagT(&head_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], T(&preds)[ITEMS_PER_THREAD], FlagOp flag_op, T tile_predecessor_item)
__device__ __forceinline__ BlockAdjacentDifference()
Collective constructor using a private static allocation of shared memory as temporary storage.
__device__ __forceinline__ void FlagHeads(FlagT(&head_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], T(&preds)[ITEMS_PER_THREAD], FlagOp flag_op)
unsigned int linear_tid
Linear thread-id.
__device__ __forceinline__ void FlagHeadsAndTails(FlagT(&head_flags)[ITEMS_PER_THREAD], T tile_predecessor_item, FlagT(&tail_flags)[ITEMS_PER_THREAD], T tile_successor_item, T(&input)[ITEMS_PER_THREAD], FlagOp flag_op)
__device__ __forceinline__ void FlagHeads(FlagT(&head_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], FlagOp flag_op, T tile_predecessor_item)
__device__ __forceinline__ BlockAdjacentDifference(TempStorage &temp_storage)
Collective constructor using the specified memory allocation as temporary storage.
__device__ __forceinline__ void FlagHeadsAndTails(FlagT(&head_flags)[ITEMS_PER_THREAD], T tile_predecessor_item, FlagT(&tail_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], FlagOp flag_op)
__device__ __forceinline__ void FlagTails(FlagT(&tail_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], FlagOp flag_op)
__device__ __forceinline__ void FlagHeadsAndTails(FlagT(&head_flags)[ITEMS_PER_THREAD], FlagT(&tail_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], FlagOp flag_op)
__device__ __forceinline__ void FlagTails(FlagT(&tail_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], FlagOp flag_op, T tile_successor_item)
__device__ __forceinline__ void FlagHeads(FlagT(&head_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], FlagOp flag_op)
__device__ __forceinline__ void FlagHeadsAndTails(FlagT(&head_flags)[ITEMS_PER_THREAD], FlagT(&tail_flags)[ITEMS_PER_THREAD], T tile_successor_item, T(&input)[ITEMS_PER_THREAD], FlagOp flag_op)
_TempStorage & temp_storage
Shared storage reference.
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
Optional outer namespace(s)
Specialization for when FlagOp has third index param.
Templated unrolling of item comparison (inductive case)
static __device__ __forceinline__ void FlagHeads(int linear_tid, FlagT(&flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], T(&preds)[ITEMS_PER_THREAD], FlagOp flag_op)
static __device__ __forceinline__ void FlagTails(int linear_tid, FlagT(&flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], FlagOp flag_op)
\smemstorage{BlockDiscontinuity}
Shared memory storage layout type (last element from each thread's input)
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host p...