38 #include "../block/block_load.cuh" 39 #include "../block/block_reduce.cuh" 40 #include "../grid/grid_mapping.cuh" 41 #include "../grid/grid_even_share.cuh" 42 #include "../util_type.cuh" 43 #include "../iterator/cache_modified_input_iterator.cuh" 44 #include "../util_namespace.cuh" 63 int _ITEMS_PER_THREAD,
64 int _VECTOR_LOAD_LENGTH,
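The policy carries the compile-time knobs named above, plus the block size, the block-wide reduction algorithm, and the cache-load modifier described in the symbol reference at the end of this section. Below is a minimal sketch of a concrete instantiation, assuming the five-parameter form used by this CUB version; the numeric values and enumerator choices are illustrative, not taken from the source.

// Hypothetical tuning policy: 256 threads per block, 16 items per thread,
// 4-item vectorized loads, warp-reduction-based block reduce, LDG loads.
// (Parameter order assumed from the excerpt above; check your CUB version.)
typedef AgentReducePolicy<
        256,                            // _BLOCK_THREADS
        16,                             // _ITEMS_PER_THREAD
        4,                              // _VECTOR_LOAD_LENGTH
        BLOCK_REDUCE_WARP_REDUCTIONS,   // _BLOCK_ALGORITHM
        LOAD_LDG>                       // _LOAD_MODIFIER
    MyReducePolicy;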
// AgentReduce template parameters (excerpt)
typename    InputIteratorT,     ///< Random-access iterator type for input
typename    OutputIteratorT,    ///< Random-access iterator type for output

/// The input value type
typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

/// The output value type (falls back to InputT when the output iterator's value type is void)
typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),
    typename std::iterator_traits<InputIteratorT>::value_type,
    typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;

/// Input iterator wrapper type (for applying the cache modifier)
typedef typename If<IsPointer<InputIteratorT>::VALUE,
    CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, InputT, OffsetT>,
    InputIteratorT>::Type WrappedInputIteratorT;
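The OutputT selection above is what lets a reduction run against an output iterator whose value_type is void (e.g., a discarding iterator) by falling back to the input value type. For readers unfamiliar with CUB's If/Equals helpers, here is a sketch of the same selection written with standard C++11 traits; it is a restatement for illustration only, not the library's own code.

#include <iterator>
#include <type_traits>

// Equivalent type selection using std::conditional / std::is_same
template <typename InputIt, typename OutputIt>
using OutputValueT = typename std::conditional<
    std::is_same<typename std::iterator_traits<OutputIt>::value_type, void>::value,
    typename std::iterator_traits<InputIt>::value_type,    // output value type is void: use the input's
    typename std::iterator_traits<OutputIt>::value_type    // otherwise use the output's own value type
>::type;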
// Constants (excerpt)
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,

// Can vectorize according to the policy if the input iterator is a native
// pointer to a primitive type
ATTEMPT_VECTORIZATION = (VECTOR_LOAD_LENGTH > 1) &&
                        (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) &&
                        (IsPointer<InputIteratorT>::VALUE) && Traits<InputT>::PRIMITIVE,
// Whether or not the input is aligned with the vector type
// (specialization for iterators we can vectorize)
template <typename Iterator>
static __device__ __forceinline__ bool IsAligned(
    Iterator        d_in,
    Int2Type<true>  /*can_vectorize*/)
{
    return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0;
}

// Whether or not the input is aligned with the vector type
// (specialization for iterators we cannot vectorize)
template <typename Iterator>
static __device__ __forceinline__ bool IsAligned(
    Iterator        /*d_in*/,
    Int2Type<false> /*can_vectorize*/)
{
    return false;
}
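The alignment test masks the pointer against sizeof(VectorT) - 1, which is only valid because CUDA vector types have power-of-two sizes. A small host-side sketch of the same check follows, using an illustrative 16-byte stand-in for VectorT (the names here are not from the source).

#include <cstdio>

struct Vec4Demo { int x, y, z, w; };   // 16-byte stand-in for a 4-wide VectorT

// True when p is a multiple of vector_size (vector_size must be a power of two)
static bool IsAlignedTo(const void *p, size_t vector_size)
{
    return (reinterpret_cast<size_t>(p) & (vector_size - 1)) == 0;
}

int main()
{
    alignas(16) int data[32];
    std::printf("%d\n", IsAlignedTo(data,     sizeof(Vec4Demo)));  // 1: 16-byte aligned
    std::printf("%d\n", IsAlignedTo(data + 1, sizeof(Vec4Demo)));  // 0: off by 4 bytes
    return 0;
}

In the agent, this same check gates the Int2Type<true/false> dispatch between the vectorized and non-vectorized ConsumeTile overloads below.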
// Consume a full tile of input (non-vectorized)
template <int IS_FIRST_TILE>
__device__ __forceinline__ void ConsumeTile(
    OutputT &thread_aggregate, OffsetT block_offset, int /*valid_items*/,
    Int2Type<true> /*is_full_tile*/, Int2Type<false> /*can_vectorize*/)
{
    OutputT items[ITEMS_PER_THREAD];

    // Load the tile in a block-striped arrangement, then fold it into the
    // thread's running aggregate
    LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_wrapped_in + block_offset, items);
    thread_aggregate = (IS_FIRST_TILE) ?
        ThreadReduce(items, reduction_op) :
        ThreadReduce(items, reduction_op, thread_aggregate);
}

// Consume a full tile of input (vectorized)
template <int IS_FIRST_TILE>
__device__ __forceinline__ void ConsumeTile(
    OutputT &thread_aggregate, OffsetT block_offset, int /*valid_items*/,
    Int2Type<true> /*is_full_tile*/, Int2Type<true> /*can_vectorize*/)
{
    // Number of vector words loaded per thread
    enum { WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH };

    // Fabricate a cache-modified iterator over this thread's slice of the tile,
    // reinterpreted as VectorT words
    InputT *d_in_unqualified = const_cast<InputT*>(d_in) + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH);
    CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, VectorT, OffsetT> d_vec_in(
        reinterpret_cast<VectorT*>(d_in_unqualified));

    // Load items as vector words in a block-striped pattern
    InputT input_items[ITEMS_PER_THREAD];
    VectorT *vec_items = reinterpret_cast<VectorT*>(input_items);
    #pragma unroll
    for (int i = 0; i < WORDS; ++i)
        vec_items[i] = d_vec_in[BLOCK_THREADS * i];

    // Convert from the input type to the output type
    OutputT items[ITEMS_PER_THREAD];
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
        items[i] = input_items[i];

    // Reduce the items within each thread
    thread_aggregate = (IS_FIRST_TILE) ?
        ThreadReduce(items, reduction_op) :
        ThreadReduce(items, reduction_op, thread_aggregate);
}
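To see the access pattern in isolation: each thread starts at threadIdx.x * VECTOR_LOAD_LENGTH scalars into the tile and strides by one block-width of vector words, so consecutive threads issue consecutive, fully coalesced vector loads. Below is a self-contained toy kernel using int4 in place of VectorT; all names and sizes are illustrative, not CUB's.

#include <cuda_runtime.h>

// Each thread loads 16 ints as four int4 words in a block-striped pattern and
// produces its own partial sum (one partial per thread; no block-wide reduction here).
__global__ void VectorizedThreadSums(const int *d_in, int *d_thread_sums)
{
    const int ITEMS_PER_THREAD   = 16;
    const int VECTOR_LOAD_LENGTH = 4;                                // int4
    const int WORDS              = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH;

    // This thread's first vector word within the tile
    const int4 *d_vec_in =
        reinterpret_cast<const int4*>(d_in + threadIdx.x * VECTOR_LOAD_LENGTH);

    int4 vec_items[WORDS];                          // 16-byte-aligned staging storage
    #pragma unroll
    for (int i = 0; i < WORDS; ++i)
        vec_items[i] = d_vec_in[blockDim.x * i];    // stride by one block-width of int4 words

    // Reduce this thread's 16 scalars
    const int *items = reinterpret_cast<const int*>(vec_items);
    int sum = 0;
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
        sum += items[i];

    d_thread_sums[threadIdx.x] = sum;
}

Launched as, say, VectorizedThreadSums<<<1, 128>>>(d_in, d_sums) over a 128 * 16 = 2048-element buffer from cudaMalloc (which is sufficiently aligned), every load is a 16-byte transaction and a warp's transactions are contiguous.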
// Consume a partial tile of input
template <int IS_FIRST_TILE, int CAN_VECTORIZE>
__device__ __forceinline__ void ConsumeTile(
    OutputT &thread_aggregate, OffsetT block_offset, int valid_items,
    Int2Type<false> /*is_full_tile*/, Int2Type<CAN_VECTORIZE> /*can_vectorize*/)
{
    // Partial tile: read items in a block-striped order until exhausted
    int thread_offset = threadIdx.x;

    // Seed the aggregate from this thread's first valid item
    if ((IS_FIRST_TILE) && (thread_offset < valid_items))
    {
        thread_aggregate = d_wrapped_in[block_offset + thread_offset];
        thread_offset += BLOCK_THREADS;
    }

    // Continue reading items (block-striped)
    while (thread_offset < valid_items)
    {
        OutputT item     = d_wrapped_in[block_offset + thread_offset];
        thread_aggregate = reduction_op(thread_aggregate, item);
        thread_offset   += BLOCK_THREADS;
    }
}
// Reduce a contiguous segment of input tiles (specialized for whether loads can
// be vectorized; definition elided in this extract)
template <int CAN_VECTORIZE>
__device__ __forceinline__ OutputT ConsumeRange(
    GridEvenShare<OffsetT> &even_share, Int2Type<CAN_VECTORIZE> can_vectorize);

// Inside ConsumeTiles(): initialize the strip-mined even-share descriptor for this thread block
even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_STRIP_MINE>();
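With GRID_MAPPING_STRIP_MINE, consecutive tiles are dealt out to thread blocks round-robin rather than as one contiguous range per block. The following is only a rough sketch of the traversal the even-share descriptor drives, under the assumption that block b starts at tile b and hops gridDim.x tiles at a time; the real bookkeeping lives in GridEvenShare and ConsumeRange above.

// Illustrative strip-mined traversal; full tiles first, then one trailing partial tile
__device__ void StripMineSketch(int num_items, int tile_items)
{
    int block_offset = blockIdx.x * tile_items;    // this block's first tile
    int block_stride = gridDim.x  * tile_items;    // distance to its next tile

    while (block_offset + tile_items <= num_items)
    {
        // ConsumeTile<...>(thread_aggregate, block_offset, tile_items,
        //                  Int2Type<true>() /*full*/, can_vectorize);
        block_offset += block_stride;
    }

    if (block_offset < num_items)
    {
        // ConsumeTile<...>(thread_aggregate, block_offset, num_items - block_offset,
        //                  Int2Type<false>() /*partial*/, can_vectorize);
    }
}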
Symbol reference (recovered from the source-browser tooltips):

AgentReduce
    Implements a stateful abstraction of CUDA thread blocks for participating in
    device-wide reduction.

Type definitions
    InputT                  std::iterator_traits<InputIteratorT>::value_type.
                            The input value type.
    OutputT                 If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),
                            typename std::iterator_traits<InputIteratorT>::value_type,
                            typename std::iterator_traits<OutputIteratorT>::value_type>::Type.
                            The output value type (InputT when the output iterator's value type is void).
    VectorT                 CubVector<InputT, AgentReducePolicy::VECTOR_LOAD_LENGTH>::Type.
                            Vector type of InputT for data movement.
    WrappedInputIteratorT   If<IsPointer<InputIteratorT>::VALUE,
                            CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, InputT, OffsetT>,
                            InputIteratorT>::Type.
                            Input iterator wrapper type (for applying the cache modifier).
    BlockReduceT            BlockReduce<OutputT, BLOCK_THREADS, AgentReducePolicy::BLOCK_ALGORITHM>.
                            Parameterized BlockReduce primitive.
    _TempStorage            Shared memory type required by this thread block.
    TempStorage             Alias wrapper allowing storage to be unioned.

Constants
    BLOCK_THREADS           Threads per thread block.
    ITEMS_PER_THREAD        Items per thread (per tile of input).
    VECTOR_LOAD_LENGTH      Number of items per vectorized load.
    LOAD_MODIFIER           static const CacheLoadModifier. Cache load modifier for reading input elements.
    BLOCK_ALGORITHM         static const BlockReduceAlgorithm. Cooperative block-wide reduction algorithm to use.

Data members
    _TempStorage &temp_storage          Reference to temp_storage.
    InputIteratorT d_in                 Input data to reduce.
    WrappedInputIteratorT d_wrapped_in  Wrapped input data to reduce.
    ReductionOp reduction_op            Binary reduction operator.

Member functions
    AgentReduce(TempStorage &temp_storage, InputIteratorT d_in, ReductionOp reduction_op)
        Constructor.
    ConsumeTile(OutputT &thread_aggregate, OffsetT block_offset, int, Int2Type<true>, Int2Type<false>)
        Consume a full tile of input (non-vectorized).
    ConsumeTile(OutputT &thread_aggregate, OffsetT block_offset, int, Int2Type<true>, Int2Type<true>)
        Consume a full tile of input (vectorized).
    ConsumeTile(OutputT &thread_aggregate, OffsetT block_offset, int valid_items, Int2Type<false>, Int2Type<CAN_VECTORIZE>)
        Consume a partial tile of input.
    ConsumeRange(GridEvenShare<OffsetT> &even_share, Int2Type<CAN_VECTORIZE> can_vectorize)
        Reduce a contiguous segment of input tiles.
    ConsumeRange(OffsetT block_offset, OffsetT block_end)
        Reduce a contiguous segment of input tiles.
    ConsumeTiles(GridEvenShare<OffsetT> &even_share)
        Reduce the tiles assigned to this block; even_share is the [in] even-share
        descriptor for mapping an equal number of tiles onto each thread block.

Referenced CUB components
    BlockReduce             Provides collective methods for computing a parallel reduction of
                            items partitioned across a CUDA thread block.
    GridEvenShare           Descriptor utility for distributing input among CUDA thread blocks
                            in an "even-share" fashion.
    CacheLoadModifier       Enumeration of cache modifiers for memory load operations
                            (used by CacheModifiedInputIterator).
    ThreadReduce(T *input, ReductionOp reduction_op, T prefix, Int2Type<LENGTH>)
                            Sequential per-thread reduction over a statically sized array.
    CubVector               Exposes a member typedef Type that names the corresponding CUDA
                            vector type, if one exists.
    If                      Type selection (IF ? ThenType : ElseType).
    Int2Type                Allows an integral constant to be treated as a type at compile time
                            (e.g., to achieve static call dispatch).
    Uninitialized           A storage-backing wrapper that allows types with non-trivial
                            constructors to be aliased in unions.
    CUB_MIN(a, b)           Select minimum(a, b).
    OffsetT                 Signed integer type for global offsets (e.g., the [in] total number
                            of input data items).
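Putting the pieces together, a reduction kernel owns the shared memory, constructs the agent, and has each thread block consume its even-share of tiles. The sketch below is simplified from how cub::DeviceReduce's first-pass kernel drives this agent; the kernel name and the direct policy template parameter are illustrative, since the real kernel threads its policy through a chained-policy wrapper.

// One partial result per thread block; a second pass (or a single-block kernel)
// reduces d_block_aggregates down to the final value.
template <
    typename AgentReducePolicyT,
    typename InputIteratorT,
    typename OutputIteratorT,
    typename OffsetT,
    typename ReductionOp>
__global__ void ReduceKernelSketch(
    InputIteratorT          d_in,
    OutputIteratorT         d_block_aggregates,
    GridEvenShare<OffsetT>  even_share,
    ReductionOp             reduction_op)
{
    // Thread-block abstraction specialized by the tuning policy
    typedef AgentReduce<AgentReducePolicyT, InputIteratorT, OutputIteratorT,
                        OffsetT, ReductionOp> AgentReduceT;

    // Shared memory required by the agent
    __shared__ typename AgentReduceT::TempStorage temp_storage;

    // Consume this block's even-share of tiles; the block-wide aggregate is
    // returned to thread 0
    typename AgentReduceT::OutputT block_aggregate =
        AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share);

    if (threadIdx.x == 0)
        d_block_aggregates[blockIdx.x] = block_aggregate;
}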