#include "../block/block_load.cuh"
#include "../block/block_reduce.cuh"
#include "../grid/grid_mapping.cuh"
#include "../grid/grid_even_share.cuh"
#include "../util_type.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include "../util_namespace.cuh"
// ... (AgentReducePolicy tuning parameters)

    int                     _ITEMS_PER_THREAD,      ///< Items per thread (per tile of input)
    int                     _VECTOR_LOAD_LENGTH,    ///< Number of items per vectorized load

// ... (AgentReduce template parameters)

    typename                InputIteratorT,         ///< Random-access iterator type for input
    typename                OutputIteratorT,        ///< Random-access iterator type for output
    /// The input value type
    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

    /// The output value type (falls back to the input value type when the output iterator's value type is void)
    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),
        typename std::iterator_traits<InputIteratorT>::value_type,
        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;

    /// Vector type of InputT for data movement
    typedef typename CubVector<InputT, AgentReducePolicy::VECTOR_LOAD_LENGTH>::Type VectorT;

    /// Input iterator wrapper type (applies the cache modifier to native pointers; other iterator types pass through unchanged)
    typedef typename If<IsPointer<InputIteratorT>::VALUE,
            CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, InputT, OffsetT>,
            InputIteratorT>::Type
        WrappedInputIteratorT;
        TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,

        // Attempt vectorized loads only when multi-item vectors evenly divide the per-thread workload
        ATTEMPT_VECTORIZATION = (VECTOR_LOAD_LENGTH > 1) &&
                                (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) &&
                                // ...
    // Whether the input is aligned with the vector type (specialization for types we can vectorize)
    template <typename Iterator>
    static __device__ __forceinline__ bool IsAligned(Iterator d_in, Int2Type<true> /*can_vectorize*/)
    {
        // Aligned when the pointer value is a multiple of the vector size
        return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0;
    }
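    // Illustrative numbers (an assumption, not from this file): with InputT == int and
    // VECTOR_LOAD_LENGTH == 4, VectorT is int4 and sizeof(VectorT) == 16, so a base pointer
    // whose address is a multiple of 16 passes the check; otherwise the agent falls back to
    // the non-vectorized path below.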
    // Specialization for types we cannot vectorize: never treated as aligned
    template <typename Iterator>
    static __device__ __forceinline__ bool IsAligned(Iterator /*d_in*/, Int2Type<false> /*can_vectorize*/)
    {
        return false;
    }
    // Consume a full tile of input (specialized for full tiles, non-vectorized loads)
    template <int IS_FIRST_TILE>
    __device__ __forceinline__ void ConsumeTile(OutputT &thread_aggregate, OffsetT block_offset,
        int /*valid_items*/, Int2Type<true> /*is_full_tile*/, Int2Type<false> /*can_vectorize*/)
    {
        OutputT items[ITEMS_PER_THREAD];

        // ... (load this thread's items for the tile)

        // Reduce items within each thread: the first tile seeds the aggregate, later tiles fold into it
        thread_aggregate = (IS_FIRST_TILE) ?
            // ...
    }
    // Consume a full tile of input (specialized for full tiles, vectorized loads)
    template <int IS_FIRST_TILE>
    __device__ __forceinline__ void ConsumeTile(OutputT &thread_aggregate, OffsetT block_offset,
        int /*valid_items*/, Int2Type<true> /*is_full_tile*/, Int2Type<true> /*can_vectorize*/)
    {
        // Number of vector loads each thread issues per tile
        enum { WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH };

        // ... (d_in_unqualified: the raw input pointer advanced to this thread's first vector of the tile)
        CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, VectorT, OffsetT> d_vec_in(
            reinterpret_cast<VectorT*>(d_in_unqualified));

        // Load items as vectors, striped across the thread block
        InputT input_items[ITEMS_PER_THREAD];
        VectorT *vec_items = reinterpret_cast<VectorT*>(input_items);
        for (int i = 0; i < WORDS; ++i)
            vec_items[i] = d_vec_in[BLOCK_THREADS * i];

        // Convert from the input type to the output type
        OutputT items[ITEMS_PER_THREAD];
        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
            items[i] = input_items[i];

        // Reduce items within each thread
        thread_aggregate = (IS_FIRST_TILE) ?
            // ...
    }
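    // Illustrative numbers (an assumption, not from this file): with ITEMS_PER_THREAD == 16 and
    // VECTOR_LOAD_LENGTH == 4, WORDS == 4, so each thread issues four VectorT loads per full tile,
    // spaced BLOCK_THREADS vectors apart, while consecutive threads read adjacent vectors.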
    // Consume a partial tile of input
    template <int IS_FIRST_TILE, int CAN_VECTORIZE>
    __device__ __forceinline__ void ConsumeTile(OutputT &thread_aggregate, OffsetT block_offset,
        int valid_items, Int2Type<false> /*is_full_tile*/, Int2Type<CAN_VECTORIZE> /*can_vectorize*/)
    {
        // Items are consumed block-striped across threads
        int thread_offset = threadIdx.x;

        // Seed this thread's aggregate from its first in-range item
        if ((IS_FIRST_TILE) && (thread_offset < valid_items))
        {
            thread_aggregate = d_wrapped_in[block_offset + thread_offset];
            thread_offset += BLOCK_THREADS;
        }

        // Fold each remaining in-range item into the aggregate
        while (thread_offset < valid_items)
        {
            OutputT item = d_wrapped_in[block_offset + thread_offset];
            thread_aggregate = reduction_op(thread_aggregate, item);
            thread_offset += BLOCK_THREADS;
        }
    }
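    // Illustrative numbers (an assumption, not from this file): with BLOCK_THREADS == 256 and
    // valid_items == 600, thread 3 reads items 3, 259, and 515 of the tile and then stops,
    // since its next offset (771) is out of range.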
    // Reduce a contiguous segment of input tiles
    template <int CAN_VECTORIZE>
    __device__ __forceinline__ OutputT ConsumeRange(
        GridEvenShare<OffsetT>  &even_share,        ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
        Int2Type<CAN_VECTORIZE> can_vectorize)      ///< Whether or not we can vectorize loads
    {
        // ...
    }

    // ...

    // Reduce this thread block's even-share of input tiles
    __device__ __forceinline__ OutputT ConsumeTiles(GridEvenShare<OffsetT> &even_share)
    {
        // Initialize the block's strip-mined (block-striped) mapping of tiles
        even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_STRIP_MINE>();

        // ... (dispatch to ConsumeRange, vectorizing only if the input is suitably aligned)
    }
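For orientation, the sketch below shows the kind of reduction kernel that typically instantiates this agent: each thread block consumes its even-share of tiles via ConsumeTiles() and writes one partial result. It is a minimal illustration only, written as if inside the cub namespace like the listing above; the kernel name, the specific policy values (256 threads, 16 items per thread, 4-item vector loads), and the choice of BLOCK_REDUCE_WARP_REDUCTIONS / LOAD_LDG are assumptions for the example, not taken from this file.

// Illustrative kernel (not part of this file): one partial reduction per thread block.
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetT, typename ReductionOp>
__global__ void ExampleReduceKernel(
    InputIteratorT          d_in,           // Input data to reduce
    OutputIteratorT         d_block_out,    // One partial result per thread block
    GridEvenShare<OffsetT>  even_share,     // Even-share descriptor prepared by the dispatch layer
    ReductionOp             reduction_op)   // Binary reduction operator
{
    // Hypothetical tuning policy for this example
    typedef AgentReducePolicy<256, 16, 4, BLOCK_REDUCE_WARP_REDUCTIONS, LOAD_LDG> PolicyT;
    typedef AgentReduce<PolicyT, InputIteratorT, OutputIteratorT, OffsetT, ReductionOp> AgentReduceT;

    // Shared memory for the block-wide reduction
    __shared__ typename AgentReduceT::TempStorage temp_storage;

    // Consume this block's tiles and obtain the block-wide aggregate
    typename AgentReduceT::OutputT block_aggregate =
        AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share);

    // Thread 0 records the partial result for this block
    if (threadIdx.x == 0)
        d_block_out[blockIdx.x] = block_aggregate;
}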
Symbols referenced in this listing:

BlockReduce - The BlockReduce class provides collective methods for computing a parallel reduction of items partitioned across a CUDA thread block.
CacheLoadModifier - Enumeration of cache modifiers for memory load operations.
CUB_MIN(a, b) - Select minimum(a, b).
__device__ __forceinline__ T ThreadReduce(T *input, ReductionOp reduction_op, T prefix, Int2Type<LENGTH>)
CUB_NS_PREFIX - Optional outer namespace(s).
GridEvenShare<OffsetT> even_share - [in] Even-share descriptor for mapping an equal number of tiles onto each thread block.
OffsetT - [in] Total number of input data items.
static const CacheLoadModifier LOAD_MODIFIER - Cache load modifier for reading input elements.
static const BlockReduceAlgorithm BLOCK_ALGORITHM - Cooperative block-wide reduction algorithm to use.
BLOCK_THREADS - Threads per thread block.
VECTOR_LOAD_LENGTH - Number of items per vectorized load.
ITEMS_PER_THREAD - Items per thread (per tile of input).
TempStorage - Alias wrapper allowing storage to be unioned.
_TempStorage - Shared memory type required by this thread block.
AgentReduce - Implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction.
__device__ __forceinline__ void ConsumeTile(OutputT &thread_aggregate, OffsetT block_offset, int, Int2Type<true>, Int2Type<true>) - Full-tile consumption, vectorized.
BlockReduce<OutputT, BLOCK_THREADS, AgentReducePolicy::BLOCK_ALGORITHM> BlockReduceT - Parameterized BlockReduce primitive.
ReductionOp reduction_op - Binary reduction operator.
__device__ __forceinline__ OutputT ConsumeRange(GridEvenShare<OffsetT> &even_share, Int2Type<CAN_VECTORIZE> can_vectorize) - Reduce a contiguous segment of input tiles.
__device__ __forceinline__ void ConsumeTile(OutputT &thread_aggregate, OffsetT block_offset, int, Int2Type<true>, Int2Type<false>) - Full-tile consumption, non-vectorized.
WrappedInputIteratorT d_wrapped_in - Wrapped input data to reduce.
__device__ __forceinline__ AgentReduce(TempStorage &temp_storage, InputIteratorT d_in, ReductionOp reduction_op) - Constructor.
CubVector<InputT, AgentReducePolicy::VECTOR_LOAD_LENGTH>::Type VectorT - Vector type of InputT for data movement.
__device__ __forceinline__ OutputT ConsumeTiles(GridEvenShare<OffsetT> &even_share)
__device__ __forceinline__ OutputT ConsumeRange(OffsetT block_offset, OffsetT block_end) - Reduce a contiguous segment of input tiles.
InputIteratorT d_in - Input data to reduce.
If<IsPointer<InputIteratorT>::VALUE, CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, InputT, OffsetT>, InputIteratorT>::Type WrappedInputIteratorT - Input iterator wrapper type (for applying cache modifier).
_TempStorage &temp_storage - Reference to temp_storage.
__device__ __forceinline__ void ConsumeTile(OutputT &thread_aggregate, OffsetT block_offset, int valid_items, Int2Type<false>, Int2Type<CAN_VECTORIZE>) - Partial-tile consumption.
std::iterator_traits<InputIteratorT>::value_type InputT - The input value type.
If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), typename std::iterator_traits<InputIteratorT>::value_type, typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT - The output value type.
BlockReduce::TempStorage - The temporary storage type required by BlockReduce for thread communication (typically placed in shared memory).
CubVector - Exposes a member typedef Type that names the corresponding CUDA vector type, if one exists.
GridEvenShare - A descriptor utility for distributing input among CUDA thread blocks in an "even-share" fashion.
If - Type selection (IF ? ThenType : ElseType).
Int2Type - Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values).
Uninitialized - A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
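Since several entries above reference the BlockReduce primitive that this agent builds on, here is a minimal stand-alone sketch of its use. The kernel name, the 128-thread block size, and the 4 items per thread are assumptions chosen for illustration, not values taken from this file.

#include <cub/block/block_load.cuh>
#include <cub/block/block_reduce.cuh>

// Illustrative kernel: each block of 128 threads sums 128 * 4 consecutive ints.
__global__ void ExampleBlockSumKernel(const int *d_in, int *d_block_sums)
{
    typedef cub::BlockReduce<int, 128> BlockReduceT;

    // Shared memory required by the collective
    __shared__ typename BlockReduceT::TempStorage temp_storage;

    // Each thread loads 4 consecutive items (blocked arrangement)
    int thread_data[4];
    cub::LoadDirectBlocked(threadIdx.x, d_in + blockIdx.x * 128 * 4, thread_data);

    // Block-wide sum; the result is only valid in thread 0
    int block_sum = BlockReduceT(temp_storage).Sum(thread_data);

    if (threadIdx.x == 0)
        d_block_sums[blockIdx.x] = block_sum;
}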