36#include "../../block/block_radix_sort.cuh"
37#include "../../block/block_discontinuity.cuh"
38#include "../../util_ptx.cuh"
39#include "../../util_namespace.cuh"
76 (PTX_ARCH >= 350) ?
true : false,
78 cudaSharedMemBankSizeFourByte,
105 unsigned int run_begin[BINS];
106 unsigned int run_end[BINS];
117 unsigned int linear_tid;
124 temp_storage(temp_storage.Alias()),
125 linear_tid(
RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
137 temp_storage(temp_storage)
141 __device__ __forceinline__
bool operator()(
const T &a,
const T &b,
int b_index)
146 temp_storage.run_begin[b] = b_index;
147 temp_storage.run_end[a] = b_index;
163 T (&items)[ITEMS_PER_THREAD],
164 CounterT histogram[BINS])
174 int histo_offset = 0;
179 temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE;
180 temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE;
183 if ((BINS %
BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
185 temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE;
186 temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE;
191 int flags[ITEMS_PER_THREAD];
198 if (linear_tid == 0) temp_storage.run_begin[items[0]] = 0;
208 int thread_offset = histo_offset + linear_tid;
209 CounterT count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset];
210 histogram[thread_offset] += count;
214 if ((BINS %
BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
216 int thread_offset = histo_offset + linear_tid;
217 CounterT count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset];
218 histogram[thread_offset] += count;
The BlockDiscontinuity class provides collective methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block.
__device__ __forceinline__ void FlagHeads(FlagT(&head_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], T(&preds)[ITEMS_PER_THREAD], FlagOp flag_op)
The BlockRadixSort class provides collective methods for sorting items partitioned across a CUDA thread block using a radix sorting method.
__device__ __forceinline__ void Sort(KeyT(&keys)[ITEMS_PER_THREAD], int begin_bit=0, int end_bit=sizeof(KeyT) *8)
Performs an ascending block-wide radix sort over a blocked arrangement of keys.
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
Optional outer namespace(s)
The operations exposed by BlockDiscontinuity require a temporary memory allocation of this nested type for thread communication.
Alias wrapper allowing storage to be unioned.
The BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
__device__ __forceinline__ BlockHistogramSort(TempStorage &temp_storage)
Constructor.
__device__ __forceinline__ void Composite(T(&items)[ITEMS_PER_THREAD], CounterT histogram[BINS])
BLOCK_THREADS
The thread block size in threads.
The operations exposed by BlockRadixSort require a temporary memory allocation of this nested type for thread communication.
A simple "NULL" marker type.
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.