OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
block_shuffle.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill. All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * The cub::BlockShuffle class provides collective methods for shuffling data partitioned across a CUDA thread block.
 */

#pragma once

#include "../util_arch.cuh"
#include "../util_ptx.cuh"
#include "../util_macro.cuh"
#include "../util_type.cuh"
#include "../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/// \brief The BlockShuffle class provides collective methods for shuffling data partitioned across a CUDA thread block.
template <
    typename    T,
    int         BLOCK_DIM_X,
    int         BLOCK_DIM_Y     = 1,
    int         BLOCK_DIM_Z     = 1,
    int         PTX_ARCH        = CUB_PTX_ARCH>
class BlockShuffle
{
private:

    /******************************************************************************
     * Constants
     ******************************************************************************/

    enum
    {
        BLOCK_THREADS       = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,

        LOG_WARP_THREADS    = CUB_LOG_WARP_THREADS(PTX_ARCH),
        WARP_THREADS        = 1 << LOG_WARP_THREADS,
        WARPS               = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
    };

    /******************************************************************************
     * Type definitions
     ******************************************************************************/

    /// Shared memory storage layout type (last element from each thread's input)
    struct _TempStorage
    {
        T prev[BLOCK_THREADS];
        T next[BLOCK_THREADS];
    };


public:

    /// \smemstorage{BlockShuffle}
    struct TempStorage : Uninitialized<_TempStorage> {};

private:

    /******************************************************************************
     * Thread fields
     ******************************************************************************/

    /// Shared storage reference
    _TempStorage &temp_storage;

    /// Linear thread-id
    unsigned int linear_tid;


    /******************************************************************************
     * Utility methods
     ******************************************************************************/

    /// Internal storage allocator
    __device__ __forceinline__ _TempStorage& PrivateStorage()
    {
        __shared__ _TempStorage private_storage;
        return private_storage;
    }


public:

    /******************************************************************/

    /// Collective constructor using a private static allocation of shared memory as temporary storage.
    __device__ __forceinline__ BlockShuffle()
    :
        temp_storage(PrivateStorage()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}

    /// Collective constructor using the specified memory allocation as temporary storage.
    __device__ __forceinline__ BlockShuffle(
        TempStorage &temp_storage)      ///< [in] Reference to memory allocation having layout type TempStorage
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}


    /******************************************************************/


    /// Each thread<sub>i</sub> obtains the \p input provided by thread<sub>i+distance</sub>. The offset \p distance may be negative.
    __device__ __forceinline__ void Offset(
        T   input,
        T&  output,
        int distance = 1)
    {
        temp_storage.prev[linear_tid] = input;

        CTA_SYNC();

        if ((linear_tid + distance >= 0) && (linear_tid + distance < BLOCK_THREADS))
            output = temp_storage.prev[linear_tid + distance];
    }


    /// Each thread<sub>i</sub> obtains the \p input provided by thread<sub>i+distance</sub>.
    __device__ __forceinline__ void Rotate(
        T               input,
        T&              output,
        unsigned int    distance = 1)
    {
        temp_storage.prev[linear_tid] = input;

        CTA_SYNC();

        unsigned int offset = linear_tid + distance;
        if (offset >= BLOCK_THREADS)
            offset -= BLOCK_THREADS;

        output = temp_storage.prev[offset];
    }


    /// The thread block rotates its blocked arrangement of input items, shifting it up by one item.
    template <int ITEMS_PER_THREAD>
    __device__ __forceinline__ void Up(
        T (&input)[ITEMS_PER_THREAD],
        T (&prev)[ITEMS_PER_THREAD])
    {
        temp_storage.prev[linear_tid] = input[ITEMS_PER_THREAD - 1];

        CTA_SYNC();

        #pragma unroll
        for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM)
            prev[ITEM] = input[ITEM - 1];

        if (linear_tid > 0)
            prev[0] = temp_storage.prev[linear_tid - 1];
    }


    /// The thread block rotates its blocked arrangement of input items, shifting it up by one item. All threads also receive \p block_suffix, the last item of the block-wide arrangement.
    template <int ITEMS_PER_THREAD>
    __device__ __forceinline__ void Up(
        T (&input)[ITEMS_PER_THREAD],
        T (&prev)[ITEMS_PER_THREAD],
        T &block_suffix)
    {
        Up(input, prev);
        block_suffix = temp_storage.prev[BLOCK_THREADS - 1];
    }


    /// The thread block rotates its blocked arrangement of input items, shifting it down by one item.
    template <int ITEMS_PER_THREAD>
    __device__ __forceinline__ void Down(
        T (&input)[ITEMS_PER_THREAD],
        T (&prev)[ITEMS_PER_THREAD])
    {
        temp_storage.prev[linear_tid] = input[ITEMS_PER_THREAD - 1];

        CTA_SYNC();

        #pragma unroll
        for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM)
            prev[ITEM] = input[ITEM - 1];

        if (linear_tid > 0)
            prev[0] = temp_storage.prev[linear_tid - 1];
    }


    /// The thread block rotates its blocked arrangement of input items, shifting it down by one item.
    template <int ITEMS_PER_THREAD>
    __device__ __forceinline__ void Down(
        T (&input)[ITEMS_PER_THREAD],
        T (&prev)[ITEMS_PER_THREAD],
        T &block_prefix)
    {
        Down(input, prev);
        block_prefix = temp_storage.prev[BLOCK_THREADS - 1];
    }



};

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)

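Usage sketch (not part of the header above): the kernel below illustrates the single-item Offset() interface, in which each thread exchanges one value with a neighbouring thread in the same block. The kernel name ShiftKernel, the block size of 128 threads, and the neighbour-shift use case are illustrative assumptions, not taken from the header.

#include <cub/block/block_shuffle.cuh>

// Hypothetical example kernel: each thread passes its item to its left
// neighbour, i.e. thread i receives the item owned by thread i+1.
__global__ void ShiftKernel(const int *d_in, int *d_out)
{
    // Specialize BlockShuffle for a 1D block of 128 threads owning int items
    typedef cub::BlockShuffle<int, 128> BlockShuffle;

    // Allocate shared memory for BlockShuffle
    __shared__ typename BlockShuffle::TempStorage temp_storage;

    int idx     = blockIdx.x * blockDim.x + threadIdx.x;
    int item    = d_in[idx];
    int shifted = item;   // threads without a valid neighbour keep their own value

    // Each thread i obtains the item provided by thread i+1
    BlockShuffle(temp_storage).Offset(item, shifted, 1);

    d_out[idx] = shifted;
}

Note that Offset() only overwrites the output for threads whose neighbour index falls inside the block, which is why the output is pre-initialized with the thread's own item.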