OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
block_raking_layout.cuh
Go to the documentation of this file.
1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
35 #pragma once
36 
37 #include "../util_macro.cuh"
38 #include "../util_arch.cuh"
39 #include "../util_type.cuh"
40 #include "../util_namespace.cuh"
41 
43 CUB_NS_PREFIX
44 
46 namespace cub {
47 
63 template <
64  typename T,
65  int BLOCK_THREADS,
66  int PTX_ARCH = CUB_PTX_ARCH>
68 {
69  //---------------------------------------------------------------------
70  // Constants and type definitions
71  //---------------------------------------------------------------------
72 
73  enum
74  {
76  SHARED_ELEMENTS = BLOCK_THREADS,
77 
79  MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)),
80 
83 
86 
88  HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0),
89 
92  (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(PTX_ARCH) :
93  1,
94 
97 
100 
103  };
104 
105 
109  struct __align__(16) _TempStorage
110  {
112  };
113 
115  struct TempStorage : Uninitialized<_TempStorage> {};
116 
117 
121  static __device__ __forceinline__ T* PlacementPtr(
122  TempStorage &temp_storage,
123  unsigned int linear_tid)
124  {
125  // Offset for partial
126  unsigned int offset = linear_tid;
127 
128  // Add in one padding element for every segment
129  if (USE_SEGMENT_PADDING > 0)
130  {
131  offset += offset / SEGMENT_LENGTH;
132  }
133 
134  // Incorporating a block of padding partials every shared memory segment
135  return temp_storage.Alias().buff + offset;
136  }
137 
138 
142  static __device__ __forceinline__ T* RakingPtr(
143  TempStorage &temp_storage,
144  unsigned int linear_tid)
145  {
146  return temp_storage.Alias().buff + (linear_tid * (SEGMENT_LENGTH + USE_SEGMENT_PADDING));
147  }
148 };
149 
150 } // CUB namespace
151 CUB_NS_POSTFIX // Optional outer namespace(s)
152 
Alias wrapper allowing storage to be unioned.
Maximum number of warp-synchronous raking threads.
Optional outer namespace(s)
Pad each segment length with one element if segment length is not relatively prime to warp size and c...
Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LE...
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host p...
Definition: util_arch.cuh:53
Number of raking elements per warp-synchronous raking thread (rounded up)
struct __align__(16) _TempStorage
Shared memory storage type.
BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thre...
The total number of elements that need to be cooperatively reduced.
Whether or not we need bounds checking during raking (the number of reduction elements is not a multi...
static __device__ __forceinline__ T * PlacementPtr(TempStorage &temp_storage, unsigned int linear_tid)
Returns the location for the calling thread to place data into the grid.
Total number of elements in the raking grid.
static __device__ __forceinline__ T * RakingPtr(TempStorage &temp_storage, unsigned int linear_tid)
Returns the location for the calling thread to begin sequential raking.
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
Definition: util_type.cuh:634
#define CUB_MIN(a, b)
Select minimum(a, b)
Definition: util_macro.cuh:66
Degree of bank conflicts (e.g., 4-way)
Whether we will have bank conflicts (technically we should find out if the GCD is > 1)