OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
 
Loading...
Searching...
No Matches
block_raking_layout.cuh
Go to the documentation of this file.
1/******************************************************************************
2 * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of the NVIDIA CORPORATION nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *
27 ******************************************************************************/
28
35#pragma once
36
37#include "../util_macro.cuh"
38#include "../util_arch.cuh"
39#include "../util_type.cuh"
40#include "../util_namespace.cuh"
41
43CUB_NS_PREFIX
44
46namespace cub {
47
63template <
64 typename T,
65 int BLOCK_THREADS,
66 int PTX_ARCH = CUB_PTX_ARCH>
68{
69 //---------------------------------------------------------------------
70 // Constants and type definitions
71 //---------------------------------------------------------------------
72
73 enum
74 {
76 SHARED_ELEMENTS = BLOCK_THREADS,
77
79 MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)),
80
83
86
88 HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0),
89
92 (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(PTX_ARCH) :
93 1,
94
97
100
103 };
104
105
109 struct __align__(16) _TempStorage
110 {
112 };
113
115 struct TempStorage : Uninitialized<_TempStorage> {};
116
117
121 static __device__ __forceinline__ T* PlacementPtr(
122 TempStorage &temp_storage,
123 unsigned int linear_tid)
124 {
125 // Offset for partial
126 unsigned int offset = linear_tid;
127
128 // Add in one padding element for every segment
129 if (USE_SEGMENT_PADDING > 0)
130 {
131 offset += offset / SEGMENT_LENGTH;
132 }
133
134 // Incorporating a block of padding partials every shared memory segment
135 return temp_storage.Alias().buff + offset;
136 }
137
138
142 static __device__ __forceinline__ T* RakingPtr(
143 TempStorage &temp_storage,
144 unsigned int linear_tid)
145 {
146 return temp_storage.Alias().buff + (linear_tid * (SEGMENT_LENGTH + USE_SEGMENT_PADDING));
147 }
148};
149
150} // CUB namespace
151CUB_NS_POSTFIX // Optional outer namespace(s)
152
#define CUB_MIN(a, b)
Select minimum(a, b)
Optional outer namespace(s)
Alias wrapper allowing storage to be unioned.
BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thre...
struct __align__(16) _TempStorage
Shared memory storage type.
@ USE_SEGMENT_PADDING
Pad each segment length with one element if segment length is not relatively prime to warp size and c...
@ UNGUARDED
Whether or not we need bounds checking during raking (the number of reduction elements is not a multi...
@ HAS_CONFLICTS
Whether we will have bank conflicts (technically we should find out if the GCD is > 1)
@ MAX_RAKING_THREADS
Maximum number of warp-synchronous raking threads.
@ SHARED_ELEMENTS
The total number of elements that need to be cooperatively reduced.
@ SEGMENT_LENGTH
Number of raking elements per warp-synchronous raking thread (rounded up)
@ CONFLICT_DEGREE
Degree of bank conflicts (e.g., 4-way)
@ RAKING_THREADS
Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LE...
@ GRID_ELEMENTS
Total number of elements in the raking grid.
static __device__ __forceinline__ T * PlacementPtr(TempStorage &temp_storage, unsigned int linear_tid)
Returns the location for the calling thread to place data into the grid.
static __device__ __forceinline__ T * RakingPtr(TempStorage &temp_storage, unsigned int linear_tid)
Returns the location for the calling thread to begin sequential raking.
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host p...
Definition util_arch.cuh:53