OpenFPM_pdata  3.1.0
Project that contain the implementation of distributed structures
block_exchange.cuh
Go to the documentation of this file.
1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
34 #pragma once
35 
36 #include "../util_ptx.cuh"
37 #include "../util_arch.cuh"
38 #include "../util_macro.cuh"
39 #include "../util_type.cuh"
40 #include "../util_namespace.cuh"
41 
43 CUB_NS_PREFIX
44 
46 namespace cub {
47 
108 template <
109  typename InputT,
110  int BLOCK_DIM_X,
111  int ITEMS_PER_THREAD,
112  bool WARP_TIME_SLICING = false,
113  int BLOCK_DIM_Y = 1,
114  int BLOCK_DIM_Z = 1,
115  int PTX_ARCH = CUB_PTX_ARCH>
117 {
118 private:
119 
120  /******************************************************************************
121  * Constants
122  ******************************************************************************/
123 
    /// Compile-time constants governing the exchange layout
    enum
    {
        /// Total number of threads in the thread block
        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,

        LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH),
        WARP_THREADS = 1 << LOG_WARP_THREADS,
        // Number of (possibly partial) warps in the block (ceiling division)
        WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,

        LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH),
        SMEM_BANKS = 1 << LOG_SMEM_BANKS,

        /// Total number of items in the block-wide tile
        TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,

        // When warp-timeslicing, the exchange is serialized into one round per warp
        TIME_SLICES = (WARP_TIME_SLICING) ? WARPS : 1,

        // Threads/items participating in a single time slice (whole block when not slicing)
        TIME_SLICED_THREADS = (WARP_TIME_SLICING) ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS,
        TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ITEMS_PER_THREAD,

        // Per-warp slice extents (used for warp-striped exchanges regardless of slicing)
        WARP_TIME_SLICED_THREADS = CUB_MIN(BLOCK_THREADS, WARP_THREADS),
        WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD,

        // Insert padding to avoid bank conflicts during raking when items per thread is a power of two and > 4 (otherwise we can typically use 128b loads)
        INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (PowerOfTwo<ITEMS_PER_THREAD>::VALUE),
        PADDING_ITEMS = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0,
    };
151 
152  /******************************************************************************
153  * Type definitions
154  ******************************************************************************/
155 
    /// Shared memory storage layout type (16-byte aligned; sized for one time slice plus any bank-conflict padding)
    struct __align__(16) _TempStorage
    {
        InputT buff[TIME_SLICED_ITEMS + PADDING_ITEMS];
    };
161 
162 public:
163 
    /// \smemstorage{BlockExchange}
    struct TempStorage : Uninitialized<_TempStorage> {};
166 
167 private:
168 
169 
170  /******************************************************************************
171  * Thread fields
172  ******************************************************************************/
173 
    /// Shared storage reference
    _TempStorage &temp_storage;

    /// Linear thread-id within the (possibly multidimensional) thread block
    unsigned int linear_tid;
    /// Lane index within the warp
    unsigned int lane_id;
    /// Warp index within the block (0 when the block is a single warp)
    unsigned int warp_id;
    /// Starting offset of this warp's region in the shared buffer
    unsigned int warp_offset;
182 
183 
184  /******************************************************************************
185  * Utility methods
186  ******************************************************************************/
187 
    /// Internal storage allocator: provides a per-block __shared__ buffer when the
    /// caller does not supply a TempStorage of its own
    __device__ __forceinline__ _TempStorage& PrivateStorage()
    {
        __shared__ _TempStorage private_storage;
        return private_storage;
    }
194 
195 
199  template <typename OutputT>
200  __device__ __forceinline__ void BlockedToStriped(
201  InputT input_items[ITEMS_PER_THREAD],
202  OutputT output_items[ITEMS_PER_THREAD],
203  Int2Type<false> /*time_slicing*/)
204  {
205  #pragma unroll
206  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
207  {
208  int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
209  if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
210  temp_storage.buff[item_offset] = input_items[ITEM];
211  }
212 
213  CTA_SYNC();
214 
215  #pragma unroll
216  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
217  {
218  int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
219  if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
220  output_items[ITEM] = temp_storage.buff[item_offset];
221  }
222  }
223 
224 
    /**
     * Transposes data items from <em>blocked</em> to <em>striped</em> arrangement.
     * Specialized for warp-timeslicing: only one warp's worth of shared memory is
     * provisioned, so the exchange proceeds in TIME_SLICES rounds, one warp
     * writing per round while all threads harvest the overlapping strips.
     */
    template <typename OutputT>
    __device__ __forceinline__ void BlockedToStriped(
        InputT input_items[ITEMS_PER_THREAD],
        OutputT output_items[ITEMS_PER_THREAD],
        Int2Type<true> /*time_slicing*/)
    {
        // Per-thread staging for results gathered across rounds
        InputT temp_items[ITEMS_PER_THREAD];

        #pragma unroll
        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
        {
            const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS;
            const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS;

            // Barrier before reusing the shared buffer for this round
            CTA_SYNC();

            // Only the warp owning this slice deposits its blocked items
            if (warp_id == SLICE)
            {
                #pragma unroll
                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
                {
                    int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                    temp_storage.buff[item_offset] = input_items[ITEM];
                }
            }

            CTA_SYNC();

            #pragma unroll
            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
            {
                // Read a strip of items
                const int STRIP_OFFSET = ITEM * BLOCK_THREADS;
                const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS;

                // Harvest only the portion of this strip covered by the current slice
                if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
                {
                    int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
                    if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
                    {
                        if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                        temp_items[ITEM] = temp_storage.buff[item_offset];
                    }
                }
            }
        }

        // Copy staged results to the output
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            output_items[ITEM] = temp_items[ITEM];
        }
    }
283 
284 
288  template <typename OutputT>
289  __device__ __forceinline__ void BlockedToWarpStriped(
290  InputT input_items[ITEMS_PER_THREAD],
291  OutputT output_items[ITEMS_PER_THREAD],
292  Int2Type<false> /*time_slicing*/)
293  {
294  #pragma unroll
295  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
296  {
297  int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);
298  if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
299  temp_storage.buff[item_offset] = input_items[ITEM];
300  }
301 
302  WARP_SYNC(0xffffffff);
303 
304  #pragma unroll
305  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
306  {
307  int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
308  if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
309  output_items[ITEM] = temp_storage.buff[item_offset];
310  }
311  }
312 
    /**
     * Transposes data items from <em>blocked</em> to <em>warp-striped</em> arrangement.
     * Specialized for warp-timeslicing: all warps share a single warp-sized shared
     * buffer, so they take turns — warp 0 first (no preceding CTA barrier needed),
     * then warps 1..TIME_SLICES-1 each behind a CTA-wide barrier.
     */
    template <typename OutputT>
    __device__ __forceinline__ void BlockedToWarpStriped(
        InputT input_items[ITEMS_PER_THREAD],
        OutputT output_items[ITEMS_PER_THREAD],
        Int2Type<true> /*time_slicing*/)
    {
        if (warp_id == 0)
        {
            // Warp 0's round: write blocked, warp-barrier, read warp-striped
            #pragma unroll
            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
            {
                int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
                if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                temp_storage.buff[item_offset] = input_items[ITEM];
            }

            WARP_SYNC(0xffffffff);

            #pragma unroll
            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
            {
                int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
                if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                output_items[ITEM] = temp_storage.buff[item_offset];
            }
        }

        // Remaining warps take their turn, each round fenced by a CTA barrier
        #pragma unroll
        for (unsigned int SLICE = 1; SLICE < TIME_SLICES; ++SLICE)
        {
            CTA_SYNC();

            if (warp_id == SLICE)
            {
                #pragma unroll
                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
                {
                    int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                    temp_storage.buff[item_offset] = input_items[ITEM];
                }

                WARP_SYNC(0xffffffff);

                #pragma unroll
                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
                {
                    int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                    output_items[ITEM] = temp_storage.buff[item_offset];
                }
            }
        }
    }
370 
371 
375  template <typename OutputT>
376  __device__ __forceinline__ void StripedToBlocked(
377  InputT input_items[ITEMS_PER_THREAD],
378  OutputT output_items[ITEMS_PER_THREAD],
379  Int2Type<false> /*time_slicing*/)
380  {
381  #pragma unroll
382  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
383  {
384  int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
385  if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
386  temp_storage.buff[item_offset] = input_items[ITEM];
387  }
388 
389  CTA_SYNC();
390 
391  // No timeslicing
392  #pragma unroll
393  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
394  {
395  int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
396  if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
397  output_items[ITEM] = temp_storage.buff[item_offset];
398  }
399  }
400 
401 
    /**
     * Transposes data items from <em>striped</em> to <em>blocked</em> arrangement.
     * Specialized for warp-timeslicing: the shared buffer holds only one slice,
     * so each round all threads write the overlapping strips and the owning warp
     * harvests its blocked segment.
     */
    template <typename OutputT>
    __device__ __forceinline__ void StripedToBlocked(
        InputT input_items[ITEMS_PER_THREAD],
        OutputT output_items[ITEMS_PER_THREAD],
        Int2Type<true> /*time_slicing*/)
    {
        // Warp time-slicing
        InputT temp_items[ITEMS_PER_THREAD];

        #pragma unroll
        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
        {
            const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS;
            const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS;

            // Barrier before reusing the shared buffer for this round
            CTA_SYNC();

            #pragma unroll
            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
            {
                // Write a strip of items
                const int STRIP_OFFSET = ITEM * BLOCK_THREADS;
                const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS;

                // Deposit only the portion of this strip covered by the current slice
                if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
                {
                    int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
                    if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
                    {
                        if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                        temp_storage.buff[item_offset] = input_items[ITEM];
                    }
                }
            }

            CTA_SYNC();

            // The warp owning this slice gathers its blocked segment
            if (warp_id == SLICE)
            {
                #pragma unroll
                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
                {
                    int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                    temp_items[ITEM] = temp_storage.buff[item_offset];
                }
            }
        }

        // Copy staged results to the output
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            output_items[ITEM] = temp_items[ITEM];
        }
    }
461 
462 
466  template <typename OutputT>
467  __device__ __forceinline__ void WarpStripedToBlocked(
468  InputT input_items[ITEMS_PER_THREAD],
469  OutputT output_items[ITEMS_PER_THREAD],
470  Int2Type<false> /*time_slicing*/)
471  {
472  #pragma unroll
473  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
474  {
475  int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
476  if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
477  temp_storage.buff[item_offset] = input_items[ITEM];
478  }
479 
480  WARP_SYNC(0xffffffff);
481 
482  #pragma unroll
483  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
484  {
485  int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);
486  if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
487  output_items[ITEM] = temp_storage.buff[item_offset];
488  }
489  }
490 
491 
    /**
     * Transposes data items from <em>warp-striped</em> to <em>blocked</em> arrangement.
     * Specialized for warp-timeslicing: warps share the single warp-sized buffer and
     * take turns in slice order, each round fenced by a CTA-wide barrier.
     */
    template <typename OutputT>
    __device__ __forceinline__ void WarpStripedToBlocked(
        InputT input_items[ITEMS_PER_THREAD],
        OutputT output_items[ITEMS_PER_THREAD],
        Int2Type<true> /*time_slicing*/)
    {
        #pragma unroll
        for (unsigned int SLICE = 0; SLICE < TIME_SLICES; ++SLICE)
        {
            CTA_SYNC();

            // Only the warp owning this slice exchanges during this round
            if (warp_id == SLICE)
            {
                #pragma unroll
                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
                {
                    int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                    temp_storage.buff[item_offset] = input_items[ITEM];
                }

                WARP_SYNC(0xffffffff);

                #pragma unroll
                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
                {
                    int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                    output_items[ITEM] = temp_storage.buff[item_offset];
                }
            }
        }
    }
528 
529 
533  template <typename OutputT, typename OffsetT>
534  __device__ __forceinline__ void ScatterToBlocked(
535  InputT input_items[ITEMS_PER_THREAD],
536  OutputT output_items[ITEMS_PER_THREAD],
537  OffsetT ranks[ITEMS_PER_THREAD],
538  Int2Type<false> /*time_slicing*/)
539  {
540  #pragma unroll
541  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
542  {
543  int item_offset = ranks[ITEM];
544  if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
545  temp_storage.buff[item_offset] = input_items[ITEM];
546  }
547 
548  CTA_SYNC();
549 
550  #pragma unroll
551  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
552  {
553  int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
554  if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
555  output_items[ITEM] = temp_storage.buff[item_offset];
556  }
557  }
558 
    /**
     * Exchanges data items annotated by rank into <em>blocked</em> arrangement.
     * Specialized for warp-timeslicing: each round, threads scatter only the items
     * whose ranks fall inside the current slice, then the owning warp gathers.
     */
    template <typename OutputT, typename OffsetT>
    __device__ __forceinline__ void ScatterToBlocked(
        InputT input_items[ITEMS_PER_THREAD],
        OutputT output_items[ITEMS_PER_THREAD],
        OffsetT ranks[ITEMS_PER_THREAD],
        Int2Type<true> /*time_slicing*/)
    {
        // Per-thread staging for results gathered across rounds
        InputT temp_items[ITEMS_PER_THREAD];

        #pragma unroll
        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
        {
            // Barrier before reusing the shared buffer for this round
            CTA_SYNC();

            const int SLICE_OFFSET = TIME_SLICED_ITEMS * SLICE;

            // Scatter only the items whose ranks land in the current slice
            #pragma unroll
            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
            {
                int item_offset = ranks[ITEM] - SLICE_OFFSET;
                if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
                {
                    if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
                    temp_storage.buff[item_offset] = input_items[ITEM];
                }
            }

            CTA_SYNC();

            // The warp owning this slice gathers its blocked segment
            if (warp_id == SLICE)
            {
                #pragma unroll
                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
                {
                    int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
                    if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
                    temp_items[ITEM] = temp_storage.buff[item_offset];
                }
            }
        }

        // Copy staged results to the output
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            output_items[ITEM] = temp_items[ITEM];
        }
    }
610 
611 
615  template <typename OutputT, typename OffsetT>
616  __device__ __forceinline__ void ScatterToStriped(
617  InputT input_items[ITEMS_PER_THREAD],
618  OutputT output_items[ITEMS_PER_THREAD],
619  OffsetT ranks[ITEMS_PER_THREAD],
620  Int2Type<false> /*time_slicing*/)
621  {
622  #pragma unroll
623  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
624  {
625  int item_offset = ranks[ITEM];
626  if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
627  temp_storage.buff[item_offset] = input_items[ITEM];
628  }
629 
630  CTA_SYNC();
631 
632  #pragma unroll
633  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
634  {
635  int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
636  if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
637  output_items[ITEM] = temp_storage.buff[item_offset];
638  }
639  }
640 
641 
    /**
     * Exchanges data items annotated by rank into <em>striped</em> arrangement.
     * Specialized for warp-timeslicing: each round, threads scatter the items whose
     * ranks fall inside the current slice, then all threads harvest the strips that
     * overlap that slice.
     */
    template <typename OutputT, typename OffsetT>
    __device__ __forceinline__ void ScatterToStriped(
        InputT input_items[ITEMS_PER_THREAD],
        OutputT output_items[ITEMS_PER_THREAD],
        OffsetT ranks[ITEMS_PER_THREAD],
        Int2Type<true> /*time_slicing*/)
    {
        // Per-thread staging for results gathered across rounds
        InputT temp_items[ITEMS_PER_THREAD];

        #pragma unroll
        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
        {
            const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS;
            const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS;

            // Barrier before reusing the shared buffer for this round
            CTA_SYNC();

            // Scatter only the items whose ranks land in the current slice
            #pragma unroll
            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
            {
                int item_offset = ranks[ITEM] - SLICE_OFFSET;
                if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
                {
                    if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
                    temp_storage.buff[item_offset] = input_items[ITEM];
                }
            }

            CTA_SYNC();

            #pragma unroll
            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
            {
                // Read a strip of items
                const int STRIP_OFFSET = ITEM * BLOCK_THREADS;
                const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS;

                // Harvest only the portion of this strip covered by the current slice
                if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
                {
                    int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
                    if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
                    {
                        if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                        temp_items[ITEM] = temp_storage.buff[item_offset];
                    }
                }
            }
        }

        // Copy staged results to the output
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            output_items[ITEM] = temp_items[ITEM];
        }
    }
701 
702 
703 public:
704 
705  /******************************************************************/
709 
713  __device__ __forceinline__ BlockExchange()
714  :
716  linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
717  warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
718  lane_id(LaneId()),
719  warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)
720  {}
721 
722 
726  __device__ __forceinline__ BlockExchange(
728  :
729  temp_storage(temp_storage.Alias()),
730  linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
731  lane_id(LaneId()),
732  warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
733  warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)
734  {}
735 
736 
738  /******************************************************************/
742 
    /**
     * Transposes data items from <em>striped</em> arrangement to <em>blocked</em>
     * arrangement, dispatching at compile time to the timesliced or non-timesliced
     * specialization.
     */
    template <typename OutputT>
    __device__ __forceinline__ void StripedToBlocked(
        InputT input_items[ITEMS_PER_THREAD],
        OutputT output_items[ITEMS_PER_THREAD])
    {
        StripedToBlocked(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
    }
786 
787 
    /**
     * Transposes data items from <em>blocked</em> arrangement to <em>striped</em>
     * arrangement, dispatching at compile time to the timesliced or non-timesliced
     * specialization.
     */
    template <typename OutputT>
    __device__ __forceinline__ void BlockedToStriped(
        InputT input_items[ITEMS_PER_THREAD],
        OutputT output_items[ITEMS_PER_THREAD])
    {
        BlockedToStriped(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
    }
835 
836 
837 
    /**
     * Transposes data items from <em>warp-striped</em> arrangement to <em>blocked</em>
     * arrangement, dispatching at compile time to the timesliced or non-timesliced
     * specialization.
     */
    template <typename OutputT>
    __device__ __forceinline__ void WarpStripedToBlocked(
        InputT input_items[ITEMS_PER_THREAD],
        OutputT output_items[ITEMS_PER_THREAD])
    {
        WarpStripedToBlocked(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
    }
883 
884 
885 
    /**
     * Transposes data items from <em>blocked</em> arrangement to <em>warp-striped</em>
     * arrangement, dispatching at compile time to the timesliced or non-timesliced
     * specialization.
     */
    template <typename OutputT>
    __device__ __forceinline__ void BlockedToWarpStriped(
        InputT input_items[ITEMS_PER_THREAD],
        OutputT output_items[ITEMS_PER_THREAD])
    {
        BlockedToWarpStriped(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
    }
934 
935 
936 
938  /******************************************************************/
942 
943 
    /**
     * Exchanges data items annotated by rank into <em>blocked</em> arrangement,
     * dispatching at compile time to the timesliced or non-timesliced specialization.
     */
    template <typename OutputT, typename OffsetT>
    __device__ __forceinline__ void ScatterToBlocked(
        InputT input_items[ITEMS_PER_THREAD],
        OutputT output_items[ITEMS_PER_THREAD],
        OffsetT ranks[ITEMS_PER_THREAD])
    {
        ScatterToBlocked(input_items, output_items, ranks, Int2Type<WARP_TIME_SLICING>());
    }
960 
961 
962 
    /**
     * Exchanges data items annotated by rank into <em>striped</em> arrangement,
     * dispatching at compile time to the timesliced or non-timesliced specialization.
     */
    template <typename OutputT, typename OffsetT>
    __device__ __forceinline__ void ScatterToStriped(
        InputT input_items[ITEMS_PER_THREAD],
        OutputT output_items[ITEMS_PER_THREAD],
        OffsetT ranks[ITEMS_PER_THREAD])
    {
        ScatterToStriped(input_items, output_items, ranks, Int2Type<WARP_TIME_SLICING>());
    }
979 
980 
981 
990  template <typename OutputT, typename OffsetT>
991  __device__ __forceinline__ void ScatterToStripedGuarded(
992  InputT input_items[ITEMS_PER_THREAD],
993  OutputT output_items[ITEMS_PER_THREAD],
994  OffsetT ranks[ITEMS_PER_THREAD])
995  {
996  #pragma unroll
997  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
998  {
999  int item_offset = ranks[ITEM];
1000  if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
1001  if (ranks[ITEM] >= 0)
1002  temp_storage.buff[item_offset] = input_items[ITEM];
1003  }
1004 
1005  CTA_SYNC();
1006 
1007  #pragma unroll
1008  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
1009  {
1010  int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
1011  if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
1012  output_items[ITEM] = temp_storage.buff[item_offset];
1013  }
1014  }
1015 
1016 
1017 
1018 
    /**
     * Exchanges data items annotated by rank into <em>striped</em> arrangement,
     * guarded by flag: only items whose is_valid flag is set are scattered (output
     * slots of unwritten offsets receive whatever the shared buffer holds).
     */
    template <typename OutputT, typename OffsetT, typename ValidFlag>
    __device__ __forceinline__ void ScatterToStripedFlagged(
        InputT input_items[ITEMS_PER_THREAD],
        OutputT output_items[ITEMS_PER_THREAD],
        OffsetT ranks[ITEMS_PER_THREAD],
        ValidFlag is_valid[ITEMS_PER_THREAD])
    {
        // Scatter only flagged items
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = ranks[ITEM];
            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
            if (is_valid[ITEM])
                temp_storage.buff[item_offset] = input_items[ITEM];
        }

        CTA_SYNC();

        // Gather in striped order
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
            output_items[ITEM] = temp_storage.buff[item_offset];
        }
    }
1054 
1055 
1057 
1058 
1059 
1060 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
1061 
1062 
    /// In-place overload: transposes items from striped to blocked arrangement
    __device__ __forceinline__ void StripedToBlocked(
        InputT items[ITEMS_PER_THREAD])
    {
        StripedToBlocked(items, items);
    }
1068 
    /// In-place overload: transposes items from blocked to striped arrangement
    __device__ __forceinline__ void BlockedToStriped(
        InputT items[ITEMS_PER_THREAD])
    {
        BlockedToStriped(items, items);
    }
1074 
    /// In-place overload: transposes items from warp-striped to blocked arrangement
    __device__ __forceinline__ void WarpStripedToBlocked(
        InputT items[ITEMS_PER_THREAD])
    {
        WarpStripedToBlocked(items, items);
    }
1080 
    /// In-place overload: transposes items from blocked to warp-striped arrangement
    __device__ __forceinline__ void BlockedToWarpStriped(
        InputT items[ITEMS_PER_THREAD])
    {
        BlockedToWarpStriped(items, items);
    }
1086 
    /// In-place overload: exchanges rank-annotated items into blocked arrangement
    template <typename OffsetT>
    __device__ __forceinline__ void ScatterToBlocked(
        InputT items[ITEMS_PER_THREAD],
        OffsetT ranks[ITEMS_PER_THREAD])
    {
        ScatterToBlocked(items, items, ranks);
    }
1094 
    /// In-place overload: exchanges rank-annotated items into striped arrangement
    template <typename OffsetT>
    __device__ __forceinline__ void ScatterToStriped(
        InputT items[ITEMS_PER_THREAD],
        OffsetT ranks[ITEMS_PER_THREAD])
    {
        ScatterToStriped(items, items, ranks);
    }
1102 
    /// In-place overload: guarded scatter into striped arrangement (negative ranks skipped)
    template <typename OffsetT>
    __device__ __forceinline__ void ScatterToStripedGuarded(
        InputT items[ITEMS_PER_THREAD],
        OffsetT ranks[ITEMS_PER_THREAD])
    {
        ScatterToStripedGuarded(items, items, ranks);
    }
1110 
1111  template <typename OffsetT, typename ValidFlag>
1112  __device__ __forceinline__ void ScatterToStripedFlagged(
1113  InputT items[ITEMS_PER_THREAD],
1114  OffsetT ranks[ITEMS_PER_THREAD],
1115  ValidFlag is_valid[ITEMS_PER_THREAD])
1116  {
1117  ScatterToStriped(items, items, ranks, is_valid);
1118  }
1119 
1120 #endif // DOXYGEN_SHOULD_SKIP_THIS
1121 
1122 
1123 };
1124 
1125 
1126 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
1127 
1128 
1129 template <
1130  typename T,
1131  int ITEMS_PER_THREAD,
1132  int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS,
1133  int PTX_ARCH = CUB_PTX_ARCH>
1135 {
1136 private:
1137 
1138  /******************************************************************************
1139  * Constants
1140  ******************************************************************************/
1141 
    /// Compile-time constants governing the warp-level exchange layout
    enum
    {
        // Whether the logical warp size and the PTX warp size coincide
        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),

        // One extra slot beyond the logical warp's tile
        WARP_ITEMS = (ITEMS_PER_THREAD * LOGICAL_WARP_THREADS) + 1,

        LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH),
        SMEM_BANKS = 1 << LOG_SMEM_BANKS,

        // Insert padding if the number of items per thread is a power of two and > 4 (otherwise we can typically use 128b loads)
        INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (PowerOfTwo<ITEMS_PER_THREAD>::VALUE),
        PADDING_ITEMS = (INSERT_PADDING) ? (WARP_ITEMS >> LOG_SMEM_BANKS) : 0,
    };
1157 
1158  /******************************************************************************
1159  * Type definitions
1160  ******************************************************************************/
1161 
1164  {
1165  T buff[WARP_ITEMS + PADDING_ITEMS];
1166  };
1167 
1168 public:
1169 
    /// \smemstorage{WarpExchange}
    struct TempStorage : Uninitialized<_TempStorage> {};
1172 
1173 private:
1174 
1175 
1176  /******************************************************************************
1177  * Thread fields
1178  ******************************************************************************/
1179 
    /// Lane index of the calling thread within its logical warp
    int lane_id;
1182 
1183 public:
1184 
1185  /******************************************************************************
1186  * Construction
1187  ******************************************************************************/
1188 
1190  __device__ __forceinline__ WarpExchange(
1192  :
1193  temp_storage(temp_storage.Alias()),
1194  lane_id(IS_ARCH_WARP ?
1195  LaneId() :
1196  LaneId() % LOGICAL_WARP_THREADS)
1197  {}
1198 
1199 
1200  /******************************************************************************
1201  * Interface
1202  ******************************************************************************/
1203 
    /**
     * Exchanges data items annotated by rank into <em>striped</em> arrangement
     * across the logical warp.
     *
     * NOTE(review): when INSERT_PADDING is enabled, the first loop overwrites the
     * caller's ranks[] with the padded offsets — confirm no caller reuses ranks
     * after this call.
     */
    template <typename OffsetT>
    __device__ __forceinline__ void ScatterToStriped(
        T items[ITEMS_PER_THREAD],
        OffsetT ranks[ITEMS_PER_THREAD])
    {
        // Scatter each item to its (possibly padded) rank
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            if (INSERT_PADDING) ranks[ITEM] = SHR_ADD(ranks[ITEM], LOG_SMEM_BANKS, ranks[ITEM]);
            temp_storage.buff[ranks[ITEM]] = items[ITEM];
        }

        WARP_SYNC(0xffffffff);

        // Gather in striped order across the logical warp
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = (ITEM * LOGICAL_WARP_THREADS) + lane_id;
            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
            items[ITEM] = temp_storage.buff[item_offset];
        }
    }
1234 
1235 };
1236 
1237 
1238 
1239 
1240 #endif // DOXYGEN_SHOULD_SKIP_THIS
1241 
1242 
1243 
1244 
1245 
1246 } // CUB namespace
1247 CUB_NS_POSTFIX // Optional outer namespace(s)
1248 
__device__ __forceinline__ void ScatterToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], Int2Type< false >)
__device__ __forceinline__ void BlockedToStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< false >)
#define CUB_LOG_SMEM_BANKS(arch)
Number of smem banks.
Definition: util_arch.cuh:85
__device__ __forceinline__ BlockExchange(TempStorage &temp_storage)
Collective constructor using the specified memory allocation as temporary storage.
__device__ __forceinline__ void StripedToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< false >)
Optional outer namespace(s)
struct __align__(16) _TempStorage
Shared memory storage layout type.
__device__ __forceinline__ void ScatterToStripedFlagged(InputT items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], ValidFlag is_valid[ITEMS_PER_THREAD])
\smemstorage{BlockExchange}
The BlockExchange class provides collective methods for rearranging data partitioned across a CUDA th...
__device__ __forceinline__ void StripedToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< true >)
#define CUB_LOG_WARP_THREADS(arch)
Number of threads per warp.
Definition: util_arch.cuh:73
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host p...
Definition: util_arch.cuh:53
__device__ __forceinline__ void ScatterToStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], Int2Type< false >)
CTA_SYNC()
Definition: util_ptx.cuh:255
__device__ __forceinline__ unsigned int LaneId()
Returns the warp lane ID of the calling thread.
Definition: util_ptx.cuh:420
__device__ __forceinline__ void ScatterToStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD])
Exchanges data items annotated by rank into striped arrangement.
unsigned int linear_tid
Linear thread-id.
__device__ __forceinline__ void ScatterToBlocked(InputT items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD])
__device__ __forceinline__ void ScatterToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], Int2Type< true >)
OffsetT OffsetT
[in] Total number of input data items
__device__ __forceinline__ void WarpStripedToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< true >)
__device__ __forceinline__ void BlockedToWarpStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< false >)
__device__ __forceinline__ void BlockedToWarpStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD])
Transposes data items from blocked arrangement to warp-striped arrangement.
\smemstorage{WarpExchange}
__device__ __forceinline__ void BlockedToWarpStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< true >)
Statically determine if N is a power-of-two.
Definition: util_type.cuh:155
__device__ __forceinline__ void BlockedToStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< true >)
__device__ __forceinline__ void BlockedToStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD])
Transposes data items from blocked arrangement to striped arrangement.
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static c...
Definition: util_type.cuh:275
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
Definition: util_ptx.cuh:409
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
Definition: util_type.cuh:634
__device__ __forceinline__ void StripedToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD])
Transposes data items from striped arrangement to blocked arrangement.
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT int
[in] The number of segments that comprise the sorting data
__device__ __forceinline__ void BlockedToStriped(InputT items[ITEMS_PER_THREAD])
__device__ __forceinline__ void ScatterToStripedFlagged(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], ValidFlag is_valid[ITEMS_PER_THREAD])
Exchanges valid data items annotated by rank into striped arrangement.
__device__ __forceinline__ unsigned int SHR_ADD(unsigned int x, unsigned int shift, unsigned int addend)
Shift-right then add. Returns (x >> shift) + addend.
Definition: util_ptx.cuh:87
Shared memory storage layout type.
__device__ __forceinline__ void BlockedToWarpStriped(InputT items[ITEMS_PER_THREAD])
__device__ __forceinline__ void WarpStripedToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< false >)
The thread block size in threads.
#define CUB_MIN(a, b)
Select minimum(a, b)
Definition: util_macro.cuh:66
__device__ __forceinline__ void ScatterToStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], Int2Type< true >)
__device__ __forceinline__ void StripedToBlocked(InputT items[ITEMS_PER_THREAD])
__device__ __forceinline__ BlockExchange()
Collective constructor using a private static allocation of shared memory as temporary storage.
__device__ __forceinline__ void ScatterToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD])
Exchanges data items annotated by rank into blocked arrangement.
__device__ __forceinline__ void WarpStripedToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD])
Transposes data items from warp-striped arrangement to blocked arrangement.
__device__ __forceinline__ void WarpStripedToBlocked(InputT items[ITEMS_PER_THREAD])
__device__ __forceinline__ void ScatterToStripedGuarded(InputT items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD])
__device__ __forceinline__ void WARP_SYNC(unsigned int member_mask)
Definition: util_ptx.cuh:273
_TempStorage & temp_storage
Shared storage reference.
__device__ __forceinline__ WarpExchange(TempStorage &temp_storage)
Constructor.
__device__ __forceinline__ void ScatterToStriped(T items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD])
Exchanges valid data items annotated by rank into striped arrangement.
__device__ __forceinline__ void ScatterToStriped(InputT items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD])
__device__ __forceinline__ void ScatterToStripedGuarded(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD])
Exchanges data items annotated by rank into striped arrangement. Items with rank -1 are not exchanged...
__device__ __forceinline__ _TempStorage & PrivateStorage()
Internal storage allocator.