OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
dispatch_reduce.cuh

/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * cub::DeviceReduce provides device-wide, parallel operations for computing
 * a reduction across a sequence of data items residing within
 * device-accessible memory.
 */

#pragma once

#include <stdio.h>
#include <iterator>

#include "../../agent/agent_reduce.cuh"
#include "../../iterator/arg_index_input_iterator.cuh"
#include "../../thread/thread_operators.cuh"
#include "../../grid/grid_even_share.cuh"
#include "../../util_debug.cuh"
#include "../../util_device.cuh"
#include "../../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/******************************************************************************
 * Kernel entry points
 *****************************************************************************/

/**
 * Reduce region kernel entry point (multi-block).  Computes privatized reductions, one per thread block.
 */
template <
    typename                ChainedPolicyT,             ///< Chained tuning policy
    typename                InputIteratorT,             ///< Random-access input iterator type for reading input items
    typename                OutputIteratorT,            ///< Output iterator type for recording the reduced aggregate
    typename                OffsetT,                    ///< Signed integer type for global offsets
    typename                ReductionOpT>               ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS))
__global__ void DeviceReduceKernel(
    InputIteratorT          d_in,                       ///< [in] Pointer to the input sequence of data items
    OutputIteratorT         d_out,                      ///< [out] Pointer to the output aggregate
    OffsetT                 num_items,                  ///< [in] Total number of input data items
    GridEvenShare<OffsetT>  even_share,                 ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
    ReductionOpT            reduction_op)               ///< [in] Binary reduction functor
{
    // The output value type
    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT = (if output iterator's value type is void) ?
        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type

    // Thread block type for reducing input tiles
    typedef AgentReduce<
            typename ChainedPolicyT::ActivePolicy::ReducePolicy,
            InputIteratorT,
            OutputIteratorT,
            OffsetT,
            ReductionOpT>
        AgentReduceT;

    // Shared memory storage
    __shared__ typename AgentReduceT::TempStorage temp_storage;

    // Consume input tiles
    OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share);

    // Output result
    if (threadIdx.x == 0)
        d_out[blockIdx.x] = block_aggregate;
}
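
// Editorial note (sketch): DeviceReduceKernel above is the first pass of the
// two-pass scheme driven by DispatchReduce::InvokePasses further below.  Each
// thread block reduces its even-share of input tiles to one partial result in
// d_out[blockIdx.x]; DeviceReduceSingleTileKernel is then launched over those
// partials to produce the final aggregate.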


/**
 * Reduce a single tile kernel entry point (single-block).  Can be used to aggregate privatized thread block reductions from a previous multi-block reduction pass.
 */
template <
    typename                ChainedPolicyT,             ///< Chained tuning policy
    typename                InputIteratorT,             ///< Random-access input iterator type for reading input items
    typename                OutputIteratorT,            ///< Output iterator type for recording the reduced aggregate
    typename                OffsetT,                    ///< Signed integer type for global offsets
    typename                ReductionOpT,               ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
    typename                OutputT>                    ///< Data element type that is convertible to the \p value type of \p OutputIteratorT
__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1)
__global__ void DeviceReduceSingleTileKernel(
    InputIteratorT          d_in,                       ///< [in] Pointer to the input sequence of data items
    OutputIteratorT         d_out,                      ///< [out] Pointer to the output aggregate
    OffsetT                 num_items,                  ///< [in] Total number of input data items
    ReductionOpT            reduction_op,               ///< [in] Binary reduction functor
    OutputT                 init)                       ///< [in] The initial value of the reduction
{
    // Thread block type for reducing input tiles
    typedef AgentReduce<
            typename ChainedPolicyT::ActivePolicy::SingleTilePolicy,
            InputIteratorT,
            OutputIteratorT,
            OffsetT,
            ReductionOpT>
        AgentReduceT;

    // Shared memory storage
    __shared__ typename AgentReduceT::TempStorage temp_storage;

    // Check if empty problem
    if (num_items == 0)
    {
        if (threadIdx.x == 0)
            *d_out = init;
        return;
    }

    // Consume input tiles
    OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange(
        OffsetT(0),
        num_items);

    // Output result
    if (threadIdx.x == 0)
        *d_out = reduction_op(init, block_aggregate);
}


/// Normalize input iterator to segment offset
template <typename T, typename OffsetT, typename IteratorT>
__device__ __forceinline__
void NormalizeReductionOutput(
    T &/*val*/,
    OffsetT /*base_offset*/,
    IteratorT /*itr*/)
{}


/// Normalize input iterator to segment offset (specialized for arg-index)
template <typename KeyValuePairT, typename OffsetT, typename WrappedIteratorT, typename OutputValueT>
__device__ __forceinline__
void NormalizeReductionOutput(
    KeyValuePairT &val,
    OffsetT base_offset,
    ArgIndexInputIterator<WrappedIteratorT, OffsetT, OutputValueT> /*itr*/)
{
    val.key -= base_offset;
}

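// Editorial note (sketch): the arg-index overload above matters for segmented
// ArgMin/ArgMax reductions.  ArgIndexInputIterator yields {index, value} pairs
// keyed by global position, so the winning key of a segment's reduction is a
// global index; subtracting base_offset (the segment's begin offset) converts
// it into an index relative to the start of that segment.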

/**
 * Segmented reduction (one block per segment) kernel entry point.
 */
template <
    typename                ChainedPolicyT,             ///< Chained tuning policy
    typename                InputIteratorT,             ///< Random-access input iterator type for reading input items
    typename                OutputIteratorT,            ///< Output iterator type for recording the reduced aggregates
    typename                OffsetIteratorT,            ///< Random-access input iterator type for reading segment offsets
    typename                OffsetT,                    ///< Signed integer type for global offsets
    typename                ReductionOpT,               ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
    typename                OutputT>                    ///< Data element type that is convertible to the \p value type of \p OutputIteratorT
__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS))
__global__ void DeviceSegmentedReduceKernel(
    InputIteratorT          d_in,                       ///< [in] Pointer to the input sequence of data items
    OutputIteratorT         d_out,                      ///< [out] Pointer to the output aggregates
    OffsetIteratorT         d_begin_offsets,            ///< [in] Pointer to the sequence of beginning offsets, one per segment
    OffsetIteratorT         d_end_offsets,              ///< [in] Pointer to the sequence of ending offsets, one per segment
    int                     /*num_segments*/,           ///< [in] The number of segments that comprise the reduction data
    ReductionOpT            reduction_op,               ///< [in] Binary reduction functor
    OutputT                 init)                       ///< [in] The initial value of the reduction
{
    // Thread block type for reducing input tiles
    typedef AgentReduce<
            typename ChainedPolicyT::ActivePolicy::ReducePolicy,
            InputIteratorT,
            OutputIteratorT,
            OffsetT,
            ReductionOpT>
        AgentReduceT;

    // Shared memory storage
    __shared__ typename AgentReduceT::TempStorage temp_storage;

    OffsetT segment_begin   = d_begin_offsets[blockIdx.x];
    OffsetT segment_end     = d_end_offsets[blockIdx.x];

    // Check if empty problem
    if (segment_begin == segment_end)
    {
        if (threadIdx.x == 0)
            d_out[blockIdx.x] = init;
        return;
    }

    // Consume input tiles
    OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange(
        segment_begin,
        segment_end);

    // Normalize as needed
    NormalizeReductionOutput(block_aggregate, segment_begin, d_in);

    if (threadIdx.x == 0)
        d_out[blockIdx.x] = reduction_op(init, block_aggregate);
}



/******************************************************************************
 * Policy
 ******************************************************************************/

/// Tuning policy chain for device-wide reduction
template <
    typename OutputT,           ///< Data type
    typename OffsetT,           ///< Signed integer type for global offsets
    typename ReductionOpT>      ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
struct DeviceReducePolicy
{
    //------------------------------------------------------------------------------
    // Architecture-specific tuning policies
    //------------------------------------------------------------------------------

    /// SM13
    struct Policy130 : ChainedPolicy<130, Policy130, Policy130>
    {
        // ReducePolicy
        typedef AgentReducePolicy<
                CUB_SCALED_GRANULARITIES(128, 8, OutputT),  ///< Threads per block, items per thread
                2,                                          ///< Number of items per vectorized load
                BLOCK_REDUCE_RAKING,                        ///< Cooperative block-wide reduction algorithm to use
                LOAD_DEFAULT>                               ///< Cache load modifier
            ReducePolicy;

        // SingleTilePolicy
        typedef ReducePolicy SingleTilePolicy;

        // SegmentedReducePolicy
        typedef ReducePolicy SegmentedReducePolicy;
    };


    /// SM20
    struct Policy200 : ChainedPolicy<200, Policy200, Policy130>
    {
        // ReducePolicy (GTX 580: 178.9 GB/s @ 48M 4B items, 158.1 GB/s @ 192M 1B items)
        typedef AgentReducePolicy<
                CUB_SCALED_GRANULARITIES(128, 8, OutputT),
                4,
                BLOCK_REDUCE_RAKING,
                LOAD_DEFAULT>
            ReducePolicy;

        // SingleTilePolicy
        typedef ReducePolicy SingleTilePolicy;

        // SegmentedReducePolicy
        typedef ReducePolicy SegmentedReducePolicy;
    };


    /// SM30
    struct Policy300 : ChainedPolicy<300, Policy300, Policy200>
    {
        // ReducePolicy (GTX670: 154.0 GB/s @ 48M 4B items)
        typedef AgentReducePolicy<
                CUB_SCALED_GRANULARITIES(256, 20, OutputT),
                2,
                BLOCK_REDUCE_WARP_REDUCTIONS,
                LOAD_DEFAULT>
            ReducePolicy;

        // SingleTilePolicy
        typedef ReducePolicy SingleTilePolicy;

        // SegmentedReducePolicy
        typedef ReducePolicy SegmentedReducePolicy;
    };


    /// SM35
    struct Policy350 : ChainedPolicy<350, Policy350, Policy300>
    {
        // ReducePolicy (GTX Titan: 255.1 GB/s @ 48M 4B items; 228.7 GB/s @ 192M 1B items)
        typedef AgentReducePolicy<
                CUB_SCALED_GRANULARITIES(256, 20, OutputT),
                4,
                BLOCK_REDUCE_WARP_REDUCTIONS,
                LOAD_LDG>
            ReducePolicy;

        // SingleTilePolicy
        typedef ReducePolicy SingleTilePolicy;

        // SegmentedReducePolicy
        typedef ReducePolicy SegmentedReducePolicy;
    };

    /// SM60
    struct Policy600 : ChainedPolicy<600, Policy600, Policy350>
    {
        // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items)
        typedef AgentReducePolicy<
                CUB_SCALED_GRANULARITIES(256, 16, OutputT),
                4,
                BLOCK_REDUCE_WARP_REDUCTIONS,
                LOAD_LDG>
            ReducePolicy;

        // SingleTilePolicy
        typedef ReducePolicy SingleTilePolicy;

        // SegmentedReducePolicy
        typedef ReducePolicy SegmentedReducePolicy;
    };


    /// MaxPolicy
    typedef Policy600 MaxPolicy;

};

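// Editorial note (sketch): ChainedPolicy<ARCH, PolicyT, PrevPolicyT> links the
// tunings above into a compile-time chain ending at Policy130.  At dispatch
// time, MaxPolicy::Invoke(ptx_version, dispatch) walks the chain and invokes
// the functor with the newest policy whose ARCH does not exceed the target PTX
// version (e.g., a ptx_version of 520 would select Policy350).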


/******************************************************************************
 * Single-problem dispatch
 ******************************************************************************/

/**
 * Utility class for dispatching the appropriately-tuned kernels for device-wide reduction
 */
template <
    typename InputIteratorT,    ///< Random-access input iterator type for reading input items
    typename OutputIteratorT,   ///< Output iterator type for recording the reduced aggregate
    typename OffsetT,           ///< Signed integer type for global offsets
    typename ReductionOpT>      ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
struct DispatchReduce :
    DeviceReducePolicy<
        typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT = (if output iterator's value type is void) ?
            typename std::iterator_traits<InputIteratorT>::value_type,                                  // ... then the input iterator's value type,
            typename std::iterator_traits<OutputIteratorT>::value_type>::Type,                          // ... else the output iterator's value type
        OffsetT,
        ReductionOpT>
{
    //------------------------------------------------------------------------------
    // Constants
    //------------------------------------------------------------------------------

    // Data type of output iterator
    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT = (if output iterator's value type is void) ?
        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type


    //------------------------------------------------------------------------------
    // Problem state
    //------------------------------------------------------------------------------

    void                *d_temp_storage;        ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
    size_t              &temp_storage_bytes;    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
    InputIteratorT      d_in;                   ///< [in] Pointer to the input sequence of data items
    OutputIteratorT     d_out;                  ///< [out] Pointer to the output aggregate
    OffsetT             num_items;              ///< [in] Total number of input items (i.e., length of \p d_in)
    ReductionOpT        reduction_op;           ///< [in] Binary reduction functor
    OutputT             init;                   ///< [in] The initial value of the reduction
    cudaStream_t        stream;                 ///< [in] CUDA stream to launch kernels within.  Default is stream0.
    bool                debug_synchronous;      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors
    int                 ptx_version;            ///< [in] PTX version

    //------------------------------------------------------------------------------
    // Constructor
    //------------------------------------------------------------------------------

    /// Constructor
    CUB_RUNTIME_FUNCTION __forceinline__
    DispatchReduce(
        void*                   d_temp_storage,
        size_t                  &temp_storage_bytes,
        InputIteratorT          d_in,
        OutputIteratorT         d_out,
        OffsetT                 num_items,
        ReductionOpT            reduction_op,
        OutputT                 init,
        cudaStream_t            stream,
        bool                    debug_synchronous,
        int                     ptx_version)
    :
        d_temp_storage(d_temp_storage),
        temp_storage_bytes(temp_storage_bytes),
        d_in(d_in),
        d_out(d_out),
        num_items(num_items),
        reduction_op(reduction_op),
        init(init),
        stream(stream),
        debug_synchronous(debug_synchronous),
        ptx_version(ptx_version)
    {}


    //------------------------------------------------------------------------------
    // Small-problem (single tile) invocation
    //------------------------------------------------------------------------------

    /// Invoke a single thread block to reduce in-core
    template <
        typename                ActivePolicyT,          ///< Umbrella policy active for the target device
        typename                SingleTileKernelT>      ///< Function type of cub::DeviceReduceSingleTileKernel
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t InvokeSingleTile(
        SingleTileKernelT       single_tile_kernel)     ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel
    {
#ifndef CUB_RUNTIME_ENABLED
        (void)single_tile_kernel;

        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported);
#else
        cudaError error = cudaSuccess;
        do
        {
            // Return if the caller is simply requesting the size of the storage allocation
            if (d_temp_storage == NULL)
            {
                temp_storage_bytes = 1;
                break;
            }

            // Log single_reduce_sweep_kernel configuration
            if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n",
                ActivePolicyT::SingleTilePolicy::BLOCK_THREADS,
                (long long) stream,
                ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD);

            // Invoke single_reduce_sweep_kernel
            single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
                d_in,
                d_out,
                num_items,
                reduction_op,
                init);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
        }
        while (0);

        return error;

#endif // CUB_RUNTIME_ENABLED
    }
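
    // Editorial note: in the size-query branch above, temp_storage_bytes is set
    // to 1 rather than 0 even though the single-tile path needs no scratch,
    // presumably so that the caller's subsequent allocation yields a usable
    // non-NULL pointer, since d_temp_storage == NULL is what distinguishes a
    // size query from an actual run.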


    //------------------------------------------------------------------------------
    // Normal problem size invocation (two-pass)
    //------------------------------------------------------------------------------

    /// Invoke two-passes to reduce
    template <
        typename                ActivePolicyT,          ///< Umbrella policy active for the target device
        typename                ReduceKernelT,          ///< Function type of cub::DeviceReduceKernel
        typename                SingleTileKernelT>      ///< Function type of cub::DeviceReduceSingleTileKernel
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t InvokePasses(
        ReduceKernelT           reduce_kernel,          ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceKernel
        SingleTileKernelT       single_tile_kernel)     ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel
    {
#ifndef CUB_RUNTIME_ENABLED
        (void) reduce_kernel;
        (void) single_tile_kernel;

        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported);
#else

        cudaError error = cudaSuccess;
        do
        {
            // Get device ordinal
            int device_ordinal;
            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;

            // Get SM count
            int sm_count;
            if (CubDebug(error = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;

            // Init regular kernel configuration
            KernelConfig reduce_config;
            if (CubDebug(error = reduce_config.Init<typename ActivePolicyT::ReducePolicy>(reduce_kernel))) break;
            int reduce_device_occupancy = reduce_config.sm_occupancy * sm_count;

            // Even-share work distribution
            int max_blocks = reduce_device_occupancy * CUB_SUBSCRIPTION_FACTOR(ptx_version);
            GridEvenShare<OffsetT> even_share;
            even_share.DispatchInit(num_items, max_blocks, reduce_config.tile_size);

            // Temporary storage allocation requirements
            void* allocations[1];
            size_t allocation_sizes[1] =
            {
                max_blocks * sizeof(OutputT)    // bytes needed for privatized block reductions
            };

            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
            if (d_temp_storage == NULL)
            {
                // Return if the caller is simply requesting the size of the storage allocation
                return cudaSuccess;
            }

            // Alias the allocation for the privatized per-block reductions
            OutputT *d_block_reductions = (OutputT*) allocations[0];

            // Get grid size for device_reduce_sweep_kernel
            int reduce_grid_size = even_share.grid_size;

            // Log device_reduce_sweep_kernel configuration
            if (debug_synchronous) _CubLog("Invoking DeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                reduce_grid_size,
                ActivePolicyT::ReducePolicy::BLOCK_THREADS,
                (long long) stream,
                ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD,
                reduce_config.sm_occupancy);

            // Invoke DeviceReduceKernel
            reduce_kernel<<<reduce_grid_size, ActivePolicyT::ReducePolicy::BLOCK_THREADS, 0, stream>>>(
                d_in,
                d_block_reductions,
                num_items,
                even_share,
                reduction_op);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Log single_reduce_sweep_kernel configuration
            if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n",
                ActivePolicyT::SingleTilePolicy::BLOCK_THREADS,
                (long long) stream,
                ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD);

            // Invoke DeviceReduceSingleTileKernel
            single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
                d_block_reductions,
                d_out,
                reduce_grid_size,
                reduction_op,
                init);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
        }
        while (0);

        return error;

#endif // CUB_RUNTIME_ENABLED

    }


    //------------------------------------------------------------------------------
    // Chained policy invocation
    //------------------------------------------------------------------------------

    /// Invocation
    template <typename ActivePolicyT>
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t Invoke()
    {
        typedef typename ActivePolicyT::SingleTilePolicy    SingleTilePolicyT;
        typedef typename DispatchReduce::MaxPolicy          MaxPolicyT;

        // Force kernel code-generation in all compiler passes
        if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD))
        {
            // Small, single tile size
            return InvokeSingleTile<ActivePolicyT>(
                DeviceReduceSingleTileKernel<MaxPolicyT, InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT, OutputT>);
        }
        else
        {
            // Regular size
            return InvokePasses<ActivePolicyT>(
                DeviceReduceKernel<typename DispatchReduce::MaxPolicy, InputIteratorT, OutputT*, OffsetT, ReductionOpT>,
                DeviceReduceSingleTileKernel<MaxPolicyT, OutputT*, OutputIteratorT, OffsetT, ReductionOpT, OutputT>);
        }
    }


    //------------------------------------------------------------------------------
    // Dispatch entrypoints
    //------------------------------------------------------------------------------

    /// Internal dispatch routine for computing a device-wide reduction
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void            *d_temp_storage,        ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t          &temp_storage_bytes,    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT  d_in,                   ///< [in] Pointer to the input sequence of data items
        OutputIteratorT d_out,                  ///< [out] Pointer to the output aggregate
        OffsetT         num_items,              ///< [in] Total number of input items (i.e., length of \p d_in)
        ReductionOpT    reduction_op,           ///< [in] Binary reduction functor
        OutputT         init,                   ///< [in] The initial value of the reduction
        cudaStream_t    stream,                 ///< [in] CUDA stream to launch kernels within.  Default is stream0.
        bool            debug_synchronous)      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors
    {
        typedef typename DispatchReduce::MaxPolicy MaxPolicyT;

        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version;
            if (CubDebug(error = PtxVersion(ptx_version))) break;

            // Create dispatch functor
            DispatchReduce dispatch(
                d_temp_storage, temp_storage_bytes,
                d_in, d_out, num_items, reduction_op, init,
                stream, debug_synchronous, ptx_version);

            // Dispatch to chained policy
            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
        }
        while (0);

        return error;
    }
};
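
/*
 * Editorial usage sketch (not part of the original header): DispatchReduce is
 * typically driven with CUB's two-phase convention.  `d_in`, `d_out`, and
 * `num_items` below are assumed to be caller-provided device pointers/sizes.
 *
 *     size_t temp_bytes = 0;
 *     void  *d_temp     = NULL;
 *
 *     // Phase 1: d_temp == NULL, so only the required size is computed
 *     cub::DispatchReduce<int*, int*, int, cub::Sum>::Dispatch(
 *         d_temp, temp_bytes, d_in, d_out, num_items,
 *         cub::Sum(), 0, (cudaStream_t) 0, false);
 *
 *     cudaMalloc(&d_temp, temp_bytes);
 *
 *     // Phase 2: the same call with a real allocation runs the reduction
 *     cub::DispatchReduce<int*, int*, int, cub::Sum>::Dispatch(
 *         d_temp, temp_bytes, d_in, d_out, num_items,
 *         cub::Sum(), 0, (cudaStream_t) 0, false);
 */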



/******************************************************************************
 * Segmented dispatch
 ******************************************************************************/

/**
 * Utility class for dispatching the appropriately-tuned kernels for device-wide segmented reduction
 */
template <
    typename InputIteratorT,    ///< Random-access input iterator type for reading input items
    typename OutputIteratorT,   ///< Output iterator type for recording the reduced aggregates
    typename OffsetIteratorT,   ///< Random-access input iterator type for reading segment offsets
    typename OffsetT,           ///< Signed integer type for global offsets
    typename ReductionOpT>      ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
struct DispatchSegmentedReduce :
    DeviceReducePolicy<
        typename std::iterator_traits<InputIteratorT>::value_type,
        OffsetT,
        ReductionOpT>
{
    //------------------------------------------------------------------------------
    // Constants
    //------------------------------------------------------------------------------

    /// The output value type
    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT = (if output iterator's value type is void) ?
        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type


    //------------------------------------------------------------------------------
    // Problem state
    //------------------------------------------------------------------------------

    void                *d_temp_storage;        ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
    size_t              &temp_storage_bytes;    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
    InputIteratorT      d_in;                   ///< [in] Pointer to the input sequence of data items
    OutputIteratorT     d_out;                  ///< [out] Pointer to the output aggregates
    OffsetT             num_segments;           ///< [in] The number of segments that comprise the reduction data
    OffsetIteratorT     d_begin_offsets;        ///< [in] Pointer to the sequence of beginning offsets, one per segment
    OffsetIteratorT     d_end_offsets;          ///< [in] Pointer to the sequence of ending offsets, one per segment
    ReductionOpT        reduction_op;           ///< [in] Binary reduction functor
    OutputT             init;                   ///< [in] The initial value of the reduction
    cudaStream_t        stream;                 ///< [in] CUDA stream to launch kernels within.  Default is stream0.
    bool                debug_synchronous;      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors
    int                 ptx_version;            ///< [in] PTX version

    //------------------------------------------------------------------------------
    // Constructor
    //------------------------------------------------------------------------------

    /// Constructor
    CUB_RUNTIME_FUNCTION __forceinline__
    DispatchSegmentedReduce(
        void*                   d_temp_storage,
        size_t                  &temp_storage_bytes,
        InputIteratorT          d_in,
        OutputIteratorT         d_out,
        OffsetT                 num_segments,
        OffsetIteratorT         d_begin_offsets,
        OffsetIteratorT         d_end_offsets,
        ReductionOpT            reduction_op,
        OutputT                 init,
        cudaStream_t            stream,
        bool                    debug_synchronous,
        int                     ptx_version)
    :
        d_temp_storage(d_temp_storage),
        temp_storage_bytes(temp_storage_bytes),
        d_in(d_in),
        d_out(d_out),
        num_segments(num_segments),
        d_begin_offsets(d_begin_offsets),
        d_end_offsets(d_end_offsets),
        reduction_op(reduction_op),
        init(init),
        stream(stream),
        debug_synchronous(debug_synchronous),
        ptx_version(ptx_version)
    {}



    //------------------------------------------------------------------------------
    // Chained policy invocation
    //------------------------------------------------------------------------------

    /// Invocation
    template <
        typename                        ActivePolicyT,                  ///< Umbrella policy active for the target device
        typename                        DeviceSegmentedReduceKernelT>   ///< Function type of cub::DeviceSegmentedReduceKernel
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t InvokePasses(
        DeviceSegmentedReduceKernelT    segmented_reduce_kernel)        ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentedReduceKernel
    {
#ifndef CUB_RUNTIME_ENABLED
        (void)segmented_reduce_kernel;
        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported);
#else
        cudaError error = cudaSuccess;
        do
        {
            // Return if the caller is simply requesting the size of the storage allocation
            if (d_temp_storage == NULL)
            {
                temp_storage_bytes = 1;
                return cudaSuccess;
            }

            // Init kernel configuration
            KernelConfig segmented_reduce_config;
            if (CubDebug(error = segmented_reduce_config.Init<typename ActivePolicyT::SegmentedReducePolicy>(segmented_reduce_kernel))) break;

            // Log segmented_reduce_kernel configuration
            if (debug_synchronous) _CubLog("Invoking DeviceSegmentedReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                num_segments,
                ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS,
                (long long) stream,
                ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD,
                segmented_reduce_config.sm_occupancy);

            // Invoke DeviceSegmentedReduceKernel
            segmented_reduce_kernel<<<num_segments, ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, 0, stream>>>(
                d_in,
                d_out,
                d_begin_offsets,
                d_end_offsets,
                num_segments,
                reduction_op,
                init);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
        }
        while (0);

        return error;

#endif // CUB_RUNTIME_ENABLED

    }


    /// Invocation
    template <typename ActivePolicyT>
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t Invoke()
    {
        typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT;

        // Force kernel code-generation in all compiler passes
        return InvokePasses<ActivePolicyT>(
            DeviceSegmentedReduceKernel<MaxPolicyT, InputIteratorT, OutputIteratorT, OffsetIteratorT, OffsetT, ReductionOpT, OutputT>);
    }


    //------------------------------------------------------------------------------
    // Dispatch entrypoints
    //------------------------------------------------------------------------------

    /// Internal dispatch routine for computing a device-wide segmented reduction
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void            *d_temp_storage,        ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t          &temp_storage_bytes,    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT  d_in,                   ///< [in] Pointer to the input sequence of data items
        OutputIteratorT d_out,                  ///< [out] Pointer to the output aggregates
        int             num_segments,           ///< [in] The number of segments that comprise the reduction data
        OffsetIteratorT d_begin_offsets,        ///< [in] Pointer to the sequence of beginning offsets, one per segment
        OffsetIteratorT d_end_offsets,          ///< [in] Pointer to the sequence of ending offsets, one per segment
        ReductionOpT    reduction_op,           ///< [in] Binary reduction functor
        OutputT         init,                   ///< [in] The initial value of the reduction
        cudaStream_t    stream,                 ///< [in] CUDA stream to launch kernels within.  Default is stream0.
        bool            debug_synchronous)      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors
    {
        typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT;

        if (num_segments <= 0)
            return cudaSuccess;

        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version;
            if (CubDebug(error = PtxVersion(ptx_version))) break;

            // Create dispatch functor
            DispatchSegmentedReduce dispatch(
                d_temp_storage, temp_storage_bytes,
                d_in, d_out,
                num_segments, d_begin_offsets, d_end_offsets,
                reduction_op, init,
                stream, debug_synchronous, ptx_version);

            // Dispatch to chained policy
            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
        }
        while (0);

        return error;
    }
};
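
/*
 * Editorial usage sketch (not part of the original header): segmented dispatch
 * follows the same two-phase convention.  `d_in`, `d_out`, `d_offsets`, and
 * `num_segments` are assumed caller-provided; here segment i is taken to span
 * [d_offsets[i], d_offsets[i + 1]).
 *
 *     size_t temp_bytes = 0;
 *     void  *d_temp     = NULL;
 *
 *     // Phase 1: size query
 *     cub::DispatchSegmentedReduce<int*, int*, int*, int, cub::Sum>::Dispatch(
 *         d_temp, temp_bytes, d_in, d_out,
 *         num_segments, d_offsets, d_offsets + 1,
 *         cub::Sum(), 0, (cudaStream_t) 0, false);
 *
 *     cudaMalloc(&d_temp, temp_bytes);
 *
 *     // Phase 2: run (one thread block per segment)
 *     cub::DispatchSegmentedReduce<int*, int*, int*, int, cub::Sum>::Dispatch(
 *         d_temp, temp_bytes, d_in, d_out,
 *         num_segments, d_offsets, d_offsets + 1,
 *         cub::Sum(), 0, (cudaStream_t) 0, false);
 */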


}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)