OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
dispatch_reduce.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

#include <stdio.h>
#include <iterator>

#include "../../agent/agent_reduce.cuh"
#include "../../iterator/arg_index_input_iterator.cuh"
#include "../../thread/thread_operators.cuh"
#include "../../grid/grid_even_share.cuh"
#include "../../util_debug.cuh"
#include "../../util_device.cuh"
#include "../../util_namespace.cuh"

CUB_NS_PREFIX

namespace cub {

/******************************************************************************
 * Kernel entry points
 *****************************************************************************/

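// Overview: inputs small enough to fit in a single tile are reduced by one
// launch of DeviceReduceSingleTileKernel; larger inputs use a two-pass scheme
// in which DeviceReduceKernel writes one privatized partial reduction per
// thread block and DeviceReduceSingleTileKernel then combines those partials
// (seeded with the user-supplied init value) into the final aggregate.
// DeviceSegmentedReduceKernel handles segmented reductions, one thread block
// per segment.
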
/**
 * Reduce region kernel entry point (multi-block).  Computes privatized reductions, one per thread block.
 */
template <
    typename ChainedPolicyT,            ///< Chained tuning policy
    typename InputIteratorT,            ///< Random-access input iterator type for reading input items
    typename OutputIteratorT,           ///< Output iterator type for recording the reduced aggregate
    typename OffsetT,                   ///< Signed integer type for global offsets
    typename ReductionOpT>              ///< Binary reduction functor type having member T operator()(const T &a, const T &b)
__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS))
__global__ void DeviceReduceKernel(
    InputIteratorT          d_in,           ///< [in] Pointer to the input sequence of data items
    OutputIteratorT         d_out,          ///< [out] Pointer to the output aggregate
    OffsetT                 num_items,      ///< [in] Total number of input data items
    GridEvenShare<OffsetT>  even_share,     ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
    ReductionOpT            reduction_op)   ///< [in] Binary reduction functor
{
    // The output value type
    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT = (if output iterator's value type is void) ?
        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type

    // Thread block type for reducing input tiles
    typedef AgentReduce<
            typename ChainedPolicyT::ActivePolicy::ReducePolicy,
            InputIteratorT,
            OutputIteratorT,
            OffsetT,
            ReductionOpT>
        AgentReduceT;

    // Shared memory storage
    __shared__ typename AgentReduceT::TempStorage temp_storage;

    // Consume input tiles
    OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share);

    // Output result
    if (threadIdx.x == 0)
        d_out[blockIdx.x] = block_aggregate;
}


/**
 * Reduce a single tile kernel entry point (single-block).  Can be used to aggregate privatized thread block reductions from a previous multi-block reduction pass.
 */
template <
    typename ChainedPolicyT,            ///< Chained tuning policy
    typename InputIteratorT,            ///< Random-access input iterator type for reading input items
    typename OutputIteratorT,           ///< Output iterator type for recording the reduced aggregate
    typename OffsetT,                   ///< Signed integer type for global offsets
    typename ReductionOpT,              ///< Binary reduction functor type having member T operator()(const T &a, const T &b)
    typename OutputT>                   ///< Data type of the output
__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1)
__global__ void DeviceReduceSingleTileKernel(
    InputIteratorT          d_in,           ///< [in] Pointer to the input sequence of data items
    OutputIteratorT         d_out,          ///< [out] Pointer to the output aggregate
    OffsetT                 num_items,      ///< [in] Total number of input data items
    ReductionOpT            reduction_op,   ///< [in] Binary reduction functor
    OutputT                 init)           ///< [in] The initial value of the reduction
{
    // Thread block type for reducing input tiles
    typedef AgentReduce<
            typename ChainedPolicyT::ActivePolicy::SingleTilePolicy,
            InputIteratorT,
            OutputIteratorT,
            OffsetT,
            ReductionOpT>
        AgentReduceT;

    // Shared memory storage
    __shared__ typename AgentReduceT::TempStorage temp_storage;

    // Check if empty problem
    if (num_items == 0)
    {
        if (threadIdx.x == 0)
            *d_out = init;
        return;
    }

    // Consume input tiles
    OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange(
        OffsetT(0),
        num_items);

    // Output result
    if (threadIdx.x == 0)
        *d_out = reduction_op(init, block_aggregate);
}


/// Normalize input iterator to segment offset
template <typename T, typename OffsetT, typename IteratorT>
__device__ __forceinline__
void NormalizeReductionOutput(
    T &/*val*/,
    OffsetT /*base_offset*/,
    IteratorT /*itr*/)
{}


/// Normalize input iterator to segment offset (specialized for arg-index iterators)
template <typename KeyValuePairT, typename OffsetT, typename WrappedIteratorT, typename OutputValueT>
__device__ __forceinline__
void NormalizeReductionOutput(
    KeyValuePairT &val,
    OffsetT base_offset,
    ArgIndexInputIterator<WrappedIteratorT, OffsetT, OutputValueT> /*itr*/)
{
    val.key -= base_offset;
}

/**
 * Segmented reduction kernel entry point (one block per segment)
 */
template <
    typename ChainedPolicyT,            ///< Chained tuning policy
    typename InputIteratorT,            ///< Random-access input iterator type for reading input items
    typename OutputIteratorT,           ///< Output iterator type for recording the reduced aggregate
    typename OffsetIteratorT,           ///< Random-access input iterator type for reading segment offsets
    typename OffsetT,                   ///< Signed integer type for global offsets
    typename ReductionOpT,              ///< Binary reduction functor type having member T operator()(const T &a, const T &b)
    typename OutputT>                   ///< Data type of the output
__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS))
__global__ void DeviceSegmentedReduceKernel(
    InputIteratorT          d_in,               ///< [in] Pointer to the input sequence of data items
    OutputIteratorT         d_out,              ///< [out] Pointer to the output aggregates
    OffsetIteratorT         d_begin_offsets,    ///< [in] Sequence of beginning offsets of length num_segments
    OffsetIteratorT         d_end_offsets,      ///< [in] Sequence of ending offsets of length num_segments
    int                     /*num_segments*/,   ///< [in] The number of segments that comprise the reduction data
    ReductionOpT            reduction_op,       ///< [in] Binary reduction functor
    OutputT                 init)               ///< [in] The initial value of the reduction
{
    // Thread block type for reducing input tiles
    typedef AgentReduce<
            typename ChainedPolicyT::ActivePolicy::ReducePolicy,
            InputIteratorT,
            OutputIteratorT,
            OffsetT,
            ReductionOpT>
        AgentReduceT;

    // Shared memory storage
    __shared__ typename AgentReduceT::TempStorage temp_storage;

    OffsetT segment_begin   = d_begin_offsets[blockIdx.x];
    OffsetT segment_end     = d_end_offsets[blockIdx.x];

    // Check if empty problem
    if (segment_begin == segment_end)
    {
        if (threadIdx.x == 0)
            d_out[blockIdx.x] = init;
        return;
    }

    // Consume input tiles
    OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange(
        segment_begin,
        segment_end);

    // Normalize as needed (e.g., convert a global ArgMin/ArgMax index into a segment-relative index)
    NormalizeReductionOutput(block_aggregate, segment_begin, d_in);

    if (threadIdx.x == 0)
        d_out[blockIdx.x] = reduction_op(init, block_aggregate);
}




/******************************************************************************
 * Policy
 ******************************************************************************/

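// The tuning policies below are chained by SM architecture via ChainedPolicy:
// at dispatch time, ChainedPolicy::Invoke() walks the chain from MaxPolicy
// down to the oldest entry and invokes the dispatch functor with the most
// specific policy whose architecture does not exceed the target PTX version,
// so block size, items per thread, reduction algorithm, and cache modifier
// are selected per GPU generation from a single compiled binary.
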
template <
    typename OutputT,           ///< Data type of the reduction
    typename OffsetT,           ///< Signed integer type for global offsets
    typename ReductionOpT>      ///< Binary reduction functor type having member T operator()(const T &a, const T &b)
struct DeviceReducePolicy
{
    //------------------------------------------------------------------------------
    // Architecture-specific tuning policies
    //------------------------------------------------------------------------------

    /// SM13
    struct Policy130 : ChainedPolicy<130, Policy130, Policy130>
    {
        // ReducePolicy
        typedef AgentReducePolicy<
                CUB_SCALED_GRANULARITIES(128, 8, OutputT),  ///< Threads per block, items per thread
                2,                                          ///< Number of items per vectorized load
                BLOCK_REDUCE_RAKING,                        ///< Cooperative block-wide reduction algorithm to use
                LOAD_DEFAULT>                               ///< Cache load modifier
            ReducePolicy;

        // SingleTilePolicy
        typedef ReducePolicy SingleTilePolicy;

        // SegmentedReducePolicy
        typedef ReducePolicy SegmentedReducePolicy;
    };


    /// SM20
    struct Policy200 : ChainedPolicy<200, Policy200, Policy130>
    {
        // ReducePolicy (GTX 580: 178.9 GB/s @ 48M 4B items, 158.1 GB/s @ 192M 1B items)
        typedef AgentReducePolicy<
                CUB_SCALED_GRANULARITIES(128, 8, OutputT),  ///< Threads per block, items per thread
                4,                                          ///< Number of items per vectorized load
                BLOCK_REDUCE_RAKING,                        ///< Cooperative block-wide reduction algorithm to use
                LOAD_DEFAULT>                               ///< Cache load modifier
            ReducePolicy;

        // SingleTilePolicy
        typedef ReducePolicy SingleTilePolicy;

        // SegmentedReducePolicy
        typedef ReducePolicy SegmentedReducePolicy;
    };


    /// SM30
    struct Policy300 : ChainedPolicy<300, Policy300, Policy200>
    {
        // ReducePolicy (GTX670: 154.0 @ 48M 4B items)
        typedef AgentReducePolicy<
                CUB_SCALED_GRANULARITIES(256, 20, OutputT), ///< Threads per block, items per thread
                2,                                          ///< Number of items per vectorized load
                BLOCK_REDUCE_WARP_REDUCTIONS,               ///< Cooperative block-wide reduction algorithm to use
                LOAD_DEFAULT>                               ///< Cache load modifier
            ReducePolicy;

        // SingleTilePolicy
        typedef ReducePolicy SingleTilePolicy;

        // SegmentedReducePolicy
        typedef ReducePolicy SegmentedReducePolicy;
    };


    /// SM35
    struct Policy350 : ChainedPolicy<350, Policy350, Policy300>
    {
        // ReducePolicy (GTX Titan: 255.1 GB/s @ 48M 4B items; 228.7 GB/s @ 192M 1B items)
        typedef AgentReducePolicy<
                CUB_SCALED_GRANULARITIES(256, 20, OutputT), ///< Threads per block, items per thread
                4,                                          ///< Number of items per vectorized load
                BLOCK_REDUCE_WARP_REDUCTIONS,               ///< Cooperative block-wide reduction algorithm to use
                LOAD_LDG>                                   ///< Cache load modifier
            ReducePolicy;

        // SingleTilePolicy
        typedef ReducePolicy SingleTilePolicy;

        // SegmentedReducePolicy
        typedef ReducePolicy SegmentedReducePolicy;
    };


    /// SM60
    struct Policy600 : ChainedPolicy<600, Policy600, Policy350>
    {
        // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items)
        typedef AgentReducePolicy<
                CUB_SCALED_GRANULARITIES(256, 16, OutputT), ///< Threads per block, items per thread
                4,                                          ///< Number of items per vectorized load
                BLOCK_REDUCE_WARP_REDUCTIONS,               ///< Cooperative block-wide reduction algorithm to use
                LOAD_LDG>                                   ///< Cache load modifier
            ReducePolicy;

        // SingleTilePolicy
        typedef ReducePolicy SingleTilePolicy;

        // SegmentedReducePolicy
        typedef ReducePolicy SegmentedReducePolicy;
    };


    /// MaxPolicy
    typedef Policy600 MaxPolicy;
};


/******************************************************************************
 * Single-problem dispatch
 ******************************************************************************/

/**
 * Utility class for dispatching the appropriately-tuned kernels for device-wide reduction
 */
template <
    typename InputIteratorT,    ///< Random-access input iterator type for reading input items
    typename OutputIteratorT,   ///< Output iterator type for recording the reduced aggregate
    typename OffsetT,           ///< Signed integer type for global offsets
    typename ReductionOpT>      ///< Binary reduction functor type having member T operator()(const T &a, const T &b)
struct DispatchReduce :
    DeviceReducePolicy<
        typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT = (if output iterator's value type is void) ?
            typename std::iterator_traits<InputIteratorT>::value_type,                                  // ... then the input iterator's value type,
            typename std::iterator_traits<OutputIteratorT>::value_type>::Type,                          // ... else the output iterator's value type
        OffsetT,
        ReductionOpT>
{
    //------------------------------------------------------------------------------
    // Constants
    //------------------------------------------------------------------------------

    // Data type of output iterator
    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT = (if output iterator's value type is void) ?
        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type


    //------------------------------------------------------------------------------
    // Problem state
    //------------------------------------------------------------------------------

    void                *d_temp_storage;        ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to temp_storage_bytes and no work is done.
    size_t              &temp_storage_bytes;    ///< [in,out] Reference to size in bytes of d_temp_storage allocation
    InputIteratorT      d_in;                   ///< [in] Pointer to the input sequence of data items
    OutputIteratorT     d_out;                  ///< [out] Pointer to the output aggregate
    OffsetT             num_items;              ///< [in] Total number of input items (i.e., length of d_in)
    ReductionOpT        reduction_op;           ///< [in] Binary reduction functor
    OutputT             init;                   ///< [in] The initial value of the reduction
    cudaStream_t        stream;                 ///< [in] CUDA stream to launch kernels within.  Default is stream 0.
    bool                debug_synchronous;      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors
    int                 ptx_version;            ///< [in] PTX version

    //------------------------------------------------------------------------------
    // Constructor
    //------------------------------------------------------------------------------

    /// Constructor
    CUB_RUNTIME_FUNCTION __forceinline__
    DispatchReduce(
        void*           d_temp_storage,
        size_t          &temp_storage_bytes,
        InputIteratorT  d_in,
        OutputIteratorT d_out,
        OffsetT         num_items,
        ReductionOpT    reduction_op,
        OutputT         init,
        cudaStream_t    stream,
        bool            debug_synchronous,
        int             ptx_version)
    :
        d_temp_storage(d_temp_storage),
        temp_storage_bytes(temp_storage_bytes),
        d_in(d_in),
        d_out(d_out),
        num_items(num_items),
        reduction_op(reduction_op),
        init(init),
        stream(stream),
        debug_synchronous(debug_synchronous),
        ptx_version(ptx_version)
    {}


    //------------------------------------------------------------------------------
    // Small-problem (single tile) invocation
    //------------------------------------------------------------------------------

    /// Invoke a single block to reduce in-core
    template <
        typename ActivePolicyT,             ///< Umbrella policy active for the target device
        typename SingleTileKernelT>         ///< Function type of cub::DeviceReduceSingleTileKernel
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t InvokeSingleTile(
        SingleTileKernelT single_tile_kernel)   ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel
    {
#ifndef CUB_RUNTIME_ENABLED
        (void)single_tile_kernel;

        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported);
#else
        cudaError error = cudaSuccess;
        do
        {
            // Return if the caller is simply requesting the size of the storage allocation
            if (d_temp_storage == NULL)
            {
                temp_storage_bytes = 1;
                break;
            }

            // Log single_reduce_sweep_kernel configuration
            if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n",
                ActivePolicyT::SingleTilePolicy::BLOCK_THREADS,
                (long long) stream,
                ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD);

            // Invoke single_reduce_sweep_kernel
            single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
                d_in,
                d_out,
                num_items,
                reduction_op,
                init);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
        }
        while (0);

        return error;

#endif // CUB_RUNTIME_ENABLED
    }


    //------------------------------------------------------------------------------
    // Normal problem size invocation (two-pass)
    //------------------------------------------------------------------------------

    /// Invoke two-passes to reduce
    template <
        typename ActivePolicyT,             ///< Umbrella policy active for the target device
        typename ReduceKernelT,             ///< Function type of cub::DeviceReduceKernel
        typename SingleTileKernelT>         ///< Function type of cub::DeviceReduceSingleTileKernel
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t InvokePasses(
        ReduceKernelT       reduce_kernel,          ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceKernel
        SingleTileKernelT   single_tile_kernel)     ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel
    {
#ifndef CUB_RUNTIME_ENABLED
        (void) reduce_kernel;
        (void) single_tile_kernel;

        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported);
#else

        cudaError error = cudaSuccess;
        do
        {
            // Get device ordinal
            int device_ordinal;
            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;

            // Get SM count
            int sm_count;
            if (CubDebug(error = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;

            // Init regular kernel configuration
            KernelConfig reduce_config;
            if (CubDebug(error = reduce_config.Init<typename ActivePolicyT::ReducePolicy>(reduce_kernel))) break;
            int reduce_device_occupancy = reduce_config.sm_occupancy * sm_count;

            // Even-share work distribution
            int max_blocks = reduce_device_occupancy * CUB_SUBSCRIPTION_FACTOR(ptx_version);
            GridEvenShare<OffsetT> even_share;
            even_share.DispatchInit(num_items, max_blocks, reduce_config.tile_size);

            // Temporary storage allocation requirements
            void* allocations[1];
            size_t allocation_sizes[1] =
            {
                max_blocks * sizeof(OutputT)    // bytes needed for privatized block reductions
            };

            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
            if (d_temp_storage == NULL)
            {
                // Return if the caller is simply requesting the size of the storage allocation
                return cudaSuccess;
            }

            // Alias the allocation for the privatized per-block reductions
            OutputT *d_block_reductions = (OutputT*) allocations[0];

            // Get grid size for device_reduce_sweep_kernel
            int reduce_grid_size = even_share.grid_size;

            // Log device_reduce_sweep_kernel configuration
            if (debug_synchronous) _CubLog("Invoking DeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                reduce_grid_size,
                ActivePolicyT::ReducePolicy::BLOCK_THREADS,
                (long long) stream,
                ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD,
                reduce_config.sm_occupancy);

            // Invoke DeviceReduceKernel
            reduce_kernel<<<reduce_grid_size, ActivePolicyT::ReducePolicy::BLOCK_THREADS, 0, stream>>>(
                d_in,
                d_block_reductions,
                num_items,
                even_share,
                reduction_op);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Log single_reduce_sweep_kernel configuration
            if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n",
                ActivePolicyT::SingleTilePolicy::BLOCK_THREADS,
                (long long) stream,
                ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD);

            // Invoke DeviceReduceSingleTileKernel (reduce the privatized per-block partials into the final aggregate)
            single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
                d_block_reductions,
                d_out,
                reduce_grid_size,
                reduction_op,
                init);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
        }
        while (0);

        return error;

#endif // CUB_RUNTIME_ENABLED

    }


    //------------------------------------------------------------------------------
    // Chained policy invocation
    //------------------------------------------------------------------------------

    template <typename ActivePolicyT>
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t Invoke()
    {
        typedef typename ActivePolicyT::SingleTilePolicy    SingleTilePolicyT;
        typedef typename DispatchReduce::MaxPolicy          MaxPolicyT;

        // Force kernel code-generation in all compiler passes
        if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD))
        {
            // Small, single tile size
            return InvokeSingleTile<ActivePolicyT>(
                DeviceReduceSingleTileKernel<MaxPolicyT, InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT, OutputT>);
        }
        else
        {
            // Regular size
            return InvokePasses<ActivePolicyT>(
                DeviceReduceKernel<typename DispatchReduce::MaxPolicy, InputIteratorT, OutputT*, OffsetT, ReductionOpT>,
                DeviceReduceSingleTileKernel<MaxPolicyT, OutputT*, OutputIteratorT, OffsetT, ReductionOpT, OutputT>);
        }
    }


    //------------------------------------------------------------------------------
    // Dispatch entrypoints
    //------------------------------------------------------------------------------

    /**
     * Internal dispatch routine for computing a device-wide reduction
     */
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void            *d_temp_storage,        ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to temp_storage_bytes and no work is done.
        size_t          &temp_storage_bytes,    ///< [in,out] Reference to size in bytes of d_temp_storage allocation
        InputIteratorT  d_in,                   ///< [in] Pointer to the input sequence of data items
        OutputIteratorT d_out,                  ///< [out] Pointer to the output aggregate
        OffsetT         num_items,              ///< [in] Total number of input items (i.e., length of d_in)
        ReductionOpT    reduction_op,           ///< [in] Binary reduction functor
        OutputT         init,                   ///< [in] The initial value of the reduction
        cudaStream_t    stream,                 ///< [in] CUDA stream to launch kernels within.  Default is stream 0.
        bool            debug_synchronous)      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors
    {
        typedef typename DispatchReduce::MaxPolicy MaxPolicyT;

        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version;
            if (CubDebug(error = PtxVersion(ptx_version))) break;

            // Create dispatch functor
            DispatchReduce dispatch(
                d_temp_storage, temp_storage_bytes,
                d_in, d_out, num_items, reduction_op, init,
                stream, debug_synchronous, ptx_version);

            // Dispatch to chained policy
            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
        }
        while (0);

        return error;
    }
};



/******************************************************************************
 * Segmented dispatch
 ******************************************************************************/

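// The segmented path below launches DeviceSegmentedReduceKernel with one
// thread block per segment (grid size == num_segments), so no per-block
// temporary storage is required; only the minimal 1-byte allocation used by
// the storage-size query protocol is requested.
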
/**
 * Utility class for dispatching the appropriately-tuned kernels for device-wide segmented reduction
 */
template <
    typename InputIteratorT,    ///< Random-access input iterator type for reading input items
    typename OutputIteratorT,   ///< Output iterator type for recording the reduced aggregates
    typename OffsetIteratorT,   ///< Random-access input iterator type for reading segment offsets
    typename OffsetT,           ///< Signed integer type for global offsets
    typename ReductionOpT>      ///< Binary reduction functor type having member T operator()(const T &a, const T &b)
struct DispatchSegmentedReduce :
    DeviceReducePolicy<
        typename std::iterator_traits<InputIteratorT>::value_type,
        OffsetT,
        ReductionOpT>
{
    //------------------------------------------------------------------------------
    // Constants
    //------------------------------------------------------------------------------

    /// The output value type
    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT = (if output iterator's value type is void) ?
        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type


    //------------------------------------------------------------------------------
    // Problem state
    //------------------------------------------------------------------------------

    void                *d_temp_storage;        ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to temp_storage_bytes and no work is done.
    size_t              &temp_storage_bytes;    ///< [in,out] Reference to size in bytes of d_temp_storage allocation
    InputIteratorT      d_in;                   ///< [in] Pointer to the input sequence of data items
    OutputIteratorT     d_out;                  ///< [out] Pointer to the output aggregates
    OffsetT             num_segments;           ///< [in] The number of segments that comprise the reduction data
    OffsetIteratorT     d_begin_offsets;        ///< [in] Sequence of beginning offsets of length num_segments
    OffsetIteratorT     d_end_offsets;          ///< [in] Sequence of ending offsets of length num_segments
    ReductionOpT        reduction_op;           ///< [in] Binary reduction functor
    OutputT             init;                   ///< [in] The initial value of the reduction
    cudaStream_t        stream;                 ///< [in] CUDA stream to launch kernels within.  Default is stream 0.
    bool                debug_synchronous;      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors
    int                 ptx_version;            ///< [in] PTX version

    //------------------------------------------------------------------------------
    // Constructor
    //------------------------------------------------------------------------------

    /// Constructor
    CUB_RUNTIME_FUNCTION __forceinline__
    DispatchSegmentedReduce(
        void*           d_temp_storage,
        size_t          &temp_storage_bytes,
        InputIteratorT  d_in,
        OutputIteratorT d_out,
        OffsetT         num_segments,
        OffsetIteratorT d_begin_offsets,
        OffsetIteratorT d_end_offsets,
        ReductionOpT    reduction_op,
        OutputT         init,
        cudaStream_t    stream,
        bool            debug_synchronous,
        int             ptx_version)
    :
        d_temp_storage(d_temp_storage),
        temp_storage_bytes(temp_storage_bytes),
        d_in(d_in),
        d_out(d_out),
        num_segments(num_segments),
        d_begin_offsets(d_begin_offsets),
        d_end_offsets(d_end_offsets),
        reduction_op(reduction_op),
        init(init),
        stream(stream),
        debug_synchronous(debug_synchronous),
        ptx_version(ptx_version)
    {}


    //------------------------------------------------------------------------------
    // Chained policy invocation
    //------------------------------------------------------------------------------

    /// Invocation
    template <
        typename ActivePolicyT,                 ///< Umbrella policy active for the target device
        typename DeviceSegmentedReduceKernelT>  ///< Function type of cub::DeviceSegmentedReduceKernel
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t InvokePasses(
        DeviceSegmentedReduceKernelT segmented_reduce_kernel)  ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentedReduceKernel
    {
#ifndef CUB_RUNTIME_ENABLED
        (void)segmented_reduce_kernel;
        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported);
#else
        cudaError error = cudaSuccess;
        do
        {
            // Return if the caller is simply requesting the size of the storage allocation
            if (d_temp_storage == NULL)
            {
                temp_storage_bytes = 1;
                return cudaSuccess;
            }

            // Init kernel configuration
            KernelConfig segmented_reduce_config;
            if (CubDebug(error = segmented_reduce_config.Init<typename ActivePolicyT::SegmentedReducePolicy>(segmented_reduce_kernel))) break;

            // Log device_reduce_sweep_kernel configuration
            if (debug_synchronous) _CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                num_segments,
                ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS,
                (long long) stream,
                ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD,
                segmented_reduce_config.sm_occupancy);

            // Invoke DeviceSegmentedReduceKernel (one thread block per segment)
            segmented_reduce_kernel<<<num_segments, ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, 0, stream>>>(
                d_in,
                d_out,
                d_begin_offsets,
                d_end_offsets,
                num_segments,
                reduction_op,
                init);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
        }
        while (0);

        return error;

#endif // CUB_RUNTIME_ENABLED

    }


    template <typename ActivePolicyT>
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t Invoke()
    {
        typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT;

        // Force kernel code-generation in all compiler passes
        return InvokePasses<ActivePolicyT>(
            DeviceSegmentedReduceKernel<MaxPolicyT, InputIteratorT, OutputIteratorT, OffsetIteratorT, OffsetT, ReductionOpT, OutputT>);
    }


    //------------------------------------------------------------------------------
    // Dispatch entrypoints
    //------------------------------------------------------------------------------

    /**
     * Internal dispatch routine for computing a device-wide segmented reduction
     */
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void            *d_temp_storage,        ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to temp_storage_bytes and no work is done.
        size_t          &temp_storage_bytes,    ///< [in,out] Reference to size in bytes of d_temp_storage allocation
        InputIteratorT  d_in,                   ///< [in] Pointer to the input sequence of data items
        OutputIteratorT d_out,                  ///< [out] Pointer to the output aggregates
        int             num_segments,           ///< [in] The number of segments that comprise the reduction data
        OffsetIteratorT d_begin_offsets,        ///< [in] Sequence of beginning offsets of length num_segments
        OffsetIteratorT d_end_offsets,          ///< [in] Sequence of ending offsets of length num_segments
        ReductionOpT    reduction_op,           ///< [in] Binary reduction functor
        OutputT         init,                   ///< [in] The initial value of the reduction
        cudaStream_t    stream,                 ///< [in] CUDA stream to launch kernels within.  Default is stream 0.
        bool            debug_synchronous)      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors
    {
        typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT;

        if (num_segments <= 0)
            return cudaSuccess;

        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version;
            if (CubDebug(error = PtxVersion(ptx_version))) break;

            // Create dispatch functor
            DispatchSegmentedReduce dispatch(
                d_temp_storage, temp_storage_bytes,
                d_in, d_out,
                num_segments, d_begin_offsets, d_end_offsets,
                reduction_op, init,
                stream, debug_synchronous, ptx_version);

            // Dispatch to chained policy
            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
        }
        while (0);

        return error;
    }
};



}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)

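For context, a minimal usage sketch (not part of dispatch_reduce.cuh) showing how this dispatch layer is normally reached through the public cub::DeviceReduce front-end, which forwards to DispatchReduce::Dispatch above. The two-call pattern mirrors the d_temp_storage == NULL branches in InvokeSingleTile and InvokePasses: the first call only writes the required temporary-storage size, the second performs the reduction. Error checking is omitted for brevity.

#include <cub/cub.cuh>   // public front-end; DeviceReduce::Sum forwards to DispatchReduce::Dispatch

// Sum 'num_items' floats from d_in (device pointer) into *d_out (device pointer).
inline void ExampleSum(const float *d_in, float *d_out, int num_items)
{
    void   *d_temp_storage     = NULL;
    size_t  temp_storage_bytes = 0;

    // Pass 1: d_temp_storage == NULL, so only temp_storage_bytes is written
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);

    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    // Pass 2: performs the reduction (single-tile or two-pass, selected by DispatchReduce::Invoke)
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);

    cudaFree(d_temp_storage);
}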