OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
dispatch_histogram.cuh
Go to the documentation of this file.
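This header is OpenFPM's bundled copy of CUB's histogram dispatch layer: it defines the histogram initialization and sweep kernels together with the DispatchEven/DispatchRange entry points that the public cub::DeviceHistogram front end forwards to. As a minimal usage sketch (not part of this file; the function example() and the 256-bin setup are illustrative assumptions, and the standard cub/cub.cuh umbrella header is assumed), the dispatch below is normally reached through cub::DeviceHistogram with the usual two-phase temporary-storage idiom:

#include <cub/cub.cuh>

// Sketch: build a 256-bin histogram of float samples in [0, 1).
// DeviceHistogram::HistogramEven forwards to DispatchHistogram's DispatchEven path below.
void example(const float* d_samples, int num_samples, unsigned int* d_histogram, cudaStream_t stream)
{
    int   num_levels  = 257;     // 256 bins require 257 boundary levels
    float lower_level = 0.0f;
    float upper_level = 1.0f;

    // First call: query the amount of temporary storage required.
    void*  d_temp_storage     = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples, stream);

    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    // Second call: launch the init and sweep kernels on the given stream.
    cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples, stream);

    cudaFree(d_temp_storage);
}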
1
2/******************************************************************************
3 * Copyright (c) 2011, Duane Merrill. All rights reserved.
4 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of the NVIDIA CORPORATION nor the
14 * names of its contributors may be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 *
28 ******************************************************************************/
29
30/**
31 * \file
32 * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory.
33 */
34
35#pragma once
36
37#include <stdio.h>
38#include <iterator>
39#include <limits>
40
41#include "../../agent/agent_histogram.cuh"
42#include "../../util_debug.cuh"
43#include "../../util_device.cuh"
44#include "../../thread/thread_search.cuh"
45#include "../../grid/grid_queue.cuh"
46#include "../../util_namespace.cuh"
47
49CUB_NS_PREFIX
50
52namespace cub {
53
54
55
56/******************************************************************************
57 * Histogram kernel entry points
58 *****************************************************************************/
59
63template <
64 int NUM_ACTIVE_CHANNELS,
65 typename CounterT,
66 typename OffsetT>
67__global__ void DeviceHistogramInitKernel(
68 ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_output_bins_wrapper,
69 ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_output_histograms_wrapper,
70 GridQueue<int> tile_queue)
71{
72 if ((threadIdx.x == 0) && (blockIdx.x == 0))
73 tile_queue.ResetDrain();
74
75 int output_bin = (blockIdx.x * blockDim.x) + threadIdx.x;
76
77 #pragma unroll
78 for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
79 {
80 if (output_bin < num_output_bins_wrapper.array[CHANNEL])
81 d_output_histograms_wrapper.array[CHANNEL][output_bin] = 0;
82 }
83}
84
85
89template <
90 typename AgentHistogramPolicyT,
91 int PRIVATIZED_SMEM_BINS,
92 int NUM_CHANNELS,
93 int NUM_ACTIVE_CHANNELS,
94 typename SampleIteratorT,
95 typename CounterT,
96 typename PrivatizedDecodeOpT,
97 typename OutputDecodeOpT,
98 typename OffsetT>
99__launch_bounds__ (int(AgentHistogramPolicyT::BLOCK_THREADS))
100__global__ void DeviceHistogramSweepKernel(
101 SampleIteratorT d_samples,
102 ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_output_bins_wrapper,
103 ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_privatized_bins_wrapper,
104 ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_output_histograms_wrapper,
105 ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_privatized_histograms_wrapper,
106 ArrayWrapper<OutputDecodeOpT, NUM_ACTIVE_CHANNELS> output_decode_op_wrapper,
107 ArrayWrapper<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS> privatized_decode_op_wrapper,
108 OffsetT num_row_pixels,
109 OffsetT num_rows,
110 OffsetT row_stride_samples,
111 int tiles_per_row,
112 GridQueue<int> tile_queue)
113{
114 // Thread block type for compositing input tiles
115 typedef AgentHistogram<
116 AgentHistogramPolicyT,
117 PRIVATIZED_SMEM_BINS,
118 NUM_CHANNELS,
119 NUM_ACTIVE_CHANNELS,
120 SampleIteratorT,
121 CounterT,
122 PrivatizedDecodeOpT,
123 OutputDecodeOpT,
124 OffsetT>
125 AgentHistogramT;
126
127 // Shared memory for AgentHistogram
128 __shared__ typename AgentHistogramT::TempStorage temp_storage;
129
130 AgentHistogramT agent(
131 temp_storage,
132 d_samples,
133 num_output_bins_wrapper.array,
134 num_privatized_bins_wrapper.array,
135 d_output_histograms_wrapper.array,
136 d_privatized_histograms_wrapper.array,
137 output_decode_op_wrapper.array,
138 privatized_decode_op_wrapper.array);
139
140 // Initialize counters
141 agent.InitBinCounters();
142
143 // Consume input tiles
144 agent.ConsumeTiles(
145 num_row_pixels,
146 num_rows,
147 row_stride_samples,
148 tiles_per_row,
149 tile_queue);
150
151 // Store output to global (if necessary)
152 agent.StoreOutput();
153
154}
155
156
157
158
159
160
161/******************************************************************************
162 * Dispatch
163 ******************************************************************************/
164
168template <
169 int NUM_CHANNELS,
170 int NUM_ACTIVE_CHANNELS,
171 typename SampleIteratorT,
172 typename CounterT,
173 typename LevelT,
174 typename OffsetT>
176{
177 //---------------------------------------------------------------------
178 // Types and constants
179 //---------------------------------------------------------------------
180
182 typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
183
184 enum
185 {
186 // Maximum number of bins per channel for which we will use a privatized smem strategy
187 MAX_PRIVATIZED_SMEM_BINS = 256
188 };
189
190
191 //---------------------------------------------------------------------
192 // Transform functors for converting samples to bin-ids
193 //---------------------------------------------------------------------
194
195 // Searches for bin given a list of bin-boundary levels
196 template <typename LevelIteratorT>
197 struct SearchTransform
198 {
199 LevelIteratorT d_levels; // Pointer to levels array
200 int num_output_levels; // Number of levels in array
201
202 // Initializer
203 __host__ __device__ __forceinline__ void Init(
204 LevelIteratorT d_levels, // Pointer to levels array
205 int num_output_levels) // Number of levels in array
206 {
207 this->d_levels = d_levels;
208 this->num_output_levels = num_output_levels;
209 }
210
211 // Method for converting samples to bin-ids
212 template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
213 __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
214 {
215 // Level iterator wrapper type
216 typedef typename If<IsPointer<LevelIteratorT>::VALUE,
217 CacheModifiedInputIterator<LOAD_MODIFIER, LevelT, OffsetT>, // Wrap the native input pointer with CacheModifiedInputIterator
218 LevelIteratorT>::Type // Directly use the supplied input iterator type
219 WrappedLevelIteratorT;
220
221 WrappedLevelIteratorT wrapped_levels(d_levels);
222
223 int num_bins = num_output_levels - 1;
224 if (valid)
225 {
226 bin = UpperBound(wrapped_levels, num_output_levels, (LevelT) sample) - 1;
227 if (bin >= num_bins)
228 bin = -1;
229 }
230 }
231 };
232
233
234 // Scales samples to evenly-spaced bins
235 struct ScaleTransform
236 {
237 int num_bins; // Number of bins (levels - 1)
238 LevelT max; // Max sample level (exclusive)
239 LevelT min; // Min sample level (inclusive)
240 LevelT scale; // Bin scaling factor
241
242 // Initializer
243 template <typename _LevelT>
244 __host__ __device__ __forceinline__ void Init(
245 int num_output_levels, // Number of levels in array
246 _LevelT max, // Max sample level (exclusive)
247 _LevelT min, // Min sample level (inclusive)
248 _LevelT scale) // Bin scaling factor
249 {
250 this->num_bins = num_output_levels - 1;
251 this->max = max;
252 this->min = min;
253 this->scale = scale;
254 }
255
256 // Initializer (float specialization)
257 __host__ __device__ __forceinline__ void Init(
258 int num_output_levels, // Number of levels in array
259 float max, // Max sample level (exclusive)
260 float min, // Min sample level (inclusive)
261 float scale) // Bin scaling factor
262 {
263 this->num_bins = num_output_levels - 1;
264 this->max = max;
265 this->min = min;
266 this->scale = float(1.0) / scale;
267 }
268
269 // Initializer (double specialization)
270 __host__ __device__ __forceinline__ void Init(
271 int num_output_levels, // Number of levels in array
272 double max, // Max sample level (exclusive)
273 double min, // Min sample level (inclusive)
274 double scale) // Bin scaling factor
275 {
276 this->num_bins = num_output_levels - 1;
277 this->max = max;
278 this->min = min;
279 this->scale = double(1.0) / scale;
280 }
281
282 // Method for converting samples to bin-ids
283 template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
284 __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
285 {
286 LevelT level_sample = (LevelT) sample;
287
288 if (valid && (level_sample >= min) && (level_sample < max))
289 bin = (int) ((level_sample - min) / scale);
290 }
291
292 // Method for converting samples to bin-ids (float specialization)
293 template <CacheLoadModifier LOAD_MODIFIER>
294 __host__ __device__ __forceinline__ void BinSelect(float sample, int &bin, bool valid)
295 {
296 LevelT level_sample = (LevelT) sample;
297
298 if (valid && (level_sample >= min) && (level_sample < max))
299 bin = (int) ((level_sample - min) * scale);
300 }
301
302 // Method for converting samples to bin-ids (double specialization)
303 template <CacheLoadModifier LOAD_MODIFIER>
304 __host__ __device__ __forceinline__ void BinSelect(double sample, int &bin, bool valid)
305 {
306 LevelT level_sample = (LevelT) sample;
307
308 if (valid && (level_sample >= min) && (level_sample < max))
309 bin = (int) ((level_sample - min) * scale);
310 }
311 };
312
313
314 // Pass-through bin transform operator
315 struct PassThruTransform
316 {
317 // Method for converting samples to bin-ids
318 template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
319 __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
320 {
321 if (valid)
322 bin = (int) sample;
323 }
324 };
325
326
327
328 //---------------------------------------------------------------------
329 // Tuning policies
330 //---------------------------------------------------------------------
331
332 template <int NOMINAL_ITEMS_PER_THREAD>
333 struct TScale
334 {
335 enum
336 {
337 V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int),
338 VALUE = CUB_MAX((NOMINAL_ITEMS_PER_THREAD / NUM_ACTIVE_CHANNELS / V_SCALE), 1)
339 };
340 };
341
342
344 struct Policy110
345 {
346 // HistogramSweepPolicy
347 typedef AgentHistogramPolicy<
348 512,
349 (NUM_CHANNELS == 1) ? 8 : 2,
350 BLOCK_LOAD_DIRECT,
351 LOAD_DEFAULT,
352 true,
353 GMEM,
354 false>
355 HistogramSweepPolicy;
356 };
357
359 struct Policy200
360 {
361 // HistogramSweepPolicy
362 typedef AgentHistogramPolicy<
363 (NUM_CHANNELS == 1) ? 256 : 128,
364 (NUM_CHANNELS == 1) ? 8 : 3,
365 (NUM_CHANNELS == 1) ? BLOCK_LOAD_DIRECT : BLOCK_LOAD_WARP_TRANSPOSE,
366 LOAD_DEFAULT,
367 true,
368 SMEM,
369 false>
370 HistogramSweepPolicy;
371 };
372
374 struct Policy300
375 {
376 // HistogramSweepPolicy
377 typedef AgentHistogramPolicy<
378 512,
379 (NUM_CHANNELS == 1) ? 8 : 2,
380 BLOCK_LOAD_DIRECT,
381 LOAD_DEFAULT,
382 true,
383 GMEM,
384 false>
385 HistogramSweepPolicy;
386 };
387
389 struct Policy350
390 {
391 // HistogramSweepPolicy
392 typedef AgentHistogramPolicy<
393 128,
394 TScale<8>::VALUE,
395 BLOCK_LOAD_DIRECT,
396 LOAD_LDG,
397 true,
398 BLEND,
399 true>
400 HistogramSweepPolicy;
401 };
402
404 struct Policy500
405 {
406 // HistogramSweepPolicy
407 typedef AgentHistogramPolicy<
408 384,
409 TScale<16>::VALUE,
410 BLOCK_LOAD_DIRECT,
411 LOAD_LDG,
412 true,
413 SMEM,
414 false>
415 HistogramSweepPolicy;
416 };
417
418
419
420 //---------------------------------------------------------------------
421 // Tuning policies of current PTX compiler pass
422 //---------------------------------------------------------------------
423
424#if (CUB_PTX_ARCH >= 500)
425 typedef Policy500 PtxPolicy;
426
427#elif (CUB_PTX_ARCH >= 350)
428 typedef Policy350 PtxPolicy;
429
430#elif (CUB_PTX_ARCH >= 300)
431 typedef Policy300 PtxPolicy;
432
433#elif (CUB_PTX_ARCH >= 200)
434 typedef Policy200 PtxPolicy;
435
436#else
437 typedef Policy110 PtxPolicy;
438
439#endif
440
441 // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
442 struct PtxHistogramSweepPolicy : PtxPolicy::HistogramSweepPolicy {};
443
444
445 //---------------------------------------------------------------------
446 // Utilities
447 //---------------------------------------------------------------------
448
452 template <typename KernelConfig>
453 CUB_RUNTIME_FUNCTION __forceinline__
454 static cudaError_t InitConfigs(
455 int ptx_version,
456 KernelConfig &histogram_sweep_config)
457 {
458 #if (CUB_PTX_ARCH > 0)
459
460 // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
461 return histogram_sweep_config.template Init<PtxHistogramSweepPolicy>();
462
463 #else
464
465 // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
466 if (ptx_version >= 500)
467 {
468 return histogram_sweep_config.template Init<typename Policy500::HistogramSweepPolicy>();
469 }
470 else if (ptx_version >= 350)
471 {
472 return histogram_sweep_config.template Init<typename Policy350::HistogramSweepPolicy>();
473 }
474 else if (ptx_version >= 300)
475 {
476 return histogram_sweep_config.template Init<typename Policy300::HistogramSweepPolicy>();
477 }
478 else if (ptx_version >= 200)
479 {
480 return histogram_sweep_config.template Init<typename Policy200::HistogramSweepPolicy>();
481 }
482 else if (ptx_version >= 110)
483 {
484 return histogram_sweep_config.template Init<typename Policy110::HistogramSweepPolicy>();
485 }
486 else
487 {
488 // No global atomic support
489 return cudaErrorNotSupported;
490 }
491
492 #endif
493 }
494
495
499 struct KernelConfig
500 {
501 int block_threads;
502 int pixels_per_thread;
503
504 template <typename BlockPolicy>
505 CUB_RUNTIME_FUNCTION __forceinline__
506 cudaError_t Init()
507 {
508 block_threads = BlockPolicy::BLOCK_THREADS;
509 pixels_per_thread = BlockPolicy::PIXELS_PER_THREAD;
510
511 return cudaSuccess;
512 }
513 };
514
515
516 //---------------------------------------------------------------------
517 // Dispatch entrypoints
518 //---------------------------------------------------------------------
519
523 template <
524 typename PrivatizedDecodeOpT,
525 typename OutputDecodeOpT,
526 typename DeviceHistogramInitKernelT,
527 typename DeviceHistogramSweepKernelT>
528 CUB_RUNTIME_FUNCTION __forceinline__
529 static cudaError_t PrivatizedDispatch(
530 void* d_temp_storage,
531 size_t& temp_storage_bytes,
532 SampleIteratorT d_samples,
533 CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS],
534 int num_privatized_levels[NUM_ACTIVE_CHANNELS],
535 PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS],
536 int num_output_levels[NUM_ACTIVE_CHANNELS],
537 OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS],
538 int max_num_output_bins,
539 OffsetT num_row_pixels,
540 OffsetT num_rows,
541 OffsetT row_stride_samples,
542 DeviceHistogramInitKernelT histogram_init_kernel,
543 DeviceHistogramSweepKernelT histogram_sweep_kernel,
544 KernelConfig histogram_sweep_config,
545 cudaStream_t stream,
546 bool debug_synchronous)
547 {
548 #ifndef CUB_RUNTIME_ENABLED
549
550 // Kernel launch not supported from this device
551 return CubDebug(cudaErrorNotSupported);
552
553 #else
554
555 cudaError error = cudaSuccess;
556 do
557 {
558 // Get device ordinal
559 int device_ordinal;
560 if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
561
562 // Get SM count
563 int sm_count;
564 if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
565
566 // Get SM occupancy for histogram_sweep_kernel
567 int histogram_sweep_sm_occupancy;
568 if (CubDebug(error = MaxSmOccupancy(
569 histogram_sweep_sm_occupancy,
570 histogram_sweep_kernel,
571 histogram_sweep_config.block_threads))) break;
572
573 // Get device occupancy for histogram_sweep_kernel
574 int histogram_sweep_occupancy = histogram_sweep_sm_occupancy * sm_count;
575
576 if (num_row_pixels * NUM_CHANNELS == row_stride_samples)
577 {
578 // Treat as a single linear array of samples
579 num_row_pixels *= num_rows;
580 num_rows = 1;
581 row_stride_samples = num_row_pixels * NUM_CHANNELS;
582 }
583
584 // Get grid dimensions, trying to keep total blocks ~histogram_sweep_occupancy
585 int pixels_per_tile = histogram_sweep_config.block_threads * histogram_sweep_config.pixels_per_thread;
586 int tiles_per_row = int(num_row_pixels + pixels_per_tile - 1) / pixels_per_tile;
587 int blocks_per_row = CUB_MIN(histogram_sweep_occupancy, tiles_per_row);
588 int blocks_per_col = (blocks_per_row > 0) ?
589 int(CUB_MIN(histogram_sweep_occupancy / blocks_per_row, num_rows)) :
590 0;
591 int num_thread_blocks = blocks_per_row * blocks_per_col;
592
593 dim3 sweep_grid_dims;
594 sweep_grid_dims.x = (unsigned int) blocks_per_row;
595 sweep_grid_dims.y = (unsigned int) blocks_per_col;
596 sweep_grid_dims.z = 1;
597
598 // Temporary storage allocation requirements
599 const int NUM_ALLOCATIONS = NUM_ACTIVE_CHANNELS + 1;
600 void* allocations[NUM_ALLOCATIONS];
601 size_t allocation_sizes[NUM_ALLOCATIONS];
602
603 for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
604 allocation_sizes[CHANNEL] = size_t(num_thread_blocks) * (num_privatized_levels[CHANNEL] - 1) * sizeof(CounterT);
605
606 allocation_sizes[NUM_ALLOCATIONS - 1] = GridQueue<int>::AllocationSize();
607
608 // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
609 if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
610 if (d_temp_storage == NULL)
611 {
612 // Return if the caller is simply requesting the size of the storage allocation
613 break;
614 }
615
616 // Construct the grid queue descriptor
617 GridQueue<int> tile_queue(allocations[NUM_ALLOCATIONS - 1]);
618
619 // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters)
620 ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_output_histograms_wrapper;
621 for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
622 d_output_histograms_wrapper.array[CHANNEL] = d_output_histograms[CHANNEL];
623
624 // Setup array wrapper for privatized per-block histogram channel output (because we can't pass static arrays as kernel parameters)
625 ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_privatized_histograms_wrapper;
626 for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
627 d_privatized_histograms_wrapper.array[CHANNEL] = (CounterT*) allocations[CHANNEL];
628
629 // Setup array wrapper for sweep bin transforms (because we can't pass static arrays as kernel parameters)
630 ArrayWrapper<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS> privatized_decode_op_wrapper;
631 for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
632 privatized_decode_op_wrapper.array[CHANNEL] = privatized_decode_op[CHANNEL];
633
634 // Setup array wrapper for aggregation bin transforms (because we can't pass static arrays as kernel parameters)
635 ArrayWrapper<OutputDecodeOpT, NUM_ACTIVE_CHANNELS> output_decode_op_wrapper;
636 for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
637 output_decode_op_wrapper.array[CHANNEL] = output_decode_op[CHANNEL];
638
639 // Setup array wrapper for num privatized bins (because we can't pass static arrays as kernel parameters)
640 ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_privatized_bins_wrapper;
641 for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
642 num_privatized_bins_wrapper.array[CHANNEL] = num_privatized_levels[CHANNEL] - 1;
643
644 // Setup array wrapper for num output bins (because we can't pass static arrays as kernel parameters)
645 ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_output_bins_wrapper;
646 for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
647 num_output_bins_wrapper.array[CHANNEL] = num_output_levels[CHANNEL] - 1;
648
649 int histogram_init_block_threads = 256;
650 int histogram_init_grid_dims = (max_num_output_bins + histogram_init_block_threads - 1) / histogram_init_block_threads;
651
652 // Log DeviceHistogramInitKernel configuration
653 if (debug_synchronous) _CubLog("Invoking DeviceHistogramInitKernel<<<%d, %d, 0, %lld>>>()\n",
654 histogram_init_grid_dims, histogram_init_block_threads, (long long) stream);
655
656 // Invoke histogram_init_kernel
657 histogram_init_kernel<<<histogram_init_grid_dims, histogram_init_block_threads, 0, stream>>>(
658 num_output_bins_wrapper,
659 d_output_histograms_wrapper,
660 tile_queue);
661
662 // Return if empty problem
663 if ((blocks_per_row == 0) || (blocks_per_col == 0))
664 break;
665
666 // Log histogram_sweep_kernel configuration
667 if (debug_synchronous) _CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels per thread, %d SM occupancy\n",
668 sweep_grid_dims.x, sweep_grid_dims.y, sweep_grid_dims.z,
669 histogram_sweep_config.block_threads, (long long) stream, histogram_sweep_config.pixels_per_thread, histogram_sweep_sm_occupancy);
670
671 // Invoke histogram_sweep_kernel
672 histogram_sweep_kernel<<<sweep_grid_dims, histogram_sweep_config.block_threads, 0, stream>>>(
673 d_samples,
674 num_output_bins_wrapper,
675 num_privatized_bins_wrapper,
676 d_output_histograms_wrapper,
677 d_privatized_histograms_wrapper,
678 output_decode_op_wrapper,
679 privatized_decode_op_wrapper,
680 num_row_pixels,
681 num_rows,
682 row_stride_samples,
683 tiles_per_row,
684 tile_queue);
685
686 // Check for failure to launch
687 if (CubDebug(error = cudaPeekAtLastError())) break;
688
689 // Sync the stream if specified to flush runtime errors
690 if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
691
692 }
693 while (0);
694
695 return error;
696
697 #endif // CUB_RUNTIME_ENABLED
698 }
699
700
701
705 CUB_RUNTIME_FUNCTION
706 static cudaError_t DispatchRange(
707 void* d_temp_storage,
708 size_t& temp_storage_bytes,
709 SampleIteratorT d_samples,
710 CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS],
711 int num_output_levels[NUM_ACTIVE_CHANNELS],
712 LevelT *d_levels[NUM_ACTIVE_CHANNELS],
713 OffsetT num_row_pixels,
714 OffsetT num_rows,
715 OffsetT row_stride_samples,
716 cudaStream_t stream,
717 bool debug_synchronous,
718 Int2Type<false> is_byte_sample)
719 {
720 cudaError error = cudaSuccess;
721 do
722 {
723 // Get PTX version
724 int ptx_version;
725 #if (CUB_PTX_ARCH == 0)
726 if (CubDebug(error = PtxVersion(ptx_version))) break;
727 #else
728 ptx_version = CUB_PTX_ARCH;
729 #endif
730
731 // Get kernel dispatch configurations
732 KernelConfig histogram_sweep_config;
733 if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
734 break;
735
736 // Use the search transform op for converting samples to privatized bins
737 typedef SearchTransform<LevelT*> PrivatizedDecodeOpT;
738
739 // Use the pass-thru transform op for converting privatized bins to output bins
740 typedef PassThruTransform OutputDecodeOpT;
741
742 PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS];
743 OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS];
744 int max_levels = num_output_levels[0];
745
746 for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
747 {
748 privatized_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]);
749 if (num_output_levels[channel] > max_levels)
750 max_levels = num_output_levels[channel];
751 }
752 int max_num_output_bins = max_levels - 1;
753
754 // Dispatch
755 if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS)
756 {
757 // Too many bins to keep in shared memory.
758 const int PRIVATIZED_SMEM_BINS = 0;
759
760 if (CubDebug(error = PrivatizedDispatch(
761 d_temp_storage,
762 temp_storage_bytes,
763 d_samples,
764 d_output_histograms,
765 num_output_levels,
766 privatized_decode_op,
767 num_output_levels,
768 output_decode_op,
769 max_num_output_bins,
770 num_row_pixels,
771 num_rows,
772 row_stride_samples,
773 DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
774 DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
775 histogram_sweep_config,
776 stream,
777 debug_synchronous))) break;
778 }
779 else
780 {
781 // Dispatch shared-privatized approach
782 const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;
783
784 if (CubDebug(error = PrivatizedDispatch(
785 d_temp_storage,
786 temp_storage_bytes,
787 d_samples,
788 d_output_histograms,
789 num_output_levels,
790 privatized_decode_op,
791 num_output_levels,
792 output_decode_op,
793 max_num_output_bins,
794 num_row_pixels,
795 num_rows,
796 row_stride_samples,
797 DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
798 DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
799 histogram_sweep_config,
800 stream,
801 debug_synchronous))) break;
802 }
803
804 } while (0);
805
806 return error;
807 }
808
809
813 CUB_RUNTIME_FUNCTION
814 static cudaError_t DispatchRange(
815 void* d_temp_storage,
816 size_t& temp_storage_bytes,
817 SampleIteratorT d_samples,
818 CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS],
819 int num_output_levels[NUM_ACTIVE_CHANNELS],
820 LevelT *d_levels[NUM_ACTIVE_CHANNELS],
821 OffsetT num_row_pixels,
822 OffsetT num_rows,
823 OffsetT row_stride_samples,
824 cudaStream_t stream,
825 bool debug_synchronous,
826 Int2Type<true> is_byte_sample)
827 {
828 cudaError error = cudaSuccess;
829 do
830 {
831 // Get PTX version
832 int ptx_version;
833 #if (CUB_PTX_ARCH == 0)
834 if (CubDebug(error = PtxVersion(ptx_version))) break;
835 #else
836 ptx_version = CUB_PTX_ARCH;
837 #endif
838
839 // Get kernel dispatch configurations
840 KernelConfig histogram_sweep_config;
841 if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
842 break;
843
844 // Use the pass-thru transform op for converting samples to privatized bins
845 typedef PassThruTransform PrivatizedDecodeOpT;
846
847 // Use the search transform op for converting privatized bins to output bins
848 typedef SearchTransform<LevelT*> OutputDecodeOpT;
849
850 int num_privatized_levels[NUM_ACTIVE_CHANNELS];
851 PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS];
852 OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS];
853 int max_levels = num_output_levels[0]; // Maximum number of levels in any channel
854
855 for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
856 {
857 num_privatized_levels[channel] = 257;
858 output_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]);
859
860 if (num_output_levels[channel] > max_levels)
861 max_levels = num_output_levels[channel];
862 }
863 int max_num_output_bins = max_levels - 1;
864
865 const int PRIVATIZED_SMEM_BINS = 256;
866
867 if (CubDebug(error = PrivatizedDispatch(
868 d_temp_storage,
869 temp_storage_bytes,
870 d_samples,
871 d_output_histograms,
872 num_privatized_levels,
873 privatized_decode_op,
874 num_output_levels,
875 output_decode_op,
876 max_num_output_bins,
877 num_row_pixels,
878 num_rows,
879 row_stride_samples,
880 DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
881 DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
882 histogram_sweep_config,
883 stream,
884 debug_synchronous))) break;
885
886 } while (0);
887
888 return error;
889 }
890
891
895 CUB_RUNTIME_FUNCTION __forceinline__
896 static cudaError_t DispatchEven(
897 void* d_temp_storage,
898 size_t& temp_storage_bytes,
899 SampleIteratorT d_samples,
900 CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS],
901 int num_output_levels[NUM_ACTIVE_CHANNELS],
902 LevelT lower_level[NUM_ACTIVE_CHANNELS],
903 LevelT upper_level[NUM_ACTIVE_CHANNELS],
904 OffsetT num_row_pixels,
905 OffsetT num_rows,
906 OffsetT row_stride_samples,
907 cudaStream_t stream,
908 bool debug_synchronous,
909 Int2Type<false> is_byte_sample)
910 {
911 cudaError error = cudaSuccess;
912 do
913 {
914 // Get PTX version
915 int ptx_version;
916 #if (CUB_PTX_ARCH == 0)
917 if (CubDebug(error = PtxVersion(ptx_version))) break;
918 #else
919 ptx_version = CUB_PTX_ARCH;
920 #endif
921
922 // Get kernel dispatch configurations
923 KernelConfig histogram_sweep_config;
924 if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
925 break;
926
927 // Use the scale transform op for converting samples to privatized bins
928 typedef ScaleTransform PrivatizedDecodeOpT;
929
930 // Use the pass-thru transform op for converting privatized bins to output bins
931 typedef PassThruTransform OutputDecodeOpT;
932
933 PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS];
934 OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS];
935 int max_levels = num_output_levels[0];
936
937 for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
938 {
939 int bins = num_output_levels[channel] - 1;
940 LevelT scale = (upper_level[channel] - lower_level[channel]) / bins;
941
942 privatized_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);
943
944 if (num_output_levels[channel] > max_levels)
945 max_levels = num_output_levels[channel];
946 }
947 int max_num_output_bins = max_levels - 1;
948
949 if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS)
950 {
951 // Too many bins to keep in shared memory.
952 const int PRIVATIZED_SMEM_BINS = 0;
953
954 if (CubDebug(error = PrivatizedDispatch(
955 d_temp_storage,
956 temp_storage_bytes,
957 d_samples,
958 d_output_histograms,
959 num_output_levels,
960 privatized_decode_op,
961 num_output_levels,
962 output_decode_op,
963 max_num_output_bins,
964 num_row_pixels,
965 num_rows,
966 row_stride_samples,
967 DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
968 DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
969 histogram_sweep_config,
970 stream,
971 debug_synchronous))) break;
972 }
973 else
974 {
975 // Dispatch shared-privatized approach
976 const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;
977
978 if (CubDebug(error = PrivatizedDispatch(
979 d_temp_storage,
980 temp_storage_bytes,
981 d_samples,
982 d_output_histograms,
983 num_output_levels,
984 privatized_decode_op,
985 num_output_levels,
986 output_decode_op,
987 max_num_output_bins,
988 num_row_pixels,
989 num_rows,
990 row_stride_samples,
991 DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
992 DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
993 histogram_sweep_config,
994 stream,
995 debug_synchronous))) break;
996 }
997 }
998 while (0);
999
1000 return error;
1001 }
1002
1003
1007 CUB_RUNTIME_FUNCTION __forceinline__
1008 static cudaError_t DispatchEven(
1009 void* d_temp_storage,
1010 size_t& temp_storage_bytes,
1011 SampleIteratorT d_samples,
1012 CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS],
1013 int num_output_levels[NUM_ACTIVE_CHANNELS],
1014 LevelT lower_level[NUM_ACTIVE_CHANNELS],
1015 LevelT upper_level[NUM_ACTIVE_CHANNELS],
1016 OffsetT num_row_pixels,
1017 OffsetT num_rows,
1018 OffsetT row_stride_samples,
1019 cudaStream_t stream,
1020 bool debug_synchronous,
1021 Int2Type<true> is_byte_sample)
1022 {
1023 cudaError error = cudaSuccess;
1024 do
1025 {
1026 // Get PTX version
1027 int ptx_version;
1028 #if (CUB_PTX_ARCH == 0)
1029 if (CubDebug(error = PtxVersion(ptx_version))) break;
1030 #else
1031 ptx_version = CUB_PTX_ARCH;
1032 #endif
1033
1034 // Get kernel dispatch configurations
1035 KernelConfig histogram_sweep_config;
1036 if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
1037 break;
1038
1039 // Use the pass-thru transform op for converting samples to privatized bins
1040 typedef PassThruTransform PrivatizedDecodeOpT;
1041
1042 // Use the scale transform op for converting privatized bins to output bins
1043 typedef ScaleTransform OutputDecodeOpT;
1044
1045 int num_privatized_levels[NUM_ACTIVE_CHANNELS];
1046 PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS];
1047 OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS];
1048 int max_levels = num_output_levels[0];
1049
1050 for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
1051 {
1052 num_privatized_levels[channel] = 257;
1053
1054 int bins = num_output_levels[channel] - 1;
1055 LevelT scale = (upper_level[channel] - lower_level[channel]) / bins;
1056 output_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);
1057
1058 if (num_output_levels[channel] > max_levels)
1059 max_levels = num_output_levels[channel];
1060 }
1061 int max_num_output_bins = max_levels - 1;
1062
1063 const int PRIVATIZED_SMEM_BINS = 256;
1064
1065 if (CubDebug(error = PrivatizedDispatch(
1066 d_temp_storage,
1067 temp_storage_bytes,
1068 d_samples,
1069 d_output_histograms,
1070 num_privatized_levels,
1071 privatized_decode_op,
1072 num_output_levels,
1073 output_decode_op,
1074 max_num_output_bins,
1075 num_row_pixels,
1076 num_rows,
1077 row_stride_samples,
1078 DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
1079 DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
1080 histogram_sweep_config,
1081 stream,
1082 debug_synchronous))) break;
1083
1084 }
1085 while (0);
1086
1087 return error;
1088 }
1089
1090};
1091
1092
1093} // CUB namespace
1094CUB_NS_POSTFIX // Optional outer namespace(s)
1095
1096
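A note on the even-binning path above: DispatchEven computes scale = (upper_level - lower_level) / bins per channel, and ScaleTransform::BinSelect maps each sample to a bin id by a subtract-and-scale (the float and double specializations store the reciprocal of the scale, so the kernel multiplies instead of divides, and out-of-range samples leave the bin id untouched and are not counted). A host-side sketch of the same arithmetic, with even_bin_of as a hypothetical illustrative helper that is not part of this file:

// Mirrors ScaleTransform::BinSelect for floating-point samples.
inline int even_bin_of(float sample, float lower, float upper, int num_bins)
{
    if (sample < lower || sample >= upper)
        return -1;                                   // out of range: sample is not counted
    float inv_scale = num_bins / (upper - lower);    // reciprocal of one bin's width
    return (int)((sample - lower) * inv_scale);      // bin id in 0 .. num_bins-1
}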