OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
agent_histogram.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill. All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * cub::AgentHistogram implements a stateful abstraction of CUDA thread blocks
 * for participating in device-wide histogram.
 */

#pragma once

#include <iterator>

#include "../util_type.cuh"
#include "../block/block_load.cuh"
#include "../grid/grid_queue.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include "../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/******************************************************************************
 * Tuning policy
 ******************************************************************************/

enum BlockHistogramMemoryPreference
{
    GMEM,
    SMEM,
    BLEND
};

/**
 * Parameterizable tuning policy type for AgentHistogram
 */
template <
    int                             _BLOCK_THREADS,     ///< Threads per thread block
    int                             _PIXELS_PER_THREAD, ///< Pixels per thread (per tile of input)
    BlockLoadAlgorithm              _LOAD_ALGORITHM,    ///< The BlockLoad algorithm to use
    CacheLoadModifier               _LOAD_MODIFIER,     ///< Cache load modifier for reading input elements
    bool                            _RLE_COMPRESS,      ///< Whether to perform localized RLE to compress samples before histogramming
    BlockHistogramMemoryPreference  _MEM_PREFERENCE,    ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
    bool                            _WORK_STEALING>     ///< Whether to dequeue tiles from a global work queue
struct AgentHistogramPolicy
{
    enum
    {
        BLOCK_THREADS       = _BLOCK_THREADS,       ///< Threads per thread block
        PIXELS_PER_THREAD   = _PIXELS_PER_THREAD,   ///< Pixels per thread (per tile of input)
        IS_RLE_COMPRESS     = _RLE_COMPRESS,        ///< Whether to perform localized RLE to compress samples before histogramming
        MEM_PREFERENCE      = _MEM_PREFERENCE,      ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
        IS_WORK_STEALING    = _WORK_STEALING,       ///< Whether to dequeue tiles from a global work queue
    };

    static const BlockLoadAlgorithm LOAD_ALGORITHM  = _LOAD_ALGORITHM;  ///< The BlockLoad algorithm to use
    static const CacheLoadModifier  LOAD_MODIFIER   = _LOAD_MODIFIER;   ///< Cache load modifier for reading input elements
};


/******************************************************************************
 * Thread block abstractions
 ******************************************************************************/

/**
 * \brief AgentHistogram implements a stateful abstraction of CUDA thread blocks
 * for participating in device-wide histogram.
 */
template <
    typename    AgentHistogramPolicyT,      ///< Parameterized AgentHistogramPolicy tuning policy type
    int         PRIVATIZED_SMEM_BINS,       ///< Number of privatized shared-memory histogram bins of any channel (zero indicates privatized counters to be maintained in device-accessible memory)
    int         NUM_CHANNELS,               ///< Number of channels interleaved in the input data
    int         NUM_ACTIVE_CHANNELS,        ///< Number of channels actively being histogrammed
    typename    SampleIteratorT,            ///< Random-access input iterator type for reading samples
    typename    CounterT,                   ///< Integer type for counting sample occurrences per histogram bin
    typename    PrivatizedDecodeOpT,        ///< The transform operator type for determining privatized counter indices from samples, one for each channel
    typename    OutputDecodeOpT,            ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
    typename    OffsetT,                    ///< Signed integer type for global offsets
    int         PTX_ARCH = CUB_PTX_ARCH>    ///< PTX compute capability
struct AgentHistogram
{
    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    /// The sample type of the input iterator
    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;

    /// The pixel type of SampleT
    typedef typename CubVector<SampleT, NUM_CHANNELS>::Type PixelT;

    /// The quad type of SampleT
    typedef typename CubVector<SampleT, 4>::Type QuadT;

    /// Constants
    enum
    {
        BLOCK_THREADS       = AgentHistogramPolicyT::BLOCK_THREADS,

        PIXELS_PER_THREAD   = AgentHistogramPolicyT::PIXELS_PER_THREAD,
        SAMPLES_PER_THREAD  = PIXELS_PER_THREAD * NUM_CHANNELS,
        QUADS_PER_THREAD    = SAMPLES_PER_THREAD / 4,

        TILE_PIXELS         = PIXELS_PER_THREAD * BLOCK_THREADS,
        TILE_SAMPLES        = SAMPLES_PER_THREAD * BLOCK_THREADS,

        IS_RLE_COMPRESS     = AgentHistogramPolicyT::IS_RLE_COMPRESS,

        MEM_PREFERENCE      = (PRIVATIZED_SMEM_BINS > 0) ?
                                  AgentHistogramPolicyT::MEM_PREFERENCE :
                                  GMEM,

        IS_WORK_STEALING    = AgentHistogramPolicyT::IS_WORK_STEALING,
    };

    /// Cache load modifier for reading input elements
    static const CacheLoadModifier LOAD_MODIFIER = AgentHistogramPolicyT::LOAD_MODIFIER;


    /// Input iterator wrapper type (for applying cache modifier)
    typedef typename If<IsPointer<SampleIteratorT>::VALUE,
            CacheModifiedInputIterator<LOAD_MODIFIER, SampleT, OffsetT>,    // Wrap the native input pointer with CacheModifiedInputIterator
            SampleIteratorT>::Type                                          // Directly use the supplied input iterator type
        WrappedSampleIteratorT;

    /// Pixel input iterator type (for applying cache modifier)
    typedef CacheModifiedInputIterator<LOAD_MODIFIER, PixelT, OffsetT>
        WrappedPixelIteratorT;

    /// Quad input iterator type (for applying cache modifier)
    typedef CacheModifiedInputIterator<LOAD_MODIFIER, QuadT, OffsetT>
        WrappedQuadIteratorT;

    /// Parameterized BlockLoad type for samples
    typedef BlockLoad<
            SampleT,
            BLOCK_THREADS,
            SAMPLES_PER_THREAD,
            AgentHistogramPolicyT::LOAD_ALGORITHM>
        BlockLoadSampleT;

    /// Parameterized BlockLoad type for pixels
    typedef BlockLoad<
            PixelT,
            BLOCK_THREADS,
            PIXELS_PER_THREAD,
            AgentHistogramPolicyT::LOAD_ALGORITHM>
        BlockLoadPixelT;

    /// Parameterized BlockLoad type for quads
    typedef BlockLoad<
            QuadT,
            BLOCK_THREADS,
            QUADS_PER_THREAD,
            AgentHistogramPolicyT::LOAD_ALGORITHM>
        BlockLoadQuadT;

    /// Shared memory type required by this thread block
    struct _TempStorage
    {
        CounterT histograms[NUM_ACTIVE_CHANNELS][PRIVATIZED_SMEM_BINS + 1];     // Smem needed for block-privatized smem histogram (with 1 word of padding)

        int tile_idx;

        // Aliasable storage layout
        union Aliasable
        {
            typename BlockLoadSampleT::TempStorage sample_load;     // Smem needed for loading a tile of samples
            typename BlockLoadPixelT::TempStorage  pixel_load;      // Smem needed for loading a tile of pixels
            typename BlockLoadQuadT::TempStorage   quad_load;       // Smem needed for loading a tile of quads

        } aliasable;
    };


    /// Temporary storage type (unionable)
    struct TempStorage : Uninitialized<_TempStorage> {};

    //---------------------------------------------------------------------
    // Per-thread fields
    //---------------------------------------------------------------------

    /// Reference to temp_storage
    _TempStorage &temp_storage;

    /// Sample input iterator (with cache modifier applied, if possible)
    WrappedSampleIteratorT d_wrapped_samples;

    /// Native pointer for input samples (possibly NULL if unavailable)
    SampleT* d_native_samples;

    /// The number of output bins for each channel
    int (&num_output_bins)[NUM_ACTIVE_CHANNELS];

    /// The number of privatized bins for each channel
    int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS];

    /// Reference to gmem privatized histograms for each channel
    CounterT* d_privatized_histograms[NUM_ACTIVE_CHANNELS];

    /// Reference to final output histograms (gmem)
    CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS];

    /// The transform operator for determining output bin-ids from privatized counter indices, one for each channel
    OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS];

    /// The transform operator for determining privatized counter indices from samples, one for each channel
    PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS];

    /// Whether to prefer privatized smem counters vs privatized global counters
    bool prefer_smem;

    //---------------------------------------------------------------------
    // Initialize privatized bin counters
    //---------------------------------------------------------------------

    // Initialize privatized bin counters
    __device__ __forceinline__ void InitBinCounters(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
    {
        // Initialize histogram bin counts to zeros
        #pragma unroll
        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
        {
            for (int privatized_bin = threadIdx.x; privatized_bin < num_privatized_bins[CHANNEL]; privatized_bin += BLOCK_THREADS)
            {
                privatized_histograms[CHANNEL][privatized_bin] = 0;
            }
        }

        // Barrier to make sure all threads are done updating counters
        CTA_SYNC();
    }


    // Initialize privatized bin counters.  Specialized for privatized shared-memory counters
    __device__ __forceinline__ void InitSmemBinCounters()
    {
        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];

        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];

        InitBinCounters(privatized_histograms);
    }


    // Initialize privatized bin counters.  Specialized for privatized global-memory counters
    __device__ __forceinline__ void InitGmemBinCounters()
    {
        InitBinCounters(d_privatized_histograms);
    }

    //---------------------------------------------------------------------
    // Update final output histograms
    //---------------------------------------------------------------------

    // Update final output histograms from privatized histograms
    __device__ __forceinline__ void StoreOutput(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
    {
        // Barrier to make sure all threads are done updating counters
        CTA_SYNC();

        // Apply privatized bin counts to output bin counts
        #pragma unroll
        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
        {
            int channel_bins = num_privatized_bins[CHANNEL];
            for (int privatized_bin = threadIdx.x;
                    privatized_bin < channel_bins;
                    privatized_bin += BLOCK_THREADS)
            {
                int      output_bin = -1;
                CounterT count      = privatized_histograms[CHANNEL][privatized_bin];
                bool     is_valid   = count > 0;

                output_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>((SampleT) privatized_bin, output_bin, is_valid);

                if (output_bin >= 0)
                {
                    atomicAdd(&d_output_histograms[CHANNEL][output_bin], count);
                }

            }
        }
    }


    // Update final output histograms from privatized histograms.  Specialized for privatized shared-memory counters
    __device__ __forceinline__ void StoreSmemOutput()
    {
        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];

        StoreOutput(privatized_histograms);
    }


    // Update final output histograms from privatized histograms.  Specialized for privatized global-memory counters
    __device__ __forceinline__ void StoreGmemOutput()
    {
        StoreOutput(d_privatized_histograms);
    }

    //---------------------------------------------------------------------
    // Tile accumulation
    //---------------------------------------------------------------------

    // Accumulate pixels.  Specialized for RLE compression.
    __device__ __forceinline__ void AccumulatePixels(
        SampleT         samples[PIXELS_PER_THREAD][NUM_CHANNELS],
        bool            is_valid[PIXELS_PER_THREAD],
        CounterT*       privatized_histograms[NUM_ACTIVE_CHANNELS],
        Int2Type<true>  is_rle_compress)
    {
        #pragma unroll
        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
        {
            // Bin pixels
            int bins[PIXELS_PER_THREAD];

            #pragma unroll
            for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
            {
                bins[PIXEL] = -1;
                privatized_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>(samples[PIXEL][CHANNEL], bins[PIXEL], is_valid[PIXEL]);
            }

            CounterT accumulator = 1;

            #pragma unroll
            for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD - 1; ++PIXEL)
            {
                if (bins[PIXEL] != bins[PIXEL + 1])
                {
                    if (bins[PIXEL] >= 0)
                        atomicAdd(privatized_histograms[CHANNEL] + bins[PIXEL], accumulator);

                    accumulator = 0;
                }
                accumulator++;
            }

            // Last pixel
            if (bins[PIXELS_PER_THREAD - 1] >= 0)
                atomicAdd(privatized_histograms[CHANNEL] + bins[PIXELS_PER_THREAD - 1], accumulator);
        }
    }


    // Accumulate pixels.  Specialized for individual accumulation of each pixel.
    __device__ __forceinline__ void AccumulatePixels(
        SampleT         samples[PIXELS_PER_THREAD][NUM_CHANNELS],
        bool            is_valid[PIXELS_PER_THREAD],
        CounterT*       privatized_histograms[NUM_ACTIVE_CHANNELS],
        Int2Type<false> is_rle_compress)
    {
        #pragma unroll
        for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
        {
            #pragma unroll
            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            {
                int bin = -1;
                privatized_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>(samples[PIXEL][CHANNEL], bin, is_valid[PIXEL]);
                if (bin >= 0)
                    atomicAdd(privatized_histograms[CHANNEL] + bin, 1);
            }
        }
    }


    // Accumulate pixels.  Specialized for privatized shared-memory counters.
    __device__ __forceinline__ void AccumulateSmemPixels(
        SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS],
        bool    is_valid[PIXELS_PER_THREAD])
    {
        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];

        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];

        AccumulatePixels(samples, is_valid, privatized_histograms, Int2Type<IS_RLE_COMPRESS>());
    }


    // Accumulate pixels.  Specialized for privatized global-memory counters.
    __device__ __forceinline__ void AccumulateGmemPixels(
        SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS],
        bool    is_valid[PIXELS_PER_THREAD])
    {
        AccumulatePixels(samples, is_valid, d_privatized_histograms, Int2Type<IS_RLE_COMPRESS>());
    }


    //---------------------------------------------------------------------
    // Tile loading
    //---------------------------------------------------------------------

    // Load full, aligned tile using pixel iterator (multi-channel)
    template <int _NUM_ACTIVE_CHANNELS>
    __device__ __forceinline__ void LoadFullAlignedTile(
        OffsetT                         block_offset,
        int                             valid_samples,
        SampleT                         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<_NUM_ACTIVE_CHANNELS>  num_active_channels)
    {
        typedef PixelT AliasedPixels[PIXELS_PER_THREAD];

        WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));

        // Load using a wrapped pixel iterator
        BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load(
            d_wrapped_pixels,
            reinterpret_cast<AliasedPixels&>(samples));
    }

    // Load full, aligned tile using quad iterator (single-channel)
    __device__ __forceinline__ void LoadFullAlignedTile(
        OffsetT         block_offset,
        int             valid_samples,
        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<1>     num_active_channels)
    {
        typedef QuadT AliasedQuads[QUADS_PER_THREAD];

        WrappedQuadIteratorT d_wrapped_quads((QuadT*) (d_native_samples + block_offset));

        // Load using a wrapped quad iterator
        BlockLoadQuadT(temp_storage.aliasable.quad_load).Load(
            d_wrapped_quads,
            reinterpret_cast<AliasedQuads&>(samples));
    }

    // Load full, aligned tile
    __device__ __forceinline__ void LoadTile(
        OffsetT         block_offset,
        int             valid_samples,
        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<true>  is_full_tile,
        Int2Type<true>  is_aligned)
    {
        LoadFullAlignedTile(block_offset, valid_samples, samples, Int2Type<NUM_ACTIVE_CHANNELS>());
    }

    // Load full, mis-aligned tile using sample iterator
    __device__ __forceinline__ void LoadTile(
        OffsetT         block_offset,
        int             valid_samples,
        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<true>  is_full_tile,
        Int2Type<false> is_aligned)
    {
        typedef SampleT AliasedSamples[SAMPLES_PER_THREAD];

        // Load using sample iterator
        BlockLoadSampleT(temp_storage.aliasable.sample_load).Load(
            d_wrapped_samples + block_offset,
            reinterpret_cast<AliasedSamples&>(samples));
    }

    // Load partially-full, aligned tile using the pixel iterator
    __device__ __forceinline__ void LoadTile(
        OffsetT         block_offset,
        int             valid_samples,
        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<false> is_full_tile,
        Int2Type<true>  is_aligned)
    {
        typedef PixelT AliasedPixels[PIXELS_PER_THREAD];

        WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));

        int valid_pixels = valid_samples / NUM_CHANNELS;

        // Load using a wrapped pixel iterator
        BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load(
            d_wrapped_pixels,
            reinterpret_cast<AliasedPixels&>(samples),
            valid_pixels);
    }

    // Load partially-full, mis-aligned tile using sample iterator
    __device__ __forceinline__ void LoadTile(
        OffsetT         block_offset,
        int             valid_samples,
        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<false> is_full_tile,
        Int2Type<false> is_aligned)
    {
        typedef SampleT AliasedSamples[SAMPLES_PER_THREAD];

        BlockLoadSampleT(temp_storage.aliasable.sample_load).Load(
            d_wrapped_samples + block_offset,
            reinterpret_cast<AliasedSamples&>(samples),
            valid_samples);
    }

    //---------------------------------------------------------------------
    // Tile processing
    //---------------------------------------------------------------------

    // Consume a tile of data samples
    template <
        bool IS_ALIGNED,        // Whether the tile offset is aligned (quad-aligned for single-channel, pixel-aligned for multi-channel)
        bool IS_FULL_TILE>      // Whether the tile is full
    __device__ __forceinline__ void ConsumeTile(OffsetT block_offset, int valid_samples)
    {
        SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS];
        bool    is_valid[PIXELS_PER_THREAD];

        // Load tile
        LoadTile(
            block_offset,
            valid_samples,
            samples,
            Int2Type<IS_FULL_TILE>(),
            Int2Type<IS_ALIGNED>());

        // Set valid flags
        #pragma unroll
        for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
            is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples);

        // Accumulate samples
#if CUB_PTX_ARCH >= 120
        if (prefer_smem)
            AccumulateSmemPixels(samples, is_valid);
        else
            AccumulateGmemPixels(samples, is_valid);
#else
        AccumulateGmemPixels(samples, is_valid);
#endif

    }


    // Consume row tiles.  Specialized for work-stealing from queue
    template <bool IS_ALIGNED>
    __device__ __forceinline__ void ConsumeTiles(
        OffsetT         num_row_pixels,         ///< The number of multi-channel pixels per row in the region of interest
        OffsetT         num_rows,               ///< The number of rows in the region of interest
        OffsetT         row_stride_samples,     ///< The number of samples between starts of consecutive rows in the region of interest
        int             tiles_per_row,          ///< Number of image tiles per row
        GridQueue<int>  tile_queue,             ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks
        Int2Type<true>  is_work_stealing)       ///< Whether to dequeue tiles from a global work queue
    {
        int     num_tiles            = num_rows * tiles_per_row;
        int     tile_idx             = (blockIdx.y * gridDim.x) + blockIdx.x;
        OffsetT num_even_share_tiles = gridDim.x * gridDim.y;

        while (tile_idx < num_tiles)
        {
            int     row         = tile_idx / tiles_per_row;
            int     col         = tile_idx - (row * tiles_per_row);
            OffsetT row_offset  = row * row_stride_samples;
            OffsetT col_offset  = (col * TILE_SAMPLES);
            OffsetT tile_offset = row_offset + col_offset;

            if (col == tiles_per_row - 1)
            {
                // Consume a partially-full tile at the end of the row
                OffsetT num_remaining = (num_row_pixels * NUM_CHANNELS) - col_offset;
                ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
            }
            else
            {
                // Consume full tile
                ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
            }

            CTA_SYNC();

            // Get next tile
            if (threadIdx.x == 0)
                temp_storage.tile_idx = tile_queue.Drain(1) + num_even_share_tiles;

            CTA_SYNC();

            tile_idx = temp_storage.tile_idx;
        }
    }


    // Consume row tiles.  Specialized for even-share (striped across thread blocks)
    template <bool IS_ALIGNED>
    __device__ __forceinline__ void ConsumeTiles(
        OffsetT         num_row_pixels,         ///< The number of multi-channel pixels per row in the region of interest
        OffsetT         num_rows,               ///< The number of rows in the region of interest
        OffsetT         row_stride_samples,     ///< The number of samples between starts of consecutive rows in the region of interest
        int             tiles_per_row,          ///< Number of image tiles per row
        GridQueue<int>  tile_queue,             ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks
        Int2Type<false> is_work_stealing)       ///< Whether to dequeue tiles from a global work queue
    {
        for (int row = blockIdx.y; row < num_rows; row += gridDim.y)
        {
            OffsetT row_begin   = row * row_stride_samples;
            OffsetT row_end     = row_begin + (num_row_pixels * NUM_CHANNELS);
            OffsetT tile_offset = row_begin + (blockIdx.x * TILE_SAMPLES);

            while (tile_offset < row_end)
            {
                OffsetT num_remaining = row_end - tile_offset;

                if (num_remaining < TILE_SAMPLES)
                {
                    // Consume partial tile
                    ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
                    break;
                }

                // Consume full tile
                ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
                tile_offset += gridDim.x * TILE_SAMPLES;
            }
        }
    }

    //---------------------------------------------------------------------
    // Parameter extraction
    //---------------------------------------------------------------------

    // Return a native pixel pointer (specialized for CacheModifiedInputIterator types)
    template <
        CacheLoadModifier   _MODIFIER,
        typename            _ValueT,
        typename            _OffsetT>
    __device__ __forceinline__ SampleT* NativePointer(CacheModifiedInputIterator<_MODIFIER, _ValueT, _OffsetT> itr)
    {
        return itr.ptr;
    }

    // Return a native pixel pointer (specialized for other types)
    template <typename IteratorT>
    __device__ __forceinline__ SampleT* NativePointer(IteratorT itr)
    {
        return NULL;
    }


    //---------------------------------------------------------------------
    // Interface
    //---------------------------------------------------------------------

    /**
     * Constructor
     */
    __device__ __forceinline__ AgentHistogram(
        TempStorage         &temp_storage,                                      ///< Reference to temp_storage
        SampleIteratorT     d_samples,                                          ///< Input data to histogram
        int                 (&num_output_bins)[NUM_ACTIVE_CHANNELS],            ///< The number of output bins for each channel
        int                 (&num_privatized_bins)[NUM_ACTIVE_CHANNELS],        ///< The number of privatized bins for each channel
        CounterT*           (&d_output_histograms)[NUM_ACTIVE_CHANNELS],        ///< Reference to final output histograms
        CounterT*           (&d_privatized_histograms)[NUM_ACTIVE_CHANNELS],    ///< Reference to privatized histograms
        OutputDecodeOpT     (&output_decode_op)[NUM_ACTIVE_CHANNELS],           ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel
        PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS])       ///< The transform operator for determining privatized counter indices from samples, one for each channel
    :
        temp_storage(temp_storage.Alias()),
        d_wrapped_samples(d_samples),
        num_output_bins(num_output_bins),
        num_privatized_bins(num_privatized_bins),
        d_output_histograms(d_output_histograms),
        output_decode_op(output_decode_op),
        privatized_decode_op(privatized_decode_op),
        d_native_samples(NativePointer(d_wrapped_samples)),
        prefer_smem((MEM_PREFERENCE == SMEM) ?
            true :                              // prefer smem privatized histograms
            (MEM_PREFERENCE == GMEM) ?
                false :                         // prefer gmem privatized histograms
                blockIdx.x & 1)                 // prefer blended privatized histograms
    {
        int blockId = (blockIdx.y * gridDim.x) + blockIdx.x;

        // Initialize the locations of this block's privatized histograms
        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            this->d_privatized_histograms[CHANNEL] = d_privatized_histograms[CHANNEL] + (blockId * num_privatized_bins[CHANNEL]);
    }


    /**
     * Consume image
     */
    __device__ __forceinline__ void ConsumeTiles(
        OffsetT         num_row_pixels,         ///< The number of multi-channel pixels per row in the region of interest
        OffsetT         num_rows,               ///< The number of rows in the region of interest
        OffsetT         row_stride_samples,     ///< The number of samples between starts of consecutive rows in the region of interest
        int             tiles_per_row,          ///< Number of image tiles per row
        GridQueue<int>  tile_queue)             ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks
    {
        // Check whether all row starting offsets are quad-aligned (in single-channel) or pixel-aligned (in multi-channel)
        int     quad_mask   = AlignBytes<QuadT>::ALIGN_BYTES - 1;
        int     pixel_mask  = AlignBytes<PixelT>::ALIGN_BYTES - 1;
        size_t  row_bytes   = sizeof(SampleT) * row_stride_samples;

        bool quad_aligned_rows  = (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % 4 == 0) &&   // Single channel
                                  ((size_t(d_native_samples) & quad_mask) == 0) &&          // ptr is quad-aligned
                                  ((num_rows == 1) || ((row_bytes & quad_mask) == 0));      // number of row-samples is a multiple of the alignment of the quad

        bool pixel_aligned_rows = (NUM_CHANNELS > 1) &&                                     // Multi channel
                                  ((size_t(d_native_samples) & pixel_mask) == 0) &&         // ptr is pixel-aligned
                                  ((row_bytes & pixel_mask) == 0);                          // number of row-samples is a multiple of the alignment of the pixel

        // Whether rows are aligned and can be vectorized
        if ((d_native_samples != NULL) && (quad_aligned_rows || pixel_aligned_rows))
            ConsumeTiles<true>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
        else
            ConsumeTiles<false>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
    }


    /**
     * Initialize privatized bin counters
     */
    __device__ __forceinline__ void InitBinCounters()
    {
        if (prefer_smem)
            InitSmemBinCounters();
        else
            InitGmemBinCounters();
    }


    /**
     * Store privatized histogram to device-accessible memory
     */
    __device__ __forceinline__ void StoreOutput()
    {
        if (prefer_smem)
            StoreSmemOutput();
        else
            StoreGmemOutput();
    }

};


}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)
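For orientation, the sketch below shows how a thread block typically drives this agent: construct it over shared temporary storage, zero the privatized counters, sweep the tiles assigned to the block, then fold the privatized counts into the final output histograms via StoreOutput(). This is an illustrative sketch only, not the dispatch code that ships with CUB or OpenFPM; ExamplePolicyT, ExampleBlockSweep, the chosen tuning constants, and the include paths are assumptions made for the example.

// Sketch (assumed standalone CUB install; names suffixed "Example" are hypothetical).
#include <cub/agent/agent_histogram.cuh>
#include <cub/grid/grid_queue.cuh>

// Hypothetical tuning: 256 threads, 8 pixels/thread, direct loads, default cache
// modifier, RLE compression on, shared-memory privatization, even-share tiling.
typedef cub::AgentHistogramPolicy<
    256, 8,
    cub::BLOCK_LOAD_DIRECT,
    cub::LOAD_DEFAULT,
    true,           // RLE-compress runs of identical bins before atomics
    cub::SMEM,      // privatize counters in shared memory
    false>          // no work-stealing queue
  ExamplePolicyT;

template <
    int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, int PRIVATIZED_SMEM_BINS,
    typename SampleIteratorT, typename CounterT,
    typename PrivatizedDecodeOpT, typename OutputDecodeOpT, typename OffsetT>
__device__ __forceinline__ void ExampleBlockSweep(
    SampleIteratorT     d_samples,
    int                 (&num_output_bins)[NUM_ACTIVE_CHANNELS],
    int                 (&num_privatized_bins)[NUM_ACTIVE_CHANNELS],
    CounterT*           (&d_output_histograms)[NUM_ACTIVE_CHANNELS],
    CounterT*           (&d_privatized_histograms)[NUM_ACTIVE_CHANNELS],
    OutputDecodeOpT     (&output_decode_op)[NUM_ACTIVE_CHANNELS],
    PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS],
    OffsetT             num_row_pixels,
    OffsetT             num_rows,
    OffsetT             row_stride_samples,
    int                 tiles_per_row,
    cub::GridQueue<int> tile_queue)
{
    typedef cub::AgentHistogram<
        ExamplePolicyT, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS,
        SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>
      AgentT;

    // Shared memory for the agent (privatized bins + BlockLoad staging)
    __shared__ typename AgentT::TempStorage temp_storage;

    AgentT agent(
        temp_storage, d_samples,
        num_output_bins, num_privatized_bins,
        d_output_histograms, d_privatized_histograms,
        output_decode_op, privatized_decode_op);

    // Zero the block-privatized counters (smem or gmem, per MEM_PREFERENCE)
    agent.InitBinCounters();

    // Sweep the tiles of the region of interest assigned to this block
    agent.ConsumeTiles(num_row_pixels, num_rows, row_stride_samples,
                       tiles_per_row, tile_queue);

    // Merge privatized counts into the final output histograms
    agent.StoreOutput();
}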