OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
agent_histogram.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * cub::AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram.
 */

#pragma once

#include <iterator>

#include "../util_type.cuh"
#include "../block/block_load.cuh"
#include "../grid/grid_queue.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include "../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


/******************************************************************************
 * Tuning policy
 ******************************************************************************/

/**
 * Memory preference for block-privatized histogram bins (shared memory versus global memory)
 */
enum BlockHistogramMemoryPreference
{
    GMEM,
    SMEM,
    BLEND
};

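// GMEM:  maintain each block's privatized bin counters in device-accessible (global) memory.
// SMEM:  maintain them in shared memory.
// BLEND: let alternating thread blocks choose between the two (see the prefer_smem
//        initialization in the AgentHistogram constructor below).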

/**
 * Parameterizable tuning policy type for AgentHistogram
 */
template <
    int                             _BLOCK_THREADS,         ///< Threads per thread block
    int                             _PIXELS_PER_THREAD,     ///< Pixels per thread (per tile of input)
    BlockLoadAlgorithm              _LOAD_ALGORITHM,        ///< The BlockLoad algorithm to use
    CacheLoadModifier               _LOAD_MODIFIER,         ///< Cache load modifier for reading input elements
    bool                            _RLE_COMPRESS,          ///< Whether to perform localized RLE to compress samples before histogramming
    BlockHistogramMemoryPreference  _MEM_PREFERENCE,        ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
    bool                            _WORK_STEALING>         ///< Whether to dequeue tiles from a global work queue
struct AgentHistogramPolicy
{
    enum
    {
        BLOCK_THREADS       = _BLOCK_THREADS,       ///< Threads per thread block
        PIXELS_PER_THREAD   = _PIXELS_PER_THREAD,   ///< Pixels per thread (per tile of input)
        IS_RLE_COMPRESS     = _RLE_COMPRESS,        ///< Whether to perform localized RLE to compress samples before histogramming
        MEM_PREFERENCE      = _MEM_PREFERENCE,      ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
        IS_WORK_STEALING    = _WORK_STEALING,       ///< Whether to dequeue tiles from a global work queue
    };

    static const BlockLoadAlgorithm LOAD_ALGORITHM  = _LOAD_ALGORITHM;  ///< The BlockLoad algorithm to use
    static const CacheLoadModifier  LOAD_MODIFIER   = _LOAD_MODIFIER;   ///< Cache load modifier for reading input elements
};

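// Illustrative sketch (added commentary, not part of the original source): a dispatch
// layer would typically select a tuning policy per target architecture and pass it in
// as AgentHistogramPolicyT.  The numeric values below are hypothetical, not CUB's
// actual tuning:
//
//     typedef AgentHistogramPolicy<
//         256,                    // _BLOCK_THREADS
//         8,                      // _PIXELS_PER_THREAD
//         BLOCK_LOAD_DIRECT,      // _LOAD_ALGORITHM
//         LOAD_LDG,               // _LOAD_MODIFIER
//         true,                   // _RLE_COMPRESS
//         BLEND,                  // _MEM_PREFERENCE
//         true>                   // _WORK_STEALING
//         ExampleHistogramPolicyT;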

/******************************************************************************
 * Thread block abstractions
 ******************************************************************************/

/**
 * \brief AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram.
 */
template <
    typename    AgentHistogramPolicyT,      ///< Parameterized AgentHistogramPolicy tuning policy type
    int         PRIVATIZED_SMEM_BINS,       ///< Number of privatized shared-memory histogram bins of any channel (zero indicates privatized counters are maintained in device-accessible memory)
    int         NUM_CHANNELS,               ///< Number of channels interleaved in the input data
    int         NUM_ACTIVE_CHANNELS,        ///< Number of channels actively being histogrammed
    typename    SampleIteratorT,            ///< Random-access input iterator type for reading samples
    typename    CounterT,                   ///< Integer type for counting sample occurrences per histogram bin
    typename    PrivatizedDecodeOpT,        ///< The transform operator type for determining privatized counter indices from samples, one for each channel
    typename    OutputDecodeOpT,            ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
    typename    OffsetT,                    ///< Signed integer type for global offsets
    int         PTX_ARCH = CUB_PTX_ARCH>    ///< PTX compute capability
struct AgentHistogram
{
    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    /// The sample type of the input iterator
    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;

    /// The pixel type of SampleT
    typedef typename CubVector<SampleT, NUM_CHANNELS>::Type PixelT;

    /// The quad type of SampleT
    typedef typename CubVector<SampleT, 4>::Type QuadT;

    /// Constants
    enum
    {
        BLOCK_THREADS       = AgentHistogramPolicyT::BLOCK_THREADS,

        PIXELS_PER_THREAD   = AgentHistogramPolicyT::PIXELS_PER_THREAD,
        SAMPLES_PER_THREAD  = PIXELS_PER_THREAD * NUM_CHANNELS,
        QUADS_PER_THREAD    = SAMPLES_PER_THREAD / 4,

        TILE_PIXELS         = PIXELS_PER_THREAD * BLOCK_THREADS,
        TILE_SAMPLES        = SAMPLES_PER_THREAD * BLOCK_THREADS,

        IS_RLE_COMPRESS     = AgentHistogramPolicyT::IS_RLE_COMPRESS,

        MEM_PREFERENCE      = (PRIVATIZED_SMEM_BINS > 0) ?
                                    AgentHistogramPolicyT::MEM_PREFERENCE :
                                    GMEM,

        IS_WORK_STEALING    = AgentHistogramPolicyT::IS_WORK_STEALING,
    };

    /// Cache load modifier for reading input elements
    static const CacheLoadModifier LOAD_MODIFIER = AgentHistogramPolicyT::LOAD_MODIFIER;


    /// Input iterator wrapper type (for applying cache modifier)
    typedef typename If<IsPointer<SampleIteratorT>::VALUE,
            CacheModifiedInputIterator<LOAD_MODIFIER, SampleT, OffsetT>,    // Wrap the native input pointer with CacheModifiedInputIterator
            SampleIteratorT>::Type                                          // Directly use the supplied input iterator type
        WrappedSampleIteratorT;

    /// Pixel input iterator type (for applying cache modifier)
    typedef CacheModifiedInputIterator<LOAD_MODIFIER, PixelT, OffsetT>
        WrappedPixelIteratorT;

    /// Quad input iterator type (for applying cache modifier)
    typedef CacheModifiedInputIterator<LOAD_MODIFIER, QuadT, OffsetT>
        WrappedQuadIteratorT;

    /// Parameterized BlockLoad type for samples
    typedef BlockLoad<
            SampleT,
            BLOCK_THREADS,
            SAMPLES_PER_THREAD,
            AgentHistogramPolicyT::LOAD_ALGORITHM>
        BlockLoadSampleT;

    /// Parameterized BlockLoad type for pixels
    typedef BlockLoad<
            PixelT,
            BLOCK_THREADS,
            PIXELS_PER_THREAD,
            AgentHistogramPolicyT::LOAD_ALGORITHM>
        BlockLoadPixelT;

    /// Parameterized BlockLoad type for quads
    typedef BlockLoad<
            QuadT,
            BLOCK_THREADS,
            QUADS_PER_THREAD,
            AgentHistogramPolicyT::LOAD_ALGORITHM>
        BlockLoadQuadT;

    /// Shared memory type required by this thread block
    struct _TempStorage
    {
        CounterT histograms[NUM_ACTIVE_CHANNELS][PRIVATIZED_SMEM_BINS + 1];     // Smem needed for block-privatized smem histogram (with 1 word of padding)

        int tile_idx;

        // Aliasable storage layout
        union Aliasable
        {
            typename BlockLoadSampleT::TempStorage  sample_load;    // Smem needed for loading a tile of samples
            typename BlockLoadPixelT::TempStorage   pixel_load;     // Smem needed for loading a tile of pixels
            typename BlockLoadQuadT::TempStorage    quad_load;      // Smem needed for loading a tile of quads

        } aliasable;
    };
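    // The sample/pixel/quad BlockLoad temporary storages sit in a union because any given
    // tile is loaded through exactly one of the three paths, so the same shared memory can
    // be reused for whichever load specialization is selected.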


    /// Temporary storage type (unionable)
    struct TempStorage : Uninitialized<_TempStorage> {};


    //---------------------------------------------------------------------
    // Per-thread fields
    //---------------------------------------------------------------------

    /// Reference to temp_storage
    _TempStorage &temp_storage;

    /// Sample input iterator (with cache modifier applied, if possible)
    WrappedSampleIteratorT d_wrapped_samples;

    /// Native pointer for input samples (possibly NULL if unavailable)
    SampleT* d_native_samples;

    /// The number of output bins for each channel
    int (&num_output_bins)[NUM_ACTIVE_CHANNELS];

    /// The number of privatized bins for each channel
    int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS];

    /// Reference to gmem privatized histograms for each channel
    CounterT* d_privatized_histograms[NUM_ACTIVE_CHANNELS];

    /// Reference to final output histograms (gmem)
    CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS];

    /// The transform operator for determining output bin-ids from privatized counter indices, one for each channel
    OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS];

    /// The transform operator for determining privatized counter indices from samples, one for each channel
    PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS];

    /// Whether to prefer privatized smem counters vs privatized global counters
    bool prefer_smem;

    //---------------------------------------------------------------------
    // Initialize privatized bin counters
    //---------------------------------------------------------------------

    // Initialize privatized bin counters
    __device__ __forceinline__ void InitBinCounters(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
    {
        // Initialize histogram bin counts to zeros
        #pragma unroll
        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
        {
            for (int privatized_bin = threadIdx.x; privatized_bin < num_privatized_bins[CHANNEL]; privatized_bin += BLOCK_THREADS)
            {
                privatized_histograms[CHANNEL][privatized_bin] = 0;
            }
        }

        // Barrier to make sure all threads are done updating counters
        CTA_SYNC();
    }


    // Initialize privatized bin counters.  Specialized for privatized shared-memory counters
    __device__ __forceinline__ void InitSmemBinCounters()
    {
        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];

        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];

        InitBinCounters(privatized_histograms);
    }


    // Initialize privatized bin counters.  Specialized for privatized global-memory counters
    __device__ __forceinline__ void InitGmemBinCounters()
    {
        InitBinCounters(d_privatized_histograms);
    }


    //---------------------------------------------------------------------
    // Update final output histograms
    //---------------------------------------------------------------------

    // Update final output histograms from privatized histograms
    __device__ __forceinline__ void StoreOutput(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
    {
        // Barrier to make sure all threads are done updating counters
        CTA_SYNC();

        // Apply privatized bin counts to output bin counts
        #pragma unroll
        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
        {
            int channel_bins = num_privatized_bins[CHANNEL];
            for (int privatized_bin = threadIdx.x;
                    privatized_bin < channel_bins;
                    privatized_bin += BLOCK_THREADS)
            {
                int         output_bin  = -1;
                CounterT    count       = privatized_histograms[CHANNEL][privatized_bin];
                bool        is_valid    = count > 0;

                output_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>((SampleT) privatized_bin, output_bin, is_valid);

                if (output_bin >= 0)
                {
                    atomicAdd(&d_output_histograms[CHANNEL][output_bin], count);
                }

            }
        }
    }

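    // The output decode operator maps each privatized bin index back to a final output bin;
    // atomicAdd is required because every thread block folds its privatized counts into the
    // same global output histograms.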

    // Update final output histograms from privatized histograms.  Specialized for privatized shared-memory counters
    __device__ __forceinline__ void StoreSmemOutput()
    {
        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];

        StoreOutput(privatized_histograms);
    }


    // Update final output histograms from privatized histograms.  Specialized for privatized global-memory counters
    __device__ __forceinline__ void StoreGmemOutput()
    {
        StoreOutput(d_privatized_histograms);
    }


    //---------------------------------------------------------------------
    // Tile accumulation
    //---------------------------------------------------------------------

    // Accumulate pixels.  Specialized for RLE compression.
    __device__ __forceinline__ void AccumulatePixels(
        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
        bool                is_valid[PIXELS_PER_THREAD],
        CounterT*           privatized_histograms[NUM_ACTIVE_CHANNELS],
        Int2Type<true>      is_rle_compress)
    {
        #pragma unroll
        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
        {
            // Bin pixels
            int bins[PIXELS_PER_THREAD];

            #pragma unroll
            for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
            {
                bins[PIXEL] = -1;
                privatized_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>(samples[PIXEL][CHANNEL], bins[PIXEL], is_valid[PIXEL]);
            }

            CounterT accumulator = 1;

            #pragma unroll
            for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD - 1; ++PIXEL)
            {
                if (bins[PIXEL] != bins[PIXEL + 1])
                {
                    if (bins[PIXEL] >= 0)
                        atomicAdd(privatized_histograms[CHANNEL] + bins[PIXEL], accumulator);

                    accumulator = 0;
                }
                accumulator++;
            }

            // Last pixel
            if (bins[PIXELS_PER_THREAD - 1] >= 0)
                atomicAdd(privatized_histograms[CHANNEL] + bins[PIXELS_PER_THREAD - 1], accumulator);
        }
    }

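    // The RLE path exploits spatial coherence in image data: runs of consecutive pixels that
    // decode to the same bin are counted into a single accumulator and committed with one
    // atomicAdd per run, reducing contention on the privatized counters.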

    // Accumulate pixels.  Specialized for individual accumulation of each pixel.
    __device__ __forceinline__ void AccumulatePixels(
        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
        bool                is_valid[PIXELS_PER_THREAD],
        CounterT*           privatized_histograms[NUM_ACTIVE_CHANNELS],
        Int2Type<false>     is_rle_compress)
    {
        #pragma unroll
        for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
        {
            #pragma unroll
            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            {
                int bin = -1;
                privatized_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>(samples[PIXEL][CHANNEL], bin, is_valid[PIXEL]);
                if (bin >= 0)
                    atomicAdd(privatized_histograms[CHANNEL] + bin, 1);
            }
        }
    }


    // Accumulate pixels.  Specialized for privatized shared-memory counters
    __device__ __forceinline__ void AccumulateSmemPixels(
        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
        bool                is_valid[PIXELS_PER_THREAD])
    {
        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];

        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];

        AccumulatePixels(samples, is_valid, privatized_histograms, Int2Type<IS_RLE_COMPRESS>());
    }


    // Accumulate pixels.  Specialized for privatized global-memory counters
    __device__ __forceinline__ void AccumulateGmemPixels(
        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
        bool                is_valid[PIXELS_PER_THREAD])
    {
        AccumulatePixels(samples, is_valid, d_privatized_histograms, Int2Type<IS_RLE_COMPRESS>());
    }


    //---------------------------------------------------------------------
    // Tile loading
    //---------------------------------------------------------------------

    // Load full, aligned tile using pixel iterator (multi-channel)
    template <int _NUM_ACTIVE_CHANNELS>
    __device__ __forceinline__ void LoadFullAlignedTile(
        OffsetT                         block_offset,
        int                             valid_samples,
        SampleT                         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<_NUM_ACTIVE_CHANNELS>  num_active_channels)
    {
        typedef PixelT AliasedPixels[PIXELS_PER_THREAD];

        WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));

        // Load using a wrapped pixel iterator
        BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load(
            d_wrapped_pixels,
            reinterpret_cast<AliasedPixels&>(samples));
    }

    // Load full, aligned tile using quad iterator (single-channel)
    __device__ __forceinline__ void LoadFullAlignedTile(
        OffsetT                         block_offset,
        int                             valid_samples,
        SampleT                         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<1>                     num_active_channels)
    {
        typedef QuadT AliasedQuads[QUADS_PER_THREAD];

        WrappedQuadIteratorT d_wrapped_quads((QuadT*) (d_native_samples + block_offset));

        // Load using a wrapped quad iterator
        BlockLoadQuadT(temp_storage.aliasable.quad_load).Load(
            d_wrapped_quads,
            reinterpret_cast<AliasedQuads&>(samples));
    }

    // Load full, aligned tile
    __device__ __forceinline__ void LoadTile(
        OffsetT         block_offset,
        int             valid_samples,
        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<true>  is_full_tile,
        Int2Type<true>  is_aligned)
    {
        LoadFullAlignedTile(block_offset, valid_samples, samples, Int2Type<NUM_ACTIVE_CHANNELS>());
    }

    // Load full, mis-aligned tile using sample iterator
    __device__ __forceinline__ void LoadTile(
        OffsetT         block_offset,
        int             valid_samples,
        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<true>  is_full_tile,
        Int2Type<false> is_aligned)
    {
        typedef SampleT AliasedSamples[SAMPLES_PER_THREAD];

        // Load using sample iterator
        BlockLoadSampleT(temp_storage.aliasable.sample_load).Load(
            d_wrapped_samples + block_offset,
            reinterpret_cast<AliasedSamples&>(samples));
    }

    // Load partially-full, aligned tile using the pixel iterator
    __device__ __forceinline__ void LoadTile(
        OffsetT         block_offset,
        int             valid_samples,
        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<false> is_full_tile,
        Int2Type<true>  is_aligned)
    {
        typedef PixelT AliasedPixels[PIXELS_PER_THREAD];

        WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));

        int valid_pixels = valid_samples / NUM_CHANNELS;

        // Load using a wrapped pixel iterator
        BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load(
            d_wrapped_pixels,
            reinterpret_cast<AliasedPixels&>(samples),
            valid_pixels);
    }

    // Load partially-full, mis-aligned tile using sample iterator
    __device__ __forceinline__ void LoadTile(
        OffsetT         block_offset,
        int             valid_samples,
        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<false> is_full_tile,
        Int2Type<false> is_aligned)
    {
        typedef SampleT AliasedSamples[SAMPLES_PER_THREAD];

        BlockLoadSampleT(temp_storage.aliasable.sample_load).Load(
            d_wrapped_samples + block_offset,
            reinterpret_cast<AliasedSamples&>(samples),
            valid_samples);
    }


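    // The aligned LoadTile variants reinterpret each thread's sample array as PixelT or QuadT
    // vectors and read through the cache-modified pixel/quad iterators, allowing wider
    // (vectorized) memory transactions; the mis-aligned variants fall back to loading
    // individual samples through the wrapped sample iterator.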
    //---------------------------------------------------------------------
    // Tile processing
    //---------------------------------------------------------------------

    // Consume a tile of data samples
    template <
        bool IS_ALIGNED,        // Whether the tile offset is aligned (quad-aligned for single-channel, pixel-aligned for multi-channel)
        bool IS_FULL_TILE>      // Whether the tile is full
    __device__ __forceinline__ void ConsumeTile(OffsetT block_offset, int valid_samples)
    {
        SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS];
        bool    is_valid[PIXELS_PER_THREAD];

        // Load tile
        LoadTile(
            block_offset,
            valid_samples,
            samples,
            Int2Type<IS_FULL_TILE>(),
            Int2Type<IS_ALIGNED>());

        // Set valid flags
        #pragma unroll
        for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
            is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples);

        // Accumulate samples
#if CUB_PTX_ARCH >= 120
        if (prefer_smem)
            AccumulateSmemPixels(samples, is_valid);
        else
            AccumulateGmemPixels(samples, is_valid);
#else
        AccumulateGmemPixels(samples, is_valid);
#endif

    }

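    // The CUB_PTX_ARCH >= 120 guard reflects that atomic operations on shared memory require
    // compute capability 1.2 or higher; older targets always use the global-memory privatized
    // counters.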

    // Consume row tiles.  Specialized for work-stealing from queue
    template <bool IS_ALIGNED>
    __device__ __forceinline__ void ConsumeTiles(
        OffsetT             num_row_pixels,
        OffsetT             num_rows,
        OffsetT             row_stride_samples,
        int                 tiles_per_row,
        GridQueue<int>      tile_queue,
        Int2Type<true>      is_work_stealing)
    {

        int     num_tiles               = num_rows * tiles_per_row;
        int     tile_idx                = (blockIdx.y * gridDim.x) + blockIdx.x;
        OffsetT num_even_share_tiles    = gridDim.x * gridDim.y;

        while (tile_idx < num_tiles)
        {
            int     row         = tile_idx / tiles_per_row;
            int     col         = tile_idx - (row * tiles_per_row);
            OffsetT row_offset  = row * row_stride_samples;
            OffsetT col_offset  = (col * TILE_SAMPLES);
            OffsetT tile_offset = row_offset + col_offset;

            if (col == tiles_per_row - 1)
            {
                // Consume a partially-full tile at the end of the row
                OffsetT num_remaining = (num_row_pixels * NUM_CHANNELS) - col_offset;
                ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
            }
            else
            {
                // Consume full tile
                ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
            }

            CTA_SYNC();

            // Get next tile
            if (threadIdx.x == 0)
                temp_storage.tile_idx = tile_queue.Drain(1) + num_even_share_tiles;

            CTA_SYNC();

            tile_idx = temp_storage.tile_idx;
        }
    }

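    // Work-stealing note: the first gridDim.x * gridDim.y tiles are implicitly assigned
    // one-per-block (num_even_share_tiles); afterwards each block drains additional tile
    // indices from the shared GridQueue until all num_tiles tiles have been consumed.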

    // Consume row tiles.  Specialized for even-share (striped across thread blocks)
    template <bool IS_ALIGNED>
    __device__ __forceinline__ void ConsumeTiles(
        OffsetT             num_row_pixels,
        OffsetT             num_rows,
        OffsetT             row_stride_samples,
        int                 tiles_per_row,
        GridQueue<int>      tile_queue,
        Int2Type<false>     is_work_stealing)
    {
        for (int row = blockIdx.y; row < num_rows; row += gridDim.y)
        {
            OffsetT row_begin   = row * row_stride_samples;
            OffsetT row_end     = row_begin + (num_row_pixels * NUM_CHANNELS);
            OffsetT tile_offset = row_begin + (blockIdx.x * TILE_SAMPLES);

            while (tile_offset < row_end)
            {
                OffsetT num_remaining = row_end - tile_offset;

                if (num_remaining < TILE_SAMPLES)
                {
                    // Consume partial tile
                    ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
                    break;
                }

                // Consume full tile
                ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
                tile_offset += gridDim.x * TILE_SAMPLES;
            }
        }
    }


    //---------------------------------------------------------------------
    // Parameter extraction
    //---------------------------------------------------------------------

    // Return a native pixel pointer (specialized for CacheModifiedInputIterator types)
    template <
        CacheLoadModifier   _MODIFIER,
        typename            _ValueT,
        typename            _OffsetT>
    __device__ __forceinline__ SampleT* NativePointer(CacheModifiedInputIterator<_MODIFIER, _ValueT, _OffsetT> itr)
    {
        return itr.ptr;
    }

    // Return a native pixel pointer (specialized for other types)
    template <typename IteratorT>
    __device__ __forceinline__ SampleT* NativePointer(IteratorT itr)
    {
        return NULL;
    }

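    // NativePointer recovers the raw sample pointer only when the wrapped iterator is a
    // CacheModifiedInputIterator over a native pointer; for arbitrary iterator types it
    // returns NULL, which disables the vectorized (aligned) tile-loading paths selected in
    // ConsumeTiles below.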


    //---------------------------------------------------------------------
    // Interface
    //---------------------------------------------------------------------


    /**
     * Constructor
     */
    __device__ __forceinline__ AgentHistogram(
        TempStorage         &temp_storage,                                      ///< Reference to temp_storage
        SampleIteratorT     d_samples,                                          ///< Input samples
        int                 (&num_output_bins)[NUM_ACTIVE_CHANNELS],            ///< The number of output bins for each channel
        int                 (&num_privatized_bins)[NUM_ACTIVE_CHANNELS],        ///< The number of privatized bins for each channel
        CounterT*           (&d_output_histograms)[NUM_ACTIVE_CHANNELS],        ///< Reference to final output histograms
        CounterT*           (&d_privatized_histograms)[NUM_ACTIVE_CHANNELS],    ///< Reference to privatized histograms
        OutputDecodeOpT     (&output_decode_op)[NUM_ACTIVE_CHANNELS],           ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel
        PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS])       ///< The transform operator for determining privatized counter indices from samples, one for each channel
    :
        temp_storage(temp_storage.Alias()),
        d_wrapped_samples(d_samples),
        num_output_bins(num_output_bins),
        num_privatized_bins(num_privatized_bins),
        d_output_histograms(d_output_histograms),
        output_decode_op(output_decode_op),
        privatized_decode_op(privatized_decode_op),
        d_native_samples(NativePointer(d_wrapped_samples)),
        prefer_smem((MEM_PREFERENCE == SMEM) ?
            true :                              // prefer smem privatized histograms
            (MEM_PREFERENCE == GMEM) ?
                false :                         // prefer gmem privatized histograms
                blockIdx.x & 1)                 // prefer blended privatized histograms
    {
        int blockId = (blockIdx.y * gridDim.x) + blockIdx.x;

        // Initialize the locations of this block's privatized histograms
        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            this->d_privatized_histograms[CHANNEL] = d_privatized_histograms[CHANNEL] + (blockId * num_privatized_bins[CHANNEL]);
    }

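    // Each thread block binds its own slice of the global privatized histograms (offset by
    // blockId * num_privatized_bins[CHANNEL]).  For the BLEND preference, prefer_smem
    // alternates between shared-memory and global-memory privatization across consecutive
    // blockIdx.x values.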

    /**
     * Consume image
     */
    __device__ __forceinline__ void ConsumeTiles(
        OffsetT             num_row_pixels,         ///< The number of multi-channel pixels per row in the region of interest
        OffsetT             num_rows,               ///< The number of rows in the region of interest
        OffsetT             row_stride_samples,     ///< The number of samples between starts of consecutive rows in the region of interest
        int                 tiles_per_row,          ///< Number of image tiles per row
        GridQueue<int>      tile_queue)             ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks
    {
        // Check whether all row starting offsets are quad-aligned (in single-channel) or pixel-aligned (in multi-channel)
        int     quad_mask   = AlignBytes<QuadT>::ALIGN_BYTES - 1;
        int     pixel_mask  = AlignBytes<PixelT>::ALIGN_BYTES - 1;
        size_t  row_bytes   = sizeof(SampleT) * row_stride_samples;

        bool quad_aligned_rows  = (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % 4 == 0) &&   // Single channel
                                  ((size_t(d_native_samples) & quad_mask) == 0) &&          // ptr is quad-aligned
                                  ((num_rows == 1) || ((row_bytes & quad_mask) == 0));      // number of row-samples is a multiple of the alignment of the quad

        bool pixel_aligned_rows = (NUM_CHANNELS > 1) &&                                     // Multi channel
                                  ((size_t(d_native_samples) & pixel_mask) == 0) &&         // ptr is pixel-aligned
                                  ((row_bytes & pixel_mask) == 0);                          // number of row-samples is a multiple of the alignment of the pixel

        // Whether rows are aligned and can be vectorized
        if ((d_native_samples != NULL) && (quad_aligned_rows || pixel_aligned_rows))
            ConsumeTiles<true>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
        else
            ConsumeTiles<false>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
    }


    /**
     * Initialize privatized bin counters
     */
    __device__ __forceinline__ void InitBinCounters()
    {
        if (prefer_smem)
            InitSmemBinCounters();
        else
            InitGmemBinCounters();
    }


    /**
     * Store privatized histogram to device-accessible memory
     */
    __device__ __forceinline__ void StoreOutput()
    {
        if (prefer_smem)
            StoreSmemOutput();
        else
            StoreGmemOutput();
    }


};


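// Illustrative usage sketch (added commentary, not part of the original source): a histogram
// sweep kernel would typically drive this agent roughly as follows, where AgentHistogramT is
// a specialization of the AgentHistogram template above and the kernel name and surrounding
// parameters are hypothetical simplifications of CUB's actual dispatch code:
//
//     __global__ void ExampleHistogramSweepKernel(/* ... problem parameters ... */)
//     {
//         // Shared memory required by the agent
//         __shared__ typename AgentHistogramT::TempStorage temp_storage;
//
//         AgentHistogramT agent(
//             temp_storage, d_samples, num_output_bins, num_privatized_bins,
//             d_output_histograms, d_privatized_histograms,
//             output_decode_op, privatized_decode_op);
//
//         agent.InitBinCounters();                       // zero the privatized counters
//         agent.ConsumeTiles(num_row_pixels, num_rows,   // histogram the region of interest
//                            row_stride_samples, tiles_per_row, tile_queue);
//         agent.StoreOutput();                           // fold privatized counts into the output
//     }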


}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)
