OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
block_adjacent_difference.cuh
Go to the documentation of this file.
1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
34 #pragma once
35 
36 #include "../util_type.cuh"
37 #include "../util_ptx.cuh"
38 #include "../util_namespace.cuh"
39 
41 CUB_NS_PREFIX
42 
44 namespace cub {
45 
46 template <
47  typename T,
48  int BLOCK_DIM_X,
49  int BLOCK_DIM_Y = 1,
50  int BLOCK_DIM_Z = 1,
51  int PTX_ARCH = CUB_PTX_ARCH>
53 {
54 private:
55 
56  /******************************************************************************
57  * Constants and type definitions
58  ******************************************************************************/
59 
61  enum
62  {
64  BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
65  };
66 
67 
69  struct _TempStorage
70  {
71  T first_items[BLOCK_THREADS];
72  T last_items[BLOCK_THREADS];
73  };
74 
75 
76  /******************************************************************************
77  * Utility methods
78  ******************************************************************************/
79 
81  __device__ __forceinline__ _TempStorage& PrivateStorage()
82  {
83  __shared__ _TempStorage private_storage;
84  return private_storage;
85  }
86 
87 
89  template <typename FlagOp, bool HAS_PARAM = BinaryOpHasIdxParam<T, FlagOp>::HAS_PARAM>
90  struct ApplyOp
91  {
92  // Apply flag operator
93  static __device__ __forceinline__ T FlagT(FlagOp flag_op, const T &a, const T &b, int idx)
94  {
95  return flag_op(b, a, idx);
96  }
97  };
98 
100  template <typename FlagOp>
101  struct ApplyOp<FlagOp, false>
102  {
103  // Apply flag operator
104  static __device__ __forceinline__ T FlagT(FlagOp flag_op, const T &a, const T &b, int /*idx*/)
105  {
106  return flag_op(b, a);
107  }
108  };
109 
111  template <int ITERATION, int MAX_ITERATIONS>
112  struct Iterate
113  {
114  // Head flags
115  template <
116  int ITEMS_PER_THREAD,
117  typename FlagT,
118  typename FlagOp>
119  static __device__ __forceinline__ void FlagHeads(
120  int linear_tid,
121  FlagT (&flags)[ITEMS_PER_THREAD],
122  T (&input)[ITEMS_PER_THREAD],
123  T (&preds)[ITEMS_PER_THREAD],
124  FlagOp flag_op)
125  {
126  preds[ITERATION] = input[ITERATION - 1];
127 
128  flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
129  flag_op,
130  preds[ITERATION],
131  input[ITERATION],
132  (linear_tid * ITEMS_PER_THREAD) + ITERATION);
133 
134  Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagHeads(linear_tid, flags, input, preds, flag_op);
135  }
136 
137  // Tail flags
138  template <
139  int ITEMS_PER_THREAD,
140  typename FlagT,
141  typename FlagOp>
142  static __device__ __forceinline__ void FlagTails(
143  int linear_tid,
144  FlagT (&flags)[ITEMS_PER_THREAD],
145  T (&input)[ITEMS_PER_THREAD],
146  FlagOp flag_op)
147  {
148  flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
149  flag_op,
150  input[ITERATION],
151  input[ITERATION + 1],
152  (linear_tid * ITEMS_PER_THREAD) + ITERATION + 1);
153 
155  }
156 
157  };
158 
160  template <int MAX_ITERATIONS>
161  struct Iterate<MAX_ITERATIONS, MAX_ITERATIONS>
162  {
163  // Head flags
164  template <
165  int ITEMS_PER_THREAD,
166  typename FlagT,
167  typename FlagOp>
168  static __device__ __forceinline__ void FlagHeads(
169  int /*linear_tid*/,
170  FlagT (&/*flags*/)[ITEMS_PER_THREAD],
171  T (&/*input*/)[ITEMS_PER_THREAD],
172  T (&/*preds*/)[ITEMS_PER_THREAD],
173  FlagOp /*flag_op*/)
174  {}
175 
176  // Tail flags
177  template <
178  int ITEMS_PER_THREAD,
179  typename FlagT,
180  typename FlagOp>
181  static __device__ __forceinline__ void FlagTails(
182  int /*linear_tid*/,
183  FlagT (&/*flags*/)[ITEMS_PER_THREAD],
184  T (&/*input*/)[ITEMS_PER_THREAD],
185  FlagOp /*flag_op*/)
186  {}
187  };
188 
189 
190  /******************************************************************************
191  * Thread fields
192  ******************************************************************************/
193 
196 
198  unsigned int linear_tid;
199 
200 
201 public:
202 
204  struct TempStorage : Uninitialized<_TempStorage> {};
205 
206 
207  /******************************************************************/
211 
215  __device__ __forceinline__ BlockAdjacentDifference()
216  :
218  linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
219  {}
220 
221 
225  __device__ __forceinline__ BlockAdjacentDifference(
227  :
228  temp_storage(temp_storage.Alias()),
229  linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
230  {}
231 
232 
234  /******************************************************************/
238 
239 
240 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
241 
242  template <
243  int ITEMS_PER_THREAD,
244  typename FlagT,
245  typename FlagOp>
246  __device__ __forceinline__ void FlagHeads(
247  FlagT (&head_flags)[ITEMS_PER_THREAD],
248  T (&input)[ITEMS_PER_THREAD],
249  T (&preds)[ITEMS_PER_THREAD],
250  FlagOp flag_op)
251  {
252  // Share last item
253  temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
254 
255  CTA_SYNC();
256 
257  if (linear_tid == 0)
258  {
259  // Set flag for first thread-item (preds[0] is undefined)
260  head_flags[0] = 1;
261  }
262  else
263  {
264  preds[0] = temp_storage.last_items[linear_tid - 1];
265  head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
266  }
267 
268  // Set head_flags for remaining items
269  Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
270  }
271 
272  template <
273  int ITEMS_PER_THREAD,
274  typename FlagT,
275  typename FlagOp>
276  __device__ __forceinline__ void FlagHeads(
277  FlagT (&head_flags)[ITEMS_PER_THREAD],
278  T (&input)[ITEMS_PER_THREAD],
279  T (&preds)[ITEMS_PER_THREAD],
280  FlagOp flag_op,
281  T tile_predecessor_item)
282  {
283  // Share last item
284  temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
285 
286  CTA_SYNC();
287 
288  // Set flag for first thread-item
289  preds[0] = (linear_tid == 0) ?
290  tile_predecessor_item : // First thread
291  temp_storage.last_items[linear_tid - 1];
292 
293  head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
294 
295  // Set head_flags for remaining items
296  Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
297  }
298 
299 #endif // DOXYGEN_SHOULD_SKIP_THIS
300 
301 
302  template <
303  int ITEMS_PER_THREAD,
304  typename FlagT,
305  typename FlagOp>
306  __device__ __forceinline__ void FlagHeads(
307  FlagT (&head_flags)[ITEMS_PER_THREAD],
308  T (&input)[ITEMS_PER_THREAD],
309  FlagOp flag_op)
310  {
311  T preds[ITEMS_PER_THREAD];
312  FlagHeads(head_flags, input, preds, flag_op);
313  }
314 
315 
316  template <
317  int ITEMS_PER_THREAD,
318  typename FlagT,
319  typename FlagOp>
320  __device__ __forceinline__ void FlagHeads(
321  FlagT (&head_flags)[ITEMS_PER_THREAD],
322  T (&input)[ITEMS_PER_THREAD],
323  FlagOp flag_op,
324  T tile_predecessor_item)
325  {
326  T preds[ITEMS_PER_THREAD];
327  FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item);
328  }
329 
330 
331 
332  template <
333  int ITEMS_PER_THREAD,
334  typename FlagT,
335  typename FlagOp>
336  __device__ __forceinline__ void FlagTails(
337  FlagT (&tail_flags)[ITEMS_PER_THREAD],
338  T (&input)[ITEMS_PER_THREAD],
339  FlagOp flag_op)
340  {
341  // Share first item
342  temp_storage.first_items[linear_tid] = input[0];
343 
344  CTA_SYNC();
345 
346  // Set flag for last thread-item
347  tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
348  1 : // Last thread
350  flag_op,
351  input[ITEMS_PER_THREAD - 1],
352  temp_storage.first_items[linear_tid + 1],
353  (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
354 
355  // Set tail_flags for remaining items
356  Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
357  }
358 
359 
360  template <
361  int ITEMS_PER_THREAD,
362  typename FlagT,
363  typename FlagOp>
364  __device__ __forceinline__ void FlagTails(
365  FlagT (&tail_flags)[ITEMS_PER_THREAD],
366  T (&input)[ITEMS_PER_THREAD],
367  FlagOp flag_op,
368  T tile_successor_item)
369  {
370  // Share first item
371  temp_storage.first_items[linear_tid] = input[0];
372 
373  CTA_SYNC();
374 
375  // Set flag for last thread-item
376  T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
377  tile_successor_item : // Last thread
378  temp_storage.first_items[linear_tid + 1];
379 
380  tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
381  flag_op,
382  input[ITEMS_PER_THREAD - 1],
383  successor_item,
384  (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
385 
386  // Set tail_flags for remaining items
387  Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
388  }
389 
390 
391  template <
392  int ITEMS_PER_THREAD,
393  typename FlagT,
394  typename FlagOp>
395  __device__ __forceinline__ void FlagHeadsAndTails(
396  FlagT (&head_flags)[ITEMS_PER_THREAD],
397  FlagT (&tail_flags)[ITEMS_PER_THREAD],
398  T (&input)[ITEMS_PER_THREAD],
399  FlagOp flag_op)
400  {
401  // Share first and last items
402  temp_storage.first_items[linear_tid] = input[0];
403  temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
404 
405  CTA_SYNC();
406 
407  T preds[ITEMS_PER_THREAD];
408 
409  // Set flag for first thread-item
410  preds[0] = temp_storage.last_items[linear_tid - 1];
411  if (linear_tid == 0)
412  {
413  head_flags[0] = 1;
414  }
415  else
416  {
417  head_flags[0] = ApplyOp<FlagOp>::FlagT(
418  flag_op,
419  preds[0],
420  input[0],
421  linear_tid * ITEMS_PER_THREAD);
422  }
423 
424 
425  // Set flag for last thread-item
426  tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
427  1 : // Last thread
429  flag_op,
430  input[ITEMS_PER_THREAD - 1],
431  temp_storage.first_items[linear_tid + 1],
432  (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
433 
434  // Set head_flags for remaining items
435  Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
436 
437  // Set tail_flags for remaining items
438  Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
439  }
440 
441 
442  template <
443  int ITEMS_PER_THREAD,
444  typename FlagT,
445  typename FlagOp>
446  __device__ __forceinline__ void FlagHeadsAndTails(
447  FlagT (&head_flags)[ITEMS_PER_THREAD],
448  FlagT (&tail_flags)[ITEMS_PER_THREAD],
449  T tile_successor_item,
450  T (&input)[ITEMS_PER_THREAD],
451  FlagOp flag_op)
452  {
453  // Share first and last items
454  temp_storage.first_items[linear_tid] = input[0];
455  temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
456 
457  CTA_SYNC();
458 
459  T preds[ITEMS_PER_THREAD];
460 
461  // Set flag for first thread-item
462  if (linear_tid == 0)
463  {
464  head_flags[0] = 1;
465  }
466  else
467  {
468  preds[0] = temp_storage.last_items[linear_tid - 1];
469  head_flags[0] = ApplyOp<FlagOp>::FlagT(
470  flag_op,
471  preds[0],
472  input[0],
473  linear_tid * ITEMS_PER_THREAD);
474  }
475 
476  // Set flag for last thread-item
477  T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
478  tile_successor_item : // Last thread
479  temp_storage.first_items[linear_tid + 1];
480 
481  tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
482  flag_op,
483  input[ITEMS_PER_THREAD - 1],
484  successor_item,
485  (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
486 
487  // Set head_flags for remaining items
488  Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
489 
490  // Set tail_flags for remaining items
491  Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
492  }
493 
494  template <
495  int ITEMS_PER_THREAD,
496  typename FlagT,
497  typename FlagOp>
498  __device__ __forceinline__ void FlagHeadsAndTails(
499  FlagT (&head_flags)[ITEMS_PER_THREAD],
500  T tile_predecessor_item,
501  FlagT (&tail_flags)[ITEMS_PER_THREAD],
502  T (&input)[ITEMS_PER_THREAD],
503  FlagOp flag_op)
504  {
505  // Share first and last items
506  temp_storage.first_items[linear_tid] = input[0];
507  temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
508 
509  CTA_SYNC();
510 
511  T preds[ITEMS_PER_THREAD];
512 
513  // Set flag for first thread-item
514  preds[0] = (linear_tid == 0) ?
515  tile_predecessor_item : // First thread
516  temp_storage.last_items[linear_tid - 1];
517 
518  head_flags[0] = ApplyOp<FlagOp>::FlagT(
519  flag_op,
520  preds[0],
521  input[0],
522  linear_tid * ITEMS_PER_THREAD);
523 
524  // Set flag for last thread-item
525  tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
526  1 : // Last thread
528  flag_op,
529  input[ITEMS_PER_THREAD - 1],
530  temp_storage.first_items[linear_tid + 1],
531  (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
532 
533  // Set head_flags for remaining items
534  Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
535 
536  // Set tail_flags for remaining items
537  Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
538  }
539 
540 
541  template <
542  int ITEMS_PER_THREAD,
543  typename FlagT,
544  typename FlagOp>
545  __device__ __forceinline__ void FlagHeadsAndTails(
546  FlagT (&head_flags)[ITEMS_PER_THREAD],
547  T tile_predecessor_item,
548  FlagT (&tail_flags)[ITEMS_PER_THREAD],
549  T tile_successor_item,
550  T (&input)[ITEMS_PER_THREAD],
551  FlagOp flag_op)
552  {
553  // Share first and last items
554  temp_storage.first_items[linear_tid] = input[0];
555  temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
556 
557  CTA_SYNC();
558 
559  T preds[ITEMS_PER_THREAD];
560 
561  // Set flag for first thread-item
562  preds[0] = (linear_tid == 0) ?
563  tile_predecessor_item : // First thread
564  temp_storage.last_items[linear_tid - 1];
565 
566  head_flags[0] = ApplyOp<FlagOp>::FlagT(
567  flag_op,
568  preds[0],
569  input[0],
570  linear_tid * ITEMS_PER_THREAD);
571 
572  // Set flag for last thread-item
573  T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
574  tile_successor_item : // Last thread
575  temp_storage.first_items[linear_tid + 1];
576 
577  tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
578  flag_op,
579  input[ITEMS_PER_THREAD - 1],
580  successor_item,
581  (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
582 
583  // Set head_flags for remaining items
584  Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
585 
586  // Set tail_flags for remaining items
587  Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
588  }
589 
590 
591 
592 };
593 
594 
595 } // CUB namespace
596 CUB_NS_POSTFIX // Optional outer namespace(s)
__device__ __forceinline__ void FlagHeads(FlagT(&head_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], T(&preds)[ITEMS_PER_THREAD], FlagOp flag_op, T tile_predecessor_item)
__device__ __forceinline__ void FlagHeadsAndTails(FlagT(&head_flags)[ITEMS_PER_THREAD], FlagT(&tail_flags)[ITEMS_PER_THREAD], T tile_successor_item, T(&input)[ITEMS_PER_THREAD], FlagOp flag_op)
Optional outer namespace(s)
__device__ __forceinline__ void FlagHeadsAndTails(FlagT(&head_flags)[ITEMS_PER_THREAD], T tile_predecessor_item, FlagT(&tail_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], FlagOp flag_op)
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host p...
Definition: util_arch.cuh:53
static __device__ __forceinline__ void FlagHeads(int linear_tid, FlagT(&flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], T(&preds)[ITEMS_PER_THREAD], FlagOp flag_op)
Shared memory storage layout type (last element from each thread's input)
__device__ __forceinline__ BlockAdjacentDifference()
Collective constructor using a private static allocation of shared memory as temporary storage.
__device__ __forceinline__ _TempStorage & PrivateStorage()
Internal storage allocator.
CTA_SYNC()
Definition: util_ptx.cuh:255
_TempStorage & temp_storage
Shared storage reference.
Templated unrolling of item comparison (inductive case)
__device__ __forceinline__ void FlagHeads(FlagT(&head_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], FlagOp flag_op)
unsigned int linear_tid
Linear thread-id.
__device__ __forceinline__ void FlagHeadsAndTails(FlagT(&head_flags)[ITEMS_PER_THREAD], T tile_predecessor_item, FlagT(&tail_flags)[ITEMS_PER_THREAD], T tile_successor_item, T(&input)[ITEMS_PER_THREAD], FlagOp flag_op)
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
Definition: util_ptx.cuh:409
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
Definition: util_type.cuh:634
__device__ __forceinline__ void FlagHeads(FlagT(&head_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], T(&preds)[ITEMS_PER_THREAD], FlagOp flag_op)
__device__ __forceinline__ void FlagTails(FlagT(&tail_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], FlagOp flag_op)
static __device__ __forceinline__ void FlagTails(int linear_tid, FlagT(&flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], FlagOp flag_op)
__device__ __forceinline__ void FlagHeadsAndTails(FlagT(&head_flags)[ITEMS_PER_THREAD], FlagT(&tail_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], FlagOp flag_op)
__device__ __forceinline__ void FlagTails(FlagT(&tail_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], FlagOp flag_op, T tile_successor_item)
__device__ __forceinline__ BlockAdjacentDifference(TempStorage &temp_storage)
Collective constructor using the specified memory allocation as temporary storage.
Specialization for when FlagOp has third index param.
__device__ __forceinline__ void FlagHeads(FlagT(&head_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], FlagOp flag_op, T tile_predecessor_item)