OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
block_discontinuity.cuh
Go to the documentation of this file.
1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
34 #pragma once
35 
36 #include "../util_type.cuh"
37 #include "../util_ptx.cuh"
38 #include "../util_namespace.cuh"
39 
41 CUB_NS_PREFIX
42 
44 namespace cub {
45 
102 template <
103  typename T,
104  int BLOCK_DIM_X,
105  int BLOCK_DIM_Y = 1,
106  int BLOCK_DIM_Z = 1,
107  int PTX_ARCH = CUB_PTX_ARCH>
109 {
110 private:
111 
112  /******************************************************************************
113  * Constants and type definitions
114  ******************************************************************************/
115 
117  enum
118  {
120  BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
121  };
122 
123 
126  {
127  T first_items[BLOCK_THREADS];
128  T last_items[BLOCK_THREADS];
129  };
130 
131 
132  /******************************************************************************
133  * Utility methods
134  ******************************************************************************/
135 
137  __device__ __forceinline__ _TempStorage& PrivateStorage()
138  {
139  __shared__ _TempStorage private_storage;
140  return private_storage;
141  }
142 
143 
145  template <typename FlagOp, bool HAS_PARAM = BinaryOpHasIdxParam<T, FlagOp>::HAS_PARAM>
146  struct ApplyOp
147  {
148  // Apply flag operator
149  static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int idx)
150  {
151  return flag_op(a, b, idx);
152  }
153  };
154 
156  template <typename FlagOp>
157  struct ApplyOp<FlagOp, false>
158  {
159  // Apply flag operator
160  static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int /*idx*/)
161  {
162  return flag_op(a, b);
163  }
164  };
165 
167  template <int ITERATION, int MAX_ITERATIONS>
168  struct Iterate
169  {
170  // Head flags
171  template <
172  int ITEMS_PER_THREAD,
173  typename FlagT,
174  typename FlagOp>
175  static __device__ __forceinline__ void FlagHeads(
176  int linear_tid,
177  FlagT (&flags)[ITEMS_PER_THREAD],
178  T (&input)[ITEMS_PER_THREAD],
179  T (&preds)[ITEMS_PER_THREAD],
180  FlagOp flag_op)
181  {
182  preds[ITERATION] = input[ITERATION - 1];
183 
184  flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
185  flag_op,
186  preds[ITERATION],
187  input[ITERATION],
188  (linear_tid * ITEMS_PER_THREAD) + ITERATION);
189 
190  Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagHeads(linear_tid, flags, input, preds, flag_op);
191  }
192 
193  // Tail flags
194  template <
195  int ITEMS_PER_THREAD,
196  typename FlagT,
197  typename FlagOp>
198  static __device__ __forceinline__ void FlagTails(
199  int linear_tid,
200  FlagT (&flags)[ITEMS_PER_THREAD],
201  T (&input)[ITEMS_PER_THREAD],
202  FlagOp flag_op)
203  {
204  flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
205  flag_op,
206  input[ITERATION],
207  input[ITERATION + 1],
208  (linear_tid * ITEMS_PER_THREAD) + ITERATION + 1);
209 
211  }
212 
213  };
214 
216  template <int MAX_ITERATIONS>
217  struct Iterate<MAX_ITERATIONS, MAX_ITERATIONS>
218  {
219  // Head flags
220  template <
221  int ITEMS_PER_THREAD,
222  typename FlagT,
223  typename FlagOp>
224  static __device__ __forceinline__ void FlagHeads(
225  int /*linear_tid*/,
226  FlagT (&/*flags*/)[ITEMS_PER_THREAD],
227  T (&/*input*/)[ITEMS_PER_THREAD],
228  T (&/*preds*/)[ITEMS_PER_THREAD],
229  FlagOp /*flag_op*/)
230  {}
231 
232  // Tail flags
233  template <
234  int ITEMS_PER_THREAD,
235  typename FlagT,
236  typename FlagOp>
237  static __device__ __forceinline__ void FlagTails(
238  int /*linear_tid*/,
239  FlagT (&/*flags*/)[ITEMS_PER_THREAD],
240  T (&/*input*/)[ITEMS_PER_THREAD],
241  FlagOp /*flag_op*/)
242  {}
243  };
244 
245 
246  /******************************************************************************
247  * Thread fields
248  ******************************************************************************/
249 
252 
254  unsigned int linear_tid;
255 
256 
257 public:
258 
260  struct TempStorage : Uninitialized<_TempStorage> {};
261 
262 
263  /******************************************************************/
267 
271  __device__ __forceinline__ BlockDiscontinuity()
272  :
274  linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
275  {}
276 
277 
281  __device__ __forceinline__ BlockDiscontinuity(
283  :
284  temp_storage(temp_storage.Alias()),
285  linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
286  {}
287 
288 
290  /******************************************************************/
294 
295 
296 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
297 
298  template <
299  int ITEMS_PER_THREAD,
300  typename FlagT,
301  typename FlagOp>
302  __device__ __forceinline__ void FlagHeads(
303  FlagT (&head_flags)[ITEMS_PER_THREAD],
304  T (&input)[ITEMS_PER_THREAD],
305  T (&preds)[ITEMS_PER_THREAD],
306  FlagOp flag_op)
307  {
308  // Share last item
309  temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
310 
311  CTA_SYNC();
312 
313  if (linear_tid == 0)
314  {
315  // Set flag for first thread-item (preds[0] is undefined)
316  head_flags[0] = 1;
317  }
318  else
319  {
320  preds[0] = temp_storage.last_items[linear_tid - 1];
321  head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
322  }
323 
324  // Set head_flags for remaining items
325  Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
326  }
327 
328  template <
329  int ITEMS_PER_THREAD,
330  typename FlagT,
331  typename FlagOp>
332  __device__ __forceinline__ void FlagHeads(
333  FlagT (&head_flags)[ITEMS_PER_THREAD],
334  T (&input)[ITEMS_PER_THREAD],
335  T (&preds)[ITEMS_PER_THREAD],
336  FlagOp flag_op,
337  T tile_predecessor_item)
338  {
339  // Share last item
340  temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
341 
342  CTA_SYNC();
343 
344  // Set flag for first thread-item
345  preds[0] = (linear_tid == 0) ?
346  tile_predecessor_item : // First thread
347  temp_storage.last_items[linear_tid - 1];
348 
349  head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
350 
351  // Set head_flags for remaining items
352  Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
353  }
354 
355 #endif // DOXYGEN_SHOULD_SKIP_THIS
356 
357 
407  template <
408  int ITEMS_PER_THREAD,
409  typename FlagT,
410  typename FlagOp>
411  __device__ __forceinline__ void FlagHeads(
412  FlagT (&head_flags)[ITEMS_PER_THREAD],
413  T (&input)[ITEMS_PER_THREAD],
414  FlagOp flag_op)
415  {
416  T preds[ITEMS_PER_THREAD];
417  FlagHeads(head_flags, input, preds, flag_op);
418  }
419 
420 
476  template <
477  int ITEMS_PER_THREAD,
478  typename FlagT,
479  typename FlagOp>
480  __device__ __forceinline__ void FlagHeads(
481  FlagT (&head_flags)[ITEMS_PER_THREAD],
482  T (&input)[ITEMS_PER_THREAD],
483  FlagOp flag_op,
484  T tile_predecessor_item)
485  {
486  T preds[ITEMS_PER_THREAD];
487  FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item);
488  }
489 
490 
491 
493  /******************************************************************/
497 
498 
549  template <
550  int ITEMS_PER_THREAD,
551  typename FlagT,
552  typename FlagOp>
553  __device__ __forceinline__ void FlagTails(
554  FlagT (&tail_flags)[ITEMS_PER_THREAD],
555  T (&input)[ITEMS_PER_THREAD],
556  FlagOp flag_op)
557  {
558  // Share first item
559  temp_storage.first_items[linear_tid] = input[0];
560 
561  CTA_SYNC();
562 
563  // Set flag for last thread-item
564  tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
565  1 : // Last thread
567  flag_op,
568  input[ITEMS_PER_THREAD - 1],
569  temp_storage.first_items[linear_tid + 1],
570  (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
571 
572  // Set tail_flags for remaining items
573  Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
574  }
575 
576 
633  template <
634  int ITEMS_PER_THREAD,
635  typename FlagT,
636  typename FlagOp>
637  __device__ __forceinline__ void FlagTails(
638  FlagT (&tail_flags)[ITEMS_PER_THREAD],
639  T (&input)[ITEMS_PER_THREAD],
640  FlagOp flag_op,
641  T tile_successor_item)
642  {
643  // Share first item
644  temp_storage.first_items[linear_tid] = input[0];
645 
646  CTA_SYNC();
647 
648  // Set flag for last thread-item
649  T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
650  tile_successor_item : // Last thread
651  temp_storage.first_items[linear_tid + 1];
652 
653  tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
654  flag_op,
655  input[ITEMS_PER_THREAD - 1],
656  successor_item,
657  (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
658 
659  // Set tail_flags for remaining items
660  Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
661  }
662 
663 
665  /******************************************************************/
669 
670 
731  template <
732  int ITEMS_PER_THREAD,
733  typename FlagT,
734  typename FlagOp>
735  __device__ __forceinline__ void FlagHeadsAndTails(
736  FlagT (&head_flags)[ITEMS_PER_THREAD],
737  FlagT (&tail_flags)[ITEMS_PER_THREAD],
738  T (&input)[ITEMS_PER_THREAD],
739  FlagOp flag_op)
740  {
741  // Share first and last items
742  temp_storage.first_items[linear_tid] = input[0];
743  temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
744 
745  CTA_SYNC();
746 
747  T preds[ITEMS_PER_THREAD];
748 
749  // Set flag for first thread-item
750  preds[0] = temp_storage.last_items[linear_tid - 1];
751  if (linear_tid == 0)
752  {
753  head_flags[0] = 1;
754  }
755  else
756  {
757  head_flags[0] = ApplyOp<FlagOp>::FlagT(
758  flag_op,
759  preds[0],
760  input[0],
761  linear_tid * ITEMS_PER_THREAD);
762  }
763 
764 
765  // Set flag for last thread-item
766  tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
767  1 : // Last thread
769  flag_op,
770  input[ITEMS_PER_THREAD - 1],
771  temp_storage.first_items[linear_tid + 1],
772  (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
773 
774  // Set head_flags for remaining items
775  Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
776 
777  // Set tail_flags for remaining items
778  Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
779  }
780 
781 
847  template <
848  int ITEMS_PER_THREAD,
849  typename FlagT,
850  typename FlagOp>
851  __device__ __forceinline__ void FlagHeadsAndTails(
852  FlagT (&head_flags)[ITEMS_PER_THREAD],
853  FlagT (&tail_flags)[ITEMS_PER_THREAD],
854  T tile_successor_item,
855  T (&input)[ITEMS_PER_THREAD],
856  FlagOp flag_op)
857  {
858  // Share first and last items
859  temp_storage.first_items[linear_tid] = input[0];
860  temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
861 
862  CTA_SYNC();
863 
864  T preds[ITEMS_PER_THREAD];
865 
866  // Set flag for first thread-item
867  if (linear_tid == 0)
868  {
869  head_flags[0] = 1;
870  }
871  else
872  {
873  preds[0] = temp_storage.last_items[linear_tid - 1];
874  head_flags[0] = ApplyOp<FlagOp>::FlagT(
875  flag_op,
876  preds[0],
877  input[0],
878  linear_tid * ITEMS_PER_THREAD);
879  }
880 
881  // Set flag for last thread-item
882  T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
883  tile_successor_item : // Last thread
884  temp_storage.first_items[linear_tid + 1];
885 
886  tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
887  flag_op,
888  input[ITEMS_PER_THREAD - 1],
889  successor_item,
890  (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
891 
892  // Set head_flags for remaining items
893  Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
894 
895  // Set tail_flags for remaining items
896  Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
897  }
898 
899 
971  template <
972  int ITEMS_PER_THREAD,
973  typename FlagT,
974  typename FlagOp>
975  __device__ __forceinline__ void FlagHeadsAndTails(
976  FlagT (&head_flags)[ITEMS_PER_THREAD],
977  T tile_predecessor_item,
978  FlagT (&tail_flags)[ITEMS_PER_THREAD],
979  T (&input)[ITEMS_PER_THREAD],
980  FlagOp flag_op)
981  {
982  // Share first and last items
983  temp_storage.first_items[linear_tid] = input[0];
984  temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
985 
986  CTA_SYNC();
987 
988  T preds[ITEMS_PER_THREAD];
989 
990  // Set flag for first thread-item
991  preds[0] = (linear_tid == 0) ?
992  tile_predecessor_item : // First thread
993  temp_storage.last_items[linear_tid - 1];
994 
995  head_flags[0] = ApplyOp<FlagOp>::FlagT(
996  flag_op,
997  preds[0],
998  input[0],
999  linear_tid * ITEMS_PER_THREAD);
1000 
1001  // Set flag for last thread-item
1002  tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
1003  1 : // Last thread
1005  flag_op,
1006  input[ITEMS_PER_THREAD - 1],
1007  temp_storage.first_items[linear_tid + 1],
1008  (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
1009 
1010  // Set head_flags for remaining items
1011  Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
1012 
1013  // Set tail_flags for remaining items
1014  Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
1015  }
1016 
1017 
1090  template <
1091  int ITEMS_PER_THREAD,
1092  typename FlagT,
1093  typename FlagOp>
1094  __device__ __forceinline__ void FlagHeadsAndTails(
1095  FlagT (&head_flags)[ITEMS_PER_THREAD],
1096  T tile_predecessor_item,
1097  FlagT (&tail_flags)[ITEMS_PER_THREAD],
1098  T tile_successor_item,
1099  T (&input)[ITEMS_PER_THREAD],
1100  FlagOp flag_op)
1101  {
1102  // Share first and last items
1103  temp_storage.first_items[linear_tid] = input[0];
1104  temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
1105 
1106  CTA_SYNC();
1107 
1108  T preds[ITEMS_PER_THREAD];
1109 
1110  // Set flag for first thread-item
1111  preds[0] = (linear_tid == 0) ?
1112  tile_predecessor_item : // First thread
1113  temp_storage.last_items[linear_tid - 1];
1114 
1115  head_flags[0] = ApplyOp<FlagOp>::FlagT(
1116  flag_op,
1117  preds[0],
1118  input[0],
1119  linear_tid * ITEMS_PER_THREAD);
1120 
1121  // Set flag for last thread-item
1122  T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
1123  tile_successor_item : // Last thread
1124  temp_storage.first_items[linear_tid + 1];
1125 
1126  tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
1127  flag_op,
1128  input[ITEMS_PER_THREAD - 1],
1129  successor_item,
1130  (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
1131 
1132  // Set head_flags for remaining items
1133  Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
1134 
1135  // Set tail_flags for remaining items
1136  Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
1137  }
1138 
1139 
1140 
1141 
1143 
1144 };
1145 
1146 
1147 } // CUB namespace
1148 CUB_NS_POSTFIX // Optional outer namespace(s)
\smemstorage{BlockDiscontinuity}
Shared memory storage layout type (last element from each thread's input)
__device__ __forceinline__ void FlagHeads(FlagT(&head_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], FlagOp flag_op, T tile_predecessor_item)
Sets head flags indicating discontinuities between items partitioned across the thread block.
Optional outer namespace(s)
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host p...
Definition: util_arch.cuh:53
_TempStorage & temp_storage
Shared storage reference.
__device__ __forceinline__ _TempStorage & PrivateStorage()
Internal storage allocator.
CTA_SYNC()
Definition: util_ptx.cuh:255
__device__ __forceinline__ void FlagTails(FlagT(&tail_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], FlagOp flag_op, T tile_successor_item)
Sets tail flags indicating discontinuities between items partitioned across the thread block.
__device__ __forceinline__ void FlagTails(FlagT(&tail_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], FlagOp flag_op)
Sets tail flags indicating discontinuities between items partitioned across the thread block,...
The BlockDiscontinuity class provides collective methods for flagging discontinuities within an order...
Specialization for when FlagOp has third index param.
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
Definition: util_ptx.cuh:409
static __device__ __forceinline__ void FlagHeads(int linear_tid, FlagT(&flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], T(&preds)[ITEMS_PER_THREAD], FlagOp flag_op)
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
Definition: util_type.cuh:634
__device__ __forceinline__ void FlagHeads(FlagT(&head_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], T(&preds)[ITEMS_PER_THREAD], FlagOp flag_op)
__device__ __forceinline__ BlockDiscontinuity()
Collective constructor using a private static allocation of shared memory as temporary storage.
static __device__ __forceinline__ void FlagTails(int linear_tid, FlagT(&flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], FlagOp flag_op)
unsigned int linear_tid
Linear thread-id.
__device__ __forceinline__ BlockDiscontinuity(TempStorage &temp_storage)
Collective constructor using the specified memory allocation as temporary storage.
The thread block size in threads.
__device__ __forceinline__ void FlagHeadsAndTails(FlagT(&head_flags)[ITEMS_PER_THREAD], T tile_predecessor_item, FlagT(&tail_flags)[ITEMS_PER_THREAD], T tile_successor_item, T(&input)[ITEMS_PER_THREAD], FlagOp flag_op)
Sets both head and tail flags indicating discontinuities between items partitioned across the thread ...
__device__ __forceinline__ void FlagHeadsAndTails(FlagT(&head_flags)[ITEMS_PER_THREAD], FlagT(&tail_flags)[ITEMS_PER_THREAD], T tile_successor_item, T(&input)[ITEMS_PER_THREAD], FlagOp flag_op)
Sets both head and tail flags indicating discontinuities between items partitioned across the thread ...
__device__ __forceinline__ void FlagHeadsAndTails(FlagT(&head_flags)[ITEMS_PER_THREAD], FlagT(&tail_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], FlagOp flag_op)
Sets both head and tail flags indicating discontinuities between items partitioned across the thread ...
__device__ __forceinline__ void FlagHeads(FlagT(&head_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], T(&preds)[ITEMS_PER_THREAD], FlagOp flag_op, T tile_predecessor_item)
Templated unrolling of item comparison (inductive case)
__device__ __forceinline__ void FlagHeadsAndTails(FlagT(&head_flags)[ITEMS_PER_THREAD], T tile_predecessor_item, FlagT(&tail_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], FlagOp flag_op)
Sets both head and tail flags indicating discontinuities between items partitioned across the thread ...
__device__ __forceinline__ void FlagHeads(FlagT(&head_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], FlagOp flag_op)
Sets head flags indicating discontinuities between items partitioned across the thread block,...