From ac7c06bde64923c9190b78ab436f7ef304ee17ec Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Tue, 31 May 2022 21:29:59 +0400 Subject: [PATCH 1/2] Reformat select docs --- cub/device/device_select.cuh | 999 +++++++++++++++++++++-------------- 1 file changed, 616 insertions(+), 383 deletions(-) diff --git a/cub/device/device_select.cuh b/cub/device/device_select.cuh index f9ed6d0a25..4bff04aed2 100644 --- a/cub/device/device_select.cuh +++ b/cub/device/device_select.cuh @@ -1,7 +1,6 @@ - /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -14,10 +13,10 @@ * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND @@ -28,418 +27,652 @@ ******************************************************************************/ /** - * \file - * cub::DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory. + * @file cub::DeviceSelect provides device-wide, parallel operations for + * compacting selected items from sequences of data items residing within + * device-accessible memory. */ #pragma once -#include #include +#include -#include "dispatch/dispatch_select_if.cuh" -#include "dispatch/dispatch_unique_by_key.cuh" -#include "../config.cuh" +#include +#include +#include CUB_NAMESPACE_BEGIN /** - * \brief DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory. ![](select_logo.png) - * \ingroup SingleModule + * @brief DeviceSelect provides device-wide, parallel operations for compacting + * selected items from sequences of data items residing within + * device-accessible memory. ![](select_logo.png) + * @ingroup SingleModule * - * \par Overview + * @par Overview * These operations apply a selection criterion to selectively copy * items from a specified input sequence to a compact output sequence. 
* - * \par Usage Considerations - * \cdp_class{DeviceSelect} + * @par Usage Considerations + * @cdp_class{DeviceSelect} * - * \par Performance - * \linear_performance{select-flagged, select-if, and select-unique} + * @par Performance + * @linear_performance{select-flagged, select-if, and select-unique} * - * \par - * The following chart illustrates DeviceSelect::If - * performance across different CUDA architectures for \p int32 items, - * where 50% of the items are randomly selected. + * @par + * The following chart illustrates DeviceSelect::If performance across + * different CUDA architectures for `int32` items, where 50% of the items are + * randomly selected. * - * \image html select_if_int32_50_percent.png + * @image html select_if_int32_50_percent.png * - * \par - * The following chart illustrates DeviceSelect::Unique - * performance across different CUDA architectures for \p int32 items - * where segments have lengths uniformly sampled from [1,1000]. + * @par + * The following chart illustrates DeviceSelect::Unique performance across + * different CUDA architectures for `int32` items where segments have lengths + * uniformly sampled from `[1, 1000]`. * - * \image html select_unique_int32_len_500.png + * @image html select_unique_int32_len_500.png * - * \par - * \plots_below + * @par + * @plots_below * */ struct DeviceSelect { - /** - * \brief Uses the \p d_flags sequence to selectively copy the corresponding items from \p d_in into \p d_out. The total number of items selected is written to \p d_num_selected_out. ![](select_flags_logo.png) - * - * \par - * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.). - * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the compaction of items selected from an \p int device vector. 
- * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] - * char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] - * int *d_out; // e.g., [ , , , , , , , ] - * int *d_num_selected_out; // e.g., [ ] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); - * - * // d_out <-- [1, 4, 6, 7] - * // d_num_selected_out <-- [4] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam FlagIterator [inferred] Random-access input iterator type for reading selection flags \iterator - * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing selected items \iterator - * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator - */ - template < - typename InputIteratorT, - typename FlagIterator, - typename OutputIteratorT, - typename NumSelectedIteratorT> - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Flagged( - void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags - OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items - NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out) - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - typedef int OffsetT; // Signed integer type for global offsets - typedef NullType SelectOp; // Selection op (not used) - typedef NullType EqualityOp; // Equality operator (not used) - - return DispatchSelectIf::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_flags, - d_out, - d_num_selected_out, - SelectOp(), - EqualityOp(), - num_items, - stream, - debug_synchronous); - } - - - /** - * \brief Uses the \p select_op functor to selectively copy items from \p d_in into \p d_out. The total number of items selected is written to \p d_num_selected_out. ![](select_logo.png) - * - * \par - * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. - * - \devicestorage - * - * \par Performance - * The following charts illustrate saturated select-if performance across different - * CUDA architectures for \p int32 and \p int64 items, respectively. Items are - * selected with 50% probability. 
- * - * \image html select_if_int32_50_percent.png - * \image html select_if_int64_50_percent.png - * - * \par - * The following charts are similar, but 5% selection probability: - * - * \image html select_if_int32_5_percent.png - * \image html select_if_int64_5_percent.png - * - * \par Snippet - * The code snippet below illustrates the compaction of items selected from an \p int device vector. - * \par - * \code - * #include // or equivalently - * - * // Functor type for selecting values less than some criteria - * struct LessThan - * { - * int compare; - * - * CUB_RUNTIME_FUNCTION __forceinline__ - * LessThan(int compare) : compare(compare) {} - * - * CUB_RUNTIME_FUNCTION __forceinline__ - * bool operator()(const int &a) const { - * return (a < compare); - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] - * int *d_out; // e.g., [ , , , , , , , ] - * int *d_num_selected_out; // e.g., [ ] - * LessThan select_op(7); - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); - * - * // d_out <-- [0, 2, 3, 5, 2] - * // d_num_selected_out <-- [5] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing selected items \iterator - * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator - * \tparam SelectOp [inferred] Selection operator type having member bool operator()(const T &a) - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename NumSelectedIteratorT, - typename SelectOp> - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t If( - void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items - NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out) - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - SelectOp select_op, ///< [in] Unary selection operator - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - typedef int OffsetT; // Signed integer type for global offsets - typedef NullType* FlagIterator; // FlagT iterator type (not used) - typedef NullType EqualityOp; // Equality operator (not used) + /** + * @brief Uses the `d_flags` sequence to selectively copy the corresponding + * items from `d_in` into `d_out`. The total number of items selected + * is written to `d_num_selected_out`. ![](select_flags_logo.png) + * + * @par + * - The value type of `d_flags` must be castable to `bool` (e.g., `bool`, + * `char`, `int`, etc.). + * - Copies of the selected items are compacted into `d_out` and maintain + * their original relative ordering. + * - @devicestorage + * + * @par Snippet + * The code snippet below illustrates the compaction of items selected from + * an `int` device vector. 
+ * @par + * @code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input, + * // flags, and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] + * char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSelect::Flagged( + * d_temp_storage, temp_storage_bytes, + * d_in, d_flags, d_out, d_num_selected_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DeviceSelect::Flagged( + * d_temp_storage, temp_storage_bytes, + * d_in, d_flags, d_out, d_num_selected_out, num_items); + * + * // d_out <-- [1, 4, 6, 7] + * // d_num_selected_out <-- [4] + * + * @endcode + * + * @tparam InputIteratorT + * **[inferred]** Random-access input iterator type for reading input + * items \iterator + * + * @tparam FlagIterator + * **[inferred]** Random-access input iterator type for reading selection + * flags \iterator + * + * @tparam OutputIteratorT + * **[inferred]** Random-access output iterator type for writing selected + * items \iterator + * + * @tparam NumSelectedIteratorT + * **[inferred]** Output iterator type for recording the number of items + * selected \iterator + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work + * is done. 
+ * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of `d_temp_storage` allocation + * + * @param[in] d_in + * Pointer to the input sequence of data items + * + * @param[in] d_flags + * Pointer to the input sequence of selection flags + * + * @param[out] d_out + * Pointer to the output sequence of selected data items + * + * @param[out] d_num_selected_out + * Pointer to the output total number of items selected + * (i.e., length of `d_out`) + * + * @param[in] num_items + * Total number of input items (i.e., length of `d_in`) + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. + * Default is stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. May cause significant slowdown. + * Default is `false`. + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t + Flagged(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + FlagIterator d_flags, + OutputIteratorT d_out, + NumSelectedIteratorT d_num_selected_out, + int num_items, + cudaStream_t stream = 0, + bool debug_synchronous = false) + { + using OffsetT = int; // Signed integer type for global offsets + using SelectOp = NullType; // Selection op (not used) + using EqualityOp = NullType; // Equality operator (not used) - return DispatchSelectIf::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - NULL, - d_out, - d_num_selected_out, - select_op, - EqualityOp(), - num_items, - stream, - debug_synchronous); - } + return DispatchSelectIf::Dispatch(d_temp_storage, + temp_storage_bytes, + d_in, + d_flags, + d_out, + d_num_selected_out, + SelectOp(), + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + /** + * @brief Uses the `select_op` functor to selectively copy items from `d_in` + * into `d_out`. The total number of items selected is written to + * `d_num_selected_out`. 
![](select_logo.png) + * + * @par + * - Copies of the selected items are compacted into `d_out` and maintain + * their original relative ordering. + * - @devicestorage + * + * @par Performance + * The following charts illustrate saturated select-if performance across + * different CUDA architectures for `int32` and `int64` items, respectively. + * Items are selected with 50% probability. + * + * @image html select_if_int32_50_percent.png + * @image html select_if_int64_50_percent.png + * + * @par + * The following charts are similar, but 5% selection probability: + * + * @image html select_if_int32_5_percent.png + * @image html select_if_int64_5_percent.png + * + * @par Snippet + * The code snippet below illustrates the compaction of items selected from + * an `int` device vector. + * @par + * @code + * #include // or equivalently + * + * // Functor type for selecting values less than some criteria + * struct LessThan + * { + * int compare; + * + * CUB_RUNTIME_FUNCTION __forceinline__ + * LessThan(int compare) : compare(compare) {} + * + * CUB_RUNTIME_FUNCTION __forceinline__ + * bool operator()(const int &a) const { + * return (a < compare); + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers + * // for input and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * LessThan select_op(7); + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSelect::If( + * d_temp_storage, temp_storage_bytes, + * d_in, d_out, d_num_selected_out, num_items, select_op); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DeviceSelect::If( + * d_temp_storage, temp_storage_bytes, + * d_in, d_out, d_num_selected_out, num_items, select_op); + * + * // d_out <-- [0, 2, 3, 5, 2] + * // d_num_selected_out <-- [5] + * @endcode + * + * @tparam InputIteratorT + * **[inferred]** Random-access input iterator type for reading input + * items \iterator + * + * @tparam OutputIteratorT + * **[inferred]** Random-access output iterator type for writing selected + * items \iterator + * + * @tparam NumSelectedIteratorT + * **[inferred]** Output iterator type for recording the number of items + * selected \iterator + * + * @tparam SelectOp + * **[inferred]** Selection operator type having member + * `bool operator()(const T &a)` + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work + * is done. + * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of `d_temp_storage` allocation + * + * @param[in] d_in + * Pointer to the input sequence of data items + * + * @param[out] d_out + * Pointer to the output sequence of selected data items + * + * @param[out] d_num_selected_out + * Pointer to the output total number of items selected + * (i.e., length of `d_out`) + * + * @param[in] num_items + * Total number of input items (i.e., length of `d_in`) + * + * @param[in] select_op + * Unary selection operator + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. + * Default is stream0. 
+ * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. May cause significant slowdown. + * Default is `false`. + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t + If(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + NumSelectedIteratorT d_num_selected_out, + int num_items, + SelectOp select_op, + cudaStream_t stream = 0, + bool debug_synchronous = false) + { + using OffsetT = int; // Signed integer type for global offsets + using FlagIterator = NullType *; // FlagT iterator type (not used) + using EqualityOp = NullType; // Equality operator (not used) - /** - * \brief Given an input sequence \p d_in having runs of consecutive equal-valued keys, only the first key from each run is selectively copied to \p d_out. The total number of items selected is written to \p d_num_selected_out. ![](unique_logo.png) - * - * \par - * - The == equality operator is used to determine whether keys are equivalent - * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. - * - \devicestorage - * - * \par Performance - * The following charts illustrate saturated select-unique performance across different - * CUDA architectures for \p int32 and \p int64 items, respectively. Segments have - * lengths uniformly sampled from [1,1000]. - * - * \image html select_unique_int32_len_500.png - * \image html select_unique_int64_len_500.png - * - * \par - * The following charts are similar, but with segment lengths uniformly sampled from [1,10]: - * - * \image html select_unique_int32_len_5.png - * \image html select_unique_int64_len_5.png - * - * \par Snippet - * The code snippet below illustrates the compaction of items selected from an \p int device vector. 
- * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] - * int *d_out; // e.g., [ , , , , , , , ] - * int *d_num_selected_out; // e.g., [ ] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items); - * - * // d_out <-- [0, 2, 9, 5, 8] - * // d_num_selected_out <-- [5] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing selected items \iterator - * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename NumSelectedIteratorT> - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Unique( - void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items - NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out) - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - typedef int OffsetT; // Signed integer type for global offsets - typedef NullType* FlagIterator; // FlagT iterator type (not used) - typedef NullType SelectOp; // Selection op (not used) - typedef Equality EqualityOp; // Default == operator + return DispatchSelectIf::Dispatch(d_temp_storage, + temp_storage_bytes, + d_in, + NULL, + d_out, + d_num_selected_out, + select_op, + EqualityOp(), + num_items, + stream, + debug_synchronous); + } - return DispatchSelectIf::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - NULL, - d_out, - d_num_selected_out, - SelectOp(), - EqualityOp(), - num_items, - stream, - debug_synchronous); - } + /** + * @brief Given an input sequence `d_in` having runs of consecutive + * equal-valued keys, only the first key from each run is selectively + * copied to `d_out`. The total number of items selected is written to + * `d_num_selected_out`. ![](unique_logo.png) + * + * @par + * - The `==` equality operator is used to determine whether keys are + * equivalent + * - Copies of the selected items are compacted into `d_out` and maintain + * their original relative ordering. 
+ * - @devicestorage + * + * @par Performance + * The following charts illustrate saturated select-unique performance across different + * CUDA architectures for `int32` and `int64` items, respectively. Segments + * have lengths uniformly sampled from `[1, 1000]`. + * + * @image html select_unique_int32_len_500.png + * @image html select_unique_int64_len_500.png + * + * @par + * The following charts are similar, but with segment lengths uniformly + * sampled from `[1, 10]`: + * + * @image html select_unique_int32_len_5.png + * @image html select_unique_int64_len_5.png + * + * @par Snippet + * The code snippet below illustrates the compaction of items selected from + * an `int` device vector. + * @par + * @code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers + * // for input and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSelect::Unique( + * d_temp_storage, temp_storage_bytes, + * d_in, d_out, d_num_selected_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DeviceSelect::Unique( + * d_temp_storage, temp_storage_bytes, + * d_in, d_out, d_num_selected_out, num_items); + * + * // d_out <-- [0, 2, 9, 5, 8] + * // d_num_selected_out <-- [5] + * @endcode + * + * @tparam InputIteratorT + * **[inferred]** Random-access input iterator type for reading input + * items \iterator + * + * @tparam OutputIteratorT + * **[inferred]** Random-access output iterator type for writing selected + * items \iterator + * + * @tparam NumSelectedIteratorT + * **[inferred]** Output iterator type for recording the number of items + * selected \iterator + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work + * is done. + * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of `d_temp_storage` allocation + * + * @param[in] d_in + * Pointer to the input sequence of data items + * + * @param[out] d_out + * Pointer to the output sequence of selected data items + * + * @param[out] d_num_selected_out + * Pointer to the output total number of items selected + * (i.e., length of `d_out`) + * + * @param[in] num_items + * Total number of input items (i.e., length of `d_in`) + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. + * Default is stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. May cause significant slowdown. + * Default is `false`. 
+ */ + template + CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t + Unique(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + NumSelectedIteratorT d_num_selected_out, + int num_items, + cudaStream_t stream = 0, + bool debug_synchronous = false) + { + using OffsetT = int; // Signed integer type for global offsets + using FlagIterator = NullType *; // FlagT iterator type (not used) + using SelectOp = NullType; // Selection op (not used) + using EqualityOp = Equality; // Default == operator + return DispatchSelectIf::Dispatch(d_temp_storage, + temp_storage_bytes, + d_in, + NULL, + d_out, + d_num_selected_out, + SelectOp(), + EqualityOp(), + num_items, + stream, + debug_synchronous); + } - /** - * \brief Given an input sequence \p d_keys_in and \p d_values_in with runs of key-value pairs with consecutive equal-valued keys, only the first key and its value from each run is selectively copied to \p d_keys_out and \p d_values_out. The total number of items selected is written to \p d_num_selected_out. ![](unique_logo.png) - * - * \par - * - The == equality operator is used to determine whether keys are equivalent - * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the compaction of items selected from an \p int device vector. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 8 - * int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] - * int *d_values_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] - * int *d_keys_out; // e.g., [ , , , , , , , ] - * int *d_values_out; // e.g., [ , , , , , , , ] - * int *d_num_selected_out; // e.g., [ ] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSelect::UniqueByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DeviceSelect::UniqueByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items); - * - * // d_keys_out <-- [0, 2, 9, 5, 8] - * // d_values_out <-- [1, 2, 4, 5, 8] - * // d_num_selected_out <-- [5] - * - * \endcode - * - * \tparam KeyInputIteratorT [inferred] Random-access input iterator type for reading input keys \iterator - * \tparam ValueInputIteratorT [inferred] Random-access input iterator type for reading input values \iterator - * \tparam KeyOutputIteratorT [inferred] Random-access output iterator type for writing selected keys \iterator - * \tparam ValueOutputIteratorT [inferred] Random-access output iterator type for writing selected values \iterator - * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator - */ - template < - typename KeyInputIteratorT, - typename ValueInputIteratorT, - typename KeyOutputIteratorT, - typename ValueOutputIteratorT, - typename NumSelectedIteratorT> - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t UniqueByKey( - void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - KeyInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys - ValueInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of values - KeyOutputIteratorT d_keys_out, ///< [out] Pointer to the output sequence of selected keys - ValueOutputIteratorT d_values_out, ///< [out] Pointer to the output sequence of selected values - NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the total number of items selected (i.e., length of \p d_keys_out or \p d_values_out) - int num_items, ///< [in] Total number of input items (i.e., length of \p d_keys_in or \p d_values_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - using OffsetT = int; - using EqualityOp = Equality; + /** + * @brief Given an input sequence `d_keys_in` and `d_values_in` with runs of + * key-value pairs with consecutive equal-valued keys, only the first + * key and its value from each run is selectively copied to + * `d_keys_out` and `d_values_out`. The total number of items selected + * is written to `d_num_selected_out`. ![](unique_logo.png) + * + * \par + * - The `==` equality operator is used to determine whether keys are + * equivalent + * - Copies of the selected items are compacted into `d_out` and maintain + * their original relative ordering. + * - @devicestorage + * + * @par Snippet + * The code snippet below illustrates the compaction of items selected from + * an `int` device vector. 
+ * @par + * @code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers + * // for input and output + * int num_items; // e.g., 8 + * int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + * int *d_values_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] + * int *d_keys_out; // e.g., [ , , , , , , , ] + * int *d_values_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSelect::UniqueByKey( + * d_temp_storage, temp_storage_bytes, + * d_keys_in, d_values_in, + * d_keys_out, d_values_out, d_num_selected_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DeviceSelect::UniqueByKey( + * d_temp_storage, temp_storage_bytes, + * d_keys_in, d_values_in, + * d_keys_out, d_values_out, d_num_selected_out, num_items); + * + * // d_keys_out <-- [0, 2, 9, 5, 8] + * // d_values_out <-- [1, 2, 4, 5, 8] + * // d_num_selected_out <-- [5] + * @endcode + * + * @tparam KeyInputIteratorT + * **[inferred]** Random-access input iterator type for reading input + * keys \iterator + * + * @tparam ValueInputIteratorT + * **[inferred]** Random-access input iterator type for reading input + * values \iterator + * + * @tparam KeyOutputIteratorT + * **[inferred]** Random-access output iterator type for writing selected + * keys \iterator + * + * @tparam ValueOutputIteratorT + * **[inferred]** Random-access output iterator type for writing selected + * values \iterator + * + * @tparam NumSelectedIteratorT + * **[inferred]** Output iterator type for recording the number of items + * selected \iterator + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work + * is done. 
+ * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of `d_temp_storage` allocation + * + * @param[in] d_keys_in + * Pointer to the input sequence of keys + * + * @param[in] d_values_in + * Pointer to the input sequence of values + * + * @param[out] d_keys_out + * Pointer to the output sequence of selected keys + * + * @param[out] d_values_out + * Pointer to the output sequence of selected values + * + * @param[out] d_num_selected_out + * Pointer to the total number of items selected (i.e., length of + * `d_keys_out` or `d_values_out`) + * + * @param[in] num_items + * Total number of input items (i.e., length of `d_keys_in` or + * `d_values_in`) + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. + * Default is stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. May cause significant slowdown. + * Default is `false`. + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t + UniqueByKey(void *d_temp_storage, + size_t &temp_storage_bytes, + KeyInputIteratorT d_keys_in, + ValueInputIteratorT d_values_in, + KeyOutputIteratorT d_keys_out, + ValueOutputIteratorT d_values_out, + NumSelectedIteratorT d_num_selected_out, + int num_items, + cudaStream_t stream = 0, + bool debug_synchronous = false) + { + using OffsetT = int; + using EqualityOp = Equality; - return DispatchUniqueByKey::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_values_in, - d_keys_out, - d_values_out, - d_num_selected_out, - EqualityOp(), - num_items, - stream, - debug_synchronous); - } + return DispatchUniqueByKey::Dispatch(d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_values_in, + d_keys_out, + d_values_out, + d_num_selected_out, + EqualityOp(), + num_items, + stream, + debug_synchronous); + } }; /** - * \example example_device_select_flagged.cu - * \example example_device_select_if.cu - * \example 
example_device_select_unique.cu + * @example example_device_select_flagged.cu + * @example example_device_select_if.cu + * @example example_device_select_unique.cu */ CUB_NAMESPACE_END From 7896eb4d998163ed1442e20085c212420217d146 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Wed, 1 Jun 2022 21:55:28 +0400 Subject: [PATCH 2/2] In-place select --- cub/device/device_select.cuh | 286 ++++++++++++++++++++- cub/device/dispatch/dispatch_select_if.cuh | 5 +- test/test_device_select_if.cu | 223 ++++++++++++++++ 3 files changed, 511 insertions(+), 3 deletions(-) diff --git a/cub/device/device_select.cuh b/cub/device/device_select.cuh index 4bff04aed2..9c8e889da2 100644 --- a/cub/device/device_select.cuh +++ b/cub/device/device_select.cuh @@ -90,6 +90,9 @@ struct DeviceSelect * `char`, `int`, etc.). * - Copies of the selected items are compacted into `d_out` and maintain * their original relative ordering. + * - The range `[d_out, d_out + *d_num_selected_out)` shall not overlap + * `[d_in, d_in + num_items)`, `[d_flags, d_flags + num_items)` nor + * `d_num_selected_out` in any way. * - @devicestorage * * @par Snippet @@ -215,6 +218,137 @@ struct DeviceSelect stream, debug_synchronous); } + + /** + * @brief Uses the `d_flags` sequence to selectively compact the items in + * `d_data`. The total number of items selected is written to + * `d_num_selected_out`. ![](select_flags_logo.png) + * + * @par + * - The value type of `d_flags` must be castable to `bool` (e.g., `bool`, + * `char`, `int`, etc.). + * - Copies of the selected items are compacted in-place and maintain + * their original relative ordering. + * - The `d_data` may equal `d_flags`. The range + * `[d_data, d_data + num_items)` shall not overlap + * `[d_flags, d_flags + num_items)` in any other way. + * - @devicestorage + * + * @par Snippet + * The code snippet below illustrates the compaction of items selected from + * an `int` device vector. 
+ * @par
+ * @code
+ * #include <cub/cub.cuh> // or equivalently <cub/device/device_select.cuh>
+ *
+ * // Declare, allocate, and initialize device-accessible pointers for input,
+ * // flags, and output
+ * int num_items; // e.g., 8
+ * int *d_data; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
+ * char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0]
+ * int *d_num_selected_out; // e.g., [ ]
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceSelect::Flagged(
+ * d_temp_storage, temp_storage_bytes,
+ * d_data, d_flags, d_num_selected_out, num_items);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run selection
+ * cub::DeviceSelect::Flagged(
+ * d_temp_storage, temp_storage_bytes,
+ * d_data, d_flags, d_num_selected_out, num_items);
+ *
+ * // d_data <-- [1, 4, 6, 7]
+ * // d_num_selected_out <-- [4]
+ *
+ * @endcode
+ *
+ * @tparam IteratorT
+ * **[inferred]** Random-access iterator type for reading and writing
+ * selected items \iterator
+ *
+ * @tparam FlagIterator
+ * **[inferred]** Random-access input iterator type for reading selection
+ * flags \iterator
+ *
+ * @tparam NumSelectedIteratorT
+ * **[inferred]** Output iterator type for recording the number of items
+ * selected \iterator
+ *
+ * @param[in] d_temp_storage
+ * Device-accessible allocation of temporary storage. When `nullptr`, the
+ * required allocation size is written to `temp_storage_bytes` and no work
+ * is done. 
+ * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of `d_temp_storage` allocation + * + * @param[in,out] d_data + * Pointer to the sequence of data items + * + * @param[in] d_flags + * Pointer to the input sequence of selection flags + * + * @param[out] d_num_selected_out + * Pointer to the output total number of items selected + * + * @param[in] num_items + * Total number of input items (i.e., length of `d_data`) + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. + * Default is stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. May cause significant slowdown. + * Default is `false`. + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t + Flagged(void *d_temp_storage, + size_t &temp_storage_bytes, + IteratorT d_data, + FlagIterator d_flags, + NumSelectedIteratorT d_num_selected_out, + int num_items, + cudaStream_t stream = 0, + bool debug_synchronous = false) + { + using OffsetT = int; // Signed integer type for global offsets + using SelectOp = NullType; // Selection op (not used) + using EqualityOp = NullType; // Equality operator (not used) + + constexpr bool may_alias = true; + + return DispatchSelectIf::Dispatch(d_temp_storage, + temp_storage_bytes, + d_data, // in + d_flags, + d_data, // out + d_num_selected_out, + SelectOp(), + EqualityOp(), + num_items, + stream, + debug_synchronous); + } /** * @brief Uses the `select_op` functor to selectively copy items from `d_in` @@ -224,6 +358,8 @@ struct DeviceSelect * @par * - Copies of the selected items are compacted into `d_out` and maintain * their original relative ordering. + * - The range `[d_out, d_out + *d_num_selected_out)` shall not overlap + * `[d_in, d_in + num_items)` nor `d_num_selected_out` in any way. 
* - @devicestorage * * @par Performance @@ -377,6 +513,145 @@ struct DeviceSelect debug_synchronous); } + /** + * @brief Uses the `select_op` functor to selectively compact items in + * `d_data`. The total number of items selected is written to + * `d_num_selected_out`. ![](select_logo.png) + * + * @par + * - Copies of the selected items are compacted in `d_data` and maintain + * their original relative ordering. + * - @devicestorage + * + * @par Snippet + * The code snippet below illustrates the compaction of items selected from + * an `int` device vector. + * @par + * @code + * #include // or equivalently + * + * // Functor type for selecting values less than some criteria + * struct LessThan + * { + * int compare; + * + * CUB_RUNTIME_FUNCTION __forceinline__ + * LessThan(int compare) : compare(compare) {} + * + * CUB_RUNTIME_FUNCTION __forceinline__ + * bool operator()(const int &a) const { + * return (a < compare); + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers + * // for input and output + * int num_items; // e.g., 8 + * int *d_data; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] + * int *d_num_selected_out; // e.g., [ ] + * LessThan select_op(7); + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSelect::If( + * d_temp_storage, temp_storage_bytes, + * d_data, d_num_selected_out, num_items, select_op); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DeviceSelect::If( + * d_temp_storage, temp_storage_bytes, + * d_data, d_num_selected_out, num_items, select_op); + * + * // d_data <-- [0, 2, 3, 5, 2] + * // d_num_selected_out <-- [5] + * @endcode + * + * @tparam IteratorT + * **[inferred]** Random-access input iterator type for reading and + * writing items \iterator + * + * @tparam NumSelectedIteratorT + * **[inferred]** Output iterator type for recording the number of items + * selected \iterator + * + * @tparam SelectOp + * **[inferred]** Selection operator type having member + * `bool operator()(const T &a)` + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work + * is done. + * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of `d_temp_storage` allocation + * + * @param[in,out] d_data + * Pointer to the sequence of data items + * + * @param[out] d_num_selected_out + * Pointer to the output total number of items selected + * + * @param[in] num_items + * Total number of input items (i.e., length of `d_data`) + * + * @param[in] select_op + * Unary selection operator + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. + * Default is stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. May cause significant slowdown. + * Default is `false`. 
+ */ + template + CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t + If(void *d_temp_storage, + size_t &temp_storage_bytes, + IteratorT d_data, + NumSelectedIteratorT d_num_selected_out, + int num_items, + SelectOp select_op, + cudaStream_t stream = 0, + bool debug_synchronous = false) + { + using OffsetT = int; // Signed integer type for global offsets + using FlagIterator = NullType *; // FlagT iterator type (not used) + using EqualityOp = NullType; // Equality operator (not used) + + constexpr bool may_alias = true; + + return DispatchSelectIf::Dispatch(d_temp_storage, + temp_storage_bytes, + d_data, // in + NULL, + d_data, // out + d_num_selected_out, + select_op, + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + /** * @brief Given an input sequence `d_in` having runs of consecutive * equal-valued keys, only the first key from each run is selectively @@ -388,6 +663,8 @@ struct DeviceSelect * equivalent * - Copies of the selected items are compacted into `d_out` and maintain * their original relative ordering. + * - The range `[d_out, d_out + *d_num_selected_out)` shall not overlap + * `[d_in, d_in + num_items)` nor `d_num_selected_out` in any way. * - @devicestorage * * @par Performance @@ -526,11 +803,18 @@ struct DeviceSelect * `d_keys_out` and `d_values_out`. The total number of items selected * is written to `d_num_selected_out`. ![](unique_logo.png) * - * \par + * @par * - The `==` equality operator is used to determine whether keys are * equivalent * - Copies of the selected items are compacted into `d_out` and maintain * their original relative ordering. + * - In-place operations are not supported. 
There must be no overlap between + * any of the provided ranges: + * - `[d_keys_in, d_keys_in + num_items)` + * - `[d_keys_out, d_keys_out + *d_num_selected_out)` + * - `[d_values_in, d_values_in + num_items)` + * - `[d_values_out, d_values_out + *d_num_selected_out)` + * - `[d_num_selected_out, d_num_selected_out + 1)` * - @devicestorage * * @par Snippet diff --git a/cub/device/dispatch/dispatch_select_if.cuh b/cub/device/dispatch/dispatch_select_if.cuh index bc19b1281b..fb949e6305 100644 --- a/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/device/dispatch/dispatch_select_if.cuh @@ -123,7 +123,8 @@ template < typename SelectOpT, ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection) typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection) typename OffsetT, ///< Signed integer type for global offsets - bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output + bool KEEP_REJECTS, ///< Whether or not we push rejected items to the back of the output + bool MayAlias = false> struct DispatchSelectIf { /****************************************************************************** @@ -161,7 +162,7 @@ struct DispatchSelectIf 128, ITEMS_PER_THREAD, BLOCK_LOAD_DIRECT, - LOAD_LDG, + MayAlias ? 
LOAD_CA : LOAD_LDG, BLOCK_SCAN_WARP_SCANS> SelectIfPolicyT; }; diff --git a/test/test_device_select_if.cu b/test/test_device_select_if.cu index 283230aff0..c917bfd45e 100644 --- a/test/test_device_select_if.cu +++ b/test/test_device_select_if.cu @@ -48,6 +48,9 @@ #include #include #include +#include +#include +#include #include "test_util.h" @@ -823,6 +826,222 @@ void TestFlagsAliasingInPartition() CubDebugExit(g_allocator.DeviceFree(d_in)); } +struct Odd +{ + __host__ __device__ bool operator()(int v) const { return v % 2; } +}; + +void TestIfInPlace() +{ + const int num_items = 4 * 1024 * 1024; + const int num_iters = 42; + + thrust::device_vector num_out(1); + thrust::device_vector data(num_items); + thrust::device_vector reference(num_items); + thrust::device_vector reference_out(1); + + thrust::sequence(data.begin(), data.end()); + + Odd op{}; + + int *d_num_out = thrust::raw_pointer_cast(num_out.data()); + int *d_data = thrust::raw_pointer_cast(data.data()); + int *d_reference = thrust::raw_pointer_cast(reference.data()); + int *d_reference_out = thrust::raw_pointer_cast(reference_out.data()); + + void *d_tmp_storage{}; + std::size_t tmp_storage_size{}; + + CubDebugExit( + cub::DeviceSelect::If(d_tmp_storage, + tmp_storage_size, + d_data, + d_num_out, + num_items, + op, + 0, + true)); + + thrust::device_vector tmp_storage(tmp_storage_size); + d_tmp_storage = thrust::raw_pointer_cast(tmp_storage.data()); + + thrust::default_random_engine g{}; + + for (int iter = 0; iter < num_iters; iter++) + { + thrust::shuffle(data.begin(), data.end(), g); + + CubDebugExit( + cub::DeviceSelect::If(d_tmp_storage, + tmp_storage_size, + d_data, + d_reference, + d_reference_out, + num_items, + op, + 0, + true)); + + CubDebugExit( + cub::DeviceSelect::If(d_tmp_storage, + tmp_storage_size, + d_data, + d_num_out, + num_items, + op, + 0, + true)); + + AssertEquals(num_out, reference_out); + const int num_selected = num_out[0]; + + const bool match_reference = 
thrust::equal(reference.begin(), + reference.begin() + num_selected, + data.begin()); + AssertTrue(match_reference); + } +} + +void TestFlaggedInPlace() +{ + const int num_items = 4 * 1024 * 1024; + const int num_iters = 42; + + thrust::device_vector num_out(1); + thrust::device_vector data(num_items); + thrust::device_vector flags(num_items); + + int h_num_out{}; + int *d_num_out = thrust::raw_pointer_cast(num_out.data()); + int *d_data = thrust::raw_pointer_cast(data.data()); + bool *d_flags = thrust::raw_pointer_cast(flags.data()); + + void *d_tmp_storage{}; + std::size_t tmp_storage_size{}; + + CubDebugExit( + cub::DeviceSelect::Flagged(d_tmp_storage, + tmp_storage_size, + d_data, + d_flags, + d_num_out, + num_items, + 0, + true)); + + thrust::device_vector tmp_storage(tmp_storage_size); + d_tmp_storage = thrust::raw_pointer_cast(tmp_storage.data()); + + thrust::default_random_engine g{}; + + for (int iter = 0; iter < num_iters; iter++) + { + const int num_selected = RandomValue(num_items); + + thrust::sequence(data.begin(), data.end()); + thrust::fill(flags.begin(), flags.begin() + num_selected, true); + thrust::fill(flags.begin() + num_selected, flags.end(), false); + thrust::shuffle(flags.begin(), flags.end(), g); + + CubDebugExit( + cub::DeviceSelect::Flagged(d_tmp_storage, + tmp_storage_size, + d_data, + d_flags, + d_num_out, + num_items, + 0, + true)); + + cudaMemcpy(&h_num_out, d_num_out, sizeof(int), cudaMemcpyDeviceToHost); + + AssertEquals(num_selected, h_num_out); + + auto selection_perm_begin = thrust::make_permutation_iterator(flags.begin(), + data.begin()); + auto selection_perm_end = selection_perm_begin + num_selected; + + AssertEquals(num_selected, + thrust::count(selection_perm_begin, selection_perm_end, true)); + } +} + +void TestFlaggedInPlaceWithAliasedFlags() +{ + const int num_items = 1024 * 1024; + const int num_iters = 42; + + thrust::device_vector num_out(1); + thrust::device_vector data(num_items); + thrust::device_vector 
reference(num_items); + thrust::device_vector flags(num_items); + + int h_num_out{}; + int *d_num_out = thrust::raw_pointer_cast(num_out.data()); + int *d_data = thrust::raw_pointer_cast(data.data()); + int *d_flags = d_data; // alias + int *d_allocated_flags = thrust::raw_pointer_cast(data.data()); + int *d_reference = thrust::raw_pointer_cast(reference.data()); + + void *d_tmp_storage{}; + std::size_t tmp_storage_size{}; + + CubDebugExit( + cub::DeviceSelect::Flagged(d_tmp_storage, + tmp_storage_size, + d_data, + d_flags, + d_num_out, + num_items, + 0, + true)); + + thrust::device_vector tmp_storage(tmp_storage_size); + d_tmp_storage = thrust::raw_pointer_cast(tmp_storage.data()); + + thrust::default_random_engine g{}; + + for (int iter = 0; iter < num_iters; iter++) + { + const int num_selected = RandomValue(num_items); + + thrust::sequence(data.begin(), data.begin() + num_selected, 1); + thrust::fill(data.begin() + num_selected, data.end(), 0); + thrust::shuffle(data.begin(), data.end(), g); + + CubDebugExit( + cub::DeviceSelect::Flagged(d_tmp_storage, + tmp_storage_size, + d_data, // in + d_allocated_flags, + d_reference, // out + d_num_out, + num_items, + 0, + true)); + + CubDebugExit( + cub::DeviceSelect::Flagged(d_tmp_storage, + tmp_storage_size, + d_data, + d_flags, + d_num_out, + num_items, + 0, + true)); + + cudaMemcpy(&h_num_out, d_num_out, sizeof(int), cudaMemcpyDeviceToHost); + + AssertEquals(num_selected, h_num_out); + + const bool match_reference = thrust::equal(reference.begin(), + reference.begin() + num_selected, + data.begin()); + AssertTrue(match_reference); + } +} + //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- @@ -863,6 +1082,10 @@ int main(int argc, char** argv) TestFlagsAliasingInPartition(); + TestFlaggedInPlace(); + TestFlaggedInPlaceWithAliasedFlags(); + TestIfInPlace(); + Test(num_items); Test(num_items); Test(num_items);