This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Clear CUDA global error state after CUDA API calls #1020

Merged (2 commits) on Oct 15, 2019
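A minimal sketch (not part of this change set) of the failure mode the PR addresses: a failing CUDA runtime call such as an over-sized `cudaMalloc` records its error code in the global error state, and a later `cudaGetLastError()` check after an unrelated kernel launch then misattributes that stale error to the launch. The fix applied throughout CUB and Thrust below is to call `cudaGetLastError()` right after the failing call so the state is consumed where the error actually occurred.

```cuda
// Standalone illustration (assumed, not taken from this PR) of a "leaked"
// CUDA error being misattributed to an unrelated kernel launch.
#include <cstddef>
#include <cstdio>

#include <cuda_runtime.h>

__global__ void unrelated_kernel() {}

int main()
{
  void *p = 0;

  // Deliberately request more memory than any device has; the call fails and
  // records its error code in the CUDA runtime's error state.
  cudaError_t status = cudaMalloc(&p, static_cast<std::size_t>(-1));

  if (status != cudaSuccess)
  {
    // The pattern this PR applies after failing CUDA API calls: consume the
    // stale error so it cannot surface later. Remove this line and the
    // launch below appears to have failed.
    cudaGetLastError();
  }

  unrelated_kernel<<<1, 1>>>();
  std::printf("launch status: %s\n", cudaGetErrorString(cudaGetLastError()));
  return 0;
}
```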
1 change: 1 addition & 0 deletions doc/branching.md
@@ -13,6 +13,7 @@ versions don't directly map to any CUDA Toolkit version.

| Thrust version | CUDA version  |
| -------------- | ------------- |
| 1.9.6          | 10.1 Update 2 |
| 1.9.5          | 10.1 Update 1 |
| 1.9.4          | 10.1          |
| 1.9.3          | 10.0          |
18 changes: 18 additions & 0 deletions doc/changelog.md
@@ -1,3 +1,21 @@
# Thrust v1.9.6 (CUDA 10.1 Update 2) #

## Summary

Thrust v1.9.6 is a minor release accompanying the CUDA 10.1 Update 2 release.

## Bug Fixes

- NVBug 2509847 Inconsistent alignment of `thrust::complex`
- NVBug 2586774 Compilation failure with Clang + older libstdc++ that doesn't
have `std::is_trivially_copyable`
- NVBug 200488234 CUDA header files contain Unicode characters, which leads to
  compilation errors on Windows
- #949, #973, NVBug 2422333, NVBug 2522259, NVBug 2528822 `thrust::detail::aligned_reinterpret_cast`
  must be annotated with `__host__ __device__`
- NVBug 2599629 Missing include in the OpenMP sort implementation
- NVBug 200513211 Truncation warning in test code under VC142

# Thrust v1.9.5 (CUDA 10.1 Update 1)

## Summary
24 changes: 24 additions & 0 deletions testing/out_of_memory_recovery.cu
@@ -0,0 +1,24 @@
// Regression test for NVBug 2720132.

#include <unittest/unittest.h>
#include <thrust/device_vector.h>
#include <thrust/detail/cstdint.h>

struct non_trivial
{
__host__ __device__ non_trivial() {}
__host__ __device__ ~non_trivial() {}
};

void test_out_of_memory_recovery()
{
  try
  {
    // Keep one vector of non-trivially-destructible elements alive so that
    // unwinding after the allocation failure below must launch a kernel to
    // run destructors; a leaked CUDA error state would surface there.
    thrust::device_vector<non_trivial> x(1);

    // Double the request until an allocation fails and throws.
    for (thrust::detail::uint64_t n = 1 ;; n <<= 1)
      thrust::device_vector<thrust::detail::uint32_t> y(n);
  }
  catch (...) { }
}
DECLARE_UNITTEST(test_out_of_memory_recovery);
thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh
@@ -1,7 +1,7 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
*
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -177,13 +177,13 @@ public:
res_desc.res.linear.desc = channel_desc;
res_desc.res.linear.sizeInBytes = bytes;
tex_desc.readMode = cudaReadModeElementType;
return cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL);
return CubDebug(cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL));
}

/// Unbind this iterator from its texture reference
cudaError_t UnbindTexture()
{
return cudaDestroyTextureObject(tex_obj);
return CubDebug(cudaDestroyTextureObject(tex_obj));
}

/// Postfix increment
2 changes: 1 addition & 1 deletion thrust/system/cuda/detail/cub/util_allocator.cuh
@@ -406,7 +406,7 @@ struct CachingDeviceAllocator
// in use by the device, only consider cached blocks that are
// either (from the active stream) or (from an idle stream)
if ((active_stream == block_itr->associated_stream) ||
(cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady))
(CubDebug(cudaEventQuery(block_itr->ready_event)) != cudaErrorNotReady))
{
// Reuse existing cache block. Insert into live blocks.
found = true;
11 changes: 9 additions & 2 deletions thrust/system/cuda/detail/cub/util_debug.cuh
@@ -1,7 +1,7 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
*
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -72,6 +72,13 @@ __host__ __device__ __forceinline__ cudaError_t Debug(
{
(void)filename;
(void)line;

#ifdef CUB_RUNTIME_ENABLED
// Clear the global CUDA error state which may have been set by the last
// call. Otherwise, errors may "leak" to unrelated kernel launches.
cudaGetLastError();
#endif

#ifdef CUB_STDERR
if (error)
{
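With the `Debug` change above, every `CubDebug(...)`-wrapped runtime call now clears the global error state as a side effect of being checked. A self-contained analogue of that wrapper pattern, using hypothetical names (`debug_checked` and `DEBUG_CHECKED` are illustrations, not CUB's API):

```cuda
// Hypothetical analogue of CUB's Debug()/CubDebug(): report a failing runtime
// call, then consume the error state so later checks start from a clean slate.
#include <cstddef>
#include <cstdio>

#include <cuda_runtime.h>

static cudaError_t debug_checked(cudaError_t error, const char *file, int line)
{
  // Mirrors the cudaGetLastError() added to Debug() above: drop whatever the
  // call that produced `error` left in the global error state.
  cudaGetLastError();

  if (error != cudaSuccess)
    std::fprintf(stderr, "CUDA error %d [%s:%d]: %s\n",
                 static_cast<int>(error), file, line, cudaGetErrorString(error));
  return error;
}

#define DEBUG_CHECKED(e) debug_checked((e), __FILE__, __LINE__)

int main()
{
  void *p = 0;

  // The failure is reported and cleared in one place...
  DEBUG_CHECKED(cudaMalloc(&p, static_cast<std::size_t>(-1)));

  // ...so an unrelated follow-up check is not polluted by it.
  std::printf("state afterwards: %s\n", cudaGetErrorString(cudaGetLastError()));
  return 0;
}
```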
15 changes: 6 additions & 9 deletions thrust/system/cuda/detail/cub/util_device.cuh
@@ -1,7 +1,7 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
*
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -199,11 +199,11 @@ CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t SyncStream(cudaStream_t stream)
{
#if (CUB_PTX_ARCH == 0)
return cudaStreamSynchronize(stream);
return CubDebug(cudaStreamSynchronize(stream));
#else
(void)stream;
// Device can't yet sync on a specific stream
return cudaDeviceSynchronize();
return CubDebug(cudaDeviceSynchronize());
#endif
}

@@ -255,15 +255,12 @@ cudaError_t MaxSmOccupancy(

// CUDA API calls not supported from this device
return CubDebug(cudaErrorInvalidConfiguration);

#else

return cudaOccupancyMaxActiveBlocksPerMultiprocessor (
return CubDebug(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_sm_occupancy,
kernel_ptr,
block_threads,
dynamic_smem_bytes);

dynamic_smem_bytes));
#endif // CUB_RUNTIME_ENABLED
}

4 changes: 2 additions & 2 deletions thrust/system/cuda/detail/malloc_and_free.h
@@ -62,9 +62,9 @@ void *malloc(execution_policy<DerivedPolicy> &, std::size_t n)

if(status != cudaSuccess)
{
// cuda_cub::throw_on_error(status, "device malloc failed");
cudaGetLastError(); // Clear global CUDA error state.
thrust::system::detail::bad_alloc(thrust::cuda_category().message(status).c_str());
}
}
#else
result = thrust::raw_pointer_cast(thrust::malloc(thrust::seq, n));
#endif
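With `malloc` now clearing the error state before signalling failure, a caller can treat device out-of-memory as a recoverable condition. A hedged user-level sketch, assuming the failed allocation surfaces as a catchable exception (the same assumption the new `out_of_memory_recovery` test relies on):

```cuda
// User-level sketch (not part of the PR): back off after a failed device
// allocation and keep using Thrust, relying on the error state being clean.
#include <cstddef>
#include <cstdio>

#include <thrust/device_vector.h>

int main()
{
  std::size_t n = std::size_t(1) << 40;  // deliberately too large for most GPUs

  while (n > 0)
  {
    try
    {
      thrust::device_vector<int> v(n);
      std::printf("allocated %zu elements\n", n);
      break;
    }
    catch (...)  // the same catch-all the new regression test uses
    {
      n /= 2;    // shrink the request and retry
    }
  }

  return 0;
}
```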
31 changes: 1 addition & 30 deletions thrust/system/cuda/detail/par.h
@@ -29,6 +29,7 @@
#include <thrust/detail/config.h>
#include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
#include <thrust/system/cuda/detail/execution_policy.h>
#include <thrust/system/cuda/detail/util.h>

#include <thrust/detail/allocator_aware_execution_policy.h>

@@ -40,36 +41,6 @@
THRUST_BEGIN_NS
namespace cuda_cub {

inline __host__ __device__
cudaStream_t
default_stream()
{
return cudaStreamLegacy;
}

template <class Derived>
__host__ __device__
cudaStream_t
get_stream(execution_policy<Derived> &)
{
return default_stream();
}

__thrust_exec_check_disable__
template <class Derived>
__host__ __device__
cudaError_t
synchronize_stream(execution_policy<Derived> &)
{
#if __THRUST_HAS_CUDART__
cudaDeviceSynchronize();
return cudaGetLastError();
#else
return cudaSuccess;
#endif
}


template <class Derived>
struct execute_on_stream_base : execution_policy<Derived>
{