[PoC]: Implement cuda::experimental::vector #1869

Closed · wants to merge 4 commits

6 changes: 4 additions & 2 deletions cudax/cmake/cudaxBuildCompilerTargets.cmake
@@ -11,9 +11,9 @@
 # be linked into the developer build targets, as they include both
 # cudax.compiler_interface and cccl.compiler_interface_cppXX.

-find_package(libcudacxx ${cudax_VERSION} EXACT CONFIG REQUIRED
+find_package(Thrust ${cudax_VERSION} EXACT CONFIG REQUIRED
   NO_DEFAULT_PATH # Only check the explicit path in HINTS:
-  HINTS "${CCCL_SOURCE_DIR}/lib/cmake/libcudacxx/"
+  HINTS "${CCCL_SOURCE_DIR}/lib/cmake/thrust/"
 )

 function(cudax_build_compiler_targets)
@@ -73,6 +73,8 @@ function(cudax_build_compiler_targets)
       cccl.compiler_interface_cpp${dialect}
       cudax.compiler_interface
       libcudacxx::libcudacxx
+      CUB::CUB
+      Thrust::Thrust
     )
   endforeach()

4 changes: 4 additions & 0 deletions cudax/examples/CMakeLists.txt
@@ -25,6 +25,10 @@ function(cudax_add_example target_name_var example_src cudax_target)
     ${cudax_target}
     cudax.examples.thrust
   )
+  target_compile_options(${example_target} PRIVATE
+    "-DLIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE"
+  )
+
   cudax_clone_target_properties(${example_target} ${cudax_target})
   target_include_directories(${example_target} PRIVATE "${CUB_SOURCE_DIR}/examples")

1 change: 0 additions & 1 deletion cudax/examples/simple_p2p.cu
@@ -30,7 +30,6 @@
  * Unified Virtual Address Space (UVA) features.
  */

-#define LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE
 #include <cuda/memory_resource>

 #include <cuda/experimental/algorithm.cuh>
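For context outside this diff: the define removed here is the opt-in for libcu++'s experimental memory resources, which the CMake change above now injects as `-DLIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE` for all cudax examples. A minimal sketch of what a translation unit built without that CMake setup would still need (only the macro and header names come from this PR; the snippet itself is illustrative):

// Sketch only: opt in to the experimental memory resources before the include,
// mirroring what the removed line did and what the new CMake compile option now does.
#define LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE
#include <cuda/memory_resource>
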
90 changes: 90 additions & 0 deletions cudax/examples/vector_add_modern.cu
@@ -0,0 +1,90 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

/**
 * Vector addition: C = A + B.
 *
 * This is a very basic sample that implements element-by-element vector
 * addition. It is the same as the sample illustrating Chapter 2 of the
 * programming guide, with some additions like error checking.
 */

#include <thrust/generate.h>
#include <thrust/random.h>
#include <thrust/transform.h>

#include <cuda/experimental/container.cuh>
#include <cuda/experimental/memory_resource.cuh>
#include <cuda/experimental/stream.cuh>

#include <iostream>

namespace cudax = cuda::experimental;

constexpr int numElements = 50000;

struct generator
{
  thrust::default_random_engine gen{};
  thrust::uniform_real_distribution<float> dist{-10.0f, 10.0f};

  __host__ __device__ generator(const unsigned seed)
      : gen{seed}
  {}

  __host__ __device__ float operator()() noexcept
  {
    return dist(gen);
  }
};

int main()
{
  // A CUDA stream on which to execute the vector addition kernel
  cudax::stream stream{};

  // The execution policy we want to use to run all work on the same stream
  auto policy = thrust::cuda::par_nosync.on(stream.get());

  // An environment we use to pass all necessary information to the containers
  cudax::env_t<cuda::mr::device_accessible> env{cudax::device_memory_resource{}, stream};

  // Allocate the two inputs and the output; `cudax::uninit` skips zero-initialization
  cudax::async_device_vector<float> A{env, numElements, cudax::uninit};
  cudax::async_device_vector<float> B{env, numElements, cudax::uninit};
  cudax::async_device_vector<float> C{env, numElements, cudax::uninit};

  // Fill both input vectors on the stream using a random number generator
  thrust::generate(policy, A.begin(), A.end(), generator{42});
  thrust::generate(policy, B.begin(), B.end(), generator{1337});

  // Add the vectors together
  thrust::transform(policy, A.begin(), A.end(), B.begin(), C.begin(), cuda::std::plus<>{});

  // Verify that the result vector is correct by copying it to the host
  cudax::env_t<cuda::mr::host_accessible> host_env{cudax::pinned_memory_resource{}, stream};
  cudax::async_host_vector<float> h_A{host_env, A};
  cudax::async_host_vector<float> h_B{host_env, B};
  cudax::async_host_vector<float> h_C{host_env, C};

  // Do not forget to synchronize the stream before reading the results on the host
  stream.wait();

  for (int i = 0; i < numElements; ++i)
  {
    if (cuda::std::abs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
    {
      std::cerr << "Result verification failed at element " << i << "\n";
      exit(EXIT_FAILURE);
    }
  }

  return 0;
}
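
A side note on the verification step in the example above: the pinned host copies exist only to check `C`. Below is a hedged sketch of how the same check could stay on the device using standard Thrust facilities, reusing `policy`, `A`, `B`, and `C` from the example; the helper names (`sum_matches`, `result_is_correct`) are illustrative and not part of this PR.

#include <thrust/functional.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/transform_reduce.h>
#include <thrust/tuple.h>

#include <cuda/std/cmath>

// Predicate over one (a, b, c) triple: does a + b match the computed c within tolerance?
struct sum_matches
{
  __host__ __device__ bool operator()(const thrust::tuple<float, float, float>& t) const
  {
    return cuda::std::abs(thrust::get<0>(t) + thrust::get<1>(t) - thrust::get<2>(t)) <= 1e-5f;
  }
};

// Reduce the predicate over all (A[i], B[i], C[i]) triples on the stream's execution policy.
template <class Policy, class Iterator>
bool result_is_correct(Policy&& policy, Iterator a_first, Iterator a_last, Iterator b_first, Iterator c_first)
{
  const auto n     = a_last - a_first;
  const auto first = thrust::make_zip_iterator(thrust::make_tuple(a_first, b_first, c_first));
  return thrust::transform_reduce(policy, first, first + n, sum_matches{}, true, thrust::logical_and<bool>{});
}

Called as `result_is_correct(policy, A.begin(), A.end(), B.begin(), C.begin())`, this avoids the three host allocations at the cost of a device-side reduction; `stream.wait()` would still be needed before any other host-side access to the data.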