Loading pkgs/development/compilers/cudatoolkit/extension.nix +2 −0 Original line number Diff line number Diff line Loading @@ -71,4 +71,6 @@ in cudaFlags markForCudatoolkitRootHook setupCudaHook; saxpy = final.callPackage ./saxpy { }; } pkgs/development/compilers/cudatoolkit/saxpy/CMakeLists.txt 0 → 100644 +12 −0 Original line number Diff line number Diff line cmake_minimum_required(VERSION 3.25) project(saxpy LANGUAGES CXX CUDA) find_package(CUDAToolkit REQUIRED COMPONENTS cudart cublas) add_executable(saxpy saxpy.cu) target_link_libraries(saxpy PUBLIC CUDA::cublas CUDA::cudart m) target_compile_features(saxpy PRIVATE cxx_std_14) target_compile_options(saxpy PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: --expt-relaxed-constexpr>) install(TARGETS saxpy) pkgs/development/compilers/cudatoolkit/saxpy/default.nix 0 → 100644 +50 −0 Original line number Diff line number Diff line { autoAddOpenGLRunpathHook , backendStdenv , cmake , cuda_cccl , cuda_cudart , cudaFlags , cuda_nvcc , lib , libcublas , setupCudaHook , stdenv }: backendStdenv.mkDerivation { pname = "saxpy"; version = "unstable-2023-07-11"; src = ./.; buildInputs = [ libcublas cuda_cudart cuda_cccl ]; nativeBuildInputs = [ cmake # NOTE: this needs to be pkgs.buildPackages.cudaPackages_XX_Y.cuda_nvcc for # cross-compilation to work. This should work automatically once we move to # spliced scopes. 
Delete this comment once that happens cuda_nvcc # Alternatively, we could remove the propagated hook from cuda_nvcc and add # directly: # setupCudaHook autoAddOpenGLRunpathHook ]; cmakeFlags = [ "-DCMAKE_VERBOSE_MAKEFILE=ON" "-DCMAKE_CUDA_ARCHITECTURES=${with cudaFlags; builtins.concatStringsSep ";" (map dropDot cudaCapabilities)}" ]; meta = { description = "A simple (Single-precision AX Plus Y) FindCUDAToolkit.cmake example for testing cross-compilation"; license = lib.licenses.mit; maintainers = lib.teams.cuda.members; platforms = lib.platforms.unix; }; } pkgs/development/compilers/cudatoolkit/saxpy/saxpy.cu 0 → 100644 +68 −0 Original line number Diff line number Diff line #include <cublas_v2.h> #include <cuda_runtime.h> #include <vector> #include <stdio.h> static inline void check(cudaError_t err, const char *context) { if (err != cudaSuccess) { fprintf(stderr, "CUDA error at %s: %s\n", context, cudaGetErrorString(err)); std::exit(EXIT_FAILURE); } } #define CHECK(x) check(x, #x) __global__ void saxpy(int n, float a, float *x, float *y) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) y[i] = a * x[i] + y[i]; } int main(void) { setbuf(stderr, NULL); fprintf(stderr, "Start\n"); int rtVersion, driverVersion; CHECK(cudaRuntimeGetVersion(&rtVersion)); CHECK(cudaDriverGetVersion(&driverVersion)); fprintf(stderr, "Runtime version: %d\n", rtVersion); fprintf(stderr, "Driver version: %d\n", driverVersion); constexpr int N = 1 << 10; std::vector<float> xHost(N), yHost(N); for (int i = 0; i < N; i++) { xHost[i] = 1.0f; yHost[i] = 2.0f; } fprintf(stderr, "Host memory initialized, copying to the device\n"); fflush(stderr); float *xDevice, *yDevice; CHECK(cudaMalloc(&xDevice, N * sizeof(float))); CHECK(cudaMalloc(&yDevice, N * sizeof(float))); CHECK(cudaMemcpy(xDevice, xHost.data(), N * sizeof(float), cudaMemcpyHostToDevice)); CHECK(cudaMemcpy(yDevice, yHost.data(), N * sizeof(float), cudaMemcpyHostToDevice)); fprintf(stderr, "Scheduled a cudaMemcpy, calling the 
kernel\n"); saxpy<<<(N + 255) / 256, 256>>>(N, 2.0f, xDevice, yDevice); fprintf(stderr, "Scheduled a kernel call\n"); CHECK(cudaGetLastError()); CHECK(cudaMemcpy(yHost.data(), yDevice, N * sizeof(float), cudaMemcpyDeviceToHost)); float maxError = 0.0f; for (int i = 0; i < N; i++) maxError = max(maxError, abs(yHost[i] - 4.0f)); fprintf(stderr, "Max error: %f\n", maxError); CHECK(cudaFree(xDevice)); CHECK(cudaFree(yDevice)); } Loading
pkgs/development/compilers/cudatoolkit/extension.nix +2 −0 Original line number Diff line number Diff line Loading @@ -71,4 +71,6 @@ in cudaFlags markForCudatoolkitRootHook setupCudaHook; saxpy = final.callPackage ./saxpy { }; }
# pkgs/development/compilers/cudatoolkit/saxpy/CMakeLists.txt
# Build script for the saxpy CUDA smoke-test sample.
cmake_minimum_required(VERSION 3.25)
project(saxpy LANGUAGES CXX CUDA)

# Only the runtime and cuBLAS components are required; cuBLAS is linked to
# exercise FindCUDAToolkit's component discovery, not because saxpy.cu calls it.
find_package(CUDAToolkit REQUIRED COMPONENTS cudart cublas)

add_executable(saxpy saxpy.cu)
target_link_libraries(saxpy PUBLIC CUDA::cublas CUDA::cudart m)
target_compile_features(saxpy PRIVATE cxx_std_14)

# --expt-relaxed-constexpr lets device code call constexpr host functions.
# Apply it only when compiling CUDA translation units. Note: no whitespace is
# allowed after the generator-expression colon — a leading space becomes part
# of the option string handed to nvcc.
target_compile_options(saxpy PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)

install(TARGETS saxpy)
# pkgs/development/compilers/cudatoolkit/saxpy/default.nix
# A minimal FindCUDAToolkit.cmake consumer, packaged to smoke-test the CUDA
# package set (hooks, nvcc, cudart/cuBLAS linkage) including cross-compilation.
{ autoAddOpenGLRunpathHook
, backendStdenv
, cmake
, cuda_cccl
, cuda_cudart
, cudaFlags
, cuda_nvcc
, lib
, libcublas
, setupCudaHook
, stdenv
}:

backendStdenv.mkDerivation {
  pname = "saxpy";
  version = "unstable-2023-07-11";

  src = ./.;

  buildInputs = [
    libcublas
    cuda_cudart
    cuda_cccl
  ];

  nativeBuildInputs = [
    cmake

    # NOTE: this needs to be pkgs.buildPackages.cudaPackages_XX_Y.cuda_nvcc for
    # cross-compilation to work. This should work automatically once we move to
    # spliced scopes. Delete this comment once that happens
    cuda_nvcc

    # Alternatively, we could remove the propagated hook from cuda_nvcc and add
    # directly:
    # setupCudaHook

    autoAddOpenGLRunpathHook
  ];

  cmakeFlags = [
    "-DCMAKE_VERBOSE_MAKEFILE=ON"
    "-DCMAKE_CUDA_ARCHITECTURES=${with cudaFlags; builtins.concatStringsSep ";" (map dropDot cudaCapabilities)}"
  ];

  meta = {
    description = "A simple (Single-precision AX Plus Y) FindCUDAToolkit.cmake example for testing cross-compilation";
    license = lib.licenses.mit;
    maintainers = lib.teams.cuda.members;
    platforms = lib.platforms.unix;
  };
}
// pkgs/development/compilers/cudatoolkit/saxpy/saxpy.cu
// Single-precision A*X Plus Y sample. Its only job is to prove that the CUDA
// toolchain produced a working binary: it initialises data on the host, runs
// one kernel, and verifies the result.
#include <cublas_v2.h>   // unused at runtime; linked to exercise cuBLAS discovery
#include <cuda_runtime.h>

#include <algorithm>  // std::max
#include <cmath>      // std::fabs
#include <cstdio>
#include <cstdlib>    // std::exit, EXIT_FAILURE — was missing; std::exit was used unde­clared
#include <vector>

// Abort with a readable message when a CUDA runtime call fails.
static inline void check(cudaError_t err, const char *context) {
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA error at %s: %s\n", context,
            cudaGetErrorString(err));
    std::exit(EXIT_FAILURE);
  }
}

// Stringify the call site so a failure names the offending API call.
#define CHECK(x) check(x, #x)

// y[i] = a * x[i] + y[i]; one element per thread, tail guarded by `i < n`.
__global__ void saxpy(int n, float a, const float *x, float *y) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    y[i] = a * x[i] + y[i];
}

int main(void) {
  setbuf(stderr, NULL); // unbuffered, so progress lines survive a crash

  fprintf(stderr, "Start\n");

  int rtVersion, driverVersion;
  CHECK(cudaRuntimeGetVersion(&rtVersion));
  CHECK(cudaDriverGetVersion(&driverVersion));
  fprintf(stderr, "Runtime version: %d\n", rtVersion);
  fprintf(stderr, "Driver version: %d\n", driverVersion);

  constexpr int N = 1 << 10;
  std::vector<float> xHost(N), yHost(N);
  for (int i = 0; i < N; i++) {
    xHost[i] = 1.0f;
    yHost[i] = 2.0f;
  }

  fprintf(stderr, "Host memory initialized, copying to the device\n");
  fflush(stderr);

  float *xDevice, *yDevice;
  CHECK(cudaMalloc(&xDevice, N * sizeof(float)));
  CHECK(cudaMalloc(&yDevice, N * sizeof(float)));

  CHECK(cudaMemcpy(xDevice, xHost.data(), N * sizeof(float),
                   cudaMemcpyHostToDevice));
  CHECK(cudaMemcpy(yDevice, yHost.data(), N * sizeof(float),
                   cudaMemcpyHostToDevice));

  fprintf(stderr, "Scheduled a cudaMemcpy, calling the kernel\n");

  saxpy<<<(N + 255) / 256, 256>>>(N, 2.0f, xDevice, yDevice);

  fprintf(stderr, "Scheduled a kernel call\n");
  CHECK(cudaGetLastError()); // surfaces launch-configuration errors

  // The blocking device-to-host copy on the default stream also synchronizes
  // with the kernel, so no explicit cudaDeviceSynchronize is needed here.
  CHECK(cudaMemcpy(yHost.data(), yDevice, N * sizeof(float),
                   cudaMemcpyDeviceToHost));

  // Expected result: 2.0f * 1.0f + 2.0f == 4.0f everywhere. Use the std::
  // float overloads explicitly — unqualified abs() may bind to int abs(int)
  // depending on transitive includes, silently truncating the error.
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = std::max(maxError, std::fabs(yHost[i] - 4.0f));
  fprintf(stderr, "Max error: %f\n", maxError);

  CHECK(cudaFree(xDevice));
  CHECK(cudaFree(yDevice));

  // Fail the process when the device produced wrong results, so that test
  // harnesses catch miscompiles and not only crashes.
  return maxError > 1e-5f ? EXIT_FAILURE : EXIT_SUCCESS;
}