Unverified Commit 251d3166 authored by Someone Serge

cudaPackages.saxpy: init at unstable-2023-07-11

parent 4df8614c
+2 −0
@@ -71,4 +71,6 @@ in
     cudaFlags
     markForCudatoolkitRootHook
     setupCudaHook;
+
+    saxpy = final.callPackage ./saxpy { };
 }
+12 −0
cmake_minimum_required(VERSION 3.25)
project(saxpy LANGUAGES CXX CUDA)

find_package(CUDAToolkit REQUIRED COMPONENTS cudart cublas)

add_executable(saxpy saxpy.cu)
target_link_libraries(saxpy PUBLIC CUDA::cublas CUDA::cudart m)
target_compile_features(saxpy PRIVATE cxx_std_14)
target_compile_options(saxpy PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)

install(TARGETS saxpy)
+50 −0
{ autoAddOpenGLRunpathHook
, backendStdenv
, cmake
, cuda_cccl
, cuda_cudart
, cudaFlags
, cuda_nvcc
, lib
, libcublas
, setupCudaHook
, stdenv
}:

backendStdenv.mkDerivation {
  pname = "saxpy";
  version = "unstable-2023-07-11";

  src = ./.;

  buildInputs = [
    libcublas
    cuda_cudart
    cuda_cccl
  ];
  nativeBuildInputs = [
    cmake

    # NOTE: this needs to be pkgs.buildPackages.cudaPackages_XX_Y.cuda_nvcc for
    # cross-compilation to work; see the sketch after this derivation. This
    # should work automatically once we move to spliced scopes. Delete this
    # comment once that happens.
    cuda_nvcc

    # Alternatively, we could remove the propagated hook from cuda_nvcc and add
    # directly:
    # setupCudaHook
    autoAddOpenGLRunpathHook
  ];

  cmakeFlags = [
    "-DCMAKE_VERBOSE_MAKEFILE=ON"
    "-DCMAKE_CUDA_ARCHITECTURES=${with cudaFlags; builtins.concatStringsSep ";" (map dropDot cudaCapabilities)}"
  ];

  meta = {
    description = "A simple (Single-precision AX Plus Y) FindCUDAToolkit.cmake example for testing cross-compilation";
    license = lib.licenses.mit;
    maintainers = lib.teams.cuda.members;
    platforms = lib.platforms.unix;
  };
}
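
The NOTE in nativeBuildInputs above can be made concrete. The following is a minimal sketch, not part of this commit, of the manual workaround it describes: taking cuda_nvcc from buildPackages so that the compiler runs on the build platform. cudaPackages_XX_Y is kept as the version placeholder from the comment, and pkgs is assumed to be a nixpkgs instance in scope; a real cross build would also drop the non-spliced cuda_nvcc from the list.

# Hypothetical sketch; the attribute paths mirror the NOTE above and are
# not taken from this commit.
cudaPackages.saxpy.overrideAttrs (oldAttrs: {
  nativeBuildInputs = oldAttrs.nativeBuildInputs ++ [
    # nvcc from buildPackages runs on the build platform while emitting
    # code for the host platform, which is what cross-compilation needs.
    pkgs.buildPackages.cudaPackages_XX_Y.cuda_nvcc
  ];
})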
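
For reference, the cmakeFlags entry above strips the dots from the configured GPU capabilities and joins them with semicolons, the list form CMAKE_CUDA_ARCHITECTURES expects. A small evaluation sketch with stand-ins for cudaFlags.cudaCapabilities and cudaFlags.dropDot (the capability values are hypothetical examples):

let
  # Stand-ins: the real values come from cudaFlags in the cudaPackages scope.
  cudaCapabilities = [ "8.6" "8.9" ]; # hypothetical example values
  dropDot = ver: builtins.replaceStrings [ "." ] [ "" ] ver;
in
builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
# evaluates to "86;89", i.e. -DCMAKE_CUDA_ARCHITECTURES=86;89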
+68 −0
#include <cublas_v2.h>
#include <cuda_runtime.h>

#include <algorithm> // std::max
#include <cmath>     // std::abs
#include <cstdlib>   // std::exit
#include <vector>

#include <stdio.h>

static inline void check(cudaError_t err, const char *context) {
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA error at %s: %s\n", context, cudaGetErrorString(err));
    std::exit(EXIT_FAILURE);
  }
}

#define CHECK(x) check(x, #x)

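// Each thread computes one element: y[i] := a * x[i] + y[i].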
__global__ void saxpy(int n, float a, float *x, float *y) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    y[i] = a * x[i] + y[i];
}

int main(void) {
  setbuf(stderr, NULL);
  fprintf(stderr, "Start\n");

  int rtVersion, driverVersion;
  CHECK(cudaRuntimeGetVersion(&rtVersion));
  CHECK(cudaDriverGetVersion(&driverVersion));

  fprintf(stderr, "Runtime version: %d\n", rtVersion);
  fprintf(stderr, "Driver version: %d\n", driverVersion);

  constexpr int N = 1 << 10;

  std::vector<float> xHost(N), yHost(N);
  for (int i = 0; i < N; i++) {
    xHost[i] = 1.0f;
    yHost[i] = 2.0f;
  }

  fprintf(stderr, "Host memory initialized, copying to the device\n");
  fflush(stderr);

  float *xDevice, *yDevice;
  CHECK(cudaMalloc(&xDevice, N * sizeof(float)));
  CHECK(cudaMalloc(&yDevice, N * sizeof(float)));

  CHECK(cudaMemcpy(xDevice, xHost.data(), N * sizeof(float),
                   cudaMemcpyHostToDevice));
  CHECK(cudaMemcpy(yDevice, yHost.data(), N * sizeof(float),
                   cudaMemcpyHostToDevice));
  fprintf(stderr, "Scheduled a cudaMemcpy, calling the kernel\n");

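  // Launch ceil(N / 256) blocks of 256 threads so every element is covered.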
  saxpy<<<(N + 255) / 256, 256>>>(N, 2.0f, xDevice, yDevice);
  fprintf(stderr, "Scheduled a kernel call\n");
  CHECK(cudaGetLastError());

  CHECK(cudaMemcpy(yHost.data(), yDevice, N * sizeof(float),
                   cudaMemcpyDeviceToHost));

  // x was initialized to 1.0f and y to 2.0f, so after y = 2.0f * x + y every
  // element of yHost should equal 4.0f.
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = std::max(maxError, std::abs(yHost[i] - 4.0f));
  fprintf(stderr, "Max error: %f\n", maxError);

  CHECK(cudaFree(xDevice));
  CHECK(cudaFree(yDevice));
}