Commit 869e7ab8 authored by Cianciosa, Mark's avatar Cianciosa, Mark
Browse files

Fix cuda errors that the refactoring creating.

parent f1347bea
Loading
Loading
Loading
Loading
+17 −20
Original line number Diff line number Diff line
@@ -179,8 +179,9 @@ namespace gpu {
            check_error(cuDeviceGetName(device_name, 100, device), "cuDeviceGetName");
            std::cout << "  Device name              : " << device_name << std::endl;

            const std::string temp = arch.str();
            std::array<const char *, 3> options({
                arch.str().c_str(),
                temp.c_str(),
                "--std=c++17",
                "--include-path=" CUDA_INCLUDE
            });
@@ -195,6 +196,7 @@ namespace gpu {
                                  "nvrtcGetProgramLog");
                std::cout << log << std::endl;
                free(log);
                std::cout << kernel_source << std::endl;
            }

            const char *mangled_kernel_name;
@@ -321,7 +323,6 @@ namespace gpu {
///
///  @params[in] index Number of times to record.
//------------------------------------------------------------------------------
        template<typename T>
        void print_results(const size_t index) {
            wait();
            for (CUdeviceptr &buffer : buffers) {
@@ -336,7 +337,6 @@ namespace gpu {
///  @params[in]     source_index Index of the GPU buffer.
///  @params[in,out] destination  Host side buffer to copy to.
//------------------------------------------------------------------------------
        template<typename T>
        void copy_buffer(const size_t source_index,
                         T *destination) {
	    size_t size;
@@ -391,7 +391,7 @@ namespace gpu {
            }
            source_buffer << ") {" << std::endl;

            source_buffer << "    const index = blockIdx.x*blockDim.x + threadIdx.x;"
            source_buffer << "    const int index = blockIdx.x*blockDim.x + threadIdx.x;"
                          << std::endl;
            source_buffer << "    if (index < " << size << ") {" << std::endl;

@@ -454,25 +454,22 @@ namespace gpu {
            source_buffer << "    const unsigned int j = threadIdx.x/32;" << std::endl;
            source_buffer << "    const unsigned int k = threadIdx.x%32;" << std::endl;
            source_buffer << "    if (i < " << size << ") {" << std::endl;
            source_buffer << "        "
            jit::add_type<T> (source_buffer);
            source_buffer << " sub_max = input[i];" << std::endl;
            source_buffer << "        " << jit::type_to_string<T> () << " sub_max = ";
            if constexpr (jit::is_complex<T> ()) {
                source_buffer << "abs(input[i]);" << std::endl;
            } else {
                source_buffer << "input[i];" << std::endl;
            }
            source_buffer << "        for (size_t index = i + 1024; index < " << size <<"; index += 1024) {" << std::endl;
            if constexpr (jit::is_complex<T> ()) {
                source_buffer << "            sub_max = max(abs(sub_max), abs(input[index]));" << std::endl;
                source_buffer << "            sub_max = max(sub_max, abs(input[index]));" << std::endl;
            } else {
                source_buffer << "            sub_max = max(sub_max, input[index]);" << std::endl;
            }
            source_buffer << "        }" << std::endl;
            source_buffer << "        __shared__ ";
            jit::add_type<T> (source_buffer);
            source_buffer << " thread_max[32];" << std::endl;
            source_buffer << "        __shared__ " << jit::type_to_string<T> () << " thread_max[32];" << std::endl;
            source_buffer << "        for (int index = 16; index > 0; index /= 2) {" << std::endl;
            if constexpr (jit::is_complex<T> ()) {
                source_buffer << "            sub_max = max(abs(sub_max), abs(__shfl_down_sync(__activemask(), sub_max, index)));" << std::endl;
            } else {
            source_buffer << "            sub_max = max(sub_max, __shfl_down_sync(__activemask(), sub_max, index));" << std::endl;
            }
            source_buffer << "        }" << std::endl;
            source_buffer << "        thread_max[j] = sub_max;" << std::endl;
            source_buffer << "        __syncthreads();" << std::endl;