Commit a4cc444b authored by Cianciosa, Mark's avatar Cianciosa, Mark Committed by Cianciosa, Mark
Browse files

Renable complex support for hip. This is not tested yet.

parent 47d82851
Loading
Loading
Loading
Loading
+22 −21
Original line number Diff line number Diff line
@@ -113,7 +113,6 @@ namespace gpu {
        hip_context(const size_t index) :
        device(index), result_buffer(0), module(0) {
            check_error(hipSetDevice(device), "hipSetDevice");            
            
            check_error(hipStreamCreate(&stream), "hipStreamCreate");
        }

@@ -175,6 +174,9 @@ namespace gpu {
                std::cout << "  Device name                     : " << device_properties.name << std::endl;
                std::cout << "  Total Global Memory             : " << device_properties.totalGlobalMem << std::endl;
                std::cout << "  Managed Memory                  : " << device_properties.managedMemory << std::endl;
                std::cout << "  Max Threads Per Block           : " << device_properties.maxThreadsPerBlock << std::endl;
                std::cout << "  Max Threads Per Multi Processor : " << device_properties.maxThreadsPerMultiProcessor << std::endl;
                std::cout << "  Warp size                       : " << device_properties.warpSize << std::endl;
            }

            std::array<const char *, 2> options({
@@ -415,8 +417,7 @@ namespace gpu {
            if constexpr (jit::is_complex<T> ()) {
                source_buffer << "#define HIP_DEVICE_CODE" << std::endl;
		source_buffer << "#define M_PI " << M_PI << std::endl;
//  FIXME: Figure out how hip does complex.
//                source_buffer << "#include <cuda/std/complex>" << std::endl;
                source_buffer << "#include <hip/hip_complex.h>" << std::endl;
                source_buffer << "#include <special_functions.hpp>" << std::endl;
            }
        }
@@ -437,7 +438,6 @@ namespace gpu {
                                  graph::output_nodes<T, SAFE_MATH> &outputs,
                                  const size_t size,
                                  jit::register_map &registers) {
//  FIXME: Convert to hip.
            source_buffer << std::endl;
            source_buffer << "extern \"C\" __global__ void " << name << "("
                          << std::endl;
@@ -462,6 +462,7 @@ namespace gpu {

            source_buffer << "    const int index = blockIdx.x*blockDim.x + threadIdx.x;"
                          << std::endl;
            source_buffer << "    printf(%i %i %i %i, index, blockIdx.x, blockDim.x, threadIdx.x);" << std::endl;
            source_buffer << "    if (index < " << size << ") {" << std::endl;

            for (auto &input : inputs) {
@@ -486,7 +487,6 @@ namespace gpu {
                                   graph::output_nodes<T, SAFE_MATH> &outputs,
                                   graph::map_nodes<T, SAFE_MATH> &setters,
                                   jit::register_map &registers) {
//  FIXME: Convert to hip
            for (auto &[out, in] : setters) {
                graph::shared_leaf<T, SAFE_MATH> a = out->compile(source_buffer, registers);
                source_buffer << "        " << jit::to_string('v',  in.get())
@@ -555,8 +555,9 @@ namespace gpu {
            jit::add_type<T> (source_buffer);
            source_buffer << " *result) {" << std::endl;
            source_buffer << "    const unsigned int i = threadIdx.x;" << std::endl;
            source_buffer << "    const unsigned int j = threadIdx.x/32;" << std::endl;
            source_buffer << "    const unsigned int k = threadIdx.x%32;" << std::endl;
            source_buffer << "    const unsigned int j = threadIdx.x/64;" << std::endl;
            source_buffer << "    const unsigned int k = threadIdx.x%64;" << std::endl;
       	    source_buffer << "    printf(%i %i %i %i %i %i, i, j, k, blockIdx.x, blockDim.x, threadIdx.x);" << std::endl;
            source_buffer << "    if (i < " << size << ") {" << std::endl;
            source_buffer << "        " << jit::type_to_string<T> () << " sub_max = ";
            if constexpr (jit::is_complex<T> ()) {
@@ -571,15 +572,15 @@ namespace gpu {
                source_buffer << "            sub_max = max(sub_max, input[index]);" << std::endl;
            }
            source_buffer << "        }" << std::endl;
            source_buffer << "        __shared__ " << jit::type_to_string<T> () << " thread_max[32];" << std::endl;
            source_buffer << "        for (int index = 16; index > 0; index /= 2) {" << std::endl;
            source_buffer << "            sub_max = max(sub_max, __shfl_down_sync(__activemask(), sub_max, index));" << std::endl;
            source_buffer << "        __shared__ " << jit::type_to_string<T> () << " thread_max[64];" << std::endl;
            source_buffer << "        for (int index = 32; index > 0; index /= 2) {" << std::endl;
            source_buffer << "            sub_max = max(sub_max, __shfl_down(sub_max, index));" << std::endl;
            source_buffer << "        }" << std::endl;
            source_buffer << "        thread_max[j] = sub_max;" << std::endl;
            source_buffer << "        __syncthreads();" << std::endl;
            source_buffer << "        if (j == 0) {"  << std::endl;
            source_buffer << "            for (int index = 16; index > 0; index /= 2) {" << std::endl;
            source_buffer << "                thread_max[k] = max(thread_max[k], __shfl_down_sync(__activemask(), thread_max[k], index));" << std::endl;
            source_buffer << "            for (int index = 32; index > 0; index /= 2) {" << std::endl;
            source_buffer << "                thread_max[k] = max(thread_max[k], __shfl_down(thread_max[k], index));" << std::endl;
            source_buffer << "            }" << std::endl;
            source_buffer << "            *result = thread_max[0];" << std::endl;
            source_buffer << "        }"  << std::endl;