Unverified Commit 3641cedb authored by Ana Gainaru's avatar Ana Gainaru Committed by GitHub
Browse files

Merge pull request #3133 from anagainaru/cuda-bug-bp5

Bug BP5 Get for chunks in an ND array
parents 182ab77c e6a8f012
Loading
Loading
Loading
Loading
+11 −18
Original line number Diff line number Diff line
@@ -451,7 +451,8 @@ int NdCopy(const char *in, const Dims &inStart, const Dims &inCount,
#ifdef ADIOS2_HAVE_CUDA
            if (MemSpace == MemorySpace::CUDA)
            {
                helper::CudaMemCopyFromBuffer(outOvlpBase, 0, inOvlpBase,
                helper::NdCopyCUDA(inOvlpBase, outOvlpBase, inOvlpGapSize,
                                   outOvlpGapSize, ovlpCount, minContDim,
                                   blockSize);
                return 0;
            }
@@ -509,6 +510,14 @@ int NdCopy(const char *in, const Dims &inStart, const Dims &inCount,
    // padding
    else
    {
#ifdef ADIOS2_HAVE_CUDA
        if (MemSpace == MemorySpace::CUDA)
        {
            helper::Throw<std::invalid_argument>(
                "Helper", "Memory", "CopyContiguousMemory",
                "Direct byte order reversal not supported for GPU buffers");
        }
#endif
        //        Dims revInCount(inCount);
        //        Dims revOutCount(outCount);
        //
@@ -607,14 +616,6 @@ int NdCopy(const char *in, const Dims &inStart, const Dims &inCount,
        // Same Endian"
        if (inIsLittleEndian == outIsLittleEndian)
        {
#ifdef ADIOS2_HAVE_CUDA
            if (MemSpace == MemorySpace::CUDA)
            {
                helper::CudaMemCopyFromBuffer(outOvlpBase, 0, inOvlpBase,
                                              blockSize);
                return 0;
            }
#endif
            if (!safeMode)
            {
                NdCopyRecurDFNonSeqDynamic(0, inOvlpBase, outOvlpBase,
@@ -632,14 +633,6 @@ int NdCopy(const char *in, const Dims &inStart, const Dims &inCount,
        // different Endian"
        else
        {
#ifdef ADIOS2_HAVE_CUDA
            if (MemSpace == MemorySpace::CUDA)
            {
                helper::Throw<std::invalid_argument>(
                    "Helper", "Memory", "CopyContiguousMemory",
                    "Direct byte order reversal not supported for GPU buffers");
            }
#endif
            if (!safeMode)
            {
                NdCopyRecurDFNonSeqDynamicRevEndian(
+31 −0
Original line number Diff line number Diff line
@@ -98,6 +98,37 @@ void CudaMemCopyFromBuffer(T *GPUbuffer, size_t position, const char *source,
    char *dest = reinterpret_cast<char *>(GPUbuffer);
    MemcpyBufferToGPU(dest, source + position, size);
}

static inline void NdCopyCUDA(const char *&inOvlpBase, char *&outOvlpBase,
                              Dims &inOvlpGapSize, Dims &outOvlpGapSize,
                              Dims &ovlpCount, size_t minContDim,
                              size_t blockSize)
{
    Dims pos(ovlpCount.size(), 0);
    size_t curDim = 0;
    while (true)
    {
        while (curDim != minContDim)
        {
            pos[curDim]++;
            curDim++;
        }
        CudaMemCopyFromBuffer(outOvlpBase, 0, inOvlpBase, blockSize);
        inOvlpBase += blockSize;
        outOvlpBase += blockSize;
        do
        {
            if (curDim == 0)
            {
                return;
            }
            inOvlpBase += inOvlpGapSize[curDim];
            outOvlpBase += outOvlpGapSize[curDim];
            pos[curDim] = 0;
            curDim--;
        } while (pos[curDim] == ovlpCount[curDim]);
    }
}
#endif

template <class T>
+20 −4
Original line number Diff line number Diff line
@@ -164,14 +164,30 @@ gtest_add_tests_helper(StepsInSituLocalArray MPI_ALLOW BP Engine.BP. .FileStream
  WORKING_DIRECTORY ${FS_DIR} EXTRA_ARGS "FileStream"
)

if(ADIOS2_HAVE_CUDA AND ADIOS2_HAVE_BP5)
if(ADIOS2_HAVE_CUDA)
    enable_language(CUDA)
	gtest_add_tests_helper(WriteReadCuda MPI_ALLOW BP Engine.BP. .BP4
	  WORKING_DIRECTORY ${BP4_DIR} EXTRA_ARGS "BP4"
	  )
	gtest_add_tests_helper(SelectionsCuda MPI_ALLOW BP Engine.BP. .BP4
	  WORKING_DIRECTORY ${BP4_DIR} EXTRA_ARGS "BP4"
	  )
	if(ADIOS2_HAVE_BP5)
		gtest_add_tests_helper(WriteReadCuda MPI_ALLOW BP Engine.BP. .BP5
		  WORKING_DIRECTORY ${BP5_DIR} EXTRA_ARGS "BP5"
		  )
		gtest_add_tests_helper(SelectionsCuda MPI_ALLOW BP Engine.BP. .BP5
		  WORKING_DIRECTORY ${BP5_DIR} EXTRA_ARGS "BP5"
		  )
	endif()

  foreach(tgt ${Test.Engine.BP.WriteReadCuda-TARGETS})
    target_sources(${tgt} PRIVATE operations/CudaRoutines.cu)
    target_link_libraries(${tgt} CUDA::cudart)
  endforeach()

  foreach(tgt ${Test.Engine.BP.SelectionsCuda-TARGETS})
    target_sources(${tgt} PRIVATE operations/CudaRoutines.cu)
    target_link_libraries(${tgt} CUDA::cudart)
  endforeach()
endif()
+302 −0
Original line number Diff line number Diff line
/*
 * Distributed under the OSI-approved Apache License, Version 2.0.  See
 * accompanying file Copyright.txt for details.
 */

#include "operations/CudaRoutines.h"

#include <cstdint>
#include <cstring>

#include <array>
#include <limits>
#include <stdexcept>
#include <vector>

#include <adios2.h>
#include <adios2/common/ADIOSTypes.h>

#include <gtest/gtest.h>

std::string engineName; // comes from command line

class ADIOSSelectionCUDATest : public ::testing::Test
{
public:
    ADIOSSelectionCUDATest() = default;
};

void copySelection2D(const double *a, const adios2::Dims &shape,
                     const adios2::Dims &start, const adios2::Dims &count,
                     double *b)
{
    double *bp = b;
    for (size_t x = 0; x < count[0]; ++x)
    {
        for (size_t y = 0; y < count[1]; ++y)
        {
            const size_t aidx = (start[0] + x) * shape[1] + start[1] + y;
            *bp = a[aidx];
            ++bp;
        }
    }
}

bool compareSelection2D(const double *a, const adios2::Dims &shape,
                        const adios2::Dims &start, const adios2::Dims &count,
                        double *b, adios2::Dims &firstNonEqPoint)
{
    std::cout << " compare Block: shape = " << adios2::ToString(shape)
              << " start = " << adios2::ToString(start)
              << " count = " << adios2::ToString(count) << std::endl;
    bool match = true;
    double *bp = b;
    for (size_t x = 0; x < count[0]; ++x)
    {
        size_t aidx = (start[0] + x) * shape[1] + start[1];
        for (size_t y = 0; y < count[1]; ++y)
        {
            if (*bp != a[aidx])
            {
                firstNonEqPoint = {x, y};
                std::cout << "   Non-match at pos = "
                          << adios2::ToString(firstNonEqPoint)
                          << " : a = " << a[aidx] << ", b = " << *bp
                          << std::endl;
                match = false;
            }
            ++bp;
            ++aidx;
        }
    }
    return match;
}

bool compareSelection2D_F(const double *a, const adios2::Dims &shape,
                          const adios2::Dims &start, const adios2::Dims &count,
                          double *b, adios2::Dims &firstNonEqPoint)
{
    std::cout << " compare Block: shape = " << adios2::ToString(shape)
              << " start = " << adios2::ToString(start)
              << " count = " << adios2::ToString(count) << std::endl;
    bool match = true;
    double *bp = b;
    for (size_t y = 0; y < count[1]; ++y)
    {
        size_t aidx = (start[1] + y) * shape[0] + start[0];
        for (size_t x = 0; x < count[0]; ++x)
        {
            if (*bp != a[aidx])
            {
                firstNonEqPoint = {y, x};
                std::cout << "   Non-match at pos = "
                          << adios2::ToString(firstNonEqPoint)
                          << " : a = " << a[aidx] << ", b = " << *bp
                          << std::endl;
                match = false;
            }
            ++bp;
            ++aidx;
        }
    }
    return match;
}

TEST_F(ADIOSSelectionCUDATest, 2D)
{
    constexpr size_t C1 = 5;
    constexpr size_t C2 = 4;
    constexpr size_t DIM1 = 3 * C1;
    constexpr size_t DIM2 = 3 * C2;
    const std::string filename = "ADIOSSelectionCuda2D.bp";
    double a[DIM1 * DIM2];

    double *ap = a;
    for (size_t x = 1; x <= DIM1; ++x)
    {
        for (size_t y = 1; y <= DIM2; ++y)
        {
            *ap = x * 1.0 + y / 100.0;
            ++ap;
        }
    }
    /* ./bin/bpls -la testing/adios2/engine/bp/bp4/ADIOSSelection2D.bp
        -d a -n 12 -f "%6.3f"
    */

    // Write test data using BP
    {
        adios2::ADIOS adios;
        adios2::IO ioWrite = adios.DeclareIO("TestIOWrite");
        if (!engineName.empty())
        {
            ioWrite.SetEngine(engineName);
        }
        else
        {
            // Create the BP Engine
            ioWrite.SetEngine("BPFile");
        }

        std::cout << "Write data as CUDA buffer..." << std::endl;

        adios2::Engine engine = ioWrite.Open(filename, adios2::Mode::Write);
        const adios2::Dims shape = {DIM1, DIM2};
        const adios2::Dims count = {C1, C2};
        adios2::Dims start{0, 0};

        double b[C1 * C2];

        adios2::Variable<double> var =
            ioWrite.DefineVariable<double>("a", shape, start, count);

        /*adios2::Variable<double> vara1 =
            ioWrite.DefineVariable<double>("a1", shape, start, shape);*/

        engine.BeginStep();

        double *gpuSimData;
        cudaMalloc(&gpuSimData, C1 * C2 * sizeof(double));
        for (size_t x = 0; x < DIM1; x += count[0])
        {
            for (size_t y = 0; y < DIM2; y += count[1])
            {
                start = {x, y};
                copySelection2D(a, shape, start, count, b);
                var.SetSelection({start, count});
                cudaMemcpy(gpuSimData, b, C1 * C2 * sizeof(double),
                           cudaMemcpyHostToDevice);
                var.SetMemorySpace(adios2::MemorySpace::CUDA);
                engine.Put(var, gpuSimData, adios2::Mode::Sync);
            }
        }

        engine.EndStep();
        engine.Close();
    }

    // Read data
    {
        adios2::ADIOS adios;
        adios2::IO ioRead = adios.DeclareIO("TestIORead");
        ioRead.SetEngine("File");
        adios2::Engine engine = ioRead.Open(filename, adios2::Mode::Read);
        EXPECT_TRUE(engine);
        engine.BeginStep();
        adios2::Variable<double> var = ioRead.InquireVariable<double>("a");
        EXPECT_TRUE(var);
        const adios2::Dims shape = {DIM1, DIM2};
        const adios2::Dims count = {C1, C2};
        adios2::Dims s{0, 0};
        adios2::Dims c = shape;
        adios2::Dims firstNonMatch{0, 0};

        std::cout << "CUDA read selections with entire blocks..." << std::endl;

        /* Entire array */
        {
            std::vector<double> res(DIM1 * DIM2);
            double *gpuRet;
            cudaMalloc(&gpuRet, DIM1 * DIM2 * sizeof(double));
            var.SetSelection({s, c});
            var.SetMemorySpace(adios2::MemorySpace::CUDA);
            engine.Get<double>(var, gpuRet, adios2::Mode::Sync);
            cudaMemcpy(res.data(), gpuRet, DIM1 * DIM2 * sizeof(double),
                       cudaMemcpyDeviceToHost);
            EXPECT_EQ(res.size(), DIM1 * DIM2);
            EXPECT_TRUE(
                compareSelection2D(a, shape, s, c, res.data(), firstNonMatch));
        }

        /* Single block in the center */
        {
            s = {5, 4};
            c = count;
            std::vector<double> res(c[0] * c[1]);
            double *gpuRet;
            cudaMalloc(&gpuRet, c[0] * c[1] * sizeof(double));
            var.SetSelection({s, c});
            var.SetMemorySpace(adios2::MemorySpace::CUDA);
            engine.Get<double>(var, gpuRet, adios2::Mode::Sync);
            cudaMemcpy(res.data(), gpuRet, c[0] * c[1] * sizeof(double),
                       cudaMemcpyDeviceToHost);
            EXPECT_EQ(res.size(), c[0] * c[1]);
            EXPECT_TRUE(
                compareSelection2D(a, shape, s, c, res.data(), firstNonMatch));
        }

        /* Four blocks in X-Y direction */
        {
            s = {5, 4};
            c = {2 * count[0], 2 * count[1]};
            std::vector<double> res(c[0] * c[1]);
            double *gpuRet;
            cudaMalloc(&gpuRet, c[0] * c[1] * sizeof(double));
            var.SetSelection({s, c});
            var.SetMemorySpace(adios2::MemorySpace::CUDA);
            engine.Get<double>(var, gpuRet, adios2::Mode::Sync);
            cudaMemcpy(res.data(), gpuRet, c[0] * c[1] * sizeof(double),
                       cudaMemcpyDeviceToHost);
            EXPECT_EQ(res.size(), c[0] * c[1]);
            EXPECT_TRUE(
                compareSelection2D(a, shape, s, c, res.data(), firstNonMatch));
        }

        /*
         *   Partial blocks
         */

        std::cout << "CUDA read selections with partial blocks..." << std::endl;

        /* center part of single block in center */
        {
            s = {6, 5};
            c = {count[0] - 2, count[1] - 2};
            std::vector<double> res(c[0] * c[1]);
            double *gpuRet;
            cudaMalloc(&gpuRet, c[0] * c[1] * sizeof(double));
            var.SetSelection({s, c});
            var.SetMemorySpace(adios2::MemorySpace::CUDA);
            engine.Get<double>(var, gpuRet, adios2::Mode::Sync);
            cudaMemcpy(res.data(), gpuRet, c[0] * c[1] * sizeof(double),
                       cudaMemcpyDeviceToHost);
            EXPECT_EQ(res.size(), c[0] * c[1]);
            EXPECT_TRUE(
                compareSelection2D(a, shape, s, c, res.data(), firstNonMatch));
        }

        /* Center block plus 1 in each direction, cutting into all blocks */
        /* ./bin/bpls -la testing/adios2/engine/bp/bp4/ADIOSSelection2D.bp
            -d a  -f "%6.3f " -s "4,3" -c "7,6" -n 6
        */
        {
            s = {4, 3};
            c = {count[0] + 2, count[1] + 2};
            std::vector<double> res(c[0] * c[1]);
            double *gpuRet;
            cudaMalloc(&gpuRet, c[0] * c[1] * sizeof(double));
            var.SetSelection({s, c});
            var.SetMemorySpace(adios2::MemorySpace::CUDA);
            engine.Get<double>(var, gpuRet, adios2::Mode::Sync);
            cudaMemcpy(res.data(), gpuRet, c[0] * c[1] * sizeof(double),
                       cudaMemcpyDeviceToHost);
            EXPECT_EQ(res.size(), c[0] * c[1]);
            EXPECT_TRUE(
                compareSelection2D(a, shape, s, c, res.data(), firstNonMatch));
        }

        engine.EndStep();
        engine.Close();
    }
}

int main(int argc, char **argv)
{
    ::testing::InitGoogleTest(&argc, argv);
    if (argc > 1)
    {
        engineName = std::string(argv[1]);
    }
    int result = RUN_ALL_TESTS();
    return result;
}