Commit eae99a2a authored by David M. Rogers
Browse files

Added test.

parent a5bf765a
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -8,3 +8,6 @@ CXXFLAGS=$cflags \
LDFLAGS=$libs \
  cmake -DCMAKE_HIP_ARCHITECTURES=gfx90a  \
        -DCMAKE_CXX_COMPILER=hipcc ..

# Remind the user how to launch the test once the build has finished.
echo "After running make, run the test with:"
echo "srun -n8 -N1 -A <acct> --gpu-bind closest -c7 -t 2 ./mpitest"
+23 −0
Original line number Diff line number Diff line
@@ -48,6 +48,10 @@ struct CCLH {
        ReduceData *data = new ReduceData(send, recv, count, mpi->comm);
        CUDACHECK( hipLaunchHostFunc(stream, CCLH::doAllReduce, data));
    }

    // Synchronize the host with this object's stream.
    //
    // Calls cudaStreamSynchronize on the `stream` member, which blocks the
    // calling host thread until the work previously submitted to that stream
    // has completed. The result is passed to CUDACHECK.
    // NOTE(review): CUDACHECK is defined outside this view; presumably it
    // reports/aborts on a non-success status -- confirm.
    void wait() {
        CUDACHECK( cudaStreamSynchronize(stream) );
    }
};
using CCLp = std::shared_ptr<CCLH>;

@@ -58,6 +62,13 @@ int cclAllReduce((const void*)send, (void*)recv, size, ncclFloat, ncclSum,
    run(nccl, sendbuff, recvbuff);
    return 0;
}*/
/**
 * Fill the first n entries of x with 1.0.
 *
 * Grid-stride loop over a 1-D launch: each thread starts at its global index
 * and advances by the total number of launched threads, so any 1-D grid/block
 * configuration (including a single thread) covers all n elements.
 *
 * The index and the stride are computed in size_t. blockIdx, blockDim and
 * gridDim components are 32-bit unsigned, so their products would wrap, and a
 * signed int index would overflow, once n approaches or exceeds 2^31.
 *
 * @param n  number of elements to write
 * @param x  array of at least n doubles, writable from device code
 */
__global__ void set_one(size_t n, double *x) {
    // Widen before multiplying so the products cannot wrap in 32 bits.
    const size_t stride = (size_t)blockDim.x * gridDim.x;
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         i < n;
         i += stride) {
        x[i] = 1.0;
    }
}

int main(int argc, char *argv[]) {
    MPIp mpi = std::make_shared<MPIH>(&argc, &argv);
@@ -68,11 +79,23 @@ int main(int argc, char *argv[]) {
    devMalloc(src, count*sizeof(double));
    devMalloc(dst, count*sizeof(double));

    double t0 = MPI_Wtime();
    set_one<<< 32*120, 32, 0, ccl->stream >>>(count, src);
    set_one<<< 32*120, 32, 0, ccl->stream >>>(count, dst);
    if(mpi->rank == 0)
        printf("Allreduce: %d doubles.\n", count);
    double t1 = MPI_Wtime();

    //MPI_Allreduce(src, dst, count, MPI_DOUBLE, MPI_SUM, mpi.comm);
    ccl->Allreduce(src, dst, count);
    double t2 = MPI_Wtime();

    ccl->wait();
    double t3 = MPI_Wtime();
    printf("Answer = %g\nTimes:\n", dst[0]);
    printf("  kernel launch: %g\n", t1-t0);
    printf("  allreduce launch: %g\n", t2-t1);
    printf("  sync: %g\n", t3-t2);

    devFree(src);
    devFree(dst);