build.rc  +3 −0

@@ -8,3 +8,6 @@ CXXFLAGS=$cflags \
 LDFLAGS=$libs \
 cmake -DCMAKE_HIP_ARCHITECTURES=gfx90a \
       -DCMAKE_CXX_COMPILER=hipcc ..
+
+echo After running make, run the test with:
+echo srun -n8 -N1 -A '<acct>' --gpu-bind closest -c7 -t 2 ./mpitest
src/allreduce.cc  +23 −0

@@ -48,6 +48,10 @@ struct CCLH {
     ReduceData *data = new ReduceData(send, recv, count, mpi->comm);
     CUDACHECK( hipLaunchHostFunc(stream, CCLH::doAllReduce, data));
   }
+
+  void wait() {
+    CUDACHECK( cudaStreamSynchronize(stream) );
+  }
 };
 using CCLp = std::shared_ptr<CCLH>;

@@ -58,6 +62,13 @@ int cclAllReduce(
     (const void*)send, (void*)recv, size, ncclFloat, ncclSum,
   run(nccl, sendbuff, recvbuff);
   return 0;
 }*/
+
+__global__ void
+set_one(size_t n, double *x) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x;
+       i < n; i += blockDim.x * gridDim.x) {
+    x[i] = 1.0;
+  }
+}
 
 int main(int argc, char *argv[]) {
   MPIp mpi = std::make_shared<MPIH>(&argc, &argv);

@@ -68,11 +79,23 @@ int main(int argc, char *argv[]) {
   devMalloc(src, count*sizeof(double));
   devMalloc(dst, count*sizeof(double));
+  double t0 = MPI_Wtime();
+  set_one<<< 32*120, 32, 0, ccl->stream >>>(count, src);
+  set_one<<< 32*120, 32, 0, ccl->stream >>>(count, dst);
 
   if(mpi->rank == 0) printf("Allreduce: %d doubles.\n", count);
+  double t1 = MPI_Wtime();
   //MPI_Allreduce(src, dst, count, MPI_DOUBLE, MPI_SUM, mpi.comm);
   ccl->Allreduce(src, dst, count);
+  double t2 = MPI_Wtime();
+  ccl->wait();
+  double t3 = MPI_Wtime();
+
+  printf("Answer = %g\nTimes:\n", dst[0]);
+  printf("  kernel launch: %g\n", t1-t0);
+  printf("  allreduce launch: %g\n", t2-t1);
+  printf("  sync: %g\n", t3-t2);
 
   devFree(src);
   devFree(dst);
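The mechanism this diff relies on is hipLaunchHostFunc: it enqueues a host-side function on the HIP stream, so the MPI reduce runs in stream order, after the set_one kernels finish, while the calling thread returns immediately. The hunks show only the launch site; the sketch below is a guess at how CCLH::doAllReduce and ReduceData plausibly fit together. The struct layout, the callback body, and the enqueueAllReduce wrapper name are assumptions for illustration, not the PR's actual code.

#include <hip/hip_runtime.h>
#include <mpi.h>

// Sketch only: the PR's real ReduceData/doAllReduce are not shown
// in this hunk, so the layout and ownership here are assumptions.
struct ReduceData {
  const void *send; void *recv;
  int count; MPI_Comm comm;
  ReduceData(const void *s, void *r, int c, MPI_Comm cm)
      : send(s), recv(r), count(c), comm(cm) {}
};

// The HIP runtime invokes this on a worker thread once all prior
// work in the stream has completed; later work in the same stream
// does not start until this function returns.
static void doAllReduce(void *arg) {
  ReduceData *d = static_cast<ReduceData*>(arg);
  MPI_Allreduce(d->send, d->recv, d->count, MPI_DOUBLE, MPI_SUM, d->comm);
  delete d;  // callback owns the heap allocation made at launch
}

// Non-blocking for the caller: the reduce is ordered into the stream.
void enqueueAllReduce(hipStream_t stream, const void *send, void *recv,
                      int count, MPI_Comm comm) {
  ReduceData *data = new ReduceData(send, recv, count, comm);
  hipLaunchHostFunc(stream, doAllReduce, data);
}

Two caveats worth noting: since the callback fires on a runtime thread, MPI presumably has to be initialized with MPI_THREAD_MULTIPLE for the MPI_Allreduce call to be legal, and printing dst[0] from the host after wait() only works if devMalloc hands back host-accessible (e.g. managed) memory. The timing breakdown in main then reads naturally: t1-t0 and t2-t1 measure only launch overhead, while t3-t2 (the stream synchronize in wait()) absorbs the kernel and allreduce time.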