build.rc  +3 −0

@@ -8,3 +8,6 @@ CXXFLAGS=$cflags \
 LDFLAGS=$libs \
 cmake -DCMAKE_HIP_ARCHITECTURES=gfx90a \
       -DCMAKE_CXX_COMPILER=hipcc ..
+
+echo After running make, run the test with:
+echo srun -n8 -N1 -A '<acct>' --gpu-bind closest -c7 -t 2 ./mpitest
src/allreduce.cc  +23 −0

@@ -48,6 +48,10 @@ struct CCLH {
     ReduceData *data = new ReduceData(send, recv, count, mpi->comm);
     CUDACHECK( hipLaunchHostFunc(stream, CCLH::doAllReduce, data));
   }
+
+  void wait() {
+    CUDACHECK( cudaStreamSynchronize(stream) );
+  }
 };
 using CCLp = std::shared_ptr<CCLH>;

@@ -58,6 +62,13 @@ int cclAllReduce(
     (const void*)send, (void*)recv, size, ncclFloat, ncclSum,
   run(nccl, sendbuff, recvbuff);
   return 0;
 }*/
+
+__global__ void
+set_one(size_t n, double *x) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x;
+       i < n; i += blockDim.x * gridDim.x) {
+    x[i] = 1.0;
+  }
+}
 
 int main(int argc, char *argv[]) {
   MPIp mpi = std::make_shared<MPIH>(&argc, &argv);

@@ -68,11 +79,23 @@ int main(int argc, char *argv[]) {
   devMalloc(src, count*sizeof(double));
   devMalloc(dst, count*sizeof(double));
+  double t0 = MPI_Wtime();
+  set_one<<< 32*120, 32, 0, ccl->stream >>>(count, src);
+  set_one<<< 32*120, 32, 0, ccl->stream >>>(count, dst);
 
   if(mpi->rank == 0) printf("Allreduce: %d doubles.\n", count);
+  double t1 = MPI_Wtime();
   //MPI_Allreduce(src, dst, count, MPI_DOUBLE, MPI_SUM, mpi.comm);
   ccl->Allreduce(src, dst, count);
+  double t2 = MPI_Wtime();
+  ccl->wait();
+  double t3 = MPI_Wtime();
+
+  printf("Answer = %g\nTimes:\n", dst[0]);
+  printf("  kernel launch: %g\n", t1-t0);
+  printf("  allreduce launch: %g\n", t2-t1);
+  printf("  sync: %g\n", t3-t2);
 
   devFree(src);
   devFree(dst);
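The mechanism this diff relies on is hipLaunchHostFunc: it enqueues a host-side function on the HIP stream, so the MPI reduce runs in stream order, after the set_one kernels finish, while the calling thread returns immediately. The hunks show only the launch site; the sketch below is a guess at how CCLH::doAllReduce and ReduceData plausibly fit together. The struct layout, the callback body, and the enqueueAllReduce wrapper name are assumptions for illustration, not the PR's actual code.

#include <hip/hip_runtime.h>
#include <mpi.h>

// Sketch only: the PR's real ReduceData/doAllReduce are not shown
// in this hunk, so the layout and ownership here are assumptions.
struct ReduceData {
  const void *send; void *recv;
  int count; MPI_Comm comm;
  ReduceData(const void *s, void *r, int c, MPI_Comm cm)
      : send(s), recv(r), count(c), comm(cm) {}
};

// The HIP runtime invokes this on a worker thread once all prior
// work in the stream has completed; later work in the same stream
// does not start until this function returns.
static void doAllReduce(void *arg) {
  ReduceData *d = static_cast<ReduceData*>(arg);
  MPI_Allreduce(d->send, d->recv, d->count, MPI_DOUBLE, MPI_SUM, d->comm);
  delete d;  // callback owns the heap allocation made at launch
}

// Non-blocking for the caller: the reduce is ordered into the stream.
void enqueueAllReduce(hipStream_t stream, const void *send, void *recv,
                      int count, MPI_Comm comm) {
  ReduceData *data = new ReduceData(send, recv, count, comm);
  hipLaunchHostFunc(stream, doAllReduce, data);
}

Two caveats worth noting: since the callback fires on a runtime thread, MPI presumably has to be initialized with MPI_THREAD_MULTIPLE for the MPI_Allreduce call to be legal, and printing dst[0] from the host after wait() only works if devMalloc hands back host-accessible (e.g. managed) memory. The timing breakdown in main then reads naturally: t1-t0 and t2-t1 measure only launch overhead, while t3-t2 (the stream synchronize in wait()) absorbs the kernel and allreduce time.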