Working on optimizations for multigpu (58f0a5b8) · Commits · Berrill, Mark A / RayTrace-miniapp

src/CreateImage.cpp

+18 −22

Original line number	Diff line number	Diff line
		@@ -115,26 +115,11 @@ int run_tests( const std::string& filename, const Options& options )
		#endif
		}

		// Call a dummy CUDA/OpenAcc method to initialize the GPU (for more accurate times)
		static bool cudaInitialized = false;
		if ( !cudaInitialized ) {
		auto index = std::find(methods.begin(),methods.end(),"Cuda-MultiGPU");
		if ( index == methods.end() )
		index = std::find(methods.begin(),methods.end(),"Cuda");
		if ( index == methods.end() )
		index = find(methods.begin(),methods.end(),"OpenAcc");
		if ( index != methods.end() ) {
		auto info = loadInput( filename, 0.1 );
		RayTrace::create_image( info, *index );
		free2( info );
		}
		cudaInitialized = true;
		}

		// Load the image structure
		double image0=NULL, I_ang0=NULL;
		auto info = loadInput( filename, options.scale, &image0, &I_ang0 );
		if ( info == NULL )
		auto info2 = loadInput( filename, 0.1 );
		if ( info == nullptr )
		return -2;

		// Call create_image for each method
		@@ -144,8 +129,15 @@ int run_tests( const std::string& filename, const Options& options )
		for ( size_t i = 0; i < methods.size(); i++ ) {
		if ( rank() == 0 )
		printf( "Running %s\n", methods[i].c_str() );
		int iterations = options.iterations;
		if ( methods[i] == "cpu" && methods.size() > 1 )
		iterations = 1;
		// Initialize the method to reduce runtime variability
		if ( methods[i] != "cpu" )
		RayTrace::create_image( info2, methods[i] );
		// Run the timing tests
		double start = getTime();
		for ( int it = 0; it < options.iterations; it++ ) {
		for ( int it = 0; it < iterations; it++ ) {
		RayTrace::create_image( info, methods[i] );
		double stop = getTime();
		time[i].push_back( stop - start );
		@@ -171,12 +163,12 @@ int run_tests( const std::string& filename, const Options& options )
		double avg = getAvg( time[i] );
		double dev = getDev( time[i] );
		printf( "%14s %7.3f %7.3f %7.3f %7.3f\n", methods[i].c_str(), avg, min, max, dev );
		if ( dev/avg > 0.10 ) {
		printf( " Standard deviation exceeded tolerance (10%%)\n");
		if ( dev/avg > 0.25 ) {
		printf( " Standard deviation exceeded tolerance (25%%)\n");
		N_errors++;
		}
		if ( (max-avg)/avg > 0.15 ) {
		printf( " Maximum runtime exceeded average by more than 15%%\n");
		if ( (max-avg)/avg > 0.50 ) {
		printf( " Maximum runtime exceeded average by more than 50%%\n");
		N_errors++;
		}
		}
		@@ -186,6 +178,7 @@ int run_tests( const std::string& filename, const Options& options )
		free( (void *) image0 );
		free( (void *) I_ang0 );
		free2( info );
		free2( info2 );
		return sumReduce(N_errors);
		}

		@@ -237,6 +230,9 @@ int main( int argc, char *argv[] )
		// Initialize kokkos
		KokkosInitialize( argc, argv );

		// Print hardware stats
		printHardware();

		// Run the tests for all files
		int N_errors = 0;
		for (size_t i=0; i<filenames.size(); i++)

src/CreateImageHelpers.cpp

+86 −5

Original line number	Diff line number	Diff line
		#include "CreateImageHelpers.h"

		#include <math.h>
		#include <thread>


		#if defined( WIN32 ) \|\| defined( _WIN32 ) \|\| defined( WIN64 ) \|\| defined( _WIN64 )
		#ifdef USE_CUDA
		#include <cuda_runtime_api.h>
		#endif

		// Detect the OS
		// clang-format off
		#if defined( WIN32 ) \|\| defined( _WIN32 ) \|\| defined( WIN64 ) \|\| defined( _WIN64 ) \|\| defined( _MSC_VER )
		#define USE_WINDOWS
		#elif defined( __APPLE__ )
		#define USE_MAC
		#elif defined( __linux ) \|\| defined( __linux__ ) \|\| defined( __unix ) \|\| defined( __posix )
		#define USE_LINUX
		#define USE_NM
		#else
		#error Unknown OS
		#endif
		// clang-format on


		// Include system dependent headers
		// clang-format off
		#ifdef USE_WINDOWS
		#include <process.h>
		#include <psapi.h>
		#include <stdio.h>
		#include <tchar.h>
		#include <windows.h>
		#else
		#include <dlfcn.h>
		#include <execinfo.h>
		#include <sched.h>
		#include <sys/time.h>
		#include <ctime>
		#include <unistd.h>
		#endif
		#ifdef USE_LINUX
		#include <malloc.h>
		#endif
		#ifdef USE_MAC
		#include <mach/mach.h>
		#include <sys/sysctl.h>
		#include <sys/types.h>
		#endif
		// clang-format on


		#if defined( USE_WINDOWS )
		#define get_time( x ) QueryPerformanceCounter( x )
		#define get_frequency( f ) QueryPerformanceFrequency( f )
		#define TIME_TYPE LARGE_INTEGER
		@@ -14,7 +60,6 @@ inline double get_diff( TIME_TYPE start, TIME_TYPE end, TIME_TYPE f )
		return ( ( (double) ( end.QuadPart - start.QuadPart ) ) / ( (double) f.QuadPart ) );
		}
		#else
		#include <sys/time.h>
		#define get_time( x ) gettimeofday( x, NULL );
		#define get_frequency( f ) ( *f = timeval() )
		#define TIME_TYPE timeval
		@@ -89,8 +134,8 @@ bool check_ans( const double image0, const double I_ang0, const RayTrace::crea
		const double tol =
		5e-6; // RayTrace uses single precision for some calculations (may need to adjust to 1e-5)
		// bool pass = error[0]<=tol && error[1]<=tol;
		bool pass =
		( norm0[0] - norm1[0] ) / norm0[0] <= tol && ( norm0[1] - norm1[1] ) / norm0[1] <= tol;
		bool pass = fabs( norm0[0] - norm1[0] ) / norm0[0] <= tol &&
		fabs( norm0[1] - norm1[1] ) / norm0[1] <= tol;
		if ( !pass ) {
		std::cerr << " Answers do not match:" << std::endl;
		std::cerr << " image: " << error[0] << " " << norm0[0] << " " << norm1[0] << std::endl;
		@@ -191,3 +236,39 @@ double getDev( const std::vector<double>& x )
		return y;
		}


		// Print info about the hardware
		void printHardware()
		{
		// Get number of threads
		size_t N_threads = std::thread::hardware_concurrency();
		// Get number of gpus
		int N_gpu = 0;
		#ifdef USE_CUDA
		cudaGetDeviceCount( &N_gpu );
		#endif
		// Get system memory
		#if defined( USE_LINUX )
		static size_t pages = sysconf( _SC_PHYS_PAGES );
		static size_t N_bytes = pages * sysconf( _SC_PAGESIZE );
		#elif defined( USE_MAC )
		int mib[2] = { CTL_HW, HW_MEMSIZE };
		u_int namelen = sizeof( mib ) / sizeof( mib[0] );
		uint64_t size;
		size_t len = sizeof( size );
		size_t N_bytes = 0;
		if ( sysctl( mib, namelen, &size, &len, nullptr, 0 ) == 0 )
		N_bytes = size;
		#elif defined( USE_WINDOWS )
		MEMORYSTATUSEX status;
		status.dwLength = sizeof( status );
		GlobalMemoryStatusEx( &status );
		size_t N_bytes = status.ullTotalPhys;
		#else
		#error Unknown OS
		#endif
		// Print the results
		std::cout << "Number of threads: " << N_threads << std::endl;
		std::cout << "Number of gpus: " << N_gpu << std::endl;
		std::cout << "System memory: " << N_bytes/1073741824 << " GB" << std::endl;
		}

src/CreateImageHelpers.h

+4 −0

Original line number	Diff line number	Diff line
		@@ -10,6 +10,10 @@
		#include "RayTrace.h"


		// Function to print info about the hardware that we will use
		void printHardware();


		// Function to call fread, checking that the proper length was read
		void fread2( void ptr, size_t size, size_t count, FILE fid );

src/RayTraceImage.cpp

+26 −18

Original line number	Diff line number	Diff line
		@@ -86,6 +86,26 @@ void setGPU( int id )
		cudaSetDevice(id);
		#endif
		}
		void setDeviceAndRun( int id, int N_threads,
		std::function<void( int )> setID,
		std::function<void( int, const RayTrace::EUV_beam_struct&,
		const RayTrace::ray_gain_struct, const RayTrace::ray_seed_struct,
		int, const std::vector<ray_struct>&, double, double*,
		double*, unsigned int&, std::vector<ray_struct>& )> function,
		int N, const RayTrace::EUV_beam_struct& beam,
		const RayTrace::ray_gain_struct gain, const RayTrace::ray_seed_struct seed,
		int method, const std::vector<ray_struct> &rays, double scale, double **image,
		double **I_ang, unsigned int &failure_code, std::vector<ray_struct> &failed_rays )
		{
		std::vector<ray_struct> rays2;
		rays2.reserve( rays.size() / N_threads + 1 );
		for ( size_t i = id; i < rays.size(); i+=N_threads )
		rays2.emplace_back( rays[i] );
		image = (double ) calloc( beam.nx * beam.ny * beam.nv, sizeof( double ) );
		I_ang = (double ) calloc( beam.na * beam.nb, sizeof( double ) );
		setID( id );
		function( N, beam, gain, seed, method, rays2, scale, image, I_ang, failure_code, failed_rays );
		}
		void RayTraceImageThreadLoop( size_t N_threads,
		std::function<void( int, const RayTrace::EUV_beam_struct&,
		const RayTrace::ray_gain_struct, const RayTrace::ray_seed_struct,
		@@ -97,26 +117,15 @@ void RayTraceImageThreadLoop( size_t N_threads,
		int method, const std::vector<ray_struct> &rays, double scale, double *image,
		double *I_ang, unsigned int &failure_code, std::vector<ray_struct> &failed_rays )
		{
		std::vector<std::vector<ray_struct>> rays2( N_threads );
		std::vector<std::vector<ray_struct>> failed_rays2( N_threads );
		std::vector<unsigned int> failure_code2( N_threads, 0 );
		std::vector<double *> image2( N_threads, NULL );
		std::vector<double *> I_ang2( N_threads, NULL );
		std::vector<double *> image2( N_threads, nullptr );
		std::vector<double *> I_ang2( N_threads, nullptr );
		std::vector<std::thread> threads;
		for ( size_t i = 0, j = 0; i < N_threads; i++ ) {
		size_t N2 = rays.size() / N_threads + 1;
		if ( N2 == 0 ) {
		N2 = 8;
		}
		rays2.reserve( N2 );
		for ( size_t k = 0; k < N2 && j < rays.size(); k++, j++ )
		rays2[i].push_back( rays[j] );
		image2[i] = (double ) calloc( beam.nx beam.ny * beam.nv, sizeof( double ) );
		I_ang2[i] = (double ) calloc( beam.na beam.nb, sizeof( double ) );
		setID( i );
		threads.push_back( std::thread( function, N, std::ref(beam),
		gain, seed, method, std::ref( rays2[i] ), scale, image2[i],
		I_ang2[i], std::ref( failure_code2[i] ), std::ref( failed_rays2[i] ) ) );
		for ( size_t i = 0; i < N_threads; i++ ) {
		threads.push_back( std::thread( setDeviceAndRun, i, N_threads, setID, function,
		N, std::ref(beam), gain, seed, method, std::ref( rays ), scale, &image2[i],
		&I_ang2[i], std::ref( failure_code2[i] ), std::ref( failed_rays2[i] ) ) );
		}
		for ( size_t i = 0; i < N_threads; i++ ) {
		threads[i].join();
		@@ -129,7 +138,6 @@ void RayTraceImageThreadLoop( size_t N_threads,
		failed_rays.push_back( failed_rays2[i][j] );
		free( image2[i] );
		free( I_ang2[i] );
		rays2[i].clear();
		}
		}

src/RayTraceImageCuda.cu

+6 −3

Original line number	Diff line number	Diff line
		@@ -34,7 +34,9 @@


		// Atomic add operation for double
		__device__ double atomicAdd2(double* address, double val)
		#if !defined(__CUDA_ARCH__) \|\| __CUDA_ARCH__ >= 600
		#else
		__device__ double atomicAdd(double* address, double val)
		{
		unsigned long long int* address_as_ull = (unsigned long long int*)address;
		unsigned long long int old = *address_as_ull, assumed;
		@@ -46,6 +48,7 @@ __device__ double atomicAdd2(double* address, double val)
		} while (assumed != old);
		return __longlong_as_double(old);
		}
		#endif


		// Get the globally unique thread id
		@@ -113,14 +116,14 @@ void RayTraceImageCudaKernel( int N, int nx, int ny, int na, int nb, int nv,
		if (i1>=0 && i2>=0){
		double Iv2 = &image[nv(i1+i2*nx)];
		for (int iv=0; iv<nv; iv++)
		atomicAdd2(&Iv2[iv],Iv[iv]*scale);
		atomicAdd(&Iv2[iv],Iv[iv]*scale);
		}
		// Copy I_out into I_ang
		if (i3>=0 && i4>=0) {
		double tmp = 0.0;
		for (int iv=0; iv<nv; iv++)
		tmp += 2.0dv[iv]Iv[iv];
		atomicAdd2(&I_ang[i3+i4*na],tmp);
		atomicAdd(&I_ang[i3+i4*na],tmp);
		}
		}
		}