Loading src/CreateImage.cpp +18 −22 Original line number Diff line number Diff line Loading @@ -115,26 +115,11 @@ int run_tests( const std::string& filename, const Options& options ) #endif } // Call a dummy CUDA/OpenAcc method to initialize the GPU (for more accurate times) static bool cudaInitialized = false; if ( !cudaInitialized ) { auto index = std::find(methods.begin(),methods.end(),"Cuda-MultiGPU"); if ( index == methods.end() ) index = std::find(methods.begin(),methods.end(),"Cuda"); if ( index == methods.end() ) index = find(methods.begin(),methods.end(),"OpenAcc"); if ( index != methods.end() ) { auto info = loadInput( filename, 0.1 ); RayTrace::create_image( info, *index ); free2( info ); } cudaInitialized = true; } // Load the image structure double *image0=NULL, *I_ang0=NULL; auto info = loadInput( filename, options.scale, &image0, &I_ang0 ); if ( info == NULL ) auto info2 = loadInput( filename, 0.1 ); if ( info == nullptr ) return -2; // Call create_image for each method Loading @@ -144,8 +129,15 @@ int run_tests( const std::string& filename, const Options& options ) for ( size_t i = 0; i < methods.size(); i++ ) { if ( rank() == 0 ) printf( "Running %s\n", methods[i].c_str() ); int iterations = options.iterations; if ( methods[i] == "cpu" && methods.size() > 1 ) iterations = 1; // Initialize the method to reduce runtime variability if ( methods[i] != "cpu" ) RayTrace::create_image( info2, methods[i] ); // Run the timing tests double start = getTime(); for ( int it = 0; it < options.iterations; it++ ) { for ( int it = 0; it < iterations; it++ ) { RayTrace::create_image( info, methods[i] ); double stop = getTime(); time[i].push_back( stop - start ); Loading @@ -171,12 +163,12 @@ int run_tests( const std::string& filename, const Options& options ) double avg = getAvg( time[i] ); double dev = getDev( time[i] ); printf( "%14s %7.3f %7.3f %7.3f %7.3f\n", methods[i].c_str(), avg, min, max, dev ); if ( dev/avg > 0.10 ) { printf( " Standard deviation exceeded tolerance (10%%)\n"); if ( dev/avg > 0.25 ) { printf( " Standard deviation exceeded tolerance (25%%)\n"); N_errors++; } if ( (max-avg)/avg > 0.15 ) { printf( " Maximum runtime exceeded average by more than 15%%\n"); if ( (max-avg)/avg > 0.50 ) { printf( " Maximum runtime exceeded average by more than 50%%\n"); N_errors++; } } Loading @@ -186,6 +178,7 @@ int run_tests( const std::string& filename, const Options& options ) free( (void *) image0 ); free( (void *) I_ang0 ); free2( info ); free2( info2 ); return sumReduce(N_errors); } Loading Loading @@ -237,6 +230,9 @@ int main( int argc, char *argv[] ) // Initialize kokkos KokkosInitialize( argc, argv ); // Print hardware stats printHardware(); // Run the tests for all files int N_errors = 0; for (size_t i=0; i<filenames.size(); i++) Loading src/CreateImageHelpers.cpp +86 −5 Original line number Diff line number Diff line #include "CreateImageHelpers.h" #include <math.h> #include <thread> #if defined( WIN32 ) || defined( _WIN32 ) || defined( WIN64 ) || defined( _WIN64 ) #ifdef USE_CUDA #include <cuda_runtime_api.h> #endif // Detect the OS // clang-format off #if defined( WIN32 ) || defined( _WIN32 ) || defined( WIN64 ) || defined( _WIN64 ) || defined( _MSC_VER ) #define USE_WINDOWS #elif defined( __APPLE__ ) #define USE_MAC #elif defined( __linux ) || defined( __linux__ ) || defined( __unix ) || defined( __posix ) #define USE_LINUX #define USE_NM #else #error Unknown OS #endif // clang-format on // Include system dependent headers // clang-format off #ifdef USE_WINDOWS #include <process.h> #include <psapi.h> #include <stdio.h> #include <tchar.h> #include <windows.h> #else #include <dlfcn.h> #include <execinfo.h> #include <sched.h> #include <sys/time.h> #include <ctime> #include <unistd.h> #endif #ifdef USE_LINUX #include <malloc.h> #endif #ifdef USE_MAC #include <mach/mach.h> #include <sys/sysctl.h> #include <sys/types.h> #endif // clang-format on #if defined( USE_WINDOWS ) #define get_time( x ) QueryPerformanceCounter( x ) #define get_frequency( f ) QueryPerformanceFrequency( f ) #define TIME_TYPE LARGE_INTEGER Loading @@ -14,7 +60,6 @@ inline double get_diff( TIME_TYPE start, TIME_TYPE end, TIME_TYPE f ) return ( ( (double) ( end.QuadPart - start.QuadPart ) ) / ( (double) f.QuadPart ) ); } #else #include <sys/time.h> #define get_time( x ) gettimeofday( x, NULL ); #define get_frequency( f ) ( *f = timeval() ) #define TIME_TYPE timeval Loading Loading @@ -89,8 +134,8 @@ bool check_ans( const double *image0, const double *I_ang0, const RayTrace::crea const double tol = 5e-6; // RayTrace uses single precision for some calculations (may need to adjust to 1e-5) // bool pass = error[0]<=tol && error[1]<=tol; bool pass = ( norm0[0] - norm1[0] ) / norm0[0] <= tol && ( norm0[1] - norm1[1] ) / norm0[1] <= tol; bool pass = fabs( norm0[0] - norm1[0] ) / norm0[0] <= tol && fabs( norm0[1] - norm1[1] ) / norm0[1] <= tol; if ( !pass ) { std::cerr << " Answers do not match:" << std::endl; std::cerr << " image: " << error[0] << " " << norm0[0] << " " << norm1[0] << std::endl; Loading Loading @@ -191,3 +236,39 @@ double getDev( const std::vector<double>& x ) return y; } // Print info about the hardware void printHardware() { // Get number of threads size_t N_threads = std::thread::hardware_concurrency(); // Get number of gpus int N_gpu = 0; #ifdef USE_CUDA cudaGetDeviceCount( &N_gpu ); #endif // Get system memory #if defined( USE_LINUX ) static size_t pages = sysconf( _SC_PHYS_PAGES ); static size_t N_bytes = pages * sysconf( _SC_PAGESIZE ); #elif defined( USE_MAC ) int mib[2] = { CTL_HW, HW_MEMSIZE }; u_int namelen = sizeof( mib ) / sizeof( mib[0] ); uint64_t size; size_t len = sizeof( size ); size_t N_bytes = 0; if ( sysctl( mib, namelen, &size, &len, nullptr, 0 ) == 0 ) N_bytes = size; #elif defined( USE_WINDOWS ) MEMORYSTATUSEX status; status.dwLength = sizeof( status ); GlobalMemoryStatusEx( &status ); size_t N_bytes = status.ullTotalPhys; #else #error Unknown OS #endif // Print the results std::cout << "Number of threads: " << N_threads << std::endl; std::cout << "Number of gpus: " << N_gpu << std::endl; std::cout << "System memory: " << N_bytes/1073741824 << " GB" << std::endl; } src/CreateImageHelpers.h +4 −0 Original line number Diff line number Diff line Loading @@ -10,6 +10,10 @@ #include "RayTrace.h" // Function to print info about the hardware that we will use void printHardware(); // Function to call fread, checking that the proper length was read void fread2( void *ptr, size_t size, size_t count, FILE *fid ); Loading src/RayTraceImage.cpp +26 −18 Original line number Diff line number Diff line Loading @@ -86,6 +86,26 @@ void setGPU( int id ) cudaSetDevice(id); #endif } void setDeviceAndRun( int id, int N_threads, std::function<void( int )> setID, std::function<void( int, const RayTrace::EUV_beam_struct&, const RayTrace::ray_gain_struct*, const RayTrace::ray_seed_struct*, int, const std::vector<ray_struct>&, double, double*, double*, unsigned int&, std::vector<ray_struct>& )> function, int N, const RayTrace::EUV_beam_struct& beam, const RayTrace::ray_gain_struct *gain, const RayTrace::ray_seed_struct *seed, int method, const std::vector<ray_struct> &rays, double scale, double **image, double **I_ang, unsigned int &failure_code, std::vector<ray_struct> &failed_rays ) { std::vector<ray_struct> rays2; rays2.reserve( rays.size() / N_threads + 1 ); for ( size_t i = id; i < rays.size(); i+=N_threads ) rays2.emplace_back( rays[i] ); *image = (double *) calloc( beam.nx * beam.ny * beam.nv, sizeof( double ) ); *I_ang = (double *) calloc( beam.na * beam.nb, sizeof( double ) ); setID( id ); function( N, beam, gain, seed, method, rays2, scale, *image, *I_ang, failure_code, failed_rays ); } void RayTraceImageThreadLoop( size_t N_threads, std::function<void( int, const RayTrace::EUV_beam_struct&, const RayTrace::ray_gain_struct*, const RayTrace::ray_seed_struct*, Loading @@ -97,26 +117,15 @@ void RayTraceImageThreadLoop( size_t N_threads, int method, const std::vector<ray_struct> &rays, double scale, double *image, double *I_ang, unsigned int &failure_code, std::vector<ray_struct> &failed_rays ) { std::vector<std::vector<ray_struct>> rays2( N_threads ); std::vector<std::vector<ray_struct>> failed_rays2( N_threads ); std::vector<unsigned int> failure_code2( N_threads, 0 ); std::vector<double *> image2( N_threads, NULL ); std::vector<double *> I_ang2( N_threads, NULL ); std::vector<double *> image2( N_threads, nullptr ); std::vector<double *> I_ang2( N_threads, nullptr ); std::vector<std::thread> threads; for ( size_t i = 0, j = 0; i < N_threads; i++ ) { size_t N2 = rays.size() / N_threads + 1; if ( N2 == 0 ) { N2 = 8; } rays2.reserve( N2 ); for ( size_t k = 0; k < N2 && j < rays.size(); k++, j++ ) rays2[i].push_back( rays[j] ); image2[i] = (double *) calloc( beam.nx * beam.ny * beam.nv, sizeof( double ) ); I_ang2[i] = (double *) calloc( beam.na * beam.nb, sizeof( double ) ); setID( i ); threads.push_back( std::thread( function, N, std::ref(beam), gain, seed, method, std::ref( rays2[i] ), scale, image2[i], I_ang2[i], std::ref( failure_code2[i] ), std::ref( failed_rays2[i] ) ) ); for ( size_t i = 0; i < N_threads; i++ ) { threads.push_back( std::thread( setDeviceAndRun, i, N_threads, setID, function, N, std::ref(beam), gain, seed, method, std::ref( rays ), scale, &image2[i], &I_ang2[i], std::ref( failure_code2[i] ), std::ref( failed_rays2[i] ) ) ); } for ( size_t i = 0; i < N_threads; i++ ) { threads[i].join(); Loading @@ -129,7 +138,6 @@ void RayTraceImageThreadLoop( size_t N_threads, failed_rays.push_back( failed_rays2[i][j] ); free( image2[i] ); free( I_ang2[i] ); rays2[i].clear(); } } Loading src/RayTraceImageCuda.cu +6 −3 Original line number Diff line number Diff line Loading @@ -34,7 +34,9 @@ // Atomic add operation for double __device__ double atomicAdd2(double* address, double val) #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 #else __device__ double atomicAdd(double* address, double val) { unsigned long long int* address_as_ull = (unsigned long long int*)address; unsigned long long int old = *address_as_ull, assumed; Loading @@ -46,6 +48,7 @@ __device__ double atomicAdd2(double* address, double val) } while (assumed != old); return __longlong_as_double(old); } #endif // Get the globally unique thread id Loading Loading @@ -113,14 +116,14 @@ void RayTraceImageCudaKernel( int N, int nx, int ny, int na, int nb, int nv, if (i1>=0 && i2>=0){ double *Iv2 = &image[nv*(i1+i2*nx)]; for (int iv=0; iv<nv; iv++) atomicAdd2(&Iv2[iv],Iv[iv]*scale); atomicAdd(&Iv2[iv],Iv[iv]*scale); } // Copy I_out into I_ang if (i3>=0 && i4>=0) { double tmp = 0.0; for (int iv=0; iv<nv; iv++) tmp += 2.0*dv[iv]*Iv[iv]; atomicAdd2(&I_ang[i3+i4*na],tmp); atomicAdd(&I_ang[i3+i4*na],tmp); } } } Loading Loading
src/CreateImage.cpp +18 −22 Original line number Diff line number Diff line Loading @@ -115,26 +115,11 @@ int run_tests( const std::string& filename, const Options& options ) #endif } // Call a dummy CUDA/OpenAcc method to initialize the GPU (for more accurate times) static bool cudaInitialized = false; if ( !cudaInitialized ) { auto index = std::find(methods.begin(),methods.end(),"Cuda-MultiGPU"); if ( index == methods.end() ) index = std::find(methods.begin(),methods.end(),"Cuda"); if ( index == methods.end() ) index = find(methods.begin(),methods.end(),"OpenAcc"); if ( index != methods.end() ) { auto info = loadInput( filename, 0.1 ); RayTrace::create_image( info, *index ); free2( info ); } cudaInitialized = true; } // Load the image structure double *image0=NULL, *I_ang0=NULL; auto info = loadInput( filename, options.scale, &image0, &I_ang0 ); if ( info == NULL ) auto info2 = loadInput( filename, 0.1 ); if ( info == nullptr ) return -2; // Call create_image for each method Loading @@ -144,8 +129,15 @@ int run_tests( const std::string& filename, const Options& options ) for ( size_t i = 0; i < methods.size(); i++ ) { if ( rank() == 0 ) printf( "Running %s\n", methods[i].c_str() ); int iterations = options.iterations; if ( methods[i] == "cpu" && methods.size() > 1 ) iterations = 1; // Initialize the method to reduce runtime variability if ( methods[i] != "cpu" ) RayTrace::create_image( info2, methods[i] ); // Run the timing tests double start = getTime(); for ( int it = 0; it < options.iterations; it++ ) { for ( int it = 0; it < iterations; it++ ) { RayTrace::create_image( info, methods[i] ); double stop = getTime(); time[i].push_back( stop - start ); Loading @@ -171,12 +163,12 @@ int run_tests( const std::string& filename, const Options& options ) double avg = getAvg( time[i] ); double dev = getDev( time[i] ); printf( "%14s %7.3f %7.3f %7.3f %7.3f\n", methods[i].c_str(), avg, min, max, dev ); if ( dev/avg > 0.10 ) { printf( " Standard deviation exceeded tolerance (10%%)\n"); if ( dev/avg > 0.25 ) { printf( " Standard deviation exceeded tolerance (25%%)\n"); N_errors++; } if ( (max-avg)/avg > 0.15 ) { printf( " Maximum runtime exceeded average by more than 15%%\n"); if ( (max-avg)/avg > 0.50 ) { printf( " Maximum runtime exceeded average by more than 50%%\n"); N_errors++; } } Loading @@ -186,6 +178,7 @@ int run_tests( const std::string& filename, const Options& options ) free( (void *) image0 ); free( (void *) I_ang0 ); free2( info ); free2( info2 ); return sumReduce(N_errors); } Loading Loading @@ -237,6 +230,9 @@ int main( int argc, char *argv[] ) // Initialize kokkos KokkosInitialize( argc, argv ); // Print hardware stats printHardware(); // Run the tests for all files int N_errors = 0; for (size_t i=0; i<filenames.size(); i++) Loading
src/CreateImageHelpers.cpp +86 −5 Original line number Diff line number Diff line #include "CreateImageHelpers.h" #include <math.h> #include <thread> #if defined( WIN32 ) || defined( _WIN32 ) || defined( WIN64 ) || defined( _WIN64 ) #ifdef USE_CUDA #include <cuda_runtime_api.h> #endif // Detect the OS // clang-format off #if defined( WIN32 ) || defined( _WIN32 ) || defined( WIN64 ) || defined( _WIN64 ) || defined( _MSC_VER ) #define USE_WINDOWS #elif defined( __APPLE__ ) #define USE_MAC #elif defined( __linux ) || defined( __linux__ ) || defined( __unix ) || defined( __posix ) #define USE_LINUX #define USE_NM #else #error Unknown OS #endif // clang-format on // Include system dependent headers // clang-format off #ifdef USE_WINDOWS #include <process.h> #include <psapi.h> #include <stdio.h> #include <tchar.h> #include <windows.h> #else #include <dlfcn.h> #include <execinfo.h> #include <sched.h> #include <sys/time.h> #include <ctime> #include <unistd.h> #endif #ifdef USE_LINUX #include <malloc.h> #endif #ifdef USE_MAC #include <mach/mach.h> #include <sys/sysctl.h> #include <sys/types.h> #endif // clang-format on #if defined( USE_WINDOWS ) #define get_time( x ) QueryPerformanceCounter( x ) #define get_frequency( f ) QueryPerformanceFrequency( f ) #define TIME_TYPE LARGE_INTEGER Loading @@ -14,7 +60,6 @@ inline double get_diff( TIME_TYPE start, TIME_TYPE end, TIME_TYPE f ) return ( ( (double) ( end.QuadPart - start.QuadPart ) ) / ( (double) f.QuadPart ) ); } #else #include <sys/time.h> #define get_time( x ) gettimeofday( x, NULL ); #define get_frequency( f ) ( *f = timeval() ) #define TIME_TYPE timeval Loading Loading @@ -89,8 +134,8 @@ bool check_ans( const double *image0, const double *I_ang0, const RayTrace::crea const double tol = 5e-6; // RayTrace uses single precision for some calculations (may need to adjust to 1e-5) // bool pass = error[0]<=tol && error[1]<=tol; bool pass = ( norm0[0] - norm1[0] ) / norm0[0] <= tol && ( norm0[1] - norm1[1] ) / norm0[1] <= tol; bool pass = fabs( norm0[0] - norm1[0] ) / norm0[0] <= tol && fabs( norm0[1] - norm1[1] ) / norm0[1] <= tol; if ( !pass ) { std::cerr << " Answers do not match:" << std::endl; std::cerr << " image: " << error[0] << " " << norm0[0] << " " << norm1[0] << std::endl; Loading Loading @@ -191,3 +236,39 @@ double getDev( const std::vector<double>& x ) return y; } // Print info about the hardware void printHardware() { // Get number of threads size_t N_threads = std::thread::hardware_concurrency(); // Get number of gpus int N_gpu = 0; #ifdef USE_CUDA cudaGetDeviceCount( &N_gpu ); #endif // Get system memory #if defined( USE_LINUX ) static size_t pages = sysconf( _SC_PHYS_PAGES ); static size_t N_bytes = pages * sysconf( _SC_PAGESIZE ); #elif defined( USE_MAC ) int mib[2] = { CTL_HW, HW_MEMSIZE }; u_int namelen = sizeof( mib ) / sizeof( mib[0] ); uint64_t size; size_t len = sizeof( size ); size_t N_bytes = 0; if ( sysctl( mib, namelen, &size, &len, nullptr, 0 ) == 0 ) N_bytes = size; #elif defined( USE_WINDOWS ) MEMORYSTATUSEX status; status.dwLength = sizeof( status ); GlobalMemoryStatusEx( &status ); size_t N_bytes = status.ullTotalPhys; #else #error Unknown OS #endif // Print the results std::cout << "Number of threads: " << N_threads << std::endl; std::cout << "Number of gpus: " << N_gpu << std::endl; std::cout << "System memory: " << N_bytes/1073741824 << " GB" << std::endl; }
src/CreateImageHelpers.h +4 −0 Original line number Diff line number Diff line Loading @@ -10,6 +10,10 @@ #include "RayTrace.h" // Function to print info about the hardware that we will use void printHardware(); // Function to call fread, checking that the proper length was read void fread2( void *ptr, size_t size, size_t count, FILE *fid ); Loading
src/RayTraceImage.cpp +26 −18 Original line number Diff line number Diff line Loading @@ -86,6 +86,26 @@ void setGPU( int id ) cudaSetDevice(id); #endif } void setDeviceAndRun( int id, int N_threads, std::function<void( int )> setID, std::function<void( int, const RayTrace::EUV_beam_struct&, const RayTrace::ray_gain_struct*, const RayTrace::ray_seed_struct*, int, const std::vector<ray_struct>&, double, double*, double*, unsigned int&, std::vector<ray_struct>& )> function, int N, const RayTrace::EUV_beam_struct& beam, const RayTrace::ray_gain_struct *gain, const RayTrace::ray_seed_struct *seed, int method, const std::vector<ray_struct> &rays, double scale, double **image, double **I_ang, unsigned int &failure_code, std::vector<ray_struct> &failed_rays ) { std::vector<ray_struct> rays2; rays2.reserve( rays.size() / N_threads + 1 ); for ( size_t i = id; i < rays.size(); i+=N_threads ) rays2.emplace_back( rays[i] ); *image = (double *) calloc( beam.nx * beam.ny * beam.nv, sizeof( double ) ); *I_ang = (double *) calloc( beam.na * beam.nb, sizeof( double ) ); setID( id ); function( N, beam, gain, seed, method, rays2, scale, *image, *I_ang, failure_code, failed_rays ); } void RayTraceImageThreadLoop( size_t N_threads, std::function<void( int, const RayTrace::EUV_beam_struct&, const RayTrace::ray_gain_struct*, const RayTrace::ray_seed_struct*, Loading @@ -97,26 +117,15 @@ void RayTraceImageThreadLoop( size_t N_threads, int method, const std::vector<ray_struct> &rays, double scale, double *image, double *I_ang, unsigned int &failure_code, std::vector<ray_struct> &failed_rays ) { std::vector<std::vector<ray_struct>> rays2( N_threads ); std::vector<std::vector<ray_struct>> failed_rays2( N_threads ); std::vector<unsigned int> failure_code2( N_threads, 0 ); std::vector<double *> image2( N_threads, NULL ); std::vector<double *> I_ang2( N_threads, NULL ); std::vector<double *> image2( N_threads, nullptr ); std::vector<double *> I_ang2( N_threads, nullptr ); std::vector<std::thread> threads; for ( size_t i = 0, j = 0; i < N_threads; i++ ) { size_t N2 = rays.size() / N_threads + 1; if ( N2 == 0 ) { N2 = 8; } rays2.reserve( N2 ); for ( size_t k = 0; k < N2 && j < rays.size(); k++, j++ ) rays2[i].push_back( rays[j] ); image2[i] = (double *) calloc( beam.nx * beam.ny * beam.nv, sizeof( double ) ); I_ang2[i] = (double *) calloc( beam.na * beam.nb, sizeof( double ) ); setID( i ); threads.push_back( std::thread( function, N, std::ref(beam), gain, seed, method, std::ref( rays2[i] ), scale, image2[i], I_ang2[i], std::ref( failure_code2[i] ), std::ref( failed_rays2[i] ) ) ); for ( size_t i = 0; i < N_threads; i++ ) { threads.push_back( std::thread( setDeviceAndRun, i, N_threads, setID, function, N, std::ref(beam), gain, seed, method, std::ref( rays ), scale, &image2[i], &I_ang2[i], std::ref( failure_code2[i] ), std::ref( failed_rays2[i] ) ) ); } for ( size_t i = 0; i < N_threads; i++ ) { threads[i].join(); Loading @@ -129,7 +138,6 @@ void RayTraceImageThreadLoop( size_t N_threads, failed_rays.push_back( failed_rays2[i][j] ); free( image2[i] ); free( I_ang2[i] ); rays2[i].clear(); } } Loading
src/RayTraceImageCuda.cu +6 −3 Original line number Diff line number Diff line Loading @@ -34,7 +34,9 @@ // Atomic add operation for double __device__ double atomicAdd2(double* address, double val) #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 #else __device__ double atomicAdd(double* address, double val) { unsigned long long int* address_as_ull = (unsigned long long int*)address; unsigned long long int old = *address_as_ull, assumed; Loading @@ -46,6 +48,7 @@ __device__ double atomicAdd2(double* address, double val) } while (assumed != old); return __longlong_as_double(old); } #endif // Get the globally unique thread id Loading Loading @@ -113,14 +116,14 @@ void RayTraceImageCudaKernel( int N, int nx, int ny, int na, int nb, int nv, if (i1>=0 && i2>=0){ double *Iv2 = &image[nv*(i1+i2*nx)]; for (int iv=0; iv<nv; iv++) atomicAdd2(&Iv2[iv],Iv[iv]*scale); atomicAdd(&Iv2[iv],Iv[iv]*scale); } // Copy I_out into I_ang if (i3>=0 && i4>=0) { double tmp = 0.0; for (int iv=0; iv<nv; iv++) tmp += 2.0*dv[iv]*Iv[iv]; atomicAdd2(&I_ang[i3+i4*na],tmp); atomicAdd(&I_ang[i3+i4*na],tmp); } } } Loading