Commit 58f0a5b8 authored by Berrill, Mark's avatar Berrill, Mark
Browse files

Working on optimizations for multigpu

parent 4d8c5766
Loading
Loading
Loading
Loading
+18 −22
Original line number Diff line number Diff line
@@ -115,26 +115,11 @@ int run_tests( const std::string& filename, const Options& options )
#endif
    }

    // Call a dummy CUDA/OpenAcc method to initialize the GPU (for more accurate times)
    static bool cudaInitialized = false;
    if ( !cudaInitialized ) {
        auto index = std::find(methods.begin(),methods.end(),"Cuda-MultiGPU");
        if ( index == methods.end() )
            index = std::find(methods.begin(),methods.end(),"Cuda");
        if ( index == methods.end() )
            index = find(methods.begin(),methods.end(),"OpenAcc");
        if ( index != methods.end() ) {
            auto info = loadInput( filename, 0.1 );
            RayTrace::create_image( info, *index );
            free2( info );
        }
        cudaInitialized = true;
    }

    // Load the image structure
    double *image0=NULL, *I_ang0=NULL;
    auto info = loadInput( filename, options.scale, &image0, &I_ang0 );
    if ( info == NULL )
    auto info2 = loadInput( filename, 0.1 );
    if ( info == nullptr )
        return -2;

    // Call create_image for each method
@@ -144,8 +129,15 @@ int run_tests( const std::string& filename, const Options& options )
    for ( size_t i = 0; i < methods.size(); i++ ) {
        if ( rank() == 0 )
            printf( "Running %s\n", methods[i].c_str() );
        int iterations = options.iterations;
        if ( methods[i] == "cpu" && methods.size() > 1 )
            iterations = 1;
        // Initialize the method to reduce runtime variability
        if ( methods[i] != "cpu" )
            RayTrace::create_image( info2, methods[i] );
        // Run the timing tests
        double start = getTime();
        for ( int it = 0; it < options.iterations; it++ ) {
        for ( int it = 0; it < iterations; it++ ) {
            RayTrace::create_image( info, methods[i] );
            double stop = getTime();
            time[i].push_back( stop - start );
@@ -171,12 +163,12 @@ int run_tests( const std::string& filename, const Options& options )
            double avg = getAvg( time[i] );
            double dev = getDev( time[i] );
            printf( "%14s %7.3f %7.3f %7.3f %7.3f\n", methods[i].c_str(), avg, min, max, dev );
            if ( dev/avg > 0.10 ) {
                printf( "   Standard deviation exceeded tolerance (10%%)\n");
            if ( dev/avg > 0.25 ) {
                printf( "   Standard deviation exceeded tolerance (25%%)\n");
                N_errors++;
            }
            if ( (max-avg)/avg > 0.15 ) {
                printf( "   Maximum runtime exceeded average by more than 15%%\n");
            if ( (max-avg)/avg > 0.50 ) {
                printf( "   Maximum runtime exceeded average by more than 50%%\n");
                N_errors++;
            }
        }
@@ -186,6 +178,7 @@ int run_tests( const std::string& filename, const Options& options )
    free( (void *) image0 );
    free( (void *) I_ang0 );
    free2( info );
    free2( info2 );
    return sumReduce(N_errors);
}

@@ -237,6 +230,9 @@ int main( int argc, char *argv[] )
    // Initialize kokkos
    KokkosInitialize( argc, argv );

    // Print hardware stats
    printHardware();

    // Run the tests for all files
    int N_errors = 0;
    for (size_t i=0; i<filenames.size(); i++)
+86 −5
Original line number Diff line number Diff line
#include "CreateImageHelpers.h"

#include <math.h>
#include <thread>


#if defined( WIN32 ) || defined( _WIN32 ) || defined( WIN64 ) || defined( _WIN64 )
#ifdef USE_CUDA
#include <cuda_runtime_api.h>
#endif

// Detect the OS
// clang-format off
#if defined( WIN32 ) || defined( _WIN32 ) || defined( WIN64 ) || defined( _WIN64 ) || defined( _MSC_VER )
    #define USE_WINDOWS
#elif defined( __APPLE__ )
    #define USE_MAC
#elif defined( __linux ) || defined( __linux__ ) || defined( __unix ) || defined( __posix )
    #define USE_LINUX
    #define USE_NM
#else
    #error Unknown OS
#endif
// clang-format on


// Include system dependent headers
// clang-format off
#ifdef USE_WINDOWS
    #include <process.h>
    #include <psapi.h>
    #include <stdio.h>
    #include <tchar.h>
    #include <windows.h>
#else
    #include <dlfcn.h>
    #include <execinfo.h>
    #include <sched.h>
    #include <sys/time.h>
    #include <ctime>
    #include <unistd.h>
#endif
#ifdef USE_LINUX
    #include <malloc.h>
#endif
#ifdef USE_MAC
    #include <mach/mach.h>
    #include <sys/sysctl.h>
    #include <sys/types.h>
#endif
// clang-format on


#if defined( USE_WINDOWS )
#define get_time( x ) QueryPerformanceCounter( x )
#define get_frequency( f ) QueryPerformanceFrequency( f )
#define TIME_TYPE LARGE_INTEGER
@@ -14,7 +60,6 @@ inline double get_diff( TIME_TYPE start, TIME_TYPE end, TIME_TYPE f )
    return ( ( (double) ( end.QuadPart - start.QuadPart ) ) / ( (double) f.QuadPart ) );
}
#else
#include <sys/time.h>
#define get_time( x ) gettimeofday( x, NULL );
#define get_frequency( f ) ( *f = timeval() )
#define TIME_TYPE timeval
@@ -89,8 +134,8 @@ bool check_ans( const double *image0, const double *I_ang0, const RayTrace::crea
    const double tol =
        5e-6; // RayTrace uses single precision for some calculations (may need to adjust to 1e-5)
    // bool pass = error[0]<=tol && error[1]<=tol;
    bool pass =
        ( norm0[0] - norm1[0] ) / norm0[0] <= tol && ( norm0[1] - norm1[1] ) / norm0[1] <= tol;
    bool pass = fabs( norm0[0] - norm1[0] ) / norm0[0] <= tol &&
                fabs( norm0[1] - norm1[1] ) / norm0[1] <= tol;
    if ( !pass ) {
        std::cerr << "  Answers do not match:" << std::endl;
        std::cerr << "    image: " << error[0] << " " << norm0[0] << " " << norm1[0] << std::endl;
@@ -191,3 +236,39 @@ double getDev( const std::vector<double>& x )
    return y;
}


// Print info about the hardware
void printHardware()
{
    // Get number of threads
    size_t N_threads = std::thread::hardware_concurrency();
    // Get number of gpus
    int N_gpu = 0;
#ifdef USE_CUDA
    cudaGetDeviceCount( &N_gpu );
#endif
    // Get system memory
#if defined( USE_LINUX )
    static size_t pages = sysconf( _SC_PHYS_PAGES );
    static size_t N_bytes    = pages * sysconf( _SC_PAGESIZE );
#elif defined( USE_MAC )
    int mib[2]    = { CTL_HW, HW_MEMSIZE };
    u_int namelen = sizeof( mib ) / sizeof( mib[0] );
    uint64_t size;
    size_t len = sizeof( size );
    size_t N_bytes = 0;
    if ( sysctl( mib, namelen, &size, &len, nullptr, 0 ) == 0 )
        N_bytes = size;
#elif defined( USE_WINDOWS )
    MEMORYSTATUSEX status;
    status.dwLength = sizeof( status );
    GlobalMemoryStatusEx( &status );
    size_t N_bytes = status.ullTotalPhys;
#else
    #error Unknown OS
#endif
    // Print the results
    std::cout << "Number of threads: " << N_threads << std::endl;
    std::cout << "Number of gpus: " << N_gpu << std::endl;
    std::cout << "System memory: " << N_bytes/1073741824 << " GB" << std::endl;
}
+4 −0
Original line number Diff line number Diff line
@@ -10,6 +10,10 @@
#include "RayTrace.h"


// Function to print info about the hardware that we will use
void printHardware();


// Function to call fread, checking that the proper length was read
void fread2( void *ptr, size_t size, size_t count, FILE *fid );

+26 −18
Original line number Diff line number Diff line
@@ -86,6 +86,26 @@ void setGPU( int id )
    cudaSetDevice(id);
#endif
}
void setDeviceAndRun( int id, int N_threads,
    std::function<void( int )> setID,
    std::function<void( int, const RayTrace::EUV_beam_struct&,
            const RayTrace::ray_gain_struct*, const RayTrace::ray_seed_struct*,
            int, const std::vector<ray_struct>&, double, double*,
            double*, unsigned int&, std::vector<ray_struct>& )> function,
    int N, const RayTrace::EUV_beam_struct& beam,
    const RayTrace::ray_gain_struct *gain, const RayTrace::ray_seed_struct *seed,
    int method, const std::vector<ray_struct> &rays, double scale, double **image,
    double **I_ang, unsigned int &failure_code, std::vector<ray_struct> &failed_rays )
{
    std::vector<ray_struct> rays2;
    rays2.reserve( rays.size() / N_threads + 1 );
    for ( size_t i = id; i < rays.size(); i+=N_threads )
        rays2.emplace_back( rays[i] );
    *image = (double *) calloc( beam.nx * beam.ny * beam.nv, sizeof( double ) );
    *I_ang = (double *) calloc( beam.na * beam.nb, sizeof( double ) );
    setID( id );
    function( N, beam, gain, seed, method, rays2, scale, *image, *I_ang, failure_code, failed_rays );
}
void RayTraceImageThreadLoop( size_t N_threads, 
    std::function<void( int, const RayTrace::EUV_beam_struct&,
        const RayTrace::ray_gain_struct*, const RayTrace::ray_seed_struct*,
@@ -97,26 +117,15 @@ void RayTraceImageThreadLoop( size_t N_threads,
    int method, const std::vector<ray_struct> &rays, double scale, double *image,
    double *I_ang, unsigned int &failure_code, std::vector<ray_struct> &failed_rays )
{
    std::vector<std::vector<ray_struct>> rays2( N_threads );
    std::vector<std::vector<ray_struct>> failed_rays2( N_threads );
    std::vector<unsigned int> failure_code2( N_threads, 0 );
    std::vector<double *> image2( N_threads, NULL );
    std::vector<double *> I_ang2( N_threads, NULL );
    std::vector<double *> image2( N_threads, nullptr );
    std::vector<double *> I_ang2( N_threads, nullptr );
    std::vector<std::thread> threads;
    for ( size_t i = 0, j = 0; i < N_threads; i++ ) {
        size_t N2 = rays.size() / N_threads + 1;
        if ( N2 == 0 ) {
            N2 = 8;
        }
        rays2.reserve( N2 );
        for ( size_t k = 0; k < N2 && j < rays.size(); k++, j++ )
            rays2[i].push_back( rays[j] );
        image2[i] = (double *) calloc( beam.nx * beam.ny * beam.nv, sizeof( double ) );
        I_ang2[i] = (double *) calloc( beam.na * beam.nb, sizeof( double ) );
        setID( i );
        threads.push_back( std::thread( function, N, std::ref(beam),
            gain, seed, method, std::ref( rays2[i] ), scale, image2[i],
            I_ang2[i], std::ref( failure_code2[i] ), std::ref( failed_rays2[i] ) ) );
    for ( size_t i = 0; i < N_threads; i++ ) {
        threads.push_back( std::thread( setDeviceAndRun, i, N_threads, setID, function,
            N, std::ref(beam), gain, seed, method, std::ref( rays ), scale, &image2[i],
            &I_ang2[i], std::ref( failure_code2[i] ), std::ref( failed_rays2[i] ) ) );
    }
    for ( size_t i = 0; i < N_threads; i++ ) {
        threads[i].join();
@@ -129,7 +138,6 @@ void RayTraceImageThreadLoop( size_t N_threads,
            failed_rays.push_back( failed_rays2[i][j] );
        free( image2[i] );
        free( I_ang2[i] );
        rays2[i].clear();
    }
}

+6 −3
Original line number Diff line number Diff line
@@ -34,7 +34,9 @@


// Atomic add operation for double
__device__ double atomicAdd2(double* address, double val)
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
#else
__device__ double atomicAdd(double* address, double val)
{
    unsigned long long int* address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
@@ -46,6 +48,7 @@ __device__ double atomicAdd2(double* address, double val)
    } while (assumed != old);
    return __longlong_as_double(old);
}
#endif


// Get the globally unique thread id
@@ -113,14 +116,14 @@ void RayTraceImageCudaKernel( int N, int nx, int ny, int na, int nb, int nv,
            if (i1>=0 && i2>=0){ 
                double *Iv2 = &image[nv*(i1+i2*nx)];
                for (int iv=0; iv<nv; iv++)
                    atomicAdd2(&Iv2[iv],Iv[iv]*scale);
                    atomicAdd(&Iv2[iv],Iv[iv]*scale);
            }
            // Copy I_out into I_ang 
            if (i3>=0 && i4>=0) {    
                double tmp = 0.0;
                for (int iv=0; iv<nv; iv++)
                    tmp += 2.0*dv[iv]*Iv[iv];
                atomicAdd2(&I_ang[i3+i4*na],tmp);
                atomicAdd(&I_ang[i3+i4*na],tmp);
            }
        }
    }
Loading