Commit 65acc3c4 authored by Cianciosa, Mark

Remove result buffers and synchronize all the buffers directly.

parent 7bff9f24
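
In effect, the per-step result_buffers, and the blit that filled them after every step, are gone; the host now synchronizes and reads the compute buffers in place. A minimal sketch of that pattern in the CUDA driver API, assuming a managed allocation so the host may dereference the pointer once the device is idle (error handling omitted; the project's check_error helpers would wrap each call):

    #include <cuda.h>
    #include <iostream>

    int main() {
        cuInit(0);
        CUdevice device;
        cuDeviceGet(&device, 0);
        CUcontext context;
        cuCtxCreate(&context, 0, device);
        CUstream stream;
        cuStreamCreate(&stream, CU_STREAM_DEFAULT);

        // One managed buffer standing in for the solver's compute buffers.
        CUdeviceptr buffer;
        cuMemAllocManaged(&buffer, 1024*sizeof(double), CU_MEM_ATTACH_GLOBAL);
        reinterpret_cast<double *> (buffer)[0] = 1.0;

        // ... enqueue kernel launches on the stream that update the buffer ...

        cuStreamSynchronize(stream); // Drain this stream.
        cuCtxSynchronize();          // Drain everything else in the context.

        // With the device idle, read the result directly from the buffer.
        std::cout << reinterpret_cast<double *> (buffer)[0] << std::endl;

        cuMemFree(buffer);
        cuStreamDestroy(stream);
        cuCtxDestroy(context);
    }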
+8 −37
@@ -41,8 +41,6 @@ namespace gpu {
         unsigned int thread_groups;
 ///  Number of threads in a group.
         unsigned int threads_per_group;
-///  Result buffers.
-        std::vector<CUdeviceptr> result_buffers;
 ///  Index offset.
         size_t buffer_offset;
 ///  Buffer element size.
@@ -208,8 +206,6 @@ namespace gpu {
                 check_error(cuMemAlloc(&buffers[i], backend.size()*buffer_element_size), "cuMemAlloc");
                 check_error(cuMemcpyHtoD(buffers[i], &backend[0], backend.size()*buffer_element_size), "cuMemcpyHtoD");
                 kernel_arguments.push_back(reinterpret_cast<void *> (&buffers[i]));
-
-                check_error(cuMemAllocManaged(&result_buffers[i], result_size, CU_MEM_ATTACH_GLOBAL), "cuMemAllocManaged");
             }
             for (size_t i = inputs.size(), ie = buffers.size(), j = 0; i < ie; i++, j++) {
                 const BACKEND backend = outputs[j]->evaluate();
@@ -217,8 +213,6 @@ namespace gpu {
                 check_error(cuMemAlloc(&buffers[i], backend.size()*buffer_element_size), "cuMemAlloc");
                 check_error(cuMemcpyHtoD(buffers[i], &backend[0], backend.size()*buffer_element_size), "cuMemcpyHtoD");
                 kernel_arguments.push_back(reinterpret_cast<void *> (&buffers[i]));
-
-                check_error(cuMemAllocManaged(&result_buffers[i], result_size, CU_MEM_ATTACH_GLOBAL), "cuMemAllocManaged");
             }
 
             int value;
@@ -229,25 +223,6 @@ namespace gpu {
             std::cout << "  Threads per group        : " << threads_per_group << std::endl;
             std::cout << "  Number of groups         : " << thread_groups << std::endl;
             std::cout << "  Total problem size       : " << threads_per_group*thread_groups << std::endl;
-
-            encode_blit();
         }
 
-//------------------------------------------------------------------------------
-///  @brief  Encode a blit command to the stream.
-///
-///  Blit is the Metal terminology for a memcopy operation added to the command
-///  stream; the CUDA analogue is an asynchronous memcpy enqueued on a stream.
-//------------------------------------------------------------------------------
-        void encode_blit() {
-            for (size_t i = 0, ie = buffers.size(); i < ie; i++) {
-                check_error_async(cuMemcpyDtoDAsync(result_buffers[i] + time_offset,
-                                                    buffers[i] + buffer_offset,
-                                                    buffer_element_size, stream),
-                                  "cuMemcpyDtoDAsync");
-            }
-
-            time_offset += buffer_element_size;
-        }
-
 //------------------------------------------------------------------------------
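
The removed comment's open question has a simple answer: the CUDA counterpart of a Metal blit is an asynchronous memcpy enqueued on the stream, which is exactly what the deleted function did. As a standalone sketch (enqueue_copy is a hypothetical name, error checks omitted):

    #include <cuda.h>

    // Enqueue a device-to-device copy of num_bytes from src + src_offset to
    // dst + dst_offset. The copy runs in order with other work on the same
    // stream and does not block the host.
    void enqueue_copy(CUdeviceptr dst, size_t dst_offset,
                      CUdeviceptr src, size_t src_offset,
                      size_t num_bytes, CUstream stream) {
        cuMemcpyDtoDAsync(dst + dst_offset, src + src_offset, num_bytes, stream);
    }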
@@ -261,7 +236,6 @@ namespace gpu {
                                              threads_per_group, 1, 1, 0, stream,
                                              kernel_arguments.data(), NULL),
                               "cuLaunchKernel");
-            encode_blit();
         }
 
 //------------------------------------------------------------------------------
@@ -269,22 +243,19 @@ namespace gpu {
 //------------------------------------------------------------------------------
         void wait() {
             check_error_async(cuStreamSynchronize(stream), "cuStreamSynchronize");
+            check_error(cuCtxSynchronize(), "cuCtxSynchronize");
         }
 
 //------------------------------------------------------------------------------
 ///  @brief Print out the results.
 ///
-///  @param[in] num_times Number of times to record.
+///  @param[in] index Particle index to print.
 //------------------------------------------------------------------------------
         template<class BACKEND>
-        void print_results(const size_t num_times) {
-            check_error(cuCtxSynchronize(), "cuCtxSynchronize");
-            for (size_t i = 0, ie = num_times + 1; i < ie; i++) {
-                std::cout << i << " ";
-                for (CUdeviceptr &buffer : result_buffers) {
-                    std::cout << reinterpret_cast<typename BACKEND::base *> (buffer)[i] << " ";
-                }
-                std::cout << std::endl;
+        void print_results(const size_t index) {
+            wait();
+            for (CUdeviceptr &buffer : buffers) {
+                std::cout << reinterpret_cast<typename BACKEND::base *> (buffer)[index] << " ";
             }
             std::cout << std::endl;
         }
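
Note that the reworked wait() synchronizes twice: cuStreamSynchronize drains only the one stream, while cuCtxSynchronize blocks until every pending task in the current context has finished, whichever stream issued it. A small sketch of the distinction (two hypothetical streams):

    CUstream stream_a, stream_b;
    cuStreamCreate(&stream_a, CU_STREAM_DEFAULT);
    cuStreamCreate(&stream_b, CU_STREAM_DEFAULT);

    // ... enqueue work on both streams ...

    cuStreamSynchronize(stream_a); // Waits on stream_a only; stream_b may still be busy.
    cuCtxSynchronize();            // Waits on all outstanding work in the context.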
+4 −3
@@ -245,13 +245,14 @@ namespace jit {
 
             const timeing::measure_diagnostic gpu_time("GPU Time");
 
+            context.print_results<BACKEND> (0);
             for (size_t i = 0; i < num_steps; i++) {
                 context.step();
+                context.print_results<BACKEND> (0);
             }
-            context.wait();
+            gpu_time.stop();
 
-            context.print_results<BACKEND> (num_steps);
-            gpu_time.stop();
         }
     };
 }
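
Since print_results() now takes a particle index and synchronizes internally, the driver prints one row per step instead of dumping a stored history at the end; condensed, the new loop reads:

    context.print_results<BACKEND> (0);     // Initial state of particle 0.
    for (size_t i = 0; i < num_steps; i++) {
        context.step();                     // Enqueue one step's kernel launch.
        context.print_results<BACKEND> (0); // Synchronize, then print in place.
    }
    gpu_time.stop();

One side effect worth noting: because every print_results() call waits on the device, the measured GPU time now includes a host synchronization per step.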
+8 −40
@@ -34,8 +34,6 @@ namespace gpu {
         NSUInteger thread_groups;
 ///  Number of threads in a group.
         NSUInteger threads_per_group;
-///  Result buffers.
-        std::vector<id<MTLBuffer>> result_buffers;
 ///  Index offset.
         size_t buffer_offset;
 ///  Buffer element size.
@@ -105,16 +103,12 @@ namespace gpu {
                     buffers.push_back([device newBufferWithBytes:&backend[0]
                                                           length:backend.size()*buffer_element_size
                                                          options:MTLResourceStorageModeManaged]);
-                    result_buffers.push_back([device newBufferWithLength:num_times*buffer_element_size
-                                                                 options:MTLResourceStorageModeManaged]);
                 }
                 for (graph::shared_leaf<BACKEND> &output : outputs) {
                     const BACKEND backend = output->evaluate();
                     buffers.push_back([device newBufferWithBytes:&backend[0]
                                                           length:backend.size()*buffer_element_size
                                                          options:MTLResourceStorageModeManaged]);
-                    result_buffers.push_back([device newBufferWithLength:num_times*buffer_element_size
-                                                                 options:MTLResourceStorageModeManaged]);
                 }
 
                 threads_per_group = state.maxTotalThreadsPerThreadgroup;
@@ -123,10 +117,6 @@ namespace gpu {
                 std::cout << "  Threads per group  : " << threads_per_group << std::endl;
                 std::cout << "  Number of groups   : " << thread_groups << std::endl;
                 std::cout << "  Total problem size : " << threads_per_group*thread_groups << std::endl;
 
-                command_buffer = [queue commandBuffer];
-                encode_blit();
-                [command_buffer commit];
             }
         }

@@ -139,23 +129,6 @@ namespace gpu {
             return options;
         }
 
-//------------------------------------------------------------------------------
-///  @brief Encode a blit command.
-//------------------------------------------------------------------------------
-        void encode_blit() {
-            id<MTLBlitCommandEncoder> blit = [command_buffer blitCommandEncoder];
-            for (size_t i = 0, ie = buffers.size(); i < ie; i++) {
-                [blit copyFromBuffer:buffers[i]
-                        sourceOffset:buffer_offset
-                            toBuffer:result_buffers[i]
-                   destinationOffset:time_offset
-                                size:buffer_element_size];
-            }
-            [blit endEncoding];
-
-            time_offset += buffer_element_size;
-        }
 //------------------------------------------------------------------------------
 ///  @brief Perform a time step.
 ///
@@ -177,8 +150,6 @@ namespace gpu {
                         threadsPerThreadgroup:MTLSizeMake(threads_per_group, 1, 1)];
                 [encoder endEncoding];
 
-                encode_blit();
-
                 [command_buffer commit];
             }
         }
@@ -189,8 +160,8 @@ namespace gpu {
         void wait() {
             command_buffer = [queue commandBuffer];
             id<MTLBlitCommandEncoder> blit = [command_buffer blitCommandEncoder];
-            for (size_t i = 0, ie = buffers.size(); i < ie; i++) {
-                [blit synchronizeResource:result_buffers[i]];
+            for (id<MTLBuffer> buffer : buffers) {
+                [blit synchronizeResource:buffer];
             }
             [blit endEncoding];
 
@@ -201,17 +172,14 @@ namespace gpu {
 //------------------------------------------------------------------------------
 ///  @brief Print out the results.
 ///
-///  @param[in] num_times Number of times to record.
+///  @param[in] index Particle index to print.
 //------------------------------------------------------------------------------
         template<class BACKEND>
-        void print_results(const size_t num_times) {
-            for (size_t i = 0, ie = num_times + 1; i < ie; i++) {
-                std::cout << i << " ";
-                for (id<MTLBuffer> buffer : result_buffers) {
+        void print_results(const size_t index) {
+            wait();
+            for (id<MTLBuffer> buffer : buffers) {
                 const typename BACKEND::base *contents = static_cast<typename BACKEND::base *> ([buffer contents]);
-                    std::cout << contents[i] << " ";
-                }
-                std::cout << std::endl;
+                std::cout << contents[index] << " ";
             }
             std::cout << std::endl;
         }