Fix call_once_per_loop to handle work stealing where id >> counter (4f9fd72c) · Commits · NDIP / Tool Sources / Direct-Geometry Spectroscopy / DCA / DCA Main

include/dca/parallel/hpx/hpx.hpp

+28 −21

Original line number	Diff line number	Diff line
		@@ -24,6 +24,7 @@
		#include <hpx/runtime/threads/executors/limiting_executor.hpp>
		#include <hpx/debugging/demangle_helper.hpp>
		#include <hpx/parallel/executors/parallel_executor.hpp>
		#include <hpx/util/yield_while.hpp>
		//
		#include <vector>
		#include <thread>
		@@ -51,6 +52,11 @@ struct thread_traits {
		hpx::this_thread::yield();
		}
		//
		template <typename F>
		static void yield_while(F &&f) {
		hpx::util::yield_while(std::forward<F>(f));
		}
		//
		static std::uint64_t default_threadcount() {
		return hpx::get_num_worker_threads();
		}
		@@ -60,7 +66,6 @@ struct thread_traits {
		std::vector<int> get_affinity()
		{
		cpu_set_t cpu_set_mask;

		auto status = sched_getaffinity(0, sizeof(cpu_set_t), &cpu_set_mask);

		if (status == -1) {
		@@ -73,14 +78,8 @@ std::vector<int> get_affinity()
		cores.reserve(cpu_count);

		for (auto i = 0; i < CPU_SETSIZE && cores.size() < cpu_count; ++i) {
		if (CPU_ISSET(i, &cpu_set_mask)) {
		cores.push_back(i);
		}
		}

		if (cores.size() != cpu_count) {
		throw(std::logic_error("Core count mismatch."));
		}

		return cores;
		}
		@@ -98,8 +97,8 @@ class ThreadPool {
		public:
		// Creates a pool with n_threads.
		// Actually does nothing, HPX does not need to allocate threads
		ThreadPool(size_t n_threads = 1) : exec() /exec(100, 100)/ {
		set_task_count_threshold(n_threads-1);
		ThreadPool(size_t n_threads = 1) : exec(500, 500) {
		pool_size = n_threads;
		}

		ThreadPool(const ThreadPool& /other/) = delete;
		@@ -107,22 +106,23 @@ public:

		// Conclude all the pending work and destroy the threads spawned by this class.
		~ThreadPool() {
		// exec.set_and_wait(0,0);
		exec.wait_all();
		}

		void set_task_count_threshold(std::int64_t count)
		{
		// exec.set_threshold(count, count);
		exec.set_threshold(count, count+1);
		}

		void wait_for_tasks()
		{
		// exec.wait();
		exec.wait_all();
		}

		// we don't do anything here
		void enlarge(std::size_t n_threads) {
		std::cout << "HPX threadpool enlarge: " << n_threads << std::endl;
		pool_size = n_threads;
		}

		// Call asynchronously the function f with arguments args. This method is thread safe.
		@@ -138,7 +138,6 @@ public:
		std::cout << "enqueue: Arguments : "
		<< hpx::util::debug::print_type<Args...>(" \| ") << std::endl;
		#endif
		// typedef decltype(hpx::async(exec, std::forward<F>(f), std::forward<Args>(args)...)) return_type;
		return hpx::async(exec, std::forward<F>(f), std::forward<Args>(args)...);
		}

		@@ -150,7 +149,8 @@ public:
		// Returns the number of threads used by this class.
		std::size_t size() const {
		std::cout << "HPX threadpool size" << std::endl;
		return hpx::get_num_worker_threads();
		return pool_size;
		// return hpx::get_num_worker_threads();
		}

		// Returns a static instance.
		@@ -159,11 +159,14 @@ public:
		return global_pool;
		}

		// hpx::threads::executors::limiting_executor
		// <hpx::threads::executors::default_executor> exec;
		// this is just to make the DCA tests pass
		int pool_size;

		hpx::parallel::execution::parallel_executor exec;
		hpx::threads::executors::limiting_executor
		<hpx::threads::executors::default_executor> exec;

		// hpx::parallel::execution::parallel_executor;
		// hpx::threads::executors::default_executor;
		};


		@@ -182,10 +185,11 @@ struct hpxthread
		pool.enlarge(num_threads);

		// Fork.
		// Note we do not use std::forward here because we do not want to
		// accidentally move the same args more than once, we must use copy semantics
		for (int id = 0; id < num_threads; ++id)
		futures.emplace_back(
		pool.enqueue(std::forward<F>(f),
		id, num_threads,std::forward<Args>(args)...));
		pool.enqueue(f, id, num_threads, args...));

		// Join.
		for (auto& future : futures)
		@@ -206,9 +210,12 @@ struct hpxthread
		pool.enlarge(num_threads);

		// Spawn num_threads tasks.
		// Note we do not use std::forward here because we do not want to
		// accidentally move the same args more than once, we must use copy semantics
		for (int id = 0; id < num_threads; ++id)
		futures.emplace_back(
		pool.enqueue(std::forward<F>(f), id, num_threads, std::forward<Args>(args)...));
		pool.enqueue(f, id, num_threads, args...));

		// Sum the result of the tasks.
		ReturnType result = 0;
		for (auto& future : futures)

include/dca/parallel/stdthread/thread_pool/thread_pool.hpp

+7 −0

Original line number	Diff line number	Diff line
		@@ -41,6 +41,13 @@ struct thread_traits {
		static void yield() {
		std::this_thread::yield();
		}
		//
		template <typename F>
		static void yield_while(F &&f) {
		while (f()) {
		std::this_thread::yield();
		}
		}
		};

		class ThreadPool {

include/dca/parallel/util/call_once_per_loop.hpp

+20 −13

Original line number	Diff line number	Diff line
		@@ -35,25 +35,32 @@ struct OncePerLoopFlag {
		// completion of f(args...).
		// Precondition: each call must use a non decreasing value of the loop index.
		template <class F, class... Args>
		void callOncePerLoop(OncePerLoopFlag& flag, const int loop_id, F&& f, Args&&... args) {
		const int currently_done = flag.loop_done;
		void callOncePerLoop(OncePerLoopFlag& flag, const int loop_id, F&& f, Args&&... args)
		{
		// if this Id has been done, exit immediately
		if (loop_id <= flag.loop_done) {
		return;
		}

		if (loop_id < 0)
		throw(std::out_of_range("Negative loop index."));
		// if this Id is too far ahead, then wait
		dca::parallel::thread_traits::yield_while([&flag, loop_id](){
		return (loop_id > (flag.loop_done+1));
		});

		if (loop_id <= currently_done)
		// whilst we were yielding, someone else took the Id slot
		if (loop_id <= flag.loop_done) {
		return;
		else if (loop_id > currently_done + 1 && currently_done != -1)
		throw(std::logic_error("Loop id called out of order."));
		}

		// take the lock
		dca::parallel::thread_traits::unique_lock lock(flag.mutex);
		// Check if flag.loop_done changed before locking the mutex.
		if (loop_id <= flag.loop_done)
		// whilst we were waiting, someone else took the Id slot
		if (loop_id <= flag.loop_done) {
		return;
		}

		// Run the task.
		f(args...);

		f(std::forward<Args>(args)...);
		flag.loop_done = loop_id;
		}