From 052dc48939a063b19a13c10cb2c735b4b06a4c4b Mon Sep 17 00:00:00 2001
From: Boris Kolpackov <boris@codesynthesis.com>
Date: Tue, 13 Dec 2016 12:30:14 +0200
Subject: Various scheduler improvements and fixes

---
 build2/scheduler                | 252 ++++++++++++++++++++++------------
 build2/scheduler.cxx            | 129 ++++++++++++--------
 build2/scheduler.txx            |  91 +++++++++++++++
 unit-tests/scheduler/driver.cxx | 125 ++++++++++++++++----
 4 files changed, 415 insertions(+), 182 deletions(-)
 create mode 100644 build2/scheduler.txx

diff --git a/build2/scheduler b/build2/scheduler
index 80b9ec9..ffb8acd 100644
--- a/build2/scheduler
+++ b/build2/scheduler
@@ -8,8 +8,6 @@
 #include <mutex>
 #include <tuple>
 #include <atomic>
-#include <cerrno>
-#include <system_error>
 #include <type_traits> // aligned_storage, etc
 #include <condition_variable>
 
@@ -49,15 +47,7 @@ namespace build2
   // allow a ready master to continue as soon as possible. If it were reused
   // as a helper, then it could be blocked on a nested wait() further down the
   // stack. This means that the number of threads created by the scheduler
-  // will normally exceed the maximum active allowed. It should not, however,
-  // get excessive provided the tasks are not too deeply nested.
-  //
-  // @@ Actually, this is not quite correct: this seems to be a function of
-  //    both depth and width of the task tree. Perhaps we should use a hybrid
-  //    approach: when the number of helper threads reaches a certain limit
-  //    we switch to reusing suspended threads? Also, a thread can rake its
-  //    own and its subtask's queues directly in wait() since wait() won't
-  //    return until they are done.
+  // will normally exceed the maximum active allowed.
   //
   class scheduler
   {
@@ -77,68 +67,7 @@ namespace build2
     //
     template <typename F, typename... A>
     void
-    async (atomic_count& task_count, F&& f, A&&... a)
-    {
-      using task = task_type<F, A...>;
-
-      static_assert (sizeof (task) <= sizeof (task_data),
-                     "insufficient space");
-
-      static_assert (std::is_trivially_destructible<task>::value,
-                     "not trivially destructible");
-
-      // Push into the queue unless we are running serially or the queue is
-      // full.
-      //
-      task_data* td (nullptr);
-
-      if (max_active_ != 1)
-      {
-        task_queue& tq (task_queue_ != nullptr
-                        ? *task_queue_
-                        : create_queue ());
-
-        lock ql (tq.mutex);
-
-        if (tq.shutdown)
-          throw system_error (ECANCELED, std::system_category ());
-
-        if ((td = push (tq)) != nullptr)
-        {
-          // Package the task.
-          //
-          new (&td->data) task {
-            &task_count,
-            decay_copy (forward<F> (f)),
-            typename task::args_type (decay_copy (forward<A> (a))...)};
-
-          td->thunk = &task_thunk<F, A...>;
-        }
-        else
-          tq.stat_full++;
-      }
-
-      // If serial/full, then run the task synchronously. In this case
-      // there is no need to mess with task count.
-      //
-      if (td == nullptr)
-      {
-        forward<F> (f) (forward<A> (a)...);
-        return;
-      }
-
-      // Increment the task count.
-      //
-      task_count.fetch_add (1, std::memory_order_release);
-
-      lock l (mutex_);
-      task_ = true;
-
-      // If there is a spare active thread, wake up (or create) the helper.
-      //
-      if (active_ < max_active_)
-        activate_helper (l);
-    }
+    async (atomic_count& task_count, F&&, A&&...);
 
     // Wait until the task count reaches 0. If the scheduler is shutdown
     // while waiting, throw system_error(ECANCELED).
@@ -163,22 +92,24 @@ namespace build2
     // already active (e.g., the calling thread). It must not be 0 (since
     // someone has to schedule the first task).
     //
-    // If maximum threads is unspecified, then a generally appropriate default
-    // limit it used.
+    // If the maximum threads or task queue depth arguments are unspecified,
+    // then appropriate defaults are used.
     //
     scheduler (size_t max_active,
                size_t init_active = 1,
-               size_t max_threads = 0)
+               size_t max_threads = 0,
+               size_t queue_depth = 0)
     {
-      startup (max_active, init_active, max_threads);
+      startup (max_active, init_active, max_threads, queue_depth);
     }
 
-    // Note: naturally not thread-safe.
+    // Start the scheduler.
     //
     void
     startup (size_t max_active,
              size_t init_active = 1,
-             size_t max_threads = 0);
+             size_t max_threads = 0,
+             size_t queue_depth = 0);
 
     // Wait for all the helper threads to terminate. Throw system_error on
     // failure. Note that the initially active threads are not waited for.
@@ -201,6 +132,35 @@ namespace build2
     stat
     shutdown ();
 
+    // If initially active thread(s) (besides the one that calls startup())
+    // exist before the call to startup(), then they must call join() before
+    // executing any tasks. The two common cases where you don't have to
+    // call join() are a single active thread that calls startup()/shutdown()
+    // and active thread(s) that are created after startup().
+    //
+    void
+    join ()
+    {
+      assert (task_queue_ == nullptr);
+
+      // Lock the mutex to make sure the values set in startup() are visible
+      // in this thread.
+      //
+      lock l (mutex_);
+    }
+
+    // If initially active thread(s) participate in multiple schedulers
+    // and/or sessions (intervals between startup() and shutdown()), then
+    // they must call leave() before joining another scheduler/session. Note
+    // that this applies to the active thread that calls shutdown(). Note
+    // also that a thread can only participate in one scheduler at a time.
+    //
+    void
+    leave ()
+    {
+      task_queue_ = nullptr;
+    }
+
     // Return the number of hardware threads or 0 if unable to determine.
     //
     static size_t
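
To make the join()/leave() contract above concrete, here is a minimal usage
sketch (the scheduler instance and the thread function are hypothetical, not
part of this patch):

  // Assume two initially active threads: the one that constructs the
  // scheduler (and thus calls startup()) and one that existed before.
  //
  scheduler s (4 /* max_active */, 2 /* init_active */);

  void
  second_active_thread ()
  {
    s.join ();  // Make startup() effects visible; must precede any tasks.

    // ... run tasks via s.async()/s.wait() ...

    s.leave (); // Required before joining another scheduler/session.
  }
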
@@ -252,21 +212,7 @@ namespace build2
 
     template <typename F, typename... A>
     static void
-    task_thunk (scheduler& s, lock& ql, void* td)
-    {
-      using task = task_type<F, A...>;
-
-      // Move the data and release the lock.
-      //
-      task t (move (*static_cast<task*> (td)));
-      ql.unlock ();
-
-      t.thunk (std::index_sequence_for<A...> ());
-
-      atomic_count& tc (*t.task_count);
-      if (--tc == 0)
-        s.resume (tc); // Resume a waiter, if any.
-    }
+    task_thunk (scheduler&, lock&, void*);
 
     template <typename T>
     static std::decay_t<T>
     decay_copy (T&& x);
@@ -284,8 +230,7 @@ namespace build2
     // (init_active + helpers) <= max_threads
     //
     // Note that the first three are immutable between startup() and
-    // shutdown() so can be accessed without a lock (@@ What about
-    // the case of multiple initially active?).
+    // shutdown() so can be accessed without a lock (but see join()).
     //
     size_t init_active_ = 0; // Initially active threads.
     size_t max_active_ = 0;  // Maximum number of active threads.
@@ -378,11 +323,18 @@ namespace build2
 
       size_t stat_full = 0; // Number of times pop() returned NULL.
 
-      // head <= stop <= tail
+      // Our task queue is circular with head being the index of the first
+      // element and tail -- of the last. Since this makes the empty and one
+      // element cases indistinguishable, we also keep the size.
       //
-      size_t head = 0; // Index of the first element.
-      size_t stop = 0; // Index of the element to stop at in pop_back().
-      size_t tail = 0; // Index of the one past last element.
+      // The mark is an index somewhere between (figuratively speaking) head
+      // and tail, if enabled. If the mark is hit, then it is disabled until
+      // the queue becomes empty.
+      //
+      size_t head = 0;
+      size_t mark = 0;
+      size_t tail = 0;
+      size_t size = 0;
 
       unique_ptr<task_data[]> data;
 
@@ -398,40 +350,102 @@ namespace build2
     task_data*
     push (task_queue& tq)
     {
-      return tq.tail != task_queue_depth_ ? &tq.data[tq.tail++] : nullptr;
+      size_t& s (tq.size);
+      size_t& t (tq.tail);
+
+      if (s != task_queue_depth_)
+      {
+        // normal  wrap  empty
+        //   |      |      |
+        t = s != 0 ? (t != task_queue_depth_ - 1 ? t + 1 : 0) : t;
+        s++;
+        return &tq.data[t];
+      }
+
+      return nullptr;
     }
 
     bool
-    empty_front (task_queue& tq) const {return tq.head == tq.tail;}
+    empty_front (task_queue& tq) const {return tq.size == 0;}
 
     void
     pop_front (task_queue& tq, lock& ql)
     {
-      task_data& td (tq.data[tq.head++]);
+      size_t& s (tq.size);
+      size_t& h (tq.head);
+      size_t& m (tq.mark);
+
+      bool a (h == m); // Adjust mark?
+      task_data& td (tq.data[h]);
+
+      // normal  wrap  empty
+      //   |      |      |
+      h = s != 1 ? (h != task_queue_depth_ - 1 ? h + 1 : 0) : h;
 
-      if (tq.head == tq.tail)
-        tq.stop = tq.head = tq.tail = 0; // Reset.
-      else if (tq.head > tq.stop)
-        tq.stop = tq.head;
+      if (--s == 0 || a)
+        m = h; // Reset or adjust the mark.
 
       // The thunk moves the task data to its stack, releases the lock,
       // and continues to execute the task.
       //
       td.thunk (*this, ql, &td.data);
+      ql.lock ();
     }
 
     bool
-    empty_back (task_queue& tq) const {return tq.stop == tq.tail;}
+    empty_back (task_queue& tq) const
+    {
+      return tq.size == 0 || tq.mark == task_queue_depth_;
+    }
 
     void
     pop_back (task_queue& tq, lock& ql)
     {
-      task_data& td (tq.data[--tq.tail]);
+      size_t& s (tq.size);
+      size_t& t (tq.tail);
+      size_t& m (tq.mark);
+
+      bool a (t == m); // Adjust mark?
+      task_data& td (tq.data[t]);
 
-      if (tq.head == tq.tail)
-        tq.stop = tq.head = tq.tail = 0;
+      // Save the old queue mark and set the new one in case the task we are
+      // about to run adds sub-tasks.
+      //
+      size_t om (m);
+      m = t; // Where next push() will go.
+
+      // normal  wrap  empty
+      //   |      |      |
+      t = s != 1 ? (t != 0 ? t - 1 : task_queue_depth_ - 1) : t;
+      --s;
 
       td.thunk (*this, ql, &td.data);
+      ql.lock ();
+
+      // Restore the old mark (which we might have to adjust).
+      //
+      if (s == 0)
+        m = t;                 // Reset the mark.
+      else if (a)
+        m = task_queue_depth_; // Disable the mark.
+      else
+        // What happens if head goes past the old mark? In this case we will
+        // get into the empty queue state before we end up making any (wrong)
+        // decisions based on this value. Unfortunately there is no way to
+        // detect this (and do some sanity asserts) since things can wrap
+        // around.
+        //
+        // To put it another way, the understanding here is that after the
+        // task returns we will either have an empty queue or there will
+        // still be tasks between the old mark and the current tail,
+        // something along these lines:
+        //
+        // OOOOOXXXXOOO
+        //     |  |  |
+        //     m  h  t
+        //
+        m = om;
     }
 
     // Each thread has its own queue. Instead of allocating all max_threads of
     // them up front, we will reserve the space and then create them on demand.
     //
     vector<unique_ptr<task_queue>> task_queues_;
 
-    // TLS cache of each thread's task queue.
+    // TLS cache of thread's task queue.
+    //
+    // Apparently Apple's Clang "temporarily disabled" C++11 thread_local
+    // until they can implement a "fast" version, which reportedly happened
+    // in Xcode 8. So for now we will continue using __thread for this
+    // target.
     //
-    thread_local static task_queue* task_queue_;
+    static
+#if defined(__apple_build_version__) && __apple_build_version__ < 8000000
+    __thread
+#else
+    thread_local
+#endif
+    task_queue* task_queue_;
 
     task_queue&
     create_queue ();
@@ -453,4 +477,6 @@ namespace build2
   extern scheduler sched;
 }
 
+#include <build2/scheduler.txx>
+
 #endif // BUILD2_SCHEDULER
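
The circular queue logic above is easier to follow in isolation. Below is a
minimal single-threaded sketch of the same head/tail/size arithmetic (the
mark is omitted and all names are illustrative, not from this patch; popping
an empty ring is undefined, as in the real queue):

#include <cassert>
#include <cstddef>

struct ring // Fixed-depth circular queue, as in scheduler::task_queue.
{
  static const std::size_t depth = 4;

  std::size_t head = 0, tail = 0, size = 0;
  int data[depth];

  bool push (int v)
  {
    if (size == depth)
      return false; // Full.

    if (size != 0) // If empty, write at the current tail.
      tail = tail != depth - 1 ? tail + 1 : 0; // Advance or wrap.

    data[tail] = v;
    size++;
    return true;
  }

  int pop_front () // Oldest first: what the helper threads do.
  {
    int v (data[head]);
    if (size-- != 1) // One-element case: head stays put.
      head = head != depth - 1 ? head + 1 : 0;
    return v;
  }

  int pop_back () // Newest first: what the owning thread does in wait().
  {
    int v (data[tail]);
    if (size-- != 1)
      tail = tail != 0 ? tail - 1 : depth - 1;
    return v;
  }
};

int main ()
{
  ring r;
  r.push (1); r.push (2); r.push (3);
  assert (r.pop_back ()  == 3); // LIFO end (owner).
  assert (r.pop_front () == 1); // FIFO end (helper).
  assert (r.pop_front () == 2 && r.size == 0);
}
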
diff --git a/build2/scheduler.cxx b/build2/scheduler.cxx
index b62d2d4..4619e85 100644
--- a/build2/scheduler.cxx
+++ b/build2/scheduler.cxx
@@ -4,6 +4,8 @@
 
 #include <build2/scheduler>
 
+#include <cerrno>
+
 using namespace std;
 
 namespace build2
@@ -16,27 +18,14 @@ namespace build2
 
     // See if we can run some of our own tasks.
     //
-    task_queue& tq (*task_queue_); // Must have been initializied by async().
+    task_queue& tq (*task_queue_); // Must have been set by async() or task
+                                   // count would have been 0.
 
     for (lock ql (tq.mutex); !tq.shutdown && !empty_back (tq); )
-    {
-      // Save the old stop point and set the new one in case the task we are
-      // about to run adds sub-tasks.
-      //
-      size_t stop (tq.stop);
-      tq.stop = tq.tail - 1; // Index of the first sub-task to be added (-1
-                             // is for the following pop_back()).
-
-      pop_back (tq, ql); // Releases the lock.
-      ql.lock ();
-
-      // Restore the old stop point which we might have to adjust.
-      //
-      tq.stop = tq.head > stop ? tq.head : tq.tail < stop ? tq.tail : stop;
-    }
+      pop_back (tq, ql);
 
-    // Note that empty task queue doesn't automatically mean the task count is
-    // zero (some might still be executing asynchronously).
+    // Note that empty task queue doesn't automatically mean the task count
+    // is zero (some might still be executing asynchronously).
     //
     if (task_count == 0)
       return;
@@ -135,8 +124,16 @@ namespace build2
   }
 
   void scheduler::
-  startup (size_t max_active, size_t init_active, size_t max_threads)
+  startup (size_t max_active,
+           size_t init_active,
+           size_t max_threads,
+           size_t queue_depth)
   {
+    // Lock the mutex to make sure our changes are visible in (other) active
+    // threads.
+    //
+    lock l (mutex_);
+
     // Use 4x max_active on 32-bit and 8x max_active on 64-bit. Unless we were
     // asked to run serially.
     //
@@ -153,30 +150,53 @@ namespace build2
     max_threads_ = max_threads;
 
     // This value should be proportional to the amount of hardware concurrency
-    // we have. Note that the queue entry is quite sizable.
+    // we have (no use queuing things if helpers cannot keep up). Note that the
+    // queue entry is quite sizable.
     //
-    task_queue_depth_ = max_active * sizeof (void*) * 4;
+    task_queue_depth_ = queue_depth != 0
+      ? queue_depth
+      : max_active * sizeof (void*) * 2;
 
-    task_queues_.clear ();
     task_queues_.reserve (max_threads_);
 
-    // Pick a nice prime for common max_threads numbers. Though Intel Xeons
-    // are all over the map when it comes to cores (6, 8, 10, 12, 14, 16,
-    // 18, 20, 22).
+    // Pick a nice prime for common max_threads numbers. Experience shows that
+    // we want something close to 2x for small numbers, then reduce to 1.5x
+    // in-between, and 1x for large ones.
+    //
+    // Note that Intel Xeons are all over the map when it comes to cores (6,
+    // 8, 10, 12, 14, 16, 18, 20, 22).
     //
     wait_queue_size_ =            // HW threads x bits
+      //
+      // 2x
+      //
       max_threads ==   8 ?   17 : // 2 x 4
       max_threads ==  16 ?   31 : // 4 x 4, 2 x 8
-      max_threads ==  32 ?   63 : // 4 x 8
-      max_threads ==  48 ?   97 : // 6 x 8
-      max_threads ==  64 ?  127 : // 8 x 8
-      max_threads ==  96 ?  191 : // 12 x 8
-      max_threads == 128 ?  257 : // 16 x 8
-      max_threads == 192 ?  383 : // 24 x 8
-      max_threads == 256 ?  509 : // 32 x 8
-      max_threads == 384 ?  769 : // 48 x 8
-      max_threads == 512 ? 1021 : // 64 x 8
-      2 * max_threads - 1;
+      //
+      // 1.5x
+      //
+      max_threads ==  32 ?   47 : // 4 x 8
+      max_threads ==  48 ?   53 : // 6 x 8
+      max_threads ==  64 ?   67 : // 8 x 8
+      max_threads ==  80 ?   89 : // 10 x 8
+      //
+      // 1x
+      //
+      max_threads ==  96 ?  101 : // 12 x 8
+      max_threads == 112 ?  127 : // 14 x 8
+      max_threads == 128 ?  131 : // 16 x 8
+      max_threads == 144 ?  139 : // 18 x 8
+      max_threads == 160 ?  157 : // 20 x 8
+      max_threads == 176 ?  173 : // 22 x 8
+      max_threads == 192 ?  191 : // 24 x 8
+      max_threads == 224 ?  223 : // 28 x 8
+      max_threads == 256 ?  251 : // 32 x 8
+      max_threads == 288 ?  271 : // 36 x 8
+      max_threads == 320 ?  313 : // 40 x 8
+      max_threads == 352 ?  331 : // 44 x 8
+      max_threads == 384 ?  367 : // 48 x 8
+      max_threads == 512 ?  499 : // 64 x 8
+      max_threads - 1;            // Assume max_threads is even.
 
     wait_queue_.reset (new wait_slot[wait_queue_size_]);
@@ -249,6 +269,11 @@ namespace build2
       l.lock ();
     }
 
+    // Free the memory.
+    //
+    wait_queue_.reset ();
+    task_queues_.clear ();
+
     r.thread_max_active  = max_active_;
     r.thread_max_total   = max_threads_;
     r.thread_max_waiting = stat_max_waiters_;
@@ -281,7 +306,7 @@ namespace build2
       starting_++;
       l.unlock ();
 
-      // Restore the counter if the thread creation fails.
+      // Restore the counters if the thread creation fails.
       //
       struct guard
       {
@@ -338,10 +363,8 @@ namespace build2
       {
         task_queue& tq (*s.task_queues_[i]);
 
-        for (lock ql (tq.mutex);
-             !tq.shutdown && !s.empty_front (tq);
-             ql.lock ())
-          s.pop_front (tq, ql); // Releases the lock.
+        for (lock ql (tq.mutex); !tq.shutdown && !s.empty_front (tq); )
+          s.pop_front (tq, ql);
       }
 
       l.lock ();
@@ -367,16 +390,30 @@ namespace build2
     s.helpers_--;
   }
 
-  thread_local scheduler::task_queue* scheduler::task_queue_ = nullptr;
+#if defined(__apple_build_version__) && __apple_build_version__ < 8000000
+  __thread
+#else
+  thread_local
+#endif
+  scheduler::task_queue* scheduler::task_queue_ = nullptr;
 
   auto scheduler::
   create_queue () -> task_queue&
   {
-    lock l (mutex_);
-    task_queues_.push_back (make_unique<task_queue> (task_queue_depth_));
-    task_queue_ = task_queues_.back ().get ();
-    task_queue_->shutdown = shutdown_;
-    return *task_queue_;
+    // Note that task_queue_depth is immutable between startup() and
+    // shutdown() (but see join()).
+    //
+    unique_ptr<task_queue> tqp (new task_queue (task_queue_depth_));
+    task_queue& tq (*tqp);
+
+    {
+      lock l (mutex_);
+      tq.shutdown = shutdown_;
+      task_queues_.push_back (move (tqp));
+    }
+
+    task_queue_ = &tq;
+    return tq;
   }
 
   scheduler sched;
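
The prime wait queue sizes picked in startup() above matter if, as the table
suggests, a waiter is mapped to a slot by reducing the address of its task
counter modulo wait_queue_size_ (the actual mapping lives in the scheduler
internals, not in this patch). A hedged sketch of that kind of slot selection,
with an illustrative name:

#include <cstddef>
#include <cstdint>

// Illustrative only: pick a wait slot for a task counter. Counter addresses
// tend to share alignment (multiples of 8 or 64); with an even table size
// such regularly-spaced values would collide in a fraction of the slots,
// while a prime size shares no factor with the stride and uses every slot.
//
static std::size_t
wait_slot_index (const void* counter, std::size_t wait_queue_size)
{
  return reinterpret_cast<std::uintptr_t> (counter) % wait_queue_size;
}
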
diff --git a/build2/scheduler.txx b/build2/scheduler.txx
new file mode 100644
index 0000000..9202f0b
--- /dev/null
+++ b/build2/scheduler.txx
@@ -0,0 +1,91 @@
+// file      : build2/scheduler.txx -*- C++ -*-
+// copyright : Copyright (c) 2014-2016 Code Synthesis Ltd
+// license   : MIT; see accompanying LICENSE file
+
+#include <cerrno>
+
+namespace build2
+{
+  template <typename F, typename... A>
+  void scheduler::
+  async (atomic_count& task_count, F&& f, A&&... a)
+  {
+    using task = task_type<F, A...>;
+
+    static_assert (sizeof (task) <= sizeof (task_data::data),
+                   "insufficient space");
+
+    static_assert (std::is_trivially_destructible<task>::value,
+                   "not trivially destructible");
+
+    // Push into the queue unless we are running serially or the queue is
+    // full.
+    //
+    task_data* td (nullptr);
+
+    if (max_active_ != 1)
+    {
+      task_queue* tq (task_queue_); // Single load.
+      if (tq == nullptr)
+        tq = &create_queue ();
+
+      lock ql (tq->mutex);
+
+      if (tq->shutdown)
+        throw system_error (ECANCELED, std::system_category ());
+
+      if ((td = push (*tq)) != nullptr)
+      {
+        // Package the task.
+        //
+        new (&td->data) task {
+          &task_count,
+          decay_copy (forward<F> (f)),
+          typename task::args_type (decay_copy (forward<A> (a))...)};
+
+        td->thunk = &task_thunk<F, A...>;
+      }
+      else
+        tq->stat_full++;
+    }
+
+    // If serial/full, then run the task synchronously. In this case
+    // there is no need to mess with task count.
+    //
+    if (td == nullptr)
+    {
+      forward<F> (f) (forward<A> (a)...);
+      return;
+    }
+
+    // Increment the task count.
+    //
+    task_count.fetch_add (1, std::memory_order_release);
+
+    lock l (mutex_);
+    task_ = true;
+
+    // If there is a spare active thread, wake up (or create) the helper.
+    //
+    if (active_ < max_active_)
+      activate_helper (l);
+  }
+
+  template <typename F, typename... A>
+  void scheduler::
+  task_thunk (scheduler& s, lock& ql, void* td)
+  {
+    using task = task_type<F, A...>;
+
+    // Move the data and release the lock.
+    //
+    task t (move (*static_cast<task*> (td)));
+    ql.unlock ();
+
+    t.thunk (std::index_sequence_for<A...> ());
+
+    atomic_count& tc (*t.task_count);
+    if (--tc == 0)
+      s.resume (tc); // Resume a waiter, if any.
+  }
+}
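
For reference, a minimal fork-join use of the async()/wait() interface
implemented above (the task function and values are made up for illustration;
results are passed by reference via std::ref, the same way the unit test
below does it):

#include <cstddef>
#include <functional> // std::ref

#include <build2/scheduler>

// Hypothetical task: sum the [b, e) range into r.
//
static void
sum_range (std::size_t b, std::size_t e, std::size_t& r)
{
  for (; b != e; ++b)
    r += b;
}

static std::size_t
parallel_sum (build2::scheduler& s, std::size_t n)
{
  std::size_t r1 (0), r2 (0);
  build2::scheduler::atomic_count task_count (0);

  // Each task may run on a helper thread or, if the queue is full or
  // max_active is 1, synchronously on this thread.
  //
  s.async (task_count, sum_range, std::size_t (0), n / 2, std::ref (r1));
  s.async (task_count, sum_range, n / 2, n, std::ref (r2));

  s.wait (task_count); // Also helps by running queued tasks from the back.
  return r1 + r2;
}
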
diff --git a/unit-tests/scheduler/driver.cxx b/unit-tests/scheduler/driver.cxx
index 726415a..46bd55d 100644
--- a/unit-tests/scheduler/driver.cxx
+++ b/unit-tests/scheduler/driver.cxx
@@ -17,74 +17,133 @@ using namespace std;
 
 namespace build2
 {
-  // Usage argv[0] [<max-active>]
+  // Usage argv[0] [-v <volume>] [-d <difficulty>] [-c <max-active>]
+  //               [-q <queue-depth>]
   //
+  // -v  task tree volume (affects both depth and width), for example 100
+  // -d  computational difficulty of each task, for example 10
+  // -c  max active threads, if unspecified or 0, then hardware concurrency
+  // -q  task queue depth, if unspecified or 0, then appropriate default used
+  //
+  // Specifying any option also turns on the verbose mode.
+  //
+  // Notes on testing:
+  //
+  // 1. Ideally you would want to test things on an SMP machine.
+  //
+  // 2. When you need to compare performance, disable turbo boost since its
+  //    availability depends on CPU utilization/temperature:
+  //
+  //    # echo '1' >/sys/devices/system/cpu/intel_pstate/no_turbo
+  //
+  // 3. Use turbostat(1) to see per-CPU details (utilization, frequency):
+  //
+  //    $ sudo turbostat --interval 1 ./driver -d 8 -v 300
+  //
+  static bool
+  prime (uint64_t);
+
+  // Find # of primes in the [x, y) range.
+  //
+  static void
+  inner (uint64_t x, uint64_t y, uint64_t& r)
+  {
+    for (; x != y; ++x)
+      if (prime (x))
+        r++;
+  }
+
   int
   main (int argc, char* argv[])
   {
     bool verb (false);
+
+    // Adjust assert() below if changing these defaults.
+    //
+    size_t volume (100);
+    uint32_t difficulty (10);
+
     size_t max_active (0);
+    size_t queue_depth (0);
 
-    if (argc > 1)
+    for (int i (1); i != argc; ++i)
     {
+      string a (argv[i]);
+
+      if (a == "-v")
+        volume = stoul (argv[++i]);
+      else if (a == "-d")
+        difficulty = stoul (argv[++i]);
+      else if (a == "-c")
+        max_active = stoul (argv[++i]);
+      else if (a == "-q")
+        queue_depth = stoul (argv[++i]);
+      else
+        assert (false);
+
       verb = true;
-      max_active = stoul (argv[1]);
     }
 
     if (max_active == 0)
       max_active = scheduler::hardware_concurrency ();
 
-    scheduler s (max_active);
+    scheduler s (max_active, 1, 0, queue_depth);
 
-    auto inner = [] (size_t x, size_t y, size_t& out)
+    // Count the primes in the [i, d*i*i) range for each i in (0, n], then
+    // find the number of these counts that are themselves prime.
+    //
+    auto outer = [difficulty, &s] (size_t n, vector<uint64_t>& o, uint64_t& r)
     {
-      out = x + y;
-      this_thread::sleep_for (chrono::microseconds (out * 10));
-    };
-
-    auto outer = [&s, &inner] (size_t n, size_t& out)
-    {
-      vector<size_t> result (2 * n, 0);
       scheduler::atomic_count task_count (0);
 
-      for (size_t i (0); i != 2 * n; ++i)
+      for (size_t i (1); i <= n; ++i)
       {
+        o[i - 1] = 0;
+
         s.async (task_count,
                  inner,
                  i,
-                 i,
-                 std::ref (result[i]));
+                 i * i * difficulty,
+                 ref (o[i - 1]));
       }
 
      s.wait (task_count);
      assert (task_count == 0);
 
-      for (size_t i (0); i != n; ++i)
-        out += result[i];
-
-      this_thread::sleep_for (chrono::microseconds (out * 10));
+      for (uint64_t v: o)
+        r += prime (v) ? 1 : 0;
     };
 
-    const size_t tasks (50);
+    vector<uint64_t> r (volume, 0);
+    vector<vector<uint64_t>> o (volume, vector<uint64_t> ());
 
-    vector<size_t> result (tasks, 0);
     scheduler::atomic_count task_count (0);
 
-    for (size_t i (0); i != tasks; ++i)
+    for (size_t i (0); i != volume; ++i)
     {
+      o[i].resize (i);
+
       s.async (task_count,
                outer,
               i,
-               std::ref (result[i]));
+               ref (o[i]),
+               ref (r[i]));
     }
 
     s.wait (task_count);
     assert (task_count == 0);
 
+    size_t n (0);
+    for (uint64_t v: r)
+      n += v;
+
+    if (volume == 100 && difficulty == 10)
+      assert (n == 580);
+
     scheduler::stat st (s.shutdown ());
 
     if (verb)
     {
+      cerr << "result             " << n << endl
+           << endl;
+
       cerr << "thread_max_active  " << st.thread_max_active << endl
            << "thread_max_total   " << st.thread_max_total << endl
            << "thread_helpers     " << st.thread_helpers << endl
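
A note on the prime() implementation added in the final hunk below: after 2
and 3 are handled, every prime has the form 6k-1 or 6k+1, and the
`d += i, i = 6 - i` idiom enumerates exactly these candidates (5, 7, 11, 13,
17, 19, ...) by alternately adding 2 and 4. A quick way to convince yourself,
using a naive reference implementation (illustrative code that would go into
the driver translation unit, not part of the patch):

#include <cassert>
#include <cstdint>

// Naive reference: trial division by every integer up to sqrt(x).
//
static bool
prime_naive (std::uint64_t x)
{
  if (x < 2)
    return false;

  for (std::uint64_t d (2); d * d <= x; ++d)
    if (x % d == 0)
      return false;

  return true;
}

static void
check_prime ()
{
  // prime() as defined in the driver below.
  //
  for (std::uint64_t x (0); x != 100000; ++x)
    assert (prime (x) == prime_naive (x));
}
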
@@ -99,6 +158,26 @@ namespace build2
 
     return 0;
   }
+
+  static bool
+  prime (uint64_t x)
+  {
+    if (x == 2 || x == 3)
+      return true;
+
+    if (x < 2 || x % 2 == 0 || x % 3 == 0)
+      return false;
+
+    // Test divisors starting from 5 and incrementing alternately by 2/4.
+    //
+    for (uint64_t d (5), i (2); d * d <= x; d += i, i = 6 - i)
+    {
+      if (x % d == 0)
+        return false;
+    }
+
+    return true;
+  }
 }
 
 int
-- 
cgit v1.1