From 6fd981a2a5b41e38f4b2e8f5798b431e4cdcf19b Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Tue, 20 Feb 2018 11:09:26 +0200 Subject: Initial work on deadlock detection support Fun fact: In a serial build system a dependency cycle leads to an infinite loop/recursion. In a parallel -- to a deadlock. Still think build systems are fun? --- build2/scheduler.cxx | 93 ++++++++++++++++++++++++++++++++++++++++++++-------- build2/scheduler.hxx | 9 ++++- build2/scheduler.txx | 2 +- 3 files changed, 88 insertions(+), 16 deletions(-) diff --git a/build2/scheduler.cxx b/build2/scheduler.cxx index 79951b6..34b71cd 100644 --- a/build2/scheduler.cxx +++ b/build2/scheduler.cxx @@ -12,8 +12,9 @@ #endif #include +#include // std::terminate() -#include +#include using namespace std; @@ -73,17 +74,76 @@ namespace build2 active_--; waiting_++; + progress_++; if (waiting_ > stat_max_waiters_) stat_max_waiters_ = waiting_; - // A spare active thread has become available. If there are ready - // masters or eager helpers, wake someone up. + // A spare active thread has become available. If there are ready masters + // or eager helpers, wake someone up. // if (ready_ != 0) + { ready_condv_.notify_one (); + } else if (queued_task_count_.load (std::memory_order_consume) != 0) + { activate_helper (l); + } + // @@ TODO: Redo as a separate "monitoring" thread. + // + // This still doesn't work for the phase lock case where we call + // deactivate and then go wait on a condition variable: we are doing + // deadlock detection while holding the lock that prevents other + // threads from making progress! + // +#if 0 + else if (active_ == 0) + { + // We may have a deadlock which can happen because of dependency cycles. + // + // Relying on the active_ count alone is not precise enough, however: + // some threads might be transitioning between the active/waiting/ready + // states. Carefully accounting for this is not trivial, to say the + // least (especially in the face of spurious wakeups). So we are going + // to do a "fuzzy" deadlock detection by measuring "progress". The idea + // is that those transitions should be pretty short-lived and so if we + // wait for a couple of hundreds context switches, then we should be + // able to distinguish a real deadlock from the transition case. + // + size_t p (progress_); + + for (size_t i (0); i != 100; ++i) + { + l.unlock (); + this_thread::yield () is not enough. + l.lock (); + + if (p != progress_) + break; + } + + if (p == progress_) + { + // Reactivate and fail. + // + waiting_--; + active_++; + + // Shutting things down cleanly is tricky: we could have handles it in + // the scheduler (e.g., by setting a flag and then waking everyone up, + // similar to shutdown). But there could also be "external waiters" + // that have called deactivate() -- we have no way to wake those up. + // So for now we are going to abort (the nice thing about abort is if + // this is not a dependency cycle, then we have a core to examine). + // + error << "deadlock detected, can be caused by a dependency cycle" << + info << "re-run with -s to diagnose dependency cycles"; + + std::terminate (); + } + } +#endif } void scheduler:: @@ -93,7 +153,6 @@ namespace build2 return; lock l (mutex_); - waiting_--; if (collision) stat_wait_collisions_++; @@ -101,17 +160,19 @@ namespace build2 // If we have spare active threads, then become active. Otherwise it // enters the ready queue. // + waiting_--; ready_++; + progress_++; while (!shutdown_ && active_ >= max_active_) ready_condv_.wait (l); ready_--; + active_++; + progress_++; if (shutdown_) throw_generic_error (ECANCELED); - - active_++; } size_t scheduler:: @@ -303,11 +364,13 @@ namespace build2 if ((wait_queue_size_ = max_threads == 1 ? 0 : shard_size ()) != 0) wait_queue_.reset (new wait_slot[wait_queue_size_]); - // Reset stats counters. + // Reset counters. // stat_max_waiters_ = 0; stat_wait_collisions_ = 0; + progress_ = 0; + for (size_t i (0); i != wait_queue_size_; ++i) wait_queue_[i].shutdown = false; @@ -434,15 +497,17 @@ namespace build2 if (!shutdown_) { if (idle_ != 0) + { idle_condv_.notify_one (); + } + // + // Ignore the max_threads value if we have queued tasks but no active + // threads. This means everyone is waiting for something to happen but + // nobody is doing anything (e.g., working the queues). This, for + // example, can happen if a thread waits for a task that is in its queue + // but is below the mark. + // else if (init_active_ + helpers_ < max_threads_ || - // - // Ignore the max_threads value if we have queued tasks but no - // active threads. This means everyone is waiting for something - // to happen but nobody is doing anything (e.g., work the - // queues). This, for example, can happen if a thread waits for - // a task that is in its queue but is below the mark. - // (active_ == 0 && queued_task_count_.load (memory_order_consume) != 0)) { diff --git a/build2/scheduler.hxx b/build2/scheduler.hxx index d90a26f..aac9425 100644 --- a/build2/scheduler.hxx +++ b/build2/scheduler.hxx @@ -426,6 +426,13 @@ namespace build2 size_t stat_max_waiters_; size_t stat_wait_collisions_; + // Progress counter. + // + // We increment it for each active->waiting->ready->active transition + // and it is used for deadlock detection (see deactivate()). + // + size_t progress_; + // Wait queue. // // A wait slot blocks a bunch of threads. When they are (all) unblocked, @@ -456,7 +463,7 @@ namespace build2 // // Each queue has its own mutex plus we have an atomic total count of the // queued tasks. Note that it should only be modified while holding one - // of the queue lock. + // of the queue locks. // atomic_count queued_task_count_; diff --git a/build2/scheduler.txx b/build2/scheduler.txx index dd98b4a..37dd320 100644 --- a/build2/scheduler.txx +++ b/build2/scheduler.txx @@ -105,7 +105,7 @@ namespace build2 } // If there is a spare active thread, wake up (or create) the helper - // (unless someone already snatched it). + // (unless someone already snatched the task). // if (queued_task_count_.load (std::memory_order_consume) != 0) { -- cgit v1.1