From 6fd981a2a5b41e38f4b2e8f5798b431e4cdcf19b Mon Sep 17 00:00:00 2001
From: Boris Kolpackov <boris@codesynthesis.com>
Date: Tue, 20 Feb 2018 11:09:26 +0200
Subject: Initial work on deadlock detection support

Fun fact: In a serial build system a dependency cycle leads to an infinite
loop/recursion. In a parallel -- to a deadlock. Still think build systems
are fun?
---
 build2/scheduler.cxx | 93 ++++++++++++++++++++++++++++++++++++++++++++--------
 build2/scheduler.hxx |  9 ++++-
 build2/scheduler.txx |  2 +-
 3 files changed, 88 insertions(+), 16 deletions(-)
diff --git a/build2/scheduler.cxx b/build2/scheduler.cxx
index 79951b6..34b71cd 100644
--- a/build2/scheduler.cxx
+++ b/build2/scheduler.cxx
@@ -12,8 +12,9 @@
 #endif
 
 #include <cerrno>
+#include <exception> // std::terminate()
 
-#include <iostream>
+#include <build2/diagnostics.hxx>
 
 using namespace std;
 
@@ -73,17 +74,76 @@ namespace build2
 
     active_--;
     waiting_++;
+    progress_++;
 
     if (waiting_ > stat_max_waiters_)
       stat_max_waiters_ = waiting_;
 
-    // A spare active thread has become available. If there are ready
-    // masters or eager helpers, wake someone up.
+    // A spare active thread has become available. If there are ready masters
+    // or eager helpers, wake someone up.
     //
     if (ready_ != 0)
+    {
       ready_condv_.notify_one ();
+    }
     else if (queued_task_count_.load (std::memory_order_consume) != 0)
+    {
       activate_helper (l);
+    }
+    // @@ TODO: Redo as a separate "monitoring" thread.
+    //
+    // This still doesn't work for the phase lock case where we call
+    // deactivate and then go wait on a condition variable: we are doing
+    // deadlock detection while holding the lock that prevents other
+    // threads from making progress!
+    //
+#if 0
+    else if (active_ == 0)
+    {
+      // We may have a deadlock which can happen because of dependency cycles.
+      //
+      // Relying on the active_ count alone is not precise enough, however:
+      // some threads might be transitioning between the active/waiting/ready
+      // states. Carefully accounting for this is not trivial, to say the
+      // least (especially in the face of spurious wakeups). So we are going
+      // to do a "fuzzy" deadlock detection by measuring "progress". The idea
+      // is that those transitions should be pretty short-lived and so if we
+      // wait for a couple of hundreds context switches, then we should be
+      // able to distinguish a real deadlock from the transition case.
+      //
+      size_t p (progress_);
+
+      for (size_t i (0); i != 100; ++i)
+      {
+        l.unlock ();
+        this_thread::yield () is not enough.
+        l.lock ();
+
+        if (p != progress_)
+          break;
+      }
+
+      if (p == progress_)
+      {
+        // Reactivate and fail.
+        //
+        waiting_--;
+        active_++;
+
+        // Shutting things down cleanly is tricky: we could have handles it in
+        // the scheduler (e.g., by setting a flag and then waking everyone up,
+        // similar to shutdown). But there could also be "external waiters"
+        // that have called deactivate() -- we have no way to wake those up.
+        // So for now we are going to abort (the nice thing about abort is if
+        // this is not a dependency cycle, then we have a core to examine).
+        //
+        error << "deadlock detected, can be caused by a dependency cycle" <<
+          info << "re-run with -s to diagnose dependency cycles";
+
+        std::terminate ();
+      }
+    }
+#endif
   }
 
   void scheduler::
@@ -93,7 +153,6 @@ namespace build2
       return;
 
     lock l (mutex_);
-    waiting_--;
 
     if (collision)
       stat_wait_collisions_++;
@@ -101,17 +160,19 @@ namespace build2
     // If we have spare active threads, then become active. Otherwise it
     // enters the ready queue.
     //
+    waiting_--;
     ready_++;
+    progress_++;
 
     while (!shutdown_ && active_ >= max_active_)
       ready_condv_.wait (l);
 
     ready_--;
+    active_++;
+    progress_++;
 
     if (shutdown_)
       throw_generic_error (ECANCELED);
-
-    active_++;
   }
 
   size_t scheduler::
@@ -303,11 +364,13 @@ namespace build2
     if ((wait_queue_size_ = max_threads == 1 ? 0 : shard_size ()) != 0)
       wait_queue_.reset (new wait_slot[wait_queue_size_]);
 
-    // Reset stats counters.
+    // Reset counters.
     //
     stat_max_waiters_     = 0;
     stat_wait_collisions_ = 0;
 
+    progress_ = 0;
+
     for (size_t i (0); i != wait_queue_size_; ++i)
       wait_queue_[i].shutdown = false;
 
@@ -434,15 +497,17 @@ namespace build2
     if (!shutdown_)
     {
       if (idle_ != 0)
+      {
         idle_condv_.notify_one ();
+      }
+      //
+      // Ignore the max_threads value if we have queued tasks but no active
+      // threads. This means everyone is waiting for something to happen but
+      // nobody is doing anything (e.g., working the queues). This, for
+      // example, can happen if a thread waits for a task that is in its queue
+      // but is below the mark.
+      //
       else if (init_active_ + helpers_ < max_threads_ ||
-               //
-               // Ignore the max_threads value if we have queued tasks but no
-               // active threads. This means everyone is waiting for something
-               // to happen but nobody is doing anything (e.g., work the
-               // queues). This, for example, can happen if a thread waits for
-               // a task that is in its queue but is below the mark.
-               //
                (active_ == 0 &&
                 queued_task_count_.load (memory_order_consume) != 0))
       {
diff --git a/build2/scheduler.hxx b/build2/scheduler.hxx
index d90a26f..aac9425 100644
--- a/build2/scheduler.hxx
+++ b/build2/scheduler.hxx
@@ -426,6 +426,13 @@ namespace build2
     size_t stat_max_waiters_;
     size_t stat_wait_collisions_;
 
+    // Progress counter.
+    //
+    // We increment it for each active->waiting->ready->active transition
+    // and it is used for deadlock detection (see deactivate()).
+    //
+    size_t progress_;
+
     // Wait queue.
     //
     // A wait slot blocks a bunch of threads. When they are (all) unblocked,
@@ -456,7 +463,7 @@ namespace build2
     //
     // Each queue has its own mutex plus we have an atomic total count of the
     // queued tasks. Note that it should only be modified while holding one
-    // of the queue lock.
+    // of the queue locks.
     //
     atomic_count queued_task_count_;
 
diff --git a/build2/scheduler.txx b/build2/scheduler.txx
index dd98b4a..37dd320 100644
--- a/build2/scheduler.txx
+++ b/build2/scheduler.txx
@@ -105,7 +105,7 @@ namespace build2
     }
 
     // If there is a spare active thread, wake up (or create) the helper
-    // (unless someone already snatched it).
+    // (unless someone already snatched the task).
     //
     if (queued_task_count_.load (std::memory_order_consume) != 0)
     {
-- 
cgit v1.1