aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--bbot/agent/agent.cxx121
1 files changed, 85 insertions, 36 deletions
diff --git a/bbot/agent/agent.cxx b/bbot/agent/agent.cxx
index 1b35ee0..8e0caf6 100644
--- a/bbot/agent/agent.cxx
+++ b/bbot/agent/agent.cxx
@@ -487,11 +487,12 @@ lock_toolchain (unsigned int timeout)
class machine_lock
{
public:
- // A lock is either locked by this process or it contains information
- // about the process holding the lock.
+ // A lock is either locked by this process or it contains information about
+ // the process holding the lock.
//
pid_t pid; // Process using the machine.
- optional<uint64_t> prio; // Task priority (absent means being bootstrapped).
+ optional<uint64_t> prio; // Task priority (absent means being bootstrapped
+ // or have been suspended).
machine_lock () = default; // Uninitialized lock.
@@ -524,29 +525,45 @@ public:
// implementation for rationale).
//
void
- write (const toolchain_lock& tl, optional<uint64_t> prio)
+ bootstrap (const toolchain_lock& tl)
{
assert (tl.locked () && fl_);
if (fd_ != nullfd)
- {
- pid_t pid (getpid ());
+ write (nullopt);
+ }
- string l (to_string (pid));
+ void
+ perform_task (const toolchain_lock& tl, uint64_t prio)
+ {
+ assert (tl.locked () && fl_);
- if (prio)
- {
- l += ' ';
- l += to_string (*prio);
- }
+ if (fd_ != nullfd)
+ write (prio);
+ }
- auto n (fdwrite (fd_.get (), l.c_str (), l.size ()));
+ // Truncate the holding process information after the call to perform_task()
+ // so that it doesn't contain the priority, marking the machine as being
+ // suspended.
+ //
+ // Note that this one can be called without holding the toolchain lock.
+ //
+ void
+ suspend_task ()
+ {
+ assert (fl_);
- if (n == -1)
- throw_generic_ios_failure (errno);
+ if (fd_ != nullfd)
+ {
+ assert (tp_ != 0); // Must be called after perform_task().
- if (static_cast<size_t> (n) != l.size ())
- throw_generic_ios_failure (EFBIG);
+ // While there is no direct statement to this effect in POSIX, the
+ // consensus on the internet is that truncation is atomic, in a sense
+ // that the reader shouldn't see a partially truncated content. Feels
+ // like should be doubly so when actually truncating as opposed to
+ // extending the size, which is what we do.
+ //
+ fdtruncate (fd_.get (), tp_);
}
}
@@ -573,9 +590,35 @@ public:
: pid (pi), prio (pr), fl_ (false) {}
private:
- path fp_;
- auto_fd fd_;
- bool fl_ = false;
+ void
+ write (optional<uint64_t> prio)
+ {
+ pid_t pid (getpid ());
+
+ string l (to_string (pid));
+
+ if (prio)
+ {
+ tp_ = l.size (); // Truncate position.
+
+ l += ' ';
+ l += to_string (*prio);
+ }
+
+ auto n (fdwrite (fd_.get (), l.c_str (), l.size ()));
+
+ if (n == -1)
+ throw_generic_ios_failure (errno);
+
+ if (static_cast<size_t> (n) != l.size ())
+ throw_generic_ios_failure (EFBIG);
+ }
+
+private:
+ path fp_;
+ auto_fd fd_;
+ bool fl_ = false;
+ uint64_t tp_ = 0; // Truncate position.
};
// Try to lock the machine given its -<toolchain> directory. Return unlocked
@@ -686,12 +729,12 @@ snapshot_path (const dir_path& tp)
//
// Note that this function returns both machines that this process managed to
// lock as well as the machines locked by other processes (including those
-// that are being bootstrapped), in case the caller needs to interrupt one of
-// them for a higher-priority task. In the latter case, the manifest is empty
-// if the machine is being bootstrapped and only has the machine_manifest
-// information otherwise. (The bootstrapped machines have to be returned to
-// get the correct count of currently active instances for the inst_max
-// comparison.)
+// that are being bootstrapped or that have been suspended), in case the
+// caller needs to interrupt one of them for a higher-priority task. In the
+// latter case, the manifest is empty if the machine is bootstrapping or
+// suspended and only has the machine_manifest information otherwise. (The
+// bootstrapping/suspended machines have to be returned to get the correct
+// count of currently active instances for the inst_max comparison.)
//
struct bootstrapped_machine
{
@@ -776,7 +819,7 @@ try
};
// Notice and warn if there are no machines (as opposed to all of them
- // being locked).
+ // being busy).
//
bool none (true);
@@ -921,10 +964,10 @@ try
mm = parse_manifest<machine_manifest> (
sp / "manifest", "machine");
}
- else // Being bootstrapped.
+ else // Bootstrapping/suspended.
{
l1 ([&]{trace << "keeping " << md << ": being bootstrapped "
- << "by " << ml.pid;});
+ << "or suspened by " << ml.pid;});
}
// Add the machine to the lists and bail out.
@@ -1101,7 +1144,7 @@ try
// from scratch.
//
r.clear ();
- ml.write (tl, nullopt /* prio */); // Being bootstrapped.
+ ml.bootstrap (tl);
tl.unlock ();
optional<bootstrapped_machine_manifest> bmm (
@@ -1155,7 +1198,8 @@ catch (const system_error& e)
}
static result_manifest
-perform_task (const dir_path& md,
+perform_task (machine_lock& ml,
+ const dir_path& md,
const bootstrapped_machine_manifest& mm,
const task_manifest& tm)
try
@@ -1304,7 +1348,7 @@ try
try {m->forcedown (false);} catch (const failed&) {}
}));
- auto soft_fail = [&xp, &m, &r] (const char* msg)
+ auto soft_fail = [&ml, &xp, &m, &r] (const char* msg)
{
{
diag_record dr (error);
@@ -1314,6 +1358,11 @@ try
try
{
+ // Update the information in the machine lock to signal that the
+ // machine is suspended and cannot be interrupted.
+ //
+ ml.suspend_task ();
+
m->suspend (false);
m->wait (false);
m->cleanup ();
@@ -1900,7 +1949,7 @@ try
if (busy == inst_max)
{
- if (!prio) // All being bootstrapped.
+ if (!prio) // All bootstrapping/suspended.
{
sleep = rand_sleep ();
continue;
@@ -1960,7 +2009,7 @@ try
{
// @@ For now skip machines locked by other processes.
//
- // @@ Note: skip machines being bootstrapped.
+ // @@ Note: skip machines bootstrapping/suspended.
//
if (m.lock.locked ())
tq.machines.emplace_back (m.manifest.machine.id,
@@ -2141,7 +2190,7 @@ try
{
if (mh.name == m.manifest.machine.name)
{
- m.lock.write (tl, prio);
+ m.lock.perform_task (tl, prio);
pm = &m;
}
else
@@ -2188,7 +2237,7 @@ try
if (!tr.agent_checksum || *tr.agent_checksum != agent_checksum)
t.worker_checksum = nullopt;
- result_manifest r (perform_task (m.path, m.manifest, t));
+ result_manifest r (perform_task (m.lock, m.path, m.manifest, t));
m.lock.unlock (); // No need to hold the lock any longer.