aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBoris Kolpackov <boris@codesynthesis.com>2023-05-14 09:54:16 +0200
committerBoris Kolpackov <boris@codesynthesis.com>2023-05-14 10:08:30 +0200
commit67d3980a2aefaa9962c3011804171fc529dc5b2b (patch)
treec8cb322378365e205d0cd5aa14c66d74fda73dc6
parent26f669cc6ecf7c3957bcb6caef26d203b5e582a7 (diff)
Setup machine interrupt handling infrastructure
-rw-r--r--bbot/agent/agent.cxx105
1 files changed, 70 insertions, 35 deletions
diff --git a/bbot/agent/agent.cxx b/bbot/agent/agent.cxx
index 8e0caf6..d58743b 100644
--- a/bbot/agent/agent.cxx
+++ b/bbot/agent/agent.cxx
@@ -1197,8 +1197,13 @@ catch (const system_error& e)
fail << "unable to iterate over " << machines << ": " << e << endf;
}
+// Perform the build task throwing interrupt if it has been interrupted.
+//
+struct interrupt {};
+
static result_manifest
-perform_task (machine_lock& ml,
+perform_task (toolchain_lock tl, // Note: assumes ownership.
+ machine_lock& ml,
const dir_path& md,
const bootstrapped_machine_manifest& mm,
const task_manifest& tm)
@@ -1206,6 +1211,8 @@ try
{
tracer trace ("perform_task", md.string ().c_str ());
+ tl.unlock (); // @@ TMP (we have to arm the signal handler under lock).
+
result_manifest r {
tm.name,
tm.version,
@@ -1226,7 +1233,8 @@ try
//
// 3. Start the TFTP server and the machine.
//
- // 4. Serve TFTP requests while watching out for the result manifest.
+ // 4. Serve TFTP requests while watching out for the result manifest and
+ // interrupts.
//
// 5. Clean up (force the machine down and delete the snapshot).
//
@@ -2177,34 +2185,13 @@ try
//
task_manifest& t (*tr.task);
- // First verify the requested machine is one of those we sent in tq. Then
- // find the corresponding bootstrapped_machine instance in ms. Also unlock
- // all the other machines as well as the global toolchain lock.
+ // First verify the requested machine is one of those we sent in tq.
//
- bootstrapped_machine* pm (nullptr);
- for (const machine_header_manifest& mh: tq.machines)
- {
- if (mh.name == t.machine) // Yes, comparing names, not ids.
- {
- for (bootstrapped_machine& m: ms)
- {
- if (mh.name == m.manifest.machine.name)
- {
- m.lock.perform_task (tl, prio);
- pm = &m;
- }
- else
- m.lock.unlock ();
- }
-
- assert (pm != nullptr);
- break;
- }
- }
-
- tl.unlock ();
-
- if (pm == nullptr)
+ if (find_if (tq.machines.begin (), tq.machines.end (),
+ [&t] (const machine_header_manifest& mh)
+ {
+ return mh.name == t.machine; // Yes, names, not ids.
+ }) == tq.machines.end ())
{
error << "task from " << url << " for unknown machine " << t.machine;
@@ -2214,8 +2201,6 @@ try
continue;
}
- bootstrapped_machine& m (*pm);
-
if (ops.dump_task ())
{
serialize_manifest (t, cout, "stdout", "task");
@@ -2237,9 +2222,56 @@ try
if (!tr.agent_checksum || *tr.agent_checksum != agent_checksum)
t.worker_checksum = nullopt;
- result_manifest r (perform_task (m.lock, m.path, m.manifest, t));
+ // Handle interrupts.
+ //
+ // Note that the interrupt can be triggered both by another process (the
+ // interrupt exception is thrown from perform_task()) as well as by this
+ // process in case we were unable to interrupt the other process (seeing
+ // that we have already received a task, responding with an interrupt
+ // feels like the most sensible option).
+ //
+ result_manifest r;
+ bootstrapped_machine* pm (nullptr);
+ try
+ {
+ // Next find the corresponding bootstrapped_machine instance in ms. Also
+ // unlock all the other machines.
+ //
+ // @@ TODO: looks like this is also where we will interrupt the machines
+ // (thus inside the try block). Note that we have to do this
+ // while holding the toolchain lock. Would be good to
+ // unlock all the machines as well as the toolchain lock on
+ // failure.
+ //
+ for (bootstrapped_machine& m: ms)
+ {
+ if (m.manifest.machine.name == t.machine)
+ {
+ assert (pm == nullptr); // Sanity check.
- m.lock.unlock (); // No need to hold the lock any longer.
+ m.lock.perform_task (tl, prio);
+ pm = &m;
+ }
+ else
+ m.lock.unlock ();
+ }
+ assert (pm != nullptr);
+
+ r = perform_task (move (tl), pm->lock, pm->path, pm->manifest, t);
+ }
+ catch (const interrupt&)
+ {
+ r = result_manifest {
+ t.name,
+ t.version,
+ result_status::interrupt,
+ operation_results {},
+ nullopt /* worker_checksum */,
+ nullopt /* dependency_checksum */};
+ }
+
+ if (pm != nullptr) // Let's not assume.
+ pm->lock.unlock (); // No need to hold the lock any longer.
if (ops.dump_result ())
{
@@ -2279,6 +2311,8 @@ try
fail << "unable to sign task response challenge: " << e;
}
+ result_status rs (r.status);
+
// Upload the result.
//
result_request_manifest rq {tr.session,
@@ -2337,8 +2371,9 @@ try
}
}
- l2 ([&]{trace << "built " << t.name << '/' << t.version << " "
- << "on " << t.machine << " "
+ l2 ([&]{trace << "built " << t.name << '/' << t.version << ' '
+ << "status " << rs << ' '
+ << "on " << t.machine << ' '
<< "for " << url;});
}
}