From 67d3980a2aefaa9962c3011804171fc529dc5b2b Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Sun, 14 May 2023 09:54:16 +0200 Subject: Setup machine interrupt handling infrastructure --- bbot/agent/agent.cxx | 105 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 70 insertions(+), 35 deletions(-) diff --git a/bbot/agent/agent.cxx b/bbot/agent/agent.cxx index 8e0caf6..d58743b 100644 --- a/bbot/agent/agent.cxx +++ b/bbot/agent/agent.cxx @@ -1197,8 +1197,13 @@ catch (const system_error& e) fail << "unable to iterate over " << machines << ": " << e << endf; } +// Perform the build task throwing interrupt if it has been interrupted. +// +struct interrupt {}; + static result_manifest -perform_task (machine_lock& ml, +perform_task (toolchain_lock tl, // Note: assumes ownership. + machine_lock& ml, const dir_path& md, const bootstrapped_machine_manifest& mm, const task_manifest& tm) @@ -1206,6 +1211,8 @@ try { tracer trace ("perform_task", md.string ().c_str ()); + tl.unlock (); // @@ TMP (we have to arm the signal handler under lock). + result_manifest r { tm.name, tm.version, @@ -1226,7 +1233,8 @@ try // // 3. Start the TFTP server and the machine. // - // 4. Serve TFTP requests while watching out for the result manifest. + // 4. Serve TFTP requests while watching out for the result manifest and + // interrupts. // // 5. Clean up (force the machine down and delete the snapshot). // @@ -2177,34 +2185,13 @@ try // task_manifest& t (*tr.task); - // First verify the requested machine is one of those we sent in tq. Then - // find the corresponding bootstrapped_machine instance in ms. Also unlock - // all the other machines as well as the global toolchain lock. + // First verify the requested machine is one of those we sent in tq. // - bootstrapped_machine* pm (nullptr); - for (const machine_header_manifest& mh: tq.machines) - { - if (mh.name == t.machine) // Yes, comparing names, not ids. - { - for (bootstrapped_machine& m: ms) - { - if (mh.name == m.manifest.machine.name) - { - m.lock.perform_task (tl, prio); - pm = &m; - } - else - m.lock.unlock (); - } - - assert (pm != nullptr); - break; - } - } - - tl.unlock (); - - if (pm == nullptr) + if (find_if (tq.machines.begin (), tq.machines.end (), + [&t] (const machine_header_manifest& mh) + { + return mh.name == t.machine; // Yes, names, not ids. + }) == tq.machines.end ()) { error << "task from " << url << " for unknown machine " << t.machine; @@ -2214,8 +2201,6 @@ try continue; } - bootstrapped_machine& m (*pm); - if (ops.dump_task ()) { serialize_manifest (t, cout, "stdout", "task"); @@ -2237,9 +2222,56 @@ try if (!tr.agent_checksum || *tr.agent_checksum != agent_checksum) t.worker_checksum = nullopt; - result_manifest r (perform_task (m.lock, m.path, m.manifest, t)); + // Handle interrupts. + // + // Note that the interrupt can be triggered both by another process (the + // interrupt exception is thrown from perform_task()) as well as by this + // process in case we were unable to interrupt the other process (seeing + // that we have already received a task, responding with an interrupt + // feels like the most sensible option). + // + result_manifest r; + bootstrapped_machine* pm (nullptr); + try + { + // Next find the corresponding bootstrapped_machine instance in ms. Also + // unlock all the other machines. + // + // @@ TODO: looks like this is also where we will interrupt the machines + // (thus inside the try block). Note that we have to do this + // while holding the toolchain lock. Would be good to + // unlock all the machines as well as the toolchain lock on + // failure. + // + for (bootstrapped_machine& m: ms) + { + if (m.manifest.machine.name == t.machine) + { + assert (pm == nullptr); // Sanity check. - m.lock.unlock (); // No need to hold the lock any longer. + m.lock.perform_task (tl, prio); + pm = &m; + } + else + m.lock.unlock (); + } + assert (pm != nullptr); + + r = perform_task (move (tl), pm->lock, pm->path, pm->manifest, t); + } + catch (const interrupt&) + { + r = result_manifest { + t.name, + t.version, + result_status::interrupt, + operation_results {}, + nullopt /* worker_checksum */, + nullopt /* dependency_checksum */}; + } + + if (pm != nullptr) // Let's not assume. + pm->lock.unlock (); // No need to hold the lock any longer. if (ops.dump_result ()) { @@ -2279,6 +2311,8 @@ try fail << "unable to sign task response challenge: " << e; } + result_status rs (r.status); + // Upload the result. // result_request_manifest rq {tr.session, @@ -2337,8 +2371,9 @@ try } } - l2 ([&]{trace << "built " << t.name << '/' << t.version << " " - << "on " << t.machine << " " + l2 ([&]{trace << "built " << t.name << '/' << t.version << ' ' + << "status " << rs << ' ' + << "on " << t.machine << ' ' << "for " << url;}); } } -- cgit v1.1