From e852e2f469fed309fb32f0f938122bb83be9b8c7 Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Tue, 16 May 2023 05:43:56 +0200 Subject: Implement machine interruption for priority level four --- bbot/agent/agent.cxx | 115 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 80 insertions(+), 35 deletions(-) diff --git a/bbot/agent/agent.cxx b/bbot/agent/agent.cxx index 9aba216..ad8cd74 100644 --- a/bbot/agent/agent.cxx +++ b/bbot/agent/agent.cxx @@ -2133,29 +2133,40 @@ try // Determine which machines we need to offer for this priority. // - for (const bootstrapped_machine& m: ms) { - if (!m.lock.locked ()) + bool interruptable (false); + for (const bootstrapped_machine& m: ms) { - if (!m.lock.prio) // Skip bootstrapping/suspended. - continue; + if (!m.lock.locked ()) + { + if (!m.lock.prio) // Skip bootstrapping/suspended. + continue; - uint64_t eprio (*m.lock.prio); + uint64_t eprio (*m.lock.prio); - // Determine if our priority can interrupt the existing task. - // - // Based on the above discussion of the priority lower bound - // determination (and some menditation) it's clear that we can only - // interrupt the existing task if our priority is (at least) on a - // higher 3rd level. - // - if ((prio / 100) <= (eprio / 100)) - continue; + // Determine if our priority can interrupt the existing task. + // + // Based on the above discussion of the priority lower bound + // determination (and some menditation) it's clear that we can + // only interrupt the existing task if our priority is (at least) + // on a higher 3rd level. + // + if ((prio / 100) <= (eprio / 100)) + continue; + + interruptable = true; + } + + tq.machines.emplace_back (m.manifest.machine.id, + m.manifest.machine.name, + m.manifest.machine.summary); } - tq.machines.emplace_back (m.manifest.machine.id, - m.manifest.machine.name, - m.manifest.machine.summary); + // Sanity check: in the priority monitor mode we should only ask for a + // task if we can interrupt one (this should be taken care of by the + // priority lower bound calculation above). + // + assert (!prio_mon || interruptable); } if (ops.dump_machines ()) @@ -2370,10 +2381,12 @@ try // Next find the corresponding bootstrapped_machine instance in ms. Also // unlock all the other machines. // - // While at it also find the lowest priority candidate to interrupt if - // necessary. + // While at it also see if we need to interrupt the selected machine (if + // busy), one of the existing (if we are at the max allowed instances, + // that is in the priority monitor mode), or all existing (if this is a + // priority level 4 task). // - bootstrapped_machine* im (nullptr); + vector ims; for (bootstrapped_machine& m: ms) { if (m.manifest.machine.name == t.machine) @@ -2385,12 +2398,37 @@ try m.lock.unlock (); else if (m.lock.prio) // Not bootstrapping/suspended. { - if (im == nullptr || *m.lock.prio < *im->lock.prio) - im = &m; + // Only consider machines that we can interrupt (see above). + // + if ((prio / 100) > (*m.lock.prio / 100)) + { + if (prio >= 1000) // Priority level 4 (interrupt all). + ims.push_back (&m); + else if (prio_mon) + { + // Find the lowest priority task to interrupt. + // + if (ims.empty ()) + ims.push_back (&m); + else if (*m.lock.prio < *ims.back ()->lock.prio) + ims.back () = &m; + } + } } } + assert (pm != nullptr); + if (!pm->lock.locked ()) + { + if (prio >= 1000) + ims.insert (ims.begin (), pm); // Interrupt first (see below). + else + ims = {pm}; + } + + assert (!prio_mon || !ims.empty ()); // We should have at least one. + // Move the toolchain lock into this scope so that it's automatically // released on any failure (on the happy path it is released by // perform_task()). @@ -2398,25 +2436,25 @@ try toolchain_lock& rtl (tl); toolchain_lock tl (move (rtl)); - // See if we need to interrupt the selected machine (if busy) or one of - // the existing (if we are at the max allowed instances, that is in the - // priority monitor mode). + // Interrupt the machines, if necessary. // - if (!pm->lock.locked ()) - im = pm; - else if (prio_mon) - assert (im != nullptr); // We should have at least one. - else - im = nullptr; // No interrupt necessary. - - if (im != nullptr) + // Note that if we are interrupting multiple machines, then the target + // machine, if needs to be interrupted, must be first. This way if we + // are unable to successfully interrupt it, we don't interrupt the rest. + // + for (bootstrapped_machine* im: ims) { - assert (!im->lock.locked () && im->lock.prio); // Sanity checks. + bool first (im == ims.front ()); + + // Sanity checks. + // + assert (!im->lock.locked () && im->lock.prio); + assert (im != pm || first); const dir_path& tp (im->path); // - path. l1 ([&]{trace << "interrupting " - << (im == pm ? "target" : "lowest priority") + << (im == pm ? "target" : "lower priority") << " machine " << tp << ", pid " << im->lock.pid;}); // The plan is to send the interrupt and then wait for the lock. @@ -2439,6 +2477,13 @@ try throw_generic_error (errno); } + // If we are interrupting multiple machine, there is no use acquiring + // the lock (or failing if unable to) for subsequent machines since + // this is merely a performance optimization. + // + if (!first) + continue; + // Try to lock the machine. // // While this normally shouldn't take long, there could be parts of -- cgit v1.1