diff options
Diffstat (limited to 'bbot/agent/agent.cxx')
-rw-r--r-- | bbot/agent/agent.cxx | 120 |
1 files changed, 88 insertions, 32 deletions
diff --git a/bbot/agent/agent.cxx b/bbot/agent/agent.cxx index 8f54346..a8b7c77 100644 --- a/bbot/agent/agent.cxx +++ b/bbot/agent/agent.cxx @@ -61,7 +61,8 @@ namespace bbot standard_version tc_ver; string tc_id; - uint16_t inst; + uint16_t inst; // 1-based. + uint16_t inst_max; // 0 if priority monitoring is disabled. uint16_t offset; @@ -682,10 +683,13 @@ snapshot_path (const dir_path& tp) // (re-)bootstrapping them if necessary. // // Note that this function returns both machines that this process managed to -// lock as well as the machines locked by other processes (except those that -// are being bootstrapped), in case the caller needs to interrupt one of them -// for a higher-priority task. In the latter case, the manifest only has the -// machine_manifest information. +// lock as well as the machines locked by other processes (including those +// that are being bootstrapped), in case the caller needs to interrupt one of +// them for a higher-priority task. In the latter case, the manifest is empty +// if the machine is being bootstrapped and only has the machine_manifest +// information otherwise. (The bootstrapped machines have to be returned to +// get the correct count of currently active instances for the inst_max +// comparison.) // struct bootstrapped_machine { @@ -836,7 +840,7 @@ try none = none && sp.empty (); - // Try to lock the machine, skipping it if being bootstrapped. + // Try to lock the machine. // machine_lock ml (lock_machine (tl, tp)); @@ -844,37 +848,39 @@ try { // @@ TMP: restore l4 tracing. - if (!ml.prio) // Being bootstrapped. + machine_manifest mm; + if (ml.prio) { - l1 ([&]{trace << "skipping " << md << ": being bootstrapped " - << "by " << ml.pid;}); - break; - } + // Get the machine manifest (subset of the steps performed for + // the locked case below). + // + // Note that it's possible the machine we get is not what was + // originally locked by the other process (e.g., it has been + // upgraded since). It's also possible that if and when we + // interrupt and lock this machine, it will be a different + // machine (e.g., it has been upgraded since we read this + // machine manifest). To deal with all of that we will be + // reloading this information if/when we acquire the lock to + // this machine. + // + if (sp.empty ()) + { + l3 ([&]{trace << "skipping " << md << ": no subvolume link";}); + break; + } - // Get the machine manifest (subset of the steps performed for - // the locked case below). - // - // Note that it's possible the machine we get is not what was - // originally locked by the other process (e.g., it has been - // upgraded since). It's also possible that if and when we - // interrupt and lock this machine, it will be a different - // machine (e.g., it has been upgraded since we read this - // machine manifest). To deal with all of that we will be - // reloading this information if/when we acquire the lock to - // this machine. - // - if (sp.empty ()) + l1 ([&]{trace << "keeping " << md << ": locked by " << ml.pid + << " with priority " << *ml.prio;}); + + mm = parse_manifest<machine_manifest> ( + sp / "manifest", "machine"); + } + else // Being bootstrapped. { - l3 ([&]{trace << "skipping " << md << ": no subvolume link";}); - break; + l1 ([&]{trace << "keeping " << md << ": being bootstrapped " + << "by " << ml.pid;}); } - l1 ([&]{trace << "keeping " << md << ": locked by " << ml.pid - << " with priority " << *ml.prio;}); - - auto mm ( - parse_manifest<machine_manifest> (sp / "manifest", "machine")); - // Add the machine to the lists and bail out. // r.push_back (bootstrapped_machine { @@ -1625,6 +1631,8 @@ try if (inst == 0 || inst > 99) fail << "invalid --instance value " << inst; + inst_max = ops.instance_max (); + offset = (tc_num - 1) * 100 + inst; // Controller URLs. @@ -1699,6 +1707,9 @@ try info << "toolchain id " << tc_id << info << "instance num " << inst; + if (inst_max != 0) + dr << info << "instance max " << inst_max; + for (const string& u: controllers) dr << info << "controller url " << u; } @@ -1761,6 +1772,49 @@ try toolchain_lock& tl (er.first); bootstrapped_machines& ms (er.second); + // Determine if we should operate in the priority monitor mode and, if so, + // the lower bound on the priorities that we should consider. + // + optional<uint64_t> prio_mon; + if (inst_max != 0) + { + uint16_t busy (0); + optional<uint64_t> prio; + + for (const bootstrapped_machine& m: ms) + { + if (!m.lock.locked ()) + { + ++busy; + if (m.lock.prio && (!prio || *m.lock.prio < *prio)) + prio = *m.lock.prio; + } + } + + assert (busy <= inst_max); + + if (busy == inst_max) + { + if (!prio) // All being bootstrapped. + { + sleep = rand_sleep (); + continue; + } + + prio_mon = *prio; + } + } + + // @@ For now bail out if in the priority monitor mode. + // + if (prio_mon) + { + l1 ([&]{trace << "priority monitor, lower bound " << *prio_mon;}); + + sleep = rand_sleep () / 2; + continue; + } + // Prepare task request. // task_request_manifest tq { @@ -1779,6 +1833,8 @@ try { // @@ For now skip machines locked by other processes. // + // @@ Note: skip machines being bootstrapped. + // if (m.lock.locked ()) tq.machines.emplace_back (m.manifest.machine.id, m.manifest.machine.name, |