diff options
-rw-r--r-- | bbot/agent/agent.cli | 14 | ||||
-rw-r--r-- | bbot/agent/agent.cxx | 120 | ||||
-rw-r--r-- | bbot/bbot-agent@.service | 3 |
3 files changed, 105 insertions, 32 deletions
diff --git a/bbot/agent/agent.cli b/bbot/agent/agent.cli index aa7eb59..3d028fd 100644 --- a/bbot/agent/agent.cli +++ b/bbot/agent/agent.cli @@ -111,6 +111,20 @@ namespace bbot network ports, interfaces, etc." } + uint16_t --instance-max = 0 + { + "<num>", + "Maximum number of instances that can perform tasks concurrently. If the + number of instances that have been started is greater than this number + (normally by just one), then when the maximum number of tasks is + already being performed, the extra instances operate in the \i{priority + monitor} mode: they only query controller URLs with priorities higher + than those of the existing tasks and can only perform a task by interrupting + one of them. If the maximum number of instances is \cb{0} (default), + then it is assumed the number of instances started is the maximum + number, essentially disabling the priority monitor functionality." + } + size_t --cpu = 1 { "<num>", diff --git a/bbot/agent/agent.cxx b/bbot/agent/agent.cxx index 8f54346..a8b7c77 100644 --- a/bbot/agent/agent.cxx +++ b/bbot/agent/agent.cxx @@ -61,7 +61,8 @@ namespace bbot standard_version tc_ver; string tc_id; - uint16_t inst; + uint16_t inst; // 1-based. + uint16_t inst_max; // 0 if priority monitoring is disabled. uint16_t offset; @@ -682,10 +683,13 @@ snapshot_path (const dir_path& tp) // (re-)bootstrapping them if necessary. // // Note that this function returns both machines that this process managed to -// lock as well as the machines locked by other processes (except those that -// are being bootstrapped), in case the caller needs to interrupt one of them -// for a higher-priority task. In the latter case, the manifest only has the -// machine_manifest information. +// lock as well as the machines locked by other processes (including those +// that are being bootstrapped), in case the caller needs to interrupt one of +// them for a higher-priority task. 
In the latter case, the manifest is empty +// if the machine is being bootstrapped and only has the machine_manifest +// information otherwise. (The bootstrapped machines have to be returned to +// get the correct count of currently active instances for the inst_max +// comparison.) // struct bootstrapped_machine { @@ -836,7 +840,7 @@ try none = none && sp.empty (); - // Try to lock the machine, skipping it if being bootstrapped. + // Try to lock the machine. // machine_lock ml (lock_machine (tl, tp)); @@ -844,37 +848,39 @@ try { // @@ TMP: restore l4 tracing. - if (!ml.prio) // Being bootstrapped. + machine_manifest mm; + if (ml.prio) { - l1 ([&]{trace << "skipping " << md << ": being bootstrapped " - << "by " << ml.pid;}); - break; - } + // Get the machine manifest (subset of the steps performed for + // the locked case below). + // + // Note that it's possible the machine we get is not what was + // originally locked by the other process (e.g., it has been + // upgraded since). It's also possible that if and when we + // interrupt and lock this machine, it will be a different + // machine (e.g., it has been upgraded since we read this + // machine manifest). To deal with all of that we will be + // reloading this information if/when we acquire the lock to + // this machine. + // + if (sp.empty ()) + { + l3 ([&]{trace << "skipping " << md << ": no subvolume link";}); + break; + } - // Get the machine manifest (subset of the steps performed for - // the locked case below). - // - // Note that it's possible the machine we get is not what was - // originally locked by the other process (e.g., it has been - // upgraded since). It's also possible that if and when we - // interrupt and lock this machine, it will be a different - // machine (e.g., it has been upgraded since we read this - // machine manifest). To deal with all of that we will be - // reloading this information if/when we acquire the lock to - // this machine. 
- // - if (sp.empty ()) + l1 ([&]{trace << "keeping " << md << ": locked by " << ml.pid + << " with priority " << *ml.prio;}); + + mm = parse_manifest<machine_manifest> ( + sp / "manifest", "machine"); + } + else // Being bootstrapped. { - l3 ([&]{trace << "skipping " << md << ": no subvolume link";}); - break; + l1 ([&]{trace << "keeping " << md << ": being bootstrapped " + << "by " << ml.pid;}); } - l1 ([&]{trace << "keeping " << md << ": locked by " << ml.pid - << " with priority " << *ml.prio;}); - - auto mm ( - parse_manifest<machine_manifest> (sp / "manifest", "machine")); - // Add the machine to the lists and bail out. // r.push_back (bootstrapped_machine { @@ -1625,6 +1631,8 @@ try if (inst == 0 || inst > 99) fail << "invalid --instance value " << inst; + inst_max = ops.instance_max (); + offset = (tc_num - 1) * 100 + inst; // Controller URLs. @@ -1699,6 +1707,9 @@ try info << "toolchain id " << tc_id << info << "instance num " << inst; + if (inst_max != 0) + dr << info << "instance max " << inst_max; + for (const string& u: controllers) dr << info << "controller url " << u; } @@ -1761,6 +1772,49 @@ try toolchain_lock& tl (er.first); bootstrapped_machines& ms (er.second); + // Determine if we should operate in the priority monitor mode and, if so, + // the lower bound on the priorities that we should consider. + // + optional<uint64_t> prio_mon; + if (inst_max != 0) + { + uint16_t busy (0); + optional<uint64_t> prio; + + for (const bootstrapped_machine& m: ms) + { + if (!m.lock.locked ()) + { + ++busy; + if (m.lock.prio && (!prio || *m.lock.prio < *prio)) + prio = *m.lock.prio; + } + } + + assert (busy <= inst_max); + + if (busy == inst_max) + { + if (!prio) // All being bootstrapped. + { + sleep = rand_sleep (); + continue; + } + + prio_mon = *prio; + } + } + + // @@ For now bail out if in the priority monitor mode. 
+ // + if (prio_mon) + { + l1 ([&]{trace << "priority monitor, lower bound " << *prio_mon;}); + + sleep = rand_sleep () / 2; + continue; + } + // Prepare task request. // task_request_manifest tq { @@ -1779,6 +1833,8 @@ try { // @@ For now skip machines locked by other processes. // + // @@ Note: skip machines being bootstrapped. + // if (m.lock.locked ()) tq.machines.emplace_back (m.manifest.machine.id, m.manifest.machine.name, diff --git a/bbot/bbot-agent@.service b/bbot/bbot-agent@.service index e938126..18b7c9e 100644 --- a/bbot/bbot-agent@.service +++ b/bbot/bbot-agent@.service @@ -38,6 +38,8 @@ Environment=TOOLCHAIN_NUM=1 Environment=TOOLCHAIN_VER= Environment=TOOLCHAIN_ID= +Environment=INSTANCE_MAX=0 + Environment="CONTROLLER_URL=" Environment="CONTROLLER_TRUST=" @@ -63,6 +65,7 @@ ExecStart=/build/bots/default/bin/bbot-agent \ --toolchain-num ${TOOLCHAIN_NUM} \ --toolchain-ver ${TOOLCHAIN_VER} \ --toolchain-id ${TOOLCHAIN_ID} \ + --instance-max ${INSTANCE_MAX} \ --instance %i \ $CONTROLLER_TRUST \ $CONTROLLER_URL |