aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBoris Kolpackov <boris@codesynthesis.com>2017-04-12 15:27:40 +0200
committerBoris Kolpackov <boris@codesynthesis.com>2017-04-12 15:27:40 +0200
commit4c7bb136ac8c1c1cd47942ad7fe8257b20997871 (patch)
tree27353008ece9a85fb7ef6b6106c338b432bda004
parent50579107e3f628c2e3c644d5af01ef67404a0f2a (diff)
Complete agent side of machine bootstrap
-rw-r--r--bbot/agent.cli14
-rw-r--r--bbot/agent.cxx128
-rw-r--r--bbot/bbot-agent@.service11
-rw-r--r--bbot/buildfile2
-rw-r--r--bbot/machine38
-rw-r--r--bbot/machine.cxx45
-rw-r--r--bbot/tftp6
-rw-r--r--bbot/tftp.cxx5
-rw-r--r--tests/agent/testscript3
9 files changed, 198 insertions, 54 deletions
diff --git a/bbot/agent.cli b/bbot/agent.cli
index a10889e..562860f 100644
--- a/bbot/agent.cli
+++ b/bbot/agent.cli
@@ -64,6 +64,20 @@ namespace bbot
the default."
}
+ size_t --bootstrap-timeout = 600
+ {
+ "<sec>",
+ "Maximum number of seconds to wait for machine bootstrap completion,
+ 600 (10 minutes) by default."
+ }
+
+ size_t --build-timeout = 1800
+ {
+ "<sec>",
+ "Maximum number of seconds to wait for build completion, 1800 (30
+ minutes) by default."
+ }
+
uint16_t --verbose = 1
{
"<level>",
diff --git a/bbot/agent.cxx b/bbot/agent.cxx
index 4d5cc4a..634a94d 100644
--- a/bbot/agent.cxx
+++ b/bbot/agent.cxx
@@ -101,7 +101,11 @@ btrfs_exit (tracer& t, A&&... a)
: run_io_exit (t, fdnull (), fdnull (), 2, "btrfs", forward<A> (a)...);
}
-static bootstrapped_machine_manifest
+// Bootstrap the machine. Return the bootstrapped machine manifest if
+// successful and nullopt otherwise (in which case the machine directory
+// should be cleaned and the machine ignored for now).
+//
+static optional<bootstrapped_machine_manifest>
bootstrap_machine (const dir_path& md,
const machine_manifest& mm,
optional<bootstrapped_machine_manifest> obmm)
@@ -128,7 +132,7 @@ bootstrap_machine (const dir_path& md,
else
try
{
- string br ("br1"); // Use private bridge for now.
+ string br ("br1"); // Using private bridge for now.
// Start the TFTP server (server chroot is /build/tftp). Map:
//
@@ -138,6 +142,11 @@ bootstrap_machine (const dir_path& md,
auto_rmdir arm (dir_path ("/build/tftp/bootstrap/" + tc_name));
try_mkdir_p (arm.path ());
+ // Bootstrap result manifest.
+ //
+ path mf (arm.path () / "manifest");
+ try_rmfile (mf);
+
tftp_server tftpd ("Gr ^/?(.+)$ /toolchain/" + tc_name + "/\\1\n" +
"Pr ^/?(.+)$ /bootstrap/" + tc_name + "/\\1\n");
@@ -152,32 +161,85 @@ bootstrap_machine (const dir_path& md,
br,
tftpd.port ()));
- r.machine.mac = m->mac;
+ {
+ // If we are terminating with an exception then force the machine down.
+ // Failing that, the machine's destructor will block waiting for its
+ // completion.
+ //
+ auto mg (
+ make_exception_guard (
+ [&m, &md] ()
+ {
+ info << "trying to force machine " << md << " down";
+ try {m->forcedown ();} catch (const failed&) {}
+ }));
+
+ // What happens if the bootstrap process hangs? The simple thing would
+ // be to force the machine down after some timeout and then fail. But
+ // that won't be very helpful for investigating the cause. So instead
+ // the plan is to suspend it after some timeout, issue diagnostics
+ // (without failing and which Build OS monitor will relay to the admin),
+ // and wait for the external intervention.
+ //
+ auto soft_fail = [&md, &m] (const char* msg)
+ {
+ {
+ diag_record dr (error);
+ dr << msg << " for machine " << md << ", suspending";
+ m->print_info (dr);
+ }
+ m->suspend ();
+ m->wait ();
+ return nullopt;
+ };
+
+ // The first request should be the toolchain download. Wait for up to 60
+ // seconds for that to arrive. In a sense we use it as an indication
+ // that the machine has booted and the bootstrap process has started.
+ //
+ size_t to;
+ const size_t startup_to (60);
+ const size_t bootstrap_to (ops.bootstrap_timeout ());
+ const size_t shutdown_to (60);
+
+ if (!tftpd.serve ((to = startup_to)))
+ return soft_fail ("bootstrap startup timeout");
+
+ l2 ([&]{trace << "completed startup in " << startup_to - to << "s";});
+
+ // Next the bootstrap process may download additional toolchain
+ // archives, build things, and then upload the result manifest. So on
+ // our side we serve TFTP requests while periodically checking for the
+ // manifest file.
+ //
+ for (to = bootstrap_to; to != 0 && !file_exists (mf); tftpd.serve (to)) ;
+
+ if (to == 0)
+ return soft_fail ("bootstrap timeout");
+
+ l2 ([&]{trace << "completed bootstrap in " << bootstrap_to - to << "s";});
+
+ // Shut the machine down cleanly.
+ //
+ if (!m->shutdown ((to = shutdown_to)))
+ return soft_fail ("bootstrap shutdown timeout");
+
+ l2 ([&]{trace << "completed shutdown in " << shutdown_to - to << "s";});
+ }
- // The first request should be the toolchain download. Wait for up to 60
- // seconds for that to arrive. In a sense we use it as an indication that
- // the machine has booted and the bootstrap process has started.
+ // Parse the result manifest.
//
- size_t timeout (60);
- if (tftpd.serve (timeout))
+ try
{
- l2 ([&]{trace << "received first request in " << 60 - timeout << "s";});
+ r.bootstrap = parse_manifest<bootstrap_manifest> (mf, "bootstrap");
}
- else
+ catch (const failed&)
{
- // @@ What should be do here? Non-fatal? Mark the machine as failed?
- //
- error << "bootstrap timeout during first request for machine " << md;
- m->forcedown ();
- throw failed ();
+ error << "invalid bootstrap manifest for machine " << md;
+ return nullopt;
}
- if (!m->shutdown ())
- {
- error << "forcing machine " << md << " down";
- m->forcedown ();
- throw failed ();
- }
+ r.machine.mac = m->mac; // Save the MAC address.
}
catch (const system_error& e)
{
@@ -352,25 +414,25 @@ try
(r = cmp ("libbutl", LIBBUTL_VERSION)) != 0 ? r : 0;
};
- optional<bootstrapped_machine_manifest> obmm;
+ optional<bootstrapped_machine_manifest> bmm;
if (te)
{
- obmm = parse_manifest<bootstrapped_machine_manifest> (
+ bmm = parse_manifest<bootstrapped_machine_manifest> (
tp / "manifest", "bootstrapped machine");
- if (obmm->machine.id != mm.id)
+ if (bmm->machine.id != mm.id)
{
l2 ([&]{trace << "re-bootstrapping " << tp << ": new machine";});
te = false;
}
- if (obmm->toolchain.id != tc_id)
+ if (bmm->toolchain.id != tc_id)
{
l2 ([&]{trace << "re-bootstrapping " << tp << ": new toolchain";});
te = false;
}
- if (int i = compare_bbot (obmm->bootstrap))
+ if (int i = compare_bbot (bmm->bootstrap))
{
if (i < 0)
{
@@ -397,8 +459,14 @@ try
// bootstrap the new machine. Then atomically rename it to
// <name>-<toolchain>.
//
- bootstrapped_machine_manifest bmm (
- bootstrap_machine (xp, mm, move (obmm)));
+ bmm = bootstrap_machine (xp, mm, move (bmm));
+
+ if (!bmm)
+ {
+ l2 ([&]{trace << "ignoring " << tp << ": failed to bootstrap";});
+ btrfs (trace, "subvolume", "delete", xp);
+ break;
+ }
try
{
@@ -409,12 +477,10 @@ try
fail << "unable to rename " << xp << " to " << tp;
}
- te = true;
-
// Check the boostrapped bbot version as above and ignore this
// machine if it's newer than us.
//
- if (int i = compare_bbot (bmm.bootstrap))
+ if (int i = compare_bbot (bmm->bootstrap))
{
assert (i > 0);
l2 ([&]{trace << "ignoring " << tp << ": old bbot";});
diff --git a/bbot/bbot-agent@.service b/bbot/bbot-agent@.service
index f8349ff..af760b3 100644
--- a/bbot/bbot-agent@.service
+++ b/bbot/bbot-agent@.service
@@ -5,16 +5,23 @@ After=default.target
[Service]
Type=simple
+Environment=VERBOSE=1
+
Environment=CPU=1
Environment=RAM=1048576
-Environment=VERBOSE=1
+
+Environment=BOOTSTRAP_TIMEOUT=600
+Environment=BUILD_TIMEOUT=1800
+
Environment=TOOLCHAIN_ID=123abc
Environment=TOOLCHAIN_NUM=1
ExecStart=/build/bbot/%i/bin/bbot-agent --systemd-daemon \
+ --verbose ${VERBOSE} \
--cpu ${CPU} \
--ram ${RAM} \
- --verbose ${VERBOSE} \
+ --bootstrap-timeout ${BOOTSTRAP_TIMEOUT} \
+ --build-timeout ${BUILD_TIMEOUT} \
%i \
${TOOLCHAIN_NUM} \
${TOOLCHAIN_ID}
diff --git a/bbot/buildfile b/bbot/buildfile
index 5498458..311775d 100644
--- a/bbot/buildfile
+++ b/bbot/buildfile
@@ -67,7 +67,7 @@ if $cli.configured
# Usage options.
#
cli.options += --suppress-undocumented --long-usage --ansi-color \
---page-usage 'bbot::print_$name$_' --option-length 20
+--page-usage 'bbot::print_$name$_' --option-length 23
# Include generated cli files into the distribution.
#
diff --git a/bbot/machine b/bbot/machine
index c2942ac..f99d11b 100644
--- a/bbot/machine
+++ b/bbot/machine
@@ -12,21 +12,51 @@ namespace bbot
{
// A running build machine (container, vm, etc).
//
+ // Note that if the machine is destroyed while it is still running, the
+ // destructor will block until the machine process terminates.
+ //
class machine
{
public:
- // Shut the machine down cleanly. Return false if machine is still
- // running, true if machine exited successfully, and throw failed
- // otherwise.
+ // Shut the machine down cleanly waiting up to the specified number of
+ // seconds for completion. Update the timeout and return false if the
+ // machine is still running, true if the machine exited successfully, and
+ // throw failed otherwise.
//
virtual bool
- shutdown () = 0;
+ shutdown (size_t& seconds) = 0;
// Force the machine down.
//
virtual void
forcedown () = 0;
+ // Suspend the machine.
+ //
+ virtual void
+ suspend () = 0;
+
+ // Wait for the machine to terminate up to the specified number of
+ // seconds. Update the timeout and return false if the machine is still
+ // running, true if the machine exited successfully, and throw failed
+ // otherwise.
+ //
+ virtual bool
+ wait (size_t& seconds) = 0;
+
+ bool
+ wait ()
+ {
+ size_t sec (~0); // Wait indefinitely.
+ return wait (sec);
+ }
+
+ // Print information about the machine (as info diagnostics) that can be
+ // useful for debugging (e.g., how to connect/login, etc).
+ //
+ virtual void
+ print_info (diag_record&) = 0;
+
public:
const string mac; // MAC address (inside the machine).
diff --git a/bbot/machine.cxx b/bbot/machine.cxx
index 8cad3f9..460e802 100644
--- a/bbot/machine.cxx
+++ b/bbot/machine.cxx
@@ -104,15 +104,23 @@ namespace bbot
uint16_t tftp_port);
virtual bool
- shutdown () override;
+ shutdown (size_t& seconds) override;
virtual void
forcedown () override;
- private:
+ virtual void
+ suspend () override;
+
bool
- wait (size_t seconds);
+ wait (size_t& seconds) override;
+
+ using machine::wait;
+ virtual void
+ print_info (diag_record&) override;
+
+ private:
void
monitor_command (const string&);
@@ -123,6 +131,7 @@ namespace bbot
string tap; // Tap network interface.
uint16_t port; // TFTP port.
+ string vnc; // QEMU VNC TCP addr:port.
path monitor; // QEMU monitor UNIX socket.
process proc;
};
@@ -140,6 +149,7 @@ namespace bbot
br (br),
tap (create_tap (br, port)),
port (port),
+ vnc ("127.0.0.1:" + to_string (5900 + stoul (tc_num))),
monitor ("/tmp/" + tc_name + "-monitor")
{
tracer trace ("kvm_machine");
@@ -185,7 +195,7 @@ namespace bbot
//
// VNC & monitor.
//
- "-vnc", "localhost:" + tc_num, // 5900 + tc_num
+ "-vnc", "127.0.0.1:" + tc_num, // 5900 + tc_num
"-monitor", "unix:" + monitor.string () + ",server,nowait");
}
@@ -208,30 +218,45 @@ namespace bbot
// forcedown().
//
bool kvm_machine::
- shutdown ()
+ shutdown (size_t& seconds)
{
monitor_command ("system_powerdown");
- // Wait for up to 10 seconds for the machine to shutdown.
+ // Wait for up to the specified number of seconds for the machine to
+ // shutdown.
//
- return wait (10);
+ return wait (seconds);
}
void kvm_machine::
forcedown ()
{
monitor_command ("system_reset");
- wait (size_t (~0)); // Wait indefinitely.
+ wait ();
+ }
+
+ void kvm_machine::
+ suspend ()
+ {
+ monitor_command ("stop");
+ }
+
+ void kvm_machine::
+ print_info (diag_record& dr)
+ {
+ dr << info << "qemu pid: " << proc.id ()
+ << info << "qemu vnc: " << vnc
+ << info << "qemu monitor: unix:" << monitor;
}
bool kvm_machine::
- wait (size_t sec)
+ wait (size_t& sec)
try
{
tracer trace ("kvm_machine::wait");
bool t;
- for (size_t i (0); !(t = proc.try_wait ()) && i != sec; ++i)
+ for (; !(t = proc.try_wait ()) && sec != 0; --sec)
sleep (1);
if (t)
diff --git a/bbot/tftp b/bbot/tftp
index cdcbfe0..581d41c 100644
--- a/bbot/tftp
+++ b/bbot/tftp
@@ -28,9 +28,9 @@ namespace bbot
uint16_t
port () const;
- // Wait for a TFTP request for up to the specified number of seconds. If
- // a request was served, update the timeout value and return true. Retain
- // the original timeout value and return false otherwise.
+ // Wait for a TFTP request for up to the specified number of seconds.
+ // Update the timeout value as well as return true if a request was
+ // served and false otherwise.
//
bool
serve (size_t& seconds);
diff --git a/bbot/tftp.cxx b/bbot/tftp.cxx
index 9c783c5..27d58a4 100644
--- a/bbot/tftp.cxx
+++ b/bbot/tftp.cxx
@@ -92,12 +92,13 @@ namespace bbot
throw_system_error (errno);
}
else if (r == 0) // Timeout.
+ {
+ sec = 0;
return false;
+ }
if (FD_ISSET (fd, &rd))
{
- text << "connection";
-
// The inetd "protocol" is to pass the socket as stdin/stdout file
// descriptors.
//
diff --git a/tests/agent/testscript b/tests/agent/testscript
index 9198460..cf92ace 100644
--- a/tests/agent/testscript
+++ b/tests/agent/testscript
@@ -23,6 +23,7 @@ test.arguments = stage 1
cp = $src_base/btrfs-cpdir -f /build/machines.orig /build/machines
rm = $src_base/btrfs-rmdir /build/machines
+#\
: dump-machines
:
{
@@ -112,6 +113,7 @@ rm = $src_base/btrfs-rmdir /build/machines
}
#\
+
: bootstrap
:
{
@@ -135,4 +137,3 @@ rm = $src_base/btrfs-rmdir /build/machines
#-$rm
}
-#\