diff options
-rw-r--r-- | bbot/agent/agent.cxx | 29 | ||||
-rw-r--r-- | bbot/agent/machine.cxx | 110 | ||||
-rw-r--r-- | bbot/agent/machine.hxx | 6 |
3 files changed, 130 insertions, 15 deletions
diff --git a/bbot/agent/agent.cxx b/bbot/agent/agent.cxx index 2378fc2..be3b706 100644 --- a/bbot/agent/agent.cxx +++ b/bbot/agent/agent.cxx @@ -211,6 +211,7 @@ bootstrap_machine (const dir_path& md, { m->suspend (false); m->wait (false); + m->cleanup (); info << "resuming after machine suspension"; } catch (const failed&) {} @@ -236,7 +237,10 @@ bootstrap_machine (const dir_path& md, // Failed, exit code diagnostics has already been issued. } - error << "machine " << md << " exited unexpectedly"; + diag_record dr (error); + dr << "machine " << md << " exited unexpectedly"; + m->print_info (dr); + return false; }; @@ -273,8 +277,11 @@ bootstrap_machine (const dir_path& md, if (retry > ops.bootstrap_retries ()) return soft_fail ("bootstrap startup timeout"); - warn << "machine " << mm.name << " appears to have " - << "mis-booted, retrying"; + // Note: keeping the logs behind (no cleanup). + + diag_record dr (warn); + dr << "machine " << mm.name << " mis-booted, retrying"; + m->print_info (dr); try {m->forcedown (false);} catch (const failed&) {} continue; @@ -322,6 +329,8 @@ bootstrap_machine (const dir_path& md, return soft_fail ("bootstrap shutdown timeout"); l3 ([&]{trace << "completed shutdown in " << shutdown_to - to << "s";}); + + m->cleanup (); } // Parse the result manifest. @@ -897,6 +906,7 @@ try { m->suspend (false); m->wait (false); + m->cleanup (); info << "resuming after machine suspension"; } catch (const failed&) {} @@ -916,7 +926,10 @@ try { } - error << "machine " << xp << " exited unexpectedly"; + diag_record dr (error); + dr << "machine " << xp << " exited unexpectedly"; + m->print_info (dr); + return false; }; @@ -945,8 +958,11 @@ try if (retry > ops.build_retries ()) return soft_fail ("build startup timeout"); - warn << "machine " << mm.machine.name << " appears to have " - << "mis-booted, retrying"; + // Note: keeping the logs behind (no cleanup). + + diag_record dr (warn); + dr << "machine " << mm.machine.name << " mis-booted, retrying"; + m->print_info (dr); try {m->forcedown (false);} catch (const failed&) {} continue; @@ -1010,6 +1026,7 @@ try // lease instead of a new one. // try {m->forcedown (false);} catch (const failed&) {} + m->cleanup (); } } diff --git a/bbot/agent/machine.cxx b/bbot/agent/machine.cxx index fdc11c0..2566ca4 100644 --- a/bbot/agent/machine.cxx +++ b/bbot/agent/machine.cxx @@ -189,6 +189,9 @@ namespace bbot using machine::wait; virtual void + cleanup () override; + + virtual void print_info (diag_record&) override; private: @@ -202,6 +205,8 @@ namespace bbot tap net; // Tap network interface. string vnc; // QEMU VNC TCP addr:port. path monitor; // QEMU monitor UNIX socket. + path log; // QEMU log (QMP read end). + auto_fd qmp; // QMP write end. process proc; }; @@ -229,6 +234,18 @@ namespace bbot if (sizeof (sockaddr_un::sun_path) <= monitor.size ()) throw invalid_argument ("monitor unix socket path too long"); + // Machine name. + // + // While we currently can only have one running machine per toolchain, add + // the instance number for debuggability. + // + string name (mm.name + '-' + tc_name + '-' + to_string (inst)); + + // Machine log. Note that it is only removed with an explicit cleanup() + // call. + // + log = path ("/tmp/" + path::traits::temp_name (name) + ".log"); + // Map logical CPUs to sockets/cores/threads keeping the number of and // cores even. Failed that, QEMU just makes it a machine with that number // of sockets and some operating systems (like Windows) can only do two. @@ -315,20 +332,45 @@ namespace bbot //"-device", "scsi-hd,drive=disk0" } - // Start the VM. + // Setup QMP (QEMU Machine Protocol) monitor to act as a log. // - // Notes: + // Note that we still have to tell it our "capabilities" so while it will + // write to a log file, we need a pipe it will read from. // - // 1. echo system_powerdown | socat - UNIX-CONNECT:.../monitor + fdpipe qmp_in; + try + { + qmp_in = fdopen_pipe (); + } + catch (const io_error& e) + { + fail << "unable to create QMP input pipe: " << e; + } + + auto_fd qmp_out; + try + { + qmp_out = fdopen (log, (fdopen_mode::out | + fdopen_mode::create | + fdopen_mode::exclusive)); + } + catch (const io_error& e) + { + fail << "unable to create QMP output file: " << e; + } + + // Start the VM. // const char* env[] = {"QEMU_AUDIO_DRV=none", // Disable audio output. nullptr}; proc = run_io_start ( trace, - fdnull (), - 2, - 2, + qmp_in, + 2, // 1>&2 (QMP goes to stdout) + qmp_out, process_env (kvm, md, env), // Run from the machine's directory. + "-name", name + ",debug-threads=on", + "-S", // Start suspended. "-boot", "c", // Boot from disk. "-no-reboot", // Exit on VM reboot. "-m", to_string (ram / 1024) + "M", @@ -337,15 +379,64 @@ namespace bbot ",sockets=" + to_string (sockets) + ",cores=" + to_string (cores) + ",threads=" + to_string (threads)), - // + // RTC settings. // "-rtc", "clock=vm,driftfix=slew", "-no-hpet", "-global", "kvm-pit.lost_tick_policy=discard", + os, + + // VNC. + // "-vnc", "127.0.0.1:" + to_string (offset), // 5900 + offset - "-monitor", "unix:" + monitor.string () + ",server,nowait"); + + // QMP. + // + "-chardev", "stdio,id=qmp", + "-mon", "chardev=qmp,mode=control,pretty=on", + + // Monitor. + // + "-chardev", "socket,id=mon,path=" + monitor.string () + ",server,nowait", + "-mon", "chardev=mon,mode=readline"); + + qmp_out.close (); + qmp_in.in.close (); + qmp = move (qmp_in.out); + + // Wait for the QMP greeting. One day we will find a better way. + // + sleep (1); + + try + { + ofdstream os (move (qmp)); + os << "{ \"execute\": \"qmp_capabilities\" }" << endl; + qmp = os.release (); + } + catch (const io_error& e) + { + fail << "unable to initialize QMP: " << e; + } + + // Start execution. + // + try + { + monitor_command ("cont"); + } + catch (const system_error& e) + { + fail << "unable to communicate with qemu monitor: " << e; + } + } + + void kvm_machine:: + cleanup () + { + try_rmfile (log, true /* ignore_errors */); } // Connect to the QEMU monitor via the UNIX socket and send system_reset. @@ -434,6 +525,7 @@ namespace bbot print_info (diag_record& dr) { dr << info << "qemu pid: " << proc.id () + << info << "qemu log: " << log << info << "qemu vnc: " << vnc << info << "qemu monitor: unix:" << monitor; } @@ -453,7 +545,7 @@ namespace bbot { run_io_finish (trace, proc, kvm, fh); net.destroy (); //@@ Always fails hard. - try_rmfile (monitor, true); // QEMU doesn't seem to remove it. + try_rmfile (monitor, true /* ignore_errors */); // QEMU doesn't do it. } return t; diff --git a/bbot/agent/machine.hxx b/bbot/agent/machine.hxx index 04da80e..46ad544 100644 --- a/bbot/agent/machine.hxx +++ b/bbot/agent/machine.hxx @@ -53,6 +53,12 @@ namespace bbot return wait (sec, fail_hard); } + // Cleanup machine resources (logs, etc). Normally you would only call + // this after a successful (or successfully investigated) completion. + // + virtual void + cleanup () = 0; + // Print information about the machine (as info diagnostics) that can be // useful for debugging (e.g., how to connect/login, etc). // |