From 1fd86636b0d70d754caf205d8048893a9c9793c3 Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Thu, 7 Feb 2019 08:02:47 +0200 Subject: Various improvements and fixes --- bootstrap | 2 +- buildos | 538 ++++++++++++++++++++++++++++++++++----------------------- doc/manual.cli | 41 +++-- init | 39 +++-- 4 files changed, 386 insertions(+), 234 deletions(-) diff --git a/bootstrap b/bootstrap index 78dd42e..e606c9f 100755 --- a/bootstrap +++ b/bootstrap @@ -67,7 +67,7 @@ base_pkgs+=",qemu-kvm,qemu-utils,socat" base_pkgs+=",g++,make,pkg-config" -extra_pkgs="ca-certificates,smartmontools" +extra_pkgs="ca-certificates,time,dmidecode,smartmontools" owd="$(pwd)" trap "{ cd '$owd'; exit 1; }" ERR diff --git a/buildos b/buildos index 012869b..52e984e 100755 --- a/buildos +++ b/buildos @@ -51,7 +51,7 @@ info "starting build os monitor..." # # First we separete quoted variables and arguments with newlines (giving # priority to assignments). Then we replace whitespaces with newline on -# lines that don't contain quites. Finally, clean up by removing blank +# lines that don't contain quites. Finally, we clean up by removing blank # lines. # # Note: the same code as in init. @@ -137,7 +137,9 @@ function toolchain_value () # echo "${!n}" } +instances=0 # Number of bbot instances across all toolchains. toolchain_names=() + for tn in "${!toolchains[@]}"; do tp="${toolchains["$tn"]}" tu="$(toolchain_value "$tp" toolchain_url)" @@ -161,8 +163,28 @@ for tn in "${!toolchains[@]}"; do declare "${tp}toolchain_ver=" declare "${tp}toolchain_fver=" # Full version (with snapshot). - # If buildos.toolchain_trust was not specified, set it to "no" so that - # we don't prompt if the repository happens to be signed. + # Default to 1 bbot agent instance. + # + if [ -z "$(toolchain_value "$tp" instances)" ]; then + declare "${tp}instances=1" + fi + + instances=$(($instances + $(toolchain_value "$tp" instances))) + + # Default to 0 nice value. + # + if [ -z "$(toolchain_value "$tp" nice)" ]; then + declare "${tp}nice=0" + fi + + # Default to br1 (private/NAT bridge). + # + if [ -z "$(toolchain_value "$tp" bridge)" ]; then + declare "${tp}bridge=br1" + fi + + # If toolchain_trust was not specified, set it to "no" so that we don't + # prompt if the repository happens to be signed. # if [ -z "$(toolchain_value "$tp" toolchain_trust)" ]; then declare "${tp}toolchain_trust=no" @@ -180,27 +202,48 @@ if [ "${#toolchain_names[@]}" -eq 0 ]; then info "no buildos.toolchain_url specified, not bootstrapping" fi -# Divide CPUs and RAM (in kB) among the toolchains. +# Divide CPUs and RAM (in KB) among the instances. +# +# By default reserve 4G of RAM for ourselves (rootfs, tmpfs). # -# Reserve 4G of RAM for ourselves (rootfs, tmpfs). +# Note that MemTotal in /proc/meminfo is the available memory, not physical. +# And to make it easier to provision memory it's really helpful to base it +# in the physical value. # -ram_total="$(sed -n -re 's/^MemTotal: *([0-9]+) *kB$/\1/p' ... for v in /build/machines/*; do if [ ! -d "$v" ]; then diag+=("$v: error: invalid volume") - fail="true" + fail=true continue fi @@ -292,7 +343,7 @@ function machines_for () # ... for m in *; do if [ ! -d "$m" ]; then diag+=("$v/$m: error: invalid machine") - fail="true" + fail=true continue fi @@ -308,19 +359,29 @@ function machines_clean_subvolume () # { if ! btrfs property set -ts "$1" ro false; then diag+=("$1: error: unable to change subvolume property") - fail="true" + fail=true return 1 fi if ! btrfs subvolume delete "$1"; then diag+=("$1: error: unable to delete subvolume") - fail="true" + fail=true + return 1 + fi +} + +function machines_clean_lockfile () # +{ + if ! rm -f "$1"; then + diag+=("$1: error: unable to delete lockfile") + fail=true return 1 fi } # Cleanup the -- entries for the specified toolchain -# called before starting each toolchain. +# (all instances) as well as -.lock file. Called before +# starting bbot instances for each toolchain. # function machines_clean_toolchain () # { @@ -330,24 +391,33 @@ function machines_clean_toolchain () # cd "$m" - local s - for s in "$m"-"$tn"-*; do + local i + for i in "$m"-"$tn"-*; do - if [ ! -d "$s" ]; then - diag+=("$v/$m/$s: error: invalid machine subvolume") - fail="true" + if [ ! -d "$i" ]; then + diag+=("$v/$m/$i: error: invalid machine subvolume") + fail=true continue fi - if machines_clean_subvolume "$v/$m/$s"; then - diag+=("$v/$m/$s: info: deleted stray toolchain working subvolume") + if machines_clean_subvolume "$v/$m/$i"; then + diag+=("$v/$m/$i: info: deleted stray toolchain working subvolume") fi done + i="$m-$tn.lock" + if [ -f "$i" ]; then + + if machines_clean_lockfile "$v/$m/$i"; then + diag+=("$v/$m/$i: info: deleted stray lockfile") + fi + fi + cd "$v" } -# Cleanup stray snapshots or deleted machines. Called once during startup. +# Cleanup stray snapshots and lockfiles as well as deleted machines. Called +# once during startup. # function machines_clean_stray () # { @@ -359,35 +429,48 @@ function machines_clean_stray () # # Collect current machine symlink's bootstrap protocol numbers. If there # are no current machine symlinks, then we delete the whole thing. # - local s ps=() - for s in "$m"-*; do - if [[ "$s" =~ ^"$m"-[0-9]+$ ]]; then + local i ps=() + for i in "$m"-*; do + if [[ "$i" =~ ^"$m"-[0-9]+$ ]]; then - if [ ! -L "$s" ]; then - diag+=("$v/$m/$s: error: not a symlink") - fail="true" + if [ ! -L "$i" ]; then + diag+=("$v/$m/$i: error: not a symlink") + fail=true fi # Treat it as if it were a symlink even if its not. Failed that we # may try to delete the whole thing. # - ps+=("$(sed -n -re 's/^.+-([0-9]+)$/\1/p' <<<"$s")") + ps+=("$(sed -n -re 's/^.+-([0-9]+)$/\1/p' <<<"$i")") fi done # Examine each machine subvolume. # - for s in "$m"-*; do + for i in "$m"-*; do # -

(current machine symlink) # - if [[ "$s" =~ ^"$m"-[0-9]+$ ]]; then + if [[ "$i" =~ ^"$m"-[0-9]+$ ]]; then continue fi - if [ ! -d "$s" ]; then - diag+=("$v/$m/$s: error: invalid machine subvolume") - fail="true" + # Lockfile. + # + if [ -f "$i" ]; then + + if [[ "$i" =~ ^"$m"-.+\.lock$ ]]; then + + if machines_clean_lockfile "$v/$m/$i"; then + diag+=("$v/$m/$i: info: deleted lockfile") + fi + continue + fi + fi + + if [ ! -d "$i" ]; then + diag+=("$v/$m/$i: error: invalid machine subvolume") + fail=true continue fi @@ -400,8 +483,8 @@ function machines_clean_stray () # # local p f= for p in "${ps[@]}"; do - if [[ "$s" =~ ^"$m"-"$p"\.[0-9]+$ ]]; then - f="true" + if [[ "$i" =~ ^"$m"-"$p"\.[0-9]+$ ]]; then + f=false break fi done @@ -415,8 +498,8 @@ function machines_clean_stray () # f= local tn for tn in "${toolchain_names[@]}"; do - if [[ "$s" =~ ^"$m"-"$tn"$ ]]; then - f="true" + if [[ "$i" =~ ^"$m"-"$tn"$ ]]; then + f=false break fi done @@ -426,11 +509,11 @@ function machines_clean_stray () # fi fi - # This is either a stray working submodule or a bootsrapped subvolume + # This is either a stray working subvolume or a bootsrapped subvolume # for a toolchain that was deleted (or we are deleting everything). # - if machines_clean_subvolume "$v/$m/$s"; then - diag+=("$v/$m/$s: info: deleted subvolume") + if machines_clean_subvolume "$v/$m/$i"; then + diag+=("$v/$m/$i: info: deleted subvolume") fi done @@ -443,7 +526,7 @@ function machines_clean_stray () # diag+=("$v/$m: info: deleted machine directory") else diag+=("$v/$m: error: unable to delete machine directory") - fail="true" + fail=true fi fi } @@ -463,8 +546,7 @@ if [ "${#diag[@]}" -gt 0 ]; then info "$s" && print_diag 1>&2 if [ -n "$fail" ]; then - info "correct and restart the monitor (systemctl restart buildos)" - exit 1 + error "correct and restart the monitor (systemctl restart buildos)" fi fi @@ -513,7 +595,7 @@ function toolchain_fetch () # return 1 fi - info "toolchain '$tn' version $tv" + info "toolchain $tn version $tv" declare -g "${tp}toolchain_fver=$tv" # Full version. echo "$tv" >"$tr/version-full" @@ -713,9 +795,12 @@ function bbot_check () # function bbot_start () # { local tn="$1" - local ti="$2" + local tx="$2" local tp="${toolchains["$tn"]}" + local tc="$(toolchain_value "$tp" nice)" + local tb="$(toolchain_value "$tp" bridge)" + local ti="$(toolchain_value "$tp" instances)" local tv="$(toolchain_value "$tp" toolchain_fver)" local ts="$(toolchain_value "$tp" toolchain_file_csum)" @@ -741,9 +826,13 @@ function bbot_start () # # if [ "$b_word" = "configured" ]; then - if ! sudo systemctl stop "bbot-agent@$tn"; then - info "failed to stop bbot-agent@$tn service, assuming not running" - fi + for ((i=1; i <= ti; i++)); do + if ! sudo systemctl stop "bbot-agent-$tn@$i"; then + info "failed to stop bbot-agent-$tn@$i service, assuming not running" + continue + fi + info "stopped bbot-agent-$tn@$i service" + done # We may not be able to uninstall if we previously failed to build. # @@ -752,37 +841,45 @@ function bbot_start () # fi fi - # Build and install the bbot agent. Since other agents might already - # be running, limit the number of jobs to our slice. + # Build and install the bbot agent. Since other agents might already be + # running, limit the number of jobs to our slice. # - if ! bpkg --fetch-timeout "$timeout" \ - --build-option --jobs --build-option "$cpu_slice" \ + if ! bpkg --fetch-timeout "$timeout" \ + --build-option --jobs=$(($ti * $cpu_slice)) \ build --yes libbbot bbot; then - info "failed to build bbot-agent@$tn" + info "failed to build bbot-agent for $tn" break fi if ! bpkg install "${vars[@]}" bbot; then - info "failed to install bbot-agent@$tn" + info "failed to install bbot-agent for $tn" break fi - # Post-process and install systemd .service file. Note that we cannot use - # the systemd pattern machinery since each version of bbot can have its - # own version of the .service file. + # Post-process and install the systemd .service file. Since we may have + # multiple toolchains, we embed the toolchain name into the service name + # with the systemd pattern machinery used to run multiple bbot instances + # per toolchain. + # + # We assume `%I` is only used in Description and similar and rewrite it + # as `/%i` (e.g., `stage/1`). # sed -i -r \ - -e "s/%[iI]/$tn/g" \ - -e "s#^(Environment=AUTH_KEY)=.*#\1=/state/etc/host-key.pem#" \ + -e "s#%I#$tn/%I#g" \ -e "s/^(Environment=CPU)=.*/\1=$cpu_slice/" \ -e "s/^(Environment=RAM)=.*/\1=$ram_slice/" \ + -e "s/^(Environment=BRIDGE)=.*/\1=$tb/" \ + -e "s#^(Environment=AUTH_KEY)=.*#\1=/state/etc/host-key.pem#" \ -e "s/^(Environment=TOOLCHAIN_ID)=.*/\1=$ts/" \ - -e "s/^(Environment=TOOLCHAIN_NUM)=.*/\1=$ti/" \ + -e "s/^(Environment=TOOLCHAIN_NUM)=.*/\1=$tx/" \ -e "s/^(Environment=TOOLCHAIN_VER)=.*/\1=$tv/" \ + -e "s/^(Environment=TOOLCHAIN_NAME)=.*/\1=$tn/" \ + -e "s/^(Nice)=.*/\1=$tc/" \ + -e "s#^ExecStart=[^ ]+(.*)#ExecStart=$id/bin/bbot-agent\1#" \ "$id/lib/systemd/system/bbot-agent@.service" # Patch in the controller URLs. These can contain special characters - # like & so we have to escape them. + # like `&` so we have to escape them. # n="${tp}controller_url[@]" for i in "${!n}"; do @@ -801,8 +898,11 @@ function bbot_start () # "$id/lib/systemd/system/bbot-agent@.service" done - sudo ln -sf "$id/lib/systemd/system/bbot-agent@.service" \ - "/usr/lib/systemd/system/bbot-agent@$tn.service" + # Note: using a hard link to prevent systemd from being too clever and + # calling the service bbot-agent@. + # + sudo ln -f "$id/lib/systemd/system/bbot-agent@.service" \ + "/usr/lib/systemd/system/bbot-agent-$tn@.service" # Clean up any machine snapshots that might have been left behind. # @@ -818,25 +918,28 @@ function bbot_start () # print_diag 1>&2 if [ -n "$fail" ]; then - info "correct and start bbot-agent@$tn (systemctl start bbot-agent@$tn)" + info "correct and start bbot-agent for $tn (systemctl start bbot-agent-$tn@N)" break fi fi - # Start the service. With Type=simple start returns as soon as the process - # has forked. To see if the service actually started is done as part of - # service monitoring. + # Start each service instance. With Type=simple start returns as soon as + # the process has forked. Making sure the service has actually started is + # done as part of the service monitoring. # - if ! sudo systemctl start "bbot-agent@$tn"; then - info "failed to start bbot-agent@$tn service" - break - fi - r=0 + for ((i=1; i <= ti; i++)); do + if ! sudo systemctl start "bbot-agent-$tn@$i"; then + info "failed to start bbot-agent-$tn@$i service instance" + r=1 + break + fi + done + break done - cd "$owd" + cd "$owd" return "$r" } @@ -855,6 +958,27 @@ while true; do count=$(($count + 1)) + # Check for OS changes. Do this first in case of any issues in the following + # checks. + # + if [ -n "$buildid_url" ]; then + # Fetch the current id. While normally it will be a TFTP URL, it could also + # be HTTP(S) so we configure sensible behavior for that. + # + if id="$("${curl[@]}" "$buildid_url")"; then + if [ "$id" != "$buildid" ]; then + email "rebooting because of new os build" <&1 | tee "$tr/toolchain-$count.log" 1>&2 @@ -917,15 +1041,15 @@ EOF tv="$(cat $tr/version-full)" declare "${tp}toolchain_fver=$tv" - s="bootstrapped '$tn' toolchain $tv" + s="bootstrapped $tn toolchain $tv" toolchain_boots+=("$tn") ;; 1) - s="skipping disabled '$tn' toolchain, waiting for new version" + s="skipping disabled $tn toolchain, waiting for new version" toolchain_boots+=("") # Skip. ;; *) - s="failed to bootstrap '$tn' toolchain, waiting for new version" + s="failed to bootstrap $tn toolchain, waiting for new version" toolchain_boots+=("") # Skip. ;; esac @@ -946,10 +1070,10 @@ EOF # if [ "${#toolchain_names[@]}" -eq "${#toolchain_boots[@]}" ]; then - ti=0 # Toolchain index. + tx=0 # Toolchain index. for tn in "${toolchain_boots[@]}"; do - ti=$(($ti + 1)) + tx=$(($tx + 1)) # Skip those that failed to bootstrap. # @@ -958,6 +1082,7 @@ EOF fi tp="${toolchains["$tn"]}" + ti="$(toolchain_value "$tp" instances)" tr="$(toolchain_value "$tp" toolchain_root)" # Or those that have no controllers (maybe it would have been better @@ -969,171 +1094,160 @@ EOF fi s= - bbot_check "$tn" 2>&1 | tee "$tr/bbot-$count.log" 1>&2 + bbot_check "$tn" 2>&1 | tee "$tr/bbot-agent-$count.log" 1>&2 case "${PIPESTATUS[0]}" in 0) - rm -f "$tr/bbot-$count.log" + rm -f "$tr/bbot-agent-$count.log" - # Check if the service has failed. + # For each service instance check if it has failed. # - if sudo systemctl is-failed --quiet "bbot-agent@$tn"; then - s="bbot-agent@$tn service has failed, stopping" - - # Note: ignore errors. - # - sudo systemctl status "bbot-agent@$tn" 2>&1 | \ - tee "$tr/bbot-$count.log" 1>&2 - - # Reset it so that we don't keep sending the log on each - # iteration. Note: ignore errors. - # - sudo systemctl reset-failed "bbot-agent@$tn" 2>&1 | \ - tee -a "$tr/bbot-$count.log" 1>&2 - else - # See if there is any diagnostics in the systemd journal. We - # notify about warning and up. - # - # The old versions journalctl behavior is to not output anything - # (not even the cursor) if there are no new entries. The new - # versions output the old cursor. - # - # Plus, it sometimes changes the cursor even without any errors in - # it (journal rewind/truncation maybe?) so we have to detect that. - # - c=(sudo journalctl --no-pager --quiet --output short-full \ - --unit "bbot-agent@$tn") - - # Get the last cursor if any. - # - oc="${toolchain_cursors["$tn"]}" - if [ -n "$oc" ]; then - c+=("--after-cursor" "$oc") - fi + for ((i=1; i <= ti; i++)); do + + if sudo systemctl is-failed --quiet "bbot-agent-$tn@$i"; then + s="bbot-agent-$tn@$i service has failed, stopping" + + # Note: ignore errors. + # + sudo systemctl status "bbot-agent-$tn@$i" 2>&1 | \ + tee "$tr/bbot-agent-$i-$count.log" 1>&2 + + # Reset it so that we don't keep sending the log on each + # iteration. Note: ignore errors. + # + sudo systemctl reset-failed "bbot-agent-$tn@$i" 2>&1 | \ + tee -a "$tr/bbot-agent-$i-$count.log" 1>&2 + + info "$s" + email "$s" <&1 | tee -a "$tr/bbot-$count.log" 1>&2 + bbot_start "$tn" "$tx" 2>&1 | tee -a "$tr/bbot-agent-$count.log" 1>&2 if [ "${PIPESTATUS[0]}" -eq 0 ]; then - s="${s}started bbot-agent@$tn" + s="${s}started bbot-agent for $tn, $ti instances" else - s="failed to ${s}start bbot-agent@$tn, waiting for new version" + s="failed to ${s}start bbot-agent for $tn, waiting for new version" fi ;; *) - s="failed to fetch package information for '$tn' toolchain, will try again" + s="failed to fetch package information for $tn toolchain, will try again" ;; esac info "$s" email "$s" <} (values without the toolchain name use the toolchain name \c{default}). The toolchain name may not contain \c{-}. +Each toolchain may also execute multiple \c{bbot} agent instances. The number +of instances is specified with the \c{buildos.instances[.]} parameter. + +All \c{bbot} agent instances of a toolchain are executed with the same nice +value which can be specified with the \c{buildos.nice[.]} parameter. It +should be between -20 (highest priority) and 19 (lowest priority) with 0 +being the default. See \cb{sched(7)} for details. + +The bridge interface to be used for machine networking can be specified with +the \c{buildos.bridge[.]} parameter. Valid values are \c{br0} (public +bridge to the physical interface) and \c{br1} (private/NAT'ed bridge to +\c{br0}). If unspecified, \c{br1} is used by default. + In the checksums file blank lines and lines that start with \c{#} are ignored. If the first line is the special \c{disabled} value, then this toolchain is ignored. Otherwise, each line in the checksums file is the output of the @@ -564,7 +582,10 @@ If the machine has been suspended, it can be resumed using the following command: \ -echo cont | ssh build@build socat - UNIX-CONNECT:/tmp/-monitor +echo cont | ssh build@build socat - UNIX-CONNECT:/tmp/monitor-- \ +Other useful QEMU monitor commands are \c{system_powerdown} and +\c{system_reset}. + " diff --git a/init b/init index 672c9f1..d84e0bb 100755 --- a/init +++ b/init @@ -84,7 +84,7 @@ sensors-detect --auto # # First we separete quoted variables and arguments with newlines (giving # priority to assignments). Then we replace whitespaces with newline on -# lines that don't contain quites. Finally, clean up by removing blank +# lines that don't contain quotes. Finally, clean up by removing blank # lines. # # Note: the same code as in buildos. @@ -196,10 +196,18 @@ if [ -z "$eth" ]; then error fi -mac="$(cat "/sys/class/net/$eth/address")" -mid="$(sed -e 's/://g' <<<"$mac")" # Machine id. +# Global and local MAC addresses (used below for br0 and br1, respectively). +# Derive the local address from the global by fixing the first octet to 02 +# (locally-assigned). +# +gmac="$(cat "/sys/class/net/$eth/address")" +lmac="$(sed -re 's/..:(.+)/02:\1/g' <<<"$gmac")" + +info "configured $eth ($gmac)" -info "configured $eth ($mac)" +# Machine id. +# +mid="$(sed -re 's/://g' <<<"$gmac")" # Set the hostname. # @@ -219,12 +227,17 @@ info "hostname $hname" # dhclient -x 2>/dev/null -# @@ Need to be made configurable. +# @@ Needs to be made configurable. Something like 172.23.0.0/16. # -priv_network="172.16.123.0" -priv_netmask="255.255.255.0" -priv_netbase="$(sed -e 's/^\(.*\)\.0$/\1/' <<<"$priv_network")" +priv_network="172.23.0.0" +priv_netmask="255.255.0.0" +priv_netbase="$(sed -e 's/^\(.*\)\.0\.0$/\1/' <<<"$priv_network")" +# Note that if we don't assign the bridge MAC address, then it will keep +# changing every time an interface with a greater address (e.g., a tap) +# joins the bridge. Needless to say, constantly changing MAC will wreck +# all kinds of networking havoc. +# cat </etc/network/interfaces auto lo iface lo inet loopback @@ -237,18 +250,22 @@ iface br0 inet dhcp bridge_stp off bridge_maxwait 0 bridge_fd 0 - bridge_hw $mac + bridge_hw $gmac + post-up ip link set $eth txqueuelen 4000 + post-up ip link set br0 txqueuelen 4000 # Private bridge with NAT to br0. # auto br1 iface br1 inet static - address ${priv_netbase}.1 + address ${priv_netbase}.0.1 netmask $priv_netmask bridge_ports none bridge_stp off bridge_maxwait 0 bridge_fd 0 + bridge_hw $lmac + post-up ip link set br1 txqueuelen 4000 post-up iptables -t nat -A POSTROUTING -o br0 -j MASQUERADE post-up iptables -A FORWARD -i br0 -o br1 -m state --state RELATED,ESTABLISHED -j ACCEPT post-up iptables -A FORWARD -i br1 -o br0 -j ACCEPT @@ -257,7 +274,7 @@ EOF cat </etc/dnsmasq.d/br1-dhcp interface=br1 bind-interfaces -dhcp-range=${priv_netbase}.10,${priv_netbase}.250,12h +dhcp-range=${priv_netbase}.1.1,${priv_netbase}.255.255,$priv_netmask,2h EOF # Figure out disk configuration and generate the corresponding /etc/fstab. -- cgit v1.1