diff options
-rwxr-xr-x | bootstrap | 2 | ||||
-rwxr-xr-x | buildos | 538 | ||||
-rw-r--r-- | doc/manual.cli | 41 | ||||
-rwxr-xr-x | init | 39 |
4 files changed, 386 insertions, 234 deletions
@@ -67,7 +67,7 @@ base_pkgs+=",qemu-kvm,qemu-utils,socat" base_pkgs+=",g++,make,pkg-config" -extra_pkgs="ca-certificates,smartmontools" +extra_pkgs="ca-certificates,time,dmidecode,smartmontools" owd="$(pwd)" trap "{ cd '$owd'; exit 1; }" ERR @@ -51,7 +51,7 @@ info "starting build os monitor..." # # First we separete quoted variables and arguments with newlines (giving # priority to assignments). Then we replace whitespaces with newline on -# lines that don't contain quites. Finally, clean up by removing blank +# lines that don't contain quites. Finally, we clean up by removing blank # lines. # # Note: the same code as in init. @@ -137,7 +137,9 @@ function toolchain_value () # <toolchain-prefix> <variable> echo "${!n}" } +instances=0 # Number of bbot instances across all toolchains. toolchain_names=() + for tn in "${!toolchains[@]}"; do tp="${toolchains["$tn"]}" tu="$(toolchain_value "$tp" toolchain_url)" @@ -161,8 +163,28 @@ for tn in "${!toolchains[@]}"; do declare "${tp}toolchain_ver=" declare "${tp}toolchain_fver=" # Full version (with snapshot). - # If buildos.toolchain_trust was not specified, set it to "no" so that - # we don't prompt if the repository happens to be signed. + # Default to 1 bbot agent instance. + # + if [ -z "$(toolchain_value "$tp" instances)" ]; then + declare "${tp}instances=1" + fi + + instances=$(($instances + $(toolchain_value "$tp" instances))) + + # Default to 0 nice value. + # + if [ -z "$(toolchain_value "$tp" nice)" ]; then + declare "${tp}nice=0" + fi + + # Default to br1 (private/NAT bridge). + # + if [ -z "$(toolchain_value "$tp" bridge)" ]; then + declare "${tp}bridge=br1" + fi + + # If toolchain_trust was not specified, set it to "no" so that we don't + # prompt if the repository happens to be signed. # if [ -z "$(toolchain_value "$tp" toolchain_trust)" ]; then declare "${tp}toolchain_trust=no" @@ -180,27 +202,48 @@ if [ "${#toolchain_names[@]}" -eq 0 ]; then info "no buildos.toolchain_url specified, not bootstrapping" fi -# Divide CPUs and RAM (in kB) among the toolchains. +# Divide CPUs and RAM (in KB) among the instances. +# +# By default reserve 4G of RAM for ourselves (rootfs, tmpfs). # -# Reserve 4G of RAM for ourselves (rootfs, tmpfs). +# Note that MemTotal in /proc/meminfo is the available memory, not physical. +# And to make it easier to provision memory it's really helpful to base it +# in the physical value. # -ram_total="$(sed -n -re 's/^MemTotal: *([0-9]+) *kB$/\1/p' </proc/meminfo)" +ram_total=0 +for i in $(sudo dmidecode -t 17 | sed -n -re 's/^\s*Size:\s*([0-9]+)\s*MB.*$/\1/p'); do + ram_total=$(($ram_total + $i * 1024)) +done + +if [ "$ram_total" -eq 0 ]; then + error "unable to determine physical memory size" +fi + cpu_total="$(lscpu | sed -n -re 's/^CPU\(s\): *([0-9]+)$/\1/p')" +if [ -z "$ram_reserved" ]; then + ram_reserved=4 +fi +ram_reserved=$(($ram_reserved * 1024 * 1024)) + if [ -z "$ram_overcommit" ]; then ram_overcommit=1 fi +if [ -z "$cpu_reserved" ]; then + cpu_reserved=0 +fi + if [ -z "$cpu_overcommit" ]; then cpu_overcommit=1 fi -ram_slice=$(("$ram_total" - 4 * 1024 * 1024)) -cpu_slice="$cpu_total" +ram_slice=$(($ram_total - $ram_reserved)) +cpu_slice=$(($cpu_total - $cpu_reserved)) -if [ "${#toolchain_names[@]}" -gt 1 ]; then - ram_slice=$(("$ram_slice" * "$ram_overcommit" / "${#toolchain_names[@]}")) - cpu_slice=$(("$cpu_slice" * "$cpu_overcommit" / "${#toolchain_names[@]}")) +if [ "$instances" -gt 1 ]; then + ram_slice=$(($ram_slice * $ram_overcommit / $instances)) + cpu_slice=$(($cpu_slice * $cpu_overcommit / $instances)) if [ "$cpu_slice" -eq 0 ]; then cpu_slice=1 @@ -212,13 +255,15 @@ fi function print () { echo "cpu_total: $cpu_total" + echo "cpu_reserved: $cpu_reserved" echo "cpu_overcommit: $cpu_overcommit" echo "cpu_slice: $cpu_slice" echo - echo "ram_total: $ram_total kB" + echo "ram_total: $ram_total KB" + echo "ram_reserved: $ram_reserved KB" echo "ram_overcommit: $ram_overcommit" - echo "ram_slice: $ram_slice kB" + echo "ram_slice: $ram_slice KB" echo echo "buildid: $buildid" @@ -228,9 +273,15 @@ function print () local n i tn tp tu tt for tn in "${toolchain_names[@]}"; do tp="${toolchains["$tn"]}" + tc="$(toolchain_value "$tp" nice)" + tb="$(toolchain_value "$tp" bridge)" + ti="$(toolchain_value "$tp" instances)" tu="$(toolchain_value "$tp" toolchain_url)" tt="$(toolchain_value "$tp" toolchain_trust)" + echo "$tn.nice: $tc" + echo "$tn.bridge: $tb" + echo "$tn.instances: $ti" echo "$tn.toolchain_url: $tu" echo "$tn.toolchain_trust: $tt" @@ -283,7 +334,7 @@ function machines_for () # <function> <function-args>... for v in /build/machines/*; do if [ ! -d "$v" ]; then diag+=("$v: error: invalid volume") - fail="true" + fail=true continue fi @@ -292,7 +343,7 @@ function machines_for () # <function> <function-args>... for m in *; do if [ ! -d "$m" ]; then diag+=("$v/$m: error: invalid machine") - fail="true" + fail=true continue fi @@ -308,19 +359,29 @@ function machines_clean_subvolume () # <subvolume-path> { if ! btrfs property set -ts "$1" ro false; then diag+=("$1: error: unable to change subvolume property") - fail="true" + fail=true return 1 fi if ! btrfs subvolume delete "$1"; then diag+=("$1: error: unable to delete subvolume") - fail="true" + fail=true + return 1 + fi +} + +function machines_clean_lockfile () # <lockfile-path> +{ + if ! rm -f "$1"; then + diag+=("$1: error: unable to delete lockfile") + fail=true return 1 fi } # Cleanup the <name>-<toolchain>-<xxx> entries for the specified toolchain -# called before starting each toolchain. +# (all instances) as well as <name>-<toolchain>.lock file. Called before +# starting bbot instances for each toolchain. # function machines_clean_toolchain () # <volume-dir> <machine> <toolchain> { @@ -330,24 +391,33 @@ function machines_clean_toolchain () # <volume-dir> <machine> <toolchain> cd "$m" - local s - for s in "$m"-"$tn"-*; do + local i + for i in "$m"-"$tn"-*; do - if [ ! -d "$s" ]; then - diag+=("$v/$m/$s: error: invalid machine subvolume") - fail="true" + if [ ! -d "$i" ]; then + diag+=("$v/$m/$i: error: invalid machine subvolume") + fail=true continue fi - if machines_clean_subvolume "$v/$m/$s"; then - diag+=("$v/$m/$s: info: deleted stray toolchain working subvolume") + if machines_clean_subvolume "$v/$m/$i"; then + diag+=("$v/$m/$i: info: deleted stray toolchain working subvolume") fi done + i="$m-$tn.lock" + if [ -f "$i" ]; then + + if machines_clean_lockfile "$v/$m/$i"; then + diag+=("$v/$m/$i: info: deleted stray lockfile") + fi + fi + cd "$v" } -# Cleanup stray snapshots or deleted machines. Called once during startup. +# Cleanup stray snapshots and lockfiles as well as deleted machines. Called +# once during startup. # function machines_clean_stray () # <volume-dir> <machine> { @@ -359,35 +429,48 @@ function machines_clean_stray () # <volume-dir> <machine> # Collect current machine symlink's bootstrap protocol numbers. If there # are no current machine symlinks, then we delete the whole thing. # - local s ps=() - for s in "$m"-*; do - if [[ "$s" =~ ^"$m"-[0-9]+$ ]]; then + local i ps=() + for i in "$m"-*; do + if [[ "$i" =~ ^"$m"-[0-9]+$ ]]; then - if [ ! -L "$s" ]; then - diag+=("$v/$m/$s: error: not a symlink") - fail="true" + if [ ! -L "$i" ]; then + diag+=("$v/$m/$i: error: not a symlink") + fail=true fi # Treat it as if it were a symlink even if its not. Failed that we # may try to delete the whole thing. # - ps+=("$(sed -n -re 's/^.+-([0-9]+)$/\1/p' <<<"$s")") + ps+=("$(sed -n -re 's/^.+-([0-9]+)$/\1/p' <<<"$i")") fi done # Examine each machine subvolume. # - for s in "$m"-*; do + for i in "$m"-*; do # <name>-<P> (current machine symlink) # - if [[ "$s" =~ ^"$m"-[0-9]+$ ]]; then + if [[ "$i" =~ ^"$m"-[0-9]+$ ]]; then continue fi - if [ ! -d "$s" ]; then - diag+=("$v/$m/$s: error: invalid machine subvolume") - fail="true" + # Lockfile. + # + if [ -f "$i" ]; then + + if [[ "$i" =~ ^"$m"-.+\.lock$ ]]; then + + if machines_clean_lockfile "$v/$m/$i"; then + diag+=("$v/$m/$i: info: deleted lockfile") + fi + continue + fi + fi + + if [ ! -d "$i" ]; then + diag+=("$v/$m/$i: error: invalid machine subvolume") + fail=true continue fi @@ -400,8 +483,8 @@ function machines_clean_stray () # <volume-dir> <machine> # local p f= for p in "${ps[@]}"; do - if [[ "$s" =~ ^"$m"-"$p"\.[0-9]+$ ]]; then - f="true" + if [[ "$i" =~ ^"$m"-"$p"\.[0-9]+$ ]]; then + f=false break fi done @@ -415,8 +498,8 @@ function machines_clean_stray () # <volume-dir> <machine> f= local tn for tn in "${toolchain_names[@]}"; do - if [[ "$s" =~ ^"$m"-"$tn"$ ]]; then - f="true" + if [[ "$i" =~ ^"$m"-"$tn"$ ]]; then + f=false break fi done @@ -426,11 +509,11 @@ function machines_clean_stray () # <volume-dir> <machine> fi fi - # This is either a stray working submodule or a bootsrapped subvolume + # This is either a stray working subvolume or a bootsrapped subvolume # for a toolchain that was deleted (or we are deleting everything). # - if machines_clean_subvolume "$v/$m/$s"; then - diag+=("$v/$m/$s: info: deleted subvolume") + if machines_clean_subvolume "$v/$m/$i"; then + diag+=("$v/$m/$i: info: deleted subvolume") fi done @@ -443,7 +526,7 @@ function machines_clean_stray () # <volume-dir> <machine> diag+=("$v/$m: info: deleted machine directory") else diag+=("$v/$m: error: unable to delete machine directory") - fail="true" + fail=true fi fi } @@ -463,8 +546,7 @@ if [ "${#diag[@]}" -gt 0 ]; then info "$s" && print_diag 1>&2 if [ -n "$fail" ]; then - info "correct and restart the monitor (systemctl restart buildos)" - exit 1 + error "correct and restart the monitor (systemctl restart buildos)" fi fi @@ -513,7 +595,7 @@ function toolchain_fetch () # <toolchain-name> <line> return 1 fi - info "toolchain '$tn' version $tv" + info "toolchain $tn version $tv" declare -g "${tp}toolchain_fver=$tv" # Full version. echo "$tv" >"$tr/version-full" @@ -713,9 +795,12 @@ function bbot_check () # <toolchain-name> function bbot_start () # <toolchain-name> <toolchain-index> { local tn="$1" - local ti="$2" + local tx="$2" local tp="${toolchains["$tn"]}" + local tc="$(toolchain_value "$tp" nice)" + local tb="$(toolchain_value "$tp" bridge)" + local ti="$(toolchain_value "$tp" instances)" local tv="$(toolchain_value "$tp" toolchain_fver)" local ts="$(toolchain_value "$tp" toolchain_file_csum)" @@ -741,9 +826,13 @@ function bbot_start () # <toolchain-name> <toolchain-index> # if [ "$b_word" = "configured" ]; then - if ! sudo systemctl stop "bbot-agent@$tn"; then - info "failed to stop bbot-agent@$tn service, assuming not running" - fi + for ((i=1; i <= ti; i++)); do + if ! sudo systemctl stop "bbot-agent-$tn@$i"; then + info "failed to stop bbot-agent-$tn@$i service, assuming not running" + continue + fi + info "stopped bbot-agent-$tn@$i service" + done # We may not be able to uninstall if we previously failed to build. # @@ -752,37 +841,45 @@ function bbot_start () # <toolchain-name> <toolchain-index> fi fi - # Build and install the bbot agent. Since other agents might already - # be running, limit the number of jobs to our slice. + # Build and install the bbot agent. Since other agents might already be + # running, limit the number of jobs to our slice. # - if ! bpkg --fetch-timeout "$timeout" \ - --build-option --jobs --build-option "$cpu_slice" \ + if ! bpkg --fetch-timeout "$timeout" \ + --build-option --jobs=$(($ti * $cpu_slice)) \ build --yes libbbot bbot; then - info "failed to build bbot-agent@$tn" + info "failed to build bbot-agent for $tn" break fi if ! bpkg install "${vars[@]}" bbot; then - info "failed to install bbot-agent@$tn" + info "failed to install bbot-agent for $tn" break fi - # Post-process and install systemd .service file. Note that we cannot use - # the systemd pattern machinery since each version of bbot can have its - # own version of the .service file. + # Post-process and install the systemd .service file. Since we may have + # multiple toolchains, we embed the toolchain name into the service name + # with the systemd pattern machinery used to run multiple bbot instances + # per toolchain. + # + # We assume `%I` is only used in Description and similar and rewrite it + # as `<name>/%i` (e.g., `stage/1`). # sed -i -r \ - -e "s/%[iI]/$tn/g" \ - -e "s#^(Environment=AUTH_KEY)=.*#\1=/state/etc/host-key.pem#" \ + -e "s#%I#$tn/%I#g" \ -e "s/^(Environment=CPU)=.*/\1=$cpu_slice/" \ -e "s/^(Environment=RAM)=.*/\1=$ram_slice/" \ + -e "s/^(Environment=BRIDGE)=.*/\1=$tb/" \ + -e "s#^(Environment=AUTH_KEY)=.*#\1=/state/etc/host-key.pem#" \ -e "s/^(Environment=TOOLCHAIN_ID)=.*/\1=$ts/" \ - -e "s/^(Environment=TOOLCHAIN_NUM)=.*/\1=$ti/" \ + -e "s/^(Environment=TOOLCHAIN_NUM)=.*/\1=$tx/" \ -e "s/^(Environment=TOOLCHAIN_VER)=.*/\1=$tv/" \ + -e "s/^(Environment=TOOLCHAIN_NAME)=.*/\1=$tn/" \ + -e "s/^(Nice)=.*/\1=$tc/" \ + -e "s#^ExecStart=[^ ]+(.*)#ExecStart=$id/bin/bbot-agent\1#" \ "$id/lib/systemd/system/bbot-agent@.service" # Patch in the controller URLs. These can contain special characters - # like & so we have to escape them. + # like `&` so we have to escape them. # n="${tp}controller_url[@]" for i in "${!n}"; do @@ -801,8 +898,11 @@ function bbot_start () # <toolchain-name> <toolchain-index> "$id/lib/systemd/system/bbot-agent@.service" done - sudo ln -sf "$id/lib/systemd/system/bbot-agent@.service" \ - "/usr/lib/systemd/system/bbot-agent@$tn.service" + # Note: using a hard link to prevent systemd from being too clever and + # calling the service bbot-agent@. + # + sudo ln -f "$id/lib/systemd/system/bbot-agent@.service" \ + "/usr/lib/systemd/system/bbot-agent-$tn@.service" # Clean up any machine snapshots that might have been left behind. # @@ -818,25 +918,28 @@ function bbot_start () # <toolchain-name> <toolchain-index> print_diag 1>&2 if [ -n "$fail" ]; then - info "correct and start bbot-agent@$tn (systemctl start bbot-agent@$tn)" + info "correct and start bbot-agent for $tn (systemctl start bbot-agent-$tn@N)" break fi fi - # Start the service. With Type=simple start returns as soon as the process - # has forked. To see if the service actually started is done as part of - # service monitoring. + # Start each service instance. With Type=simple start returns as soon as + # the process has forked. Making sure the service has actually started is + # done as part of the service monitoring. # - if ! sudo systemctl start "bbot-agent@$tn"; then - info "failed to start bbot-agent@$tn service" - break - fi - r=0 + for ((i=1; i <= ti; i++)); do + if ! sudo systemctl start "bbot-agent-$tn@$i"; then + info "failed to start bbot-agent-$tn@$i service instance" + r=1 + break + fi + done + break done - cd "$owd" + cd "$owd" return "$r" } @@ -855,6 +958,27 @@ while true; do count=$(($count + 1)) + # Check for OS changes. Do this first in case of any issues in the following + # checks. + # + if [ -n "$buildid_url" ]; then + # Fetch the current id. While normally it will be a TFTP URL, it could also + # be HTTP(S) so we configure sensible behavior for that. + # + if id="$("${curl[@]}" "$buildid_url")"; then + if [ "$id" != "$buildid" ]; then + email "rebooting because of new os build" <<EOF +old_buildid: $buildid +new_buildid: $id +EOF + info "new os build ($id), rebooting..." + restart + fi + else + info "unable to fetch $buildid_url, will try again" + fi + fi + # Check for toolchain changes. If this is the first run, bootstrap them. # for tn in "${toolchain_names[@]}"; do @@ -886,11 +1010,11 @@ while true; do cs="$(toolchain_checksum "$tp" "$f")" if [ "$ts" != "$cs" ]; then - email "rebooting because of new '$tn' toolchain" <<EOF + email "rebooting because of new $tn toolchain" <<EOF old_checksum: $ts new_checksum: $cs EOF - info "new '$tn' toolchain ($cs), rebooting..." + info "new $tn toolchain ($cs), rebooting..." restart fi else @@ -905,7 +1029,7 @@ EOF # subshell and any variables it sets (like toolchain_ver) won't be # visible to us. # - info "bootstrapping '$tn' toolchain..." + info "bootstrapping $tn toolchain..." toolchain_bootstrap "$tn" 2>&1 | tee "$tr/toolchain-$count.log" 1>&2 @@ -917,15 +1041,15 @@ EOF tv="$(cat $tr/version-full)" declare "${tp}toolchain_fver=$tv" - s="bootstrapped '$tn' toolchain $tv" + s="bootstrapped $tn toolchain $tv" toolchain_boots+=("$tn") ;; 1) - s="skipping disabled '$tn' toolchain, waiting for new version" + s="skipping disabled $tn toolchain, waiting for new version" toolchain_boots+=("") # Skip. ;; *) - s="failed to bootstrap '$tn' toolchain, waiting for new version" + s="failed to bootstrap $tn toolchain, waiting for new version" toolchain_boots+=("") # Skip. ;; esac @@ -946,10 +1070,10 @@ EOF # if [ "${#toolchain_names[@]}" -eq "${#toolchain_boots[@]}" ]; then - ti=0 # Toolchain index. + tx=0 # Toolchain index. for tn in "${toolchain_boots[@]}"; do - ti=$(($ti + 1)) + tx=$(($tx + 1)) # Skip those that failed to bootstrap. # @@ -958,6 +1082,7 @@ EOF fi tp="${toolchains["$tn"]}" + ti="$(toolchain_value "$tp" instances)" tr="$(toolchain_value "$tp" toolchain_root)" # Or those that have no controllers (maybe it would have been better @@ -969,171 +1094,160 @@ EOF fi s= - bbot_check "$tn" 2>&1 | tee "$tr/bbot-$count.log" 1>&2 + bbot_check "$tn" 2>&1 | tee "$tr/bbot-agent-$count.log" 1>&2 case "${PIPESTATUS[0]}" in 0) - rm -f "$tr/bbot-$count.log" + rm -f "$tr/bbot-agent-$count.log" - # Check if the service has failed. + # For each service instance check if it has failed. # - if sudo systemctl is-failed --quiet "bbot-agent@$tn"; then - s="bbot-agent@$tn service has failed, stopping" - - # Note: ignore errors. - # - sudo systemctl status "bbot-agent@$tn" 2>&1 | \ - tee "$tr/bbot-$count.log" 1>&2 - - # Reset it so that we don't keep sending the log on each - # iteration. Note: ignore errors. - # - sudo systemctl reset-failed "bbot-agent@$tn" 2>&1 | \ - tee -a "$tr/bbot-$count.log" 1>&2 - else - # See if there is any diagnostics in the systemd journal. We - # notify about warning and up. - # - # The old versions journalctl behavior is to not output anything - # (not even the cursor) if there are no new entries. The new - # versions output the old cursor. - # - # Plus, it sometimes changes the cursor even without any errors in - # it (journal rewind/truncation maybe?) so we have to detect that. - # - c=(sudo journalctl --no-pager --quiet --output short-full \ - --unit "bbot-agent@$tn") - - # Get the last cursor if any. - # - oc="${toolchain_cursors["$tn"]}" - if [ -n "$oc" ]; then - c+=("--after-cursor" "$oc") - fi + for ((i=1; i <= ti; i++)); do + + if sudo systemctl is-failed --quiet "bbot-agent-$tn@$i"; then + s="bbot-agent-$tn@$i service has failed, stopping" + + # Note: ignore errors. + # + sudo systemctl status "bbot-agent-$tn@$i" 2>&1 | \ + tee "$tr/bbot-agent-$i-$count.log" 1>&2 + + # Reset it so that we don't keep sending the log on each + # iteration. Note: ignore errors. + # + sudo systemctl reset-failed "bbot-agent-$tn@$i" 2>&1 | \ + tee -a "$tr/bbot-agent-$i-$count.log" 1>&2 + + info "$s" + email "$s" <<EOF +$tn.bbot_log: tftp://$hname/toolchains/$tn/bbot-agent-$i-$count.log +EOF + else + # See if there is any diagnostics in the systemd journal. We + # notify about warnings and up. + # + # The old versions journalctl behavior is to not output anything + # (not even the cursor) if there are no new entries. The new + # versions output the old cursor. + # + # Plus, it sometimes changes the cursor even without any errors + # in it (journal rewind/truncation maybe?) so we have to detect + # that. + # + c=(sudo journalctl --no-pager --quiet --output short-full \ + --unit "bbot-agent-$tn@$i") + + # Get the last cursor if any. + # + oc="${toolchain_cursors["$tn/$i"]}" + if [ -n "$oc" ]; then + c+=("--after-cursor" "$oc") + fi - # Get the "log range": the first line is the date of the first - # error, the second line is the date of the last error, and the - # third line is the end cursor. It can also be just one line in - # which case it is the new cursor (that rewind stuff). - # - # Here is what's going on in that sed script: - # - # The first chunk matches the first line. We first put it into - # the hold space (in case that's the only line) and then extract - # and print the date. - # - # The second chunk matches the last line. We first handle the hold - # space which by now should contain the last error line and then - # the cursor. - # - # The last chunk matches every other line. We simply replace the - # hold space with the next line so that at the end we have the - # last line there. - # - lr="$("${c[@]}" --priority 4 --show-cursor | sed -n -r \ + # Get the "log range": the first line is the date of the first + # error, the second line is the date of the last error, and the + # third line is the end cursor. It can also be just one line in + # which case it is the new cursor (that rewind stuff). + # + # Here is what's going on in that sed script: + # + # The first chunk matches the first line. We first put it into + # the hold space (in case that's the only line) and then extract + # and print the date. + # + # The second chunk matches the last line. We first handle the + # hold space which by now should contain the last error line and + # then the cursor. + # + # The last chunk matches every other line. We simply replace the + # hold space with the next line so that at the end we have the + # last line there. + # + lr="$("${c[@]}" --priority 4 --show-cursor | sed -n -r \ -e '1{h;s/^[MTWFS].. ([^ ]+ [^ ]+) .*$/\1/p;t}' \ -e '${x;s/^[MTWFS].. ([^ ]+ [^ ]+) .*$/\1/p;x;s/^-- cursor: (.+)$/\1/p;t}' \ -e 'h')" - lc="$(wc -l <<<"$lr")" - nc="$(sed -n -e "${lc}p" <<<"$lr")" - - # If we have no new entries, then nothing to do. - # - if [ "$nc" != "$oc" ]; then + lc="$(wc -l <<<"$lr")" + nc="$(sed -n -e "${lc}p" <<<"$lr")" - # We may have no actual entries (cursor rewind). + # If we have no new entries, then nothing to do. # - if [ "$lc" -ne 1 ]; then + if [ "$nc" != "$oc" ]; then - # Try to get some context before the first error and after the - # last. This is unexpectedly hard in systemd. - # - # This can be a lot of output which makes it hard to spot the - # error so we are going to print just the error summary first. - # Quite a mess, I agree. + # We may have no actual entries (cursor rewind). # - sd="$(sed -n -e '1p' <<<"$lr")" - sd="$(date '+%s' -d "$sd")" # sec - sd="$(date '+%Y-%m-%d %H:%M:%S' -d "@$(($sd - 10))")" # -10sec - - ed="$(sed -n -e '2p' <<<"$lr")" - ed="$(date '+%s' -d "$ed")" # sec - ed="$(date '+%Y-%m-%d %H:%M:%S' -d "@$(($ed + 10))")" # +10sec - - s="bbot-agent@$tn service issued new diagnostics" - - info "$s" - { - echo "$tn.bbot_cmd: ssh build@$hname ${c[@]}"; - echo; - echo "summary:"; - echo; - "${c[@]}" --priority 4 | head -n 200; - echo; - echo "context:"; - echo; - if [ -n "$oc" ]; then - unset 'c[-1]' # Pop cursor (for --since/--until). - unset 'c[-1]' - fi; - "${c[@]}" --since "$sd" --until "$ed" | head -n 200 - } | email "$s" + if [ "$lc" -ne 1 ]; then + + # Try to get some context before the first error and after + # the last. This is unexpectedly hard in systemd. + # + # This can be a lot of output which makes it hard to spot + # the error so we are going to print just the error summary + # first. Quite a mess, I agree. + # + sd="$(sed -n -e '1p' <<<"$lr")" + sd="$(date '+%s' -d "$sd")" # sec + sd="$(date '+%Y-%m-%d %H:%M:%S' -d "@$(($sd - 10))")" # -10sec + + ed="$(sed -n -e '2p' <<<"$lr")" + ed="$(date '+%s' -d "$ed")" # sec + ed="$(date '+%Y-%m-%d %H:%M:%S' -d "@$(($ed + 10))")" # +10sec + + s="bbot-agent-$tn@$i service issued new diagnostics" + + info "$s" + { + echo "$tn.bbot_cmd: ssh build@$hname ${c[@]}"; + echo; + echo "summary:"; + echo; + "${c[@]}" --priority 4 | head -n 200; + echo; + echo "context:"; + echo; + if [ -n "$oc" ]; then + unset 'c[-1]' # Pop cursor (for --since/--until). + unset 'c[-1]' + fi; + "${c[@]}" --since "$sd" --until "$ed" | head -n 200 + } | email "$s" + fi + + toolchain_cursors["$tn/$i"]="$nc" fi - - toolchain_cursors["$tn"]="$nc" fi + done - continue - fi + continue # We have already issues diagnostics, if any. ;; 1) s="re" ;& 2) - info "${s}starting bbot-agent@$tn..." + info "${s}starting bbot-agent for $tn..." # Note: appending to the same log. # - bbot_start "$tn" "$ti" 2>&1 | tee -a "$tr/bbot-$count.log" 1>&2 + bbot_start "$tn" "$tx" 2>&1 | tee -a "$tr/bbot-agent-$count.log" 1>&2 if [ "${PIPESTATUS[0]}" -eq 0 ]; then - s="${s}started bbot-agent@$tn" + s="${s}started bbot-agent for $tn, $ti instances" else - s="failed to ${s}start bbot-agent@$tn, waiting for new version" + s="failed to ${s}start bbot-agent for $tn, waiting for new version" fi ;; *) - s="failed to fetch package information for '$tn' toolchain, will try again" + s="failed to fetch package information for $tn toolchain, will try again" ;; esac info "$s" email "$s" <<EOF -$tn.bbot_log: tftp://$hname/toolchains/$tn/bbot-$count.log +$tn.bbot_log: tftp://$hname/toolchains/$tn/bbot-agent-$count.log EOF done fi - # Check for OS changes. - # - if [ -n "$buildid_url" ]; then - # Fetch the current id. While normally it will be a TFTP URL, it could also - # be HTTP(S) so we configure sensible behavior for that. - # - if id="$("${curl[@]}" "$buildid_url")"; then - if [ "$id" != "$buildid" ]; then - email "rebooting because of new os build" <<EOF -old_buildid: $buildid -new_buildid: $id -EOF - info "new os build ($id), rebooting..." - restart - fi - else - info "unable to fetch $buildid_url, will try again" - fi - fi - sensors -A info "monitoring..." sleep 60 diff --git a/doc/manual.cli b/doc/manual.cli index 20e0846..abccb8b 100644 --- a/doc/manual.cli +++ b/doc/manual.cli @@ -34,9 +34,9 @@ mode} and receive \i{build tasks} from their respective agents. \h1#arch|Architecture| Build OS root filesystem (\c{rootfs}) resides entirely in RAM with all changes -(such as installation of the \c{build2} toolchain} discarded on the next +(such as installation of the \c{build2} toolchain) discarded on the next reboot. A small amount of persistent (but not precious) state is stored in -\c{/state} (see \l{#config-storage-state State}). A minimum of 4G of RAM +\c{/state} (see \l{#config-storage-state State}). A minimum of 4GB of RAM is required for Build OS itself (that is, excluding any virtual machines and containers). @@ -180,13 +180,13 @@ sudo kvm \ \h#config-cpu-ram|CPU and RAM| -A Build OS instances divides available CPUs and RAM (minus 4G) into \i{slices} -that are then \i{committed} to each toolchain. If you don't expect your -toolchains to utilize these resources at the same time, then it may make -sense to overcommit them to improve utilization. The respective overcommit -values can be specified as ratios with the \c{buildos.cpu_overcommit} -and \c{buildos.ram_overcommit} kernel command line parameters. For example, -given the following CPU overcommit: +A Build OS instances divides available CPUs and RAM (minus reserved, see +below) into \i{slices} that are then \i{committed} to each instance of each +toolchain. If you don't expect your builds to utilize these resources at the +same time, then it may make sense to overcommit them to improve utilization. +The respective overcommit values can be specified as ratios with the +\c{buildos.cpu_overcommit} and \c{buildos.ram_overcommit} kernel command +line parameters. For example, given the following CPU overcommit: \ buildos.cpu_overcommit=3/2 @@ -195,6 +195,11 @@ buildos.cpu_overcommit=3/2 A Build OS machine with 8 CPUs (hardware threads) and three toolchains will assign 4 CPUs (\c{8 * 3/2 / 3}) to each slice. +It is also possible to reserve a number of CPUs and an amount of RAM to +Build OS with the \c{buildos.cpu_reserved} and \c{buildos.ram_reserved} +(in GB) kernel command line parameters. If unspecified, 4GB of RAM is +reserved by default. + \h#config-storage|Storage| @@ -342,6 +347,19 @@ for example, \c{buildos.toolchain_url.<name>} (values without the toolchain name use the toolchain name \c{default}). The toolchain name may not contain \c{-}. +Each toolchain may also execute multiple \c{bbot} agent instances. The number +of instances is specified with the \c{buildos.instances[.<name>]} parameter. + +All \c{bbot} agent instances of a toolchain are executed with the same nice +value which can be specified with the \c{buildos.nice[.<name>]} parameter. It +should be between -20 (highest priority) and 19 (lowest priority) with 0 +being the default. See \cb{sched(7)} for details. + +The bridge interface to be used for machine networking can be specified with +the \c{buildos.bridge[.<name>]} parameter. Valid values are \c{br0} (public +bridge to the physical interface) and \c{br1} (private/NAT'ed bridge to +\c{br0}). If unspecified, \c{br1} is used by default. + In the checksums file blank lines and lines that start with \c{#} are ignored. If the first line is the special \c{disabled} value, then this toolchain is ignored. Otherwise, each line in the checksums file is the output of the @@ -564,7 +582,10 @@ If the machine has been suspended, it can be resumed using the following command: \ -echo cont | ssh build@build socat - UNIX-CONNECT:/tmp/<toolchain>-monitor +echo cont | ssh build@build socat - UNIX-CONNECT:/tmp/monitor-<toolchain>-<instance> \ +Other useful QEMU monitor commands are \c{system_powerdown} and +\c{system_reset}. + " @@ -84,7 +84,7 @@ sensors-detect --auto # # First we separete quoted variables and arguments with newlines (giving # priority to assignments). Then we replace whitespaces with newline on -# lines that don't contain quites. Finally, clean up by removing blank +# lines that don't contain quotes. Finally, clean up by removing blank # lines. # # Note: the same code as in buildos. @@ -196,10 +196,18 @@ if [ -z "$eth" ]; then error fi -mac="$(cat "/sys/class/net/$eth/address")" -mid="$(sed -e 's/://g' <<<"$mac")" # Machine id. +# Global and local MAC addresses (used below for br0 and br1, respectively). +# Derive the local address from the global by fixing the first octet to 02 +# (locally-assigned). +# +gmac="$(cat "/sys/class/net/$eth/address")" +lmac="$(sed -re 's/..:(.+)/02:\1/g' <<<"$gmac")" + +info "configured $eth ($gmac)" -info "configured $eth ($mac)" +# Machine id. +# +mid="$(sed -re 's/://g' <<<"$gmac")" # Set the hostname. # @@ -219,12 +227,17 @@ info "hostname $hname" # dhclient -x 2>/dev/null -# @@ Need to be made configurable. +# @@ Needs to be made configurable. Something like 172.23.0.0/16. # -priv_network="172.16.123.0" -priv_netmask="255.255.255.0" -priv_netbase="$(sed -e 's/^\(.*\)\.0$/\1/' <<<"$priv_network")" +priv_network="172.23.0.0" +priv_netmask="255.255.0.0" +priv_netbase="$(sed -e 's/^\(.*\)\.0\.0$/\1/' <<<"$priv_network")" +# Note that if we don't assign the bridge MAC address, then it will keep +# changing every time an interface with a greater address (e.g., a tap) +# joins the bridge. Needless to say, constantly changing MAC will wreck +# all kinds of networking havoc. +# cat <<EOF >/etc/network/interfaces auto lo iface lo inet loopback @@ -237,18 +250,22 @@ iface br0 inet dhcp bridge_stp off bridge_maxwait 0 bridge_fd 0 - bridge_hw $mac + bridge_hw $gmac + post-up ip link set $eth txqueuelen 4000 + post-up ip link set br0 txqueuelen 4000 # Private bridge with NAT to br0. # auto br1 iface br1 inet static - address ${priv_netbase}.1 + address ${priv_netbase}.0.1 netmask $priv_netmask bridge_ports none bridge_stp off bridge_maxwait 0 bridge_fd 0 + bridge_hw $lmac + post-up ip link set br1 txqueuelen 4000 post-up iptables -t nat -A POSTROUTING -o br0 -j MASQUERADE post-up iptables -A FORWARD -i br0 -o br1 -m state --state RELATED,ESTABLISHED -j ACCEPT post-up iptables -A FORWARD -i br1 -o br0 -j ACCEPT @@ -257,7 +274,7 @@ EOF cat <<EOF >/etc/dnsmasq.d/br1-dhcp interface=br1 bind-interfaces -dhcp-range=${priv_netbase}.10,${priv_netbase}.250,12h +dhcp-range=${priv_netbase}.1.1,${priv_netbase}.255.255,$priv_netmask,2h EOF # Figure out disk configuration and generate the corresponding /etc/fstab. |