aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rwxr-xr-xbootstrap2
-rwxr-xr-xbuildos538
-rw-r--r--doc/manual.cli41
-rwxr-xr-xinit39
4 files changed, 386 insertions, 234 deletions
diff --git a/bootstrap b/bootstrap
index 78dd42e..e606c9f 100755
--- a/bootstrap
+++ b/bootstrap
@@ -67,7 +67,7 @@ base_pkgs+=",qemu-kvm,qemu-utils,socat"
base_pkgs+=",g++,make,pkg-config"
-extra_pkgs="ca-certificates,smartmontools"
+extra_pkgs="ca-certificates,time,dmidecode,smartmontools"
owd="$(pwd)"
trap "{ cd '$owd'; exit 1; }" ERR
diff --git a/buildos b/buildos
index 012869b..52e984e 100755
--- a/buildos
+++ b/buildos
@@ -51,7 +51,7 @@ info "starting build os monitor..."
#
# First we separete quoted variables and arguments with newlines (giving
# priority to assignments). Then we replace whitespaces with newline on
-# lines that don't contain quites. Finally, clean up by removing blank
+# lines that don't contain quotes. Finally, we clean up by removing blank
# lines.
#
# Note: the same code as in init.
@@ -137,7 +137,9 @@ function toolchain_value () # <toolchain-prefix> <variable>
echo "${!n}"
}
+instances=0 # Number of bbot instances across all toolchains.
toolchain_names=()
+
for tn in "${!toolchains[@]}"; do
tp="${toolchains["$tn"]}"
tu="$(toolchain_value "$tp" toolchain_url)"
@@ -161,8 +163,28 @@ for tn in "${!toolchains[@]}"; do
declare "${tp}toolchain_ver="
declare "${tp}toolchain_fver=" # Full version (with snapshot).
- # If buildos.toolchain_trust was not specified, set it to "no" so that
- # we don't prompt if the repository happens to be signed.
+ # Default to 1 bbot agent instance.
+ #
+ if [ -z "$(toolchain_value "$tp" instances)" ]; then
+ declare "${tp}instances=1"
+ fi
+
+ instances=$(($instances + $(toolchain_value "$tp" instances)))
+
+ # Default to 0 nice value.
+ #
+ if [ -z "$(toolchain_value "$tp" nice)" ]; then
+ declare "${tp}nice=0"
+ fi
+
+ # Default to br1 (private/NAT bridge).
+ #
+ if [ -z "$(toolchain_value "$tp" bridge)" ]; then
+ declare "${tp}bridge=br1"
+ fi
+
+ # If toolchain_trust was not specified, set it to "no" so that we don't
+ # prompt if the repository happens to be signed.
#
if [ -z "$(toolchain_value "$tp" toolchain_trust)" ]; then
declare "${tp}toolchain_trust=no"
@@ -180,27 +202,48 @@ if [ "${#toolchain_names[@]}" -eq 0 ]; then
info "no buildos.toolchain_url specified, not bootstrapping"
fi
-# Divide CPUs and RAM (in kB) among the toolchains.
+# Divide CPUs and RAM (in KB) among the instances.
+#
+# By default reserve 4G of RAM for ourselves (rootfs, tmpfs).
#
-# Reserve 4G of RAM for ourselves (rootfs, tmpfs).
+# Note that MemTotal in /proc/meminfo is the available memory, not physical.
+# And to make it easier to provision memory it's really helpful to base it
+# on the physical value.
#
-ram_total="$(sed -n -re 's/^MemTotal: *([0-9]+) *kB$/\1/p' </proc/meminfo)"
+ram_total=0
+for i in $(sudo dmidecode -t 17 | sed -n -re 's/^\s*Size:\s*([0-9]+)\s*MB.*$/\1/p'); do
+ ram_total=$(($ram_total + $i * 1024))
+done
+
+if [ "$ram_total" -eq 0 ]; then
+ error "unable to determine physical memory size"
+fi
+
cpu_total="$(lscpu | sed -n -re 's/^CPU\(s\): *([0-9]+)$/\1/p')"
+if [ -z "$ram_reserved" ]; then
+ ram_reserved=4
+fi
+ram_reserved=$(($ram_reserved * 1024 * 1024))
+
if [ -z "$ram_overcommit" ]; then
ram_overcommit=1
fi
+if [ -z "$cpu_reserved" ]; then
+ cpu_reserved=0
+fi
+
if [ -z "$cpu_overcommit" ]; then
cpu_overcommit=1
fi
-ram_slice=$(("$ram_total" - 4 * 1024 * 1024))
-cpu_slice="$cpu_total"
+ram_slice=$(($ram_total - $ram_reserved))
+cpu_slice=$(($cpu_total - $cpu_reserved))
-if [ "${#toolchain_names[@]}" -gt 1 ]; then
- ram_slice=$(("$ram_slice" * "$ram_overcommit" / "${#toolchain_names[@]}"))
- cpu_slice=$(("$cpu_slice" * "$cpu_overcommit" / "${#toolchain_names[@]}"))
+if [ "$instances" -gt 1 ]; then
+ ram_slice=$(($ram_slice * $ram_overcommit / $instances))
+ cpu_slice=$(($cpu_slice * $cpu_overcommit / $instances))
if [ "$cpu_slice" -eq 0 ]; then
cpu_slice=1
@@ -212,13 +255,15 @@ fi
function print ()
{
echo "cpu_total: $cpu_total"
+ echo "cpu_reserved: $cpu_reserved"
echo "cpu_overcommit: $cpu_overcommit"
echo "cpu_slice: $cpu_slice"
echo
- echo "ram_total: $ram_total kB"
+ echo "ram_total: $ram_total KB"
+ echo "ram_reserved: $ram_reserved KB"
echo "ram_overcommit: $ram_overcommit"
- echo "ram_slice: $ram_slice kB"
+ echo "ram_slice: $ram_slice KB"
echo
echo "buildid: $buildid"
@@ -228,9 +273,15 @@ function print ()
local n i tn tp tu tt
for tn in "${toolchain_names[@]}"; do
tp="${toolchains["$tn"]}"
+ tc="$(toolchain_value "$tp" nice)"
+ tb="$(toolchain_value "$tp" bridge)"
+ ti="$(toolchain_value "$tp" instances)"
tu="$(toolchain_value "$tp" toolchain_url)"
tt="$(toolchain_value "$tp" toolchain_trust)"
+ echo "$tn.nice: $tc"
+ echo "$tn.bridge: $tb"
+ echo "$tn.instances: $ti"
echo "$tn.toolchain_url: $tu"
echo "$tn.toolchain_trust: $tt"
@@ -283,7 +334,7 @@ function machines_for () # <function> <function-args>...
for v in /build/machines/*; do
if [ ! -d "$v" ]; then
diag+=("$v: error: invalid volume")
- fail="true"
+ fail=true
continue
fi
@@ -292,7 +343,7 @@ function machines_for () # <function> <function-args>...
for m in *; do
if [ ! -d "$m" ]; then
diag+=("$v/$m: error: invalid machine")
- fail="true"
+ fail=true
continue
fi
@@ -308,19 +359,29 @@ function machines_clean_subvolume () # <subvolume-path>
{
if ! btrfs property set -ts "$1" ro false; then
diag+=("$1: error: unable to change subvolume property")
- fail="true"
+ fail=true
return 1
fi
if ! btrfs subvolume delete "$1"; then
diag+=("$1: error: unable to delete subvolume")
- fail="true"
+ fail=true
+ return 1
+ fi
+}
+
+function machines_clean_lockfile () # <lockfile-path>
+{
+ if ! rm -f "$1"; then
+ diag+=("$1: error: unable to delete lockfile")
+ fail=true
return 1
fi
}
# Cleanup the <name>-<toolchain>-<xxx> entries for the specified toolchain
-# called before starting each toolchain.
+# (all instances) as well as <name>-<toolchain>.lock file. Called before
+# starting bbot instances for each toolchain.
#
function machines_clean_toolchain () # <volume-dir> <machine> <toolchain>
{
@@ -330,24 +391,33 @@ function machines_clean_toolchain () # <volume-dir> <machine> <toolchain>
cd "$m"
- local s
- for s in "$m"-"$tn"-*; do
+ local i
+ for i in "$m"-"$tn"-*; do
- if [ ! -d "$s" ]; then
- diag+=("$v/$m/$s: error: invalid machine subvolume")
- fail="true"
+ if [ ! -d "$i" ]; then
+ diag+=("$v/$m/$i: error: invalid machine subvolume")
+ fail=true
continue
fi
- if machines_clean_subvolume "$v/$m/$s"; then
- diag+=("$v/$m/$s: info: deleted stray toolchain working subvolume")
+ if machines_clean_subvolume "$v/$m/$i"; then
+ diag+=("$v/$m/$i: info: deleted stray toolchain working subvolume")
fi
done
+ i="$m-$tn.lock"
+ if [ -f "$i" ]; then
+
+ if machines_clean_lockfile "$v/$m/$i"; then
+ diag+=("$v/$m/$i: info: deleted stray lockfile")
+ fi
+ fi
+
cd "$v"
}
-# Cleanup stray snapshots or deleted machines. Called once during startup.
+# Cleanup stray snapshots and lockfiles as well as deleted machines. Called
+# once during startup.
#
function machines_clean_stray () # <volume-dir> <machine>
{
@@ -359,35 +429,48 @@ function machines_clean_stray () # <volume-dir> <machine>
# Collect current machine symlink's bootstrap protocol numbers. If there
# are no current machine symlinks, then we delete the whole thing.
#
- local s ps=()
- for s in "$m"-*; do
- if [[ "$s" =~ ^"$m"-[0-9]+$ ]]; then
+ local i ps=()
+ for i in "$m"-*; do
+ if [[ "$i" =~ ^"$m"-[0-9]+$ ]]; then
- if [ ! -L "$s" ]; then
- diag+=("$v/$m/$s: error: not a symlink")
- fail="true"
+ if [ ! -L "$i" ]; then
+ diag+=("$v/$m/$i: error: not a symlink")
+ fail=true
fi
# Treat it as if it were a symlink even if its not. Failed that we
# may try to delete the whole thing.
#
- ps+=("$(sed -n -re 's/^.+-([0-9]+)$/\1/p' <<<"$s")")
+ ps+=("$(sed -n -re 's/^.+-([0-9]+)$/\1/p' <<<"$i")")
fi
done
# Examine each machine subvolume.
#
- for s in "$m"-*; do
+ for i in "$m"-*; do
# <name>-<P> (current machine symlink)
#
- if [[ "$s" =~ ^"$m"-[0-9]+$ ]]; then
+ if [[ "$i" =~ ^"$m"-[0-9]+$ ]]; then
continue
fi
- if [ ! -d "$s" ]; then
- diag+=("$v/$m/$s: error: invalid machine subvolume")
- fail="true"
+ # Lockfile.
+ #
+ if [ -f "$i" ]; then
+
+ if [[ "$i" =~ ^"$m"-.+\.lock$ ]]; then
+
+ if machines_clean_lockfile "$v/$m/$i"; then
+ diag+=("$v/$m/$i: info: deleted lockfile")
+ fi
+ continue
+ fi
+ fi
+
+ if [ ! -d "$i" ]; then
+ diag+=("$v/$m/$i: error: invalid machine subvolume")
+ fail=true
continue
fi
@@ -400,8 +483,8 @@ function machines_clean_stray () # <volume-dir> <machine>
#
local p f=
for p in "${ps[@]}"; do
- if [[ "$s" =~ ^"$m"-"$p"\.[0-9]+$ ]]; then
- f="true"
+ if [[ "$i" =~ ^"$m"-"$p"\.[0-9]+$ ]]; then
+ f=true
break
fi
done
@@ -415,8 +498,8 @@ function machines_clean_stray () # <volume-dir> <machine>
f=
local tn
for tn in "${toolchain_names[@]}"; do
- if [[ "$s" =~ ^"$m"-"$tn"$ ]]; then
- f="true"
+ if [[ "$i" =~ ^"$m"-"$tn"$ ]]; then
+ f=true
break
fi
done
@@ -426,11 +509,11 @@ function machines_clean_stray () # <volume-dir> <machine>
fi
fi
- # This is either a stray working submodule or a bootsrapped subvolume
+ # This is either a stray working subvolume or a bootstrapped subvolume
# for a toolchain that was deleted (or we are deleting everything).
#
- if machines_clean_subvolume "$v/$m/$s"; then
- diag+=("$v/$m/$s: info: deleted subvolume")
+ if machines_clean_subvolume "$v/$m/$i"; then
+ diag+=("$v/$m/$i: info: deleted subvolume")
fi
done
@@ -443,7 +526,7 @@ function machines_clean_stray () # <volume-dir> <machine>
diag+=("$v/$m: info: deleted machine directory")
else
diag+=("$v/$m: error: unable to delete machine directory")
- fail="true"
+ fail=true
fi
fi
}
@@ -463,8 +546,7 @@ if [ "${#diag[@]}" -gt 0 ]; then
info "$s" && print_diag 1>&2
if [ -n "$fail" ]; then
- info "correct and restart the monitor (systemctl restart buildos)"
- exit 1
+ error "correct and restart the monitor (systemctl restart buildos)"
fi
fi
@@ -513,7 +595,7 @@ function toolchain_fetch () # <toolchain-name> <line>
return 1
fi
- info "toolchain '$tn' version $tv"
+ info "toolchain $tn version $tv"
declare -g "${tp}toolchain_fver=$tv" # Full version.
echo "$tv" >"$tr/version-full"
@@ -713,9 +795,12 @@ function bbot_check () # <toolchain-name>
function bbot_start () # <toolchain-name> <toolchain-index>
{
local tn="$1"
- local ti="$2"
+ local tx="$2"
local tp="${toolchains["$tn"]}"
+ local tc="$(toolchain_value "$tp" nice)"
+ local tb="$(toolchain_value "$tp" bridge)"
+ local ti="$(toolchain_value "$tp" instances)"
local tv="$(toolchain_value "$tp" toolchain_fver)"
local ts="$(toolchain_value "$tp" toolchain_file_csum)"
@@ -741,9 +826,13 @@ function bbot_start () # <toolchain-name> <toolchain-index>
#
if [ "$b_word" = "configured" ]; then
- if ! sudo systemctl stop "bbot-agent@$tn"; then
- info "failed to stop bbot-agent@$tn service, assuming not running"
- fi
+ for ((i=1; i <= ti; i++)); do
+ if ! sudo systemctl stop "bbot-agent-$tn@$i"; then
+ info "failed to stop bbot-agent-$tn@$i service, assuming not running"
+ continue
+ fi
+ info "stopped bbot-agent-$tn@$i service"
+ done
# We may not be able to uninstall if we previously failed to build.
#
@@ -752,37 +841,45 @@ function bbot_start () # <toolchain-name> <toolchain-index>
fi
fi
- # Build and install the bbot agent. Since other agents might already
- # be running, limit the number of jobs to our slice.
+ # Build and install the bbot agent. Since other agents might already be
+ # running, limit the number of jobs to our slice.
#
- if ! bpkg --fetch-timeout "$timeout" \
- --build-option --jobs --build-option "$cpu_slice" \
+ if ! bpkg --fetch-timeout "$timeout" \
+ --build-option --jobs=$(($ti * $cpu_slice)) \
build --yes libbbot bbot; then
- info "failed to build bbot-agent@$tn"
+ info "failed to build bbot-agent for $tn"
break
fi
if ! bpkg install "${vars[@]}" bbot; then
- info "failed to install bbot-agent@$tn"
+ info "failed to install bbot-agent for $tn"
break
fi
- # Post-process and install systemd .service file. Note that we cannot use
- # the systemd pattern machinery since each version of bbot can have its
- # own version of the .service file.
+ # Post-process and install the systemd .service file. Since we may have
+ # multiple toolchains, we embed the toolchain name into the service name
+ # with the systemd pattern machinery used to run multiple bbot instances
+ # per toolchain.
+ #
+ # We assume `%I` is only used in Description and similar and rewrite it
+ # as `<name>/%i` (e.g., `stage/1`).
#
sed -i -r \
- -e "s/%[iI]/$tn/g" \
- -e "s#^(Environment=AUTH_KEY)=.*#\1=/state/etc/host-key.pem#" \
+ -e "s#%I#$tn/%I#g" \
-e "s/^(Environment=CPU)=.*/\1=$cpu_slice/" \
-e "s/^(Environment=RAM)=.*/\1=$ram_slice/" \
+ -e "s/^(Environment=BRIDGE)=.*/\1=$tb/" \
+ -e "s#^(Environment=AUTH_KEY)=.*#\1=/state/etc/host-key.pem#" \
-e "s/^(Environment=TOOLCHAIN_ID)=.*/\1=$ts/" \
- -e "s/^(Environment=TOOLCHAIN_NUM)=.*/\1=$ti/" \
+ -e "s/^(Environment=TOOLCHAIN_NUM)=.*/\1=$tx/" \
-e "s/^(Environment=TOOLCHAIN_VER)=.*/\1=$tv/" \
+ -e "s/^(Environment=TOOLCHAIN_NAME)=.*/\1=$tn/" \
+ -e "s/^(Nice)=.*/\1=$tc/" \
+ -e "s#^ExecStart=[^ ]+(.*)#ExecStart=$id/bin/bbot-agent\1#" \
"$id/lib/systemd/system/bbot-agent@.service"
# Patch in the controller URLs. These can contain special characters
- # like & so we have to escape them.
+ # like `&` so we have to escape them.
#
n="${tp}controller_url[@]"
for i in "${!n}"; do
@@ -801,8 +898,11 @@ function bbot_start () # <toolchain-name> <toolchain-index>
"$id/lib/systemd/system/bbot-agent@.service"
done
- sudo ln -sf "$id/lib/systemd/system/bbot-agent@.service" \
- "/usr/lib/systemd/system/bbot-agent@$tn.service"
+ # Note: using a hard link to prevent systemd from being too clever and
+ # calling the service bbot-agent@.
+ #
+ sudo ln -f "$id/lib/systemd/system/bbot-agent@.service" \
+ "/usr/lib/systemd/system/bbot-agent-$tn@.service"
# Clean up any machine snapshots that might have been left behind.
#
@@ -818,25 +918,28 @@ function bbot_start () # <toolchain-name> <toolchain-index>
print_diag 1>&2
if [ -n "$fail" ]; then
- info "correct and start bbot-agent@$tn (systemctl start bbot-agent@$tn)"
+ info "correct and start bbot-agent for $tn (systemctl start bbot-agent-$tn@N)"
break
fi
fi
- # Start the service. With Type=simple start returns as soon as the process
- # has forked. To see if the service actually started is done as part of
- # service monitoring.
+ # Start each service instance. With Type=simple start returns as soon as
+ # the process has forked. Making sure the service has actually started is
+ # done as part of the service monitoring.
#
- if ! sudo systemctl start "bbot-agent@$tn"; then
- info "failed to start bbot-agent@$tn service"
- break
- fi
-
r=0
+ for ((i=1; i <= ti; i++)); do
+ if ! sudo systemctl start "bbot-agent-$tn@$i"; then
+ info "failed to start bbot-agent-$tn@$i service instance"
+ r=1
+ break
+ fi
+ done
+
break
done
- cd "$owd"
+ cd "$owd"
return "$r"
}
@@ -855,6 +958,27 @@ while true; do
count=$(($count + 1))
+ # Check for OS changes. Do this first in case of any issues in the following
+ # checks.
+ #
+ if [ -n "$buildid_url" ]; then
+ # Fetch the current id. While normally it will be a TFTP URL, it could also
+ # be HTTP(S) so we configure sensible behavior for that.
+ #
+ if id="$("${curl[@]}" "$buildid_url")"; then
+ if [ "$id" != "$buildid" ]; then
+ email "rebooting because of new os build" <<EOF
+old_buildid: $buildid
+new_buildid: $id
+EOF
+ info "new os build ($id), rebooting..."
+ restart
+ fi
+ else
+ info "unable to fetch $buildid_url, will try again"
+ fi
+ fi
+
# Check for toolchain changes. If this is the first run, bootstrap them.
#
for tn in "${toolchain_names[@]}"; do
@@ -886,11 +1010,11 @@ while true; do
cs="$(toolchain_checksum "$tp" "$f")"
if [ "$ts" != "$cs" ]; then
- email "rebooting because of new '$tn' toolchain" <<EOF
+ email "rebooting because of new $tn toolchain" <<EOF
old_checksum: $ts
new_checksum: $cs
EOF
- info "new '$tn' toolchain ($cs), rebooting..."
+ info "new $tn toolchain ($cs), rebooting..."
restart
fi
else
@@ -905,7 +1029,7 @@ EOF
# subshell and any variables it sets (like toolchain_ver) won't be
# visible to us.
#
- info "bootstrapping '$tn' toolchain..."
+ info "bootstrapping $tn toolchain..."
toolchain_bootstrap "$tn" 2>&1 | tee "$tr/toolchain-$count.log" 1>&2
@@ -917,15 +1041,15 @@ EOF
tv="$(cat $tr/version-full)"
declare "${tp}toolchain_fver=$tv"
- s="bootstrapped '$tn' toolchain $tv"
+ s="bootstrapped $tn toolchain $tv"
toolchain_boots+=("$tn")
;;
1)
- s="skipping disabled '$tn' toolchain, waiting for new version"
+ s="skipping disabled $tn toolchain, waiting for new version"
toolchain_boots+=("") # Skip.
;;
*)
- s="failed to bootstrap '$tn' toolchain, waiting for new version"
+ s="failed to bootstrap $tn toolchain, waiting for new version"
toolchain_boots+=("") # Skip.
;;
esac
@@ -946,10 +1070,10 @@ EOF
#
if [ "${#toolchain_names[@]}" -eq "${#toolchain_boots[@]}" ]; then
- ti=0 # Toolchain index.
+ tx=0 # Toolchain index.
for tn in "${toolchain_boots[@]}"; do
- ti=$(($ti + 1))
+ tx=$(($tx + 1))
# Skip those that failed to bootstrap.
#
@@ -958,6 +1082,7 @@ EOF
fi
tp="${toolchains["$tn"]}"
+ ti="$(toolchain_value "$tp" instances)"
tr="$(toolchain_value "$tp" toolchain_root)"
# Or those that have no controllers (maybe it would have been better
@@ -969,171 +1094,160 @@ EOF
fi
s=
- bbot_check "$tn" 2>&1 | tee "$tr/bbot-$count.log" 1>&2
+ bbot_check "$tn" 2>&1 | tee "$tr/bbot-agent-$count.log" 1>&2
case "${PIPESTATUS[0]}" in
0)
- rm -f "$tr/bbot-$count.log"
+ rm -f "$tr/bbot-agent-$count.log"
- # Check if the service has failed.
+ # For each service instance check if it has failed.
#
- if sudo systemctl is-failed --quiet "bbot-agent@$tn"; then
- s="bbot-agent@$tn service has failed, stopping"
-
- # Note: ignore errors.
- #
- sudo systemctl status "bbot-agent@$tn" 2>&1 | \
- tee "$tr/bbot-$count.log" 1>&2
-
- # Reset it so that we don't keep sending the log on each
- # iteration. Note: ignore errors.
- #
- sudo systemctl reset-failed "bbot-agent@$tn" 2>&1 | \
- tee -a "$tr/bbot-$count.log" 1>&2
- else
- # See if there is any diagnostics in the systemd journal. We
- # notify about warning and up.
- #
- # The old versions journalctl behavior is to not output anything
- # (not even the cursor) if there are no new entries. The new
- # versions output the old cursor.
- #
- # Plus, it sometimes changes the cursor even without any errors in
- # it (journal rewind/truncation maybe?) so we have to detect that.
- #
- c=(sudo journalctl --no-pager --quiet --output short-full \
- --unit "bbot-agent@$tn")
-
- # Get the last cursor if any.
- #
- oc="${toolchain_cursors["$tn"]}"
- if [ -n "$oc" ]; then
- c+=("--after-cursor" "$oc")
- fi
+ for ((i=1; i <= ti; i++)); do
+
+ if sudo systemctl is-failed --quiet "bbot-agent-$tn@$i"; then
+ s="bbot-agent-$tn@$i service has failed, stopping"
+
+ # Note: ignore errors.
+ #
+ sudo systemctl status "bbot-agent-$tn@$i" 2>&1 | \
+ tee "$tr/bbot-agent-$i-$count.log" 1>&2
+
+ # Reset it so that we don't keep sending the log on each
+ # iteration. Note: ignore errors.
+ #
+ sudo systemctl reset-failed "bbot-agent-$tn@$i" 2>&1 | \
+ tee -a "$tr/bbot-agent-$i-$count.log" 1>&2
+
+ info "$s"
+ email "$s" <<EOF
+$tn.bbot_log: tftp://$hname/toolchains/$tn/bbot-agent-$i-$count.log
+EOF
+ else
+ # See if there is any diagnostics in the systemd journal. We
+ # notify about warnings and up.
+ #
+ # The old versions journalctl behavior is to not output anything
+ # (not even the cursor) if there are no new entries. The new
+ # versions output the old cursor.
+ #
+ # Plus, it sometimes changes the cursor even without any errors
+ # in it (journal rewind/truncation maybe?) so we have to detect
+ # that.
+ #
+ c=(sudo journalctl --no-pager --quiet --output short-full \
+ --unit "bbot-agent-$tn@$i")
+
+ # Get the last cursor if any.
+ #
+ oc="${toolchain_cursors["$tn/$i"]}"
+ if [ -n "$oc" ]; then
+ c+=("--after-cursor" "$oc")
+ fi
- # Get the "log range": the first line is the date of the first
- # error, the second line is the date of the last error, and the
- # third line is the end cursor. It can also be just one line in
- # which case it is the new cursor (that rewind stuff).
- #
- # Here is what's going on in that sed script:
- #
- # The first chunk matches the first line. We first put it into
- # the hold space (in case that's the only line) and then extract
- # and print the date.
- #
- # The second chunk matches the last line. We first handle the hold
- # space which by now should contain the last error line and then
- # the cursor.
- #
- # The last chunk matches every other line. We simply replace the
- # hold space with the next line so that at the end we have the
- # last line there.
- #
- lr="$("${c[@]}" --priority 4 --show-cursor | sed -n -r \
+ # Get the "log range": the first line is the date of the first
+ # error, the second line is the date of the last error, and the
+ # third line is the end cursor. It can also be just one line in
+ # which case it is the new cursor (that rewind stuff).
+ #
+ # Here is what's going on in that sed script:
+ #
+ # The first chunk matches the first line. We first put it into
+ # the hold space (in case that's the only line) and then extract
+ # and print the date.
+ #
+ # The second chunk matches the last line. We first handle the
+ # hold space which by now should contain the last error line and
+ # then the cursor.
+ #
+ # The last chunk matches every other line. We simply replace the
+ # hold space with the next line so that at the end we have the
+ # last line there.
+ #
+ lr="$("${c[@]}" --priority 4 --show-cursor | sed -n -r \
-e '1{h;s/^[MTWFS].. ([^ ]+ [^ ]+) .*$/\1/p;t}' \
-e '${x;s/^[MTWFS].. ([^ ]+ [^ ]+) .*$/\1/p;x;s/^-- cursor: (.+)$/\1/p;t}' \
-e 'h')"
- lc="$(wc -l <<<"$lr")"
- nc="$(sed -n -e "${lc}p" <<<"$lr")"
-
- # If we have no new entries, then nothing to do.
- #
- if [ "$nc" != "$oc" ]; then
+ lc="$(wc -l <<<"$lr")"
+ nc="$(sed -n -e "${lc}p" <<<"$lr")"
- # We may have no actual entries (cursor rewind).
+ # If we have no new entries, then nothing to do.
#
- if [ "$lc" -ne 1 ]; then
+ if [ "$nc" != "$oc" ]; then
- # Try to get some context before the first error and after the
- # last. This is unexpectedly hard in systemd.
- #
- # This can be a lot of output which makes it hard to spot the
- # error so we are going to print just the error summary first.
- # Quite a mess, I agree.
+ # We may have no actual entries (cursor rewind).
#
- sd="$(sed -n -e '1p' <<<"$lr")"
- sd="$(date '+%s' -d "$sd")" # sec
- sd="$(date '+%Y-%m-%d %H:%M:%S' -d "@$(($sd - 10))")" # -10sec
-
- ed="$(sed -n -e '2p' <<<"$lr")"
- ed="$(date '+%s' -d "$ed")" # sec
- ed="$(date '+%Y-%m-%d %H:%M:%S' -d "@$(($ed + 10))")" # +10sec
-
- s="bbot-agent@$tn service issued new diagnostics"
-
- info "$s"
- {
- echo "$tn.bbot_cmd: ssh build@$hname ${c[@]}";
- echo;
- echo "summary:";
- echo;
- "${c[@]}" --priority 4 | head -n 200;
- echo;
- echo "context:";
- echo;
- if [ -n "$oc" ]; then
- unset 'c[-1]' # Pop cursor (for --since/--until).
- unset 'c[-1]'
- fi;
- "${c[@]}" --since "$sd" --until "$ed" | head -n 200
- } | email "$s"
+ if [ "$lc" -ne 1 ]; then
+
+ # Try to get some context before the first error and after
+ # the last. This is unexpectedly hard in systemd.
+ #
+ # This can be a lot of output which makes it hard to spot
+ # the error so we are going to print just the error summary
+ # first. Quite a mess, I agree.
+ #
+ sd="$(sed -n -e '1p' <<<"$lr")"
+ sd="$(date '+%s' -d "$sd")" # sec
+ sd="$(date '+%Y-%m-%d %H:%M:%S' -d "@$(($sd - 10))")" # -10sec
+
+ ed="$(sed -n -e '2p' <<<"$lr")"
+ ed="$(date '+%s' -d "$ed")" # sec
+ ed="$(date '+%Y-%m-%d %H:%M:%S' -d "@$(($ed + 10))")" # +10sec
+
+ s="bbot-agent-$tn@$i service issued new diagnostics"
+
+ info "$s"
+ {
+ echo "$tn.bbot_cmd: ssh build@$hname ${c[@]}";
+ echo;
+ echo "summary:";
+ echo;
+ "${c[@]}" --priority 4 | head -n 200;
+ echo;
+ echo "context:";
+ echo;
+ if [ -n "$oc" ]; then
+ unset 'c[-1]' # Pop cursor (for --since/--until).
+ unset 'c[-1]'
+ fi;
+ "${c[@]}" --since "$sd" --until "$ed" | head -n 200
+ } | email "$s"
+ fi
+
+ toolchain_cursors["$tn/$i"]="$nc"
fi
-
- toolchain_cursors["$tn"]="$nc"
fi
+ done
- continue
- fi
+ continue # We have already issued diagnostics, if any.
;;
1)
s="re"
;&
2)
- info "${s}starting bbot-agent@$tn..."
+ info "${s}starting bbot-agent for $tn..."
# Note: appending to the same log.
#
- bbot_start "$tn" "$ti" 2>&1 | tee -a "$tr/bbot-$count.log" 1>&2
+ bbot_start "$tn" "$tx" 2>&1 | tee -a "$tr/bbot-agent-$count.log" 1>&2
if [ "${PIPESTATUS[0]}" -eq 0 ]; then
- s="${s}started bbot-agent@$tn"
+ s="${s}started bbot-agent for $tn, $ti instances"
else
- s="failed to ${s}start bbot-agent@$tn, waiting for new version"
+ s="failed to ${s}start bbot-agent for $tn, waiting for new version"
fi
;;
*)
- s="failed to fetch package information for '$tn' toolchain, will try again"
+ s="failed to fetch package information for $tn toolchain, will try again"
;;
esac
info "$s"
email "$s" <<EOF
-$tn.bbot_log: tftp://$hname/toolchains/$tn/bbot-$count.log
+$tn.bbot_log: tftp://$hname/toolchains/$tn/bbot-agent-$count.log
EOF
done
fi
- # Check for OS changes.
- #
- if [ -n "$buildid_url" ]; then
- # Fetch the current id. While normally it will be a TFTP URL, it could also
- # be HTTP(S) so we configure sensible behavior for that.
- #
- if id="$("${curl[@]}" "$buildid_url")"; then
- if [ "$id" != "$buildid" ]; then
- email "rebooting because of new os build" <<EOF
-old_buildid: $buildid
-new_buildid: $id
-EOF
- info "new os build ($id), rebooting..."
- restart
- fi
- else
- info "unable to fetch $buildid_url, will try again"
- fi
- fi
-
sensors -A
info "monitoring..."
sleep 60
diff --git a/doc/manual.cli b/doc/manual.cli
index 20e0846..abccb8b 100644
--- a/doc/manual.cli
+++ b/doc/manual.cli
@@ -34,9 +34,9 @@ mode} and receive \i{build tasks} from their respective agents.
\h1#arch|Architecture|
Build OS root filesystem (\c{rootfs}) resides entirely in RAM with all changes
-(such as installation of the \c{build2} toolchain} discarded on the next
+(such as installation of the \c{build2} toolchain) discarded on the next
reboot. A small amount of persistent (but not precious) state is stored in
-\c{/state} (see \l{#config-storage-state State}). A minimum of 4G of RAM
+\c{/state} (see \l{#config-storage-state State}). A minimum of 4GB of RAM
is required for Build OS itself (that is, excluding any virtual machines
and containers).
@@ -180,13 +180,13 @@ sudo kvm \
\h#config-cpu-ram|CPU and RAM|
-A Build OS instances divides available CPUs and RAM (minus 4G) into \i{slices}
-that are then \i{committed} to each toolchain. If you don't expect your
-toolchains to utilize these resources at the same time, then it may make
-sense to overcommit them to improve utilization. The respective overcommit
-values can be specified as ratios with the \c{buildos.cpu_overcommit}
-and \c{buildos.ram_overcommit} kernel command line parameters. For example,
-given the following CPU overcommit:
+A Build OS instance divides available CPUs and RAM (minus reserved, see
+below) into \i{slices} that are then \i{committed} to each instance of each
+toolchain. If you don't expect your builds to utilize these resources at the
+same time, then it may make sense to overcommit them to improve utilization.
+The respective overcommit values can be specified as ratios with the
+\c{buildos.cpu_overcommit} and \c{buildos.ram_overcommit} kernel command
+line parameters. For example, given the following CPU overcommit:
\
buildos.cpu_overcommit=3/2
@@ -195,6 +195,11 @@ buildos.cpu_overcommit=3/2
A Build OS machine with 8 CPUs (hardware threads) and three toolchains will
assign 4 CPUs (\c{8 * 3/2 / 3}) to each slice.
+It is also possible to reserve a number of CPUs and an amount of RAM for
+Build OS with the \c{buildos.cpu_reserved} and \c{buildos.ram_reserved}
+(in GB) kernel command line parameters. If unspecified, 4GB of RAM is
+reserved by default.
+
\h#config-storage|Storage|
@@ -342,6 +347,19 @@ for example, \c{buildos.toolchain_url.<name>} (values without the toolchain
name use the toolchain name \c{default}). The toolchain name may not contain
\c{-}.
+Each toolchain may also execute multiple \c{bbot} agent instances. The number
+of instances is specified with the \c{buildos.instances[.<name>]} parameter.
+
+All \c{bbot} agent instances of a toolchain are executed with the same nice
+value which can be specified with the \c{buildos.nice[.<name>]} parameter. It
+should be between -20 (highest priority) and 19 (lowest priority) with 0
+being the default. See \cb{sched(7)} for details.
+
+The bridge interface to be used for machine networking can be specified with
+the \c{buildos.bridge[.<name>]} parameter. Valid values are \c{br0} (public
+bridge to the physical interface) and \c{br1} (private/NAT'ed bridge to
+\c{br0}). If unspecified, \c{br1} is used by default.
+
In the checksums file blank lines and lines that start with \c{#} are ignored.
If the first line is the special \c{disabled} value, then this toolchain is
ignored. Otherwise, each line in the checksums file is the output of the
@@ -564,7 +582,10 @@ If the machine has been suspended, it can be resumed using the following
command:
\
-echo cont | ssh build@build socat - UNIX-CONNECT:/tmp/<toolchain>-monitor
+echo cont | ssh build@build socat - UNIX-CONNECT:/tmp/monitor-<toolchain>-<instance>
\
+Other useful QEMU monitor commands are \c{system_powerdown} and
+\c{system_reset}.
+
"
diff --git a/init b/init
index 672c9f1..d84e0bb 100755
--- a/init
+++ b/init
@@ -84,7 +84,7 @@ sensors-detect --auto
#
# First we separete quoted variables and arguments with newlines (giving
# priority to assignments). Then we replace whitespaces with newline on
-# lines that don't contain quites. Finally, clean up by removing blank
+# lines that don't contain quotes. Finally, clean up by removing blank
# lines.
#
# Note: the same code as in buildos.
@@ -196,10 +196,18 @@ if [ -z "$eth" ]; then
error
fi
-mac="$(cat "/sys/class/net/$eth/address")"
-mid="$(sed -e 's/://g' <<<"$mac")" # Machine id.
+# Global and local MAC addresses (used below for br0 and br1, respectively).
+# Derive the local address from the global by fixing the first octet to 02
+# (locally-assigned).
+#
+gmac="$(cat "/sys/class/net/$eth/address")"
+lmac="$(sed -re 's/..:(.+)/02:\1/g' <<<"$gmac")"
+
+info "configured $eth ($gmac)"
-info "configured $eth ($mac)"
+# Machine id.
+#
+mid="$(sed -re 's/://g' <<<"$gmac")"
# Set the hostname.
#
@@ -219,12 +227,17 @@ info "hostname $hname"
#
dhclient -x 2>/dev/null
-# @@ Need to be made configurable.
+# @@ Needs to be made configurable. Something like 172.23.0.0/16.
#
-priv_network="172.16.123.0"
-priv_netmask="255.255.255.0"
-priv_netbase="$(sed -e 's/^\(.*\)\.0$/\1/' <<<"$priv_network")"
+priv_network="172.23.0.0"
+priv_netmask="255.255.0.0"
+priv_netbase="$(sed -e 's/^\(.*\)\.0\.0$/\1/' <<<"$priv_network")"
+# Note that if we don't assign the bridge MAC address, then it will keep
+# changing every time an interface with a greater address (e.g., a tap)
+# joins the bridge. Needless to say, constantly changing MAC will wreak
+# all kinds of networking havoc.
+#
cat <<EOF >/etc/network/interfaces
auto lo
iface lo inet loopback
@@ -237,18 +250,22 @@ iface br0 inet dhcp
bridge_stp off
bridge_maxwait 0
bridge_fd 0
- bridge_hw $mac
+ bridge_hw $gmac
+ post-up ip link set $eth txqueuelen 4000
+ post-up ip link set br0 txqueuelen 4000
# Private bridge with NAT to br0.
#
auto br1
iface br1 inet static
- address ${priv_netbase}.1
+ address ${priv_netbase}.0.1
netmask $priv_netmask
bridge_ports none
bridge_stp off
bridge_maxwait 0
bridge_fd 0
+ bridge_hw $lmac
+ post-up ip link set br1 txqueuelen 4000
post-up iptables -t nat -A POSTROUTING -o br0 -j MASQUERADE
post-up iptables -A FORWARD -i br0 -o br1 -m state --state RELATED,ESTABLISHED -j ACCEPT
post-up iptables -A FORWARD -i br1 -o br0 -j ACCEPT
@@ -257,7 +274,7 @@ EOF
cat <<EOF >/etc/dnsmasq.d/br1-dhcp
interface=br1
bind-interfaces
-dhcp-range=${priv_netbase}.10,${priv_netbase}.250,12h
+dhcp-range=${priv_netbase}.1.1,${priv_netbase}.255.255,$priv_netmask,2h
EOF
# Figure out disk configuration and generate the corresponding /etc/fstab.