From 1c99f419fe91a9a39eb5faa056b9867de2d988a1 Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Mon, 2 Dec 2024 13:48:06 +0200 Subject: Handle inability to serialize result manifest due to no disk space --- bbot/worker/worker.cxx | 182 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 118 insertions(+), 64 deletions(-) (limited to 'bbot/worker/worker.cxx') diff --git a/bbot/worker/worker.cxx b/bbot/worker/worker.cxx index d413912..5a4d240 100644 --- a/bbot/worker/worker.cxx +++ b/bbot/worker/worker.cxx @@ -12,6 +12,7 @@ #include // strchr(), strncmp() #include #include +#include // generic_category() #include #include @@ -1222,16 +1223,21 @@ run_tar (step_id step, } #endif -// Upload compressed manifest to the specified TFTP URL with curl. Issue -// diagnostics and throw failed on invalid manifest or process management -// errors and throw io_error for input/output errors or non-zero curl exit. +// Upload compressed manifest to the specified TFTP URL with curl. If disk +// is false, then upload without creating a temporary file. +// +// Issue diagnostics and throw failed on invalid manifest or process +// management errors and throw io_error for curl input/output errors or +// non-zero curl exit. Also throw system_error with ENOSPC if unable to create +// a temporary file due to insufficient disk space. // template static void upload_manifest (tracer& trace, const string& url, const T& m, - const string& what) + const string& what, + bool disk = true) { try { @@ -1242,74 +1248,88 @@ upload_manifest (tracer& trace, // other hand, uploading from a file appears to work reliably (we still // get an odd error on Windows from time to time with larger uploads). // - // Let's not break lines in the manifest values not to further increase - // the size of the manifest encoded representation. Also here we don't - // care much about readability of the manifest since it will only be read - // by the bbot agent anyway. - // -#if 0 - // Note: need to add compression support if re-enable this. - tftp_curl c (trace, - path ("-"), - nullfd, - curl::put, - url, - "--tftp-blksize", tftp_blksize, - "--max-time", tftp_put_timeout); - - manifest_serializer s (c.out, url, true /* long_lines */); - m.serialize (s); - c.out.close (); -#else auto_rmfile tmp; + if (disk) + { + try + { + try + { + tmp = auto_rmfile (path::temp_path (what + "-manifest.lz4")); + ofdstream ofs (tmp.path, fdopen_mode::binary); + olz4stream ozs (ofs, 9, 5 /* 256KB */, nullopt /* content_size */); + + // Let's not break lines in the manifest values not to further + // increase the size of the manifest encoded representation. Also + // here we don't care much about readability of the manifest since + // it will only be read by the bbot agent anyway. + // + manifest_serializer s (ozs, tmp.path.string (), true /*long_lines*/); + m.serialize (s); + + ozs.close (); + ofs.close (); + } + catch (const system_error& e) + { + if (e.code ().category () == std::generic_category () && + e.code ().value () == ENOSPC) + throw_generic_error (ENOSPC); + + fail << "unable to save " << what << " manifest: " << e; + } + } + catch (const io_error& e) // In case not derived from system_error. + { + fail << "unable to save " << what << " manifest: " << e; + } + } + try { - tmp = auto_rmfile (path::temp_path (what + "-manifest.lz4")); - ofdstream ofs (tmp.path, fdopen_mode::binary); - olz4stream ozs (ofs, 9, 5 /* 256KB */, nullopt /* content_size */); - manifest_serializer s (ozs, tmp.path.string (), true /* long_lines */); - m.serialize (s); - ozs.close (); - ofs.close (); + tftp_curl c (trace, + disk ? tmp.path : path ("-"), + nullfd, + curl::put, + url, + "--tftp-blksize", tftp_blksize, + "--max-time", tftp_put_timeout); + + if (!disk) + { + // Note: it's simpler to compress than do deal with the variability + // everywhere else. + // + olz4stream ozs (c.out, 9, 5 /* 256KB */, nullopt /* content_size */); + + manifest_serializer s (ozs, url, true /* long_lines */); + m.serialize (s); + + ozs.close (); + c.out.close (); + } + + if (!c.wait ()) + throw_generic_ios_failure (EIO, "non-zero curl exit code"); } - catch (const io_error& e) // In case not derived from system_error. + catch (const process_error& e) { - fail << "unable to save " << what << " manifest: " << e; + fail << "unable to execute curl: " << e; } catch (const system_error& e) { - fail << "unable to save " << what << " manifest: " << e; - } + const auto& c (e.code ()); - tftp_curl c (trace, - tmp.path, - nullfd, - curl::put, - url, - "--tftp-blksize", tftp_blksize, - "--max-time", tftp_put_timeout); -#endif - - if (!c.wait ()) - throw_generic_ios_failure (EIO, "non-zero curl exit code"); + if (c.category () == generic_category ()) + throw_generic_ios_failure (c.value (), e.what ()); + else + throw_system_ios_failure (c.value (), e.what ()); + } } catch (const manifest_serialization& e) { fail << "invalid " << what << " manifest: " << e.description; } - catch (const process_error& e) - { - fail << "unable to execute curl: " << e; - } - catch (const system_error& e) - { - const auto& c (e.code ()); - - if (c.category () == generic_category ()) - throw_generic_ios_failure (c.value (), e.what ()); - else - throw_system_ios_failure (c.value (), e.what ()); - } } static strings @@ -6448,15 +6468,24 @@ build (size_t argc, const char* argv[]) // return rm.status != result_status::abnormal ? 0 : 2; } + // + // We use exit code 3 to signal an unsuccessful attempt to upload the result + // manifest and exit code 4 to singal that there was no disk space to + // serialize the manifest. See startup() for details. + // catch (const io_error& e) { error << "unable to upload result manifest to " << url << ": " << e; + return 3; } + catch (const system_error& e) + { + assert (e.code ().category () == std::generic_category () && + e.code ().value () == ENOSPC); - // We use exit code 3 to signal an unsuccessful attempt to upload the result - // manifest. See startup() for details. - // - return 3; + error << "unable to serialize result manifest: " << e; + return 4; + } } // Parse the task_manifest::auxiliary_environment value into the list of @@ -6593,7 +6622,8 @@ startup () task_manifest tm; auto upload_result = [&trace, &tm] (result_status rs, - operation_results&& ors) + operation_results&& ors, + bool disk = true) { const string url ("tftp://" + ops.tftp_host () + "/result.manifest.lz4"); @@ -6611,12 +6641,16 @@ startup () try { - upload_manifest (trace, url, rm, "result"); + upload_manifest (trace, url, rm, "result", disk); } catch (const io_error& e) { fail << "unable to upload result manifest to " << url << ": " << e; } + catch (const system_error& e) + { + fail << "unable to serialize result manifest: " << e; + } }; try @@ -6755,6 +6789,8 @@ startup () // execution when the interactive build breakpoint is reached. Thus, we // don't redirect stdin to /dev/null. // + // Exit code 1 signals abnormal worker termination. + // // Exit code 2 signals abnormal termination but where the worker uploaded // the result itself. // @@ -6762,6 +6798,9 @@ startup () // result manifest. There is no reason to retry (most likely there is // nobody listening on the other end anymore). // + // Exit code 4 signals the inability to serialize the result manifest due + // to insufficient disk space. + // switch (run_io_exit (trace, 0 /* stdin */, 2 /* stdout */, 2 /* stderr */, process_env (pp, aux_env), @@ -6769,6 +6808,21 @@ startup () argv0.effect_string (), os)) { + case 4: + { + // Note that while it may seem like abnormal is the more appropriate + // result, experience shows running out of disk space is not that + // uncommon and suspending the machine every time we hit this makes it + // unusable for the rest of the users. + // + operation_result r { + "configure", + result_status::abort, + "error: no disk space left to serialize result manifest\n"}; + + upload_result (result_status::abort, {move (r)}, false /* disk */); + return 1; + } case 3: case 2: return 1; case 0: return 0; -- cgit v1.1