-rw-r--r-- | bbot/worker.cxx | 50
1 file changed, 37 insertions, 13 deletions
diff --git a/bbot/worker.cxx b/bbot/worker.cxx
index 382e334..af1429c 100644
--- a/bbot/worker.cxx
+++ b/bbot/worker.cxx
@@ -34,6 +34,7 @@ namespace bbot
   dir_path env_dir;
 
   const size_t tftp_timeout (10); // 10 seconds.
+  const size_t tftp_retries (3); // Task request retries (see startup()).
 }
 
 static dir_path
@@ -331,21 +332,44 @@ startup ()
 {
   // Download the task.
   //
-  try
+  // We are downloading from our host so there shouldn't normally be any
+  // connectivity issues. Unless, of course, we are on Windows where all
+  // kinds of flakiness is business as usual. Note that having a long enough
+  // timeout is not enough: if we try to connect before the network is up,
+  // we will keep waiting forever, even after it is up. So we have to
+  // timeout and try again. This is also pretty bad (unlike, say during
+  // bootstrap which doesn't happen very often) since we are wasting the
+  // machine time. So we are going to log it as a warning and not merely a
+  // trace since if this is a common occurrence, then something has to be
+  // done about it.
+  //
+  for (size_t retry (1);; ++retry)
   {
-    tftp_curl c (trace,
-                 nullfd,
-                 mf,
-                 curl::get,
-                 url,
-                 "--max-time", tftp_timeout);
+    try
+    {
+      tftp_curl c (trace,
+                   nullfd,
+                   mf,
+                   curl::get,
+                   url,
+                   "--max-time", tftp_timeout);
 
-    if (!c.wait ())
-      throw_generic_error (EIO);
-  }
-  catch (const system_error& e)
-  {
-    fail << "unable to download task manifest from " << url << ": " << e;
+      if (!c.wait ())
+        throw_generic_error (EIO);
+
+      break;
+    }
+    catch (const system_error& e)
+    {
+      bool bail (retry > tftp_retries);
+      diag_record dr (bail ? error : warn);
+
+      dr << "unable to download task manifest from " << url << " on "
+         << retry << " try: " << e;
+
+      if (bail)
+        throw failed ();
+    }
   }
 
   // Parse it.
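
The patch above relies on bbot-specific machinery (tftp_curl, diag_record, the warn/error streams, throw failed ()). The standalone sketch below restates the same bounded-retry idea in plain standard C++, assuming a hypothetical download_task() helper in place of the TFTP transfer and iostream output in place of bbot's diagnostics; it illustrates the pattern only, not the actual worker code.

#include <cerrno>
#include <cstddef>
#include <iostream>
#include <string>
#include <system_error>

// Hypothetical stand-in for the tftp_curl invocation in the patch: throws
// std::system_error on failure. To simulate the network not being up yet,
// the first two attempts fail.
static void
download_task (const std::string& /*url*/)
{
  static int attempt (0);

  if (++attempt < 3)
    throw std::system_error (EIO, std::generic_category (), "timed out");
}

int main ()
{
  const std::string url ("tftp://example.host/task.manifest"); // Placeholder.
  const std::size_t tftp_retries (3); // Same retry budget as in the patch.

  for (std::size_t retry (1);; ++retry)
  {
    try
    {
      download_task (url); // Each attempt should carry its own timeout.
      break;               // Success: stop retrying.
    }
    catch (const std::system_error& e)
    {
      bool bail (retry > tftp_retries);

      // Warn on intermediate failures so frequent retries remain visible;
      // escalate to an error once the retry budget is exhausted.
      std::cerr << (bail ? "error: " : "warning: ")
                << "unable to download task manifest from " << url
                << " on try " << retry << ": " << e.what () << '\n';

      if (bail)
        return 1;
    }
  }

  // Parse the downloaded manifest, etc.
  return 0;
}

As in the patch, only the final failure (after tftp_retries unsuccessful retries) is treated as fatal; earlier failures are logged as warnings so that a recurring problem shows up in the logs rather than being silently retried.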