aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--bbot/worker.cxx50
1 files changed, 37 insertions, 13 deletions
diff --git a/bbot/worker.cxx b/bbot/worker.cxx
index 382e334..af1429c 100644
--- a/bbot/worker.cxx
+++ b/bbot/worker.cxx
@@ -34,6 +34,7 @@ namespace bbot
dir_path env_dir;
const size_t tftp_timeout (10); // 10 seconds.
+ const size_t tftp_retries (3); // Task request retries (see startup()).
}
static dir_path
@@ -331,21 +332,44 @@ startup ()
{
// Download the task.
//
- try
+ // We are downloading from our host so there shouldn't normally be any
+ // connectivity issues. Unless, of course, we are on Windows where all
+ // kinds of flakiness are business as usual. Note that having a long
+ // enough timeout is not enough: if we try to connect before the network
+ // is up, we will keep waiting forever, even after it is up. So we have
+ // to time out and try again. This is also pretty bad (unlike, say,
+ // during bootstrap, which doesn't happen very often) since we are
+ // wasting the machine time. So we are going to log it as a warning and
+ // not merely a trace since if this is a common occurrence, then
+ // something has to be done about it.
+ //
+ for (size_t retry (1);; ++retry)
{
- tftp_curl c (trace,
- nullfd,
- mf,
- curl::get,
- url,
- "--max-time", tftp_timeout);
+ try
+ {
+ tftp_curl c (trace,
+ nullfd,
+ mf,
+ curl::get,
+ url,
+ "--max-time", tftp_timeout);
- if (!c.wait ())
- throw_generic_error (EIO);
- }
- catch (const system_error& e)
- {
- fail << "unable to download task manifest from " << url << ": " << e;
+ if (!c.wait ())
+ throw_generic_error (EIO);
+
+ break;
+ }
+ catch (const system_error& e)
+ {
+ bool bail (retry > tftp_retries);
+ diag_record dr (bail ? error : warn);
+
+ dr << "unable to download task manifest from " << url << " on "
+ << retry << " try: " << e;
+
+ if (bail)
+ throw failed ();
+ }
}
// Parse it.