6 files changed, 265 insertions, 409 deletions
diff --git a/mod/mod-ci-github-gh.cxx b/mod/mod-ci-github-gh.cxx
index a1e4d53..fc5cf82 100644
--- a/mod/mod-ci-github-gh.cxx
+++ b/mod/mod-ci-github-gh.cxx
@@ -19,7 +19,7 @@ namespace brep
   // Return the GitHub check run status corresponding to a build_state.
   //
   string
-  gh_to_status (build_state st) noexcept
+  gh_to_status (build_state st)
   {
     // Just return by value (small string optimization).
     //
@@ -484,10 +484,6 @@ namespace brep
         return p.name () == s ? (v = true) : false;
       };
 
-      // Pass true to gh_check_run() to indicate that the we're parsing a
-      // webhook event or REST API response (in which case more fields are
-      // expected to be present than in a GraphQL response).
-      //
       if      (c (ac, "action"))       action = p.next_expect_string ();
       else if (c (cs, "check_run"))    check_run = gh_check_run_ex (p);
       else if (c (rp, "repository"))   repository = gh_repository (p);
@@ -597,6 +593,14 @@ namespace brep
                       "invalid IAT expires_at value '" + v +
                         "': " + e.what ());
         }
+        catch (const system_error& e)
+        {
+          // Translate for simplicity.
+          //
+          throw_json (p,
+                      "unable to convert IAT expires_at value '" + v +
+                      "': " + e.what ());
+        }
       }
       else p.next_expect_value_skip ();
     }
@@ -623,37 +627,17 @@ namespace brep
   string
   gh_to_iso8601 (timestamp t)
   {
-    try
-    {
-      return butl::to_string (t,
-                              "%Y-%m-%dT%TZ",
-                              false /* special */,
-                              false /* local */);
-    }
-    catch (const system_error& e)
-    {
-      throw runtime_error (
-        string ("failed to convert timestamp to ISO 8601 string: ") +
-        e.what ());
-    }
+    return butl::to_string (t,
+                            "%Y-%m-%dT%TZ",
+                            false /* special */,
+                            false /* local */);
   }
 
   timestamp
   gh_from_iso8601 (const string& s)
   {
-    try
-    {
-      // @@ TMP butl::from_string()'s comment says it also throws
-      //    invalid_argument but that seems to be false.
-      //
-      return butl::from_string (s.c_str (),
-                                "%Y-%m-%dT%TZ",
-                                false /* local */);
-    }
-    catch (const system_error& e)
-    {
-      throw invalid_argument ("invalid ISO 8601 timestamp value '" + s +
-                              "': " + e.what ());
-    }
+    return butl::from_string (s.c_str (),
+                              "%Y-%m-%dT%TZ",
+                              false /* local */);
   }
 }
diff --git a/mod/mod-ci-github-gh.hxx b/mod/mod-ci-github-gh.hxx
index 5fff2bc..eeda871 100644
--- a/mod/mod-ci-github-gh.hxx
+++ b/mod/mod-ci-github-gh.hxx
@@ -107,7 +107,7 @@ namespace brep
   // Return the GitHub check run status corresponding to a build_state.
   //
   string
-  gh_to_status (build_state st) noexcept;
+  gh_to_status (build_state);
 
   // Return the build_state corresponding to a GitHub check run status
   // string. Throw invalid_argument if the passed status was invalid.
@@ -214,12 +214,14 @@ namespace brep
     gh_installation_access_token () = default;
   };
 
-  // Throw runtime_error if the conversion fails.
+  // Throw system_error if the conversion fails due to underlying operating
+  // system errors.
   //
   string
   gh_to_iso8601 (timestamp);
 
-  // Throw invalid_argument if the conversion fails.
+  // Throw invalid_argument if the conversion fails due to the invalid
+  // argument and system_error if due to underlying operating system errors.
   //
   timestamp
   gh_from_iso8601 (const string&);
diff --git a/mod/mod-ci-github-gq.cxx b/mod/mod-ci-github-gq.cxx
index a0a0d6b..7b7e464 100644
--- a/mod/mod-ci-github-gq.cxx
+++ b/mod/mod-ci-github-gq.cxx
@@ -521,9 +521,11 @@ namespace brep
         os                                                        << '\n';
         os << "  startedAt: " << gq_str (gh_to_iso8601 (*sa));
       }
-      catch (const runtime_error& e)
+      catch (const system_error& e)
       {
-        throw invalid_argument ("invalid started_at value " +
+        // Translate for simplicity.
+        //
+        throw invalid_argument ("unable to convert started_at value " +
                                 to_string (system_clock::to_time_t (*sa)) +
                                 ": " + e.what ());
       }
diff --git a/mod/mod-ci-github-gq.hxx b/mod/mod-ci-github-gq.hxx
index 86ab859..50950d4 100644
--- a/mod/mod-ci-github-gq.hxx
+++ b/mod/mod-ci-github-gq.hxx
@@ -24,7 +24,8 @@ namespace brep
   // `check_runs` with the new data (node id and state_synced). Return false
   // and issue diagnostics if the request failed.
   //
-  // Throw invalid_argument.
+  // Throw invalid_argument if the passed data is invalid, missing, or
+  // inconsistent.
   //
   // Note that creating a check_run named `foo` will effectively replace any
   // existing check_runs with that name. They will still exist on the GitHub
@@ -42,7 +43,8 @@ namespace brep
   // data (node id, state, and state_synced). Return false and issue
   // diagnostics if the request failed.
   //
-  // Throw invalid_argument.
+  // Throw invalid_argument if the passed data is invalid, missing, or
+  // inconsistent.
   //
   // If the details_url is absent GitHub will use the app's homepage.
   //
@@ -70,7 +72,8 @@ namespace brep
   // Update a check run on GitHub. Update `cr` with the new data (state and
   // state_synced). Return false and issue diagnostics if the request failed.
   //
-  // Throw invalid_argument.
+  // Throw invalid_argument if the passed data is invalid, missing, or
+  // inconsistent.
   //
   // Note that GitHub allows any state transitions except from built (but
   // built to built is allowed). The latter case is signalled by setting the
diff --git a/mod/mod-ci-github-service-data.cxx b/mod/mod-ci-github-service-data.cxx
index d3071b2..31a556d 100644
--- a/mod/mod-ci-github-service-data.cxx
+++ b/mod/mod-ci-github-service-data.cxx
@@ -220,9 +220,11 @@ namespace brep
       {
         v = gh_to_iso8601 (installation_access.expires_at);
       }
-      catch (const runtime_error& e)
+      catch (const system_error& e)
       {
-        throw invalid_argument ("invalid IAT expires_at value " +
+        // Translate for simplicity.
+        //
+        throw invalid_argument ("unable to convert IAT expires_at value " +
                                 to_string (system_clock::to_time_t (
                                   installation_access.expires_at)));
       }
diff --git a/mod/mod-ci-github.cxx b/mod/mod-ci-github.cxx
index c703f32..56f703e 100644
--- a/mod/mod-ci-github.cxx
+++ b/mod/mod-ci-github.cxx
@@ -693,6 +693,8 @@ namespace brep
 
   // Create a gq_built_result.
   //
+  // Throw invalid_argument in case of invalid result_status.
+  //
   static gq_built_result
   make_built_result (result_status rs, bool warning_success, string message)
   {
@@ -1166,312 +1168,6 @@ namespace brep
     return true;
   }
 
-  // @@ TMP
-  //
-#if 0
-  bool ci_github::
-  handle_check_run_rerequest (const gh_check_run_event& cr,
-                              bool warning_success)
-  {
-    HANDLER_DIAG;
-
-    l3 ([&]{trace << "check_run event { " << cr << " }";});
-
-    // Fail if this is the conclusion check run.
-    //
-    if (cr.check_run.name == conclusion_check_run_name)
-    {
-      // @@ Fail conclusion check run with appropriate message and reurn
-      //    true.
-
-      l3 ([&]{trace << "ignoring conclusion check_run";});
-
-      // 422 Unprocessable Content: The request was well-formed (i.e.,
-      // syntactically correct) but could not be processed.
-      //
-      throw invalid_request (422, "Conclusion check run cannot be rebuilt");
-    }
-
-    // Get a new installation access token.
-    //
-    auto get_iat = [this, &trace, &error, &cr] ()
-      -> optional<gh_installation_access_token>
-    {
-      optional<string> jwt (generate_jwt (trace, error));
-      if (!jwt)
-        return nullopt;
-
-      optional<gh_installation_access_token> iat (
-        obtain_installation_access_token (cr.installation.id,
-                                          move (*jwt),
-                                          error));
-
-      if (iat)
-        l3 ([&]{trace << "installation_access_token { " << *iat << " }";});
-
-      return iat;
-    };
-
-    // Create a new conclusion check run, replacing the existing one.
-    //
-    // Return the check run on success or nullopt on failure.
-    //
-    auto create_conclusion_cr =
-      [&cr, &error, warning_success] (const gh_installation_access_token& iat,
-                                      build_state bs,
-                                      optional<result_status> rs = nullopt,
-                                      optional<string> msg = nullopt)
-      -> optional<check_run>
-    {
-      optional<gq_built_result> br;
-      if (rs)
-      {
-        assert (msg);
-
-        br = make_built_result (*rs, warning_success, move (*msg));
-      }
-
-      check_run r;
-      r.name = conclusion_check_run_name;
-
-      if (gq_create_check_run (error, r, iat.token,
-                               rni, hs,
-                               nullopt /* details_url */,
-                               bs, move (br)))
-      {
-        return r;
-      }
-      else
-        return nullopt;
-    };
-
-    // The overall plan is as follows:
-    //
-    // 1. Call the rebuild() function to attempt to schedule a rebuild. Pass
-    //    the update function that does the following (if called):
-    //
-    //    a. Update the check run being rebuilt (may also not exist).
-    //
-    //    b. Clear the completed flag if true.
-    //
-    //    c. "Return" the service data to be used after the call.
-    //
-    // 2. If the result of rebuild() indicates the tenant is archived, fail
-    //    the conclusion check run with appropriate diagnostics.
-    //
-    // 3. If original state is queued, then no rebuild was scheduled and we do
-    //    nothing.
-    //
-    // 4. Otherwise (the original state is building or built):
-    //
-    //    a. Change the check run state to queued.
-    //
-    //    b. Change the conclusion check run to building (do unconditionally
-    //       to mitigate races).
-    //
-    // Note that while conceptually we are updating existing check runs, in
-    // practice we have to create new check runs to replace the existing ones
-    // because GitHub does not allow transitioning out of the built state.
-    //
-    // This results in a new node id for each check run but we can't save them
-    // to the service data after the rebuild() call. As a workaround, when
-    // updating the service data we 1) clear the re-requested check run's node
-    // id and set the state_synced flag to true to signal to build_building()
-    // and build_built() that it needs to create a new check run; and 2) clear
-    // the conclusion check run's node id to cause build_built() to create a
-    // new conclusion check run. And these two check runs' node ids will be
-    // saved to the service data.
-
-    // Parse the check_run's details_url to extract build id.
-    //
-    // While this is a bit hackish, there doesn't seem to be a better way
-    // (like associating custom data with a check run). Note that the GitHub
-    // UI only allows rebuilding completed check runs, so the details URL
-    // should be there.
-    //
-    optional<build_id> bid (parse_details_url (cr.check_run.details_url));
-    if (!bid)
-    {
-      fail << "check run " << cr.check_run.node_id
-           << ": failed to extract build id from details_url";
-    }
-
-    // The IAT retrieved from the service data.
-    //
-    optional<gh_installation_access_token> iat;
-
-    // True if the check run exists in the service data.
-    //
-    bool cr_found (false);
-
-    // Update the state of the check run in the service data. Return (via
-    // captured references) the IAT and whether the check run was found.
-    //
-    // Called by rebuild(), but only if the build is actually restarted.
-    //
-    auto update_sd = [&iat,
-                      &cr_found,
-                      &error,
-                      &cr] (const tenant_service& ts, build_state)
-      -> optional<string>
-    {
-      // NOTE: this lambda may be called repeatedly (e.g., due to transaction
-      // being aborted) and so should not move out of its captures.
-
-      service_data sd;
-      try
-      {
-        sd = service_data (*ts.data);
-      }
-      catch (const invalid_argument& e)
-      {
-        error << "failed to parse service data: " << e;
-        return nullptr;
-      }
-
-      if (!iat)
-        iat = sd.installation_access;
-
-      // If the re-requested check run is found, update it in the service
-      // data.
-      //
-      const string& nid (cr.check_run.node_id);
-
-      for (check_run& cr: sd.check_runs)
-      {
-        if (cr.node_id && *cr.node_id == nid)
-        {
-          cr_found = true;
-          cr.state = build_state::queued;
-          sd.completed = false;
-
-          // Clear the check run node ids and set state_synced to true to
-          // cause build_building() and/or build_built() to create new check
-          // runs (see the plan above for details).
-          //
-          cr.node_id = nullopt;
-          cr.state_synced = true;
-          sd.conclusion_node_id = nullopt;
-
-          return sd.json ();
-        }
-      }
-
-      return nullopt;
-    };
-
-    optional<build_state> bs (rebuild (*build_db_, retry_, *bid, update_sd));
-
-    if (!bs)
-    {
-      // Build has expired (most probably the tenant has been archived).
-      //
-      // Update the conclusion check run to notify the user (but have to
-      // replace it with a new one because we don't know the existing one's
-      // node id).
-      //
-      optional<gh_installation_access_token> iat (get_iat ());
-      if (!iat)
-        throw server_error ();
-
-      if (optional<check_run> ccr = create_conclusion_cr (
-            *iat,
-            build_state::built,
-            result_status::error,
-            "Unable to rebuild: tenant has been archived or no such build"))
-      {
-        l3 ([&]{trace << "created conclusion check_run { " << *ccr << " }";});
-      }
-      else
-      {
-        // Log the error and return failure to GitHub which will presumably
-        // indicate this in its GUI.
-        //
-        fail << "check run " << cr.check_run.node_id
-             << ": unable to create conclusion check run";
-      }
-    }
-    else if (*bs == build_state::queued)
-    {
-      // The build was already queued so nothing to be done. This might happen
-      // if the user clicked "re-run" multiple times before we managed to
-      // update the check run.
-    }
-    else
-    {
-      // The build has been requeued.
-      //
-      assert (*bs == build_state::building || *bs == build_state::built);
-
-      if (!cr_found)
-      {
-        // Respond with an error otherwise GitHub will post a message in its
-        // GUI saying "you have successfully requested a rebuild of ..."
-        //
-        fail << "check_run " << cr.check_run.node_id
-             << ": build restarted but check run does not exist "
-             << "in service data";
-      }
-
-      // Get a new IAT if the one from the service data has expired.
-      //
-      assert (iat.has_value ());
-
-      if (system_clock::now () > iat->expires_at)
-      {
-        iat = get_iat ();
-        if (!iat)
-          throw server_error ();
-      }
-
-      // Update (by replacing) the re-requested and conclusion check runs to
-      // queued and building, respectively.
-      //
-      // If either fails we can only log the error but build_building() and/or
-      // build_built() should correct the situation (see above for details).
-      //
-
-      // Update re-requested check run.
-      //
-      {
-        check_run ncr; // New check run.
-        ncr.name = cr.check_run.name;
-
-        if (gq_create_check_run (error,
-                                 ncr,
-                                 iat->token,
-                                 cr.repository.node_id,
-                                 cr.check_run.check_suite.head_sha,
-                                 cr.check_run.details_url,
-                                 build_state::queued))
-        {
-          l3 ([&]{trace << "created check_run { " << ncr << " }";});
-        }
-        else
-        {
-          error << "check_run " << cr.check_run.node_id
-                << ": unable to create (to update) check run in queued state";
-        }
-      }
-
-      // Update conclusion check run.
-      //
-      if (optional<check_run> ccr =
-            create_conclusion_cr (*iat, build_state::building))
-      {
-        l3 ([&]{trace << "created conclusion check_run { " << *ccr << " }";});
-      }
-      else
-      {
-        error << "check_run " << cr.check_run.node_id
-              << ": unable to create (to update) conclusion check run";
-      }
-    }
-
-    return true;
-  }
-#endif
-
   // Miscellaneous pull request facts
   //
   // - Although some of the GitHub documentation makes it sound like they
@@ -1629,6 +1325,8 @@ namespace brep
   build_unloaded (tenant_service&& ts,
                   const diag_epilogue& log_writer) const noexcept
   {
+    // NOTE: this function is noexcept and should not throw.
+
     NOTIFICATION_DIAG (log_writer);
 
     service_data sd;
@@ -1651,7 +1349,16 @@ namespace brep
   build_unloaded_pre_check (tenant_service&& ts,
                             service_data&& sd,
                             const diag_epilogue& log_writer) const noexcept
+  try
   {
+    // NOTE: this function is noexcept and should not throw.
+    //
+    // In a few places where invalid_argument is unlikely to be thrown and/or
+    // would indicate that things are seriously broken we let it propagate to
+    // the function catch block where the pre-check tenant will be canceled
+    // (otherwise we could end up in an infinite loop, e.g., because the
+    // problematic arguments won't change).
+
     NOTIFICATION_DIAG (log_writer);
 
     // We get here for PRs only (but both local and remote). The overall
@@ -1677,6 +1384,8 @@ namespace brep
     // Request PR pre-check info (triggering the generation of the test merge
     // commit on the GitHub's side).
     //
+    // Let unlikely invalid_argument propagate (see above).
+    //
     optional<gq_pr_pre_check_info> pc (
       gq_fetch_pull_request_pre_check_info (error,
                                             sd.installation_access.token,
@@ -1736,38 +1445,50 @@ namespace brep
       // notifications until (1) we load the tenant, (2) we cancel it, or (3)
       // it gets archived after some timeout.
       //
-      if (auto pr = create (error, warn, verb_ ? &trace : nullptr,
-                            *build_db_, retry_,
-                            tenant_service (sid, "ci-github", sd.json ()),
-                            chrono::seconds (30) /* interval */,
-                            chrono::seconds (0) /* delay */,
-                            duplicate_tenant_mode::ignore))
-      {
-        if (pr->second == duplicate_tenant_result::ignored)
+      try
+      {
+        if (auto pr = create (error, warn, verb_ ? &trace : nullptr,
+                              *build_db_, retry_,
+                              tenant_service (sid, "ci-github", sd.json ()),
+                              chrono::seconds (30) /* interval */,
+                              chrono::seconds (0) /* delay */,
+                              duplicate_tenant_mode::ignore))
         {
-          // This PR is sharing a head commit with something else.
-          //
-          // If this is a local PR then it's probably the branch push, which
-          // is expected, so do nothing.
-          //
-          // If this is a remote PR then it could be anything (branch push,
-          // local PR, or another remote PR) which in turn means the CI result
-          // may end up being for head, not merge commit. There is nothing we
-          // can do about it on our side (the user can enable the head-behind-
-          // base protection on their side).
-          //
-          if (sd.kind == service_data::remote)
+          if (pr->second == duplicate_tenant_result::ignored)
           {
-            l3 ([&]{trace << "remote pull request " << *sd.pr_node_id
-                          << ": CI tenant already exists for " << sid;});
+            // This PR is sharing a head commit with something else.
+            //
+            // If this is a local PR then it's probably the branch push, which
+            // is expected, so do nothing.
+            //
+            // If this is a remote PR then it could be anything (branch push,
+            // local PR, or another remote PR) which in turn means the CI
+            // result may end up being for head, not merge commit. There is
+            // nothing we can do about it on our side (the user can enable the
+            // head-behind- base protection on their side).
+            //
+            if (sd.kind == service_data::remote)
+            {
+              l3 ([&]{trace << "remote pull request " << *sd.pr_node_id
+                            << ": CI tenant already exists for " << sid;});
+            }
           }
         }
+        else
+        {
+          error << "pull request " << *sd.pr_node_id
+                << ": failed to create unloaded CI tenant "
+                << "with tenant_service id " << sid;
+
+          // Fall through to cancel.
+        }
       }
-      else
+      catch (const runtime_error& e) // Database retries exhausted.
       {
         error << "pull request " << *sd.pr_node_id
-              << ": unable to create unloaded CI tenant "
-              << "with tenant_service id " << sid;
+              << ": failed to create unloaded CI tenant "
+              << "with tenant_service id " << sid
+              << ": " << e.what ();
 
         // Fall through to cancel.
       }
@@ -1775,16 +1496,50 @@ namespace brep
 
     // Cancel the pre-check tenant.
     //
-    if (!cancel (error, warn, verb_ ? &trace : nullptr,
-                 *build_db_, retry_,
-                 ts.type,
-                 ts.id))
+    try
+    {
+      if (!cancel (error, warn, verb_ ? &trace : nullptr,
+                   *build_db_, retry_,
+                   ts.type,
+                   ts.id))
+      {
+        // Should never happen (no such tenant).
+        //
+        error << "pull request " << *sd.pr_node_id
+              << ": failed to cancel pre-check tenant with tenant_service id "
+              << ts.id;
+      }
+    }
+    catch (const runtime_error& e) // Database retries exhausted.
     {
-      // Should never happen (no such tenant).
-      //
       error << "pull request " << *sd.pr_node_id
             << ": failed to cancel pre-check tenant with tenant_service id "
-            << ts.id;
+            << ts.id << ": " << e.what ();
+    }
+
+    return nullptr;
+  }
+  catch (const std::exception& e)
+  {
+    NOTIFICATION_DIAG (log_writer);
+    error << "pull request " << *sd.pr_node_id
+          << ": unhandled exception: " << e.what ();
+
+    // Cancel the pre-check tenant otherwise we could end up in an infinite
+    // loop (see top of function).
+    //
+    try
+    {
+      if (cancel (error, warn, verb_ ? &trace : nullptr,
+                  *build_db_, retry_,
+                  ts.type,
+                  ts.id))
+        l3 ([&]{trace << "canceled pre-check tenant " << ts.id;});
+    }
+    catch (const runtime_error& e) // Database retries exhausted.
+    {
+      l3 ([&]{trace << "failed to cancel pre-check tenant " << ts.id << ": "
+                    << e.what ();});
     }
 
     return nullptr;
@@ -1794,7 +1549,16 @@ namespace brep
   build_unloaded_load (tenant_service&& ts,
                        service_data&& sd,
                        const diag_epilogue& log_writer) const noexcept
+  try
   {
+    // NOTE: this function is noexcept and should not throw.
+    //
+    // In a few places where invalid_argument is unlikely to be thrown and/or
+    // would indicate that things are seriously broken we let it propagate to
+    // the function catch block where the tenant will be canceled (otherwise
+    // we could end up in an infinite loop, e.g., because the problematic
+    // arguments won't change).
+
     NOTIFICATION_DIAG (log_writer);
 
     // Load the tenant, which is essentially the same for both branch push and
@@ -1840,6 +1604,8 @@ namespace brep
       check_run cr;
       cr.name = move (name);
 
+      // Let unlikely invalid_argument propagate (see above).
+      //
       if (gq_create_check_run (error,
                                cr,
                                iat->token,
@@ -1866,12 +1632,16 @@ namespace brep
     {
       assert (!node_id.empty ());
 
+      // Let unlikely invalid_argument propagate (see above).
+      //
       gq_built_result br (
         make_built_result (rs, sd.warning_success, move (summary)));
 
       check_run cr;
       cr.name = name; // For display purposes only.
 
+      // Let unlikely invalid_argument propagate (see above).
+      //
       if (gq_update_check_run (error,
                                cr,
                                iat->token,
@@ -1898,16 +1668,24 @@ namespace brep
     //
     string conclusion_node_id; // Conclusion check run node ID.
 
-    if (auto cr = create_synthetic_cr (conclusion_check_run_name))
+    if (!sd.conclusion_node_id)
     {
-      l3 ([&]{trace << "created check_run { " << *cr << " }";});
+      if (auto cr = create_synthetic_cr (conclusion_check_run_name))
+      {
+        l3 ([&]{trace << "created check_run { " << *cr << " }";});
 
-      conclusion_node_id = move (*cr->node_id);
+        conclusion_node_id = move (*cr->node_id);
+      }
     }
 
+    const string& effective_conclusion_node_id (
+      sd.conclusion_node_id
+      ? *sd.conclusion_node_id
+      : conclusion_node_id);
+
     // Load the CI tenant if the conclusion check run was created.
     //
-    if (!conclusion_node_id.empty ())
+    if (!effective_conclusion_node_id.empty ())
     {
       string ru; // Repository URL.
 
@@ -1924,36 +1702,50 @@ namespace brep
       else
         ru = sd.repository_clone_url + '#' + sd.check_sha;
 
+      // Let unlikely invalid_argument propagate (see above).
+      //
       repository_location rl (move (ru), repository_type::git);
 
-      optional<start_result> r (load (error, warn, verb_ ? &trace : nullptr,
-                                      *build_db_, retry_,
-                                      move (ts),
-                                      move (rl)));
-
-      if (!r || r->status != 200)
+      try
       {
-        if (auto cr = update_synthetic_cr (conclusion_node_id,
-                                           conclusion_check_run_name,
-                                           result_status::error,
-                                           to_check_run_summary (r)))
-        {
-          l3 ([&]{trace << "updated check_run { " << *cr << " }";});
-        }
-        else
+        optional<start_result> r (load (error, warn, verb_ ? &trace : nullptr,
+                                        *build_db_, retry_,
+                                        move (ts),
+                                        move (rl)));
+
+        if (!r || r->status != 200)
         {
-          // Nothing really we can do in this case since we will not receive
-          // any further notifications. Log the error as a last resort.
+          // Let unlikely invalid_argument propagate (see above).
+          //
+          if (auto cr = update_synthetic_cr (effective_conclusion_node_id,
+                                             conclusion_check_run_name,
+                                             result_status::error,
+                                             to_check_run_summary (r)))
+          {
+            l3 ([&]{trace << "updated check_run { " << *cr << " }";});
+          }
+          else
+          {
+            // Nothing really we can do in this case since we will not receive
+            // any further notifications. Log the error as a last resort.
+
+            error << "failed to load CI tenant " << ts.id
+                  << " and unable to update conclusion";
+          }
 
-          error << "failed to load CI tenant " << ts.id
-                << " and unable to update conclusion";
+          return nullptr; // No need to update service data in this case.
         }
+      }
+      catch (const runtime_error& e) // Database retries exhausted.
+      {
+        error << "failed to load CI tenant " << ts.id << ": " << e.what ();
 
-        return nullptr; // No need to update service data in this case.
+        // Fall through to retry on next call.
       }
     }
-    else if (!new_iat)
-      return nullptr; // Nothing to save (but retry on next call).
+
+    if (!new_iat && conclusion_node_id.empty ())
+      return nullptr; // Nothing to save (but potentially retry on next call).
 
     return [&error,
             iat = move (new_iat),
@@ -1984,6 +1776,28 @@ namespace brep
       return sd.json ();
     };
   }
+  catch (const std::exception& e)
+  {
+    NOTIFICATION_DIAG (log_writer);
+    error << "CI tenant " << ts.id << ": unhandled exception: " << e.what ();
+
+    // Cancel the tenant otherwise we could end up in an infinite loop (see
+    // top of function).
+    //
+    try
+    {
+      if (cancel (error, warn, verb_ ? &trace : nullptr,
+                  *build_db_, retry_, ts.type, ts.id))
+        l3 ([&]{trace << "canceled CI tenant " << ts.id;});
+    }
+    catch (const runtime_error& e) // Database retries exhausted.
+    {
+      l3 ([&]{trace << "failed to cancel CI tenant " << ts.id
+                    << ": " << e.what ();});
+    }
+
+    return nullptr;
+  }
 
   // Build state change notifications (see tenant-services.hxx for
   // background). Mapping our state transitions to GitHub pose multiple
@@ -2079,7 +1893,10 @@ namespace brep
                 optional<build_state> istate,
                 const build_queued_hints& hs,
                 const diag_epilogue& log_writer) const noexcept
+  try
   {
+    // NOTE: this function is noexcept and should not throw.
+
     NOTIFICATION_DIAG (log_writer);
 
     service_data sd;
@@ -2179,6 +1996,8 @@ namespace brep
     {
       // Create a check_run for each build as a single request.
       //
+      // Let unlikely invalid_argument propagate.
+      //
       if (gq_create_check_runs (error,
                                 crs,
                                 iat->token,
@@ -2242,12 +2061,23 @@ namespace brep
       return sd.json ();
     };
   }
+  catch (const std::exception& e)
+  {
+    NOTIFICATION_DIAG (log_writer);
+
+    error << "CI tenant " << ts.id << ": unhandled exception: " << e.what ();
+
+    return nullptr;
+  }
 
   function<optional<string> (const tenant_service&)> ci_github::
   build_building (const tenant_service& ts,
                   const build& b,
                   const diag_epilogue& log_writer) const noexcept
+  try
   {
+    // NOTE: this function is noexcept and should not throw.
+
     NOTIFICATION_DIAG (log_writer);
 
     service_data sd;
@@ -2323,6 +2153,8 @@ namespace brep
     //
     if (iat != nullptr)
     {
+      // Let unlikely invalid_argument propagate.
+      //
       if (gq_update_check_run (error,
                                *cr,
                                iat->token,
@@ -2388,17 +2220,30 @@ namespace brep
       return sd.json ();
     };
   }
+  catch (const std::exception& e)
+  {
+    NOTIFICATION_DIAG (log_writer);
+
+    string bid (gh_check_run_name (b)); // Full build id.
+
+    error << "check run " << bid << ": unhandled exception: " << e.what();
+
+    return nullptr;
+  }
 
   function<optional<string> (const tenant_service&)> ci_github::
   build_built (const tenant_service& ts,
                const build& b,
                const diag_epilogue& log_writer) const noexcept
+  try
   {
+    // NOTE: this function is noexcept and should not throw.
+
+    NOTIFICATION_DIAG (log_writer);
+
     // @@ TODO Include service_data::event_node_id and perhaps ts.id in
     //    diagnostics? E.g. when failing to update check runs we print the
     //    build ID only.
-    //
-    NOTIFICATION_DIAG (log_writer);
 
     service_data sd;
     try
@@ -2518,6 +2363,11 @@ namespace brep
       {
         using namespace web::xhtml;
 
+        // Note: let all serialization exceptions propagate. The XML
+        // serialization code can throw bad_alloc or xml::serialization in
+        // case of I/O failures, but we're serializing to a string stream so
+        // both exceptions are unlikely.
+        //
         ostringstream os;
         xml::serializer s (os, "check_run_summary");
 
@@ -2613,7 +2463,8 @@ namespace brep
 
       if (cr.node_id)
       {
-        // Update existing check run to built.
+        // Update existing check run to built. Let unlikely invalid_argument
+        // propagate.
         //
         if (gq_update_check_run (error,
                                  cr,
@@ -2630,7 +2481,7 @@ namespace brep
       }
       else
       {
-        // Create new check run.
+        // Create new check run. Let unlikely invalid_argument propagate.
         //
         // Note that we don't have build hints so will be creating this check
         // run with the full build id as name. In the unlikely event that an
@@ -2678,6 +2529,8 @@ namespace brep
           cr.node_id = *sd.conclusion_node_id;
           cr.name = conclusion_check_run_name;
 
+          // Let unlikely invalid_argument propagate.
+          //
           if (gq_update_check_run (error,
                                    cr,
                                    iat->token,
@@ -2782,6 +2635,16 @@ namespace brep
       return sd.json ();
     };
   }
+  catch (const std::exception& e)
+  {
+    NOTIFICATION_DIAG (log_writer);
+
+    string bid (gh_check_run_name (b)); // Full build id.
+
+    error << "check run " << bid << ": unhandled exception: " << e.what();
+
+    return nullptr;
+  }
 
   string ci_github::
   details_url (const build& b) const