From de9f0666062a638a5d5d2be931d9f2a40ea07195 Mon Sep 17 00:00:00 2001 From: Francois Kritzinger Date: Thu, 19 Dec 2024 16:09:59 +0200 Subject: ci-github: Rearrange order of functions --- mod/mod-ci-github.cxx | 1670 ++++++++++++++++++++++++------------------------- 1 file changed, 835 insertions(+), 835 deletions(-) (limited to 'mod/mod-ci-github.cxx') diff --git a/mod/mod-ci-github.cxx b/mod/mod-ci-github.cxx index cbd8e70..31f3b06 100644 --- a/mod/mod-ci-github.cxx +++ b/mod/mod-ci-github.cxx @@ -546,64 +546,62 @@ namespace brep // static string conclusion_check_run_name ("CONCLUSION"); - // Return the colored circle corresponding to a result_status. - // - static string - circle (result_status rs) + bool ci_github:: + handle_branch_push (gh_push_event ps, bool warning_success) { - switch (rs) + HANDLER_DIAG; + + l3 ([&]{trace << "push event { " << ps << " }";}); + + // Cancel the CI tenant associated with the overwritten/deleted previous + // head commit if this is a forced push or a branch deletion. + // + if (ps.forced || ps.deleted) { - case result_status::success: return "\U0001F7E2"; // Green circle. - case result_status::warning: return "\U0001F7E0"; // Orange circle. - case result_status::error: - case result_status::abort: - case result_status::abnormal: return "\U0001F534"; // Red circle. + // Service id that will uniquely identify the CI tenant. + // + string sid (ps.repository.node_id + ':' + ps.before); - // Valid values we should never encounter. + // Note that it's possible this commit still exists in another branch so + // we do refcount-aware cancel. // - case result_status::skip: - case result_status::interrupt: - throw invalid_argument ("unexpected result_status value: " + - to_string (rs)); + if (optional ts = cancel (error, warn, + verb_ ? &trace : nullptr, + *build_db_, retry_, + "ci-github", sid, + true /* ref_count */)) + { + l3 ([&]{trace << (ps.forced ? "forced push " + ps.after + " to " + : "deletion of ") + << ps.ref << ": attempted to cancel CI of previous" + << " head commit with tenant_service id " << sid + << " (ref_count: " << ts->ref_count << ')';}); + } + else + { + // It's possible that there was no CI for the previous commit for + // various reasons (e.g., CI was not enabled). + // + l3 ([&]{trace << (ps.forced ? "forced push " + ps.after + " to " + : "deletion of ") + << ps.ref << ": failed to cancel CI of previous" + << " head commit with tenant_service id " << sid;}); + } } - return ""; // Should never reach. - } - - // Make a check run summary from a CI start_result. - // - static string - to_check_run_summary (const optional& r) - { - string s; - - s = "```\n"; - if (r) s += r->message; - else s += "Internal service error"; - s += "\n```"; - - return s; - } - - bool ci_github:: - handle_check_suite_rerequest (gh_check_suite_event cs, bool warning_success) - { - HANDLER_DIAG; - - l3 ([&]{trace << "check_suite event { " << cs << " }";}); - - assert (cs.action == "rerequested"); + if (ps.deleted) + return true; // Do nothing further if this was a branch deletion. // While we don't need the installation access token in this request, // let's obtain it to flush out any permission issues early. Also, it is // valid for an hour so we will most likely make use of it. 
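
// Aside: an illustrative, self-contained sketch of the service id scheme
// used above (and throughout this file): the repository node id and a
// commit SHA joined with ':'. The helper name and example values below are
// made up for illustration.
//
#include <string>

static std::string
example_service_id (const std::string& repo_node_id, const std::string& sha)
{
  return repo_node_id + ':' + sha; // E.g., "R_kgDOabc123:9f86d081...".
}
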
// - optional jwt (generate_jwt (cs.check_suite.app_id, trace, error)); + optional jwt (generate_jwt (ps.app_id, trace, error)); if (!jwt) throw server_error (); optional iat ( - obtain_installation_access_token (cs.installation.id, + obtain_installation_access_token (ps.installation.id, move (*jwt), error)); if (!iat) @@ -611,84 +609,36 @@ namespace brep l3 ([&]{trace << "installation_access_token { " << *iat << " }";}); - // Service id that uniquely identifies the CI tenant. - // - string sid (cs.repository.node_id + ':' + cs.check_suite.head_sha); + // While it would have been nice to cancel CIs of PRs with this branch as + // base not to waste resources, there are complications: Firstly, we can + // only do this for remote PRs (since local PRs will most likely share the + // result with branch push). Secondly, we try to do our best even if the + // branch protection rule for head behind is not enabled. In this case, it + // would be good to complete the CI. So maybe/later. See also the head + // case in handle_pull_request(), where we do cancel remote PRs that are + // not shared. - // If the user requests a rebuild of the (entire) PR, then this manifests - // as the check_suite rather than pull_request event. Specifically: - // - // - For a local PR, this event is shared with the branch push and all we - // need to do is restart the CI for the head commit. - // - // - For a remote PR, this event will have no gh_check_suite::head_branch. - // In this case we need to load the existing service data for this head - // commit, extract the test merge commit, and restart the CI for that. - // - // Note that it's possible the base branch has moved in the meantime and - // ideally we would want to re-request the test merge commit, etc. - // However, this will only be necessary if the user does not follow our - // recommendation of enabling the head-behind-base protection. And it - // seems all this extra complexity would not be warranted. + // Service id that uniquely identifies the CI tenant. // - string check_sha; - service_data::kind_type kind; - - if (!cs.check_suite.head_branch) - { - // Rebuild of remote PR. - // - kind = service_data::remote; - - if (optional d = find (*build_db_, "ci-github", sid)) - { - tenant_service& ts (d->service); - - try - { - service_data sd (*ts.data); - check_sha = move (sd.check_sha); // Test merge commit. - } - catch (const invalid_argument& e) - { - fail << "failed to parse service data: " << e; - } - } - else - { - error << "check suite " << cs.check_suite.node_id - << " for remote pull request:" - << " re-requested but tenant_service with id " << sid - << " did not exist"; - return true; - } - } - else - { - // Rebuild of branch push or local PR. - // - kind = service_data::local; - check_sha = cs.check_suite.head_sha; - } + string sid (ps.repository.node_id + ':' + ps.after); service_data sd (warning_success, iat->token, iat->expires_at, - cs.check_suite.app_id, - cs.installation.id, - move (cs.repository.node_id), - move (cs.repository.clone_url), - kind, false /* pre_check */, true /* re_requested */, - move (check_sha), - move (cs.check_suite.head_sha) /* report_sha */); - - // Replace the existing CI tenant if it exists. - // - // Note that GitHub UI does not allow re-running the entire check suite - // until all the check runs are completed. 
- // + ps.app_id, + ps.installation.id, + move (ps.repository.node_id), + move (ps.repository.clone_url), + service_data::local, + false /* pre_check */, + false /* re_requested */, + ps.after /* check_sha */, + ps.after /* report_sha */); - // Create an unloaded CI tenant. + // Create an unloaded CI tenant, doing nothing if one already exists + // (which could've been created by handle_pull_request() or by us as a + // result of a push to another branch). Note that the tenant's reference + // count is incremented in all cases. // // Note: use no delay since we need to (re)create the synthetic conclusion // check run as soon as possible. @@ -700,264 +650,229 @@ namespace brep // notifications until (1) we load the tenant, (2) we cancel it, or (3) // it gets archived after some timeout. // - auto pr (create (error, - warn, - verb_ ? &trace : nullptr, - *build_db_, retry_, - tenant_service (sid, "ci-github", sd.json ()), - chrono::seconds (30) /* interval */, - chrono::seconds (0) /* delay */, - duplicate_tenant_mode::replace)); - - if (!pr) + if (!create (error, warn, verb_ ? &trace : nullptr, + *build_db_, retry_, + tenant_service (sid, "ci-github", sd.json ()), + chrono::seconds (30) /* interval */, + chrono::seconds (0) /* delay */, + duplicate_tenant_mode::ignore)) { - fail << "check suite " << cs.check_suite.node_id + fail << "push " + ps.after + " to " + ps.ref << ": unable to create unloaded CI tenant"; } - if (pr->second == duplicate_tenant_result::created) - { - error << "check suite " << cs.check_suite.node_id - << ": re-requested but tenant_service with id " << sid - << " did not exist"; - return true; - } - return true; } + // Miscellaneous pull request facts + // + // - Although some of the GitHub documentation makes it sound like they + // expect check runs to be added to both the PR head commit and the merge + // commit, the PR UI does not react to the merge commit's check runs + // consistently. It actually seems to be quite broken. The only thing it + // does seem to do reliably is blocking the PR merge if the merge commit's + // check runs are not successful (i.e, overriding the PR head commit's + // check runs). But the UI looks quite messed up generally in this state. + // + // - When new commits are added to a PR base branch, pull_request.base.sha + // does not change, but the test merge commit will be updated to include + // the new commits to the base branch. + // + // - When new commits are added to a PR head branch, pull_request.head.sha + // gets updated with the head commit's SHA and check_suite.pull_requests[] + // will contain all PRs with this branch as head. + // bool ci_github:: - handle_check_suite_completed (gh_check_suite_event cs, bool warning_success) + handle_pull_request (gh_pull_request_event pr, bool warning_success) { - // The plans is as follows: - // - // 1. Load the service data. - // - // 2. Verify it is completed. - // - // 3. Verify the check run counts match. - // - // 4. Verify (like in build_built()) that all the check runs are - // completed. - // - // 5. Verify the result matches what GitHub thinks it is. - HANDLER_DIAG; - l3 ([&]{trace << "check_suite event { " << cs << " }";}); + l3 ([&]{trace << "pull_request event { " << pr << " }";}); - // Service id that uniquely identifies the CI tenant. + // While we don't need the installation access token in this request, + // let's obtain it to flush out any permission issues early. Also, it is + // valid for an hour so we will most likely make use of it. 
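
// Aside: a much-simplified, self-contained model of the reference-counted
// tenant bookkeeping relied on by handle_branch_push() and
// handle_pull_request() (hypothetical type; the real bookkeeping lives in
// the build database behind create()/cancel()). Creating with the "ignore
// duplicates" mode bumps the count of an existing tenant, and a
// ref_count-aware cancel only really cancels once the count drops to zero
// (e.g., when the same commit is still reachable from another branch or
// PR).
//
#include <cstddef>
#include <map>
#include <string>

struct example_tenant_counts
{
  std::map<std::string, std::size_t> refs; // Service id -> reference count.

  void
  create_ignore (const std::string& sid) {++refs[sid];}

  // Return true if the tenant was actually cancelled.
  //
  bool
  cancel (const std::string& sid)
  {
    auto i (refs.find (sid));

    if (i == refs.end ())
      return false; // No such tenant (e.g., CI was never started).

    if (--i->second != 0)
      return false; // Still referenced by another branch/PR.

    refs.erase (i);
    return true;
  }
};
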
// - string sid (cs.repository.node_id + ':' + cs.check_suite.head_sha); + optional jwt (generate_jwt (pr.pull_request.app_id, trace, error)); + if (!jwt) + throw server_error (); - // The common log entry subject. - // - string sub ("check suite " + cs.check_suite.node_id + '/' + sid); + optional iat ( + obtain_installation_access_token (pr.installation.id, + move (*jwt), + error)); + if (!iat) + throw server_error (); - // Load the service data. + l3 ([&]{trace << "installation_access_token { " << *iat << " }";}); + + // Distinguish between local and remote PRs by comparing the head and base + // repositories' paths. // - service_data sd; + service_data::kind_type kind ( + pr.pull_request.head_path == pr.pull_request.base_path + ? service_data::local + : service_data::remote); - if (optional d = find (*build_db_, "ci-github", sid)) + // Note that similar to the branch push case above, while it would have + // been nice to cancel the previous CI job once the PR head moves (the + // "synchronize" event), due to the head sharing problem the previous CI + // job might actually still be relevant (in both local and remote PR + // cases). So we only do it for the remote PRs and only if the head is not + // shared (via tenant reference counting). + // + if (kind == service_data::remote && pr.action == "synchronize") { - try + if (pr.before) { - sd = service_data (*d->service.data); + // Service id that will uniquely identify the CI tenant. + // + string sid (pr.repository.node_id + ':' + *pr.before); + + if (optional ts = cancel (error, warn, + verb_ ? &trace : nullptr, + *build_db_, retry_, + "ci-github", sid, + true /* ref_count */)) + { + l3 ([&]{trace << "pull request " << pr.pull_request.node_id + << ": attempted to cancel CI of previous head commit" + << " (ref_count: " << ts->ref_count << ')';}); + } + else + { + // It's possible that there was no CI for the previous commit for + // various reasons (e.g., CI was not enabled). + // + l3 ([&]{trace << "pull request " << pr.pull_request.node_id + << ": failed to cancel CI of previous head commit " + << "with tenant_service id " << sid;}); + } } - catch (const invalid_argument& e) + else { - fail << "failed to parse service data: " << e; + error << "pull request " << pr.pull_request.node_id + << ": before commit is missing in synchronize event"; } } - else - { - error << sub << ": tenant_service does not exist"; - return true; - } - // Verify the completed flag and the number of check runs. + // Note: for remote PRs the check_sha will be set later, in + // build_unloaded_pre_check(), to test merge commit id. // - if (!sd.completed) - { - error << sub << " service data complete flag is false"; - return true; - } + string check_sha (kind == service_data::local + ? pr.pull_request.head_sha + : ""); - // Received count will be one higher because we don't store the conclusion - // check run. + // Note that PR rebuilds (re-requested) are handled by + // handle_check_suite_rerequest(). // - size_t check_runs_count (sd.check_runs.size () + 1); - - if (check_runs_count == 1) - { - error << sub << ": no check runs in service data"; - return true; - } - - if (cs.check_suite.check_runs_count != check_runs_count) - { - error << sub << ": check runs count " << cs.check_suite.check_runs_count - << " does not match service data count " << check_runs_count; - return true; - } - - // Verify that all the check runs are built and compute the summary - // conclusion. 
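
// Aside: a simplified, standalone illustration of the two decisions made
// above (hypothetical helper and type names): a PR is "local" if its head
// branch lives in the same repository as its base, and only a local PR's
// check_sha is known up front (for a remote PR it is the test merge
// commit, filled in later by build_unloaded_pre_check()); the report_sha
// is the PR head commit in both cases.
//
#include <string>

struct example_pr_shas
{
  bool local;
  std::string check_sha;  // What gets built (empty until known).
  std::string report_sha; // Which commit the check runs are reported on.
};

static example_pr_shas
example_classify_pr (const std::string& head_path, // E.g., "fork/repo".
                     const std::string& base_path, // E.g., "org/repo".
                     const std::string& head_sha)
{
  bool local (head_path == base_path);
  return {local, local ? head_sha : std::string (), head_sha};
}
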
+ // Note that, in the case of a remote PR, GitHub will copy the PR head + // commit from the head (forked) repository into the base repository. So + // the check runs must always be added to the base repository, whether the + // PR is local or remote. The head commit refs are located at + // refs/pull//head. // - result_status conclusion (result_status::success); - - for (const check_run& cr: sd.check_runs) - { - if (cr.state == build_state::built) - { - assert (cr.status.has_value ()); - conclusion |= *cr.status; - } - else - { - error << sub << ": unbuilt check run in service data"; - return true; - } - } + service_data sd (warning_success, + move (iat->token), + iat->expires_at, + pr.pull_request.app_id, + pr.installation.id, + move (pr.repository.node_id), + move (pr.repository.clone_url), + kind, true /* pre_check */, false /* re_request */, + move (check_sha), + move (pr.pull_request.head_sha) /* report_sha */, + pr.pull_request.node_id, + pr.pull_request.number); - // Verify the conclusion. + // Create an unloaded CI tenant for the pre-check phase (during which we + // wait for the PR's merge commit and behindness to become available). // - if (!cs.check_suite.conclusion) - { - error << sub << ": absent conclusion in completed check suite"; - return true; - } - - // Note that the case mismatch is due to GraphQL (gh_conclusion()) - // requiring uppercase conclusion values while the received webhook values - // are lower case. + // Create with an empty service id so that the generated tenant id is used + // instead during the pre-check phase (so as not to clash with a proper + // service id for this head commit, potentially created in + // handle_branch_push() or as another PR). // - string gh_conclusion (gh_to_conclusion (conclusion, warning_success)); + tenant_service ts ("", "ci-github", sd.json ()); - if (icasecmp (*cs.check_suite.conclusion, gh_conclusion) != 0) + // Note: use no delay since we need to start the actual CI (which in turn + // (re)creates the synthetic conclusion check run) as soon as possible. + // + // After this call we will start getting the build_unloaded() + // notifications -- which will be routed to build_unloaded_pre_check() -- + // until we cancel the tenant or it gets archived after some timeout. + // (Note that we never actually load this request, we always cancel it; + // see build_unloaded_pre_check() for details.) + // + if (!create (error, + warn, + verb_ ? &trace : nullptr, + *build_db_, retry_, + move (ts), + chrono::seconds (30) /* interval */, + chrono::seconds (0) /* delay */)) { - error << sub << ": conclusion " << *cs.check_suite.conclusion - << " does not match service data conclusion " << gh_conclusion; - return true; + fail << "pull request " << pr.pull_request.node_id + << ": unable to create unloaded pre-check tenant"; } return true; } - // Create a gq_built_result. - // - // Throw invalid_argument in case of invalid result_status. - // - static gq_built_result - make_built_result (result_status rs, bool warning_success, string message) - { - return {gh_to_conclusion (rs, warning_success), - circle (rs) + ' ' + ucase (to_string (rs)), - move (message)}; - } - - // Parse a check run details URL into a build_id. - // - // Return nullopt if the URL is invalid. - // - static optional - parse_details_url (const string& details_url); - - // Note that GitHub always posts a message to their GUI saying "You have - // successfully requested be rerun", regardless of what - // HTTP status code we respond with. 
However we do return error status codes - // when there is no better option (like failing the conclusion) in case they - // start handling them someday. - // bool ci_github:: - handle_check_run_rerequest (const gh_check_run_event& cr, - bool warning_success) + handle_check_suite_rerequest (gh_check_suite_event cs, bool warning_success) { HANDLER_DIAG; - l3 ([&]{trace << "check_run event { " << cr << " }";}); + l3 ([&]{trace << "check_suite event { " << cs << " }";}); - // The overall plan is as follows: - // - // 1. Load service data. - // - // 2. If the tenant is archived, then fail (re-create) both the check run - // and the conclusion with appropriate diagnostics. - // - // 3. If the check run is in the queued state, then do nothing. - // - // 4. Re-create the check run in the queued state and the conclusion in - // the building state. Note: do in a single request to make sure we - // either "win" or "loose" the potential race for both (important - // for #7). - // - // 5. Call the rebuild() function to attempt to schedule a rebuild. Pass - // the update function that does the following (if called): - // - // a. Save new node ids. - // - // b. Update the check run state (may also not exist). - // - // c. Clear the completed flag if true. - // - // 6. If the result of rebuild() indicates the tenant is archived, then - // fail (update) both the check run and conclusion with appropriate - // diagnostics. - // - // 7. If original state is queued (no rebuild was scheduled), then fail - // (update) both the check run and the conclusion. - // - // Note that while conceptually we are updating existing check runs, in - // practice we have to re-create as new check runs in order to replace the - // existing ones because GitHub does not allow transitioning out of the - // built state. + assert (cs.action == "rerequested"); - // Get a new installation access token. + // While we don't need the installation access token in this request, + // let's obtain it to flush out any permission issues early. Also, it is + // valid for an hour so we will most likely make use of it. // - auto get_iat = [this, &trace, &error, &cr] () - -> optional - { - optional jwt (generate_jwt (cr.check_run.app_id, trace, error)); - if (!jwt) - return nullopt; + optional jwt (generate_jwt (cs.check_suite.app_id, trace, error)); + if (!jwt) + throw server_error (); - optional iat ( - obtain_installation_access_token (cr.installation.id, + optional iat ( + obtain_installation_access_token (cs.installation.id, move (*jwt), error)); + if (!iat) + throw server_error (); - if (iat) - l3 ([&]{trace << "installation_access_token { " << *iat << " }";}); - - return iat; - }; - - const string& repo_node_id (cr.repository.node_id); - const string& head_sha (cr.check_run.check_suite.head_sha); + l3 ([&]{trace << "installation_access_token { " << *iat << " }";}); - // Prepare the build and conclusion check runs. They are sent to GitHub in - // a single request (unless something goes wrong) so store them together - // from the outset. + // Service id that uniquely identifies the CI tenant. // - vector check_runs (2); - check_run& bcr (check_runs[0]); // Build check run - check_run& ccr (check_runs[1]); // Conclusion check run - - bcr.name = cr.check_run.name; - ccr.name = conclusion_check_run_name; - - const gh_installation_access_token* iat (nullptr); - optional new_iat; + string sid (cs.repository.node_id + ':' + cs.check_suite.head_sha); - // Load the service data, failing the check runs if the tenant has been - // archived. 
+ // If the user requests a rebuild of the (entire) PR, then this manifests + // as the check_suite rather than pull_request event. Specifically: // - service_data sd; - string tenant_id; + // - For a local PR, this event is shared with the branch push and all we + // need to do is restart the CI for the head commit. + // + // - For a remote PR, this event will have no gh_check_suite::head_branch. + // In this case we need to load the existing service data for this head + // commit, extract the test merge commit, and restart the CI for that. + // + // Note that it's possible the base branch has moved in the meantime and + // ideally we would want to re-request the test merge commit, etc. + // However, this will only be necessary if the user does not follow our + // recommendation of enabling the head-behind-base protection. And it + // seems all this extra complexity would not be warranted. + // + string check_sha; + service_data::kind_type kind; + + if (!cs.check_suite.head_branch) { - // Service id that uniquely identifies the CI tenant. + // Rebuild of remote PR. // - string sid (repo_node_id + ':' + head_sha); + kind = service_data::remote; if (optional d = find (*build_db_, "ci-github", sid)) { @@ -965,651 +880,736 @@ namespace brep try { - sd = service_data (*ts.data); + service_data sd (*ts.data); + check_sha = move (sd.check_sha); // Test merge commit. } catch (const invalid_argument& e) { fail << "failed to parse service data: " << e; } + } + else + { + error << "check suite " << cs.check_suite.node_id + << " for remote pull request:" + << " re-requested but tenant_service with id " << sid + << " did not exist"; + return true; + } + } + else + { + // Rebuild of branch push or local PR. + // + kind = service_data::local; + check_sha = cs.check_suite.head_sha; + } - if (!sd.conclusion_node_id) - fail << "no conclusion node id for check run " << cr.check_run.node_id; + service_data sd (warning_success, + iat->token, + iat->expires_at, + cs.check_suite.app_id, + cs.installation.id, + move (cs.repository.node_id), + move (cs.repository.clone_url), + kind, false /* pre_check */, true /* re_requested */, + move (check_sha), + move (cs.check_suite.head_sha) /* report_sha */); - tenant_id = d->tenant_id; + // Replace the existing CI tenant if it exists. + // + // Note that GitHub UI does not allow re-running the entire check suite + // until all the check runs are completed. + // - // Get a new IAT if the one from the service data has expired. - // - if (system_clock::now () > sd.installation_access.expires_at) - { - if ((new_iat = get_iat ())) - iat = &*new_iat; - else - throw server_error (); - } - else - iat = &sd.installation_access; + // Create an unloaded CI tenant. + // + // Note: use no delay since we need to (re)create the synthetic conclusion + // check run as soon as possible. + // + // Note that we use the create() API instead of start() since duplicate + // management is not available in start(). + // + // After this call we will start getting the build_unloaded() + // notifications until (1) we load the tenant, (2) we cancel it, or (3) + // it gets archived after some timeout. + // + auto pr (create (error, + warn, + verb_ ? &trace : nullptr, + *build_db_, retry_, + tenant_service (sid, "ci-github", sd.json ()), + chrono::seconds (30) /* interval */, + chrono::seconds (0) /* delay */, + duplicate_tenant_mode::replace)); - if (d->archived) // Tenant is archived - { - // Fail the check runs. 
- // - gq_built_result br ( - make_built_result ( - result_status::error, warning_success, - "Unable to rebuild individual configuration: build has " - "been archived")); + if (!pr) + { + fail << "check suite " << cs.check_suite.node_id + << ": unable to create unloaded CI tenant"; + } - // Try to update the conclusion check run even if the first update - // fails. - // - bool f (false); // Failed. + if (pr->second == duplicate_tenant_result::created) + { + error << "check suite " << cs.check_suite.node_id + << ": re-requested but tenant_service with id " << sid + << " did not exist"; + return true; + } - if (gq_update_check_run (error, bcr, iat->token, - repo_node_id, cr.check_run.node_id, - nullopt /* details_url */, - build_state::built, br)) - { - l3 ([&]{trace << "updated check_run { " << bcr << " }";}); - } - else - { - error << "check_run " << cr.check_run.node_id - << ": unable to update check run"; - f = true; - } + return true; + } - if (gq_update_check_run (error, ccr, iat->token, - repo_node_id, *sd.conclusion_node_id, - nullopt /* details_url */, - build_state::built, move (br))) - { - l3 ([&]{trace << "updated conclusion check_run { " << ccr << " }";}); - } - else - { - error << "check_run " << cr.check_run.node_id - << ": unable to update conclusion check run"; - f = true; - } + bool ci_github:: + handle_check_suite_completed (gh_check_suite_event cs, bool warning_success) + { + // The plans is as follows: + // + // 1. Load the service data. + // + // 2. Verify it is completed. + // + // 3. Verify the check run counts match. + // + // 4. Verify (like in build_built()) that all the check runs are + // completed. + // + // 5. Verify the result matches what GitHub thinks it is. - // Fail the handler if either of the check runs could not be - // updated. - // - if (f) - throw server_error (); + HANDLER_DIAG; - return true; - } - } - else - { - // No such tenant. - // - fail << "check run " << cr.check_run.node_id - << " re-requested but tenant_service with id " << sid - << " does not exist"; - } - } + l3 ([&]{trace << "check_suite event { " << cs << " }";}); - // Fail if it's the conclusion check run that is being re-requested. - // - // @@ TMP When user selects re-run all failed checks we receive multiple - // check_runs, one of which is for the CCR. We then update it with the - // error message, triggering another check_suite(completed) right after - // all of the check_runs(rerequested). + // Service id that uniquely identifies the CI tenant. // - if (cr.check_run.name == conclusion_check_run_name) - { - l3 ([&]{trace << "re-requested conclusion check_run";}); + string sid (cs.repository.node_id + ':' + cs.check_suite.head_sha); - if (!sd.conclusion_node_id) - fail << "no conclusion node id for check run " << cr.check_run.node_id; + // The common log entry subject. + // + string sub ("check suite " + cs.check_suite.node_id + '/' + sid); - gq_built_result br ( - make_built_result (result_status::error, warning_success, - "Conclusion check run cannot be rebuilt")); + // Load the service data. + // + service_data sd; - // Fail (update) the conclusion check run. 
- // - if (gq_update_check_run (error, ccr, iat->token, - repo_node_id, *sd.conclusion_node_id, - nullopt /* details_url */, - build_state::built, move (br))) + if (optional d = find (*build_db_, "ci-github", sid)) + { + try { - l3 ([&]{trace << "updated conclusion check_run { " << ccr << " }";}); + sd = service_data (*d->service.data); } - else + catch (const invalid_argument& e) { - fail << "check run " << cr.check_run.node_id - << ": unable to update conclusion check run " - << *sd.conclusion_node_id; + fail << "failed to parse service data: " << e; } - + } + else + { + error << sub << ": tenant_service does not exist"; return true; } - // Parse the check_run's details_url to extract build id. - // - // While this is a bit hackish, there doesn't seem to be a better way - // (like associating custom data with a check run). Note that the GitHub - // UI only allows rebuilding completed check runs, so the details URL - // should be there. + // Verify the completed flag and the number of check runs. // - optional bid (parse_details_url (cr.check_run.details_url)); - if (!bid) + if (!sd.completed) { - fail << "check run " << cr.check_run.node_id - << ": failed to extract build id from details_url"; + error << sub << " service data complete flag is false"; + return true; } - // Initialize the check run (`bcr`) with state from the service data. + // Received count will be one higher because we don't store the conclusion + // check run. // - { - // Search for the check run in the service data. - // - // Note that we look by name in case node id got replaced by a racing - // re-request (in which case we ignore this request). - // - auto i (find_if (sd.check_runs.begin (), sd.check_runs.end (), - [&cr] (const check_run& scr) - { - return scr.name == cr.check_run.name; - })); + size_t check_runs_count (sd.check_runs.size () + 1); - if (i == sd.check_runs.end ()) - fail << "check_run " << cr.check_run.node_id - << " (" << cr.check_run.name << "): " - << "re-requested but does not exist in service data"; + if (check_runs_count == 1) + { + error << sub << ": no check runs in service data"; + return true; + } - // Do nothing if node ids don't match. - // - if (i->node_id && *i->node_id != cr.check_run.node_id) + if (cs.check_suite.check_runs_count != check_runs_count) + { + error << sub << ": check runs count " << cs.check_suite.check_runs_count + << " does not match service data count " << check_runs_count; + return true; + } + + // Verify that all the check runs are built and compute the summary + // conclusion. + // + result_status conclusion (result_status::success); + + for (const check_run& cr: sd.check_runs) + { + if (cr.state == build_state::built) { - l3 ([&]{trace << "check_run " << cr.check_run.node_id - << " (" << cr.check_run.name << "): " - << "node id has changed in service data";}); - return true; + assert (cr.status.has_value ()); + conclusion |= *cr.status; } - - // Do nothing if the build is already queued. - // - if (i->state == build_state::queued) + else { - l3 ([&]{trace << "ignoring already-queued check run";}); + error << sub << ": unbuilt check run in service data"; return true; } - - bcr.name = i->name; - bcr.build_id = i->build_id; - bcr.state = i->state; } - // Transition the build and conclusion check runs out of the built state - // (or any other state) by re-creating them. + // Verify the conclusion. 
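
// Aside: a rough, self-contained model of the aggregation performed by the
// |= above -- the summary conclusion is the most severe status among the
// built check runs. The enum is simplified to three values for
// illustration; the real result_status has more states and its operator is
// defined elsewhere.
//
#include <algorithm>
#include <vector>

enum class example_status {success, warning, error}; // Ordered by severity.

static example_status
example_summary_conclusion (const std::vector<example_status>& check_runs)
{
  example_status r (example_status::success);

  for (example_status s: check_runs)
    r = std::max (r, s); // A worse status overrides a better one.

  return r;
}
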
// - bcr.state = build_state::queued; - bcr.state_synced = false; - bcr.details_url = cr.check_run.details_url; - - ccr.state = build_state::building; - ccr.state_synced = false; - - if (gq_create_check_runs (error, check_runs, iat->token, - repo_node_id, head_sha)) - { - assert (bcr.state == build_state::queued); - assert (ccr.state == build_state::building); - - l3 ([&]{trace << "created check_run { " << bcr << " }";}); - l3 ([&]{trace << "created conclusion check_run { " << ccr << " }";}); - } - else + if (!cs.check_suite.conclusion) { - fail << "check run " << cr.check_run.node_id - << ": unable to re-create build and conclusion check runs"; + error << sub << ": absent conclusion in completed check suite"; + return true; } - // Request the rebuild and update service data. + // Note that the case mismatch is due to GraphQL (gh_conclusion()) + // requiring uppercase conclusion values while the received webhook values + // are lower case. // - bool race (false); + string gh_conclusion (gh_to_conclusion (conclusion, warning_success)); - // Callback function called by rebuild() to update the service data (but - // only if the build is actually restarted). - // - auto update_sd = [&error, &new_iat, &race, - tenant_id = move (tenant_id), - &cr, &bcr, &ccr] (const string& ti, - const tenant_service& ts, - build_state) -> optional + if (icasecmp (*cs.check_suite.conclusion, gh_conclusion) != 0) { - // NOTE: this lambda may be called repeatedly (e.g., due to transaction - // being aborted) and so should not move out of its captures. - - race = false; // Reset. + error << sub << ": conclusion " << *cs.check_suite.conclusion + << " does not match service data conclusion " << gh_conclusion; + return true; + } - if (tenant_id != ti) - { - // The tenant got replaced since we loaded it but we managed to - // trigger a rebuild in the new tenant. Who knows whose check runs are - // visible, so let's fail ours similar to the cases below. - // - race = true; - return nullopt; - } + return true; + } - service_data sd; - try - { - sd = service_data (*ts.data); - } - catch (const invalid_argument& e) - { - error << "failed to parse service data: " << e; - return nullopt; - } + // Return the colored circle corresponding to a result_status. + // + static string + circle (result_status rs) + { + switch (rs) + { + case result_status::success: return "\U0001F7E2"; // Green circle. + case result_status::warning: return "\U0001F7E0"; // Orange circle. + case result_status::error: + case result_status::abort: + case result_status::abnormal: return "\U0001F534"; // Red circle. - // Note that we again look by name in case node id got replaced by a - // racing re-request. In this case, however, it's impossible to decide - // who won that race, so let's fail the check suite to be on the safe - // side (in a sense, similar to the rebuild() returning queued below). + // Valid values we should never encounter. 
// - auto i (find_if ( - sd.check_runs.begin (), sd.check_runs.end (), - [&cr] (const check_run& scr) - { - return scr.name == cr.check_run.name; - })); - - if (i == sd.check_runs.end ()) - { - error << "check_run " << cr.check_run.node_id - << " (" << cr.check_run.name << "): " - << "re-requested but does not exist in service data"; - return nullopt; - } + case result_status::skip: + case result_status::interrupt: + throw invalid_argument ("unexpected result_status value: " + + to_string (rs)); + } - if (i->node_id && *i->node_id != cr.check_run.node_id) - { - // Keep the old conclusion node id to make sure any further state - // transitions are ignored. A bit of a hack. - // - race = true; - return nullopt; - } + return ""; // Should never reach. + } - *i = bcr; // Update with new node_id, state, state_synced. + // Make a check run summary from a CI start_result. + // + static string + to_check_run_summary (const optional& r) + { + string s; - sd.conclusion_node_id = ccr.node_id; - sd.completed = false; + s = "```\n"; + if (r) s += r->message; + else s += "Internal service error"; + s += "\n```"; - // Save the IAT if we created a new one. - // - if (new_iat) - sd.installation_access = *new_iat; + return s; + } - return sd.json (); - }; + // Create a gq_built_result. + // + // Throw invalid_argument in case of invalid result_status. + // + static gq_built_result + make_built_result (result_status rs, bool warning_success, string message) + { + return {gh_to_conclusion (rs, warning_success), + circle (rs) + ' ' + ucase (to_string (rs)), + move (message)}; + } - optional bs (rebuild (*build_db_, retry_, *bid, update_sd)); + // Parse a check run details URL into a build_id. + // + // Return nullopt if the URL is invalid. + // + static optional + parse_details_url (const string& details_url); - // If the build has been archived or re-enqueued since we loaded the - // service data, fail (by updating) both the build check run and the - // conclusion check run. Otherwise the build has been successfully - // re-enqueued so do nothing further. - // - if (!race && bs && *bs != build_state::queued) - return true; + // Note that GitHub always posts a message to their GUI saying "You have + // successfully requested be rerun", regardless of what + // HTTP status code we respond with. However we do return error status codes + // when there is no better option (like failing the conclusion) in case they + // start handling them someday. + // + bool ci_github:: + handle_check_run_rerequest (const gh_check_run_event& cr, + bool warning_success) + { + HANDLER_DIAG; - gq_built_result br; // Built result for both check runs. + l3 ([&]{trace << "check_run event { " << cr << " }";}); - if (race || bs) // Race or re-enqueued. - { - // The re-enqueued case: this build has been re-enqueued since we first - // loaded the service data. This could happen if the user clicked - // "re-run" multiple times and another handler won the rebuild() race. - // - // However the winner of the check runs race cannot be determined. - // - // Best case the other handler won the check runs race as well and - // thus everything will proceed normally. Our check runs will be - // invisible and disregarded. - // - // Worst case we won the check runs race and the other handler's check - // runs -- the ones that will be updated by the build_*() notifications - // -- are no longer visible, leaving things quite broken. - // - // Either way, we fail our check runs. 
In the best case scenario it - // will have no effect; in the worst case scenario it lets the user - // know something has gone wrong. - // - br = make_built_result (result_status::error, warning_success, - "Unable to rebuild, try again"); - } - else // Archived. - { - // The build has expired since we loaded the service data. Most likely - // the tenant has been archived. - // - br = make_built_result ( - result_status::error, warning_success, - "Unable to rebuild individual configuration: build has been archived"); - } - - // Try to update the conclusion check run even if the first update fails. + // The overall plan is as follows: // - bool f (false); // Failed. - - // Fail the build check run. + // 1. Load service data. // - if (gq_update_check_run (error, bcr, iat->token, - repo_node_id, *bcr.node_id, - nullopt /* details_url */, - build_state::built, br)) - { - l3 ([&]{trace << "updated check_run { " << bcr << " }";}); - } - else - { - error << "check run " << cr.check_run.node_id - << ": unable to update (replacement) check run " - << *bcr.node_id; - f = true; - } + // 2. If the tenant is archived, then fail (re-create) both the check run + // and the conclusion with appropriate diagnostics. + // + // 3. If the check run is in the queued state, then do nothing. + // + // 4. Re-create the check run in the queued state and the conclusion in + // the building state. Note: do in a single request to make sure we + // either "win" or "loose" the potential race for both (important + // for #7). + // + // 5. Call the rebuild() function to attempt to schedule a rebuild. Pass + // the update function that does the following (if called): + // + // a. Save new node ids. + // + // b. Update the check run state (may also not exist). + // + // c. Clear the completed flag if true. + // + // 6. If the result of rebuild() indicates the tenant is archived, then + // fail (update) both the check run and conclusion with appropriate + // diagnostics. + // + // 7. If original state is queued (no rebuild was scheduled), then fail + // (update) both the check run and the conclusion. + // + // Note that while conceptually we are updating existing check runs, in + // practice we have to re-create as new check runs in order to replace the + // existing ones because GitHub does not allow transitioning out of the + // built state. - // Fail the conclusion check run. + // Get a new installation access token. // - if (gq_update_check_run (error, ccr, iat->token, - repo_node_id, *ccr.node_id, - nullopt /* details_url */, - build_state::built, move (br))) - { - l3 ([&]{trace << "updated conclusion check_run { " << ccr << " }";}); - } - else + auto get_iat = [this, &trace, &error, &cr] () + -> optional { - error << "check run " << cr.check_run.node_id - << ": unable to update conclusion check run " << *ccr.node_id; - f = true; - } + optional jwt (generate_jwt (cr.check_run.app_id, trace, error)); + if (!jwt) + return nullopt; - // Fail the handler if either of the check runs could not be updated. - // - if (f) - throw server_error (); + optional iat ( + obtain_installation_access_token (cr.installation.id, + move (*jwt), + error)); - return true; - } + if (iat) + l3 ([&]{trace << "installation_access_token { " << *iat << " }";}); - // Miscellaneous pull request facts - // - // - Although some of the GitHub documentation makes it sound like they - // expect check runs to be added to both the PR head commit and the merge - // commit, the PR UI does not react to the merge commit's check runs - // consistently. 
It actually seems to be quite broken. The only thing it - // does seem to do reliably is blocking the PR merge if the merge commit's - // check runs are not successful (i.e, overriding the PR head commit's - // check runs). But the UI looks quite messed up generally in this state. - // - // - When new commits are added to a PR base branch, pull_request.base.sha - // does not change, but the test merge commit will be updated to include - // the new commits to the base branch. - // - // - When new commits are added to a PR head branch, pull_request.head.sha - // gets updated with the head commit's SHA and check_suite.pull_requests[] - // will contain all PRs with this branch as head. - // - bool ci_github:: - handle_pull_request (gh_pull_request_event pr, bool warning_success) - { - HANDLER_DIAG; + return iat; + }; - l3 ([&]{trace << "pull_request event { " << pr << " }";}); + const string& repo_node_id (cr.repository.node_id); + const string& head_sha (cr.check_run.check_suite.head_sha); - // While we don't need the installation access token in this request, - // let's obtain it to flush out any permission issues early. Also, it is - // valid for an hour so we will most likely make use of it. + // Prepare the build and conclusion check runs. They are sent to GitHub in + // a single request (unless something goes wrong) so store them together + // from the outset. // - optional jwt (generate_jwt (pr.pull_request.app_id, trace, error)); - if (!jwt) - throw server_error (); - - optional iat ( - obtain_installation_access_token (pr.installation.id, - move (*jwt), - error)); - if (!iat) - throw server_error (); + vector check_runs (2); + check_run& bcr (check_runs[0]); // Build check run + check_run& ccr (check_runs[1]); // Conclusion check run - l3 ([&]{trace << "installation_access_token { " << *iat << " }";}); + bcr.name = cr.check_run.name; + ccr.name = conclusion_check_run_name; - // Distinguish between local and remote PRs by comparing the head and base - // repositories' paths. - // - service_data::kind_type kind ( - pr.pull_request.head_path == pr.pull_request.base_path - ? service_data::local - : service_data::remote); + const gh_installation_access_token* iat (nullptr); + optional new_iat; - // Note that similar to the branch push case above, while it would have - // been nice to cancel the previous CI job once the PR head moves (the - // "synchronize" event), due to the head sharing problem the previous CI - // job might actually still be relevant (in both local and remote PR - // cases). So we only do it for the remote PRs and only if the head is not - // shared (via tenant reference counting). + // Load the service data, failing the check runs if the tenant has been + // archived. // - if (kind == service_data::remote && pr.action == "synchronize") + service_data sd; + string tenant_id; { - if (pr.before) + // Service id that uniquely identifies the CI tenant. + // + string sid (repo_node_id + ':' + head_sha); + + if (optional d = find (*build_db_, "ci-github", sid)) { - // Service id that will uniquely identify the CI tenant. - // - string sid (pr.repository.node_id + ':' + *pr.before); + tenant_service& ts (d->service); - if (optional ts = cancel (error, warn, - verb_ ? 
&trace : nullptr, - *build_db_, retry_, - "ci-github", sid, - true /* ref_count */)) + try { - l3 ([&]{trace << "pull request " << pr.pull_request.node_id - << ": attempted to cancel CI of previous head commit" - << " (ref_count: " << ts->ref_count << ')';}); + sd = service_data (*ts.data); + } + catch (const invalid_argument& e) + { + fail << "failed to parse service data: " << e; + } + + if (!sd.conclusion_node_id) + fail << "no conclusion node id for check run " << cr.check_run.node_id; + + tenant_id = d->tenant_id; + + // Get a new IAT if the one from the service data has expired. + // + if (system_clock::now () > sd.installation_access.expires_at) + { + if ((new_iat = get_iat ())) + iat = &*new_iat; + else + throw server_error (); } else + iat = &sd.installation_access; + + if (d->archived) // Tenant is archived { - // It's possible that there was no CI for the previous commit for - // various reasons (e.g., CI was not enabled). + // Fail the check runs. // - l3 ([&]{trace << "pull request " << pr.pull_request.node_id - << ": failed to cancel CI of previous head commit " - << "with tenant_service id " << sid;}); + gq_built_result br ( + make_built_result ( + result_status::error, warning_success, + "Unable to rebuild individual configuration: build has " + "been archived")); + + // Try to update the conclusion check run even if the first update + // fails. + // + bool f (false); // Failed. + + if (gq_update_check_run (error, bcr, iat->token, + repo_node_id, cr.check_run.node_id, + nullopt /* details_url */, + build_state::built, br)) + { + l3 ([&]{trace << "updated check_run { " << bcr << " }";}); + } + else + { + error << "check_run " << cr.check_run.node_id + << ": unable to update check run"; + f = true; + } + + if (gq_update_check_run (error, ccr, iat->token, + repo_node_id, *sd.conclusion_node_id, + nullopt /* details_url */, + build_state::built, move (br))) + { + l3 ([&]{trace << "updated conclusion check_run { " << ccr << " }";}); + } + else + { + error << "check_run " << cr.check_run.node_id + << ": unable to update conclusion check run"; + f = true; + } + + // Fail the handler if either of the check runs could not be + // updated. + // + if (f) + throw server_error (); + + return true; } } else { - error << "pull request " << pr.pull_request.node_id - << ": before commit is missing in synchronize event"; + // No such tenant. + // + fail << "check run " << cr.check_run.node_id + << " re-requested but tenant_service with id " << sid + << " does not exist"; } } - // Note: for remote PRs the check_sha will be set later, in - // build_unloaded_pre_check(), to test merge commit id. + // Fail if it's the conclusion check run that is being re-requested. // - string check_sha (kind == service_data::local - ? pr.pull_request.head_sha - : ""); + // @@ TMP When user selects re-run all failed checks we receive multiple + // check_runs, one of which is for the CCR. We then update it with the + // error message, triggering another check_suite(completed) right after + // all of the check_runs(rerequested). + // + if (cr.check_run.name == conclusion_check_run_name) + { + l3 ([&]{trace << "re-requested conclusion check_run";}); - // Note that PR rebuilds (re-requested) are handled by - // handle_check_suite_rerequest(). 
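
// Aside: a simplified, standalone version of the token reuse/refresh
// pattern above (hypothetical types; mint stands in for the JWT-to-
// installation-token exchange done by get_iat()): use the token saved in
// the service data unless it has expired, otherwise obtain a fresh one and
// remember it so that it can be written back to the service data later.
//
#include <chrono>
#include <functional>
#include <optional>
#include <string>

struct example_token
{
  std::string value;
  std::chrono::system_clock::time_point expires_at;
};

static const example_token&
example_fresh_token (const example_token& saved,
                     std::optional<example_token>& refreshed,
                     const std::function<example_token ()>& mint)
{
  if (std::chrono::system_clock::now () <= saved.expires_at)
    return saved; // Saved token is still valid.

  refreshed = mint ();
  return *refreshed;
}
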
+ if (!sd.conclusion_node_id) + fail << "no conclusion node id for check run " << cr.check_run.node_id; + + gq_built_result br ( + make_built_result (result_status::error, warning_success, + "Conclusion check run cannot be rebuilt")); + + // Fail (update) the conclusion check run. + // + if (gq_update_check_run (error, ccr, iat->token, + repo_node_id, *sd.conclusion_node_id, + nullopt /* details_url */, + build_state::built, move (br))) + { + l3 ([&]{trace << "updated conclusion check_run { " << ccr << " }";}); + } + else + { + fail << "check run " << cr.check_run.node_id + << ": unable to update conclusion check run " + << *sd.conclusion_node_id; + } + + return true; + } + + // Parse the check_run's details_url to extract build id. // - // Note that, in the case of a remote PR, GitHub will copy the PR head - // commit from the head (forked) repository into the base repository. So - // the check runs must always be added to the base repository, whether the - // PR is local or remote. The head commit refs are located at - // refs/pull//head. + // While this is a bit hackish, there doesn't seem to be a better way + // (like associating custom data with a check run). Note that the GitHub + // UI only allows rebuilding completed check runs, so the details URL + // should be there. // - service_data sd (warning_success, - move (iat->token), - iat->expires_at, - pr.pull_request.app_id, - pr.installation.id, - move (pr.repository.node_id), - move (pr.repository.clone_url), - kind, true /* pre_check */, false /* re_request */, - move (check_sha), - move (pr.pull_request.head_sha) /* report_sha */, - pr.pull_request.node_id, - pr.pull_request.number); + optional bid (parse_details_url (cr.check_run.details_url)); + if (!bid) + { + fail << "check run " << cr.check_run.node_id + << ": failed to extract build id from details_url"; + } - // Create an unloaded CI tenant for the pre-check phase (during which we - // wait for the PR's merge commit and behindness to become available). + // Initialize the check run (`bcr`) with state from the service data. // - // Create with an empty service id so that the generated tenant id is used - // instead during the pre-check phase (so as not to clash with a proper - // service id for this head commit, potentially created in - // handle_branch_push() or as another PR). + { + // Search for the check run in the service data. + // + // Note that we look by name in case node id got replaced by a racing + // re-request (in which case we ignore this request). + // + auto i (find_if (sd.check_runs.begin (), sd.check_runs.end (), + [&cr] (const check_run& scr) + { + return scr.name == cr.check_run.name; + })); + + if (i == sd.check_runs.end ()) + fail << "check_run " << cr.check_run.node_id + << " (" << cr.check_run.name << "): " + << "re-requested but does not exist in service data"; + + // Do nothing if node ids don't match. + // + if (i->node_id && *i->node_id != cr.check_run.node_id) + { + l3 ([&]{trace << "check_run " << cr.check_run.node_id + << " (" << cr.check_run.name << "): " + << "node id has changed in service data";}); + return true; + } + + // Do nothing if the build is already queued. + // + if (i->state == build_state::queued) + { + l3 ([&]{trace << "ignoring already-queued check run";}); + return true; + } + + bcr.name = i->name; + bcr.build_id = i->build_id; + bcr.state = i->state; + } + + // Transition the build and conclusion check runs out of the built state + // (or any other state) by re-creating them. 
// - tenant_service ts ("", "ci-github", sd.json ()); + bcr.state = build_state::queued; + bcr.state_synced = false; + bcr.details_url = cr.check_run.details_url; + + ccr.state = build_state::building; + ccr.state_synced = false; + + if (gq_create_check_runs (error, check_runs, iat->token, + repo_node_id, head_sha)) + { + assert (bcr.state == build_state::queued); + assert (ccr.state == build_state::building); + + l3 ([&]{trace << "created check_run { " << bcr << " }";}); + l3 ([&]{trace << "created conclusion check_run { " << ccr << " }";}); + } + else + { + fail << "check run " << cr.check_run.node_id + << ": unable to re-create build and conclusion check runs"; + } - // Note: use no delay since we need to start the actual CI (which in turn - // (re)creates the synthetic conclusion check run) as soon as possible. + // Request the rebuild and update service data. // - // After this call we will start getting the build_unloaded() - // notifications -- which will be routed to build_unloaded_pre_check() -- - // until we cancel the tenant or it gets archived after some timeout. - // (Note that we never actually load this request, we always cancel it; - // see build_unloaded_pre_check() for details.) + bool race (false); + + // Callback function called by rebuild() to update the service data (but + // only if the build is actually restarted). // - if (!create (error, - warn, - verb_ ? &trace : nullptr, - *build_db_, retry_, - move (ts), - chrono::seconds (30) /* interval */, - chrono::seconds (0) /* delay */)) + auto update_sd = [&error, &new_iat, &race, + tenant_id = move (tenant_id), + &cr, &bcr, &ccr] (const string& ti, + const tenant_service& ts, + build_state) -> optional { - fail << "pull request " << pr.pull_request.node_id - << ": unable to create unloaded pre-check tenant"; - } + // NOTE: this lambda may be called repeatedly (e.g., due to transaction + // being aborted) and so should not move out of its captures. - return true; - } + race = false; // Reset. - bool ci_github:: - handle_branch_push (gh_push_event ps, bool warning_success) - { - HANDLER_DIAG; + if (tenant_id != ti) + { + // The tenant got replaced since we loaded it but we managed to + // trigger a rebuild in the new tenant. Who knows whose check runs are + // visible, so let's fail ours similar to the cases below. + // + race = true; + return nullopt; + } - l3 ([&]{trace << "push event { " << ps << " }";}); + service_data sd; + try + { + sd = service_data (*ts.data); + } + catch (const invalid_argument& e) + { + error << "failed to parse service data: " << e; + return nullopt; + } - // Cancel the CI tenant associated with the overwritten/deleted previous - // head commit if this is a forced push or a branch deletion. - // - if (ps.forced || ps.deleted) - { - // Service id that will uniquely identify the CI tenant. + // Note that we again look by name in case node id got replaced by a + // racing re-request. In this case, however, it's impossible to decide + // who won that race, so let's fail the check suite to be on the safe + // side (in a sense, similar to the rebuild() returning queued below). // - string sid (ps.repository.node_id + ':' + ps.before); + auto i (find_if ( + sd.check_runs.begin (), sd.check_runs.end (), + [&cr] (const check_run& scr) + { + return scr.name == cr.check_run.name; + })); - // Note that it's possible this commit still exists in another branch so - // we do refcount-aware cancel. - // - if (optional ts = cancel (error, warn, - verb_ ? 
&trace : nullptr, - *build_db_, retry_, - "ci-github", sid, - true /* ref_count */)) + if (i == sd.check_runs.end ()) { - l3 ([&]{trace << (ps.forced ? "forced push " + ps.after + " to " - : "deletion of ") - << ps.ref << ": attempted to cancel CI of previous" - << " head commit with tenant_service id " << sid - << " (ref_count: " << ts->ref_count << ')';}); + error << "check_run " << cr.check_run.node_id + << " (" << cr.check_run.name << "): " + << "re-requested but does not exist in service data"; + return nullopt; } - else + + if (i->node_id && *i->node_id != cr.check_run.node_id) { - // It's possible that there was no CI for the previous commit for - // various reasons (e.g., CI was not enabled). + // Keep the old conclusion node id to make sure any further state + // transitions are ignored. A bit of a hack. // - l3 ([&]{trace << (ps.forced ? "forced push " + ps.after + " to " - : "deletion of ") - << ps.ref << ": failed to cancel CI of previous" - << " head commit with tenant_service id " << sid;}); + race = true; + return nullopt; } - } - if (ps.deleted) - return true; // Do nothing further if this was a branch deletion. + *i = bcr; // Update with new node_id, state, state_synced. - // While we don't need the installation access token in this request, - // let's obtain it to flush out any permission issues early. Also, it is - // valid for an hour so we will most likely make use of it. - // - optional jwt (generate_jwt (ps.app_id, trace, error)); - if (!jwt) - throw server_error (); + sd.conclusion_node_id = ccr.node_id; + sd.completed = false; - optional iat ( - obtain_installation_access_token (ps.installation.id, - move (*jwt), - error)); - if (!iat) - throw server_error (); + // Save the IAT if we created a new one. + // + if (new_iat) + sd.installation_access = *new_iat; - l3 ([&]{trace << "installation_access_token { " << *iat << " }";}); + return sd.json (); + }; - // While it would have been nice to cancel CIs of PRs with this branch as - // base not to waste resources, there are complications: Firstly, we can - // only do this for remote PRs (since local PRs will most likely share the - // result with branch push). Secondly, we try to do our best even if the - // branch protection rule for head behind is not enabled. In this case, it - // would be good to complete the CI. So maybe/later. See also the head - // case in handle_pull_request(), where we do cancel remote PRs that are - // not shared. + optional bs (rebuild (*build_db_, retry_, *bid, update_sd)); - // Service id that uniquely identifies the CI tenant. + // If the build has been archived or re-enqueued since we loaded the + // service data, fail (by updating) both the build check run and the + // conclusion check run. Otherwise the build has been successfully + // re-enqueued so do nothing further. // - string sid (ps.repository.node_id + ':' + ps.after); + if (!race && bs && *bs != build_state::queued) + return true; - service_data sd (warning_success, - iat->token, - iat->expires_at, - ps.app_id, - ps.installation.id, - move (ps.repository.node_id), - move (ps.repository.clone_url), - service_data::local, - false /* pre_check */, - false /* re_requested */, - ps.after /* check_sha */, - ps.after /* report_sha */); + gq_built_result br; // Built result for both check runs. - // Create an unloaded CI tenant, doing nothing if one already exists - // (which could've been created by handle_pull_request() or by us as a - // result of a push to another branch). 
Note that the tenant's reference - // count is incremented in all cases. - // - // Note: use no delay since we need to (re)create the synthetic conclusion - // check run as soon as possible. + if (race || bs) // Race or re-enqueued. + { + // The re-enqueued case: this build has been re-enqueued since we first + // loaded the service data. This could happen if the user clicked + // "re-run" multiple times and another handler won the rebuild() race. + // + // However the winner of the check runs race cannot be determined. + // + // Best case the other handler won the check runs race as well and + // thus everything will proceed normally. Our check runs will be + // invisible and disregarded. + // + // Worst case we won the check runs race and the other handler's check + // runs -- the ones that will be updated by the build_*() notifications + // -- are no longer visible, leaving things quite broken. + // + // Either way, we fail our check runs. In the best case scenario it + // will have no effect; in the worst case scenario it lets the user + // know something has gone wrong. + // + br = make_built_result (result_status::error, warning_success, + "Unable to rebuild, try again"); + } + else // Archived. + { + // The build has expired since we loaded the service data. Most likely + // the tenant has been archived. + // + br = make_built_result ( + result_status::error, warning_success, + "Unable to rebuild individual configuration: build has been archived"); + } + + // Try to update the conclusion check run even if the first update fails. // - // Note that we use the create() API instead of start() since duplicate - // management is not available in start(). + bool f (false); // Failed. + + // Fail the build check run. // - // After this call we will start getting the build_unloaded() - // notifications until (1) we load the tenant, (2) we cancel it, or (3) - // it gets archived after some timeout. + if (gq_update_check_run (error, bcr, iat->token, + repo_node_id, *bcr.node_id, + nullopt /* details_url */, + build_state::built, br)) + { + l3 ([&]{trace << "updated check_run { " << bcr << " }";}); + } + else + { + error << "check run " << cr.check_run.node_id + << ": unable to update (replacement) check run " + << *bcr.node_id; + f = true; + } + + // Fail the conclusion check run. // - if (!create (error, warn, verb_ ? &trace : nullptr, - *build_db_, retry_, - tenant_service (sid, "ci-github", sd.json ()), - chrono::seconds (30) /* interval */, - chrono::seconds (0) /* delay */, - duplicate_tenant_mode::ignore)) + if (gq_update_check_run (error, ccr, iat->token, + repo_node_id, *ccr.node_id, + nullopt /* details_url */, + build_state::built, move (br))) { - fail << "push " + ps.after + " to " + ps.ref - << ": unable to create unloaded CI tenant"; + l3 ([&]{trace << "updated conclusion check_run { " << ccr << " }";}); + } + else + { + error << "check run " << cr.check_run.node_id + << ": unable to update conclusion check run " << *ccr.node_id; + f = true; } + // Fail the handler if either of the check runs could not be updated. + // + if (f) + throw server_error (); + return true; } -- cgit v1.1