From ee468f2996225ee497a245b5fb22cad40895c355 Mon Sep 17 00:00:00 2001
From: Francois Kritzinger <francois@codesynthesis.com>
Date: Wed, 17 Apr 2024 08:46:00 +0200
Subject: build_building()

---
 mod/mod-ci-github.cxx | 301 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 294 insertions(+), 7 deletions(-)
diff --git a/mod/mod-ci-github.cxx b/mod/mod-ci-github.cxx
index 9ffc2d2..22ba205 100644
--- a/mod/mod-ci-github.cxx
+++ b/mod/mod-ci-github.cxx
@@ -1508,14 +1508,301 @@ namespace brep
                   const diag_epilogue& log_writer) const noexcept
   {
     // Note that we may receive this notification before the corresponding
-    // check run object has been persistent in the service data (see the
-    // returned by build_queued() lambda for details). So we may try to create
-    // a new check run but find out that it's actually already exist on the
-    // GitHub side. In this case we will somehow need to get the node_id for
-    // the existing check run and re-try but this time updating instead of
-    // creating.
+    // check run object has been persisted in the service data (see the
+    // returned by build_queued() lambda for details). Thus we wouldn't know
+    // whether the check run has been created on GitHub yet or not. And given
+    // that, on GitHub, creating a check run with an existent name does not
+    // fail but instead replaces the existing check run (same name, different
+    // node ID), we have to check whether a check run already exists on GitHub
+    // before creating it.
+    //
+    // @@ TMP Will have to do this in build_queued() as well.
+    //
+    NOTIFICATION_DIAG (log_writer);
 
-    return nullptr;
+    service_data sd;
+    try
+    {
+      sd = service_data (*ts.data);
+    }
+    catch (const invalid_argument& e)
+    {
+      error << "failed to parse service data: " << e;
+      return nullptr;
+    }
+
+    service_data::check_run cr; // Updated check run.
+
+    // Create a new check run on GitHub.
+    //
+    auto create = [&cr, &b, &hs, &sd, &error] (const string& iat)
+    {
+      string rq (graphql_request (create_check_runs (sd.repository_id,
+                                                     sd.head_sha,
+                                                     {b},
+                                                     build_state::building,
+                                                     &hs)));
+
+      return mutate_check_run (cr,
+                               {b},
+                               iat,
+                               move (rq),
+                               build_state::building,
+                               error);
+    };
+
+    // Update a check run that already exists on GitHub.
+    //
+    auto update =
+        [&cr, &b, &sd, &error] (const string& iat,
+                                const string& nid,
+                                build_state st = build_state::building)
+    {
+      string rq (graphql_request (
+          update_check_run (sd.repository_id, nid, build_state::building)));
+
+      return mutate_check_run (cr, {b}, iat, move (rq), st, error);
+    };
+
+    // Get a new installation access token if the current one has expired.
+    //
+    const installation_access_token* iat (nullptr);
+    optional<installation_access_token> new_iat;
+
+    if (system_clock::now () > sd.installation_access.expires_at)
+    {
+      if (optional<string> jwt = generate_jwt (trace, error))
+      {
+        new_iat = obtain_installation_access_token (sd.installation_id,
+                                                    move (*jwt),
+                                                    error);
+        if (new_iat)
+          iat = &*new_iat;
+      }
+    }
+    else
+      iat = &sd.installation_access;
+
+    if (iat != nullptr)
+    {
+      string bid (check_run_name (b)); // Full Build ID.
+
+      // Stored check run.
+      //
+      const service_data::check_run* scr (sd.find_check_run (bid));
+
+      if (scr != nullptr && scr->node_id)
+      {
+        // The check run exists on GitHub and in the persisted service data.
+        //
+        if (!scr->state || *scr->state == build_state::queued)
+        {
+          // If the stored state is queued (most likely), at least
+          // build_queued() and its lambda has run. If this is all then GitHub
+          // has the queued status. But it's also possible that build_built()
+          // has also run and updated GitHub to built but its lambda has not
+          // yet run.
+          //
+          // On the other hand, if there is no stored state, both of the other
+          // notifications must have run: the first created the check run on
+          // GitHub and stored the node ID; the second failed to update GitHub
+          // and stored the nullopt state. Either way around, build_built()
+          // has run so built is the correct state.
+          //
+          //   @@ TMP Maybe we should always store the state and add a flag
+          //          like "gh_updated", then we would know for sure whether
+          //          we need to update to built or building.
+          //
+          // So GitHub currently either has queued or built but we can't be
+          // sure which.
+          //
+          // If it has queued, updating to building is correct.
+          //
+          // If it has built then it would be a logical mistake to update to
+          // building. However, GitHub ignores updates from built to any other
+          // state (the REST API responds with HTTP 200 and the full check run
+          // JSON body but with the "completed" status; presumably the GraphQL
+          // API has the same semantics) so we can just try to update to
+          // building and see what the actual status it returns is.
+          //
+          // If scr->state is nullopt then GitHub has either queued or built
+          // but we know build_built() has run so we need to update to built
+          // instead of building.
+          //
+          cr = move (*scr);
+          cr.state = nullopt;
+
+          build_state st (scr->state ? build_state::building
+                                     : build_state::built);
+
+          if (update (iat->token, *cr.node_id, st))
+          {
+            // @@ TODO If !scr->state and GH had built then we probably don't
+            //         want to run the lambda either but currently it will run
+            //         and this update message is not accurate. Is stored
+            //         failed state the only way?
+            //
+            if (cr.state == st)
+              l3 ([&]{trace << "updated check_run { " << cr << " }";});
+            else
+            {
+              // Do not persist anything if state was already built on
+              // GitHub.
+              //
+              assert (cr.state == build_state::built);
+
+              return nullptr;
+            }
+          }
+        }
+        else if (*scr->state == build_state::built)
+        {
+          // Ignore out of order built notification.
+          //
+          assert (*scr->state == build_state::built);
+
+          warn << *scr << ": "
+               << "out of order transition from "
+               << to_string (build_state::queued) << " to "
+               << to_string (build_state::building) <<
+            " (stored state is " << to_string (build_state::built) << ")";
+
+          return nullptr;
+        }
+      }
+      else // (src == nullptr || !scr->node_id)
+      {
+        // No state has been persisted, or one or both of the other
+        // notifications were unable to create the check run on GitHub.
+        //
+        // It's also possible that another notification has since created the
+        // check run on GitHub but its lambda just hasn't run yet.
+        //
+        // Thus the check run may or may not exist on GitHub.
+        //
+        // Because creation destroys check runs with the same name (see
+        // comments at top of function) we have to check whether the check run
+        // exists on GitHub before we can do anything.
+        //
+        // Destructive creation would be catastrophic if, for example, our
+        // "new" building check run replaced the existing built one because it
+        // would look exactly like a transition from built back to building in
+        // the GitHub UI. And then the built lambda could run after our
+        // building lambda, creating an opportunity for the real node ID to be
+        // overwritten with the old one.
+        //
+        cr.build_id = move (bid);
+
+        // Fetch the check run by name from GitHub.
+        //
+        pair<optional<check_run>, bool> pr (
+            fetch_check_run (iat->token,
+                             ts.id,
+                             check_run_name (b, &hs),
+                             error));
+
+        if (pr.second) // No errors.
+        {
+          if (!pr.first) // Check run does not exist on GitHub.
+          {
+            // Assume the most probable cases: build_queued() failed to create
+            // the check run or build_building() is running before
+            // build_queued(), so creating with building state is
+            // correct. (The least likely being that build_built() ran before
+            // this, in which case we should create with the built state.)
+            //
+            // @@ TODO Create with whatever the failed state was if we decide
+            //         to store it.
+            //
+            if (create (iat->token))
+              l3 ([&]{trace << "created check_run { " << cr << " }";});
+          }
+          else // Check run exists on GitHub.
+          {
+            if (pr.first->status == to_string_gh (build_state::queued))
+            {
+              if (scr != nullptr)
+              {
+                cr = move (*scr);
+                cr.state = nullopt;
+              }
+
+              if (update (iat->token, pr.first->node_id))
+                l3 ([&]{trace << "updated check_run { " << cr << " }";});
+            }
+            else
+            {
+              // Do nothing because the GitHub state is already built so the
+              // lambda returned by build_built() will update the database to
+              // built.
+              //
+              return nullptr;
+            }
+          }
+        }
+        else // Error communicating with GitHub.
+        {
+          // Can't tell whether the check run exists GitHub. Make the final
+          // decision on whether or not to store nullopt in node_id and state
+          // based on what's in the database when the lambda runs
+          // (build_built() and its lambda could run in the meantime).
+          //
+          // @@ TODO Store build_state::building if we start storing failed
+          //         state.
+        }
+      }
+    }
+
+    return [iat = move (new_iat),
+            cr = move (cr),
+            error = move (error),
+            warn = move (warn)] (const tenant_service& ts) -> optional<string>
+    {
+      // NOTE: this lambda may be called repeatedly (e.g., due to transaction
+      // being aborted) and so should not move out of its captures.
+
+      service_data sd;
+      try
+      {
+        sd = service_data (*ts.data);
+      }
+      catch (const invalid_argument& e)
+      {
+        error << "failed to parse service data: " << e;
+        return nullopt;
+      }
+
+      if (iat)
+        sd.installation_access = *iat;
+
+      if (service_data::check_run* scr = sd.find_check_run (cr.build_id))
+      {
+        // Update existing check run.
+        //
+        // @@ TODO What if the failed GH update was for built? May end up with
+        //         permanently building check run.
+        //
+        if (!scr->state || scr->state == build_state::queued)
+        {
+          scr->state = cr.state;
+          if (!scr->node_id)
+            scr->node_id = move (cr.node_id);
+        }
+      }
+      else
+      {
+        // Store new check run.
+        //
+        sd.check_runs.push_back (cr);
+
+        warn << "check run { " << cr << " }: "
+             << to_string (build_state::building)
+             << " state being persisted before "
+             << to_string (build_state::queued);
+      }
+
+      return sd.json ();
+    };
   }
 
   function<optional<string> (const tenant_service&)> ci_github::
-- 
cgit v1.1