From de9f0666062a638a5d5d2be931d9f2a40ea07195 Mon Sep 17 00:00:00 2001 From: Francois Kritzinger Date: Thu, 19 Dec 2024 16:09:59 +0200 Subject: ci-github: Rearrange order of functions --- mod/mod-ci-github.cxx | 1670 ++++++++++++++++++++++++------------------------- 1 file changed, 835 insertions(+), 835 deletions(-) (limited to 'mod/mod-ci-github.cxx') diff --git a/mod/mod-ci-github.cxx b/mod/mod-ci-github.cxx index cbd8e70..31f3b06 100644 --- a/mod/mod-ci-github.cxx +++ b/mod/mod-ci-github.cxx @@ -546,64 +546,62 @@ namespace brep // static string conclusion_check_run_name ("CONCLUSION"); - // Return the colored circle corresponding to a result_status. - // - static string - circle (result_status rs) + bool ci_github:: + handle_branch_push (gh_push_event ps, bool warning_success) { - switch (rs) + HANDLER_DIAG; + + l3 ([&]{trace << "push event { " << ps << " }";}); + + // Cancel the CI tenant associated with the overwritten/deleted previous + // head commit if this is a forced push or a branch deletion. + // + if (ps.forced || ps.deleted) { - case result_status::success: return "\U0001F7E2"; // Green circle. - case result_status::warning: return "\U0001F7E0"; // Orange circle. - case result_status::error: - case result_status::abort: - case result_status::abnormal: return "\U0001F534"; // Red circle. + // Service id that will uniquely identify the CI tenant. + // + string sid (ps.repository.node_id + ':' + ps.before); - // Valid values we should never encounter. + // Note that it's possible this commit still exists in another branch so + // we do refcount-aware cancel. // - case result_status::skip: - case result_status::interrupt: - throw invalid_argument ("unexpected result_status value: " + - to_string (rs)); + if (optional ts = cancel (error, warn, + verb_ ? &trace : nullptr, + *build_db_, retry_, + "ci-github", sid, + true /* ref_count */)) + { + l3 ([&]{trace << (ps.forced ? "forced push " + ps.after + " to " + : "deletion of ") + << ps.ref << ": attempted to cancel CI of previous" + << " head commit with tenant_service id " << sid + << " (ref_count: " << ts->ref_count << ')';}); + } + else + { + // It's possible that there was no CI for the previous commit for + // various reasons (e.g., CI was not enabled). + // + l3 ([&]{trace << (ps.forced ? "forced push " + ps.after + " to " + : "deletion of ") + << ps.ref << ": failed to cancel CI of previous" + << " head commit with tenant_service id " << sid;}); + } } - return ""; // Should never reach. - } - - // Make a check run summary from a CI start_result. - // - static string - to_check_run_summary (const optional& r) - { - string s; - - s = "```\n"; - if (r) s += r->message; - else s += "Internal service error"; - s += "\n```"; - - return s; - } - - bool ci_github:: - handle_check_suite_rerequest (gh_check_suite_event cs, bool warning_success) - { - HANDLER_DIAG; - - l3 ([&]{trace << "check_suite event { " << cs << " }";}); - - assert (cs.action == "rerequested"); + if (ps.deleted) + return true; // Do nothing further if this was a branch deletion. // While we don't need the installation access token in this request, // let's obtain it to flush out any permission issues early. Also, it is // valid for an hour so we will most likely make use of it. 
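
// Aside: an illustrative, self-contained sketch of the service id scheme
// used above (and throughout this file): the repository node id and a
// commit SHA joined with ':'. The helper name and example values below are
// made up for illustration.
//
#include <string>

static std::string
example_service_id (const std::string& repo_node_id, const std::string& sha)
{
  return repo_node_id + ':' + sha; // E.g., "R_kgDOabc123:9f86d081...".
}
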
// - optional jwt (generate_jwt (cs.check_suite.app_id, trace, error)); + optional jwt (generate_jwt (ps.app_id, trace, error)); if (!jwt) throw server_error (); optional iat ( - obtain_installation_access_token (cs.installation.id, + obtain_installation_access_token (ps.installation.id, move (*jwt), error)); if (!iat) @@ -611,84 +609,36 @@ namespace brep l3 ([&]{trace << "installation_access_token { " << *iat << " }";}); - // Service id that uniquely identifies the CI tenant. - // - string sid (cs.repository.node_id + ':' + cs.check_suite.head_sha); + // While it would have been nice to cancel CIs of PRs with this branch as + // base not to waste resources, there are complications: Firstly, we can + // only do this for remote PRs (since local PRs will most likely share the + // result with branch push). Secondly, we try to do our best even if the + // branch protection rule for head behind is not enabled. In this case, it + // would be good to complete the CI. So maybe/later. See also the head + // case in handle_pull_request(), where we do cancel remote PRs that are + // not shared. - // If the user requests a rebuild of the (entire) PR, then this manifests - // as the check_suite rather than pull_request event. Specifically: - // - // - For a local PR, this event is shared with the branch push and all we - // need to do is restart the CI for the head commit. - // - // - For a remote PR, this event will have no gh_check_suite::head_branch. - // In this case we need to load the existing service data for this head - // commit, extract the test merge commit, and restart the CI for that. - // - // Note that it's possible the base branch has moved in the meantime and - // ideally we would want to re-request the test merge commit, etc. - // However, this will only be necessary if the user does not follow our - // recommendation of enabling the head-behind-base protection. And it - // seems all this extra complexity would not be warranted. + // Service id that uniquely identifies the CI tenant. // - string check_sha; - service_data::kind_type kind; - - if (!cs.check_suite.head_branch) - { - // Rebuild of remote PR. - // - kind = service_data::remote; - - if (optional d = find (*build_db_, "ci-github", sid)) - { - tenant_service& ts (d->service); - - try - { - service_data sd (*ts.data); - check_sha = move (sd.check_sha); // Test merge commit. - } - catch (const invalid_argument& e) - { - fail << "failed to parse service data: " << e; - } - } - else - { - error << "check suite " << cs.check_suite.node_id - << " for remote pull request:" - << " re-requested but tenant_service with id " << sid - << " did not exist"; - return true; - } - } - else - { - // Rebuild of branch push or local PR. - // - kind = service_data::local; - check_sha = cs.check_suite.head_sha; - } + string sid (ps.repository.node_id + ':' + ps.after); service_data sd (warning_success, iat->token, iat->expires_at, - cs.check_suite.app_id, - cs.installation.id, - move (cs.repository.node_id), - move (cs.repository.clone_url), - kind, false /* pre_check */, true /* re_requested */, - move (check_sha), - move (cs.check_suite.head_sha) /* report_sha */); - - // Replace the existing CI tenant if it exists. - // - // Note that GitHub UI does not allow re-running the entire check suite - // until all the check runs are completed. 
- // + ps.app_id, + ps.installation.id, + move (ps.repository.node_id), + move (ps.repository.clone_url), + service_data::local, + false /* pre_check */, + false /* re_requested */, + ps.after /* check_sha */, + ps.after /* report_sha */); - // Create an unloaded CI tenant. + // Create an unloaded CI tenant, doing nothing if one already exists + // (which could've been created by handle_pull_request() or by us as a + // result of a push to another branch). Note that the tenant's reference + // count is incremented in all cases. // // Note: use no delay since we need to (re)create the synthetic conclusion // check run as soon as possible. @@ -700,264 +650,229 @@ namespace brep // notifications until (1) we load the tenant, (2) we cancel it, or (3) // it gets archived after some timeout. // - auto pr (create (error, - warn, - verb_ ? &trace : nullptr, - *build_db_, retry_, - tenant_service (sid, "ci-github", sd.json ()), - chrono::seconds (30) /* interval */, - chrono::seconds (0) /* delay */, - duplicate_tenant_mode::replace)); - - if (!pr) + if (!create (error, warn, verb_ ? &trace : nullptr, + *build_db_, retry_, + tenant_service (sid, "ci-github", sd.json ()), + chrono::seconds (30) /* interval */, + chrono::seconds (0) /* delay */, + duplicate_tenant_mode::ignore)) { - fail << "check suite " << cs.check_suite.node_id + fail << "push " + ps.after + " to " + ps.ref << ": unable to create unloaded CI tenant"; } - if (pr->second == duplicate_tenant_result::created) - { - error << "check suite " << cs.check_suite.node_id - << ": re-requested but tenant_service with id " << sid - << " did not exist"; - return true; - } - return true; } + // Miscellaneous pull request facts + // + // - Although some of the GitHub documentation makes it sound like they + // expect check runs to be added to both the PR head commit and the merge + // commit, the PR UI does not react to the merge commit's check runs + // consistently. It actually seems to be quite broken. The only thing it + // does seem to do reliably is blocking the PR merge if the merge commit's + // check runs are not successful (i.e, overriding the PR head commit's + // check runs). But the UI looks quite messed up generally in this state. + // + // - When new commits are added to a PR base branch, pull_request.base.sha + // does not change, but the test merge commit will be updated to include + // the new commits to the base branch. + // + // - When new commits are added to a PR head branch, pull_request.head.sha + // gets updated with the head commit's SHA and check_suite.pull_requests[] + // will contain all PRs with this branch as head. + // bool ci_github:: - handle_check_suite_completed (gh_check_suite_event cs, bool warning_success) + handle_pull_request (gh_pull_request_event pr, bool warning_success) { - // The plans is as follows: - // - // 1. Load the service data. - // - // 2. Verify it is completed. - // - // 3. Verify the check run counts match. - // - // 4. Verify (like in build_built()) that all the check runs are - // completed. - // - // 5. Verify the result matches what GitHub thinks it is. - HANDLER_DIAG; - l3 ([&]{trace << "check_suite event { " << cs << " }";}); + l3 ([&]{trace << "pull_request event { " << pr << " }";}); - // Service id that uniquely identifies the CI tenant. + // While we don't need the installation access token in this request, + // let's obtain it to flush out any permission issues early. Also, it is + // valid for an hour so we will most likely make use of it. 
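
// Aside: a much-simplified, self-contained model of the reference-counted
// tenant bookkeeping relied on by handle_branch_push() and
// handle_pull_request() (hypothetical type; the real bookkeeping lives in
// the build database behind create()/cancel()). Creating with the "ignore
// duplicates" mode bumps the count of an existing tenant, and a
// ref_count-aware cancel only really cancels once the count drops to zero
// (e.g., when the same commit is still reachable from another branch or
// PR).
//
#include <cstddef>
#include <map>
#include <string>

struct example_tenant_counts
{
  std::map<std::string, std::size_t> refs; // Service id -> reference count.

  void
  create_ignore (const std::string& sid) {++refs[sid];}

  // Return true if the tenant was actually cancelled.
  //
  bool
  cancel (const std::string& sid)
  {
    auto i (refs.find (sid));

    if (i == refs.end ())
      return false; // No such tenant (e.g., CI was never started).

    if (--i->second != 0)
      return false; // Still referenced by another branch/PR.

    refs.erase (i);
    return true;
  }
};
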
// - string sid (cs.repository.node_id + ':' + cs.check_suite.head_sha); + optional jwt (generate_jwt (pr.pull_request.app_id, trace, error)); + if (!jwt) + throw server_error (); - // The common log entry subject. - // - string sub ("check suite " + cs.check_suite.node_id + '/' + sid); + optional iat ( + obtain_installation_access_token (pr.installation.id, + move (*jwt), + error)); + if (!iat) + throw server_error (); - // Load the service data. + l3 ([&]{trace << "installation_access_token { " << *iat << " }";}); + + // Distinguish between local and remote PRs by comparing the head and base + // repositories' paths. // - service_data sd; + service_data::kind_type kind ( + pr.pull_request.head_path == pr.pull_request.base_path + ? service_data::local + : service_data::remote); - if (optional d = find (*build_db_, "ci-github", sid)) + // Note that similar to the branch push case above, while it would have + // been nice to cancel the previous CI job once the PR head moves (the + // "synchronize" event), due to the head sharing problem the previous CI + // job might actually still be relevant (in both local and remote PR + // cases). So we only do it for the remote PRs and only if the head is not + // shared (via tenant reference counting). + // + if (kind == service_data::remote && pr.action == "synchronize") { - try + if (pr.before) { - sd = service_data (*d->service.data); + // Service id that will uniquely identify the CI tenant. + // + string sid (pr.repository.node_id + ':' + *pr.before); + + if (optional ts = cancel (error, warn, + verb_ ? &trace : nullptr, + *build_db_, retry_, + "ci-github", sid, + true /* ref_count */)) + { + l3 ([&]{trace << "pull request " << pr.pull_request.node_id + << ": attempted to cancel CI of previous head commit" + << " (ref_count: " << ts->ref_count << ')';}); + } + else + { + // It's possible that there was no CI for the previous commit for + // various reasons (e.g., CI was not enabled). + // + l3 ([&]{trace << "pull request " << pr.pull_request.node_id + << ": failed to cancel CI of previous head commit " + << "with tenant_service id " << sid;}); + } } - catch (const invalid_argument& e) + else { - fail << "failed to parse service data: " << e; + error << "pull request " << pr.pull_request.node_id + << ": before commit is missing in synchronize event"; } } - else - { - error << sub << ": tenant_service does not exist"; - return true; - } - // Verify the completed flag and the number of check runs. + // Note: for remote PRs the check_sha will be set later, in + // build_unloaded_pre_check(), to test merge commit id. // - if (!sd.completed) - { - error << sub << " service data complete flag is false"; - return true; - } + string check_sha (kind == service_data::local + ? pr.pull_request.head_sha + : ""); - // Received count will be one higher because we don't store the conclusion - // check run. + // Note that PR rebuilds (re-requested) are handled by + // handle_check_suite_rerequest(). // - size_t check_runs_count (sd.check_runs.size () + 1); - - if (check_runs_count == 1) - { - error << sub << ": no check runs in service data"; - return true; - } - - if (cs.check_suite.check_runs_count != check_runs_count) - { - error << sub << ": check runs count " << cs.check_suite.check_runs_count - << " does not match service data count " << check_runs_count; - return true; - } - - // Verify that all the check runs are built and compute the summary - // conclusion. 
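
// Aside: a simplified, standalone illustration of the two decisions made
// above (hypothetical helper and type names): a PR is "local" if its head
// branch lives in the same repository as its base, and only a local PR's
// check_sha is known up front (for a remote PR it is the test merge
// commit, filled in later by build_unloaded_pre_check()); the report_sha
// is the PR head commit in both cases.
//
#include <string>

struct example_pr_shas
{
  bool local;
  std::string check_sha;  // What gets built (empty until known).
  std::string report_sha; // Which commit the check runs are reported on.
};

static example_pr_shas
example_classify_pr (const std::string& head_path, // E.g., "fork/repo".
                     const std::string& base_path, // E.g., "org/repo".
                     const std::string& head_sha)
{
  bool local (head_path == base_path);
  return {local, local ? head_sha : std::string (), head_sha};
}
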
+ // Note that, in the case of a remote PR, GitHub will copy the PR head + // commit from the head (forked) repository into the base repository. So + // the check runs must always be added to the base repository, whether the + // PR is local or remote. The head commit refs are located at + // refs/pull//head. // - result_status conclusion (result_status::success); - - for (const check_run& cr: sd.check_runs) - { - if (cr.state == build_state::built) - { - assert (cr.status.has_value ()); - conclusion |= *cr.status; - } - else - { - error << sub << ": unbuilt check run in service data"; - return true; - } - } + service_data sd (warning_success, + move (iat->token), + iat->expires_at, + pr.pull_request.app_id, + pr.installation.id, + move (pr.repository.node_id), + move (pr.repository.clone_url), + kind, true /* pre_check */, false /* re_request */, + move (check_sha), + move (pr.pull_request.head_sha) /* report_sha */, + pr.pull_request.node_id, + pr.pull_request.number); - // Verify the conclusion. + // Create an unloaded CI tenant for the pre-check phase (during which we + // wait for the PR's merge commit and behindness to become available). // - if (!cs.check_suite.conclusion) - { - error << sub << ": absent conclusion in completed check suite"; - return true; - } - - // Note that the case mismatch is due to GraphQL (gh_conclusion()) - // requiring uppercase conclusion values while the received webhook values - // are lower case. + // Create with an empty service id so that the generated tenant id is used + // instead during the pre-check phase (so as not to clash with a proper + // service id for this head commit, potentially created in + // handle_branch_push() or as another PR). // - string gh_conclusion (gh_to_conclusion (conclusion, warning_success)); + tenant_service ts ("", "ci-github", sd.json ()); - if (icasecmp (*cs.check_suite.conclusion, gh_conclusion) != 0) + // Note: use no delay since we need to start the actual CI (which in turn + // (re)creates the synthetic conclusion check run) as soon as possible. + // + // After this call we will start getting the build_unloaded() + // notifications -- which will be routed to build_unloaded_pre_check() -- + // until we cancel the tenant or it gets archived after some timeout. + // (Note that we never actually load this request, we always cancel it; + // see build_unloaded_pre_check() for details.) + // + if (!create (error, + warn, + verb_ ? &trace : nullptr, + *build_db_, retry_, + move (ts), + chrono::seconds (30) /* interval */, + chrono::seconds (0) /* delay */)) { - error << sub << ": conclusion " << *cs.check_suite.conclusion - << " does not match service data conclusion " << gh_conclusion; - return true; + fail << "pull request " << pr.pull_request.node_id + << ": unable to create unloaded pre-check tenant"; } return true; } - // Create a gq_built_result. - // - // Throw invalid_argument in case of invalid result_status. - // - static gq_built_result - make_built_result (result_status rs, bool warning_success, string message) - { - return {gh_to_conclusion (rs, warning_success), - circle (rs) + ' ' + ucase (to_string (rs)), - move (message)}; - } - - // Parse a check run details URL into a build_id. - // - // Return nullopt if the URL is invalid. - // - static optional - parse_details_url (const string& details_url); - - // Note that GitHub always posts a message to their GUI saying "You have - // successfully requested be rerun", regardless of what - // HTTP status code we respond with. 
However we do return error status codes - // when there is no better option (like failing the conclusion) in case they - // start handling them someday. - // bool ci_github:: - handle_check_run_rerequest (const gh_check_run_event& cr, - bool warning_success) + handle_check_suite_rerequest (gh_check_suite_event cs, bool warning_success) { HANDLER_DIAG; - l3 ([&]{trace << "check_run event { " << cr << " }";}); + l3 ([&]{trace << "check_suite event { " << cs << " }";}); - // The overall plan is as follows: - // - // 1. Load service data. - // - // 2. If the tenant is archived, then fail (re-create) both the check run - // and the conclusion with appropriate diagnostics. - // - // 3. If the check run is in the queued state, then do nothing. - // - // 4. Re-create the check run in the queued state and the conclusion in - // the building state. Note: do in a single request to make sure we - // either "win" or "loose" the potential race for both (important - // for #7). - // - // 5. Call the rebuild() function to attempt to schedule a rebuild. Pass - // the update function that does the following (if called): - // - // a. Save new node ids. - // - // b. Update the check run state (may also not exist). - // - // c. Clear the completed flag if true. - // - // 6. If the result of rebuild() indicates the tenant is archived, then - // fail (update) both the check run and conclusion with appropriate - // diagnostics. - // - // 7. If original state is queued (no rebuild was scheduled), then fail - // (update) both the check run and the conclusion. - // - // Note that while conceptually we are updating existing check runs, in - // practice we have to re-create as new check runs in order to replace the - // existing ones because GitHub does not allow transitioning out of the - // built state. + assert (cs.action == "rerequested"); - // Get a new installation access token. + // While we don't need the installation access token in this request, + // let's obtain it to flush out any permission issues early. Also, it is + // valid for an hour so we will most likely make use of it. // - auto get_iat = [this, &trace, &error, &cr] () - -> optional - { - optional jwt (generate_jwt (cr.check_run.app_id, trace, error)); - if (!jwt) - return nullopt; + optional jwt (generate_jwt (cs.check_suite.app_id, trace, error)); + if (!jwt) + throw server_error (); - optional iat ( - obtain_installation_access_token (cr.installation.id, + optional iat ( + obtain_installation_access_token (cs.installation.id, move (*jwt), error)); + if (!iat) + throw server_error (); - if (iat) - l3 ([&]{trace << "installation_access_token { " << *iat << " }";}); - - return iat; - }; - - const string& repo_node_id (cr.repository.node_id); - const string& head_sha (cr.check_run.check_suite.head_sha); + l3 ([&]{trace << "installation_access_token { " << *iat << " }";}); - // Prepare the build and conclusion check runs. They are sent to GitHub in - // a single request (unless something goes wrong) so store them together - // from the outset. + // Service id that uniquely identifies the CI tenant. // - vector check_runs (2); - check_run& bcr (check_runs[0]); // Build check run - check_run& ccr (check_runs[1]); // Conclusion check run - - bcr.name = cr.check_run.name; - ccr.name = conclusion_check_run_name; - - const gh_installation_access_token* iat (nullptr); - optional new_iat; + string sid (cs.repository.node_id + ':' + cs.check_suite.head_sha); - // Load the service data, failing the check runs if the tenant has been - // archived. 
+ // If the user requests a rebuild of the (entire) PR, then this manifests + // as the check_suite rather than pull_request event. Specifically: // - service_data sd; - string tenant_id; + // - For a local PR, this event is shared with the branch push and all we + // need to do is restart the CI for the head commit. + // + // - For a remote PR, this event will have no gh_check_suite::head_branch. + // In this case we need to load the existing service data for this head + // commit, extract the test merge commit, and restart the CI for that. + // + // Note that it's possible the base branch has moved in the meantime and + // ideally we would want to re-request the test merge commit, etc. + // However, this will only be necessary if the user does not follow our + // recommendation of enabling the head-behind-base protection. And it + // seems all this extra complexity would not be warranted. + // + string check_sha; + service_data::kind_type kind; + + if (!cs.check_suite.head_branch) { - // Service id that uniquely identifies the CI tenant. + // Rebuild of remote PR. // - string sid (repo_node_id + ':' + head_sha); + kind = service_data::remote; if (optional d = find (*build_db_, "ci-github", sid)) { @@ -965,651 +880,736 @@ namespace brep try { - sd = service_data (*ts.data); + service_data sd (*ts.data); + check_sha = move (sd.check_sha); // Test merge commit. } catch (const invalid_argument& e) { fail << "failed to parse service data: " << e; } + } + else + { + error << "check suite " << cs.check_suite.node_id + << " for remote pull request:" + << " re-requested but tenant_service with id " << sid + << " did not exist"; + return true; + } + } + else + { + // Rebuild of branch push or local PR. + // + kind = service_data::local; + check_sha = cs.check_suite.head_sha; + } - if (!sd.conclusion_node_id) - fail << "no conclusion node id for check run " << cr.check_run.node_id; + service_data sd (warning_success, + iat->token, + iat->expires_at, + cs.check_suite.app_id, + cs.installation.id, + move (cs.repository.node_id), + move (cs.repository.clone_url), + kind, false /* pre_check */, true /* re_requested */, + move (check_sha), + move (cs.check_suite.head_sha) /* report_sha */); - tenant_id = d->tenant_id; + // Replace the existing CI tenant if it exists. + // + // Note that GitHub UI does not allow re-running the entire check suite + // until all the check runs are completed. + // - // Get a new IAT if the one from the service data has expired. - // - if (system_clock::now () > sd.installation_access.expires_at) - { - if ((new_iat = get_iat ())) - iat = &*new_iat; - else - throw server_error (); - } - else - iat = &sd.installation_access; + // Create an unloaded CI tenant. + // + // Note: use no delay since we need to (re)create the synthetic conclusion + // check run as soon as possible. + // + // Note that we use the create() API instead of start() since duplicate + // management is not available in start(). + // + // After this call we will start getting the build_unloaded() + // notifications until (1) we load the tenant, (2) we cancel it, or (3) + // it gets archived after some timeout. + // + auto pr (create (error, + warn, + verb_ ? &trace : nullptr, + *build_db_, retry_, + tenant_service (sid, "ci-github", sd.json ()), + chrono::seconds (30) /* interval */, + chrono::seconds (0) /* delay */, + duplicate_tenant_mode::replace)); - if (d->archived) // Tenant is archived - { - // Fail the check runs. 
- // - gq_built_result br ( - make_built_result ( - result_status::error, warning_success, - "Unable to rebuild individual configuration: build has " - "been archived")); + if (!pr) + { + fail << "check suite " << cs.check_suite.node_id + << ": unable to create unloaded CI tenant"; + } - // Try to update the conclusion check run even if the first update - // fails. - // - bool f (false); // Failed. + if (pr->second == duplicate_tenant_result::created) + { + error << "check suite " << cs.check_suite.node_id + << ": re-requested but tenant_service with id " << sid + << " did not exist"; + return true; + } - if (gq_update_check_run (error, bcr, iat->token, - repo_node_id, cr.check_run.node_id, - nullopt /* details_url */, - build_state::built, br)) - { - l3 ([&]{trace << "updated check_run { " << bcr << " }";}); - } - else - { - error << "check_run " << cr.check_run.node_id - << ": unable to update check run"; - f = true; - } + return true; + } - if (gq_update_check_run (error, ccr, iat->token, - repo_node_id, *sd.conclusion_node_id, - nullopt /* details_url */, - build_state::built, move (br))) - { - l3 ([&]{trace << "updated conclusion check_run { " << ccr << " }";}); - } - else - { - error << "check_run " << cr.check_run.node_id - << ": unable to update conclusion check run"; - f = true; - } + bool ci_github:: + handle_check_suite_completed (gh_check_suite_event cs, bool warning_success) + { + // The plans is as follows: + // + // 1. Load the service data. + // + // 2. Verify it is completed. + // + // 3. Verify the check run counts match. + // + // 4. Verify (like in build_built()) that all the check runs are + // completed. + // + // 5. Verify the result matches what GitHub thinks it is. - // Fail the handler if either of the check runs could not be - // updated. - // - if (f) - throw server_error (); + HANDLER_DIAG; - return true; - } - } - else - { - // No such tenant. - // - fail << "check run " << cr.check_run.node_id - << " re-requested but tenant_service with id " << sid - << " does not exist"; - } - } + l3 ([&]{trace << "check_suite event { " << cs << " }";}); - // Fail if it's the conclusion check run that is being re-requested. - // - // @@ TMP When user selects re-run all failed checks we receive multiple - // check_runs, one of which is for the CCR. We then update it with the - // error message, triggering another check_suite(completed) right after - // all of the check_runs(rerequested). + // Service id that uniquely identifies the CI tenant. // - if (cr.check_run.name == conclusion_check_run_name) - { - l3 ([&]{trace << "re-requested conclusion check_run";}); + string sid (cs.repository.node_id + ':' + cs.check_suite.head_sha); - if (!sd.conclusion_node_id) - fail << "no conclusion node id for check run " << cr.check_run.node_id; + // The common log entry subject. + // + string sub ("check suite " + cs.check_suite.node_id + '/' + sid); - gq_built_result br ( - make_built_result (result_status::error, warning_success, - "Conclusion check run cannot be rebuilt")); + // Load the service data. + // + service_data sd; - // Fail (update) the conclusion check run. 
- // - if (gq_update_check_run (error, ccr, iat->token, - repo_node_id, *sd.conclusion_node_id, - nullopt /* details_url */, - build_state::built, move (br))) + if (optional d = find (*build_db_, "ci-github", sid)) + { + try { - l3 ([&]{trace << "updated conclusion check_run { " << ccr << " }";}); + sd = service_data (*d->service.data); } - else + catch (const invalid_argument& e) { - fail << "check run " << cr.check_run.node_id - << ": unable to update conclusion check run " - << *sd.conclusion_node_id; + fail << "failed to parse service data: " << e; } - + } + else + { + error << sub << ": tenant_service does not exist"; return true; } - // Parse the check_run's details_url to extract build id. - // - // While this is a bit hackish, there doesn't seem to be a better way - // (like associating custom data with a check run). Note that the GitHub - // UI only allows rebuilding completed check runs, so the details URL - // should be there. + // Verify the completed flag and the number of check runs. // - optional bid (parse_details_url (cr.check_run.details_url)); - if (!bid) + if (!sd.completed) { - fail << "check run " << cr.check_run.node_id - << ": failed to extract build id from details_url"; + error << sub << " service data complete flag is false"; + return true; } - // Initialize the check run (`bcr`) with state from the service data. + // Received count will be one higher because we don't store the conclusion + // check run. // - { - // Search for the check run in the service data. - // - // Note that we look by name in case node id got replaced by a racing - // re-request (in which case we ignore this request). - // - auto i (find_if (sd.check_runs.begin (), sd.check_runs.end (), - [&cr] (const check_run& scr) - { - return scr.name == cr.check_run.name; - })); + size_t check_runs_count (sd.check_runs.size () + 1); - if (i == sd.check_runs.end ()) - fail << "check_run " << cr.check_run.node_id - << " (" << cr.check_run.name << "): " - << "re-requested but does not exist in service data"; + if (check_runs_count == 1) + { + error << sub << ": no check runs in service data"; + return true; + } - // Do nothing if node ids don't match. - // - if (i->node_id && *i->node_id != cr.check_run.node_id) + if (cs.check_suite.check_runs_count != check_runs_count) + { + error << sub << ": check runs count " << cs.check_suite.check_runs_count + << " does not match service data count " << check_runs_count; + return true; + } + + // Verify that all the check runs are built and compute the summary + // conclusion. + // + result_status conclusion (result_status::success); + + for (const check_run& cr: sd.check_runs) + { + if (cr.state == build_state::built) { - l3 ([&]{trace << "check_run " << cr.check_run.node_id - << " (" << cr.check_run.name << "): " - << "node id has changed in service data";}); - return true; + assert (cr.status.has_value ()); + conclusion |= *cr.status; } - - // Do nothing if the build is already queued. - // - if (i->state == build_state::queued) + else { - l3 ([&]{trace << "ignoring already-queued check run";}); + error << sub << ": unbuilt check run in service data"; return true; } - - bcr.name = i->name; - bcr.build_id = i->build_id; - bcr.state = i->state; } - // Transition the build and conclusion check runs out of the built state - // (or any other state) by re-creating them. + // Verify the conclusion. 
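
// Aside: a rough, self-contained model of the aggregation performed by the
// |= above -- the summary conclusion is the most severe status among the
// built check runs. The enum is simplified to three values for
// illustration; the real result_status has more states and its operator is
// defined elsewhere.
//
#include <algorithm>
#include <vector>

enum class example_status {success, warning, error}; // Ordered by severity.

static example_status
example_summary_conclusion (const std::vector<example_status>& check_runs)
{
  example_status r (example_status::success);

  for (example_status s: check_runs)
    r = std::max (r, s); // A worse status overrides a better one.

  return r;
}
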
// - bcr.state = build_state::queued; - bcr.state_synced = false; - bcr.details_url = cr.check_run.details_url; - - ccr.state = build_state::building; - ccr.state_synced = false; - - if (gq_create_check_runs (error, check_runs, iat->token, - repo_node_id, head_sha)) - { - assert (bcr.state == build_state::queued); - assert (ccr.state == build_state::building); - - l3 ([&]{trace << "created check_run { " << bcr << " }";}); - l3 ([&]{trace << "created conclusion check_run { " << ccr << " }";}); - } - else + if (!cs.check_suite.conclusion) { - fail << "check run " << cr.check_run.node_id - << ": unable to re-create build and conclusion check runs"; + error << sub << ": absent conclusion in completed check suite"; + return true; } - // Request the rebuild and update service data. + // Note that the case mismatch is due to GraphQL (gh_conclusion()) + // requiring uppercase conclusion values while the received webhook values + // are lower case. // - bool race (false); + string gh_conclusion (gh_to_conclusion (conclusion, warning_success)); - // Callback function called by rebuild() to update the service data (but - // only if the build is actually restarted). - // - auto update_sd = [&error, &new_iat, &race, - tenant_id = move (tenant_id), - &cr, &bcr, &ccr] (const string& ti, - const tenant_service& ts, - build_state) -> optional + if (icasecmp (*cs.check_suite.conclusion, gh_conclusion) != 0) { - // NOTE: this lambda may be called repeatedly (e.g., due to transaction - // being aborted) and so should not move out of its captures. - - race = false; // Reset. + error << sub << ": conclusion " << *cs.check_suite.conclusion + << " does not match service data conclusion " << gh_conclusion; + return true; + } - if (tenant_id != ti) - { - // The tenant got replaced since we loaded it but we managed to - // trigger a rebuild in the new tenant. Who knows whose check runs are - // visible, so let's fail ours similar to the cases below. - // - race = true; - return nullopt; - } + return true; + } - service_data sd; - try - { - sd = service_data (*ts.data); - } - catch (const invalid_argument& e) - { - error << "failed to parse service data: " << e; - return nullopt; - } + // Return the colored circle corresponding to a result_status. + // + static string + circle (result_status rs) + { + switch (rs) + { + case result_status::success: return "\U0001F7E2"; // Green circle. + case result_status::warning: return "\U0001F7E0"; // Orange circle. + case result_status::error: + case result_status::abort: + case result_status::abnormal: return "\U0001F534"; // Red circle. - // Note that we again look by name in case node id got replaced by a - // racing re-request. In this case, however, it's impossible to decide - // who won that race, so let's fail the check suite to be on the safe - // side (in a sense, similar to the rebuild() returning queued below). + // Valid values we should never encounter. 
// - auto i (find_if ( - sd.check_runs.begin (), sd.check_runs.end (), - [&cr] (const check_run& scr) - { - return scr.name == cr.check_run.name; - })); - - if (i == sd.check_runs.end ()) - { - error << "check_run " << cr.check_run.node_id - << " (" << cr.check_run.name << "): " - << "re-requested but does not exist in service data"; - return nullopt; - } + case result_status::skip: + case result_status::interrupt: + throw invalid_argument ("unexpected result_status value: " + + to_string (rs)); + } - if (i->node_id && *i->node_id != cr.check_run.node_id) - { - // Keep the old conclusion node id to make sure any further state - // transitions are ignored. A bit of a hack. - // - race = true; - return nullopt; - } + return ""; // Should never reach. + } - *i = bcr; // Update with new node_id, state, state_synced. + // Make a check run summary from a CI start_result. + // + static string + to_check_run_summary (const optional& r) + { + string s; - sd.conclusion_node_id = ccr.node_id; - sd.completed = false; + s = "```\n"; + if (r) s += r->message; + else s += "Internal service error"; + s += "\n```"; - // Save the IAT if we created a new one. - // - if (new_iat) - sd.installation_access = *new_iat; + return s; + } - return sd.json (); - }; + // Create a gq_built_result. + // + // Throw invalid_argument in case of invalid result_status. + // + static gq_built_result + make_built_result (result_status rs, bool warning_success, string message) + { + return {gh_to_conclusion (rs, warning_success), + circle (rs) + ' ' + ucase (to_string (rs)), + move (message)}; + } - optional bs (rebuild (*build_db_, retry_, *bid, update_sd)); + // Parse a check run details URL into a build_id. + // + // Return nullopt if the URL is invalid. + // + static optional + parse_details_url (const string& details_url); - // If the build has been archived or re-enqueued since we loaded the - // service data, fail (by updating) both the build check run and the - // conclusion check run. Otherwise the build has been successfully - // re-enqueued so do nothing further. - // - if (!race && bs && *bs != build_state::queued) - return true; + // Note that GitHub always posts a message to their GUI saying "You have + // successfully requested be rerun", regardless of what + // HTTP status code we respond with. However we do return error status codes + // when there is no better option (like failing the conclusion) in case they + // start handling them someday. + // + bool ci_github:: + handle_check_run_rerequest (const gh_check_run_event& cr, + bool warning_success) + { + HANDLER_DIAG; - gq_built_result br; // Built result for both check runs. + l3 ([&]{trace << "check_run event { " << cr << " }";}); - if (race || bs) // Race or re-enqueued. - { - // The re-enqueued case: this build has been re-enqueued since we first - // loaded the service data. This could happen if the user clicked - // "re-run" multiple times and another handler won the rebuild() race. - // - // However the winner of the check runs race cannot be determined. - // - // Best case the other handler won the check runs race as well and - // thus everything will proceed normally. Our check runs will be - // invisible and disregarded. - // - // Worst case we won the check runs race and the other handler's check - // runs -- the ones that will be updated by the build_*() notifications - // -- are no longer visible, leaving things quite broken. - // - // Either way, we fail our check runs. 
In the best case scenario it - // will have no effect; in the worst case scenario it lets the user - // know something has gone wrong. - // - br = make_built_result (result_status::error, warning_success, - "Unable to rebuild, try again"); - } - else // Archived. - { - // The build has expired since we loaded the service data. Most likely - // the tenant has been archived. - // - br = make_built_result ( - result_status::error, warning_success, - "Unable to rebuild individual configuration: build has been archived"); - } - - // Try to update the conclusion check run even if the first update fails. + // The overall plan is as follows: // - bool f (false); // Failed. - - // Fail the build check run. + // 1. Load service data. // - if (gq_update_check_run (error, bcr, iat->token, - repo_node_id, *bcr.node_id, - nullopt /* details_url */, - build_state::built, br)) - { - l3 ([&]{trace << "updated check_run { " << bcr << " }";}); - } - else - { - error << "check run " << cr.check_run.node_id - << ": unable to update (replacement) check run " - << *bcr.node_id; - f = true; - } + // 2. If the tenant is archived, then fail (re-create) both the check run + // and the conclusion with appropriate diagnostics. + // + // 3. If the check run is in the queued state, then do nothing. + // + // 4. Re-create the check run in the queued state and the conclusion in + // the building state. Note: do in a single request to make sure we + // either "win" or "loose" the potential race for both (important + // for #7). + // + // 5. Call the rebuild() function to attempt to schedule a rebuild. Pass + // the update function that does the following (if called): + // + // a. Save new node ids. + // + // b. Update the check run state (may also not exist). + // + // c. Clear the completed flag if true. + // + // 6. If the result of rebuild() indicates the tenant is archived, then + // fail (update) both the check run and conclusion with appropriate + // diagnostics. + // + // 7. If original state is queued (no rebuild was scheduled), then fail + // (update) both the check run and the conclusion. + // + // Note that while conceptually we are updating existing check runs, in + // practice we have to re-create as new check runs in order to replace the + // existing ones because GitHub does not allow transitioning out of the + // built state. - // Fail the conclusion check run. + // Get a new installation access token. // - if (gq_update_check_run (error, ccr, iat->token, - repo_node_id, *ccr.node_id, - nullopt /* details_url */, - build_state::built, move (br))) - { - l3 ([&]{trace << "updated conclusion check_run { " << ccr << " }";}); - } - else + auto get_iat = [this, &trace, &error, &cr] () + -> optional { - error << "check run " << cr.check_run.node_id - << ": unable to update conclusion check run " << *ccr.node_id; - f = true; - } + optional jwt (generate_jwt (cr.check_run.app_id, trace, error)); + if (!jwt) + return nullopt; - // Fail the handler if either of the check runs could not be updated. - // - if (f) - throw server_error (); + optional iat ( + obtain_installation_access_token (cr.installation.id, + move (*jwt), + error)); - return true; - } + if (iat) + l3 ([&]{trace << "installation_access_token { " << *iat << " }";}); - // Miscellaneous pull request facts - // - // - Although some of the GitHub documentation makes it sound like they - // expect check runs to be added to both the PR head commit and the merge - // commit, the PR UI does not react to the merge commit's check runs - // consistently. 
It actually seems to be quite broken. The only thing it - // does seem to do reliably is blocking the PR merge if the merge commit's - // check runs are not successful (i.e, overriding the PR head commit's - // check runs). But the UI looks quite messed up generally in this state. - // - // - When new commits are added to a PR base branch, pull_request.base.sha - // does not change, but the test merge commit will be updated to include - // the new commits to the base branch. - // - // - When new commits are added to a PR head branch, pull_request.head.sha - // gets updated with the head commit's SHA and check_suite.pull_requests[] - // will contain all PRs with this branch as head. - // - bool ci_github:: - handle_pull_request (gh_pull_request_event pr, bool warning_success) - { - HANDLER_DIAG; + return iat; + }; - l3 ([&]{trace << "pull_request event { " << pr << " }";}); + const string& repo_node_id (cr.repository.node_id); + const string& head_sha (cr.check_run.check_suite.head_sha); - // While we don't need the installation access token in this request, - // let's obtain it to flush out any permission issues early. Also, it is - // valid for an hour so we will most likely make use of it. + // Prepare the build and conclusion check runs. They are sent to GitHub in + // a single request (unless something goes wrong) so store them together + // from the outset. // - optional jwt (generate_jwt (pr.pull_request.app_id, trace, error)); - if (!jwt) - throw server_error (); - - optional iat ( - obtain_installation_access_token (pr.installation.id, - move (*jwt), - error)); - if (!iat) - throw server_error (); + vector check_runs (2); + check_run& bcr (check_runs[0]); // Build check run + check_run& ccr (check_runs[1]); // Conclusion check run - l3 ([&]{trace << "installation_access_token { " << *iat << " }";}); + bcr.name = cr.check_run.name; + ccr.name = conclusion_check_run_name; - // Distinguish between local and remote PRs by comparing the head and base - // repositories' paths. - // - service_data::kind_type kind ( - pr.pull_request.head_path == pr.pull_request.base_path - ? service_data::local - : service_data::remote); + const gh_installation_access_token* iat (nullptr); + optional new_iat; - // Note that similar to the branch push case above, while it would have - // been nice to cancel the previous CI job once the PR head moves (the - // "synchronize" event), due to the head sharing problem the previous CI - // job might actually still be relevant (in both local and remote PR - // cases). So we only do it for the remote PRs and only if the head is not - // shared (via tenant reference counting). + // Load the service data, failing the check runs if the tenant has been + // archived. // - if (kind == service_data::remote && pr.action == "synchronize") + service_data sd; + string tenant_id; { - if (pr.before) + // Service id that uniquely identifies the CI tenant. + // + string sid (repo_node_id + ':' + head_sha); + + if (optional d = find (*build_db_, "ci-github", sid)) { - // Service id that will uniquely identify the CI tenant. - // - string sid (pr.repository.node_id + ':' + *pr.before); + tenant_service& ts (d->service); - if (optional ts = cancel (error, warn, - verb_ ? 
&trace : nullptr, - *build_db_, retry_, - "ci-github", sid, - true /* ref_count */)) + try { - l3 ([&]{trace << "pull request " << pr.pull_request.node_id - << ": attempted to cancel CI of previous head commit" - << " (ref_count: " << ts->ref_count << ')';}); + sd = service_data (*ts.data); + } + catch (const invalid_argument& e) + { + fail << "failed to parse service data: " << e; + } + + if (!sd.conclusion_node_id) + fail << "no conclusion node id for check run " << cr.check_run.node_id; + + tenant_id = d->tenant_id; + + // Get a new IAT if the one from the service data has expired. + // + if (system_clock::now () > sd.installation_access.expires_at) + { + if ((new_iat = get_iat ())) + iat = &*new_iat; + else + throw server_error (); } else + iat = &sd.installation_access; + + if (d->archived) // Tenant is archived { - // It's possible that there was no CI for the previous commit for - // various reasons (e.g., CI was not enabled). + // Fail the check runs. // - l3 ([&]{trace << "pull request " << pr.pull_request.node_id - << ": failed to cancel CI of previous head commit " - << "with tenant_service id " << sid;}); + gq_built_result br ( + make_built_result ( + result_status::error, warning_success, + "Unable to rebuild individual configuration: build has " + "been archived")); + + // Try to update the conclusion check run even if the first update + // fails. + // + bool f (false); // Failed. + + if (gq_update_check_run (error, bcr, iat->token, + repo_node_id, cr.check_run.node_id, + nullopt /* details_url */, + build_state::built, br)) + { + l3 ([&]{trace << "updated check_run { " << bcr << " }";}); + } + else + { + error << "check_run " << cr.check_run.node_id + << ": unable to update check run"; + f = true; + } + + if (gq_update_check_run (error, ccr, iat->token, + repo_node_id, *sd.conclusion_node_id, + nullopt /* details_url */, + build_state::built, move (br))) + { + l3 ([&]{trace << "updated conclusion check_run { " << ccr << " }";}); + } + else + { + error << "check_run " << cr.check_run.node_id + << ": unable to update conclusion check run"; + f = true; + } + + // Fail the handler if either of the check runs could not be + // updated. + // + if (f) + throw server_error (); + + return true; } } else { - error << "pull request " << pr.pull_request.node_id - << ": before commit is missing in synchronize event"; + // No such tenant. + // + fail << "check run " << cr.check_run.node_id + << " re-requested but tenant_service with id " << sid + << " does not exist"; } } - // Note: for remote PRs the check_sha will be set later, in - // build_unloaded_pre_check(), to test merge commit id. + // Fail if it's the conclusion check run that is being re-requested. // - string check_sha (kind == service_data::local - ? pr.pull_request.head_sha - : ""); + // @@ TMP When user selects re-run all failed checks we receive multiple + // check_runs, one of which is for the CCR. We then update it with the + // error message, triggering another check_suite(completed) right after + // all of the check_runs(rerequested). + // + if (cr.check_run.name == conclusion_check_run_name) + { + l3 ([&]{trace << "re-requested conclusion check_run";}); - // Note that PR rebuilds (re-requested) are handled by - // handle_check_suite_rerequest(). 
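
// Aside: a simplified, standalone version of the token reuse/refresh
// pattern above (hypothetical types; mint stands in for the JWT-to-
// installation-token exchange done by get_iat()): use the token saved in
// the service data unless it has expired, otherwise obtain a fresh one and
// remember it so that it can be written back to the service data later.
//
#include <chrono>
#include <functional>
#include <optional>
#include <string>

struct example_token
{
  std::string value;
  std::chrono::system_clock::time_point expires_at;
};

static const example_token&
example_fresh_token (const example_token& saved,
                     std::optional<example_token>& refreshed,
                     const std::function<example_token ()>& mint)
{
  if (std::chrono::system_clock::now () <= saved.expires_at)
    return saved; // Saved token is still valid.

  refreshed = mint ();
  return *refreshed;
}
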
+ if (!sd.conclusion_node_id) + fail << "no conclusion node id for check run " << cr.check_run.node_id; + + gq_built_result br ( + make_built_result (result_status::error, warning_success, + "Conclusion check run cannot be rebuilt")); + + // Fail (update) the conclusion check run. + // + if (gq_update_check_run (error, ccr, iat->token, + repo_node_id, *sd.conclusion_node_id, + nullopt /* details_url */, + build_state::built, move (br))) + { + l3 ([&]{trace << "updated conclusion check_run { " << ccr << " }";}); + } + else + { + fail << "check run " << cr.check_run.node_id + << ": unable to update conclusion check run " + << *sd.conclusion_node_id; + } + + return true; + } + + // Parse the check_run's details_url to extract build id. // - // Note that, in the case of a remote PR, GitHub will copy the PR head - // commit from the head (forked) repository into the base repository. So - // the check runs must always be added to the base repository, whether the - // PR is local or remote. The head commit refs are located at - // refs/pull//head. + // While this is a bit hackish, there doesn't seem to be a better way + // (like associating custom data with a check run). Note that the GitHub + // UI only allows rebuilding completed check runs, so the details URL + // should be there. // - service_data sd (warning_success, - move (iat->token), - iat->expires_at, - pr.pull_request.app_id, - pr.installation.id, - move (pr.repository.node_id), - move (pr.repository.clone_url), - kind, true /* pre_check */, false /* re_request */, - move (check_sha), - move (pr.pull_request.head_sha) /* report_sha */, - pr.pull_request.node_id, - pr.pull_request.number); + optional bid (parse_details_url (cr.check_run.details_url)); + if (!bid) + { + fail << "check run " << cr.check_run.node_id + << ": failed to extract build id from details_url"; + } - // Create an unloaded CI tenant for the pre-check phase (during which we - // wait for the PR's merge commit and behindness to become available). + // Initialize the check run (`bcr`) with state from the service data. // - // Create with an empty service id so that the generated tenant id is used - // instead during the pre-check phase (so as not to clash with a proper - // service id for this head commit, potentially created in - // handle_branch_push() or as another PR). + { + // Search for the check run in the service data. + // + // Note that we look by name in case node id got replaced by a racing + // re-request (in which case we ignore this request). + // + auto i (find_if (sd.check_runs.begin (), sd.check_runs.end (), + [&cr] (const check_run& scr) + { + return scr.name == cr.check_run.name; + })); + + if (i == sd.check_runs.end ()) + fail << "check_run " << cr.check_run.node_id + << " (" << cr.check_run.name << "): " + << "re-requested but does not exist in service data"; + + // Do nothing if node ids don't match. + // + if (i->node_id && *i->node_id != cr.check_run.node_id) + { + l3 ([&]{trace << "check_run " << cr.check_run.node_id + << " (" << cr.check_run.name << "): " + << "node id has changed in service data";}); + return true; + } + + // Do nothing if the build is already queued. + // + if (i->state == build_state::queued) + { + l3 ([&]{trace << "ignoring already-queued check run";}); + return true; + } + + bcr.name = i->name; + bcr.build_id = i->build_id; + bcr.state = i->state; + } + + // Transition the build and conclusion check runs out of the built state + // (or any other state) by re-creating them. 
// - tenant_service ts ("", "ci-github", sd.json ()); + bcr.state = build_state::queued; + bcr.state_synced = false; + bcr.details_url = cr.check_run.details_url; + + ccr.state = build_state::building; + ccr.state_synced = false; + + if (gq_create_check_runs (error, check_runs, iat->token, + repo_node_id, head_sha)) + { + assert (bcr.state == build_state::queued); + assert (ccr.state == build_state::building); + + l3 ([&]{trace << "created check_run { " << bcr << " }";}); + l3 ([&]{trace << "created conclusion check_run { " << ccr << " }";}); + } + else + { + fail << "check run " << cr.check_run.node_id + << ": unable to re-create build and conclusion check runs"; + } - // Note: use no delay since we need to start the actual CI (which in turn - // (re)creates the synthetic conclusion check run) as soon as possible. + // Request the rebuild and update service data. // - // After this call we will start getting the build_unloaded() - // notifications -- which will be routed to build_unloaded_pre_check() -- - // until we cancel the tenant or it gets archived after some timeout. - // (Note that we never actually load this request, we always cancel it; - // see build_unloaded_pre_check() for details.) + bool race (false); + + // Callback function called by rebuild() to update the service data (but + // only if the build is actually restarted). // - if (!create (error, - warn, - verb_ ? &trace : nullptr, - *build_db_, retry_, - move (ts), - chrono::seconds (30) /* interval */, - chrono::seconds (0) /* delay */)) + auto update_sd = [&error, &new_iat, &race, + tenant_id = move (tenant_id), + &cr, &bcr, &ccr] (const string& ti, + const tenant_service& ts, + build_state) -> optional { - fail << "pull request " << pr.pull_request.node_id - << ": unable to create unloaded pre-check tenant"; - } + // NOTE: this lambda may be called repeatedly (e.g., due to transaction + // being aborted) and so should not move out of its captures. - return true; - } + race = false; // Reset. - bool ci_github:: - handle_branch_push (gh_push_event ps, bool warning_success) - { - HANDLER_DIAG; + if (tenant_id != ti) + { + // The tenant got replaced since we loaded it but we managed to + // trigger a rebuild in the new tenant. Who knows whose check runs are + // visible, so let's fail ours similar to the cases below. + // + race = true; + return nullopt; + } - l3 ([&]{trace << "push event { " << ps << " }";}); + service_data sd; + try + { + sd = service_data (*ts.data); + } + catch (const invalid_argument& e) + { + error << "failed to parse service data: " << e; + return nullopt; + } - // Cancel the CI tenant associated with the overwritten/deleted previous - // head commit if this is a forced push or a branch deletion. - // - if (ps.forced || ps.deleted) - { - // Service id that will uniquely identify the CI tenant. + // Note that we again look by name in case node id got replaced by a + // racing re-request. In this case, however, it's impossible to decide + // who won that race, so let's fail the check suite to be on the safe + // side (in a sense, similar to the rebuild() returning queued below). // - string sid (ps.repository.node_id + ':' + ps.before); + auto i (find_if ( + sd.check_runs.begin (), sd.check_runs.end (), + [&cr] (const check_run& scr) + { + return scr.name == cr.check_run.name; + })); - // Note that it's possible this commit still exists in another branch so - // we do refcount-aware cancel. - // - if (optional ts = cancel (error, warn, - verb_ ? 
&trace : nullptr, - *build_db_, retry_, - "ci-github", sid, - true /* ref_count */)) + if (i == sd.check_runs.end ()) { - l3 ([&]{trace << (ps.forced ? "forced push " + ps.after + " to " - : "deletion of ") - << ps.ref << ": attempted to cancel CI of previous" - << " head commit with tenant_service id " << sid - << " (ref_count: " << ts->ref_count << ')';}); + error << "check_run " << cr.check_run.node_id + << " (" << cr.check_run.name << "): " + << "re-requested but does not exist in service data"; + return nullopt; } - else + + if (i->node_id && *i->node_id != cr.check_run.node_id) { - // It's possible that there was no CI for the previous commit for - // various reasons (e.g., CI was not enabled). + // Keep the old conclusion node id to make sure any further state + // transitions are ignored. A bit of a hack. // - l3 ([&]{trace << (ps.forced ? "forced push " + ps.after + " to " - : "deletion of ") - << ps.ref << ": failed to cancel CI of previous" - << " head commit with tenant_service id " << sid;}); + race = true; + return nullopt; } - } - if (ps.deleted) - return true; // Do nothing further if this was a branch deletion. + *i = bcr; // Update with new node_id, state, state_synced. - // While we don't need the installation access token in this request, - // let's obtain it to flush out any permission issues early. Also, it is - // valid for an hour so we will most likely make use of it. - // - optional jwt (generate_jwt (ps.app_id, trace, error)); - if (!jwt) - throw server_error (); + sd.conclusion_node_id = ccr.node_id; + sd.completed = false; - optional iat ( - obtain_installation_access_token (ps.installation.id, - move (*jwt), - error)); - if (!iat) - throw server_error (); + // Save the IAT if we created a new one. + // + if (new_iat) + sd.installation_access = *new_iat; - l3 ([&]{trace << "installation_access_token { " << *iat << " }";}); + return sd.json (); + }; - // While it would have been nice to cancel CIs of PRs with this branch as - // base not to waste resources, there are complications: Firstly, we can - // only do this for remote PRs (since local PRs will most likely share the - // result with branch push). Secondly, we try to do our best even if the - // branch protection rule for head behind is not enabled. In this case, it - // would be good to complete the CI. So maybe/later. See also the head - // case in handle_pull_request(), where we do cancel remote PRs that are - // not shared. + optional bs (rebuild (*build_db_, retry_, *bid, update_sd)); - // Service id that uniquely identifies the CI tenant. + // If the build has been archived or re-enqueued since we loaded the + // service data, fail (by updating) both the build check run and the + // conclusion check run. Otherwise the build has been successfully + // re-enqueued so do nothing further. // - string sid (ps.repository.node_id + ':' + ps.after); + if (!race && bs && *bs != build_state::queued) + return true; - service_data sd (warning_success, - iat->token, - iat->expires_at, - ps.app_id, - ps.installation.id, - move (ps.repository.node_id), - move (ps.repository.clone_url), - service_data::local, - false /* pre_check */, - false /* re_requested */, - ps.after /* check_sha */, - ps.after /* report_sha */); + gq_built_result br; // Built result for both check runs. - // Create an unloaded CI tenant, doing nothing if one already exists - // (which could've been created by handle_pull_request() or by us as a - // result of a push to another branch). 
Note that the tenant's reference - // count is incremented in all cases. - // - // Note: use no delay since we need to (re)create the synthetic conclusion - // check run as soon as possible. + if (race || bs) // Race or re-enqueued. + { + // The re-enqueued case: this build has been re-enqueued since we first + // loaded the service data. This could happen if the user clicked + // "re-run" multiple times and another handler won the rebuild() race. + // + // However the winner of the check runs race cannot be determined. + // + // Best case the other handler won the check runs race as well and + // thus everything will proceed normally. Our check runs will be + // invisible and disregarded. + // + // Worst case we won the check runs race and the other handler's check + // runs -- the ones that will be updated by the build_*() notifications + // -- are no longer visible, leaving things quite broken. + // + // Either way, we fail our check runs. In the best case scenario it + // will have no effect; in the worst case scenario it lets the user + // know something has gone wrong. + // + br = make_built_result (result_status::error, warning_success, + "Unable to rebuild, try again"); + } + else // Archived. + { + // The build has expired since we loaded the service data. Most likely + // the tenant has been archived. + // + br = make_built_result ( + result_status::error, warning_success, + "Unable to rebuild individual configuration: build has been archived"); + } + + // Try to update the conclusion check run even if the first update fails. // - // Note that we use the create() API instead of start() since duplicate - // management is not available in start(). + bool f (false); // Failed. + + // Fail the build check run. // - // After this call we will start getting the build_unloaded() - // notifications until (1) we load the tenant, (2) we cancel it, or (3) - // it gets archived after some timeout. + if (gq_update_check_run (error, bcr, iat->token, + repo_node_id, *bcr.node_id, + nullopt /* details_url */, + build_state::built, br)) + { + l3 ([&]{trace << "updated check_run { " << bcr << " }";}); + } + else + { + error << "check run " << cr.check_run.node_id + << ": unable to update (replacement) check run " + << *bcr.node_id; + f = true; + } + + // Fail the conclusion check run. // - if (!create (error, warn, verb_ ? &trace : nullptr, - *build_db_, retry_, - tenant_service (sid, "ci-github", sd.json ()), - chrono::seconds (30) /* interval */, - chrono::seconds (0) /* delay */, - duplicate_tenant_mode::ignore)) + if (gq_update_check_run (error, ccr, iat->token, + repo_node_id, *ccr.node_id, + nullopt /* details_url */, + build_state::built, move (br))) { - fail << "push " + ps.after + " to " + ps.ref - << ": unable to create unloaded CI tenant"; + l3 ([&]{trace << "updated conclusion check_run { " << ccr << " }";}); + } + else + { + error << "check run " << cr.check_run.node_id + << ": unable to update conclusion check run " << *ccr.node_id; + f = true; } + // Fail the handler if either of the check runs could not be updated. + // + if (f) + throw server_error (); + return true; } -- cgit v1.1