From 1f8168972e9c9d7c5d70539745834947410053ed Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Wed, 15 May 2024 07:36:30 +0200 Subject: Move directory iteration retry to correct iteration loop --- bbot/agent/agent.cxx | 46 ++++++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/bbot/agent/agent.cxx b/bbot/agent/agent.cxx index 5df470f..0b57208 100644 --- a/bbot/agent/agent.cxx +++ b/bbot/agent/agent.cxx @@ -1076,7 +1076,7 @@ enumerate_machines (const dir_path& machines) { tracer trace ("enumerate_machines", machines.string ().c_str ()); - size_t dir_iter_retry (0); // Directory iteration retry (see below). + size_t dir_iter_retries (0); // Directory iteration retry count (see below). for (;;) // From-scratch retry loop for after bootstrap (see below). { @@ -1152,6 +1152,7 @@ enumerate_machines (const dir_path& machines) // try { + bool dir_iter_retry (false); for (const dir_entry& ve: dir_iterator (machines, dir_iterator::no_follow)) { @@ -1426,31 +1427,36 @@ enumerate_machines (const dir_path& machines) } catch (const system_error& e) { - fail << "unable to iterate over " << vd << ": " << e; + // Once in a while we get ENOENT while iterating over the machines + // volume directory. This directory contains the machine directories + // (not btrfs subvolumes) and is not being changed when we get this + // error. Maybe this is due to directory sizes/timestamps changes, + // but then we would expect to get this error a lot more often..? So + // this feels like a btrfs bug which we are going to retry a few + // times. See GH issue #349 for additional information. + // + dir_iter_retry = (dir_iter_retries++ != 3); + + (dir_iter_retry + ? warn + : error) << "unable to iterate over " << vd << ": " << e; + + if (dir_iter_retry) + break; + else + throw failed (); } } // Outer dir_iterator loop. - } - catch (const system_error& e) - { - // Once in a while we get ENOENT while iterating over the machines - // directory. This directory contains the machine directories (not - // subvolumes) and is not being changed when we get this error. Maybe - // this is due to directory sizes/timestamps changes, but then we would - // expect to get this error a lot more often..? So this feels like a - // btrfs bug which we are going to retry a few times. See GH issue #349 - // for additional information. - // - bool retry (dir_iter_retry++ != 2); - (retry ? warn : error) << "unable to iterate over " << machines - << ": " << e; - if (retry) + if (dir_iter_retry) continue; // Re-enumerate from scratch. else - throw failed (); + dir_iter_retries = 0; // Reset for re-enumeration due to other reasons. + } + catch (const system_error& e) + { + fail << "unable to iterate over " << machines << ": " << e; } - - dir_iter_retry = 0; // Reset for re-enumeration due to other reasons. // See if there is a pending bootstrap and whether we can perform it. // -- cgit v1.1