
Commit fce2743

BUG#35444244 rebootClusterFromCompleteOutage unable to rejoin instance after major outage
Rebooting a Replica Cluster from a complete outage, when the Cluster can still reach its ClusterSet, results in a failure to rejoin the instances back into the Cluster. The Cluster itself is rebooted and rejoined to the ClusterSet, but the command then fails to rejoin the remaining instances. The scenario is only reproducible when the Cluster is missing transactions from the ClusterSet, or the Primary Cluster is under load.

The root cause is a missing transaction sync step after each instance's recovery account is recreated. The account must be recreated on the Primary Cluster so that it is replicated to the whole topology, but since there is no sync with the Replica Cluster, the account can still be missing when the rejoin of the instance is attempted.

This patch fixes that by making sure instances are only rejoined back into their Cluster after the Cluster has been rejoined to the ClusterSet and the transactions have been synced.

Change-Id: I58e1adcebe8907e77eccd17f81218afdfa9fddf5
1 parent e3e0a9a commit fce2743
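
In outline, the fix reorders the reboot of a Replica Cluster. The minimal C++ sketch below illustrates the idea only; the types and helper names are hypothetical stand-ins, not the actual AdminAPI code:

#include <string>
#include <vector>

// Hypothetical stand-in for a cluster member (illustration only).
struct Instance {
  std::string address;
};

// Rejoins the rebooted Cluster to its ClusterSet: the recovery accounts are
// recreated on the Primary Cluster and the Replica Cluster waits until those
// transactions have been applied locally (the previously missing sync step).
void rejoin_cluster_to_clusterset_and_sync() {}

// Rejoins one member; it can only succeed if the recreated recovery account
// has already replicated to the Replica Cluster.
void rejoin_member(const Instance & /*member*/) {}

void reboot_replica_cluster(const std::vector<Instance> &offline_members) {
  // Before the fix, the members were rejoined at this point, before the
  // ClusterSet rejoin and transaction sync, so the accounts could be missing.
  rejoin_cluster_to_clusterset_and_sync();

  // After the fix, the remaining members are rejoined only once the Cluster
  // is back in the ClusterSet and the transactions are synced.
  for (const auto &member : offline_members) rejoin_member(member);
}

int main() {
  reboot_replica_cluster({{"replica-1:3306"}, {"replica-2:3306"}});
}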

3 files changed: +85 -9 lines changed

modules/adminapi/cluster_set/cluster_set_impl.cc

Lines changed: 31 additions & 1 deletion
@@ -2122,14 +2122,44 @@ void Cluster_set_impl::set_maximum_transaction_size_limit(Cluster_impl *cluster,
     cluster_transaction_size_limit = value.as_int();
   }

+  // If there are Cluster members that are reachable but group_replication is
+  // either disabled or not installed, attempting to set
+  // group_replication_transaction_size_limit will result in an error. To avoid
+  // that, we check which are those instances to let the config_handler know
+  // that those are to be ignored
+  std::vector<std::string> ignore_instances_vec;
+  {
+    auto is_gr_active = [](const mysqlshdk::mysql::IInstance &instance) {
+      std::optional<std::string> plugin_state =
+          instance.get_plugin_status(mysqlshdk::gr::k_gr_plugin_name);
+      if (!plugin_state.has_value() ||
+          plugin_state.value_or("DISABLED") != "ACTIVE") {
+        return false;
+      }
+      return true;
+    };
+
+    cluster->execute_in_members(
+        {}, cluster->get_cluster_server()->get_connection_options(), {},
+        [&ignore_instances_vec, &is_gr_active](
+            const std::shared_ptr<Instance> &instance,
+            const mysqlshdk::gr::Member &) {
+          if (!is_gr_active(*instance)) {
+            ignore_instances_vec.push_back(instance->get_canonical_address());
+          }
+
+          return true;
+        });
+  }
+
   // The primary must be reachable at this point so it will always be
   // updated, but one of more secondaries might be unreachable and it's OK
   // if they are not updated. Auto-rejoins might fail due to transactions
   // being rejected, but the user will be warned about it in cluster.status()
   // and can fix it with .rescan(). Also, manually rejoining instances with
   // .rejoinInstance() will overcome the problem
   std::unique_ptr<mysqlshdk::config::Config> config =
-      cluster->create_config_object({}, false, false, true);
+      cluster->create_config_object(ignore_instances_vec, false, false, true);

   config->set(kGrTransactionSizeLimit,
               std::optional<int64_t>(cluster_transaction_size_limit));
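
Reduced to a self-contained sketch (a toy Member type rather than the real mysqlshdk interfaces), the filtering added above amounts to:

#include <iostream>
#include <optional>
#include <string>
#include <vector>

// Toy member descriptor: the plugin state is empty when group_replication is
// not installed, and e.g. "DISABLED" or "ACTIVE" otherwise.
struct Member {
  std::string address;
  std::optional<std::string> gr_plugin_state;
};

// Collect the addresses the configuration handler should skip: any member
// whose group_replication plugin is missing or not ACTIVE.
std::vector<std::string> members_to_ignore(const std::vector<Member> &members) {
  std::vector<std::string> ignore;
  for (const auto &m : members) {
    if (!m.gr_plugin_state.has_value() || *m.gr_plugin_state != "ACTIVE")
      ignore.push_back(m.address);
  }
  return ignore;
}

int main() {
  std::vector<Member> members = {{"host1:3306", "ACTIVE"},
                                 {"host2:3306", "DISABLED"},
                                 {"host3:3306", std::nullopt}};
  // host2 and host3 would be excluded from the transaction-size-limit update.
  for (const auto &addr : members_to_ignore(members)) std::cout << addr << "\n";
}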

modules/adminapi/dba/reboot_cluster_from_complete_outage.cc

Lines changed: 22 additions & 7 deletions
@@ -223,10 +223,20 @@ void rejoin_instances(Cluster_impl *cluster_impl,
         rejoin_options.gr_options.ip_allowlist = std::nullopt;
       }

+      // Do not handle the ClusterSet-related steps (configuration of the
+      // managed channel and transaction sync with the primary cluster) when:
+      //   - The Cluster was a Replica Cluster that was removed from the
+      //     ClusterSet, or
+      //   - It's not a ClusterSet member, or
+      //   - It's an INVALIDATED Cluster
+      bool ignore_cluster_set = removed_from_set ||
+                                !cluster_impl->is_cluster_set_member() ||
+                                cluster_impl->is_invalidated();
+
       Cluster_topology_executor<cluster::Rejoin_instance>{
-          cluster_impl, instance,
-          rejoin_options, options.switch_communication_stack.has_value(),
-          true, true}
+          cluster_impl, instance,
+          rejoin_options, options.switch_communication_stack.has_value(),
+          ignore_cluster_set, true}
           .run();

     } catch (const shcore::Error &e) {
@@ -1258,6 +1268,8 @@ std::shared_ptr<Cluster> Reboot_cluster_from_complete_outage::do_run() {
     reboot_seed();
   }

+  bool rejoin_remaning_instances = true;
+
   // don't rejoin the instances *if* cluster is in a cluster set and is
   // invalidated (former primary) or is a replica and the primary doesn't have
   // global status OK
@@ -1276,6 +1288,7 @@ std::shared_ptr<Cluster> Reboot_cluster_from_complete_outage::do_run() {
           "Cluster is rejoined to the ClusterSet.";
       console->print_info(msg);

+      rejoin_remaning_instances = false;
     } else if (!m_options.get_dry_run()) {
       // it's either a non ClusterSet instance or it is but it's not the
       // primary, so we just need to acquire the primary before rejoining the
@@ -1304,9 +1317,6 @@ std::shared_ptr<Cluster> Reboot_cluster_from_complete_outage::do_run() {
         }
       }
     }
-
-    rejoin_instances(cluster_impl.get(), *m_target_instance, instances,
-                     m_options, !cluster_is_multi_primary);
   }

   // if the cluster is part of a set
@@ -1339,7 +1349,6 @@ std::shared_ptr<Cluster> Reboot_cluster_from_complete_outage::do_run() {

       // also ensure SRO is enabled on all members
       cluster_set_impl->ensure_replica_settings(cluster_impl.get(), false);
-
     } catch (const shcore::Exception &e) {
       switch (e.code()) {
         case SHERR_DBA_DATA_ERRANT_TRANSACTIONS:
@@ -1377,6 +1386,12 @@ std::shared_ptr<Cluster> Reboot_cluster_from_complete_outage::do_run() {
     }
   }

+  if (rejoin_remaning_instances && !m_options.get_dry_run()) {
+    // and finally, rejoin all instances
+    rejoin_instances(cluster_impl.get(), *m_target_instance, instances,
+                     m_options, !cluster_is_multi_primary);
+  }
+
   if (m_options.get_dry_run()) {
     console->print_info("dryRun finished.");
     console->print_info();
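
The control flow added in this file boils down to two booleans. The toy C++ sketch below uses hypothetical names; the real conditions live on the Cluster implementation and the reboot options:

#include <cstdio>

// Toy model of the state consulted during the reboot (illustration only).
struct Reboot_context {
  bool removed_from_set = false;      // Replica Cluster removed from the ClusterSet
  bool is_cluster_set_member = true;  // still registered in a ClusterSet
  bool is_invalidated = false;        // e.g. an INVALIDATED former primary
  bool dry_run = false;
};

// Per-instance rejoin: skip the ClusterSet-specific steps (managed channel
// configuration, transaction sync with the primary cluster) when they do not
// apply to this Cluster.
bool ignore_cluster_set(const Reboot_context &ctx) {
  return ctx.removed_from_set || !ctx.is_cluster_set_member ||
         ctx.is_invalidated;
}

// End of the reboot: the remaining members are only rejoined when the rejoin
// was not skipped earlier (it is skipped for an invalidated Cluster or a
// Replica whose primary cluster is not available) and this is not a dry run.
bool should_rejoin_remaining(bool rejoin_remaining_instances,
                             const Reboot_context &ctx) {
  return rejoin_remaining_instances && !ctx.dry_run;
}

int main() {
  Reboot_context ctx;
  ctx.is_invalidated = true;  // e.g. rebooting a former primary
  std::printf("ignore_cluster_set=%d rejoin=%d\n", ignore_cluster_set(ctx),
              should_rejoin_remaining(false, ctx));
}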

unittest/scripts/auto/js_adminapi_clusterset/scripts/reboot_cluster_more.js

Lines changed: 32 additions & 1 deletion
@@ -120,6 +120,13 @@ shell.connect(__sandbox_uri1);
 cs = dba.getClusterSet();
 EXPECT_NO_THROWS(function(){ cs.rejoinCluster("replica"); });

+// Add some data to the primary Cluster
+session.runSql("create schema test");
+session.runSql("create table test.data (a int primary key auto_increment, data longtext)");
+for (i = 0; i < 20; i++) {
+    session.runSql("insert into test.data values (default, repeat('x', 4*1024*1024))");
+}
+
 EXPECT_NO_THROWS(function(){ old_primary.rejoinInstance(__sandbox_uri5); });
 EXPECT_NO_THROWS(function(){ old_primary.rejoinInstance(__sandbox_uri6); });

@@ -182,9 +189,13 @@ EXPECT_NO_THROWS(function(){ replica = dba.rebootClusterFromCompleteOutage("repl

 cs.rejoinCluster("replica");

-replica = dba.getCluster();
+wait_channel_ready(session, __mysql_sandbox_port6, "clusterset_replication");
+
 replica.rejoinInstance(__sandbox_uri5);

+// Rejoin creates a VCLE, so let's reconcile the GTID-set already
+cs.rejoinCluster("replica");
+
 shell.connect(__sandbox_uri6);
 replica2 = dba.getCluster();

@@ -354,6 +365,26 @@ CHECK_PRIMARY_CLUSTER([__sandbox_uri1, __sandbox_uri2, __sandbox_uri3], cluster)
 CHECK_REPLICA_CLUSTER([__sandbox_uri4, __sandbox_uri5, __sandbox_uri6], cluster, replica);
 CHECK_CLUSTER_SET(session);

+//@<> Rebooting a Replica Cluster should succeed to rejoin back its members even though there are missing transactions from the Primary Cluster
+testutil.stopGroup([__mysql_sandbox_port4, __mysql_sandbox_port5, __mysql_sandbox_port6]);
+
+shell.connect(__sandbox_uri1);
+
+// Add some data to the Primary Cluster
+session.runSql("create schema test2");
+session.runSql("create table test2.data (a int primary key auto_increment, data longtext)");
+for (i = 0; i < 20; i++) {
+    session.runSql("insert into test2.data values (default, repeat('x', 4*1024*1024))");
+}
+
+shell.connect(__sandbox_uri4);
+
+EXPECT_NO_THROWS(function(){ replica = dba.rebootClusterFromCompleteOutage(); });
+
+CHECK_PRIMARY_CLUSTER([__sandbox_uri1, __sandbox_uri2, __sandbox_uri3], cluster);
+CHECK_REPLICA_CLUSTER([__sandbox_uri4, __sandbox_uri5, __sandbox_uri6], cluster, replica);
+CHECK_CLUSTER_SET(session);
+
 //@<> Cleanup
 scene.destroy();
 testutil.destroySandbox(__mysql_sandbox_port4);
