BUG#35400167 Read replica not shown in cluster.status()

miguelaraujo · miguelaraujo · commit 2e4634d0783a · 2023-05-19T22:20:50.000+01:00
If a Read-Replica enters an ERROR state because of a replication
connection error (for example, it is missing transactions that were
purged) and its current source is then removed from the Cluster, the
Read-Replica is no longer shown in the output of cluster.status().

This patch fixes that by adding the missing check of whether the
read-replica's current source still belongs to the Cluster. If it does
not, the read-replica is considered rogue and is placed under the
current primary.

Change-Id: I504e266017f2af4a62b6afcc4ab223b481ad1e21
diff --git a/modules/adminapi/cluster/status.cc b/modules/adminapi/cluster/status.cc
@@ -30,6 +30,7 @@
 #include "modules/adminapi/common/async_topology.h"
 #include "modules/adminapi/common/common.h"
 #include "modules/adminapi/common/common_status.h"
+#include "modules/adminapi/common/dba_errors.h"
 #include "modules/adminapi/common/metadata_storage.h"
 #include "modules/adminapi/common/parallel_applier_options.h"
 #include "modules/adminapi/common/server_features.h"
@@ -2259,6 +2260,26 @@ shcore::Dictionary_t Status::get_read_replicas_info(
         continue;
       }
 
+      // If the current source does not belong to the cluster anymore, the
+      // instance must be considered rogue. The instance might have
+      // entered an error state and during that period the current source was
+      // removed from the cluster
+      try {
+        // Attempt to get the current source from the Cluster
+        m_cluster->get_metadata_storage()->get_instance_by_uuid(
+            rr_info.current_source_server_uuid);
+      } catch (const shcore::Exception &e) {
+        if (e.code() != SHERR_DBA_MEMBER_METADATA_MISSING) {
+          log_info("Error querying metadata for %s: %s\n",
+                   rr_info.current_source_server_uuid.c_str(), e.what());
+        }
+
+        // The source was not found in the Cluster metadata (or the lookup
+        // failed): place the read-replica under the primary, as rogue
+        rogue_read_replicas.push_back(rr_info);
+        continue;
+      }
+
       if (rr_info.current_source_server_uuid ==
           instance_md.actual_server_uuid) {
         member_read_replicas.push_back(rr_info);
diff --git a/unittest/scripts/auto/js_adminapi/scripts/cluster_status_read_replicas.js b/unittest/scripts/auto/js_adminapi/scripts/cluster_status_read_replicas.js
@@ -436,6 +436,53 @@ EXPECT_EQ("ERROR: Could not connect to the Read-Replica: The instance is unreach
 
 EXPECT_EQ({}, status["defaultReplicaSet"]["topology"][__endpoint2]["readReplicas"]);
 
+cluster.switchToSinglePrimaryMode();
+testutil.startSandbox(__mysql_sandbox_port5);
+cluster.removeInstance(__endpoint4, {force: true});
+CHECK_READ_REPLICA(__sandbox_uri5, cluster, "primary", __endpoint1);
+
+session5 = mysql.getSession(__sandbox_uri5);
+
+session5.runSql("stop replica");
+
+session1 = mysql.getSession(__sandbox_uri1);
+
+session1.runSql("CREATE schema foobar;")
+
+var session2 = mysql.getSession(__sandbox_uri2);
+
+session1.runSql("FLUSH BINARY LOGS");
+session1.runSql("PURGE BINARY LOGS BEFORE DATE_ADD(NOW(6), INTERVAL 1 DAY)");
+session2.runSql("FLUSH BINARY LOGS");
+session2.runSql("PURGE BINARY LOGS BEFORE DATE_ADD(NOW(6), INTERVAL 1 DAY)");
+
+session5.runSql("start replica");
+
+// Check that the Read-Replica is reported in ERROR state under __endpoint1
+status = cluster.status();
+print(status);
+read_replica1 = status["defaultReplicaSet"]["topology"][__endpoint1]["readReplicas"][__endpoint5];
+
+EXPECT_EQ(__endpoint5, read_replica1["address"]);
+EXPECT_EQ("ERROR", read_replica1["status"]);
+
+var regexp = /WARNING: Read Replica's replication channel stopped with a connection error: 'Got fatal error \d+ from source when reading data from binary log: 'Cannot replicate because the source purged required binary logs\. Replicate the missing transactions from elsewhere, or provision a new replica from backup\. Consider increasing the source's binary log expiration period\. The GTID set sent by the replica is '[0-9a-f-:,\s]+', and the missing transactions are '[0-9a-f-:,\s]+'''\. Use Cluster\.rejoinInstance\(\) to restore it\./;
+
+EXPECT_TRUE(read_replica1["instanceErrors"][0].match(regexp));
+
+cluster.setPrimaryInstance(__endpoint2);
+cluster.removeInstance(__endpoint1);
+
+// Confirm the instance is still placed under the current primary after the switch
+status = cluster.status();
+print(status);
+read_replica1 = status["defaultReplicaSet"]["topology"][__endpoint2]["readReplicas"][__endpoint5];
+
+EXPECT_EQ(__endpoint5, read_replica1["address"]);
+EXPECT_EQ("ERROR", read_replica1["status"]);
+
+EXPECT_TRUE(read_replica1["instanceErrors"][0].match(regexp));
+
 //@<> Cleanup
 scene.destroy();
 testutil.destroySandbox(__mysql_sandbox_port4);