Skip to content

Commit 2e4634d

Browse files
committed
BUG#35400167 Read replica not shown in cluster.status()
If a Read-Replica enters an ERROR state due to a connection error (for example, because it is missing transactions that were purged from its source), and its current source is then removed from the Cluster, the Read-Replica is no longer shown in the output of cluster.status(). This patch fixes that by adding a missing check that verifies whether the Read-Replica's current source still belongs to the Cluster. If it does not, the Read-Replica is considered rogue and is placed under the current primary. Change-Id: I504e266017f2af4a62b6afcc4ab223b481ad1e21
1 parent 07057e6 commit 2e4634d

File tree

2 files changed

+68
-0
lines changed

2 files changed

+68
-0
lines changed

modules/adminapi/cluster/status.cc

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
#include "modules/adminapi/common/async_topology.h"
3131
#include "modules/adminapi/common/common.h"
3232
#include "modules/adminapi/common/common_status.h"
33+
#include "modules/adminapi/common/dba_errors.h"
3334
#include "modules/adminapi/common/metadata_storage.h"
3435
#include "modules/adminapi/common/parallel_applier_options.h"
3536
#include "modules/adminapi/common/server_features.h"
@@ -2259,6 +2260,26 @@ shcore::Dictionary_t Status::get_read_replicas_info(
22592260
continue;
22602261
}
22612262

2263+
// If the current source does not belong to the cluster anymore, the
2264+
// instance must be considered rogue. The instance might have
2265+
// entered an error state and during that period the current source was
2266+
// removed from the cluster
2267+
try {
2268+
// Attempt to get the current source from the Cluster
2269+
m_cluster->get_metadata_storage()->get_instance_by_uuid(
2270+
rr_info.current_source_server_uuid);
2271+
} catch (const shcore::Exception &e) {
2272+
if (e.code() != SHERR_DBA_MEMBER_METADATA_MISSING) {
2273+
log_info("Error querying metadata for %s: %s\n",
2274+
rr_info.current_source_server_uuid.c_str(), e.what());
2275+
}
2276+
2277+
// The source does not belong to the Cluster anymore, place the
2278+
// read-replica under the primary, as rogue
2279+
rogue_read_replicas.push_back(rr_info);
2280+
continue;
2281+
}
2282+
22622283
if (rr_info.current_source_server_uuid ==
22632284
instance_md.actual_server_uuid) {
22642285
member_read_replicas.push_back(rr_info);

unittest/scripts/auto/js_adminapi/scripts/cluster_status_read_replicas.js

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -436,6 +436,53 @@ EXPECT_EQ("ERROR: Could not connect to the Read-Replica: The instance is unreach
436436

437437
EXPECT_EQ({}, status["defaultReplicaSet"]["topology"][__endpoint2]["readReplicas"]);
438438

439+
cluster.switchToSinglePrimaryMode();
440+
testutil.startSandbox(__mysql_sandbox_port5);
441+
cluster.removeInstance(__endpoint4, {force: true});
442+
CHECK_READ_REPLICA(__sandbox_uri5, cluster, "primary", __endpoint1);
443+
444+
session5 = mysql.getSession(__sandbox_uri5);
445+
446+
session5.runSql("stop replica");
447+
448+
session1 = mysql.getSession(__sandbox_uri1);
449+
450+
session1.runSql("CREATE schema foobar;")
451+
452+
var session2 = mysql.getSession(__sandbox_uri2);
453+
454+
session1.runSql("FLUSH BINARY LOGS");
455+
session1.runSql("PURGE BINARY LOGS BEFORE DATE_ADD(NOW(6), INTERVAL 1 DAY)");
456+
session2.runSql("FLUSH BINARY LOGS");
457+
session2.runSql("PURGE BINARY LOGS BEFORE DATE_ADD(NOW(6), INTERVAL 1 DAY)");
458+
459+
session5.runSql("start replica");
460+
461+
// Check error
462+
status = cluster.status();
463+
print(status);
464+
read_replica1 = status["defaultReplicaSet"]["topology"][__endpoint1]["readReplicas"][__endpoint5];
465+
466+
EXPECT_EQ(__endpoint5, read_replica1["address"]);
467+
EXPECT_EQ("ERROR", read_replica1["status"]);
468+
469+
var regexp = /WARNING: Read Replica's replication channel stopped with a connection error: 'Got fatal error \d+ from source when reading data from binary log: 'Cannot replicate because the source purged required binary logs\. Replicate the missing transactions from elsewhere, or provision a new replica from backup\. Consider increasing the source's binary log expiration period\. The GTID set sent by the replica is '[0-9a-f-:,\s]+', and the missing transactions are '[0-9a-f-:,\s]+'''\. Use Cluster\.rejoinInstance\(\) to restore it\./;
470+
471+
EXPECT_TRUE(read_replica1["instanceErrors"][0].match(regexp));
472+
473+
cluster.setPrimaryInstance(__endpoint2);
474+
cluster.removeInstance(__endpoint1);
475+
476+
// Confirm the instance is still placed under the current primary after the switch
477+
status = cluster.status();
478+
print(status);
479+
read_replica1 = status["defaultReplicaSet"]["topology"][__endpoint2]["readReplicas"][__endpoint5];
480+
481+
EXPECT_EQ(__endpoint5, read_replica1["address"]);
482+
EXPECT_EQ("ERROR", read_replica1["status"]);
483+
484+
EXPECT_TRUE(read_replica1["instanceErrors"][0].match(regexp));
485+
439486
//@<> Cleanup
440487
scene.destroy();
441488
testutil.destroySandbox(__mysql_sandbox_port4);

0 commit comments

Comments
 (0)