Skip to content

Commit f3056c0

Browse files
joaosigmamiguelaraujo
authored and committed
BUG#34614769 clusterset.status() reports repl channel issues when its own failover is ongoing
Currently, the shell retrieves the state of the replication channel "clusterset_replication" and, if it contains an error, reports the channel as being in error. This means that, while the channel is still connecting (and the error is related to that connection attempt), the state is never shown as connecting, only as an error. This is misleading, because errors during a connection attempt are acceptable. This patch addresses the problem by acknowledging that the channel is connecting and ignoring the error until the channel state changes to either ON or OFF. As a result, in ClusterSet.status(), the "clusterSetReplicationStatus" field of a cluster can return "CONNECTING" (in which case the "globalStatus" is now "OK"), and in ReplicaSet.status(), the "status" field of an instance can now show "CONNECTING".
Change-Id: Ief5d96f14ce6342a29f78916f316f99c9188116d
1 parent d7d2e22 commit f3056c0

File tree

23 files changed

+293
-221
lines changed

23 files changed

+293
-221
lines changed

modules/adminapi/cluster_set/cluster_set_impl.cc

Lines changed: 42 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -550,53 +550,47 @@ Cluster_channel_status Cluster_set_impl::get_replication_channel_status(
550550
const Cluster_impl &cluster) const {
551551
// Get the ClusterSet member metadata
552552
auto cluster_primary = cluster.get_cluster_server();
553-
554-
if (cluster_primary) {
555-
mysqlshdk::mysql::Replication_channel channel;
556-
557-
if (mysqlshdk::mysql::get_channel_status(
558-
*cluster_primary, k_clusterset_async_channel_name, &channel)) {
559-
if (channel.status() != mysqlshdk::mysql::Replication_channel::ON) {
560-
log_info("Channel '%s' at %s not ON: %s",
561-
k_clusterset_async_channel_name,
562-
cluster_primary->descr().c_str(),
563-
mysqlshdk::mysql::format_status(channel, true).c_str());
564-
} else {
565-
if (cluster.is_primary_cluster()) {
566-
log_info("Unexpected channel '%s' at %s: %s",
567-
k_clusterset_async_channel_name,
568-
cluster_primary->descr().c_str(),
569-
mysqlshdk::mysql::format_status(channel, true).c_str());
570-
}
571-
}
572-
573-
switch (channel.status()) {
574-
case mysqlshdk::mysql::Replication_channel::CONNECTING:
575-
case mysqlshdk::mysql::Replication_channel::ON: {
576-
auto primary = get_primary_master();
577-
auto primary_cluster = get_primary_cluster();
578-
579-
if (primary && primary_cluster &&
580-
primary_cluster->cluster_availability() ==
581-
Cluster_availability::ONLINE) {
582-
if (channel.source_uuid != primary->get_uuid())
583-
return Cluster_channel_status::MISCONFIGURED;
584-
}
585-
return Cluster_channel_status::OK;
586-
break;
587-
}
588-
589-
case mysqlshdk::mysql::Replication_channel::OFF:
590-
case mysqlshdk::mysql::Replication_channel::RECEIVER_OFF:
591-
case mysqlshdk::mysql::Replication_channel::APPLIER_OFF:
592-
return Cluster_channel_status::STOPPED;
593-
case mysqlshdk::mysql::Replication_channel::CONNECTION_ERROR:
594-
case mysqlshdk::mysql::Replication_channel::APPLIER_ERROR:
595-
return Cluster_channel_status::ERROR;
596-
}
597-
} else {
598-
return Cluster_channel_status::MISSING;
553+
if (!cluster_primary) return Cluster_channel_status::UNKNOWN;
554+
555+
mysqlshdk::mysql::Replication_channel channel;
556+
if (!mysqlshdk::mysql::get_channel_status(
557+
*cluster_primary, k_clusterset_async_channel_name, &channel))
558+
return Cluster_channel_status::MISSING;
559+
560+
if (channel.status() != mysqlshdk::mysql::Replication_channel::ON) {
561+
log_info("Channel '%s' at %s not ON: %s", k_clusterset_async_channel_name,
562+
cluster_primary->descr().c_str(),
563+
mysqlshdk::mysql::format_status(channel, true).c_str());
564+
} else if (cluster.is_primary_cluster()) {
565+
log_info("Unexpected channel '%s' at %s: %s",
566+
k_clusterset_async_channel_name, cluster_primary->descr().c_str(),
567+
mysqlshdk::mysql::format_status(channel, true).c_str());
568+
}
569+
570+
switch (channel.status()) {
571+
case mysqlshdk::mysql::Replication_channel::CONNECTING:
572+
return Cluster_channel_status::CONNECTING;
573+
574+
case mysqlshdk::mysql::Replication_channel::ON: {
575+
auto primary = get_primary_master();
576+
auto primary_cluster = get_primary_cluster();
577+
578+
if (primary && primary_cluster &&
579+
(primary_cluster->cluster_availability() ==
580+
Cluster_availability::ONLINE) &&
581+
(channel.source_uuid != primary->get_uuid()))
582+
return Cluster_channel_status::MISCONFIGURED;
583+
584+
return Cluster_channel_status::OK;
599585
}
586+
587+
case mysqlshdk::mysql::Replication_channel::OFF:
588+
case mysqlshdk::mysql::Replication_channel::RECEIVER_OFF:
589+
case mysqlshdk::mysql::Replication_channel::APPLIER_OFF:
590+
return Cluster_channel_status::STOPPED;
591+
case mysqlshdk::mysql::Replication_channel::CONNECTION_ERROR:
592+
case mysqlshdk::mysql::Replication_channel::APPLIER_ERROR:
593+
return Cluster_channel_status::ERROR;
600594
}
601595

602596
return Cluster_channel_status::UNKNOWN;
@@ -676,6 +670,7 @@ Cluster_global_status Cluster_set_impl::get_cluster_global_status(
676670

677671
switch (get_replication_channel_status(*cluster)) {
678672
case Cluster_channel_status::OK:
673+
case Cluster_channel_status::CONNECTING:
679674
case Cluster_channel_status::MISCONFIGURED:
680675
case Cluster_channel_status::ERROR:
681676
// unexpected at primary cluster
@@ -705,6 +700,7 @@ Cluster_global_status Cluster_set_impl::get_cluster_global_status(
705700
default:
706701
switch (get_replication_channel_status(*cluster)) {
707702
case Cluster_channel_status::OK:
703+
case Cluster_channel_status::CONNECTING:
708704
ret = Cluster_global_status::OK;
709705
break;
710706
case Cluster_channel_status::STOPPED:

modules/adminapi/cluster_set/status.cc

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -40,11 +40,11 @@ shcore::Array_t cluster_diagnostics(
4040
shcore::Array_t cluster_errors) {
4141
using mysqlshdk::mysql::Replication_channel;
4242

43-
auto append_error = [&cluster_errors](const std::string &msg) {
43+
auto append_error = [&cluster_errors](std::string msg) {
4444
if (!cluster_errors) {
4545
cluster_errors = shcore::make_array();
4646
}
47-
cluster_errors->push_back(shcore::Value(msg));
47+
cluster_errors->push_back(shcore::Value(std::move(msg)));
4848
};
4949

5050
// GTID_EXECUTED consistency handled elsewhere
@@ -53,30 +53,34 @@ shcore::Array_t cluster_diagnostics(
5353
if (cluster->is_primary_cluster()) {
5454
if (!channel.host.empty() ||
5555
channel.status() != Replication_channel::Status::OFF) {
56-
append_error("WARNING: Unexpected replication channel '" +
57-
channel.channel_name + "' at Primary Cluster");
56+
append_error(shcore::str_format(
57+
"WARNING: Unexpected replication channel '%s' at Primary Cluster",
58+
channel.channel_name.c_str()));
5859
}
5960
} else {
6061
if (channel.host.empty()) {
6162
append_error(
6263
"WARNING: Replication channel from the Primary Cluster is missing");
6364
} else {
64-
if (channel.status() != Replication_channel::Status::ON) {
65+
auto channel_status = channel.status();
66+
if ((channel_status != Replication_channel::Status::CONNECTING) &&
67+
(channel_status != Replication_channel::Status::ON)) {
6568
append_error(
6669
"WARNING: Replication from the Primary Cluster not in expected "
6770
"state");
68-
}
69-
70-
auto primary = primary_cluster->get_primary_master();
7171

72-
if (primary_cluster->cluster_availability() ==
73-
Cluster_availability::ONLINE &&
74-
channel.source_uuid != primary->get_uuid()) {
75-
append_error(shcore::str_format(
76-
"WARNING: Replicating from wrong source. Expected %s (%s) but is "
77-
"%s:%i (%s)",
78-
primary->descr().c_str(), primary->get_uuid().c_str(),
79-
channel.host.c_str(), channel.port, channel.source_uuid.c_str()));
72+
auto primary = primary_cluster->get_primary_master();
73+
74+
if (primary_cluster->cluster_availability() ==
75+
Cluster_availability::ONLINE &&
76+
channel.source_uuid != primary->get_uuid()) {
77+
append_error(shcore::str_format(
78+
"WARNING: Replicating from wrong source. Expected %s (%s) but "
79+
"is %s:%i (%s)",
80+
primary->descr().c_str(), primary->get_uuid().c_str(),
81+
channel.host.c_str(), channel.port,
82+
channel.source_uuid.c_str()));
83+
}
8084
}
8185
}
8286
}

modules/adminapi/common/cluster_types.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,8 @@ std::string to_string(Cluster_channel_status status) {
176176
switch (status) {
177177
case Cluster_channel_status::OK:
178178
return "OK";
179+
case Cluster_channel_status::CONNECTING:
180+
return "CONNECTING";
179181
case Cluster_channel_status::STOPPED:
180182
return "STOPPED";
181183
case Cluster_channel_status::ERROR:

modules/adminapi/common/cluster_types.h

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ Cluster_status to_cluster_status(const std::string &s);
102102
enum class Cluster_global_status {
103103
OK, // If it's a Primary, it must have any of the OK_* status. If a Replica,
104104
// any of the OK_* status plus the Replication Channel status must be OK
105+
// or CONNECTING
105106
OK_NOT_REPLICATING, // Replica Cluster with any of the OK_* status and
106107
// Replication Channel status STOPPED or ERROR
107108
OK_NOT_CONSISTENT, // Replica Cluster with any of the OK_* status and
@@ -120,15 +121,16 @@ enum class Cluster_global_status {
120121
std::string to_string(Cluster_global_status status);
121122

122123
enum class Cluster_channel_status {
123-
OK, // Replication channel up and running
124-
STOPPED, // Replication channel stopped gracefully. Either both IO and SQL
125-
// threads or just one of them.
126-
ERROR, // Replication channel stopped due to a replication error (e.g.
127-
// conflicting GTID-set)
128-
MISCONFIGURED, // Channel exists but is replicating from the wrong place
129-
MISSING, // Channel doesn't exist
124+
OK, // Replication channel up and running.
125+
CONNECTING, // Replication channel is connecting.
126+
STOPPED, // Replication channel stopped gracefully. Either both IO and SQL
127+
// threads or just one of them.
128+
ERROR, // Replication channel stopped due to a replication error (e.g.
129+
// conflicting GTID-set).
130+
MISCONFIGURED, // Channel exists but is replicating from the wrong place.
131+
MISSING, // Channel doesn't exist.
130132
UNKNOWN // Shell cannot connect to the Replica Cluster to obtain information
131-
// about the replication channel and others
133+
// about the replication channel and others.
132134
};
133135

134136
std::string to_string(Cluster_channel_status status);

modules/adminapi/common/dba_errors.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,7 @@
173173
#define SHERR_DBA_CLUSTER_NOT_FENCED 51617
174174
#define SHERR_DBA_CLUSTER_NOT_CONFIGURED_TRANSACTION_SIZE_LIMIT 51618
175175
#define SHERR_DBA_CLUSTER_OFFLINE 51619
176+
#define SHERR_DBA_PRIMARY_CLUSTER_STILL_CONNECTING 51620
176177

177178
// Read-Replica errors
178179
#define SHERR_DBA_READ_REPLICA_SETUP_ERROR 51700

modules/adminapi/common/global_topology.cc

Lines changed: 39 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2019, 2022, Oracle and/or its affiliates.
2+
* Copyright (c) 2019, 2023, Oracle and/or its affiliates.
33
*
44
* This program is free software; you can redistribute it and/or modify
55
* it under the terms of the GNU General Public License, version 2.0,
@@ -52,6 +52,8 @@ std::string to_string(topology::Node_status status) {
5252
return "INCONSISTENT";
5353
case topology::Node_status::ONLINE:
5454
return "ONLINE";
55+
case topology::Node_status::CONNECTING:
56+
return "CONNECTING";
5557
}
5658
throw std::logic_error("internal error");
5759
}
@@ -212,50 +214,46 @@ Node_status Server::status() const {
212214

213215
if (server.status() != Instance_status::OK) return Node_status::UNREACHABLE;
214216

215-
if (!master_instance_uuid.empty()) {
216-
// SECONDARY checks
217-
if (!server.master_channel || !master_node_ptr) return Node_status::ERROR;
218-
219-
// check that the actual source is the expected source
220-
if (server.master_channel->info.source_uuid !=
221-
master_node_ptr->get_primary_member()->uuid) {
222-
log_warning(
223-
"Instance %s is expected to have source %s (%s), but is %s:%i (%s)",
224-
label.c_str(), master_node_ptr->label.c_str(),
225-
master_node_ptr->get_primary_member()->uuid.c_str(),
226-
server.master_channel->info.host.c_str(),
227-
server.master_channel->info.port,
228-
server.master_channel->info.source_uuid.c_str());
229-
return Node_status::ERROR;
230-
}
231-
232-
if (server.master_channel->info.status() ==
233-
mysqlshdk::mysql::Replication_channel::OFF ||
234-
server.master_channel->info.status() ==
235-
mysqlshdk::mysql::Replication_channel::APPLIER_OFF ||
236-
server.master_channel->info.status() ==
237-
mysqlshdk::mysql::Replication_channel::RECEIVER_OFF)
238-
return Node_status::OFFLINE;
239-
240-
if (server.master_channel->info.status() !=
241-
mysqlshdk::mysql::Replication_channel::CONNECTING &&
242-
server.master_channel->info.status() !=
243-
mysqlshdk::mysql::Replication_channel::ON)
244-
return Node_status::ERROR;
245-
246-
// check for GTID set inconsistencies
247-
if (errant_transaction_count.value_or(0) > 0)
248-
return Node_status::INCONSISTENT;
249-
250-
if (!server.is_fenced()) return Node_status::ERROR;
251-
} else {
217+
if (master_instance_uuid.empty()) {
252218
// PRIMARY checks
253219
if (server.is_fenced()) return Node_status::ERROR;
254-
255-
if (server.master_channel) return Node_status::ERROR;
220+
return (server.master_channel ? Node_status::ERROR : Node_status::ONLINE);
221+
}
222+
223+
// SECONDARY checks
224+
if (!server.master_channel || !master_node_ptr) return Node_status::ERROR;
225+
226+
// check that the actual source is the expected source
227+
if (server.master_channel->info.source_uuid !=
228+
master_node_ptr->get_primary_member()->uuid) {
229+
log_warning(
230+
"Instance %s is expected to have source %s (%s), but is %s:%i (%s)",
231+
label.c_str(), master_node_ptr->label.c_str(),
232+
master_node_ptr->get_primary_member()->uuid.c_str(),
233+
server.master_channel->info.host.c_str(),
234+
server.master_channel->info.port,
235+
server.master_channel->info.source_uuid.c_str());
236+
return Node_status::ERROR;
237+
}
238+
239+
switch (auto status = server.master_channel->info.status(); status) {
240+
case mysqlshdk::mysql::Replication_channel::ON:
241+
break;
242+
case mysqlshdk::mysql::Replication_channel::OFF:
243+
case mysqlshdk::mysql::Replication_channel::APPLIER_OFF:
244+
case mysqlshdk::mysql::Replication_channel::RECEIVER_OFF:
245+
return Node_status::OFFLINE;
246+
case mysqlshdk::mysql::Replication_channel::CONNECTING:
247+
return Node_status::CONNECTING;
248+
default:
249+
return Node_status::ERROR;
256250
}
257251

258-
return Node_status::ONLINE;
252+
// replication channel status is ON, check for GTID set inconsistencies
253+
if (errant_transaction_count.value_or(0) > 0)
254+
return Node_status::INCONSISTENT;
255+
256+
return (server.is_fenced() ? Node_status::ONLINE : Node_status::ERROR);
259257
}
260258

261259
Node_role Server::role() const {

modules/adminapi/common/global_topology.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2019, 2022, Oracle and/or its affiliates.
2+
* Copyright (c) 2019, 2023, Oracle and/or its affiliates.
33
*
44
* This program is free software; you can redistribute it and/or modify
55
* it under the terms of the GNU General Public License, version 2.0,
@@ -50,6 +50,7 @@ class Node;
5050

5151
enum class Node_status {
5252
ONLINE,
53+
CONNECTING,
5354
ERROR, // Replication related error
5455
INCONSISTENT, // GTID inconsistencies detected
5556
INVALIDATED, // Instance was invalidated in a failover

modules/adminapi/common/global_topology_check.cc

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2019, 2022, Oracle and/or its affiliates.
2+
* Copyright (c) 2019, 2023, Oracle and/or its affiliates.
33
*
44
* This program is free software; you can redistribute it and/or modify
55
* it under the terms of the GNU General Public License, version 2.0,
@@ -22,6 +22,7 @@
2222
*/
2323

2424
#include "modules/adminapi/common/global_topology_check.h"
25+
2526
#include "modules/adminapi/common/dba_errors.h"
2627
#include "mysqlshdk/include/shellcore/console.h"
2728

@@ -65,6 +66,7 @@ void validate_node_status(const topology::Node *node) {
6566
" is inconsistent with the rest of the replicaset.",
6667
SHERR_DBA_ASYNC_MEMBER_INCONSISTENT);
6768

69+
case topology::Node_status::CONNECTING:
6870
case topology::Node_status::ONLINE:
6971
break;
7072
}
@@ -82,25 +84,21 @@ void validate_star_topology_consistent(
8284

8385
void validate_global_topology_active_cluster_available(
8486
const topology::Global_topology &topology) {
85-
auto console = current_console();
86-
87+
assert(topology.is_single_active());
8788
if (topology.is_single_active()) {
8889
const topology::Node *master_node = topology.get_primary_master_node();
8990

9091
validate_node_status(master_node);
91-
} else {
92-
assert(0);
9392
}
9493
}
9594

9695
void validate_global_topology_consistent(
9796
const topology::Global_topology &topology) {
9897
validate_global_topology_active_cluster_available(topology);
9998

99+
assert(topology.type() == Global_topology_type::SINGLE_PRIMARY_TREE);
100100
if (topology.type() == Global_topology_type::SINGLE_PRIMARY_TREE) {
101101
validate_star_topology_consistent(topology);
102-
} else {
103-
assert(0);
104102
}
105103

106104
// // check the cluster that will be our master

0 commit comments

Comments
 (0)