Skip to content

Commit bfd0495

Browse files
author
Jaideep Karande
committed
Bug#25495393: GROUP REPLICATION CAN USE SOME VERBOSITY IN THE ERROR LOG
Description: At present not all scenarios of Group membership changes are logged. When member join or member leaves, customers cannot identify same by looking at logs. View changes are not logged. Present logging does not assist DBAs with corrective measures during network partitions or inform DBAs when member re-connects. Also at some places we are logging internal debug messages which cannot be decoded by DBAs. Important member related configuration is also not logged. Resolution: Logging has been analyzed for: 1. View changes 2. Member join 3. Member leave(graceful and crash) 4. Member unreachable 5. Member re-connect 6. Removal of debug messages 7. Member configuration has been extended with Member_id, member_uuid, single_primary_mode, and group_replication_auto_increment_increment 8. Improvements in single-primary mode status reporting and member leaving/joining logging New messages have been added and change to existing messages where required has been done.
1 parent a751a0b commit bfd0495

File tree

11 files changed

+244
-42
lines changed

11 files changed

+244
-42
lines changed

mysql-test/include/mtr_warnings.sql

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -280,8 +280,11 @@ INSERT INTO global_suppressions VALUES
280280
("\\[GCS\\] Error pushing message into group communication engine."),
281281
("\\[GCS\\] Message cannot be sent because the member does not belong to a group."),
282282
("Slave SQL for channel 'group_replication_recovery': ... The slave coordinator and worker threads are stopped, possibly leaving data in inconsistent state.*"),
283-
("The member lost contact with a majority of the members in the group. Until the network is restored.*"),
284-
("The member resumed contact with a majority of the members in the group.*"),
283+
("Member with address .* has become unreachable."),
284+
("This server is not able to reach a majority of members in the group.*"),
285+
("Member with address .* is reachable again."),
286+
("The member has resumed contact with a majority of the members in the group.*"),
287+
("Members removed from the group.*"),
285288

286289
("THE_LAST_SUPPRESSION")||
287290

rapid/plugin/group_replication/include/gcs_event_handlers.h

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,32 @@ class Plugin_gcs_events_handler: public Gcs_communication_event_listener,
267267
*/
268268
bool was_member_expelled_from_group(const Gcs_view& view) const;
269269

270+
/**
271+
Logs member joining message to error logs from view.
272+
273+
@param[in] view the view delivered by the GCS
274+
*/
275+
void log_members_joining_message(const Gcs_view& new_view) const;
276+
277+
/**
278+
Logs member leaving message to error logs from view.
279+
280+
@param[in] view the view delivered by the GCS
281+
*/
282+
void log_members_leaving_message(const Gcs_view& new_view) const;
283+
284+
/**
285+
This function return all members present in vector of Gcs_member_identifier
286+
in HOST:PORT format separated by comma.
287+
Function also return PRIMARY member if any in HOST:PORT format.
288+
289+
@param[in] joining/leaving members for this view
290+
@param[out] host and port of all members from view
291+
@param[out] primary member hosts and port of all members from view
292+
*/
293+
void get_hosts_from_view(const std::vector<Gcs_member_identifier> &members,
294+
std::string& all_hosts, std::string& primary_host) const;
295+
270296
Applier_module_interface* applier_module;
271297
Recovery_module* recovery_module;
272298

rapid/plugin/group_replication/include/member_info.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -526,6 +526,14 @@ class Group_member_info_manager_interface
526526
@return true if majority of the group is unreachable
527527
*/
528528
virtual bool is_majority_unreachable()= 0;
529+
530+
/**
531+
This method returns all ONLINE and RECOVERING members comma separated
532+
host and port in string format.
533+
534+
@return hosts and port of all ONLINE and RECOVERING members
535+
*/
536+
virtual std::string get_string_current_view_active_hosts() const = 0;
529537
};
530538

531539

@@ -578,6 +586,8 @@ class Group_member_info_manager: public Group_member_info_manager_interface
578586

579587
bool is_majority_unreachable();
580588

589+
std::string get_string_current_view_active_hosts() const;
590+
581591
private:
582592
void clear_members();
583593

rapid/plugin/group_replication/include/plugin_utils.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
#include "plugin_psi.h"
2525
#include <mysql/group_replication_priv.h>
2626

27+
void log_primary_member_details();
28+
2729
class Blocked_transaction_handler
2830
{
2931
public:

rapid/plugin/group_replication/src/certifier.cc

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1689,11 +1689,6 @@ void Certifier::enable_conflict_detection()
16891689
conflict_detection_enable= true;
16901690
local_member_info->enable_conflict_detection();
16911691
mysql_mutex_unlock(&LOCK_certification_info);
1692-
1693-
log_message(MY_INFORMATION_LEVEL,
1694-
"A new primary was elected, enabled conflict detection "
1695-
"until the new primary applies all relay logs");
1696-
16971692
DBUG_VOID_RETURN;
16981693
}
16991694

rapid/plugin/group_replication/src/gcs_event_handlers.cc

Lines changed: 139 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -290,13 +290,23 @@ Plugin_gcs_events_handler::on_suspicions(const std::vector<Gcs_member_identifier
290290
uit= std::find(tmp_unreachable.begin(), tmp_unreachable.end(), member);
291291
if (uit != tmp_unreachable.end())
292292
{
293+
if (!member_info->is_unreachable())
294+
log_message(MY_WARNING_LEVEL,
295+
"Member with address %s:%u has become unreachable.",
296+
member_info->get_hostname().c_str(), member_info->get_port());
297+
293298
member_info->set_unreachable();
294299

295300
// remove to not check again against this one
296301
tmp_unreachable.erase(uit);
297302
}
298303
else
299304
{
305+
if (member_info->is_unreachable())
306+
log_message(MY_WARNING_LEVEL,
307+
"Member with address %s:%u is reachable again.",
308+
member_info->get_hostname().c_str(), member_info->get_port());
309+
300310
member_info->set_reachable();
301311
}
302312
}
@@ -305,19 +315,23 @@ Plugin_gcs_events_handler::on_suspicions(const std::vector<Gcs_member_identifier
305315
if ((members.size() - unreachable.size()) <= (members.size() / 2))
306316
{
307317
if (!group_partition_handler->get_timeout_on_unreachable())
308-
log_message(MY_WARNING_LEVEL,
309-
"The member lost contact with a majority of the members in the"
310-
" group. Until the network is restored transactions will block."
311-
" As the value of group_replication_unreachable_majority_timeout"
312-
" is 0 the plugin will wait indefinitely for the network to be"
313-
" restored.");
318+
log_message(MY_ERROR_LEVEL,
319+
"This server is not able to reach a majority of members "
320+
"in the group. This server will now block all updates. "
321+
"The server will remain blocked until contact with the "
322+
"majority is restored. "
323+
"It is possible to use group_replication_force_members "
324+
"to force a new group membership.");
314325
else
315-
log_message(MY_WARNING_LEVEL,
316-
"The member lost contact with a majority of the members in the"
317-
" group. Until the network is restored transactions will block."
318-
" The plugin will wait for a network restore or timeout after"
319-
" the period defined on"
320-
" group_replication_unreachable_majority_timeout.");
326+
log_message(MY_ERROR_LEVEL,
327+
"This server is not able to reach a majority of members "
328+
"in the group. This server will now block all updates. "
329+
"The server will remain blocked for the next %lu seconds. "
330+
"Unless contact with the majority is restored, after this "
331+
"time the member will error out and leave the group. "
332+
"It is possible to use group_replication_force_members "
333+
"to force a new group membership.",
334+
group_partition_handler->get_timeout_on_unreachable());
321335

322336
if (!group_partition_handler->is_partition_handler_running() &&
323337
!group_partition_handler->is_partition_handling_terminated())
@@ -342,13 +356,86 @@ Plugin_gcs_events_handler::on_suspicions(const std::vector<Gcs_member_identifier
342356
{
343357
/* If it was not running or we canceled it in time */
344358
log_message(MY_WARNING_LEVEL,
345-
"The member resumed contact with a majority of the members"
346-
" in the group. Regular operation is re-established.");
359+
"The member has resumed contact with a majority of the "
360+
"members in the group. Regular operation is restored and "
361+
"transactions are unblocked.");
347362
}
348363
}
349364
}
350365
}
351366

367+
void
368+
Plugin_gcs_events_handler::log_members_leaving_message(const Gcs_view& new_view) const
369+
{
370+
std::string members_leaving;
371+
std::string primary_member_host;
372+
373+
get_hosts_from_view(new_view.get_leaving_members(), members_leaving, primary_member_host);
374+
375+
log_message(MY_WARNING_LEVEL,
376+
"Members removed from the group: %s",
377+
members_leaving.c_str());
378+
379+
if (!primary_member_host.empty())
380+
log_message(MY_INFORMATION_LEVEL,
381+
"Primary server with address %s left the group. "
382+
"Electing new Primary.",
383+
primary_member_host.c_str());
384+
}
385+
386+
void
387+
Plugin_gcs_events_handler::log_members_joining_message(const Gcs_view& new_view) const
388+
{
389+
std::string members_joining;
390+
std::string primary_member_host;
391+
392+
get_hosts_from_view(new_view.get_joined_members(), members_joining, primary_member_host);
393+
394+
log_message(MY_INFORMATION_LEVEL,
395+
"Members joined the group: %s",
396+
members_joining.c_str());
397+
}
398+
399+
void
400+
Plugin_gcs_events_handler::get_hosts_from_view(const std::vector<Gcs_member_identifier> &members,
401+
std::string& all_hosts, std::string& primary_host) const
402+
{
403+
std::stringstream hosts_string;
404+
std::stringstream primary_string;
405+
std::vector<Gcs_member_identifier>::const_iterator all_members_it= members.begin();
406+
407+
while (all_members_it != members.end())
408+
{
409+
Group_member_info* member_info= group_member_mgr->
410+
get_group_member_info_by_member_id((*all_members_it));
411+
all_members_it++;
412+
413+
if (member_info == NULL)
414+
continue;
415+
416+
hosts_string << member_info->get_hostname() << ":" << member_info->get_port();
417+
418+
/**
419+
Check in_primary_mode has been added for safety.
420+
Since primary role is in single-primary mode.
421+
*/
422+
if (member_info->in_primary_mode() &&
423+
member_info->get_role() == Group_member_info::MEMBER_ROLE_PRIMARY)
424+
{
425+
if (primary_string.tellp() != 0)
426+
primary_string << ", ";
427+
primary_string << member_info->get_hostname() << ":" << member_info->get_port();
428+
}
429+
430+
if (all_members_it != members.end())
431+
{
432+
hosts_string << ", ";
433+
}
434+
}
435+
all_hosts.assign (hosts_string.str());
436+
primary_host.assign (primary_string.str());
437+
}
438+
352439
void
353440
Plugin_gcs_events_handler::on_view_changed(const Gcs_view& new_view,
354441
const Exchanged_data &exchanged_data)
@@ -393,8 +480,9 @@ Plugin_gcs_events_handler::on_view_changed(const Gcs_view& new_view,
393480
{
394481
/* If it was not running or we canceled it in time */
395482
log_message(MY_WARNING_LEVEL,
396-
"The member resumed contact with a majority of the members"
397-
" in the group. Regular operation is re-established.");
483+
"The member has resumed contact with a majority of the "
484+
"members in the group. Regular operation is restored and "
485+
"transactions are unblocked.");
398486
}
399487
}
400488

@@ -405,9 +493,15 @@ Plugin_gcs_events_handler::on_view_changed(const Gcs_view& new_view,
405493
if (!is_leaving && group_partition_handler->is_partition_handling_terminated())
406494
return;
407495

496+
if (!is_leaving && new_view.get_leaving_members().size() > 0)
497+
log_members_leaving_message(new_view);
498+
408499
//update the Group Manager with all the received states
409500
this->update_group_info_manager(new_view, exchanged_data, is_leaving);
410501

502+
if (!is_joining && new_view.get_joined_members().size() > 0)
503+
log_members_joining_message(new_view);
504+
411505
//enable conflict detection if someone on group have it enabled
412506
if (local_member_info->in_primary_mode() &&
413507
group_member_mgr->is_conflict_detection_enabled())
@@ -439,6 +533,19 @@ Plugin_gcs_events_handler::on_view_changed(const Gcs_view& new_view,
439533
//Signal that the injected view was delivered
440534
if (view_change_notifier->is_injected_view_modification())
441535
view_change_notifier->end_view_modification();
536+
537+
if (!is_leaving)
538+
{
539+
log_message(MY_INFORMATION_LEVEL,
540+
"Group membership changed to %s on view %s.",
541+
group_member_mgr->get_string_current_view_active_hosts().c_str(),
542+
gcs_module->get_current_view()->get_view_id().get_representation().c_str());
543+
}
544+
else
545+
{
546+
log_message(MY_INFORMATION_LEVEL,
547+
"Group membership changed: This member has left the group.");
548+
}
442549
}
443550

444551
bool
@@ -670,12 +777,17 @@ void Plugin_gcs_events_handler::handle_leader_election_if_needed() const
670777
group_member_mgr->update_member_role(primary_uuid,
671778
Group_member_info::MEMBER_ROLE_PRIMARY);
672779

780+
log_message(MY_INFORMATION_LEVEL, "A new primary with address %s:%u "
781+
"was elected, enabling conflict detection until the new "
782+
"primary applies all relay logs.",
783+
the_primary->get_hostname().c_str(),
784+
the_primary->get_port());
785+
673786
// Check if the session was established, it can (re)set read only mode.
674787
if (!skip_set_super_readonly)
675788
{
676789
if (is_primary_local)
677790
{
678-
log_message(MY_INFORMATION_LEVEL, "Unsetting super_read_only.");
679791
if (disable_super_read_only_mode(sql_command_interface))
680792
{
681793
log_message(MY_WARNING_LEVEL,
@@ -685,7 +797,6 @@ void Plugin_gcs_events_handler::handle_leader_election_if_needed() const
685797
}
686798
else
687799
{
688-
log_message(MY_INFORMATION_LEVEL, "Setting super_read_only.");
689800
if (enable_super_read_only_mode(sql_command_interface))
690801
{
691802
log_message(MY_WARNING_LEVEL,
@@ -694,6 +805,16 @@ void Plugin_gcs_events_handler::handle_leader_election_if_needed() const
694805
}
695806
}
696807
}
808+
/* code position limits messaging to primary change */
809+
if (is_primary_local)
810+
log_message(MY_INFORMATION_LEVEL,
811+
"This server is working as primary member.");
812+
else
813+
log_message(MY_INFORMATION_LEVEL,
814+
"This server is working as secondary member with primary "
815+
"member address %s:%u.",
816+
the_primary->get_hostname().c_str(),
817+
the_primary->get_port());
697818
}
698819
}
699820
else if (!skip_set_super_readonly)
@@ -812,11 +933,6 @@ void Plugin_gcs_events_handler::handle_joining_members(const Gcs_view& new_view,
812933
Group_member_info::MEMBER_IN_RECOVERY,
813934
Group_member_info::MEMBER_OFFLINE,
814935
Group_member_info::MEMBER_END);
815-
816-
log_message(MY_INFORMATION_LEVEL,
817-
"Starting group replication recovery with view_id %s",
818-
new_view.get_view_id().get_representation().c_str());
819-
820936
/**
821937
Set the read mode if not set during start (auto-start)
822938
*/
@@ -909,10 +1025,6 @@ void Plugin_gcs_events_handler::handle_joining_members(const Gcs_view& new_view,
9091025
Group_member_info::MEMBER_IN_RECOVERY,
9101026
Group_member_info::MEMBER_OFFLINE,
9111027
Group_member_info::MEMBER_END);
912-
913-
log_message(MY_INFORMATION_LEVEL,
914-
"Marking group replication view change with view_id %s",
915-
new_view.get_view_id().get_representation().c_str());
9161028
/**
9171029
If not a joining member, all members should record on their own binlogs a
9181030
marking event that identifies the frontier between the data the joining

rapid/plugin/group_replication/src/member_info.cc

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -849,6 +849,26 @@ bool Group_member_info_manager::is_majority_unreachable()
849849
return ret;
850850
}
851851

852+
std::string
853+
Group_member_info_manager::get_string_current_view_active_hosts() const
854+
{
855+
std::stringstream hosts_string;
856+
map<string, Group_member_info*>::iterator all_members_it= members->begin();
857+
858+
while (all_members_it != members->end())
859+
{
860+
Group_member_info* member_info= (*all_members_it).second;
861+
if (member_info->get_recovery_status() == Group_member_info::MEMBER_ONLINE ||
862+
member_info->get_recovery_status() == Group_member_info::MEMBER_IN_RECOVERY)
863+
hosts_string << member_info->get_hostname() << ":" << member_info->get_port();
864+
all_members_it++;
865+
if (all_members_it != members->end())
866+
hosts_string<<", ";
867+
}
868+
869+
return hosts_string.str();
870+
}
871+
852872
Group_member_info_manager_message::Group_member_info_manager_message()
853873
: Plugin_gcs_message(CT_MEMBER_INFO_MANAGER_MESSAGE)
854874
{

0 commit comments

Comments
 (0)