Skip to content

Commit 51142fc

Browse files
committed
Enhance the error information reflected by the RPC status when failing to fall back (i.e., no fallback addresses were provided by the resolver) by including the original cause of entering fallback. This covers the following cases:
- balancer RPC timeout (includes a timeout message)
- balancer RPC failed before receiving any backend addresses (uses the error that occurred in the balancer RPC)
- all balancer-provided addresses failed while the balancer RPC had failed, causing fallback (uses the error status of one of the balancer-provided backends)
1 parent 9fc32f1 commit 51142fc

File tree

2 files changed

+185
-19
lines changed

2 files changed

+185
-19
lines changed

grpclb/src/main/java/io/grpc/grpclb/GrpclbState.java

Lines changed: 49 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -95,8 +95,20 @@ final class GrpclbState {
9595
static final Status NO_AVAILABLE_BACKENDS_STATUS =
9696
Status.UNAVAILABLE.withDescription("LoadBalancer responded without any backends");
9797
@VisibleForTesting
98-
static final Status NO_FALLBACK_BACKENDS_FOUND_STATUS =
99-
Status.UNAVAILABLE.withDescription("Unable to fallback, no fallback addresses found");
98+
static final String NO_FALLBACK_BACKENDS_ERROR =
99+
"Unable to fallback, no fallback addresses found";
100+
@VisibleForTesting
101+
static final Status BALANCER_TIMEOUT_STATUS =
102+
Status.UNAVAILABLE.withDescription("Timeout waiting for remote balancer");
103+
@VisibleForTesting
104+
static final Status BALANCER_REQUESTED_FALLBACK_STATUS =
105+
Status.UNAVAILABLE.withDescription("Fallback requested by balancer");
106+
// This error status should never be propagated to RPC failures, as "no backend or balancer
107+
// addresses found" should be directly handled as a name resolution error. So in cases of no
108+
// balancer address, fallback should never fail.
109+
private static final Status NO_LB_ADDRESS_PROVIDED_STATUS =
110+
Status.UNAVAILABLE.withDescription("No balancer address found");
111+
100112

101113
@VisibleForTesting
102114
static final RoundRobinEntry BUFFER_ENTRY = new RoundRobinEntry() {
@@ -137,6 +149,10 @@ enum Mode {
137149
private ScheduledHandle fallbackTimer;
138150
private List<EquivalentAddressGroup> fallbackBackendList = Collections.emptyList();
139151
private boolean usingFallbackBackends;
152+
// Reason for falling back; used as the RPC's error status if fallback fails (e.g., no
153+
// fallback addresses found).
154+
@Nullable
155+
private Status fallbackReason;
140156
// True if the current balancer has returned a serverlist. Will be reset to false when lost
141157
// connection to a balancer.
142158
private boolean balancerWorking;
@@ -239,6 +255,7 @@ void handleAddresses(
239255
// No balancer address: close existing balancer connection and enter fallback mode
240256
// immediately.
241257
shutdownLbComm();
258+
fallbackReason = NO_LB_ADDRESS_PROVIDED_STATUS;
242259
syncContext.execute(new FallbackModeTask());
243260
} else {
244261
startLbComm(newLbAddressGroups);
@@ -252,6 +269,7 @@ void handleAddresses(
252269
}
253270
// Start the fallback timer if it's never started
254271
if (fallbackTimer == null) {
272+
fallbackReason = BALANCER_TIMEOUT_STATUS;
255273
fallbackTimer = syncContext.schedule(
256274
new FallbackModeTask(), FALLBACK_TIMEOUT_MS, TimeUnit.MILLISECONDS, timerService);
257275
}
@@ -275,16 +293,21 @@ void requestConnection() {
275293
}
276294

277295
private void maybeUseFallbackBackends() {
278-
if (balancerWorking) {
279-
return;
280-
}
281-
if (usingFallbackBackends) {
296+
if (balancerWorking || usingFallbackBackends) {
282297
return;
283298
}
299+
// Balancer RPC should have either been broken or timed out.
300+
checkState(fallbackReason != null, "no reason to fallback");
284301
for (Subchannel subchannel : subchannels.values()) {
285-
if (subchannel.getAttributes().get(STATE_INFO).get().getState() == READY) {
302+
ConnectivityStateInfo stateInfo = subchannel.getAttributes().get(STATE_INFO).get();
303+
if (stateInfo.getState() == READY) {
286304
return;
287305
}
306+
// If we do have balancer-provided backends, use one of their errors in the error message if
307+
// we fail to fall back.
308+
if (stateInfo.getState() == TRANSIENT_FAILURE) {
309+
fallbackReason = stateInfo.getStatus();
310+
}
288311
}
289312
// Fallback conditions met
290313
useFallbackBackends();
@@ -658,7 +681,9 @@ private void handleResponse(LoadBalanceResponse response) {
658681
}
659682

660683
if (typeCase == LoadBalanceResponseTypeCase.FALLBACK_RESPONSE) {
684+
// Force entering fallback requested by balancer.
661685
cancelFallbackTimer();
686+
fallbackReason = BALANCER_REQUESTED_FALLBACK_STATUS;
662687
useFallbackBackends();
663688
maybeUpdatePicker();
664689
return;
@@ -701,8 +726,9 @@ private void handleResponse(LoadBalanceResponse response) {
701726
newBackendAddrList.add(new BackendAddressGroup(eag, token));
702727
}
703728
}
704-
// Stop using fallback backends as soon as a new server list is received from the balancer.
729+
// Exit fallback as soon as a new server list is received from the balancer.
705730
usingFallbackBackends = false;
731+
fallbackReason = null;
706732
cancelFallbackTimer();
707733
updateServerList(newDropList, newBackendAddrList, loadRecorder);
708734
maybeUpdatePicker();
@@ -717,6 +743,7 @@ private void handleStreamClosed(Status error) {
717743
cleanUp();
718744
propagateError(error);
719745
balancerWorking = false;
746+
fallbackReason = error;
720747
maybeUseFallbackBackends();
721748
maybeUpdatePicker();
722749

@@ -773,15 +800,16 @@ private void maybeUpdatePicker() {
773800
List<RoundRobinEntry> pickList;
774801
ConnectivityState state;
775802
if (backendList.isEmpty()) {
776-
if (balancerWorking) {
777-
pickList =
778-
Collections.<RoundRobinEntry>singletonList(
779-
new ErrorEntry(NO_AVAILABLE_BACKENDS_STATUS));
803+
// Note that a working balancer may force the use of fallback backends, and that fallback may
804+
// fail, so we must check whether we are currently in fallback first.
805+
if (usingFallbackBackends) {
806+
pickList = Collections.<RoundRobinEntry>singletonList(new ErrorEntry(
807+
fallbackReason.augmentDescription(NO_FALLBACK_BACKENDS_ERROR)));
780808
state = TRANSIENT_FAILURE;
781-
} else if (usingFallbackBackends) {
809+
} else if (balancerWorking) {
782810
pickList =
783811
Collections.<RoundRobinEntry>singletonList(
784-
new ErrorEntry(NO_FALLBACK_BACKENDS_FOUND_STATUS));
812+
new ErrorEntry(NO_AVAILABLE_BACKENDS_STATUS));
785813
state = TRANSIENT_FAILURE;
786814
} else { // still waiting for balancer
787815
pickList = Collections.singletonList(BUFFER_ENTRY);
@@ -1119,4 +1147,11 @@ public String toString() {
11191147
.toString();
11201148
}
11211149
}
1150+
1151+
private enum BalancerState {
1152+
// Using backends provided by remote balancer.
1153+
BALANCER,
1154+
// Using backends in the fallback backend list.
1155+
FALLBACK
1156+
}
11221157
}

grpclb/src/test/java/io/grpc/grpclb/GrpclbLoadBalancerTest.java

Lines changed: 136 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1284,11 +1284,16 @@ private void subtestGrpclbFallbackInitialTimeout(boolean timerExpires) {
12841284
for (Subchannel subchannel : mockSubchannels) {
12851285
verify(subchannelPool).returnSubchannel(eq(subchannel), any(ConnectivityStateInfo.class));
12861286
}
1287+
1288+
// RPC error status includes message of balancer RPC timeout
12871289
inOrder.verify(helper).updateBalancingState(eq(TRANSIENT_FAILURE), pickerCaptor.capture());
1288-
RoundRobinPicker picker = (RoundRobinPicker) pickerCaptor.getValue();
1289-
assertThat(picker.dropList).isEmpty();
1290-
assertThat(picker.pickList)
1291-
.containsExactly(new ErrorEntry(GrpclbState.NO_FALLBACK_BACKENDS_FOUND_STATUS));
1290+
PickResult result = pickerCaptor.getValue().pickSubchannel(mock(PickSubchannelArgs.class));
1291+
assertThat(result.getStatus().getCode())
1292+
.isEqualTo(GrpclbState.BALANCER_TIMEOUT_STATUS.getCode());
1293+
assertThat(result.getStatus().getDescription())
1294+
.startsWith(GrpclbState.BALANCER_TIMEOUT_STATUS.getDescription());
1295+
assertThat(result.getStatus().getDescription())
1296+
.contains(GrpclbState.NO_FALLBACK_BACKENDS_ERROR);
12921297
}
12931298

12941299
////////////////////////////////////////////////////////////////
@@ -1408,6 +1413,24 @@ public void grpclbFallback_breakLbStreamBeforeFallbackTimerExpires() {
14081413
eq(LoadBalanceRequest.newBuilder().setInitialRequest(
14091414
InitialLoadBalanceRequest.newBuilder().setName(SERVICE_AUTHORITY).build())
14101415
.build()));
1416+
1417+
//////////////////////////////////////////////////////////////////////
1418+
// Name resolver sends new resolution results without any backend addr
1419+
//////////////////////////////////////////////////////////////////////
1420+
deliverResolvedAddresses(Collections.<EquivalentAddressGroup>emptyList(), grpclbBalancerList);
1421+
1422+
// Still in fallback logic, except that the backend list is empty
1423+
for (Subchannel subchannel : mockSubchannels) {
1424+
verify(subchannelPool).returnSubchannel(eq(subchannel), any(ConnectivityStateInfo.class));
1425+
}
1426+
1427+
// RPC error status includes error of balancer stream
1428+
inOrder.verify(helper).updateBalancingState(eq(TRANSIENT_FAILURE), pickerCaptor.capture());
1429+
PickResult result = pickerCaptor.getValue().pickSubchannel(mock(PickSubchannelArgs.class));
1430+
assertThat(result.getStatus().getCode()).isEqualTo(streamError.getCode());
1431+
assertThat(result.getStatus().getDescription()).startsWith(streamError.getDescription());
1432+
assertThat(result.getStatus().getDescription())
1433+
.contains(GrpclbState.NO_FALLBACK_BACKENDS_ERROR);
14111434
}
14121435

14131436
@Test
@@ -1434,6 +1457,24 @@ public void grpclbFallback_noBalancerAddress() {
14341457
assertEquals(0, fakeClock.numPendingTasks(FALLBACK_MODE_TASK_FILTER));
14351458
verify(helper, never())
14361459
.createOobChannel(ArgumentMatchers.<EquivalentAddressGroup>anyList(), anyString());
1460+
logs.clear();
1461+
1462+
///////////////////////////////////////////////////////////////////////////////////////
1463+
// Name resolver sends new resolution results without any backend addr or balancer addr
1464+
///////////////////////////////////////////////////////////////////////////////////////
1465+
deliverResolvedAddresses(Collections.<EquivalentAddressGroup>emptyList(),
1466+
Collections.<EquivalentAddressGroup>emptyList());
1467+
assertThat(logs).containsExactly(
1468+
"DEBUG: [grpclb-<api.google.com>] Error: Status{code=UNAVAILABLE, "
1469+
+ "description=No backend or balancer addresses found, cause=null}");
1470+
1471+
// Keep using existing fallback addresses without interruption
1472+
for (Subchannel subchannel : mockSubchannels) {
1473+
verify(subchannelPool, never())
1474+
.returnSubchannel(eq(subchannel), any(ConnectivityStateInfo.class));
1475+
}
1476+
verify(helper, never())
1477+
.updateBalancingState(eq(TRANSIENT_FAILURE), any(SubchannelPicker.class));
14371478
}
14381479

14391480
@Test
@@ -1531,6 +1572,7 @@ private void subtestGrpclbFallbackConnectionLost(
15311572
}
15321573
assertEquals(0, fakeClock.numPendingTasks(FALLBACK_MODE_TASK_FILTER));
15331574

1575+
// No subchannel to fallback backends should have been created if no fallback happened
15341576
if (!(balancerBroken && allSubchannelsBroken)) {
15351577
verify(subchannelPool, never()).takeOrCreateSubchannel(
15361578
eq(backendList.get(0)), any(Attributes.class));
@@ -1539,6 +1581,74 @@ private void subtestGrpclbFallbackConnectionLost(
15391581
}
15401582
}
15411583

1584+
@Test
1585+
public void grpclbFallback_allLost_failToFallback() {
1586+
long loadReportIntervalMillis = 1983;
1587+
InOrder inOrder = inOrder(helper, mockLbService, subchannelPool);
1588+
1589+
// Create balancer and (empty) backend addresses
1590+
List<EquivalentAddressGroup> grpclbBalancerList = createResolvedBalancerAddresses(1);
1591+
deliverResolvedAddresses(Collections.<EquivalentAddressGroup>emptyList(), grpclbBalancerList);
1592+
1593+
inOrder.verify(helper).createOobChannel(eq(xattr(grpclbBalancerList)),
1594+
eq(lbAuthority(0) + NO_USE_AUTHORITY_SUFFIX));
1595+
1596+
// Attempted to connect to balancer
1597+
assertEquals(1, fakeOobChannels.size());
1598+
fakeOobChannels.poll();
1599+
inOrder.verify(mockLbService).balanceLoad(lbResponseObserverCaptor.capture());
1600+
StreamObserver<LoadBalanceResponse> lbResponseObserver = lbResponseObserverCaptor.getValue();
1601+
assertEquals(1, lbRequestObservers.size());
1602+
StreamObserver<LoadBalanceRequest> lbRequestObserver = lbRequestObservers.poll();
1603+
1604+
verify(lbRequestObserver).onNext(
1605+
eq(LoadBalanceRequest.newBuilder().setInitialRequest(
1606+
InitialLoadBalanceRequest.newBuilder().setName(SERVICE_AUTHORITY).build())
1607+
.build()));
1608+
lbResponseObserver.onNext(buildInitialResponse(loadReportIntervalMillis));
1609+
// We don't care if these methods have been run.
1610+
inOrder.verify(helper, atLeast(0)).getSynchronizationContext();
1611+
inOrder.verify(helper, atLeast(0)).getScheduledExecutorService();
1612+
1613+
inOrder.verifyNoMoreInteractions();
1614+
1615+
// Balancer returns a server list
1616+
List<ServerEntry> serverList = Arrays.asList(
1617+
new ServerEntry("127.0.0.1", 2000, "token0001"),
1618+
new ServerEntry("127.0.0.1", 2010, "token0002"));
1619+
lbResponseObserver.onNext(buildInitialResponse());
1620+
lbResponseObserver.onNext(buildLbResponse(serverList));
1621+
1622+
List<Subchannel> subchannels = fallbackTestVerifyUseOfBalancerBackendLists(inOrder, serverList);
1623+
1624+
// Break connections
1625+
lbResponseObserver.onError(Status.UNAVAILABLE.asException());
1626+
// A new stream to LB is created
1627+
inOrder.verify(mockLbService).balanceLoad(lbResponseObserverCaptor.capture());
1628+
lbResponseObserver = lbResponseObserverCaptor.getValue();
1629+
assertEquals(1, lbRequestObservers.size());
1630+
lbRequestObserver = lbRequestObservers.poll();
1631+
1632+
// Break all subchannel connections
1633+
Status error = Status.UNAUTHENTICATED.withDescription("Permission denied");
1634+
for (Subchannel subchannel : subchannels) {
1635+
deliverSubchannelState(subchannel, ConnectivityStateInfo.forTransientFailure(error));
1636+
}
1637+
1638+
// Recycle all subchannels
1639+
for (Subchannel subchannel : subchannels) {
1640+
verify(subchannelPool).returnSubchannel(eq(subchannel), any(ConnectivityStateInfo.class));
1641+
}
1642+
1643+
// RPC error status includes errors of subchannels to balancer-provided backends
1644+
inOrder.verify(helper).updateBalancingState(eq(TRANSIENT_FAILURE), pickerCaptor.capture());
1645+
PickResult result = pickerCaptor.getValue().pickSubchannel(mock(PickSubchannelArgs.class));
1646+
assertThat(result.getStatus().getCode()).isEqualTo(error.getCode());
1647+
assertThat(result.getStatus().getDescription()).startsWith(error.getDescription());
1648+
assertThat(result.getStatus().getDescription())
1649+
.contains(GrpclbState.NO_FALLBACK_BACKENDS_ERROR);
1650+
}
1651+
15421652
private List<Subchannel> fallbackTestVerifyUseOfFallbackBackendLists(
15431653
InOrder inOrder, List<EquivalentAddressGroup> addrs) {
15441654
return fallbackTestVerifyUseOfBackendLists(inOrder, addrs, null);
@@ -1958,6 +2068,7 @@ public void grpclbWorking_pickFirstMode_lbSendsEmptyAddress() throws Exception {
19582068
assertThat(mockSubchannels).isEmpty();
19592069
verify(subchannel).shutdown();
19602070

2071+
// RPC error status includes message of no backends provided by balancer
19612072
inOrder.verify(helper).updateBalancingState(eq(TRANSIENT_FAILURE), pickerCaptor.capture());
19622073
RoundRobinPicker errorPicker = (RoundRobinPicker) pickerCaptor.getValue();
19632074
assertThat(errorPicker.pickList)
@@ -2445,7 +2556,7 @@ public void grpclbWorking_lbSendsFallbackMessage() {
24452556
new BackendEntry(subchannel2, getLoadRecorder(), "token0002"))
24462557
.inOrder();
24472558

2448-
// enter fallback mode
2559+
// Balancer forces entering fallback mode
24492560
lbResponseObserver.onNext(buildLbFallbackResponse());
24502561

24512562
// existing subchannels must be returned immediately to gracefully shutdown.
@@ -2460,6 +2571,26 @@ public void grpclbWorking_lbSendsFallbackMessage() {
24602571
assertFalse(oobChannel.isShutdown());
24612572
verify(lbRequestObserver, never()).onCompleted();
24622573

2574+
//////////////////////////////////////////////////////////////////////
2575+
// Name resolver sends new resolution results without any backend addr
2576+
//////////////////////////////////////////////////////////////////////
2577+
deliverResolvedAddresses(Collections.<EquivalentAddressGroup>emptyList(), grpclbBalancerList);
2578+
2579+
// Still in fallback logic, except that the backend list is empty
2580+
for (Subchannel subchannel : mockSubchannels) {
2581+
verify(subchannelPool).returnSubchannel(eq(subchannel), any(ConnectivityStateInfo.class));
2582+
}
2583+
2584+
// RPC error status includes message of fallback requested by balancer
2585+
inOrder.verify(helper).updateBalancingState(eq(TRANSIENT_FAILURE), pickerCaptor.capture());
2586+
PickResult result = pickerCaptor.getValue().pickSubchannel(mock(PickSubchannelArgs.class));
2587+
assertThat(result.getStatus().getCode())
2588+
.isEqualTo(GrpclbState.BALANCER_REQUESTED_FALLBACK_STATUS.getCode());
2589+
assertThat(result.getStatus().getDescription())
2590+
.startsWith(GrpclbState.BALANCER_REQUESTED_FALLBACK_STATUS.getDescription());
2591+
assertThat(result.getStatus().getDescription())
2592+
.contains(GrpclbState.NO_FALLBACK_BACKENDS_ERROR);
2593+
24632594
// exit fall back by providing two new backends
24642595
ServerEntry backend2a = new ServerEntry("127.0.0.1", 8000, "token1001");
24652596
ServerEntry backend2b = new ServerEntry("127.0.0.1", 8010, "token1002");

0 commit comments

Comments
 (0)