Browse Source

Fix hanging graceful channel shutdown by cleaning up state. (#122)

Corrects two scenarios where stale state in the load balancer could
prevent a graceful channel shutdown.

Previously, multiple endpoint updates or changes to the load balancer
could leave undiscarded subchannels or old load balancers in the
channel's state, blocking a clean shutdown.

This change ensures that previous subchannels and load balancers are
fully removed from state when they shutdown. This also corrects a
related test, which had been passing incorrectly due to an explicit
cancellation masking the issue.
KZ 6 months ago
parent
commit
24cded15c6

+ 8 - 0
Sources/GRPCNIOTransportCore/Client/Connection/GRPCChannel.swift

@@ -834,6 +834,14 @@ extension GRPCChannel.StateMachine {
         case .idle, .connecting, .transientFailure, .shutdown:
           ()
         }
+      } else {
+        // In this case the LB is neither current nor next.
+        switch connectivityState {
+        case .shutdown:
+          state.past.removeValue(forKey: id)
+        case .idle, .connecting, .ready, .transientFailure:
+          ()
+        }
       }
 
       self.state = .running(state)

+ 3 - 5
Sources/GRPCNIOTransportCore/Client/Connection/LoadBalancers/PickFirstLoadBalancer.swift

@@ -395,15 +395,13 @@ extension PickFirstLoadBalancer.State.Active {
           } else {
             onUpdate = .publishStateChange(connectivityState)
           }
-
-          self.current = next
-          self.isCurrentGoingAway = false
         } else {
           // No state change to publish, just roll over.
           onUpdate = self.current.map { .close($0) } ?? .none
-          self.current = next
-          self.isCurrentGoingAway = false
         }
+        self.current = next
+        self.next = nil
+        self.isCurrentGoingAway = false
 
       case .idle, .connecting, .transientFailure, .shutdown:
         onUpdate = .none

+ 4 - 3
Tests/GRPCNIOTransportCoreTests/Client/Connection/GRPCChannelTests.swift

@@ -619,13 +619,14 @@ final class GRPCChannelTests: XCTestCase {
 
           channel.beginGracefulShutdown()
 
-        case .shutdown:
-          group.cancelAll()
-
         default:
           ()
         }
       }
+
+      let result = await group.nextResult()!
+      group.cancelAll()
+      return try result.get()
     }
   }
 

+ 14 - 3
Tests/GRPCNIOTransportCoreTests/Client/Connection/LoadBalancers/PickFirstLoadBalancerTests.swift

@@ -96,7 +96,7 @@ final class PickFirstLoadBalancerTests: XCTestCase {
   }
 
   func testEndpointUpdateHandledGracefully() async throws {
-    try await LoadBalancerTest.pickFirst(servers: 2, connector: .posix()) { context, event in
+    try await LoadBalancerTest.pickFirst(servers: 3, connector: .posix()) { context, event in
       switch event {
       case .connectivityStateChanged(.idle):
         let endpoint = Endpoint(addresses: [context.servers[0].address])
@@ -109,14 +109,25 @@ final class PickFirstLoadBalancerTests: XCTestCase {
         }
 
         // Update the endpoint so that it contains server-1.
-        let endpoint = Endpoint(addresses: [context.servers[1].address])
-        context.pickFirst!.updateEndpoint(endpoint)
+        let endpoint1 = Endpoint(addresses: [context.servers[1].address])
+        context.pickFirst!.updateEndpoint(endpoint1)
 
         // Should remain in the ready state
         try await XCTPoll(every: .milliseconds(10)) {
           context.servers[0].server.clients.isEmpty && context.servers[1].server.clients.count == 1
         }
 
+        // Update the endpoint so that it contains server-2.
+        let endpoint2 = Endpoint(addresses: [context.servers[2].address])
+        context.pickFirst!.updateEndpoint(endpoint2)
+
+        // Should remain in the ready state
+        try await XCTPoll(every: .milliseconds(10)) {
+          context.servers[0].server.clients.isEmpty
+            && context.servers[1].server.clients.isEmpty
+            && context.servers[2].server.clients.count == 1
+        }
+
         context.loadBalancer.close()
 
       default: