
Better waiter errors with NIOTS (#1775)

Motivation:

When establishing a connection using NIOTS, Network.framework may enter
a 'waiting' state because of a transient error. This is surfaced as a
user inbound event.

In many cases the connect call won't resolve until the connect timeout
passes (defaults to 20 seconds). Attempts to start a call on this
connection during this time will fail with a timeout at the connection
pool layer (where the max wait time defaults to 5 seconds) and users
will see a 'deadline exceeded' error without any more context. However,
we can provide more information by passing up the transient error from
Network.framework to the connection pool.

This isn't currently possible because errors only flow up on state
changes, in this case from connecting to transient failure, which only
happens once the connect times out.
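
Both timeouts mentioned above are configurable on the channel pool. As a
minimal sketch of where those knobs live (using the same `GRPCChannelPool`
API exercised by the tests in this change; the host, port and values are
illustrative, not the defaults):

```swift
import GRPC
import NIOCore
import NIOTransportServices

// Sketch only: target and values are illustrative.
let group = NIOTSEventLoopGroup()

let channel = try GRPCChannelPool.with(
  target: .hostAndPort("example.com", 443),
  transportSecurity: .tls(.makeClientConfigurationBackedByNetworkFramework()),
  eventLoopGroup: group
) {
  // Maximum time a call will wait for a usable connection before failing
  // with 'deadline exceeded' at the pool layer (defaults to 5 seconds).
  $0.connectionPool.maxWaitTime = .seconds(5)
  // Lower bound on the connect timeout used when dialling (the connect
  // timeout referred to above defaults to 20 seconds).
  $0.connectionBackoff.minimumConnectionTimeout = 20.0
}
```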

Modifications:

- Catch the `WaitingForConnectivity` event and bubble up its error
  through the idle handler to the connection pool

Result:

Better errors on connect timeouts in some situations
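
Concretely, a call which fails while the connection is stuck in the waiting
state can now be inspected along these lines (a sketch mirroring the
assertions in the new test below; `echo` stands in for any generated async
client, such as `Echo_EchoAsyncClient`):

```swift
import GRPC
import Network

// Assumes an async context and a generated client called `echo`.
do {
  _ = try await echo.get(.with { $0.text = "hello" })
} catch let error as GRPCConnectionPoolError {
  // The call still fails with 'deadline exceeded' (no connection became
  // available within maxWaitTime), but the transient Network.framework
  // error is now attached as the underlying error.
  if error.code == .deadlineExceeded,
    let transient = error.underlyingError as? NWError {
    print("connection is waiting for connectivity: \(transient)")
  }
}
```
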
George Barnett 2 years ago
Parent
Commit
be8e15dd72

+ 40 - 9
Sources/GRPC/ConnectionManager.swift

@@ -33,6 +33,7 @@ internal final class ConnectionManager: @unchecked Sendable {
   internal struct ConnectingState {
     var backoffIterator: ConnectionBackoffIterator?
     var reconnect: Reconnect
+    var connectError: Error?
 
     var candidate: EventLoopFuture<Channel>
     var readyChannelMuxPromise: EventLoopPromise<HTTP2StreamMultiplexer>
@@ -656,10 +657,30 @@ internal final class ConnectionManager: @unchecked Sendable {
         ]
       )
 
-    case .connecting:
-      // Ignore the error, the channel promise will notify the manager of any error which occurs
-      // while connecting.
-      ()
+    case .connecting(var state):
+      // Record the error. The channel promise will notify the manager of any error which
+      // occurs while connecting; if that turns out to be a connect timeout then this recorded
+      // error is used instead, as it's likely to contain more relevant information.
+      if state.connectError == nil {
+        state.connectError = error
+        self.state = .connecting(state)
+
+        // The pool is only notified of connection errors when the connection transitions to the
+        // transient failure state. However, in some cases (i.e. with NIOTS), errors can be thrown
+        // during the connect but before the connect times out.
+        //
+        // This opens up a period of time where you can start a call and have it fail with
+        // deadline exceeded (because no connection was available within the configured max
+        // wait time for the pool) but without any diagnostic information. The information is
+        // available but it hasn't been made available to the pool at that point in time.
+        //
+        // The delegate can't easily be modified (it's public API) and a new API doesn't make all
+        // that much sense so we elect to check whether the delegate is the pool and call it
+        // directly.
+        if let pool = self.connectivityDelegate as? ConnectionPool {
+          pool.sync.updateMostRecentError(error)
+        }
+      }
 
     case var .active(state):
       state.error = error
@@ -935,16 +956,26 @@ extension ConnectionManager {
 
     switch self.state {
     case let .connecting(connecting):
+      let reportedError: Error
+      switch error as? ChannelError {
+      case .some(.connectTimeout):
+        // A more relevant error may have been caught earlier. Use that in preference to the
+        // timeout as it'll likely be more useful.
+        reportedError = connecting.connectError ?? error
+      default:
+        reportedError = error
+      }
+
       // Should we reconnect?
       switch connecting.reconnect {
       // No, shutdown.
       case .none:
         self.logger.debug("shutting down connection, no reconnect configured/remaining")
         self.state = .shutdown(
-          ShutdownState(closeFuture: self.eventLoop.makeSucceededFuture(()), reason: error)
+          ShutdownState(closeFuture: self.eventLoop.makeSucceededFuture(()), reason: reportedError)
         )
-        connecting.readyChannelMuxPromise.fail(error)
-        connecting.candidateMuxPromise.fail(error)
+        connecting.readyChannelMuxPromise.fail(reportedError)
+        connecting.candidateMuxPromise.fail(reportedError)
 
       // Yes, after a delay.
       case let .after(delay):
@@ -953,10 +984,10 @@ extension ConnectionManager {
           self.startConnecting()
         }
         self.state = .transientFailure(
-          TransientFailureState(from: connecting, scheduled: scheduled, reason: error)
+          TransientFailureState(from: connecting, scheduled: scheduled, reason: reportedError)
         )
         // Candidate mux users are not willing to wait.
-        connecting.candidateMuxPromise.fail(error)
+        connecting.candidateMuxPromise.fail(reportedError)
       }
 
     // The application must have called shutdown while we were trying to establish a connection

+ 6 - 0
Sources/GRPC/ConnectionPool/ConnectionPool.swift

@@ -857,6 +857,12 @@ extension ConnectionPool {
       self.pool.eventLoop.assertInEventLoop()
       return self.pool._connections.values.reduce(0) { $0 + $1.reservedStreams }
     }
+
+    /// Updates the most recent connection error.
+    internal func updateMostRecentError(_ error: Error) {
+      self.pool.eventLoop.assertInEventLoop()
+      self.pool.updateMostRecentError(error)
+    }
   }
 
   internal var sync: Sync {

+ 9 - 0
Sources/GRPC/GRPCIdleHandler.swift

@@ -17,6 +17,7 @@ import Logging
 import NIOCore
 import NIOHTTP2
 import NIOTLS
+import NIOTransportServices
 
 internal final class GRPCIdleHandler: ChannelInboundHandler {
   typealias InboundIn = HTTP2Frame
@@ -287,6 +288,14 @@ internal final class GRPCIdleHandler: ChannelInboundHandler {
       )
       context.fireUserInboundEventTriggered(event)
     } else {
+      #if canImport(Network)
+      if #available(macOS 10.14, iOS 12.0, tvOS 12.0, watchOS 6.0, *) {
+        if let waitsForConnectivity = event as? NIOTSNetworkEvents.WaitingForConnectivity {
+          self.mode.connectionManager?.channelError(waitsForConnectivity.transientError)
+        }
+      }
+      #endif
+
       context.fireUserInboundEventTriggered(event)
     }
   }
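
For context on the event being intercepted above: a NIOTS channel reports the
waiting state to its pipeline as a user inbound event. A standalone handler
illustrating the same pattern might look like this (a sketch; `WaitingLogger`
is hypothetical and not part of this change):

```swift
#if canImport(Network)
import NIOCore
import NIOTransportServices

// Hypothetical handler: logs the transient error carried by the
// WaitingForConnectivity event and forwards the event unchanged.
final class WaitingLogger: ChannelInboundHandler {
  typealias InboundIn = Any

  func userInboundEventTriggered(context: ChannelHandlerContext, event: Any) {
    if #available(macOS 10.14, iOS 12.0, tvOS 12.0, watchOS 6.0, *) {
      if let waiting = event as? NIOTSNetworkEvents.WaitingForConnectivity {
        // The transient Network.framework error which caused the waiting state.
        print("waiting for connectivity: \(waiting.transientError)")
      }
    }
    context.fireUserInboundEventTriggered(event)
  }
}
#endif
```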

+ 57 - 0
Tests/GRPCTests/GRPCNetworkFrameworkTests.swift

@@ -24,6 +24,7 @@ import NIOCore
 import NIOPosix
 import NIOSSL
 import NIOTransportServices
+import GRPCSampleData
 import Security
 import XCTest
 
@@ -208,6 +209,62 @@ final class GRPCNetworkFrameworkTests: GRPCTestCase {
 
     XCTAssertNoThrow(try self.doEchoGet())
   }
+
+  func testWaiterPicksUpNWError(
+    _ configure: (inout GRPCChannelPool.Configuration) -> Void
+  ) async throws {
+    let builder = Server.usingTLSBackedByNIOSSL(
+      on: self.group,
+      certificateChain: [SampleCertificate.server.certificate],
+      privateKey: SamplePrivateKey.server
+    )
+
+    let server = try await builder.bind(host: "127.0.0.1", port: 0).get()
+    defer { try? server.close().wait() }
+
+    let client = try GRPCChannelPool.with(
+      target: .hostAndPort("127.0.0.1", server.channel.localAddress!.port!),
+      transportSecurity: .tls(.makeClientConfigurationBackedByNetworkFramework()),
+      eventLoopGroup: self.tsGroup
+    ) {
+      configure(&$0)
+    }
+
+    let echo = Echo_EchoAsyncClient(channel: client)
+    do {
+      let _ = try await echo.get(.with { $0.text = "ignored" })
+    } catch let error as GRPCConnectionPoolError {
+      XCTAssertEqual(error.code, .deadlineExceeded)
+      XCTAssert(error.underlyingError is NWError)
+    } catch {
+      XCTFail("Expected GRPCConnectionPoolError")
+    }
+
+    let promise = self.group.next().makePromise(of: Void.self)
+    client.closeGracefully(deadline: .now() + .seconds(1), promise: promise)
+    try await promise.futureResult.get()
+  }
+
+  func testErrorPickedUpBeforeConnectTimeout() async throws {
+    try await self.testWaiterPicksUpNWError {
+      // Configure the wait time to be less than the connect timeout; the waiter
+      // should fail with the appropriate NWError before the connect times out.
+      $0.connectionPool.maxWaitTime = .milliseconds(500)
+      $0.connectionBackoff.minimumConnectionTimeout = 1.0
+    }
+  }
+
+  func testNotWaitingForConnectivity() async throws {
+    try await self.testWaiterPicksUpNWError {
+      // The minimum connect time is still high, but setting wait for activity to false
+      // means it fails on entering the waiting state rather than seeing out the connect
+      // timeout.
+      $0.connectionPool.maxWaitTime = .milliseconds(500)
+      $0.debugChannelInitializer = { channel in
+        channel.setOption(NIOTSChannelOptions.waitForActivity, value: false)
+      }
+    }
+  }
 }
 
 #endif  // canImport(Network)