-
Notifications
You must be signed in to change notification settings - Fork 1.9k
refactor: improve HTTP/2 GOAWAY and network error retry resilience #4594
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -95,36 +95,16 @@ export async function fetchWithFallbacks(availableFetchers: readonly IFetcher[], | |
| try { | ||
| return { response: await fetcher.fetch(url, options) }; | ||
| } catch (err) { | ||
| // Transient electron errors: retry once on the same fetcher with a fresh connection. | ||
| if (isRetryableElectronError((err as Error)?.message)) { | ||
| return { response: await retryAfterDisconnect(fetcher, url, options, err, (err as Error).message, logService, telemetryService) }; | ||
| } | ||
|
|
||
| // For net::ERR_FAILED from network process crash, disconnect and retry once. | ||
| if (fetcher.isNetworkProcessCrashedError(err)) { | ||
| const fetcherId = fetcher.getUserAgentLibrary(); | ||
| logService.info(`FetcherService: ${fetcherId} hit network process crash error (${(err as Error)?.message}), retrying after disconnect...`); | ||
| try { | ||
| await fetcher.disconnectAll(); | ||
| const response = await fetcher.fetch(url, options); | ||
| logService.info(`FetcherService: ${fetcherId} retry after crash succeeded.`); | ||
| /* __GDPR__ | ||
| "fetcherCrashRetry" : { | ||
| "owner": "deepak1556", | ||
| "comment": "Sent when a fetcher retries after a network process crash error", | ||
| "fetcher": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "comment": "The fetcher that crashed" }, | ||
| "outcome": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "comment": "Whether the retry recovered or failed" }, | ||
| "error": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "comment": "The error message" } | ||
| } | ||
| */ | ||
| telemetryService?.sendTelemetryEvent('fetcherCrashRetry', { github: true, microsoft: true }, { | ||
| fetcher: fetcherId, | ||
| outcome: 'recovered', | ||
| error: collectSingleLineErrorMessage(err, true), | ||
| }); | ||
| return { response }; | ||
| return { response: await retryAfterDisconnect(fetcher, url, options, err, 'network-process-crash', logService, telemetryService) }; | ||
| } catch (retryErr) { | ||
| logService.info(`FetcherService: ${fetcherId} retry also failed (${(retryErr as Error)?.message}), checking for demotion...`); | ||
| telemetryService?.sendTelemetryEvent('fetcherCrashRetry', { github: true, microsoft: true }, { | ||
| fetcher: fetcherId, | ||
| outcome: 'failed', | ||
| error: collectSingleLineErrorMessage(retryErr, true), | ||
| }); | ||
| err = retryErr; | ||
| } | ||
| } | ||
|
|
@@ -153,6 +133,66 @@ export async function fetchWithFallbacks(availableFetchers: readonly IFetcher[], | |
| } | ||
| } | ||
|
|
||
| const retryableElectronErrors = [ | ||
| 'net::ERR_NETWORK_CHANGED', | ||
| 'net::ERR_CONNECTION_RESET', | ||
| ]; | ||
|
|
||
| function isRetryableElectronError(message: string | undefined): boolean { | ||
| if (!message) { | ||
| return false; | ||
| } | ||
| return retryableElectronErrors.some(token => message.includes(token)); | ||
| } | ||
|
|
||
| /** | ||
| * Disconnect all connections on the fetcher and retry the request once. | ||
| * Returns the response on success, throws on failure. | ||
| */ | ||
| async function retryAfterDisconnect( | ||
| fetcher: IFetcher, | ||
| url: string, | ||
| options: FetchOptions, | ||
| originalErr: unknown, | ||
| reason: string, | ||
| logService: ILogService, | ||
| telemetryService: ITelemetryService | undefined, | ||
| ): Promise<Response> { | ||
| const fetcherId = fetcher.getUserAgentLibrary(); | ||
| logService.info(`FetcherService: ${fetcherId} hit ${reason}, retrying after disconnect...`); | ||
| try { | ||
| await fetcher.disconnectAll(); | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We disconnectAll for all 3 error types we retry here? Could this make requests step on each other as some are being retried while later ones come here to first disconnect all? |
||
| const response = await fetcher.fetch(url, options); | ||
| logService.info(`FetcherService: ${fetcherId} retry after ${reason} succeeded.`); | ||
| /* __GDPR__ | ||
|
deepak1556 marked this conversation as resolved.
|
||
| "fetcherTransientRetry" : { | ||
| "owner": "AurSol", | ||
| "comment": "Sent when a fetcher retries after a transient error", | ||
| "fetcher": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "comment": "The fetcher that retried" }, | ||
| "reason": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "comment": "The type of error that triggered the retry" }, | ||
| "outcome": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "comment": "Whether the retry recovered or failed" }, | ||
| "error": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "comment": "The error message" } | ||
| } | ||
| */ | ||
| telemetryService?.sendTelemetryEvent('fetcherTransientRetry', { github: true, microsoft: true }, { | ||
|
deepak1556 marked this conversation as resolved.
|
||
| fetcher: fetcherId, | ||
| reason, | ||
| outcome: 'recovered', | ||
| error: collectSingleLineErrorMessage(originalErr, true), | ||
| }); | ||
| return response; | ||
| } catch (retryErr) { | ||
| logService.info(`FetcherService: ${fetcherId} retry after ${reason} also failed (${(retryErr as Error)?.message}).`); | ||
| telemetryService?.sendTelemetryEvent('fetcherTransientRetry', { github: true, microsoft: true }, { | ||
| fetcher: fetcherId, | ||
| reason, | ||
| outcome: 'failed', | ||
| error: collectSingleLineErrorMessage(retryErr, true), | ||
| }); | ||
| throw retryErr; | ||
| } | ||
| } | ||
|
|
||
| async function tryFetch(fetcher: IFetcher, url: string, options: FetchOptions, logService: ILogService): Promise<{ ok: boolean; response: Response } | { ok: false; err: any }> { | ||
| try { | ||
| const response = await fetcher.fetch(url, options); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,9 +5,8 @@ | |
|
|
||
| import * as stream from 'stream'; | ||
| import * as undici from 'undici'; | ||
| import { Lazy } from '../../../util/vs/base/common/lazy'; | ||
| import { IEnvService } from '../../env/common/envService'; | ||
| import { HeadersImpl, IHeaders, ReportFetchEvent, WebSocketConnection, WebSocketConnectOptions } from '../common/fetcherService'; | ||
| import { FetchOptions, HeadersImpl, IHeaders, ReportFetchEvent, Response, WebSocketConnection, WebSocketConnectOptions } from '../common/fetcherService'; | ||
| import { BaseFetchFetcher } from './baseFetchFetcher'; | ||
|
|
||
| export class NodeFetchFetcher extends BaseFetchFetcher { | ||
|
|
@@ -26,6 +25,26 @@ export class NodeFetchFetcher extends BaseFetchFetcher { | |
| return NodeFetchFetcher.ID; | ||
| } | ||
|
|
||
| override async fetch(url: string, options: FetchOptions): Promise<Response> { | ||
| try { | ||
| return await super.fetch(url, options); | ||
| } catch (e) { | ||
| if (isGracefulGoawayError(e)) { | ||
| await this.disconnectAll(); | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need to disconnect all? nodejs/undici#3140 seems to suggest that this is done automatically. Does still doing this here risk cancelling newly issued requests? (E.g. when several requests of a session get the same GOAWAY, could they step on each other here?) |
||
| return await super.fetch(url, options); | ||
| } | ||
| throw e; | ||
| } | ||
| } | ||
|
|
||
|
deepak1556 marked this conversation as resolved.
|
||
| override async disconnectAll(): Promise<void> { | ||
| const currentAgent = agent; | ||
| if (currentAgent) { | ||
| agent = undefined; | ||
| await currentAgent.close(); | ||
| } | ||
| } | ||
|
deepak1556 marked this conversation as resolved.
|
||
|
|
||
| isInternetDisconnectedError(_e: any): boolean { | ||
| return false; | ||
| } | ||
|
|
@@ -35,15 +54,37 @@ export class NodeFetchFetcher extends BaseFetchFetcher { | |
| } | ||
| } | ||
|
|
||
| /** | ||
| * Detects a graceful HTTP/2 GOAWAY error (error code 0 = NO_ERROR). | ||
| * These occur during server-side connection draining | ||
| * and are safe to retry on a new connection. | ||
| */ | ||
| export function isGracefulGoawayError(e: any): boolean { | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It seems GOAWAY is on the session level and comes with a Last-Stream-ID that indicates which requests might still have been processed. Only those with later ids can be retried safely if I understand correctly. Undici doesn't seem to handle or surface Last-Stream-ID currently. |
||
| const message = String(e?.message || ''); | ||
| const causeMessage = String(e?.cause?.message || ''); | ||
| const combined = message + ' ' + causeMessage; | ||
| if (combined.includes('GOAWAY') && combined.includes('code 0')) { | ||
| return true; | ||
| } | ||
| return false; | ||
| } | ||
|
|
||
| function getFetch(): typeof globalThis.fetch { | ||
| const fetch = (globalThis as any).__vscodePatchedFetch || globalThis.fetch; | ||
| return function (input: string | URL | globalThis.Request, init?: RequestInit) { | ||
| return fetch(input, { dispatcher: agent.value, ...init }); | ||
| return fetch(input, { dispatcher: getOrCreateAgent(), ...init }); | ||
| }; | ||
| } | ||
|
|
||
| // Cache agent to reuse connections. | ||
| const agent = new Lazy(() => new undici.Agent({ allowH2: true })); | ||
| function getOrCreateAgent(): undici.Agent { | ||
| if (!agent) { | ||
| agent = new undici.Agent({ allowH2: true }); | ||
| } | ||
| return agent; | ||
| } | ||
|
|
||
| // Mutable agent reference — recreated on disconnectAll() to ensure fresh connections. | ||
| let agent: undici.Agent | undefined; | ||
|
|
||
| export function createWebSocket(url: string, options?: WebSocketConnectOptions): WebSocketConnection { | ||
| const wsAgent = new undici.Agent(); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,45 @@ | ||
| /*--------------------------------------------------------------------------------------------- | ||
| * Copyright (c) Microsoft Corporation. All rights reserved. | ||
| * Licensed under the MIT License. See License.txt in the project root for license information. | ||
| *--------------------------------------------------------------------------------------------*/ | ||
|
|
||
| import { describe, expect, it } from 'vitest'; | ||
| import { isGracefulGoawayError } from '../nodeFetchFetcher'; | ||
|
|
||
| describe('isGracefulGoawayError', () => { | ||
| it('detects GOAWAY with code 0 in message', () => { | ||
| const err = new Error('"GOAWAY" frame received with code 0'); | ||
| expect(isGracefulGoawayError(err)).toBe(true); | ||
| }); | ||
|
|
||
| it('detects GOAWAY with code 0 in cause message', () => { | ||
| const cause = new Error('"GOAWAY" frame received with code 0'); | ||
| const err = new Error('fetch failed', { cause }); | ||
| expect(isGracefulGoawayError(err)).toBe(true); | ||
| }); | ||
|
|
||
| it('does not match GOAWAY with non-zero error codes', () => { | ||
| const err = new Error('"GOAWAY" frame received with code 2'); | ||
| expect(isGracefulGoawayError(err)).toBe(false); | ||
| }); | ||
|
|
||
| it('does not match non-GOAWAY errors', () => { | ||
| const err = new Error('ECONNRESET'); | ||
| expect(isGracefulGoawayError(err)).toBe(false); | ||
| }); | ||
|
|
||
| it('does not match null/undefined', () => { | ||
| expect(isGracefulGoawayError(null)).toBe(false); | ||
| expect(isGracefulGoawayError(undefined)).toBe(false); | ||
| }); | ||
|
|
||
| it('does not match error with code 0 but no GOAWAY', () => { | ||
| const err = new Error('connection reset with code 0'); | ||
| expect(isGracefulGoawayError(err)).toBe(false); | ||
| }); | ||
|
|
||
| it('handles HTTP/2 prefixed GOAWAY message', () => { | ||
| const err = new Error('HTTP/2: "GOAWAY" frame received with code 0'); | ||
| expect(isGracefulGoawayError(err)).toBe(true); | ||
| }); | ||
| }); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The retry behavior in
canRetryOnceNetworkErrorchanged (now supports matchingreason.message), but there are no unit tests covering the accepted/rejected cases (e.g. matching byreason.codevsreason.message, and theERR_*vsnet::ERR_*forms). Adding focused tests for this helper would help prevent regressions in networkRequest retry behavior.