diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts index cc27e608a71a..1debf5a29987 100644 --- a/packages/basic-crawler/src/internals/basic-crawler.ts +++ b/packages/basic-crawler/src/internals/basic-crawler.ts @@ -798,6 +798,14 @@ export class BasicCrawler { + const { page } = crawlingContext; + await page.close().catch((error: Error) => this.log.debug('Error while closing page', { error })); + } + /** * Wrapper around requestHandler that opens and closes pages etc. */ @@ -466,34 +471,30 @@ export abstract class BrowserCrawler< // So we must not save the session prior to making sure it was used only once, otherwise we would use it twice. const { request, session } = crawlingContext; - try { - if (!request.skipNavigation) { - await this._handleNavigation(crawlingContext); - tryCancel(); + if (!request.skipNavigation) { + await this._handleNavigation(crawlingContext); + tryCancel(); - await this._responseHandler(crawlingContext); - tryCancel(); + await this._responseHandler(crawlingContext); + tryCancel(); - // save cookies - // TODO: Should we save the cookies also after/only the handle page? - if (this.persistCookiesPerSession) { - const cookies = await crawlingContext.browserController.getCookies(page); - tryCancel(); - session?.setCookies(cookies, request.loadedUrl!); - } + // save cookies + // TODO: Should we save the cookies also after/only the handle page? + if (this.persistCookiesPerSession) { + const cookies = await crawlingContext.browserController.getCookies(page); + tryCancel(); + session?.setCookies(cookies, request.loadedUrl!); } + } - await addTimeoutToPromise( - () => Promise.resolve(this.userProvidedRequestHandler(crawlingContext)), - this.requestHandlerTimeoutMillis, - `requestHandler timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds.`, - ); - tryCancel(); + await addTimeoutToPromise( + () => Promise.resolve(this.userProvidedRequestHandler(crawlingContext)), + this.requestHandlerTimeoutMillis, + `requestHandler timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds.`, + ); + tryCancel(); - if (session) session.markGood(); - } finally { - await page.close().catch((error: Error) => this.log.debug('Error while closing page', { error })); - } + if (session) session.markGood(); } protected _enhanceCrawlingContextWithPageInfo(crawlingContext: Context, page: CommonPage, createNewSession?: boolean): void { diff --git a/test/core/crawlers/browser_crawler.test.ts b/test/core/crawlers/browser_crawler.test.ts index 4b5fb1883e01..dc066b54fef9 100644 --- a/test/core/crawlers/browser_crawler.test.ts +++ b/test/core/crawlers/browser_crawler.test.ts @@ -227,6 +227,35 @@ describe('BrowserCrawler', () => { expect(isEvaluated).toBeTruthy(); }); + test('errorHandler has open page', async () => { + const requestList = await RequestList.open({ + sources: [ + { url: 'http://example.com/?q=1' }, + ], + }); + + const result: string[] = []; + + const browserCrawler = new BrowserCrawlerTest({ + browserPoolOptions: { + browserPlugins: [puppeteerPlugin], + }, + requestList, + requestHandler: async (ctx) => { + throw new Error('Test error'); + }, + maxRequestRetries: 1, + errorHandler: async (ctx, error) => { + result.push(await ctx.page.evaluate(() => window.location.origin)); + }, + }); + + await browserCrawler.run(); + + expect(result.length).toBe(1); + expect(result[0]).toBe('http://example.com'); + }); + test('should allow modifying gotoOptions by pre navigation hooks', async () => { const requestList = await RequestList.open({ sources: [