From e4ebc7dd2261bcd7e66c6662b50fc1cbe39ff936 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?= Date: Wed, 21 Feb 2024 10:38:45 +0100 Subject: [PATCH 01/14] feat: tieredProxyUrls for ProxyConfiguration --- .../src/internals/browser-crawler.ts | 2 +- packages/core/src/proxy_configuration.ts | 67 ++++++++++++++-- .../src/internals/http-crawler.ts | 2 +- test/core/proxy_configuration.test.ts | 79 ++++++++++++++++++- 4 files changed, 142 insertions(+), 8 deletions(-) diff --git a/packages/browser-crawler/src/internals/browser-crawler.ts b/packages/browser-crawler/src/internals/browser-crawler.ts index 6b29e7bd43f4..79fbcaaa0c05 100644 --- a/packages/browser-crawler/src/internals/browser-crawler.ts +++ b/packages/browser-crawler/src/internals/browser-crawler.ts @@ -495,7 +495,7 @@ export abstract class BrowserCrawler< if (this.proxyConfiguration && (useIncognitoPages || experimentalContainers)) { const { session } = crawlingContext; - const proxyInfo = await this.proxyConfiguration.newProxyInfo(session?.id); + const proxyInfo = await this.proxyConfiguration.newProxyInfo(session?.id, crawlingContext.request); crawlingContext.proxyInfo = proxyInfo; newPageOptions.proxyUrl = proxyInfo.url; diff --git a/packages/core/src/proxy_configuration.ts b/packages/core/src/proxy_configuration.ts index 8bc03d6e58a0..1a3caffb9e42 100644 --- a/packages/core/src/proxy_configuration.ts +++ b/packages/core/src/proxy_configuration.ts @@ -2,6 +2,8 @@ import log from '@apify/log'; import type { Dictionary } from '@crawlee/types'; import ow from 'ow'; +import type { Request } from './request'; + export interface ProxyConfigurationFunction { (sessionId: string | number): string | Promise; } @@ -20,6 +22,18 @@ export interface ProxyConfigurationOptions { * This function is used to generate the URL when {@apilink ProxyConfiguration.newUrl} or {@apilink ProxyConfiguration.newProxyInfo} is called. 
*/ newUrlFunction?: ProxyConfigurationFunction; + + /** + * An array of custom proxy URLs tiers to be rotated. + * This is a more advanced version of `proxyUrls` that allows you to define + * a hierarchy of proxy URLs. If everything goes well, all the requests will be sent + * through the first proxy URL in the list. + * Whenever the crawler encounters a problem with the current proxy on the given domain, it will switch to the following proxy in the list. + * The crawler probes lower-level proxies at given intervals to check if it can make the downshift. + * + * This feature is useful when you have a set of proxies with different performance characteristics (speed, price, antibot performance etc.) and you want to use the best one for each domain. + */ + tieredProxyUrls?: string[][]; } /** @@ -116,9 +130,11 @@ export class ProxyConfiguration { isManInTheMiddle = false; protected nextCustomUrlIndex = 0; protected proxyUrls?: string[]; + protected tieredProxyUrls?: string[][]; protected usedProxyUrls = new Map(); protected newUrlFunction?: ProxyConfigurationFunction; protected log = log.child({ prefix: 'ProxyConfiguration' }); + protected domainTiers = new Map(); /** * Creates a {@apilink ProxyConfiguration} instance based on the provided options. 
Proxy servers are used to prevent target websites from @@ -145,15 +161,17 @@ export class ProxyConfiguration { ow(rest, ow.object.exactShape({ proxyUrls: ow.optional.array.nonEmpty.ofType(ow.string.url), newUrlFunction: ow.optional.function, + tieredProxyUrls: ow.optional.array.nonEmpty.ofType(ow.array.nonEmpty.ofType(ow.string.url)), })); - const { proxyUrls, newUrlFunction } = options; + const { proxyUrls, newUrlFunction, tieredProxyUrls } = options; - if (proxyUrls && newUrlFunction) this._throwCannotCombineCustomMethods(); + if ([proxyUrls, newUrlFunction, tieredProxyUrls].filter(x => x).length > 1) this._throwCannotCombineCustomMethods(); if (!proxyUrls && !newUrlFunction && validateRequired) this._throwNoOptionsProvided(); this.proxyUrls = proxyUrls; this.newUrlFunction = newUrlFunction; + this.tieredProxyUrls = tieredProxyUrls; } /** @@ -173,9 +191,10 @@ export class ProxyConfiguration { * The identifier must not be longer than 50 characters and include only the following: `0-9`, `a-z`, `A-Z`, `"."`, `"_"` and `"~"`. * @return Represents information about used proxy and its configuration. 
*/ - async newProxyInfo(sessionId?: string | number): Promise { + async newProxyInfo(sessionId?: string | number, request?: Request): Promise { if (typeof sessionId === 'number') sessionId = `${sessionId}`; - const url = await this.newUrl(sessionId); + + const url = await this.newUrl(sessionId, request); const { username, password, port, hostname } = new URL(url); @@ -189,6 +208,40 @@ export class ProxyConfiguration { }; } + _handleTieredUrl(_sessionId: string, request?: Request): string { + if (!this.tieredProxyUrls) throw new Error('Tiered proxy URLs are not set'); + + if (!request) { + const allProxyUrls = this.tieredProxyUrls.flatMap(x => x); + return allProxyUrls[this.nextCustomUrlIndex++ % allProxyUrls.length]; + } + + const domain = new URL(request?.url).hostname; + const { retryCount } = request; + + if (!this.domainTiers.has(domain)) { + this.domainTiers.set(domain, [0]); + } + + const history = this.domainTiers.get(domain)!; + + let tierPrediction; + + if (retryCount === 0) { + const averageTier = history.reduce((a, b) => a + b, 0) / history.length; + tierPrediction = Math.floor(averageTier); + if (history.every((x, _, a) => x === a[0])) tierPrediction = Math.max(0, tierPrediction - 1); + } else { + tierPrediction = history[history.length - 1] + 1; + } + + tierPrediction = Math.min(tierPrediction, this.tieredProxyUrls!.length - 1); + + this.domainTiers.set(domain, [...history, tierPrediction].slice(-4, 4)); + + return this.tieredProxyUrls![tierPrediction][this.nextCustomUrlIndex++ % this.tieredProxyUrls![tierPrediction].length]; + } + /** * Returns a new proxy URL based on provided configuration options and the `sessionId` parameter. * @param [sessionId] @@ -202,13 +255,17 @@ export class ProxyConfiguration { * @return A string with a proxy URL, including authentication credentials and port number. 
* For example, `http://bob:password123@proxy.example.com:8000` */ - async newUrl(sessionId?: string | number): Promise { + async newUrl(sessionId?: string | number, request?: Request): Promise { if (typeof sessionId === 'number') sessionId = `${sessionId}`; if (this.newUrlFunction) { return this._callNewUrlFunction(sessionId)!; } + if (this.tieredProxyUrls) { + return this._handleTieredUrl(sessionId ?? Math.random().toString().slice(2, 6), request); + } + return this._handleCustomUrl(sessionId); } diff --git a/packages/http-crawler/src/internals/http-crawler.ts b/packages/http-crawler/src/internals/http-crawler.ts index 5549d09257bf..54f481e18e54 100644 --- a/packages/http-crawler/src/internals/http-crawler.ts +++ b/packages/http-crawler/src/internals/http-crawler.ts @@ -455,7 +455,7 @@ export class HttpCrawler { } }); }); + + describe('with tieredProxyUrls', () => { + test('without Request rotates the urls uniformly', async () => { + const proxyConfiguration = new ProxyConfiguration({ + tieredProxyUrls: [ + ['http://proxy.com:1111', 'http://proxy.com:2222'], + ['http://proxy.com:3333', 'http://proxy.com:4444'], + ], + }); + + // @ts-expect-error protected property + const { tieredProxyUrls } = proxyConfiguration; + expect(await proxyConfiguration.newUrl()).toEqual(tieredProxyUrls[0][0]); + expect(await proxyConfiguration.newUrl()).toEqual(tieredProxyUrls[0][1]); + expect(await proxyConfiguration.newUrl()).toEqual(tieredProxyUrls[1][0]); + expect(await proxyConfiguration.newUrl()).toEqual(tieredProxyUrls[1][1]); + expect(await proxyConfiguration.newUrl()).toEqual(tieredProxyUrls[0][0]); + }); + + test('high retry count picks higher-level proxies', async () => { + const proxyConfiguration = new ProxyConfiguration({ + tieredProxyUrls: [ + ['http://proxy.com:1111'], + ['http://proxy.com:2222'], + ['http://proxy.com:3333'], + ], + }); + + const request = new Request({ + url: 'http://example.com', + }); + request.retryCount = 5; + + // @ts-expect-error protected 
property + const { tieredProxyUrls } = proxyConfiguration; + expect(await proxyConfiguration.newUrl('session-id', request)).toEqual(tieredProxyUrls[1][0]); + expect(await proxyConfiguration.newUrl('session-id', request)).toEqual(tieredProxyUrls[2][0]); + }); + + test('upshifts and downshifts properly', async () => { + const tieredProxyUrls = [ + ['http://proxy.com:1111'], + ['http://proxy.com:2222'], + ['http://proxy.com:3333'], + ]; + + const proxyConfiguration = new ProxyConfiguration({ + tieredProxyUrls, + }); + + const request = new Request({ + url: 'http://example.com', + }); + + let suggestedProxies = []; + + request.retryCount = 5; + for (let i = 0; i < 20; i++) { + suggestedProxies.push(await proxyConfiguration.newUrl('session-id', request)); + } + + expect(suggestedProxies).toHaveLength(20); + expect(suggestedProxies).toContain(tieredProxyUrls[1][0]); + expect(suggestedProxies).toContain(tieredProxyUrls[2][0]); + + suggestedProxies = []; + + request.retryCount = 0; + for (let i = 0; i < 20; i++) { + suggestedProxies.push(await proxyConfiguration.newUrl('session-id', request)); + } + + expect(suggestedProxies).toHaveLength(20); + expect(suggestedProxies).toContain(tieredProxyUrls[1][0]); + expect(suggestedProxies).toContain(tieredProxyUrls[0][0]); + }); + }); }); From a2b826966c92eca0b9219d935eb857840242cecd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?= Date: Wed, 21 Feb 2024 10:41:22 +0100 Subject: [PATCH 02/14] docs: reformat jsdoc --- packages/core/src/proxy_configuration.ts | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/packages/core/src/proxy_configuration.ts b/packages/core/src/proxy_configuration.ts index 1a3caffb9e42..2f6049779349 100644 --- a/packages/core/src/proxy_configuration.ts +++ b/packages/core/src/proxy_configuration.ts @@ -24,12 +24,11 @@ export interface ProxyConfigurationOptions { newUrlFunction?: ProxyConfigurationFunction; /** - * An array of custom proxy URLs tiers to be rotated. 
- * This is a more advanced version of `proxyUrls` that allows you to define - * a hierarchy of proxy URLs. If everything goes well, all the requests will be sent - * through the first proxy URL in the list. - * Whenever the crawler encounters a problem with the current proxy on the given domain, it will switch to the following proxy in the list. - * The crawler probes lower-level proxies at given intervals to check if it can make the downshift. + * An array of custom proxy URLs to be rotated, stratified in tiers. + * This is a more advanced version of `proxyUrls` that allows you to define a hierarchy of proxy URLs. + * If everything goes well, all the requests will be sent through the first proxy URL in the list. + * Whenever the crawler encounters a problem with the current proxy on the given domain, it will switch to the higher tier for this domain. + * The crawler probes lower-level proxies at intervals to check if it can make the tier downshift. * * This feature is useful when you have a set of proxies with different performance characteristics (speed, price, antibot performance etc.) and you want to use the best one for each domain. 
*/ From 3206810ae57d3ba3bd491584dff3276c8c637bc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?= Date: Wed, 21 Feb 2024 14:50:18 +0100 Subject: [PATCH 03/14] fix: lint, fix tests --- packages/core/src/proxy_configuration.ts | 4 ++-- test/core/crawlers/cheerio_crawler.test.ts | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/core/src/proxy_configuration.ts b/packages/core/src/proxy_configuration.ts index 2f6049779349..c21658d0043c 100644 --- a/packages/core/src/proxy_configuration.ts +++ b/packages/core/src/proxy_configuration.ts @@ -165,7 +165,7 @@ export class ProxyConfiguration { const { proxyUrls, newUrlFunction, tieredProxyUrls } = options; - if ([proxyUrls, newUrlFunction, tieredProxyUrls].filter(x => x).length > 1) this._throwCannotCombineCustomMethods(); + if ([proxyUrls, newUrlFunction, tieredProxyUrls].filter((x) => x).length > 1) this._throwCannotCombineCustomMethods(); if (!proxyUrls && !newUrlFunction && validateRequired) this._throwNoOptionsProvided(); this.proxyUrls = proxyUrls; @@ -211,7 +211,7 @@ export class ProxyConfiguration { if (!this.tieredProxyUrls) throw new Error('Tiered proxy URLs are not set'); if (!request) { - const allProxyUrls = this.tieredProxyUrls.flatMap(x => x); + const allProxyUrls = this.tieredProxyUrls.flat(); return allProxyUrls[this.nextCustomUrlIndex++ % allProxyUrls.length]; } diff --git a/test/core/crawlers/cheerio_crawler.test.ts b/test/core/crawlers/cheerio_crawler.test.ts index aa0dc169e467..7587403153d3 100644 --- a/test/core/crawlers/cheerio_crawler.test.ts +++ b/test/core/crawlers/cheerio_crawler.test.ts @@ -1174,7 +1174,7 @@ describe('CheerioCrawler', () => { // localhost proxy causes proxy errors, session rotations and finally throws, but we don't care } - expect(newUrlSpy).toBeCalledWith(usedSession.id); + expect(newUrlSpy).toBeCalledWith(usedSession.id, expect.any(Request)); }); }); From 9c0a5c99c0e2709e08e458823048c295b45a7221 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?= Date: Mon, 11 Mar 2024 14:20:48 +0100 Subject: [PATCH 04/14] feat: make adaptive proxy work with `useIncognitoPages: false` --- .../src/internals/browser-crawler.ts | 41 +++++++++------- .../abstract-classes/browser-controller.ts | 6 +++ .../src/abstract-classes/browser-plugin.ts | 2 + packages/browser-pool/src/browser-pool.ts | 48 +++++++++++++++---- packages/browser-pool/src/launch-context.ts | 4 ++ packages/core/src/proxy_configuration.ts | 38 +++++++++++---- 6 files changed, 105 insertions(+), 34 deletions(-) diff --git a/packages/browser-crawler/src/internals/browser-crawler.ts b/packages/browser-crawler/src/internals/browser-crawler.ts index 79fbcaaa0c05..81df819f4ec0 100644 --- a/packages/browser-crawler/src/internals/browser-crawler.ts +++ b/packages/browser-crawler/src/internals/browser-crawler.ts @@ -492,22 +492,26 @@ export abstract class BrowserCrawler< const useIncognitoPages = this.launchContext?.useIncognitoPages; const experimentalContainers = this.launchContext?.experimentalContainers; - if (this.proxyConfiguration && (useIncognitoPages || experimentalContainers)) { - const { session } = crawlingContext; - - const proxyInfo = await this.proxyConfiguration.newProxyInfo(session?.id, crawlingContext.request); - crawlingContext.proxyInfo = proxyInfo; - - newPageOptions.proxyUrl = proxyInfo.url; - - if (this.proxyConfiguration.isManInTheMiddle) { - /** - * @see https://playwright.dev/docs/api/class-browser/#browser-new-context - * @see https://github.com/puppeteer/puppeteer/blob/main/docs/api.md - */ - newPageOptions.pageOptions = { - ignoreHTTPSErrors: true, - }; + if (this.proxyConfiguration) { + if (useIncognitoPages || experimentalContainers) { + const { session } = crawlingContext; + + const proxyInfo = await this.proxyConfiguration.newProxyInfo(session?.id, { request: crawlingContext.request }); + crawlingContext.proxyInfo = proxyInfo; + + newPageOptions.proxyUrl = proxyInfo.url; + + if 
(this.proxyConfiguration.isManInTheMiddle) { + /** + * @see https://playwright.dev/docs/api/class-browser/#browser-new-context + * @see https://github.com/puppeteer/puppeteer/blob/main/docs/api.md + */ + newPageOptions.pageOptions = { + ignoreHTTPSErrors: true, + }; + } + } else { + newPageOptions.proxyTier = this.proxyConfiguration.getProxyTier(crawlingContext.request); } } @@ -702,7 +706,10 @@ export abstract class BrowserCrawler< } if (this.proxyConfiguration) { - const proxyInfo = await this.proxyConfiguration.newProxyInfo(launchContextExtends.session?.id); + const proxyInfo = await this.proxyConfiguration.newProxyInfo( + launchContextExtends.session?.id, + { proxyTier: (launchContext.proxyTier as number) ?? undefined }, + ); launchContext.proxyUrl = proxyInfo.url; launchContextExtends.proxyInfo = proxyInfo; diff --git a/packages/browser-pool/src/abstract-classes/browser-controller.ts b/packages/browser-pool/src/abstract-classes/browser-controller.ts index 2bd3b72af5c4..14396e02b41c 100644 --- a/packages/browser-pool/src/abstract-classes/browser-controller.ts +++ b/packages/browser-pool/src/abstract-classes/browser-controller.ts @@ -55,6 +55,12 @@ export abstract class BrowserController< */ launchContext: LaunchContext = undefined!; + /** + * The proxy tier tied to this browser controller. + * `undefined` if no tiered proxy is used. 
+ */ + proxyTier : number | undefined = undefined; + isActive = false; activePages = 0; diff --git a/packages/browser-pool/src/abstract-classes/browser-plugin.ts b/packages/browser-pool/src/abstract-classes/browser-plugin.ts index 1f70d7b567d2..25a040410692 100644 --- a/packages/browser-pool/src/abstract-classes/browser-plugin.ts +++ b/packages/browser-pool/src/abstract-classes/browser-plugin.ts @@ -145,6 +145,7 @@ export abstract class BrowserPlugin< useIncognitoPages = this.useIncognitoPages, userDataDir = this.userDataDir, experimentalContainers = this.experimentalContainers, + proxyTier, } = options; return new LaunchContext({ @@ -155,6 +156,7 @@ export abstract class BrowserPlugin< useIncognitoPages, experimentalContainers, userDataDir, + proxyTier, }); } diff --git a/packages/browser-pool/src/browser-pool.ts b/packages/browser-pool/src/browser-pool.ts index 1d2ed50a7cc1..38a424667c2a 100644 --- a/packages/browser-pool/src/browser-pool.ts +++ b/packages/browser-pool/src/browser-pool.ts @@ -16,6 +16,7 @@ import type { FingerprintGeneratorOptions } from './fingerprinting/types'; import type { LaunchContext } from './launch-context'; import { log } from './logger'; import type { InferBrowserPluginArray, UnwrapPromise } from './utils'; +import { Request } from '@crawlee/core'; const PAGE_CLOSE_KILL_TIMEOUT_MILLIS = 1000; const BROWSER_KILLER_INTERVAL_MILLIS = 10 * 1000; @@ -391,6 +392,7 @@ export class BrowserPool< pageOptions, browserPlugin = this._pickBrowserPlugin(), proxyUrl, + proxyTier, } = options; if (this.pages.has(id)) { @@ -403,8 +405,8 @@ export class BrowserPool< // Limiter is necessary - https://github.com/apify/crawlee/issues/1126 return this.limiter(async () => { - let browserController = this._pickBrowserWithFreeCapacity(browserPlugin); - if (!browserController) browserController = await this._launchBrowser(id, { browserPlugin }); + let browserController = this._pickBrowserWithFreeCapacity(browserPlugin, { proxyTier }); + if (!browserController) 
browserController = await this._launchBrowser(id, { browserPlugin, proxyTier }); tryCancel(); return this._createPageForBrowser(id, browserController, pageOptions, proxyUrl); @@ -632,6 +634,7 @@ export class BrowserPool< const { browserPlugin, launchOptions, + proxyTier, } = options; const browserController = browserPlugin.createController() as BrowserControllerReturn; @@ -640,6 +643,7 @@ export class BrowserPool< const launchContext = browserPlugin.createLaunchContext({ id: pageId, launchOptions, + proxyTier, }); try { @@ -656,6 +660,7 @@ export class BrowserPool< } log.debug('Launched new browser.', { id: browserController.id }); + browserController.proxyTier = proxyTier; try { // If the launch fails on the post-launch hooks, we need to clean up @@ -690,15 +695,13 @@ export class BrowserPool< return this.browserPlugins[pluginIndex]; } - private _pickBrowserWithFreeCapacity(browserPlugin: BrowserPlugin) { - for (const controller of this.activeBrowserControllers) { + private _pickBrowserWithFreeCapacity(browserPlugin: BrowserPlugin, options?: { proxyTier?: number }) { + return [...this.activeBrowserControllers].find((controller) => { const hasCapacity = controller.activePages < this.maxOpenPagesPerBrowser; const isCorrectPlugin = controller.browserPlugin === browserPlugin; - if (hasCapacity && isCorrectPlugin) { - return controller; - } - } - return undefined; + + return hasCapacity && isCorrectPlugin && (!options?.proxyTier || controller.proxyTier === options.proxyTier); + }); } private async _closeInactiveRetiredBrowsers() { @@ -724,6 +727,28 @@ export class BrowserPool< closedBrowserIds, }); } + + // const retiredBrowserIds: string[] = []; + + // for (const controller of this.activeBrowserControllers) { + // const millisSinceLastPageOpened = Date.now() - controller.lastPageOpenedAt; + // const isBrowserIdle = millisSinceLastPageOpened >= this.closeInactiveBrowserAfterMillis; + // const isBrowserEmpty = controller.activePages === 0; + + // if (isBrowserIdle && 
isBrowserEmpty) { + // const { id } = controller; + // log.debug('Retiring idle browser.', { id }); + // this.retireBrowserController(controller); + // retiredBrowserIds.push(id); + // } + // } + + // if (retiredBrowserIds.length) { + // log.debug('Retired idle browsers.', { + // count: retiredBrowserIds.length, + // retiredBrowserIds, + // }); + // } } private _overridePageClose(page: PageReturn) { @@ -821,6 +846,10 @@ export interface BrowserPoolNewPageOptions { @@ -856,4 +885,5 @@ export interface BrowserPoolNewPageInNewBrowserOptions { browserPlugin: BP; launchOptions?: BP['launchOptions']; + proxyTier?: number; } diff --git a/packages/browser-pool/src/launch-context.ts b/packages/browser-pool/src/launch-context.ts index e4c61ec5a1e1..a89d31edf29f 100644 --- a/packages/browser-pool/src/launch-context.ts +++ b/packages/browser-pool/src/launch-context.ts @@ -50,6 +50,7 @@ export interface LaunchContextOptions< */ userDataDir?: string; proxyUrl?: string; + proxyTier?: number; } export class LaunchContext< @@ -65,6 +66,7 @@ export class LaunchContext< useIncognitoPages: boolean; experimentalContainers: boolean; userDataDir: string; + proxyTier?: number; private _proxyUrl?: string; private readonly _reservedFieldNames = [...Reflect.ownKeys(this), 'extend']; @@ -81,6 +83,7 @@ export class LaunchContext< useIncognitoPages, experimentalContainers, userDataDir = '', + proxyTier, } = options; this.id = id; @@ -89,6 +92,7 @@ export class LaunchContext< this.useIncognitoPages = useIncognitoPages ?? false; this.experimentalContainers = experimentalContainers ?? 
false; this.userDataDir = userDataDir; + this.proxyTier = proxyTier; this._proxyUrl = proxyUrl; } diff --git a/packages/core/src/proxy_configuration.ts b/packages/core/src/proxy_configuration.ts index c21658d0043c..41dfff554eb5 100644 --- a/packages/core/src/proxy_configuration.ts +++ b/packages/core/src/proxy_configuration.ts @@ -97,6 +97,11 @@ export interface ProxyInfo { port: number | string; } +interface TieredProxyOptions { + request?: Request; + proxyTier?: number; +} + /** * Configures connection to a proxy server with the provided options. Proxy servers are used to prevent target websites from blocking * your crawlers based on IP address rate limits or blacklists. Setting proxy configuration in your crawlers automatically configures @@ -190,10 +195,10 @@ export class ProxyConfiguration { * The identifier must not be longer than 50 characters and include only the following: `0-9`, `a-z`, `A-Z`, `"."`, `"_"` and `"~"`. * @return Represents information about used proxy and its configuration. 
*/ - async newProxyInfo(sessionId?: string | number, request?: Request): Promise { + async newProxyInfo(sessionId?: string | number, options?: TieredProxyOptions): Promise { if (typeof sessionId === 'number') sessionId = `${sessionId}`; - const url = await this.newUrl(sessionId, request); + const url = await this.newUrl(sessionId, options); const { username, password, port, hostname } = new URL(url); @@ -207,15 +212,32 @@ export class ProxyConfiguration { }; } - _handleTieredUrl(_sessionId: string, request?: Request): string { + protected _handleTieredUrl(_sessionId: string, options?: TieredProxyOptions): string { if (!this.tieredProxyUrls) throw new Error('Tiered proxy URLs are not set'); - if (!request) { + if (!options || (!options?.request && options?.proxyTier === undefined)) { const allProxyUrls = this.tieredProxyUrls.flat(); return allProxyUrls[this.nextCustomUrlIndex++ % allProxyUrls.length]; } - const domain = new URL(request?.url).hostname; + let tierPrediction = options.proxyTier!; + + if (tierPrediction === null) { + tierPrediction = this.getProxyTier(options.request!)!; + } + + return this.tieredProxyUrls![tierPrediction][this.nextCustomUrlIndex++ % this.tieredProxyUrls![tierPrediction].length]; + } + + /** + * Given a `Request` object, this function returns the tier of the proxy that should be used for the request. + * + * This returns `null` if `tieredProxyUrls` option is not set. 
+ */ + getProxyTier(request: Request): number | null { + if (!this.tieredProxyUrls) return null; + + const domain = new URL(request.url).hostname; const { retryCount } = request; if (!this.domainTiers.has(domain)) { @@ -238,7 +260,7 @@ export class ProxyConfiguration { this.domainTiers.set(domain, [...history, tierPrediction].slice(-4, 4)); - return this.tieredProxyUrls![tierPrediction][this.nextCustomUrlIndex++ % this.tieredProxyUrls![tierPrediction].length]; + return tierPrediction; } /** @@ -254,7 +276,7 @@ export class ProxyConfiguration { * @return A string with a proxy URL, including authentication credentials and port number. * For example, `http://bob:password123@proxy.example.com:8000` */ - async newUrl(sessionId?: string | number, request?: Request): Promise { + async newUrl(sessionId?: string | number, options?: TieredProxyOptions): Promise { if (typeof sessionId === 'number') sessionId = `${sessionId}`; if (this.newUrlFunction) { @@ -262,7 +284,7 @@ export class ProxyConfiguration { } if (this.tieredProxyUrls) { - return this._handleTieredUrl(sessionId ?? Math.random().toString().slice(2, 6), request); + return this._handleTieredUrl(sessionId ?? 
Math.random().toString().slice(2, 6), options); } return this._handleCustomUrl(sessionId); From 17592ae195f17fab8d5149be8b4db429aafa4bc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?= Date: Mon, 11 Mar 2024 14:26:10 +0100 Subject: [PATCH 05/14] chore: lint fix --- packages/browser-pool/src/browser-pool.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/browser-pool/src/browser-pool.ts b/packages/browser-pool/src/browser-pool.ts index 38a424667c2a..363e0e753f7b 100644 --- a/packages/browser-pool/src/browser-pool.ts +++ b/packages/browser-pool/src/browser-pool.ts @@ -1,4 +1,5 @@ import { addTimeoutToPromise, tryCancel } from '@apify/timeout'; +import { Request } from '@crawlee/core'; import type { BrowserFingerprintWithHeaders } from 'fingerprint-generator'; import { FingerprintGenerator } from 'fingerprint-generator'; import { FingerprintInjector } from 'fingerprint-injector'; @@ -16,7 +17,6 @@ import type { FingerprintGeneratorOptions } from './fingerprinting/types'; import type { LaunchContext } from './launch-context'; import { log } from './logger'; import type { InferBrowserPluginArray, UnwrapPromise } from './utils'; -import { Request } from '@crawlee/core'; const PAGE_CLOSE_KILL_TIMEOUT_MILLIS = 1000; const BROWSER_KILLER_INTERVAL_MILLIS = 10 * 1000; From 7589f4be524fb3d94253e7cfb04e424fe5921630 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?= Date: Wed, 13 Mar 2024 14:07:25 +0100 Subject: [PATCH 06/14] feat: add `Request.addHook`, `ProxyTierTracker` and optimum-search --- packages/browser-pool/src/browser-pool.ts | 2 +- packages/core/src/proxy_configuration.ts | 63 ++++++++++++++++------- packages/core/src/request.ts | 19 +++++++ 3 files changed, 65 insertions(+), 19 deletions(-) diff --git a/packages/browser-pool/src/browser-pool.ts b/packages/browser-pool/src/browser-pool.ts index 363e0e753f7b..0e3db7e1583d 100644 --- a/packages/browser-pool/src/browser-pool.ts +++ 
b/packages/browser-pool/src/browser-pool.ts @@ -700,7 +700,7 @@ export class BrowserPool< const hasCapacity = controller.activePages < this.maxOpenPagesPerBrowser; const isCorrectPlugin = controller.browserPlugin === browserPlugin; - return hasCapacity && isCorrectPlugin && (!options?.proxyTier || controller.proxyTier === options.proxyTier); + return hasCapacity && isCorrectPlugin && (typeof options?.proxyTier !== 'number' || controller.proxyTier === options.proxyTier); }); } diff --git a/packages/core/src/proxy_configuration.ts b/packages/core/src/proxy_configuration.ts index 41dfff554eb5..c5611230e9fd 100644 --- a/packages/core/src/proxy_configuration.ts +++ b/packages/core/src/proxy_configuration.ts @@ -130,6 +130,44 @@ interface TieredProxyOptions { * ``` * @category Scaling */ + +class ProxyTierTracker { + private histogram: number[]; + private currentTier: number; + + constructor(tieredProxyUrls: string[][]) { + this.histogram = tieredProxyUrls.map(() => 0); + this.currentTier = 0; + } + + private processStep(): void { + this.histogram.forEach((x, i) => { + if (this.currentTier === i) return; + if (x > 0) this.histogram[i]--; + }); + + const left = this.currentTier > 0 ? this.histogram[this.currentTier - 1] : Infinity; + const right = this.currentTier < this.histogram.length - 1 ? this.histogram[this.currentTier + 1] : Infinity; + + if (this.histogram[this.currentTier] > Math.min(left, right)) { + this.currentTier = left <= right ? 
this.currentTier - 1 : this.currentTier + 1; + } + + if (this.histogram[this.currentTier] === left) { + this.currentTier--; + } + } + + addError(tier: number) { + this.histogram[tier] += 10; + } + + getTier() { + this.processStep(); + return this.currentTier; + } +} + export class ProxyConfiguration { isManInTheMiddle = false; protected nextCustomUrlIndex = 0; @@ -138,7 +176,7 @@ export class ProxyConfiguration { protected usedProxyUrls = new Map(); protected newUrlFunction?: ProxyConfigurationFunction; protected log = log.child({ prefix: 'ProxyConfiguration' }); - protected domainTiers = new Map(); + protected domainTiers = new Map(); /** * Creates a {@apilink ProxyConfiguration} instance based on the provided options. Proxy servers are used to prevent target websites from @@ -238,27 +276,16 @@ export class ProxyConfiguration { if (!this.tieredProxyUrls) return null; const domain = new URL(request.url).hostname; - const { retryCount } = request; - if (!this.domainTiers.has(domain)) { - this.domainTiers.set(domain, [0]); - } - - const history = this.domainTiers.get(domain)!; - - let tierPrediction; - - if (retryCount === 0) { - const averageTier = history.reduce((a, b) => a + b, 0) / history.length; - tierPrediction = Math.floor(averageTier); - if (history.every((x, _, a) => x === a[0])) tierPrediction = Math.max(0, tierPrediction - 1); - } else { - tierPrediction = history[history.length - 1] + 1; + this.domainTiers.set(domain, new ProxyTierTracker(this.tieredProxyUrls)); } - tierPrediction = Math.min(tierPrediction, this.tieredProxyUrls!.length - 1); + const tracker = this.domainTiers.get(domain)!; + const tierPrediction = tracker.getTier(); - this.domainTiers.set(domain, [...history, tierPrediction].slice(-4, 4)); + request.addHook('sessionRotation', () => { + tracker.addError(tierPrediction); + }); return tierPrediction; } diff --git a/packages/core/src/request.ts b/packages/core/src/request.ts index ba7bfaeecd76..f738f79a6634 100644 --- 
a/packages/core/src/request.ts +++ b/packages/core/src/request.ts @@ -78,6 +78,8 @@ export enum RequestState { * ``` * @category Sources */ + +type RequestEvent = 'sessionRotation'; export class Request { /** Request ID */ id?: string; @@ -131,6 +133,8 @@ export class Request { */ handledAt?: string; + hooks: Map void)[]> = new Map(); + /** * `Request` parameters including the URL, HTTP method and headers, and others. */ @@ -281,6 +285,8 @@ export class Request { } else { this.userData.__crawlee.sessionRotationCount = value; } + + this.hooks.get('sessionRotation')?.forEach((hook) => hook(this)); } /** shortcut for getting `request.userData.label` */ @@ -333,6 +339,19 @@ export class Request { } } + /** + * Adds a hook to the request. The hook is called on the specified `event`. + * The hooks are only useful for short-term modifications of the request - note that the hooks are not persisted once the request is stored to a storage. + * @param event The event to add the hook to. + * @param callable The hook to add. + */ + addHook(event: RequestEvent, callable: (request: Request) => void | Promise): void { + if (!this.hooks.has(event)) { + this.hooks.set(event, []); + } + this.hooks.get(event)!.push(callable); + } + /** * Stores information about an error that occurred during processing of this request. 
* From 62981e22835b182d3d315e88ebabcd49fbe3376c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?= Date: Wed, 13 Mar 2024 17:14:49 +0100 Subject: [PATCH 07/14] chore: lint fixes --- packages/browser-pool/src/browser-pool.ts | 1 - packages/http-crawler/src/internals/http-crawler.ts | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/packages/browser-pool/src/browser-pool.ts b/packages/browser-pool/src/browser-pool.ts index 0e3db7e1583d..d6c2e09a9824 100644 --- a/packages/browser-pool/src/browser-pool.ts +++ b/packages/browser-pool/src/browser-pool.ts @@ -1,5 +1,4 @@ import { addTimeoutToPromise, tryCancel } from '@apify/timeout'; -import { Request } from '@crawlee/core'; import type { BrowserFingerprintWithHeaders } from 'fingerprint-generator'; import { FingerprintGenerator } from 'fingerprint-generator'; import { FingerprintInjector } from 'fingerprint-injector'; diff --git a/packages/http-crawler/src/internals/http-crawler.ts b/packages/http-crawler/src/internals/http-crawler.ts index 54f481e18e54..a5788957ea17 100644 --- a/packages/http-crawler/src/internals/http-crawler.ts +++ b/packages/http-crawler/src/internals/http-crawler.ts @@ -455,7 +455,7 @@ export class HttpCrawler Date: Thu, 14 Mar 2024 15:02:41 +0100 Subject: [PATCH 08/14] chore: fix tests --- packages/core/src/request.ts | 13 ++++++++----- test/core/crawlers/cheerio_crawler.test.ts | 2 +- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/packages/core/src/request.ts b/packages/core/src/request.ts index f738f79a6634..dc68539f23c0 100644 --- a/packages/core/src/request.ts +++ b/packages/core/src/request.ts @@ -133,7 +133,10 @@ export class Request { */ handledAt?: string; - hooks: Map void)[]> = new Map(); + /** + * Local hooks for the request. Note that the hooks are not persisted once the request is stored to a storage. + */ + hooks: Partial void)[]>> = {}; /** * `Request` parameters including the URL, HTTP method and headers, and others. 
@@ -286,7 +289,7 @@ export class Request { this.userData.__crawlee.sessionRotationCount = value; } - this.hooks.get('sessionRotation')?.forEach((hook) => hook(this)); + this.hooks.sessionRotation?.forEach((hook) => hook(this)); } /** shortcut for getting `request.userData.label` */ @@ -346,10 +349,10 @@ export class Request { * @param callable The hook to add. */ addHook(event: RequestEvent, callable: (request: Request) => void | Promise<void>): void { - if (!this.hooks.has(event)) { - this.hooks.set(event, []); + if (!this.hooks[event]) { + this.hooks[event] = []; } - this.hooks.get(event)!.push(callable); + this.hooks[event]!.push(callable); } /** diff --git a/test/core/crawlers/cheerio_crawler.test.ts b/test/core/crawlers/cheerio_crawler.test.ts index 7587403153d3..0b9a7c29bcde 100644 --- a/test/core/crawlers/cheerio_crawler.test.ts +++ b/test/core/crawlers/cheerio_crawler.test.ts @@ -1174,7 +1174,7 @@ describe('CheerioCrawler', () => { // localhost proxy causes proxy errors, session rotations and finally throws, but we don't care } - expect(newUrlSpy).toBeCalledWith(usedSession.id, expect.any(Request)); + expect(newUrlSpy).toBeCalledWith(usedSession.id, expect.objectContaining({ request: expect.any(Request) })); }); }); From 45ed2acbbfd76733be873baf021a0863e3c37716 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?= Date: Thu, 14 Mar 2024 15:29:43 +0100 Subject: [PATCH 09/14] chore: rewrite tests for new tiered proxy logic --- packages/core/src/proxy_configuration.ts | 2 +- test/core/proxy_configuration.test.ts | 52 ++++++++++++++++-------- 2 files changed, 35 insertions(+), 19 deletions(-) diff --git a/packages/core/src/proxy_configuration.ts b/packages/core/src/proxy_configuration.ts index c5611230e9fd..d6844db32fab 100644 --- a/packages/core/src/proxy_configuration.ts +++ b/packages/core/src/proxy_configuration.ts @@ -260,7 +260,7 @@ export class ProxyConfiguration { let tierPrediction = options.proxyTier!; - if (tierPrediction === null) { + if 
(typeof tierPrediction !== 'number') { tierPrediction = this.getProxyTier(options.request!)!; } diff --git a/test/core/proxy_configuration.test.ts b/test/core/proxy_configuration.test.ts index a1a9a39953a2..6d851ab83bd3 100644 --- a/test/core/proxy_configuration.test.ts +++ b/test/core/proxy_configuration.test.ts @@ -1,4 +1,5 @@ import { ProxyConfiguration, Request } from '@crawlee/core'; +import got from 'got'; const sessionId = 538909250932; @@ -195,7 +196,7 @@ describe('ProxyConfiguration', () => { expect(await proxyConfiguration.newUrl()).toEqual(tieredProxyUrls[0][0]); }); - test('high retry count picks higher-level proxies', async () => { + test('rotating request session results in higher-level proxies', async () => { const proxyConfiguration = new ProxyConfiguration({ tieredProxyUrls: [ ['http://proxy.com:1111'], @@ -207,12 +208,23 @@ describe('ProxyConfiguration', () => { const request = new Request({ url: 'http://example.com', }); - request.retryCount = 5; // @ts-expect-error protected property const { tieredProxyUrls } = proxyConfiguration; - expect(await proxyConfiguration.newUrl('session-id', request)).toEqual(tieredProxyUrls[1][0]); - expect(await proxyConfiguration.newUrl('session-id', request)).toEqual(tieredProxyUrls[2][0]); + expect(await proxyConfiguration.newUrl('session-id', { request })).toEqual(tieredProxyUrls[0][0]); + + request.sessionRotationCount++; + expect(await proxyConfiguration.newUrl('session-id', { request })).toEqual(tieredProxyUrls[1][0]); + + request.sessionRotationCount++; + expect(await proxyConfiguration.newUrl('session-id', { request })).toEqual(tieredProxyUrls[2][0]); + + // we still get the same (higher) proxy tier even with a new request + const request2 = new Request({ + url: 'http://example.com/another-resource', + }); + + expect(await proxyConfiguration.newUrl('session-id', { request })).toEqual(tieredProxyUrls[2][0]); }); test('upshifts and downshifts properly', async () => { @@ -230,27 +242,31 @@ 
describe('ProxyConfiguration', () => { url: 'http://example.com', }); - let suggestedProxies = []; - - request.retryCount = 5; - for (let i = 0; i < 20; i++) { - suggestedProxies.push(await proxyConfiguration.newUrl('session-id', request)); + let gotToTheHighestProxy = false; + for (let i = 0; i < 10; i++) { + const lastProxyUrl = await proxyConfiguration.newUrl('session-id', { request }); + if (lastProxyUrl === tieredProxyUrls[2][0]) { + gotToTheHighestProxy = true; + break; + } + request.sessionRotationCount++; } - expect(suggestedProxies).toHaveLength(20); - expect(suggestedProxies).toContain(tieredProxyUrls[1][0]); - expect(suggestedProxies).toContain(tieredProxyUrls[2][0]); + expect(gotToTheHighestProxy).toBe(true); - suggestedProxies = []; + // now let's go back down + let gotToTheLowestProxy = false; - request.retryCount = 0; for (let i = 0; i < 20; i++) { - suggestedProxies.push(await proxyConfiguration.newUrl('session-id', request)); + const lastProxyUrl = await proxyConfiguration.newUrl('session-id', { request }); + if (lastProxyUrl === tieredProxyUrls[0][0]) { + gotToTheLowestProxy = true; + break; + } + // we don't need to increment the session rotation count here - we say that the proxies are good, so we promote the lower tier proxy prediction } - expect(suggestedProxies).toHaveLength(20); - expect(suggestedProxies).toContain(tieredProxyUrls[1][0]); - expect(suggestedProxies).toContain(tieredProxyUrls[0][0]); + expect(gotToTheLowestProxy).toBe(true); }); }); }); From 6df3356b22e854d831f42882193fc0a2428f9b27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?= Date: Thu, 14 Mar 2024 15:42:38 +0100 Subject: [PATCH 10/14] chore: remove unused code --- packages/browser-pool/src/browser-pool.ts | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/packages/browser-pool/src/browser-pool.ts b/packages/browser-pool/src/browser-pool.ts index d6c2e09a9824..fd59d1ab0cd4 100644 --- a/packages/browser-pool/src/browser-pool.ts +++ 
b/packages/browser-pool/src/browser-pool.ts @@ -726,28 +726,6 @@ export class BrowserPool< closedBrowserIds, }); } - - // const retiredBrowserIds: string[] = []; - - // for (const controller of this.activeBrowserControllers) { - // const millisSinceLastPageOpened = Date.now() - controller.lastPageOpenedAt; - // const isBrowserIdle = millisSinceLastPageOpened >= this.closeInactiveBrowserAfterMillis; - // const isBrowserEmpty = controller.activePages === 0; - - // if (isBrowserIdle && isBrowserEmpty) { - // const { id } = controller; - // log.debug('Retiring idle browser.', { id }); - // this.retireBrowserController(controller); - // retiredBrowserIds.push(id); - // } - // } - - // if (retiredBrowserIds.length) { - // log.debug('Retired idle browsers.', { - // count: retiredBrowserIds.length, - // retiredBrowserIds, - // }); - // } } private _overridePageClose(page: PageReturn) { From 49043587aa1edfb36b8e4108b71072c57507472d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?= Date: Fri, 15 Mar 2024 15:40:09 +0100 Subject: [PATCH 11/14] chore: lint, PR comments --- .../abstract-classes/browser-controller.ts | 2 +- packages/core/src/proxy_configuration.ts | 73 ++++++++++++------- packages/core/src/request.ts | 4 +- test/core/proxy_configuration.test.ts | 1 - 4 files changed, 50 insertions(+), 30 deletions(-) diff --git a/packages/browser-pool/src/abstract-classes/browser-controller.ts b/packages/browser-pool/src/abstract-classes/browser-controller.ts index 14396e02b41c..91acb1681756 100644 --- a/packages/browser-pool/src/abstract-classes/browser-controller.ts +++ b/packages/browser-pool/src/abstract-classes/browser-controller.ts @@ -59,7 +59,7 @@ export abstract class BrowserController< * The proxy tier tied to this browser controller. * `undefined` if no tiered proxy is used. 
*/ - proxyTier : number | undefined = undefined; + proxyTier?: number; isActive = false; diff --git a/packages/core/src/proxy_configuration.ts b/packages/core/src/proxy_configuration.ts index d6844db32fab..d32cfff27c3e 100644 --- a/packages/core/src/proxy_configuration.ts +++ b/packages/core/src/proxy_configuration.ts @@ -103,34 +103,10 @@ interface TieredProxyOptions { } /** - * Configures connection to a proxy server with the provided options. Proxy servers are used to prevent target websites from blocking - * your crawlers based on IP address rate limits or blacklists. Setting proxy configuration in your crawlers automatically configures - * them to use the selected proxies for all connections. You can get information about the currently used proxy by inspecting - * the {@apilink ProxyInfo} property in your crawler's page function. There, you can inspect the proxy's URL and other attributes. - * - * If you want to use your own proxies, use the {@apilink ProxyConfigurationOptions.proxyUrls} option. Your list of proxy URLs will - * be rotated by the configuration if this option is provided. - * - * **Example usage:** - * - * ```javascript - * - * const proxyConfiguration = new ProxyConfiguration({ - * proxyUrls: ['...', '...'], - * }); + * Internal class for tracking the proxy tier history for a specific domain. * - * const crawler = new CheerioCrawler({ - * // ... - * proxyConfiguration, - * requestHandler({ proxyInfo }) { - * const usedProxyUrl = proxyInfo.url; // Getting the proxy URL - * } - * }) - * - * ``` - * @category Scaling + * Predicts the best proxy tier for the next request based on the error history for different proxy tiers. */ - class ProxyTierTracker { private histogram: number[]; private currentTier: number; @@ -140,6 +116,9 @@ class ProxyTierTracker { this.currentTier = 0; } + /** + * Processes a single step of the algorithm and updates the current tier prediction based on the error history. 
+ */ private processStep(): void { this.histogram.forEach((x, i) => { if (this.currentTier === i) return; @@ -158,16 +137,52 @@ class ProxyTierTracker { } } + /** + * Increases the error score for the given proxy tier. This raises the chance of picking a different proxy tier for the subsequent requests. + * @param tier The proxy tier to mark as problematic. + */ addError(tier: number) { this.histogram[tier] += 10; } + /** + * Returns the best proxy tier for the next request based on the error history for different proxy tiers. + * @returns The proxy tier prediction + */ getTier() { this.processStep(); return this.currentTier; } } +/** + * Configures connection to a proxy server with the provided options. Proxy servers are used to prevent target websites from blocking + * your crawlers based on IP address rate limits or blacklists. Setting proxy configuration in your crawlers automatically configures + * them to use the selected proxies for all connections. You can get information about the currently used proxy by inspecting + * the {@apilink ProxyInfo} property in your crawler's page function. There, you can inspect the proxy's URL and other attributes. + * + * If you want to use your own proxies, use the {@apilink ProxyConfigurationOptions.proxyUrls} option. Your list of proxy URLs will + * be rotated by the configuration if this option is provided. + * + * **Example usage:** + * + * ```javascript + * + * const proxyConfiguration = new ProxyConfiguration({ + * proxyUrls: ['...', '...'], + * }); + * + * const crawler = new CheerioCrawler({ + * // ... 
+ * proxyConfiguration, + * requestHandler({ proxyInfo }) { + * const usedProxyUrl = proxyInfo.url; // Getting the proxy URL + * } + * }) + * + * ``` + * @category Scaling + */ export class ProxyConfiguration { isManInTheMiddle = false; protected nextCustomUrlIndex = 0; @@ -250,6 +265,12 @@ export class ProxyConfiguration { }; } + /** + * Given a session identifier and a request / proxy tier, this function returns a new proxy URL based on the provided configuration options. + * @param _sessionId Session identifier + * @param options Options for the tiered proxy rotation + * @returns A string with a proxy URL. + */ protected _handleTieredUrl(_sessionId: string, options?: TieredProxyOptions): string { if (!this.tieredProxyUrls) throw new Error('Tiered proxy URLs are not set'); diff --git a/packages/core/src/request.ts b/packages/core/src/request.ts index dc68539f23c0..de17665a37fd 100644 --- a/packages/core/src/request.ts +++ b/packages/core/src/request.ts @@ -47,6 +47,8 @@ export enum RequestState { SKIPPED, } +type RequestEvent = 'sessionRotation'; + /** * Represents a URL to be crawled, optionally including HTTP method, headers, payload and other metadata. * The `Request` object also stores information about errors that occurred during processing of the request. 
@@ -78,8 +80,6 @@ export enum RequestState { * ``` * @category Sources */ - -type RequestEvent = 'sessionRotation'; export class Request { /** Request ID */ id?: string; diff --git a/test/core/proxy_configuration.test.ts b/test/core/proxy_configuration.test.ts index 6d851ab83bd3..1bd07bb96c86 100644 --- a/test/core/proxy_configuration.test.ts +++ b/test/core/proxy_configuration.test.ts @@ -1,5 +1,4 @@ import { ProxyConfiguration, Request } from '@crawlee/core'; -import got from 'got'; const sessionId = 538909250932; From 9b1ca39f1f179b2024759144e425996bdb261404 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?= Date: Mon, 18 Mar 2024 16:09:16 +0100 Subject: [PATCH 12/14] feat: use `forefront` for stateless proxy tier rotation --- .../src/internals/basic-crawler.ts | 2 +- packages/core/src/proxy_configuration.ts | 16 ++++++++++---- packages/core/src/request.ts | 22 ------------------- test/core/proxy_configuration.test.ts | 10 +++------ 4 files changed, 16 insertions(+), 34 deletions(-) diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts index d966a670baf4..5814b67f88db 100644 --- a/packages/basic-crawler/src/internals/basic-crawler.ts +++ b/packages/basic-crawler/src/internals/basic-crawler.ts @@ -1384,7 +1384,7 @@ export class BasicCrawler { - tracker.addError(tierPrediction); - }); + request.userData.__crawlee.lastProxyTier = tierPrediction; + request.userData.__crawlee.forefront = true; return tierPrediction; } diff --git a/packages/core/src/request.ts b/packages/core/src/request.ts index de17665a37fd..ba7bfaeecd76 100644 --- a/packages/core/src/request.ts +++ b/packages/core/src/request.ts @@ -47,8 +47,6 @@ export enum RequestState { SKIPPED, } -type RequestEvent = 'sessionRotation'; - /** * Represents a URL to be crawled, optionally including HTTP method, headers, payload and other metadata. 
* The `Request` object also stores information about errors that occurred during processing of the request. @@ -133,11 +131,6 @@ export class Request { */ handledAt?: string; - /** - * Local hooks for the request. Note that the hooks are not persisted once the request is stored to a storage. - */ - hooks: Partial<Record<RequestEvent, ((request: Request) => void)[]>> = {}; - /** * `Request` parameters including the URL, HTTP method and headers, and others. */ @@ -288,8 +281,6 @@ export class Request { } else { this.userData.__crawlee.sessionRotationCount = value; } - - this.hooks.sessionRotation?.forEach((hook) => hook(this)); } /** shortcut for getting `request.userData.label` */ @@ -342,19 +333,6 @@ export class Request { } } - /** - * Adds a hook to the request. The hook is called on the specified `event`. - * The hooks are only useful for short-term modifications of the request - note that the hooks are not persisted once the request is stored to a storage. - * @param event The event to add the hook to. - * @param callable The hook to add. - */ - addHook(event: RequestEvent, callable: (request: Request) => void | Promise<void>): void { - if (!this.hooks[event]) { - this.hooks[event] = []; - } - this.hooks[event]!.push(callable); - } - /** * Stores information about an error that occurred during processing of this request. 
* diff --git a/test/core/proxy_configuration.test.ts b/test/core/proxy_configuration.test.ts index 1bd07bb96c86..c6e98cc80777 100644 --- a/test/core/proxy_configuration.test.ts +++ b/test/core/proxy_configuration.test.ts @@ -195,7 +195,7 @@ describe('ProxyConfiguration', () => { expect(await proxyConfiguration.newUrl()).toEqual(tieredProxyUrls[0][0]); }); - test('rotating request session results in higher-level proxies', async () => { + test('rotating a request results in higher-level proxies', async () => { const proxyConfiguration = new ProxyConfiguration({ tieredProxyUrls: [ ['http://proxy.com:1111'], @@ -211,11 +211,7 @@ describe('ProxyConfiguration', () => { // @ts-expect-error protected property const { tieredProxyUrls } = proxyConfiguration; expect(await proxyConfiguration.newUrl('session-id', { request })).toEqual(tieredProxyUrls[0][0]); - - request.sessionRotationCount++; expect(await proxyConfiguration.newUrl('session-id', { request })).toEqual(tieredProxyUrls[1][0]); - - request.sessionRotationCount++; expect(await proxyConfiguration.newUrl('session-id', { request })).toEqual(tieredProxyUrls[2][0]); // we still get the same (higher) proxy tier even with a new request @@ -223,7 +219,7 @@ describe('ProxyConfiguration', () => { url: 'http://example.com/another-resource', }); - expect(await proxyConfiguration.newUrl('session-id', { request })).toEqual(tieredProxyUrls[2][0]); + expect(await proxyConfiguration.newUrl('session-id', { request: request2 })).toEqual(tieredProxyUrls[2][0]); }); test('upshifts and downshifts properly', async () => { @@ -262,7 +258,7 @@ describe('ProxyConfiguration', () => { gotToTheLowestProxy = true; break; } - // we don't need to increment the session rotation count here - we say that the proxies are good, so we promote the lower tier proxy prediction + // We don't increment the sessionRotationCount here - this causes the proxy tier to go down (current proxy is ok, so it tries to downshift in some time) } 
expect(gotToTheLowestProxy).toBe(true); From ae4039233255d3426156f274652356e40469377b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?= Date: Mon, 18 Mar 2024 16:10:41 +0100 Subject: [PATCH 13/14] chore: naming --- packages/core/src/proxy_configuration.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/core/src/proxy_configuration.ts b/packages/core/src/proxy_configuration.ts index 0c42e44697a2..40b0ef137e81 100644 --- a/packages/core/src/proxy_configuration.ts +++ b/packages/core/src/proxy_configuration.ts @@ -149,7 +149,7 @@ class ProxyTierTracker { * Returns the best proxy tier for the next request based on the error history for different proxy tiers. * @returns The proxy tier prediction */ - getTier() { + predictTier() { this.processStep(); return this.currentTier; } @@ -311,7 +311,7 @@ export class ProxyConfiguration { tracker.addError(request.userData.__crawlee.lastProxyTier); } - const tierPrediction = tracker.getTier(); + const tierPrediction = tracker.predictTier(); request.userData.__crawlee.lastProxyTier = tierPrediction; request.userData.__crawlee.forefront = true; From 13c7b3f970e657ec1f396553349a2cec95d4f372 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?= Date: Mon, 18 Mar 2024 16:27:38 +0100 Subject: [PATCH 14/14] fix: fix tests, cover more paths --- packages/basic-crawler/src/internals/basic-crawler.ts | 2 +- packages/core/src/proxy_configuration.ts | 2 ++ test/core/crawlers/basic_crawler.test.ts | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts index 5814b67f88db..de3053610ee3 100644 --- a/packages/basic-crawler/src/internals/basic-crawler.ts +++ b/packages/basic-crawler/src/internals/basic-crawler.ts @@ -1148,7 +1148,7 @@ export class BasicCrawler { // 1st try - expect(reclaimReq).toBeCalledWith(request1); + expect(reclaimReq).toBeCalledWith(request1, 
expect.objectContaining({})); expect(reclaimReq).toBeCalledTimes(3); expect(processed['http://example.com/0'].userData.foo).toBe('bar');