Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: better newUrlFunction for ProxyConfiguration #2392

Merged
merged 12 commits into from
Apr 4, 2024
27 changes: 12 additions & 15 deletions packages/browser-crawler/src/internals/browser-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -494,25 +494,22 @@ export abstract class BrowserCrawler<
const experimentalContainers = this.launchContext?.experimentalContainers;

if (this.proxyConfiguration) {
if (useIncognitoPages || experimentalContainers) {
const { session } = crawlingContext;
const { session } = crawlingContext;

const proxyInfo = await this.proxyConfiguration.newProxyInfo(session?.id, { request: crawlingContext.request });
crawlingContext.proxyInfo = proxyInfo;
const proxyInfo = await this.proxyConfiguration.newProxyInfo(session?.id, { request: crawlingContext.request });
crawlingContext.proxyInfo = proxyInfo;

newPageOptions.proxyUrl = proxyInfo.url;
newPageOptions.proxyUrl = proxyInfo?.url;
newPageOptions.proxyTier = proxyInfo?.proxyTier;

if (this.proxyConfiguration.isManInTheMiddle) {
/**
if (this.proxyConfiguration.isManInTheMiddle) {
/**
* @see https://playwright.dev/docs/api/class-browser/#browser-new-context
* @see https://github.com/puppeteer/puppeteer/blob/main/docs/api.md
*/
newPageOptions.pageOptions = {
ignoreHTTPSErrors: true,
};
}
} else {
newPageOptions.proxyTier = this.proxyConfiguration.getProxyTier(crawlingContext.request);
newPageOptions.pageOptions = {
ignoreHTTPSErrors: true,
};
}
}

Expand Down Expand Up @@ -706,12 +703,12 @@ export abstract class BrowserCrawler<
launchContextExtends.session = await this.sessionPool.getSession();
}

if (this.proxyConfiguration) {
if (this.proxyConfiguration && !launchContext.proxyUrl) {
const proxyInfo = await this.proxyConfiguration.newProxyInfo(
launchContextExtends.session?.id,
{ proxyTier: (launchContext.proxyTier as number) ?? undefined },
);
launchContext.proxyUrl = proxyInfo.url;
launchContext.proxyUrl = proxyInfo?.url;
launchContextExtends.proxyInfo = proxyInfo;

// Disable SSL verification for MITM proxies
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,12 @@ export abstract class BrowserController<
*/
proxyTier?: number;

/**
* The proxy URL used by the browser controller. This is set every time the browser controller uses a proxy (even a tiered one).
* `undefined` if no proxy is used.
*/
proxyUrl?: string;

isActive = false;

activePages = 0;
Expand Down
27 changes: 22 additions & 5 deletions packages/browser-pool/src/browser-pool.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { addTimeoutToPromise, tryCancel } from '@apify/timeout';
import type { TieredProxy } from '@crawlee/core';
import type { BrowserFingerprintWithHeaders } from 'fingerprint-generator';
import { FingerprintGenerator } from 'fingerprint-generator';
import { FingerprintInjector } from 'fingerprint-injector';
Expand Down Expand Up @@ -404,8 +405,9 @@ export class BrowserPool<

// Limiter is necessary - https://github.com/apify/crawlee/issues/1126
return this.limiter(async () => {
let browserController = this._pickBrowserWithFreeCapacity(browserPlugin, { proxyTier });
if (!browserController) browserController = await this._launchBrowser(id, { browserPlugin, proxyTier });
let browserController = this._pickBrowserWithFreeCapacity(browserPlugin, { proxyTier, proxyUrl });

if (!browserController) browserController = await this._launchBrowser(id, { browserPlugin, proxyTier, proxyUrl });
tryCancel();

return this._createPageForBrowser(id, browserController, pageOptions, proxyUrl);
Expand Down Expand Up @@ -634,6 +636,7 @@ export class BrowserPool<
browserPlugin,
launchOptions,
proxyTier,
proxyUrl,
} = options;

const browserController = browserPlugin.createController() as BrowserControllerReturn;
Expand All @@ -643,6 +646,7 @@ export class BrowserPool<
id: pageId,
launchOptions,
proxyTier,
proxyUrl,
});

try {
Expand All @@ -660,6 +664,7 @@ export class BrowserPool<

log.debug('Launched new browser.', { id: browserController.id });
browserController.proxyTier = proxyTier;
browserController.proxyUrl = proxyUrl;

try {
// If the launch fails on the post-launch hooks, we need to clean up
Expand Down Expand Up @@ -694,12 +699,23 @@ export class BrowserPool<
return this.browserPlugins[pluginIndex];
}

private _pickBrowserWithFreeCapacity(browserPlugin: BrowserPlugin, options?: { proxyTier?: number }) {
private _pickBrowserWithFreeCapacity(
browserPlugin: BrowserPlugin,
options?: Partial<TieredProxy>,
) {
return [...this.activeBrowserControllers].find((controller) => {
const hasCapacity = controller.activePages < this.maxOpenPagesPerBrowser;
const isCorrectPlugin = controller.browserPlugin === browserPlugin;

return hasCapacity && isCorrectPlugin && (typeof options?.proxyTier !== 'number' || controller.proxyTier === options.proxyTier);
const isSameProxyUrl = (controller.proxyUrl === options?.proxyUrl);
const isCorrectProxyTier = controller.proxyTier === options?.proxyTier;

return isCorrectPlugin
&& hasCapacity
&& (
(options?.proxyTier && isCorrectProxyTier)
|| (options?.proxyUrl && isSameProxyUrl)
|| (!options?.proxyUrl && !options?.proxyTier && !controller.proxyUrl && !controller.proxyTier)
);
});
}

Expand Down Expand Up @@ -863,4 +879,5 @@ interface InternalLaunchBrowserOptions<BP extends BrowserPlugin> {
browserPlugin: BP;
launchOptions?: BP['launchOptions'];
proxyTier?: number;
proxyUrl?: string;
}
69 changes: 50 additions & 19 deletions packages/core/src/proxy_configuration.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import log from '@apify/log';
import { cryptoRandomObjectId } from '@apify/utilities';
import type { Dictionary } from '@crawlee/types';
import ow from 'ow';

import type { Request } from './request';

export interface ProxyConfigurationFunction {
(sessionId: string | number): string | Promise<string>;
(sessionId: string | number, options?: { request?: Request }): string | null | Promise<string | null>;
}

export interface ProxyConfigurationOptions {
Expand All @@ -17,8 +18,9 @@ export interface ProxyConfigurationOptions {
proxyUrls?: string[];

/**
* Custom function that allows you to generate the new proxy URL dynamically. It gets the `sessionId` as a parameter
* and should always return stringified proxy URL. Can be asynchronous.
* Custom function that allows you to generate the new proxy URL dynamically. It gets the `sessionId` as its first parameter and, when applicable, an optional second parameter containing the `Request` object.
* Can return either a stringified proxy URL or `null` if no proxy should be used. Can be asynchronous.
*
* This function is used to generate the URL when {@apilink ProxyConfiguration.newUrl} or {@apilink ProxyConfiguration.newProxyInfo} is called.
*/
newUrlFunction?: ProxyConfigurationFunction;
Expand All @@ -35,6 +37,11 @@ export interface ProxyConfigurationOptions {
tieredProxyUrls?: string[][];
}

/**
 * A proxy URL together with the proxy tier it was selected from.
 * This is the shape returned by `ProxyConfiguration._handleTieredUrl`.
 */
export interface TieredProxy {
/** The selected proxy URL. */
proxyUrl: string;
/**
 * Index of the tier (within `tieredProxyUrls`) the URL was taken from.
 * Left unset when the URL was picked without a request or an explicit tier,
 * i.e. by rotating over all tiers flattened together.
 */
proxyTier?: number;
}

/**
* The main purpose of the ProxyInfo object is to provide information
* about the current proxy connection used by the crawler for the request.
Expand Down Expand Up @@ -95,6 +102,11 @@ export interface ProxyInfo {
* Proxy port.
*/
port: number | string;

/**
* Proxy tier for the current proxy, if applicable (only for `tieredProxyUrls`).
*/
proxyTier?: number;
}

interface TieredProxyOptions {
Expand Down Expand Up @@ -250,10 +262,20 @@ export class ProxyConfiguration {
* The identifier must not be longer than 50 characters and include only the following: `0-9`, `a-z`, `A-Z`, `"."`, `"_"` and `"~"`.
* @return Represents information about used proxy and its configuration.
*/
async newProxyInfo(sessionId?: string | number, options?: TieredProxyOptions): Promise<ProxyInfo> {
async newProxyInfo(sessionId?: string | number, options?: TieredProxyOptions): Promise<ProxyInfo | undefined> {
if (typeof sessionId === 'number') sessionId = `${sessionId}`;

const url = await this.newUrl(sessionId, options);
let url: string | undefined;
let tier: number | undefined;
if (this.tieredProxyUrls) {
const { proxyUrl, proxyTier } = this._handleTieredUrl(sessionId ?? cryptoRandomObjectId(6), options);
url = proxyUrl;
tier = proxyTier;
} else {
url = await this.newUrl(sessionId, options);
}

if (!url) return undefined;

const { username, password, port, hostname } = new URL(url);

Expand All @@ -264,40 +286,46 @@ export class ProxyConfiguration {
password,
hostname,
port: port!,
proxyTier: tier,
};
}

/**
* Given a session identifier and a request / proxy tier, this function returns a new proxy URL based on the provided configuration options.
* @param _sessionId Session identifier
* @param options Options for the tiered proxy rotation
* @returns A string with a proxy URL.
* @returns An object with the proxy URL and the proxy tier used.
*/
protected _handleTieredUrl(_sessionId: string, options?: TieredProxyOptions): string {
protected _handleTieredUrl(_sessionId: string, options?: TieredProxyOptions): TieredProxy {
if (!this.tieredProxyUrls) throw new Error('Tiered proxy URLs are not set');

if (!options || (!options?.request && options?.proxyTier === undefined)) {
const allProxyUrls = this.tieredProxyUrls.flat();
return allProxyUrls[this.nextCustomUrlIndex++ % allProxyUrls.length];
return {
proxyUrl: allProxyUrls[this.nextCustomUrlIndex++ % allProxyUrls.length],
};
}

let tierPrediction = options.proxyTier!;

if (typeof tierPrediction !== 'number') {
tierPrediction = this.getProxyTier(options.request!)!;
tierPrediction = this.predictProxyTier(options.request!)!;
}

const proxyTier = this.tieredProxyUrls![tierPrediction];

return proxyTier[this.nextCustomUrlIndex++ % proxyTier.length];
return {
proxyUrl: proxyTier[this.nextCustomUrlIndex++ % proxyTier.length],
proxyTier: tierPrediction,
};
}

/**
* Given a `Request` object, this function returns the tier of the proxy that should be used for the request.
*
* This returns `null` if the `tieredProxyUrls` option is not set.
*/
getProxyTier(request: Request): number | null {
protected predictProxyTier(request: Request): number | null {
if (!this.tieredProxyUrls) return null;

const domain = new URL(request.url).hostname;
Expand Down Expand Up @@ -334,15 +362,18 @@ export class ProxyConfiguration {
* @return A string with a proxy URL, including authentication credentials and port number.
* For example, `http://bob:password123@proxy.example.com:8000`
*/
async newUrl(sessionId?: string | number, options?: TieredProxyOptions): Promise<string> {
async newUrl(sessionId?: string | number, options?: TieredProxyOptions): Promise<string | undefined> {
if (typeof sessionId === 'number') sessionId = `${sessionId}`;

if (this.newUrlFunction) {
return this._callNewUrlFunction(sessionId)!;
return (await this._callNewUrlFunction(sessionId, { request: options?.request }) ?? undefined);
}

if (this.tieredProxyUrls) {
return this._handleTieredUrl(sessionId ?? Math.random().toString().slice(2, 6), options);
return this._handleTieredUrl(
sessionId ?? cryptoRandomObjectId(6),
options,
).proxyUrl;
}

return this._handleCustomUrl(sessionId);
Expand Down Expand Up @@ -371,12 +402,12 @@ export class ProxyConfiguration {
/**
* Calls the custom newUrlFunction and checks format of its return value
*/
protected async _callNewUrlFunction(sessionId?: string) {
let proxyUrl: string;

protected async _callNewUrlFunction(sessionId?: string, options?: { request?: Request }) {
const proxyUrl = await this.newUrlFunction!(sessionId!, options);
try {
proxyUrl = await this.newUrlFunction!(sessionId!);
new URL(proxyUrl); // eslint-disable-line no-new
if (proxyUrl) {
new URL(proxyUrl); // eslint-disable-line no-new
}
return proxyUrl;
} catch (err) {
this._throwNewUrlFunctionInvalid(err as Error);
Expand Down
2 changes: 1 addition & 1 deletion test/core/proxy_configuration.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ describe('ProxyConfiguration', () => {
password: '',
port: '1111',
};
expect(await proxyConfiguration.newProxyInfo(sessionId)).toStrictEqual(proxyInfo);
expect(await proxyConfiguration.newProxyInfo(sessionId)).toEqual(proxyInfo);
});

test('should throw on invalid newUrlFunction', async () => {
Expand Down