Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: tieredProxyUrls for ProxyConfiguration #2348

Merged
Merged (14 commits) on Mar 25, 2024
4 changes: 2 additions & 2 deletions packages/basic-crawler/src/internals/basic-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1148,7 +1148,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
this.log.debug(`Adding request ${request.url} (${request.id}) back to the queue`);
// eslint-disable-next-line dot-notation
source['inProgress'].add(request.id!);
await source.reclaimRequest(request);
await source.reclaimRequest(request, { forefront: request.userData?.__crawlee?.forefront });
}, delay);

return true;
Expand Down Expand Up @@ -1384,7 +1384,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
{ id, url, retryCount },
);

await source.reclaimRequest(request);
await source.reclaimRequest(request, { forefront: request.userData?.__crawlee?.forefront });
return;
}
}
Expand Down
41 changes: 24 additions & 17 deletions packages/browser-crawler/src/internals/browser-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -492,22 +492,26 @@ export abstract class BrowserCrawler<
const useIncognitoPages = this.launchContext?.useIncognitoPages;
const experimentalContainers = this.launchContext?.experimentalContainers;

if (this.proxyConfiguration && (useIncognitoPages || experimentalContainers)) {
const { session } = crawlingContext;

const proxyInfo = await this.proxyConfiguration.newProxyInfo(session?.id);
crawlingContext.proxyInfo = proxyInfo;

newPageOptions.proxyUrl = proxyInfo.url;

if (this.proxyConfiguration.isManInTheMiddle) {
/**
* @see https://playwright.dev/docs/api/class-browser/#browser-new-context
* @see https://github.com/puppeteer/puppeteer/blob/main/docs/api.md
*/
newPageOptions.pageOptions = {
ignoreHTTPSErrors: true,
};
if (this.proxyConfiguration) {
if (useIncognitoPages || experimentalContainers) {
const { session } = crawlingContext;

const proxyInfo = await this.proxyConfiguration.newProxyInfo(session?.id, { request: crawlingContext.request });
crawlingContext.proxyInfo = proxyInfo;

newPageOptions.proxyUrl = proxyInfo.url;

if (this.proxyConfiguration.isManInTheMiddle) {
/**
* @see https://playwright.dev/docs/api/class-browser/#browser-new-context
* @see https://github.com/puppeteer/puppeteer/blob/main/docs/api.md
*/
newPageOptions.pageOptions = {
ignoreHTTPSErrors: true,
};
}
} else {
newPageOptions.proxyTier = this.proxyConfiguration.getProxyTier(crawlingContext.request);
}
}

Expand Down Expand Up @@ -702,7 +706,10 @@ export abstract class BrowserCrawler<
}

if (this.proxyConfiguration) {
const proxyInfo = await this.proxyConfiguration.newProxyInfo(launchContextExtends.session?.id);
const proxyInfo = await this.proxyConfiguration.newProxyInfo(
launchContextExtends.session?.id,
{ proxyTier: (launchContext.proxyTier as number) ?? undefined },
);
launchContext.proxyUrl = proxyInfo.url;
launchContextExtends.proxyInfo = proxyInfo;

Expand Down
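6 changes: 6 additions & 0 deletions packages/browser-pool/src/abstract-classes/browser-controller.ts
Original file line number Diff line number Diff line change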
Expand Up @@ -55,6 +55,12 @@ export abstract class BrowserController<
*/
launchContext: LaunchContext<Library, LibraryOptions, LaunchResult, NewPageOptions, NewPageResult> = undefined!;

/**
* The proxy tier tied to this browser controller.
* `undefined` if no tiered proxy is used.
*/
proxyTier?: number;

isActive = false;

activePages = 0;
Expand Down
2 changes: 2 additions & 0 deletions packages/browser-pool/src/abstract-classes/browser-plugin.ts
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ export abstract class BrowserPlugin<
useIncognitoPages = this.useIncognitoPages,
userDataDir = this.userDataDir,
experimentalContainers = this.experimentalContainers,
proxyTier,
} = options;

return new LaunchContext({
Expand All @@ -155,6 +156,7 @@ export abstract class BrowserPlugin<
useIncognitoPages,
experimentalContainers,
userDataDir,
proxyTier,
});
}

Expand Down
25 changes: 16 additions & 9 deletions packages/browser-pool/src/browser-pool.ts
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,7 @@ export class BrowserPool<
pageOptions,
browserPlugin = this._pickBrowserPlugin(),
proxyUrl,
proxyTier,
} = options;

if (this.pages.has(id)) {
Expand All @@ -403,8 +404,8 @@ export class BrowserPool<

// Limiter is necessary - https://github.com/apify/crawlee/issues/1126
return this.limiter(async () => {
let browserController = this._pickBrowserWithFreeCapacity(browserPlugin);
if (!browserController) browserController = await this._launchBrowser(id, { browserPlugin });
let browserController = this._pickBrowserWithFreeCapacity(browserPlugin, { proxyTier });
if (!browserController) browserController = await this._launchBrowser(id, { browserPlugin, proxyTier });
tryCancel();

return this._createPageForBrowser(id, browserController, pageOptions, proxyUrl);
Expand Down Expand Up @@ -632,6 +633,7 @@ export class BrowserPool<
const {
browserPlugin,
launchOptions,
proxyTier,
} = options;

const browserController = browserPlugin.createController() as BrowserControllerReturn;
Expand All @@ -640,6 +642,7 @@ export class BrowserPool<
const launchContext = browserPlugin.createLaunchContext({
id: pageId,
launchOptions,
proxyTier,
});

try {
Expand All @@ -656,6 +659,7 @@ export class BrowserPool<
}

log.debug('Launched new browser.', { id: browserController.id });
browserController.proxyTier = proxyTier;

try {
// If the launch fails on the post-launch hooks, we need to clean up
Expand Down Expand Up @@ -690,15 +694,13 @@ export class BrowserPool<
return this.browserPlugins[pluginIndex];
}

private _pickBrowserWithFreeCapacity(browserPlugin: BrowserPlugin) {
for (const controller of this.activeBrowserControllers) {
private _pickBrowserWithFreeCapacity(browserPlugin: BrowserPlugin, options?: { proxyTier?: number }) {
return [...this.activeBrowserControllers].find((controller) => {
const hasCapacity = controller.activePages < this.maxOpenPagesPerBrowser;
const isCorrectPlugin = controller.browserPlugin === browserPlugin;
if (hasCapacity && isCorrectPlugin) {
return controller;
}
}
return undefined;

return hasCapacity && isCorrectPlugin && (typeof options?.proxyTier !== 'number' || controller.proxyTier === options.proxyTier);
});
}

private async _closeInactiveRetiredBrowsers() {
Expand Down Expand Up @@ -821,6 +823,10 @@ export interface BrowserPoolNewPageOptions<PageOptions, BP extends BrowserPlugin
* Proxy URL.
*/
proxyUrl?: string;
/**
* Proxy tier.
*/
proxyTier?: number;
}

export interface BrowserPoolNewPageInNewBrowserOptions<PageOptions, BP extends BrowserPlugin> {
Expand Down Expand Up @@ -856,4 +862,5 @@ export interface BrowserPoolNewPageInNewBrowserOptions<PageOptions, BP extends B
/**
 * Options for the pool's internal browser-launch step
 * (presumably the `_launchBrowser` method — confirm against the full file).
 */
interface InternalLaunchBrowserOptions<BP extends BrowserPlugin> {
/** Plugin used to create the browser controller and its launch context. */
browserPlugin: BP;
/** Optional launch options forwarded to the plugin's `createLaunchContext`. */
launchOptions?: BP['launchOptions'];
/**
 * Optional proxy tier. Forwarded to the launch context and recorded on the
 * launched browser controller; left `undefined` when the caller passes none.
 */
proxyTier?: number;
}
4 changes: 4 additions & 0 deletions packages/browser-pool/src/launch-context.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ export interface LaunchContextOptions<
*/
userDataDir?: string;
proxyUrl?: string;
proxyTier?: number;
}

export class LaunchContext<
Expand All @@ -65,6 +66,7 @@ export class LaunchContext<
useIncognitoPages: boolean;
experimentalContainers: boolean;
userDataDir: string;
proxyTier?: number;

private _proxyUrl?: string;
private readonly _reservedFieldNames = [...Reflect.ownKeys(this), 'extend'];
Expand All @@ -81,6 +83,7 @@ export class LaunchContext<
useIncognitoPages,
experimentalContainers,
userDataDir = '',
proxyTier,
} = options;

this.id = id;
Expand All @@ -89,6 +92,7 @@ export class LaunchContext<
this.useIncognitoPages = useIncognitoPages ?? false;
this.experimentalContainers = experimentalContainers ?? false;
this.userDataDir = userDataDir;
this.proxyTier = proxyTier;

this._proxyUrl = proxyUrl;
}
Expand Down
Loading