Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add utils.playwright.blockRequests() #1447

Merged
merged 1 commit into from
Aug 8, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ const log = log_.child({ prefix: 'Playwright Utils' });
// Resolved filesystem path of the `jquery` module, looked up once at module load.
const jqueryPath = require.resolve('jquery');

// NOTE(review): presumably the maximum number of entries kept in the injected-file cache; its use is not visible in this hunk — confirm.
const MAX_INJECT_FILE_CACHE_SIZE = 10;
// URL patterns that `blockRequests()` blocks when the caller does not pass `urlPatterns`.
const DEFAULT_BLOCK_REQUEST_URL_PATTERNS = ['.css', '.jpg', '.jpeg', '.png', '.svg', '.gif', '.woff', '.pdf', '.zip'];

export interface InjectFileOptions {
/**
Expand All @@ -44,6 +45,21 @@ export interface InjectFileOptions {
surviveNavigations?: boolean;
}

/**
* Options accepted by `blockRequests()`.
* The final list of blocked patterns is `urlPatterns` followed by `extraUrlPatterns`.
*/
export interface BlockRequestsOptions {
/**
* The patterns of URLs to block from being loaded by the browser.
* Only `*` can be used as a wildcard. It is also automatically added to the beginning
* and end of the pattern. This limitation is enforced by the DevTools protocol.
* `.png` is the same as `*.png*`.
*
* When omitted, `blockRequests()` falls back to its built-in default pattern list.
* When provided, it replaces that default list entirely.
*/
urlPatterns?: string[];

/**
* Additional patterns to block on top of `urlPatterns`
* (or on top of the defaults when `urlPatterns` is omitted).
* Use this property if you just want to append to the default blocked patterns.
*/
extraUrlPatterns?: string[];
}

/**
* Cache contents of previously injected files to limit file system access.
*/
Expand Down Expand Up @@ -192,6 +208,68 @@ export async function gotoExtended(page: Page, request: Request, gotoOptions: Di
return page.goto(url, gotoOptions);
}

/**
 * Forces the Playwright browser tab to block loading URLs that match a provided pattern.
 * This is useful to speed up crawling of websites, since it reduces the amount
 * of data that needs to be downloaded from the web, but it may break some websites
 * or unexpectedly prevent loading of resources.
 *
 * By default, the function will block all URLs including the following patterns:
 *
 * ```json
 * [".css", ".jpg", ".jpeg", ".png", ".svg", ".gif", ".woff", ".pdf", ".zip"]
 * ```
 *
 * If you want to extend this list further, use the `extraUrlPatterns` option,
 * which will keep blocking the default patterns, as well as add your custom ones.
 * If you would like to block only specific patterns, use the `urlPatterns` option,
 * which will override the defaults and block only URLs with your custom patterns.
 *
 * This function does not use Playwright's request interception and therefore does not interfere
 * with browser cache. It's also faster than blocking requests using interception,
 * because the blocking happens directly in the browser without the round-trip to Node.js,
 * but it does not provide the extra benefits of request interception.
 *
 * The function will never block main document loads and their respective redirects.
 *
 * **Browser support:** the blocking is configured through a Chrome DevTools Protocol (CDP)
 * session, which Playwright only provides for Chromium-based browsers. When the CDP session
 * cannot be created or configured (e.g. on Firefox or WebKit), nothing is blocked,
 * a warning is logged and the returned promise still resolves, so a crawl is not aborted
 * just because this optimization is unavailable.
 *
 * **Example usage**
 * ```javascript
 * import { launchPlaywright, playwrightUtils } from 'crawlee';
 *
 * const browser = await launchPlaywright();
 * const page = await browser.newPage();
 *
 * // Block all requests to URLs that include `adsbygoogle.js` and also all defaults.
 * await playwrightUtils.blockRequests(page, {
 *    extraUrlPatterns: ['adsbygoogle.js'],
 * });
 *
 * await page.goto('https://cnn.com');
 * ```
 *
 * @param page Playwright [`Page`](https://playwright.dev/docs/api/class-page) object.
 * @param [options] Patterns to block; see {@link BlockRequestsOptions}.
 * @throws If `page` or `options` fail argument validation.
 */
export async function blockRequests(page: Page, options: BlockRequestsOptions = {}): Promise<void> {
    // Argument validation stays outside of the try/catch below: misuse must still fail loudly.
    ow(page, ow.object.validate(validators.browserPage));
    ow(options, ow.object.exactShape({
        urlPatterns: ow.optional.array.ofType(ow.string),
        extraUrlPatterns: ow.optional.array.ofType(ow.string),
    }));

    const {
        urlPatterns = DEFAULT_BLOCK_REQUEST_URL_PATTERNS,
        extraUrlPatterns = [],
    } = options;

    const patternsToBlock = [...urlPatterns, ...extraUrlPatterns];

    try {
        // The session is intentionally left attached; it only exists to hold the blocking rules.
        const client = await page.context().newCDPSession(page);

        await client.send('Network.enable');
        await client.send('Network.setBlockedURLs', { urls: patternsToBlock });
    } catch (error) {
        // CDP is Chromium-only, so on other browsers we degrade to "no blocking" instead of failing the caller.
        log.warning('blockRequests() helper is incompatible with non-Chromium browsers.', {
            error: error instanceof Error ? error.message : String(error),
        });
    }
}

/**
* Returns Cheerio handle for `page.content()`, allowing to work with the data same way as with {@link CheerioCrawler}.
*
Expand All @@ -212,12 +290,14 @@ export async function parseWithCheerio(page: Page): Promise<CheerioRoot> {
/**
* Helper methods that `registerUtilsToContext()` attaches to a Playwright crawling context.
* Each method forwards to the standalone utility of the same name, passing the context's `page`
* as the first argument.
*/
export interface PlaywrightContextUtils {
/** Calls `injectFile()` with the context's page, the given file path and options. */
injectFile(filePath: string, options?: InjectFileOptions): Promise<unknown>;
/** Calls `injectJQuery()` with the context's page. */
injectJQuery(): Promise<unknown>;
/** Calls `blockRequests()` with the context's page and the given options. */
blockRequests(options?: BlockRequestsOptions): Promise<void>;
/** Calls `parseWithCheerio()` with the context's page. */
parseWithCheerio(): Promise<CheerioRoot>;
}

/**
 * Attaches page-bound shortcuts for the Playwright utilities to the given crawling context.
 *
 * Every shortcut reads `context.page` at call time (not at registration time) and forwards
 * it, together with the caller's arguments, to the standalone utility of the same name.
 *
 * @param context Crawling context that receives the helper methods; it is mutated in place.
 */
export function registerUtilsToContext(context: PlaywrightCrawlingContext): void {
    // Collect the bound helpers first, then copy them onto the context in declaration order.
    const boundUtils = {
        injectFile: (filePath: string, options?: InjectFileOptions) => {
            return injectFile(context.page, filePath, options);
        },
        injectJQuery: () => {
            return injectJQuery(context.page);
        },
        blockRequests: (options?: BlockRequestsOptions) => {
            return blockRequests(context.page, options);
        },
        parseWithCheerio: () => {
            return parseWithCheerio(context.page);
        },
    };

    Object.assign(context, boundUtils);
}

Expand All @@ -226,5 +306,6 @@ export const playwrightUtils = {
injectFile,
injectJQuery,
gotoExtended,
blockRequests,
parseWithCheerio,
};
46 changes: 46 additions & 0 deletions test/core/playwright_utils.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,52 @@ describe('playwrightUtils', () => {
}
});

describe('blockRequests()', () => {
    // Fixture referencing one stylesheet, two images and one script, so that each test
    // can tell exactly which resource types were let through by the blocking rules.
    const html = `<html><body>
        <link rel="stylesheet" type="text/css" href="https://example.com/style.css">
        <img src="https://example.com/image.png">
        <img src="https://example.com/image.gif">
        <script src="https://example.com/script.js" defer="defer"></script>
    </body></html>`;

    let browser: Browser = null;
    beforeAll(async () => {
        browser = await launchName(launchContext);
    });
    afterAll(async () => {
        await browser.close();
    });

    /**
     * Opens a fresh page, applies `blockRequests()` with the given options, renders the fixture
     * and returns the URLs of all responses the page received. The page is always closed afterwards.
     */
    const collectLoadedUrls = async (
        options?: Parameters<typeof playwrightUtils.blockRequests>[1],
    ): Promise<string[]> => {
        const loadedUrls: string[] = [];
        const page = await browser.newPage();

        try {
            await playwrightUtils.blockRequests(page, options);
            page.on('response', (response) => loadedUrls.push(response.url()));
            await page.setContent(html, { waitUntil: 'load' });
        } finally {
            await page.close();
        }

        return loadedUrls;
    };

    test('works with default values', async () => {
        const loadedUrls = await collectLoadedUrls();

        expect(loadedUrls).toEqual(['https://example.com/script.js']);
    });

    test('works with overridden values', async () => {
        const loadedUrls = await collectLoadedUrls({
            urlPatterns: ['.css'],
        });

        // Resource load order is not deterministic, hence `arrayContaining`.
        expect(loadedUrls).toEqual(expect.arrayContaining([
            'https://example.com/image.png',
            'https://example.com/script.js',
            'https://example.com/image.gif',
        ]));
        // Without this check the test would also pass if nothing was blocked at all.
        expect(loadedUrls).not.toContain('https://example.com/style.css');
    });
});

test('gotoExtended() works', async () => {
const browser = await chromium.launch({ headless: true });

Expand Down