Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(core): add exclude option to enqueueLinks #1786

Merged
merged 11 commits into from
Mar 9, 2023
34 changes: 32 additions & 2 deletions packages/core/src/enqueue_links/enqueue_links.ts
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,18 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
*/
globs?: GlobInput[];

/**
* An array of glob pattern strings, regular expressions, or plain objects
* containing glob pattern strings or regular expressions matching URLs that will **never** be enqueued.
*
* The plain objects must include either the `glob` property, which holds the glob pattern string,
* or the `regexp` property, which holds the regular expression.
* Only the pattern itself is used for exclusion; any remaining keys are ignored.
*
* Glob matching is always case-insensitive.
* If you need case-sensitive matching, pass a regular expression instead.
*/
exclude?: (GlobInput | RegExpInput)[];

/**
* An array of regular expressions or plain objects
* containing regular expressions matching the URLs to be enqueued.
Expand Down Expand Up @@ -239,6 +251,12 @@ export async function enqueueLinks(options: SetRequired<EnqueueLinksOptions, 're
ow.string,
ow.object.hasKeys('glob'),
)),
exclude: ow.optional.array.ofType(ow.any(
ow.string,
ow.regExp,
ow.object.hasKeys('glob'),
ow.object.hasKeys('regexp'),
)),
regexps: ow.optional.array.ofType(ow.any(
ow.regExp,
ow.object.hasKeys('regexp'),
Expand All @@ -252,14 +270,26 @@ export async function enqueueLinks(options: SetRequired<EnqueueLinksOptions, 're
limit,
urls,
pseudoUrls,
exclude,
globs,
regexps,
transformRequestFunction,
forefront,
} = options;

const urlExcludePatternObjects: UrlPatternObject[] = [];
const urlPatternObjects: UrlPatternObject[] = [];

// Normalize every `exclude` entry into a pattern object, routed by the kind of pattern it carries.
if (exclude?.length) {
    for (const excl of exclude) {
        if (typeof excl === 'string' || 'glob' in excl) {
            // Plain glob string, or an object carrying a `glob` key.
            urlExcludePatternObjects.push(...constructGlobObjectsFromGlobs([excl]));
        } else {
            // The `ow` validation of `exclude` only admits strings, RegExps and objects with a
            // `glob` or `regexp` key, so whatever is not a glob input is a regexp input.
            // A `typeof excl === typeof /$/` guard adds nothing here: `typeof /$/` is 'object',
            // which holds for every non-string entry.
            urlExcludePatternObjects.push(...constructRegExpObjectsFromRegExps([excl]));
        }
    }
}

if (pseudoUrls?.length) {
log.deprecated('`pseudoUrls` option is deprecated, use `globs` or `regexps` instead');
urlPatternObjects.push(...constructRegExpObjectsFromPseudoUrls(pseudoUrls));
Expand Down Expand Up @@ -328,11 +358,11 @@ export async function enqueueLinks(options: SetRequired<EnqueueLinksOptions, 're
/**
 * Builds the requests to enqueue from `requestOptions`.
 *
 * The `exclude` patterns are forwarded to `createRequests` on both paths. When user patterns exist,
 * the result is additionally filtered by the enqueue strategy patterns.
 *
 * NOTE(review): `createRequests` returns every request untouched when its pattern list is empty —
 * confirm `exclude` is still honored if `enqueueStrategyPatterns` can be empty.
 */
function createFilteredRequests() {
    // No user provided patterns means we can skip an extra filtering step
    if (urlPatternObjects.length === 0) {
        return createRequests(requestOptions, enqueueStrategyPatterns, urlExcludePatternObjects);
    }

    // Generate requests based on the user patterns first
    const generatedRequestsFromUserFilters = createRequests(requestOptions, urlPatternObjects, urlExcludePatternObjects);
    // ...then filter them by the enqueue links strategy (making this an AND check)
    return filterRequestsByPatterns(generatedRequestsFromUserFilters, enqueueStrategyPatterns);
}
Expand Down
21 changes: 20 additions & 1 deletion packages/core/src/enqueue_links/shared.ts
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,11 @@ export function constructRegExpObjectsFromRegExps(regexps: RegExpInput[]): RegEx
/**
* @ignore
*/
export function createRequests(requestOptions: (string | RequestOptions)[], urlPatternObjects?: UrlPatternObject[]): Request[] {
export function createRequests(
requestOptions: (string | RequestOptions)[],
urlPatternObjects?: UrlPatternObject[],
excludePatternObjects: UrlPatternObject[] = [],
): Request[] {
if (!urlPatternObjects || !urlPatternObjects.length) {
return requestOptions
.map((opts) => new Request(typeof opts === 'string' ? { url: opts } : opts));
Expand All @@ -137,6 +141,21 @@ export function createRequests(requestOptions: (string | RequestOptions)[], urlP
for (const opts of requestOptions) {
const urlToMatch = typeof opts === 'string' ? opts : opts.url;

let isExcluded = false;
for (const excludePatternObject of excludePatternObjects) {
const { regexp, glob } = excludePatternObject;

if (
(regexp && urlToMatch.match(regexp)) || // eslint-disable-line
(glob && minimatch(urlToMatch, glob, { nocase: true }))
) {
isExcluded = true;
break;
}
}

if (isExcluded) continue;

for (const urlPatternObject of urlPatternObjects) {
const { regexp, glob, ...requestRegExpOptions } = urlPatternObject;
if (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -120,4 +120,24 @@ describe('enqueueLinks() - combining user patterns with enqueue strategies', ()
expect(enqueued[0].url).toBe('https://example.com/a/b/first');
expect(enqueued[1].url).toBe('https://example.com/a/b/third');
});

test('works with globs and exclude', async () => {
    const { enqueued, requestQueue } = getMockRequestQueue();

    // The very same pattern is supplied both as an allowed glob and as an exclusion.
    const sharedPattern = '**/first';

    await cheerioCrawlerEnqueueLinks({
        options: {
            selector: '.click',
            globs: [sharedPattern],
            exclude: [sharedPattern],
        },
        $,
        requestQueue,
        originalRequestUrl: 'https://example.com',
    });

    // Nothing may be enqueued: the exclusion takes precedence over the matching glob.
    expect(enqueued).toHaveLength(0);
});
});
94 changes: 94 additions & 0 deletions test/core/enqueue_links/enqueue_links.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,100 @@ describe('enqueueLinks()', () => {
expect(enqueued[2].userData).toEqual({ label: 'COOL' });
});

test('works with exclude glob', async () => {
    // Requests handed to the queue are captured here so they can be inspected below.
    const captured: (Request | RequestOptions)[] = [];
    const queue = new RequestQueue({ id: 'xxx', client: apifyClient });

    // @ts-expect-error Override method for testing
    queue.addRequests = async (batch) => {
        captured.push(...batch);
    };

    await browserCrawlerEnqueueLinks({
        options: {
            selector: '.click',
            label: 'COOL',
            globs: [
                'https://example.com/**/*',
                { glob: '?(http|https)://cool.com/', method: 'POST' as const },
            ],
            // Glob-based exclusion under test.
            exclude: ['**/first'],
            transformRequestFunction: (req) => {
                if (/example\.com\/a\/b\/third/.test(req.url)) {
                    req.method = 'OPTIONS';
                }
                return req;
            },
        },
        page,
        requestQueue: queue,
        originalRequestUrl: 'https://example.com',
    });

    expect(captured).toHaveLength(2);

    // The excluded URL must not appear in either captured slot.
    const [thirdRequest, coolRequest] = captured;
    for (const req of [thirdRequest, coolRequest]) {
        expect(req.url).not.toBe('https://example.com/a/b/first');
    }

    expect(thirdRequest.url).toBe('https://example.com/a/b/third');
    expect(thirdRequest.method).toBe('OPTIONS');
    expect(thirdRequest.userData).toEqual({ label: 'COOL' });

    expect(coolRequest.url).toBe('http://cool.com/');
    expect(coolRequest.method).toBe('POST');
    expect(coolRequest.userData).toEqual({ label: 'COOL' });
});

test('works with exclude regexp', async () => {
    // Requests handed to the queue are captured here so they can be inspected below.
    const captured: (Request | RequestOptions)[] = [];
    const queue = new RequestQueue({ id: 'xxx', client: apifyClient });

    // @ts-expect-error Override method for testing
    queue.addRequests = async (batch) => {
        captured.push(...batch);
    };

    await browserCrawlerEnqueueLinks({
        options: {
            selector: '.click',
            label: 'COOL',
            globs: [
                'https://example.com/**/*',
                { glob: '?(http|https)://cool.com/', method: 'POST' as const },
            ],
            // Regular-expression-based exclusion under test.
            exclude: [/first/],
            transformRequestFunction: (req) => {
                if (/example\.com\/a\/b\/third/.test(req.url)) {
                    req.method = 'OPTIONS';
                }
                return req;
            },
        },
        page,
        requestQueue: queue,
        originalRequestUrl: 'https://example.com',
    });

    expect(captured).toHaveLength(2);

    // The excluded URL must not appear in either captured slot.
    const [thirdRequest, coolRequest] = captured;
    for (const req of [thirdRequest, coolRequest]) {
        expect(req.url).not.toBe('https://example.com/a/b/first');
    }

    expect(thirdRequest.url).toBe('https://example.com/a/b/third');
    expect(thirdRequest.method).toBe('OPTIONS');
    expect(thirdRequest.userData).toEqual({ label: 'COOL' });

    expect(coolRequest.url).toBe('http://cool.com/');
    expect(coolRequest.method).toBe('POST');
    expect(coolRequest.userData).toEqual({ label: 'COOL' });
});

test('works with pseudoUrls', async () => {
const enqueued: (Request | RequestOptions)[] = [];
const requestQueue = new RequestQueue({ id: 'xxx', client: apifyClient });
Expand Down