From 449778ec47fb54f597bf4a8d7a4a986fc2f54a48 Mon Sep 17 00:00:00 2001 From: Jack Hedaya Date: Sun, 12 Feb 2023 20:19:46 +0000 Subject: [PATCH 1/9] added blacklist option to type with documentation --- packages/core/src/enqueue_links/enqueue_links.ts | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/packages/core/src/enqueue_links/enqueue_links.ts b/packages/core/src/enqueue_links/enqueue_links.ts index 3a2d6ab1d10d..0e688ac8a943 100644 --- a/packages/core/src/enqueue_links/enqueue_links.ts +++ b/packages/core/src/enqueue_links/enqueue_links.ts @@ -55,6 +55,18 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions { */ globs?: GlobInput[]; + /** + * An array of glob pattern strings or plain objects + * containing glob pattern strings matching URLs that will **never** be enqueued. + * + * The plain objects must include at least the `glob` property, which holds the glob pattern string. + * All remaining keys will be used as request options for the corresponding enqueued {@apilink Request} objects. + * + * The matching is always case-insensitive. + * If you need case-sensitive matching, use `regexps` property directly. + */ + blacklist?: GlobInput[]; + /** * An array of regular expressions or plain objects * containing regular expressions matching the URLs to be enqueued. From 7f969c1db02f0b8fff7cb6e066620931c06fc37e Mon Sep 17 00:00:00 2001 From: Jack Hedaya Date: Sun, 12 Feb 2023 21:16:33 +0000 Subject: [PATCH 2/9] add blacklist functionality --- .../core/src/enqueue_links/enqueue_links.ts | 12 ++++++++- packages/core/src/enqueue_links/shared.ts | 26 ++++++++++++++++++- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/packages/core/src/enqueue_links/enqueue_links.ts b/packages/core/src/enqueue_links/enqueue_links.ts index 0e688ac8a943..8630eb66cba0 100644 --- a/packages/core/src/enqueue_links/enqueue_links.ts +++ b/packages/core/src/enqueue_links/enqueue_links.ts @@ -251,6 +251,10 @@ export async function enqueueLinks(options: SetRequired new Request(typeof opts === 'string' ? { url: opts } : opts)); } + if (!blacklistPatternObjects || !blacklistPatternObjects.length) { + blacklistPatternObjects = []; + } + const requests: Request[] = []; for (const opts of requestOptions) { const urlToMatch = typeof opts === 'string' ? opts : opts.url; + let isBlacklisted = false; + for (const blacklistPatternObject of blacklistPatternObjects) { + const { regexp, glob } = blacklistPatternObject; + + if ( + (regexp && urlToMatch.match(regexp)) || // eslint-disable-line + (glob && minimatch(urlToMatch, glob, { nocase: true })) + ) { + // Skip this request as it's blacklisted + isBlacklisted = true; + break; + } + } + + if (isBlacklisted) continue; + for (const urlPatternObject of urlPatternObjects) { const { regexp, glob, ...requestRegExpOptions } = urlPatternObject; if ( From 226e5e4f1bb7cb8cc590219e8051187a896c329d Mon Sep 17 00:00:00 2001 From: Jack Hedaya Date: Sun, 12 Feb 2023 21:16:39 +0000 Subject: [PATCH 3/9] add blacklist tests --- ...ded-patterns-with-enqueue-strategy.test.ts | 20 ++++++++ test/core/enqueue_links/enqueue_links.test.ts | 47 +++++++++++++++++++ 2 files changed, 67 insertions(+) diff --git a/packages/core/test/enqueue_links/user-provided-patterns-with-enqueue-strategy.test.ts b/packages/core/test/enqueue_links/user-provided-patterns-with-enqueue-strategy.test.ts index 33651fca7ca6..4365477313f2 100644 --- a/packages/core/test/enqueue_links/user-provided-patterns-with-enqueue-strategy.test.ts +++ b/packages/core/test/enqueue_links/user-provided-patterns-with-enqueue-strategy.test.ts @@ -120,4 +120,24 @@ describe('enqueueLinks() - combining user patterns with enqueue strategies', () expect(enqueued[0].url).toBe('https://example.com/a/b/first'); expect(enqueued[1].url).toBe('https://example.com/a/b/third'); }); + + test('works with globs and blacklist', async () => { + const { enqueued, requestQueue } = getMockRequestQueue(); + + const globs = ['**/first']; + const blacklist = ['**/first']; + + await cheerioCrawlerEnqueueLinks({ + options: { + selector: '.click', + globs, + blacklist, + }, + $, + requestQueue, + originalRequestUrl: 'https://example.com', + }); + + expect(enqueued).toHaveLength(0); + }); }); diff --git a/test/core/enqueue_links/enqueue_links.test.ts b/test/core/enqueue_links/enqueue_links.test.ts index c3d3eb16a899..ef85daa5198d 100644 --- a/test/core/enqueue_links/enqueue_links.test.ts +++ b/test/core/enqueue_links/enqueue_links.test.ts @@ -194,6 +194,53 @@ describe('enqueueLinks()', () => { expect(enqueued[2].userData).toEqual({ label: 'COOL' }); }); + test('works with blacklist', async () => { + const enqueued: (Request | RequestOptions)[] = []; + const requestQueue = new RequestQueue({ id: 'xxx', client: apifyClient }); + + // @ts-expect-error Override method for testing + requestQueue.addRequests = async (request) => { + enqueued.push(...request); + }; + const globs = [ + 'https://example.com/**/*', + { glob: '?(http|https)://cool.com/', method: 'POST' as const }, + ]; + + const blacklist = ["**/first"] + + await browserCrawlerEnqueueLinks({ + options: { + selector: '.click', + label: 'COOL', + globs, + blacklist, + transformRequestFunction: (request) => { + if (request.url.match(/example\.com\/a\/b\/third/)) { + request.method = 'OPTIONS'; + } + return request; + }, + }, + page, + requestQueue, + originalRequestUrl: 'https://example.com', + }); + + expect(enqueued).toHaveLength(2); + + expect(enqueued[0].url).not.toBe('https://example.com/a/b/first') + expect(enqueued[1].url).not.toBe('https://example.com/a/b/first') + + expect(enqueued[0].url).toBe('https://example.com/a/b/third'); + expect(enqueued[0].method).toBe('OPTIONS'); + expect(enqueued[0].userData).toEqual({ label: 'COOL' }); + + expect(enqueued[1].url).toBe('http://cool.com/'); + expect(enqueued[1].method).toBe('POST'); + expect(enqueued[1].userData).toEqual({ label: 'COOL' }); + }); + test('works with pseudoUrls', async () => { const enqueued: (Request | RequestOptions)[] = []; const requestQueue = new RequestQueue({ id: 'xxx', client: apifyClient }); From 859847e40dc20f55910c0c7ae99c3a5fea4966e5 Mon Sep 17 00:00:00 2001 From: Jack Hedaya Date: Mon, 13 Feb 2023 10:22:36 -0500 Subject: [PATCH 4/9] Update test/core/enqueue_links/enqueue_links.test.ts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Martin Adámek --- test/core/enqueue_links/enqueue_links.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/core/enqueue_links/enqueue_links.test.ts b/test/core/enqueue_links/enqueue_links.test.ts index ef85daa5198d..654a7503420e 100644 --- a/test/core/enqueue_links/enqueue_links.test.ts +++ b/test/core/enqueue_links/enqueue_links.test.ts @@ -207,7 +207,7 @@ describe('enqueueLinks()', () => { { glob: '?(http|https)://cool.com/', method: 'POST' as const }, ]; - const blacklist = ["**/first"] + const blacklist = ['**/first']; await browserCrawlerEnqueueLinks({ options: { From 6930700d83b2e54df15f605fbf3ee4fd86e95866 Mon Sep 17 00:00:00 2001 From: Jack Hedaya Date: Mon, 13 Feb 2023 16:10:03 +0000 Subject: [PATCH 5/9] renamed `blacklist` to `exclude` --- .../core/src/enqueue_links/enqueue_links.ts | 14 +++++++------- packages/core/src/enqueue_links/shared.ts | 17 ++++++----------- ...vided-patterns-with-enqueue-strategy.test.ts | 6 +++--- test/core/enqueue_links/enqueue_links.test.ts | 10 +++++----- 4 files changed, 21 insertions(+), 26 deletions(-) diff --git a/packages/core/src/enqueue_links/enqueue_links.ts b/packages/core/src/enqueue_links/enqueue_links.ts index 8630eb66cba0..38538dd48b79 100644 --- a/packages/core/src/enqueue_links/enqueue_links.ts +++ b/packages/core/src/enqueue_links/enqueue_links.ts @@ -65,7 +65,7 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions { * The matching is always case-insensitive. * If you need case-sensitive matching, use `regexps` property directly. */ - blacklist?: GlobInput[]; + exclude?: GlobInput[]; /** * An array of regular expressions or plain objects @@ -251,7 +251,7 @@ export async function enqueueLinks(options: SetRequired new Request(typeof opts === 'string' ? { url: opts } : opts)); } - if (!blacklistPatternObjects || !blacklistPatternObjects.length) { - blacklistPatternObjects = []; - } - const requests: Request[] = []; for (const opts of requestOptions) { const urlToMatch = typeof opts === 'string' ? opts : opts.url; - let isBlacklisted = false; - for (const blacklistPatternObject of blacklistPatternObjects) { - const { regexp, glob } = blacklistPatternObject; + let isExcluded = false; + for (const excludePatternObject of excludePatternObjects) { + const { regexp, glob } = excludePatternObject; if ( (regexp && urlToMatch.match(regexp)) || // eslint-disable-line (glob && minimatch(urlToMatch, glob, { nocase: true })) ) { - // Skip this request as it's blacklisted - isBlacklisted = true; + isExcluded = true; break; } } - if (isBlacklisted) continue; + if (isExcluded) continue; for (const urlPatternObject of urlPatternObjects) { const { regexp, glob, ...requestRegExpOptions } = urlPatternObject; diff --git a/packages/core/test/enqueue_links/user-provided-patterns-with-enqueue-strategy.test.ts b/packages/core/test/enqueue_links/user-provided-patterns-with-enqueue-strategy.test.ts index 4365477313f2..e5f933f68e40 100644 --- a/packages/core/test/enqueue_links/user-provided-patterns-with-enqueue-strategy.test.ts +++ b/packages/core/test/enqueue_links/user-provided-patterns-with-enqueue-strategy.test.ts @@ -121,17 +121,17 @@ describe('enqueueLinks() - combining user patterns with enqueue strategies', () expect(enqueued[1].url).toBe('https://example.com/a/b/third'); }); - test('works with globs and blacklist', async () => { + test('works with globs and exclude', async () => { const { enqueued, requestQueue } = getMockRequestQueue(); const globs = ['**/first']; - const blacklist = ['**/first']; + const exclude = ['**/first']; await cheerioCrawlerEnqueueLinks({ options: { selector: '.click', globs, - blacklist, + exclude, }, $, requestQueue, diff --git a/test/core/enqueue_links/enqueue_links.test.ts b/test/core/enqueue_links/enqueue_links.test.ts index ef85daa5198d..05e71acd282e 100644 --- a/test/core/enqueue_links/enqueue_links.test.ts +++ b/test/core/enqueue_links/enqueue_links.test.ts @@ -194,7 +194,7 @@ describe('enqueueLinks()', () => { expect(enqueued[2].userData).toEqual({ label: 'COOL' }); }); - test('works with blacklist', async () => { + test('works with exclude', async () => { const enqueued: (Request | RequestOptions)[] = []; const requestQueue = new RequestQueue({ id: 'xxx', client: apifyClient }); @@ -207,14 +207,14 @@ describe('enqueueLinks()', () => { { glob: '?(http|https)://cool.com/', method: 'POST' as const }, ]; - const blacklist = ["**/first"] + const exclude = ['**/first']; await browserCrawlerEnqueueLinks({ options: { selector: '.click', label: 'COOL', globs, - blacklist, + exclude, transformRequestFunction: (request) => { if (request.url.match(/example\.com\/a\/b\/third/)) { request.method = 'OPTIONS'; @@ -229,8 +229,8 @@ describe('enqueueLinks()', () => { expect(enqueued).toHaveLength(2); - expect(enqueued[0].url).not.toBe('https://example.com/a/b/first') - expect(enqueued[1].url).not.toBe('https://example.com/a/b/first') + expect(enqueued[0].url).not.toBe('https://example.com/a/b/first'); + expect(enqueued[1].url).not.toBe('https://example.com/a/b/first'); expect(enqueued[0].url).toBe('https://example.com/a/b/third'); expect(enqueued[0].method).toBe('OPTIONS'); From 6c1e2cdf56988e3a9502f4df5254a78a834d70cd Mon Sep 17 00:00:00 2001 From: Jack Hedaya Date: Mon, 13 Feb 2023 16:20:12 +0000 Subject: [PATCH 6/9] added exclusion to branch of logic --- packages/core/src/enqueue_links/enqueue_links.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/core/src/enqueue_links/enqueue_links.ts b/packages/core/src/enqueue_links/enqueue_links.ts index 38538dd48b79..6d0be8b3d3fb 100644 --- a/packages/core/src/enqueue_links/enqueue_links.ts +++ b/packages/core/src/enqueue_links/enqueue_links.ts @@ -350,7 +350,7 @@ export async function enqueueLinks(options: SetRequired Date: Mon, 13 Feb 2023 16:32:41 +0000 Subject: [PATCH 7/9] add support for regex --- .../core/src/enqueue_links/enqueue_links.ts | 12 ++++- test/core/enqueue_links/enqueue_links.test.ts | 49 ++++++++++++++++++- 2 files changed, 58 insertions(+), 3 deletions(-) diff --git a/packages/core/src/enqueue_links/enqueue_links.ts b/packages/core/src/enqueue_links/enqueue_links.ts index 6d0be8b3d3fb..84c19df86153 100644 --- a/packages/core/src/enqueue_links/enqueue_links.ts +++ b/packages/core/src/enqueue_links/enqueue_links.ts @@ -65,7 +65,7 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions { * The matching is always case-insensitive. * If you need case-sensitive matching, use `regexps` property directly. */ - exclude?: GlobInput[]; + exclude?: (GlobInput | RegExpInput)[]; /** * An array of regular expressions or plain objects @@ -253,7 +253,9 @@ export async function enqueueLinks(options: SetRequired { expect(enqueued[2].userData).toEqual({ label: 'COOL' }); }); - test('works with exclude', async () => { + test('works with exclude glob', async () => { const enqueued: (Request | RequestOptions)[] = []; const requestQueue = new RequestQueue({ id: 'xxx', client: apifyClient }); @@ -241,6 +241,53 @@ describe('enqueueLinks()', () => { expect(enqueued[1].userData).toEqual({ label: 'COOL' }); }); + test('works with exclude regexp', async () => { + const enqueued: (Request | RequestOptions)[] = []; + const requestQueue = new RequestQueue({ id: 'xxx', client: apifyClient }); + + // @ts-expect-error Override method for testing + requestQueue.addRequests = async (request) => { + enqueued.push(...request); + }; + const globs = [ + 'https://example.com/**/*', + { glob: '?(http|https)://cool.com/', method: 'POST' as const }, + ]; + + const exclude = [/first/]; + + await browserCrawlerEnqueueLinks({ + options: { + selector: '.click', + label: 'COOL', + globs, + exclude, + transformRequestFunction: (request) => { + if (request.url.match(/example\.com\/a\/b\/third/)) { + request.method = 'OPTIONS'; + } + return request; + }, + }, + page, + requestQueue, + originalRequestUrl: 'https://example.com', + }); + + expect(enqueued).toHaveLength(2); + + expect(enqueued[0].url).not.toBe('https://example.com/a/b/first'); + expect(enqueued[1].url).not.toBe('https://example.com/a/b/first'); + + expect(enqueued[0].url).toBe('https://example.com/a/b/third'); + expect(enqueued[0].method).toBe('OPTIONS'); + expect(enqueued[0].userData).toEqual({ label: 'COOL' }); + + expect(enqueued[1].url).toBe('http://cool.com/'); + expect(enqueued[1].method).toBe('POST'); + expect(enqueued[1].userData).toEqual({ label: 'COOL' }); + }); + test('works with pseudoUrls', async () => { const enqueued: (Request | RequestOptions)[] = []; const requestQueue = new RequestQueue({ id: 'xxx', client: apifyClient }); From 9e1ee21b10f52cdb4c3d97aa81084789f5f7fe06 Mon Sep 17 00:00:00 2001 From: Jack Hedaya Date: Mon, 13 Feb 2023 16:40:58 +0000 Subject: [PATCH 8/9] update docs --- packages/core/src/enqueue_links/enqueue_links.ts | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/packages/core/src/enqueue_links/enqueue_links.ts b/packages/core/src/enqueue_links/enqueue_links.ts index 84c19df86153..d3701d32b9bc 100644 --- a/packages/core/src/enqueue_links/enqueue_links.ts +++ b/packages/core/src/enqueue_links/enqueue_links.ts @@ -56,14 +56,14 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions { globs?: GlobInput[]; /** - * An array of glob pattern strings or plain objects - * containing glob pattern strings matching URLs that will **never** be enqueued. + * An array of glob pattern strings, regexp patterns or plain objects + * containing patterns matching URLs that will **never** be enqueued. * - * The plain objects must include at least the `glob` property, which holds the glob pattern string. + * The plain objects must include either the `glob` property or the `regexp` property. * All remaining keys will be used as request options for the corresponding enqueued {@apilink Request} objects. * - * The matching is always case-insensitive. - * If you need case-sensitive matching, use `regexps` property directly. + * Glob matching is always case-insensitive. + * If you need case-sensitive matching, provide a regexp. */ exclude?: (GlobInput | RegExpInput)[]; From 4f47a463a889e2e0c3184bbce7749b893faad956 Mon Sep 17 00:00:00 2001 From: Jack Hedaya Date: Mon, 13 Feb 2023 17:05:30 +0000 Subject: [PATCH 9/9] fix weak check --- packages/core/src/enqueue_links/enqueue_links.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/core/src/enqueue_links/enqueue_links.ts b/packages/core/src/enqueue_links/enqueue_links.ts index d3701d32b9bc..4504f70c0f9a 100644 --- a/packages/core/src/enqueue_links/enqueue_links.ts +++ b/packages/core/src/enqueue_links/enqueue_links.ts @@ -284,7 +284,7 @@ export async function enqueueLinks(options: SetRequired