From 2a0a6cb94b24fa84b48c13fd4a57f62f3c74acaa Mon Sep 17 00:00:00 2001 From: Adam Chapman Date: Wed, 16 Oct 2024 08:20:17 +1100 Subject: [PATCH 1/8] test(exclusions): unit tests --- src/tests/test.js | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/src/tests/test.js b/src/tests/test.js index 7c4d0ca..910820c 100644 --- a/src/tests/test.js +++ b/src/tests/test.js @@ -263,4 +263,45 @@ describe('Sitemapper', function () { }); }); }); + + describe('isNotExcluded method', function () { + it('should return true when no exclusions are set', function () { + const result = sitemapper.isNotExcluded('https://foo.com/page1'); + result.should.be.true(); + }); + + it('should return true when url does not match any exclusion patterns', function () { + sitemapper.exclusions = [/\.pdf$/, /private/]; + const result = sitemapper.isNotExcluded('https://foo.com/page1'); + result.should.be.true(); + }); + + it('should return false when url matches an exclusion pattern', function () { + sitemapper.exclusions = [/\.pdf$/, /private/]; + const result = sitemapper.isNotExcluded('https://foo.com/document.pdf'); + result.should.be.false(); + }); + + it('should return false when url matches any of multiple exclusion patterns', function () { + sitemapper.exclusions = [/\.pdf$/, /private/, /temp/]; + const result = sitemapper.isNotExcluded('https://foo.com/private/temp.html'); + result.should.be.false(); + }); + + it('should handle complex regex patterns correctly', function () { + sitemapper.exclusions = [/^https:\/\/foo\.com\/([a-z]{2})\/private/] + const result1 = sitemapper.isNotExcluded('https://foo.com/en/private/page'); + const result2 = sitemapper.isNotExcluded('https://foo.com/en/public/page'); + result1.should.be.false(); + result2.should.be.true(); + }); + + it('should handle case sensitivity correctly', function () { + sitemapper.exclusions = [/private/i]; + const result1 = sitemapper.isNotExcluded('https://foo.com/PRIVATE/page'); + const result2 = sitemapper.isNotExcluded('https://foo.com/Private/page'); + result1.should.be.false(); + result2.should.be.false(); + }); + }); }); From 1eb7c607d595374e5a63697a34d82cbfe87a1f1e Mon Sep 17 00:00:00 2001 From: Adam Chapman Date: Wed, 16 Oct 2024 08:20:39 +1100 Subject: [PATCH 2/8] feat(exclusions): type --- sitemapper.d.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/sitemapper.d.ts b/sitemapper.d.ts index 67dc261..6e6105b 100644 --- a/sitemapper.d.ts +++ b/sitemapper.d.ts @@ -20,6 +20,7 @@ export interface SitemapperOptions { timeout?: number; url?: string; fields?: {[name: string]: boolean}; + exclusions?: RegExp[]; } declare class Sitemapper { From 61dfa447d822b9a2b5768f17c8b5cf6ce9f157fd Mon Sep 17 00:00:00 2001 From: Adam Chapman Date: Wed, 16 Oct 2024 08:21:07 +1100 Subject: [PATCH 3/8] feat(exclusions): implement isNotExcluded --- src/assets/sitemapper.js | 285 ++++++++++++++++++++------------------- 1 file changed, 150 insertions(+), 135 deletions(-) diff --git a/src/assets/sitemapper.js b/src/assets/sitemapper.js index 811f443..1a8a9d3 100644 --- a/src/assets/sitemapper.js +++ b/src/assets/sitemapper.js @@ -28,11 +28,13 @@ export default class Sitemapper { * @params {boolean} [options.rejectUnauthorized] - If true (default), it will throw on invalid certificates, such as expired or self-signed ones. 
* @params {lastmod} [options.lastmod] - the minimum lastmod value for urls * @params {hpagent.HttpProxyAgent|hpagent.HttpsProxyAgent} [options.proxyAgent] - instance of npm "hpagent" HttpProxyAgent or HttpsProxyAgent to be passed to npm "got" + * @params {Array} [options.exclusions] - Array of regex patterns to exclude URLs * * @example let sitemap = new Sitemapper({ * url: 'https://wp.seantburke.com/sitemap.xml', * timeout: 15000, - * lastmod: 1630693759 + * lastmod: 1630693759, + * exclusions: [/foo.com/, /bar.xml/] // Filters out URLs matching these patterns * }); */ constructor(options) { @@ -49,6 +51,7 @@ export default class Sitemapper { settings.rejectUnauthorized === false ? false : true; this.fields = settings.fields || false; this.proxyAgent = settings.proxyAgent || {}; + this.exclusions = settings.exclusions || []; } /** @@ -267,140 +270,141 @@ export default class Sitemapper { * @param {integer} retryIndex - Number of retry attempts fro this URL (e.g. 0 for 1st attempt, 1 for second attempty etc.) * @returns {Promise} */ - async crawl(url, retryIndex = 0) { - try { - const { error, data } = await this.parse(url); - // The promise resolved, remove the timeout - clearTimeout(this.timeoutTable[url]); - - if (error) { - // Handle errors during sitemap parsing / request - // Retry on error until you reach the retry limit set in the settings - if (retryIndex < this.retries) { - if (this.debug) { - console.log( - `(Retry attempt: ${retryIndex + 1} / ${ - this.retries - }) ${url} due to ${data.name} on previous request` - ); - } - return this.crawl(url, retryIndex + 1); - } - - if (this.debug) { - console.error( - `Error occurred during "crawl('${url}')":\n\r Error: ${error}` - ); - } - - // Fail and log error - return { - sites: [], - errors: [ - { - type: data.name, - message: error, - url, - retries: retryIndex, - }, - ], - }; - } else if (data && data.urlset && data.urlset.url) { - // Handle URLs found inside the sitemap - if (this.debug) { - console.debug(`Urlset found during "crawl('${url}')"`); - } - // filter out any urls that are older than the lastmod - const sites = data.urlset.url - .filter((site) => { - if (this.lastmod === 0) return true; - if (site.lastmod === undefined) return false; - const modified = new Date(site.lastmod[0]).getTime(); - - return modified >= this.lastmod; - }) - .map((site) => { - if( !this.fields) { - return site.loc && site.loc[0]; - } else { - let fields = {}; - for (const [field, active] of Object.entries(this.fields)) { - if(active){ - fields[field] = site[field][0] - } - } - return fields; - } - }); - - return { - sites, - errors: [], - }; - } else if (data && data.sitemapindex) { - // Handle child sitemaps found inside the active sitemap - if (this.debug) { - console.debug(`Additional sitemap found during "crawl('${url}')"`); - } - // Map each child url into a promise to create an array of promises - const sitemap = data.sitemapindex.sitemap.map( - (map) => map.loc && map.loc[0] - ); - - // Parse all child urls within the concurrency limit in the settings - const limit = pLimit(this.concurrency); - const promiseArray = sitemap.map((site) => - limit(() => this.crawl(site)) - ); - - // Make sure all the promises resolve then filter and reduce the array - const results = await Promise.all(promiseArray); - const sites = results - .filter((result) => result.errors.length === 0) - .reduce((prev, { sites }) => [...prev, ...sites], []); - const errors = results - .filter((result) => result.errors.length !== 0) - .reduce((prev, { errors }) => [...prev, 
...errors], []); - - return { - sites, - errors, - }; - } - - // Retry on error until you reach the retry limit set in the settings - if (retryIndex < this.retries) { - if (this.debug) { - console.log( - `(Retry attempt: ${retryIndex + 1} / ${ - this.retries - }) ${url} due to ${data.name} on previous request` - ); - } - return this.crawl(url, retryIndex + 1); - } - if (this.debug) { - console.error(`Unknown state during "crawl('${url})'":`, error, data); - } - - // Fail and log error - return { - sites: [], - errors: [ - { - url, - type: data.name || "UnknownStateError", - message: "An unknown error occurred.", - retries: retryIndex, - }, - ], - }; - } catch (e) { - if (this.debug) { - this.debug && console.error(e); - } - } - } + async crawl(url, retryIndex = 0) { + try { + const { error, data } = await this.parse(url); + // The promise resolved, remove the timeout + clearTimeout(this.timeoutTable[url]); + + if (error) { + // Handle errors during sitemap parsing / request + // Retry on error until you reach the retry limit set in the settings + if (retryIndex < this.retries) { + if (this.debug) { + console.log( + `(Retry attempt: ${retryIndex + 1} / ${ + this.retries + }) ${url} due to ${data.name} on previous request` + ); + } + return this.crawl(url, retryIndex + 1); + } + + if (this.debug) { + console.error( + `Error occurred during "crawl('${url}')":\n\r Error: ${error}` + ); + } + + // Fail and log error + return { + sites: [], + errors: [ + { + type: data.name, + message: error, + url, + retries: retryIndex, + }, + ], + }; + } else if (data && data.urlset && data.urlset.url) { + // Handle URLs found inside the sitemap + if (this.debug) { + console.debug(`Urlset found during "crawl('${url}')"`); + } + // filter out any urls that are older than the lastmod + const sites = data.urlset.url + .filter((site) => { + if (this.lastmod === 0) return true; + if (site.lastmod === undefined) return false; + const modified = new Date(site.lastmod[0]).getTime(); + + return modified >= this.lastmod; + }) + .filter(this.isNotExcluded.bind(this)) + .map((site) => { + if( !this.fields) { + return site.loc && site.loc[0]; + } else { + let fields = {}; + for (const [field, active] of Object.entries(this.fields)) { + if(active){ + fields[field] = site[field][0] + } + } + return fields; + } + }); + + return { + sites, + errors: [], + }; + } else if (data && data.sitemapindex) { + // Handle child sitemaps found inside the active sitemap + if (this.debug) { + console.debug(`Additional sitemap found during "crawl('${url}')"`); + } + // Map each child url into a promise to create an array of promises + const sitemap = data.sitemapindex.sitemap.map( + (map) => map.loc && map.loc[0] + ).filter(this.isNotExcluded.bind(this)); + + // Parse all child urls within the concurrency limit in the settings + const limit = pLimit(this.concurrency); + const promiseArray = sitemap.map((site) => + limit(() => this.crawl(site)) + ); + + // Make sure all the promises resolve then filter and reduce the array + const results = await Promise.all(promiseArray); + const sites = results + .filter((result) => result.errors.length === 0) + .reduce((prev, { sites }) => [...prev, ...sites], []); + const errors = results + .filter((result) => result.errors.length !== 0) + .reduce((prev, { errors }) => [...prev, ...errors], []); + + return { + sites, + errors, + }; + } + + // Retry on error until you reach the retry limit set in the settings + if (retryIndex < this.retries) { + if (this.debug) { + console.log( + `(Retry attempt: 
${retryIndex + 1} / ${ + this.retries + }) ${url} due to ${data.name} on previous request` + ); + } + return this.crawl(url, retryIndex + 1); + } + if (this.debug) { + console.error(`Unknown state during "crawl('${url})'":`, error, data); + } + + // Fail and log error + return { + sites: [], + errors: [ + { + url, + type: data.name || "UnknownStateError", + message: "An unknown error occurred.", + retries: retryIndex, + }, + ], + }; + } catch (e) { + if (this.debug) { + this.debug && console.error(e); + } + } + } /** * Gets the sites from a sitemap.xml with a given URL @@ -446,6 +450,17 @@ export default class Sitemapper { }); }); } + + /** + * Checks if a site is not excluded based on the exclusion patterns. + * + * @param {string} urls - The URL to check. + * @returns {boolean} Returns true if the urls is not excluded, false otherwise. + */ + isNotExcluded(urls) { + if (this.exclusions.length === 0) return true; + return !this.exclusions.some((pattern) => pattern.test(urls)); + } } /** From 08e056a2281f33b677a9015b2a9d08f163f2e069 Mon Sep 17 00:00:00 2001 From: Adam Chapman Date: Wed, 16 Oct 2024 22:32:40 +1100 Subject: [PATCH 4/8] fix(exclusions): singularize url param --- src/assets/sitemapper.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/assets/sitemapper.js b/src/assets/sitemapper.js index 1a8a9d3..4f06e6c 100644 --- a/src/assets/sitemapper.js +++ b/src/assets/sitemapper.js @@ -454,12 +454,12 @@ export default class Sitemapper { /** * Checks if a site is not excluded based on the exclusion patterns. * - * @param {string} urls - The URL to check. + * @param {string} url - The URL to check. * @returns {boolean} Returns true if the urls is not excluded, false otherwise. */ - isNotExcluded(urls) { + isNotExcluded(url) { if (this.exclusions.length === 0) return true; - return !this.exclusions.some((pattern) => pattern.test(urls)); + return !this.exclusions.some((pattern) => pattern.test(url)); } } From a47547b073c2b69b9448a0fe9694ef05184ad133 Mon Sep 17 00:00:00 2001 From: Adam Chapman Date: Thu, 17 Oct 2024 08:14:54 +1100 Subject: [PATCH 5/8] fix(exclusions): whitespace fubar --- src/assets/sitemapper.js | 282 +++++++++++++++++++-------------------- 1 file changed, 141 insertions(+), 141 deletions(-) diff --git a/src/assets/sitemapper.js b/src/assets/sitemapper.js index 4f06e6c..81c5acd 100644 --- a/src/assets/sitemapper.js +++ b/src/assets/sitemapper.js @@ -270,141 +270,141 @@ export default class Sitemapper { * @param {integer} retryIndex - Number of retry attempts fro this URL (e.g. 0 for 1st attempt, 1 for second attempty etc.) 
* @returns {Promise} */ - async crawl(url, retryIndex = 0) { - try { - const { error, data } = await this.parse(url); - // The promise resolved, remove the timeout - clearTimeout(this.timeoutTable[url]); - - if (error) { - // Handle errors during sitemap parsing / request - // Retry on error until you reach the retry limit set in the settings - if (retryIndex < this.retries) { - if (this.debug) { - console.log( - `(Retry attempt: ${retryIndex + 1} / ${ - this.retries - }) ${url} due to ${data.name} on previous request` - ); - } - return this.crawl(url, retryIndex + 1); - } - - if (this.debug) { - console.error( - `Error occurred during "crawl('${url}')":\n\r Error: ${error}` - ); - } - - // Fail and log error - return { - sites: [], - errors: [ - { - type: data.name, - message: error, - url, - retries: retryIndex, - }, - ], - }; - } else if (data && data.urlset && data.urlset.url) { - // Handle URLs found inside the sitemap - if (this.debug) { - console.debug(`Urlset found during "crawl('${url}')"`); - } - // filter out any urls that are older than the lastmod - const sites = data.urlset.url - .filter((site) => { - if (this.lastmod === 0) return true; - if (site.lastmod === undefined) return false; - const modified = new Date(site.lastmod[0]).getTime(); - - return modified >= this.lastmod; - }) - .filter(this.isNotExcluded.bind(this)) - .map((site) => { - if( !this.fields) { - return site.loc && site.loc[0]; - } else { - let fields = {}; - for (const [field, active] of Object.entries(this.fields)) { - if(active){ - fields[field] = site[field][0] - } - } - return fields; - } - }); - - return { - sites, - errors: [], - }; - } else if (data && data.sitemapindex) { - // Handle child sitemaps found inside the active sitemap - if (this.debug) { - console.debug(`Additional sitemap found during "crawl('${url}')"`); - } - // Map each child url into a promise to create an array of promises - const sitemap = data.sitemapindex.sitemap.map( - (map) => map.loc && map.loc[0] - ).filter(this.isNotExcluded.bind(this)); - - // Parse all child urls within the concurrency limit in the settings - const limit = pLimit(this.concurrency); - const promiseArray = sitemap.map((site) => - limit(() => this.crawl(site)) - ); - - // Make sure all the promises resolve then filter and reduce the array - const results = await Promise.all(promiseArray); - const sites = results - .filter((result) => result.errors.length === 0) - .reduce((prev, { sites }) => [...prev, ...sites], []); - const errors = results - .filter((result) => result.errors.length !== 0) - .reduce((prev, { errors }) => [...prev, ...errors], []); - - return { - sites, - errors, - }; - } - - // Retry on error until you reach the retry limit set in the settings - if (retryIndex < this.retries) { - if (this.debug) { - console.log( - `(Retry attempt: ${retryIndex + 1} / ${ - this.retries - }) ${url} due to ${data.name} on previous request` - ); - } - return this.crawl(url, retryIndex + 1); - } - if (this.debug) { - console.error(`Unknown state during "crawl('${url})'":`, error, data); - } - - // Fail and log error - return { - sites: [], - errors: [ - { - url, - type: data.name || "UnknownStateError", - message: "An unknown error occurred.", - retries: retryIndex, - }, - ], - }; - } catch (e) { - if (this.debug) { - this.debug && console.error(e); - } - } - } + async crawl(url, retryIndex = 0) { + try { + const { error, data } = await this.parse(url); + // The promise resolved, remove the timeout + clearTimeout(this.timeoutTable[url]); + + if (error) { + // 
Handle errors during sitemap parsing / request + // Retry on error until you reach the retry limit set in the settings + if (retryIndex < this.retries) { + if (this.debug) { + console.log( + `(Retry attempt: ${retryIndex + 1} / ${ + this.retries + }) ${url} due to ${data.name} on previous request` + ); + } + return this.crawl(url, retryIndex + 1); + } + + if (this.debug) { + console.error( + `Error occurred during "crawl('${url}')":\n\r Error: ${error}` + ); + } + + // Fail and log error + return { + sites: [], + errors: [ + { + type: data.name, + message: error, + url, + retries: retryIndex, + }, + ], + }; + } else if (data && data.urlset && data.urlset.url) { + // Handle URLs found inside the sitemap + if (this.debug) { + console.debug(`Urlset found during "crawl('${url}')"`); + } + // filter out any urls that are older than the lastmod + const sites = data.urlset.url + .filter((site) => { + if (this.lastmod === 0) return true; + if (site.lastmod === undefined) return false; + const modified = new Date(site.lastmod[0]).getTime(); + + return modified >= this.lastmod; + }) + .filter(this.isNotExcluded.bind(this)) + .map((site) => { + if( !this.fields) { + return site.loc && site.loc[0]; + } else { + let fields = {}; + for (const [field, active] of Object.entries(this.fields)) { + if(active){ + fields[field] = site[field][0] + } + } + return fields; + } + }); + + return { + sites, + errors: [], + }; + } else if (data && data.sitemapindex) { + // Handle child sitemaps found inside the active sitemap + if (this.debug) { + console.debug(`Additional sitemap found during "crawl('${url}')"`); + } + // Map each child url into a promise to create an array of promises + const sitemap = data.sitemapindex.sitemap + .map((map) => map.loc && map.loc[0]) + .filter(this.isNotExcluded.bind(this)); + + // Parse all child urls within the concurrency limit in the settings + const limit = pLimit(this.concurrency); + const promiseArray = sitemap.map((site) => + limit(() => this.crawl(site)) + ); + + // Make sure all the promises resolve then filter and reduce the array + const results = await Promise.all(promiseArray); + const sites = results + .filter((result) => result.errors.length === 0) + .reduce((prev, { sites }) => [...prev, ...sites], []); + const errors = results + .filter((result) => result.errors.length !== 0) + .reduce((prev, { errors }) => [...prev, ...errors], []); + + return { + sites, + errors, + }; + } + + // Retry on error until you reach the retry limit set in the settings + if (retryIndex < this.retries) { + if (this.debug) { + console.log( + `(Retry attempt: ${retryIndex + 1} / ${ + this.retries + }) ${url} due to ${data.name} on previous request` + ); + } + return this.crawl(url, retryIndex + 1); + } + if (this.debug) { + console.error(`Unknown state during "crawl('${url})'":`, error, data); + } + + // Fail and log error + return { + sites: [], + errors: [ + { + url, + type: data.name || "UnknownStateError", + message: "An unknown error occurred.", + retries: retryIndex, + }, + ], + }; + } catch (e) { + if (this.debug) { + this.debug && console.error(e); + } + } + } /** * Gets the sites from a sitemap.xml with a given URL @@ -452,12 +452,12 @@ export default class Sitemapper { } /** - * Checks if a site is not excluded based on the exclusion patterns. - * - * @param {string} url - The URL to check. - * @returns {boolean} Returns true if the urls is not excluded, false otherwise. - */ - isNotExcluded(url) { + * Checks if a site is not excluded based on the exclusion patterns. 
+   *
+   * @param {string} url - The URL to check.
+   * @returns {boolean} Returns true if the urls is not excluded, false otherwise.
+   */
+  isNotExcluded(url) {
     if (this.exclusions.length === 0) return true;
     return !this.exclusions.some((pattern) => pattern.test(url));
   }
 }
 
 /**

From 97862f034a74e3afe07d68c09098b423d86f2b4c Mon Sep 17 00:00:00 2001
From: Adam Chapman
Date: Thu, 17 Oct 2024 09:12:04 +1100
Subject: [PATCH 6/8] test(exclusions): integration test cases

---
 src/tests/test.js | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/src/tests/test.js b/src/tests/test.js
index 910820c..e456485 100644
--- a/src/tests/test.js
+++ b/src/tests/test.js
@@ -264,6 +264,43 @@ describe('Sitemapper', function () {
     });
   });
 
+  describe('exclusions option', function () {
+    // check for the url that should be excluded in a later test
+    it('should prevent false positive', function (done) {
+      this.timeout(30000);
+      const url = 'https://wp.seantburke.com/sitemap.xml';
+      // exclude video and image sitemap index urls
+      sitemapper.exclusions = [/video/, /image/];
+      sitemapper.fetch(url)
+        .then(data => {
+          data.sites.should.be.Array;
+          data.sites.includes('https://wp.seantburke.com/?page_id=2').should.be.true();
+          done();
+        })
+        .catch(error => {
+          console.error('Test failed');
+          done(error);
+        });
+    });
+
+    it('should filter out page_id urls', function (done) {
+      this.timeout(30000);
+      const url = 'https://wp.seantburke.com/sitemap.xml';
+      // exclude page_id=2
+      sitemapper.exclusions = [/page_id/];
+      sitemapper.fetch(url)
+        .then(data => {
+          data.sites.should.be.Array;
+          data.sites.includes('https://wp.seantburke.com/?page_id=2').should.be.false();
+          done();
+        })
+        .catch(error => {
+          console.error('Test failed');
+          done(error);
+        });
+    });
+  });
+
   describe('isNotExcluded method', function () {
     it('should return true when no exclusions are set', function () {
       const result = sitemapper.isNotExcluded('https://foo.com/page1');

From 5ccfa25aac0a17178cdf7ff017bb84b191f60353 Mon Sep 17 00:00:00 2001
From: Adam Chapman
Date: Thu, 17 Oct 2024 09:13:22 +1100
Subject: [PATCH 7/8] refactor(exclusions): handles different map types

---
 src/assets/sitemapper.js | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/assets/sitemapper.js b/src/assets/sitemapper.js
index 81c5acd..1d51aa8 100644
--- a/src/assets/sitemapper.js
+++ b/src/assets/sitemapper.js
@@ -322,7 +322,9 @@ export default class Sitemapper {
 
             return modified >= this.lastmod;
           })
-          .filter(this.isNotExcluded.bind(this))
+          .filter((site) => {
+            return this.isNotExcluded(site.loc[0]);
+          })
           .map((site) => {
             if( !this.fields) {
               return site.loc && site.loc[0];
@@ -351,7 +351,9 @@ export default class Sitemapper {
         // Map each child url into a promise to create an array of promises
         const sitemap = data.sitemapindex.sitemap
           .map((map) => map.loc && map.loc[0])
-          .filter(this.isNotExcluded.bind(this));
+          .filter((url) => {
+            return this.isNotExcluded(url);
+          });
 
         // Parse all child urls within the concurrency limit in the settings

From ee8887d1e3a8cbe29a3815840a829ca6353f53aa Mon Sep 17 00:00:00 2001
From: Adam Chapman
Date: Thu, 17 Oct 2024 17:21:00 +1100
Subject: [PATCH 8/8] refactor(exclusions): uses affirmative name for isExcluded

---
 src/assets/sitemapper.js | 14 +++++++-------
 src/tests/test.js        | 42 +++++++++++++++++++---------------------
 2 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/src/assets/sitemapper.js b/src/assets/sitemapper.js
index 1d51aa8..9b32939 100644
--- a/src/assets/sitemapper.js
+++ b/src/assets/sitemapper.js
@@ -323,7 +323,7 @@ export default class Sitemapper {
             return modified >= this.lastmod;
           })
           .filter((site) => {
-            return this.isNotExcluded(site.loc[0]);
+            return !this.isExcluded(site.loc[0]);
           })
           .map((site) => {
             if( !this.fields) {
@@ -352,7 +352,7 @@ export default class Sitemapper {
         const sitemap = data.sitemapindex.sitemap
           .map((map) => map.loc && map.loc[0])
           .filter((url) => {
-            return this.isNotExcluded(url);
+            return !this.isExcluded(url);
           });
 
         // Parse all child urls within the concurrency limit in the settings
@@ -456,14 +456,14 @@ export default class Sitemapper {
   }
 
   /**
-   * Checks if a site is not excluded based on the exclusion patterns.
+   * Checks if a URL is excluded based on the exclusion patterns.
    *
    * @param {string} url - The URL to check.
-   * @returns {boolean} Returns true if the urls is not excluded, false otherwise.
+   * @returns {boolean} Returns true if the URL is excluded, false otherwise.
    */
-  isNotExcluded(url) {
-    if (this.exclusions.length === 0) return true;
-    return !this.exclusions.some((pattern) => pattern.test(url));
+  isExcluded(url) {
+    if (this.exclusions.length === 0) return false;
+    return this.exclusions.some((pattern) => pattern.test(url));
   }
 }

diff --git a/src/tests/test.js b/src/tests/test.js
index e456485..77f65c7 100644
--- a/src/tests/test.js
+++ b/src/tests/test.js
@@ -301,44 +301,44 @@ describe('Sitemapper', function () {
     });
   });
 
-  describe('isNotExcluded method', function () {
-    it('should return true when no exclusions are set', function () {
-      const result = sitemapper.isNotExcluded('https://foo.com/page1');
-      result.should.be.true();
+  describe('isExcluded method', function () {
+    it('should return false when no exclusions are set', function () {
+      const result = sitemapper.isExcluded('https://foo.com/page1');
+      result.should.be.false();
     });
 
-    it('should return true when url does not match any exclusion patterns', function () {
+    it('should return false when url does not match any exclusion patterns', function () {
       sitemapper.exclusions = [/\.pdf$/, /private/];
-      const result = sitemapper.isNotExcluded('https://foo.com/page1');
-      result.should.be.true();
+      const result = sitemapper.isExcluded('https://foo.com/page1');
+      result.should.be.false();
     });
 
-    it('should return false when url matches an exclusion pattern', function () {
+    it('should return true when url matches an exclusion pattern', function () {
       sitemapper.exclusions = [/\.pdf$/, /private/];
-      const result = sitemapper.isNotExcluded('https://foo.com/document.pdf');
-      result.should.be.false();
+      const result = sitemapper.isExcluded('https://foo.com/document.pdf');
+      result.should.be.true();
     });
 
-    it('should return false when url matches any of multiple exclusion patterns', function () {
+    it('should return true when url matches any of multiple exclusion patterns', function () {
       sitemapper.exclusions = [/\.pdf$/, /private/, /temp/];
-      const result = sitemapper.isNotExcluded('https://foo.com/private/temp.html');
-      result.should.be.false();
+      const result = sitemapper.isExcluded('https://foo.com/private/temp.html');
+      result.should.be.true();
     });
 
     it('should handle complex regex patterns correctly', function () {
       sitemapper.exclusions = [/^https:\/\/foo\.com\/([a-z]{2})\/private/]
-      const result1 = sitemapper.isNotExcluded('https://foo.com/en/private/page');
-      const result2 = sitemapper.isNotExcluded('https://foo.com/en/public/page');
-      result1.should.be.false();
-      result2.should.be.true();
+      const result1 = sitemapper.isExcluded('https://foo.com/en/private/page');
+      const result2 = sitemapper.isExcluded('https://foo.com/en/public/page');
+      result1.should.be.true();
+      result2.should.be.false();
     });
 
     it('should handle case sensitivity correctly', function () {
       sitemapper.exclusions = [/private/i];
-      const result1 = sitemapper.isNotExcluded('https://foo.com/PRIVATE/page');
-      const result2 = sitemapper.isNotExcluded('https://foo.com/Private/page');
-      result1.should.be.false();
-      result2.should.be.false();
+      const result1 = sitemapper.isExcluded('https://foo.com/PRIVATE/page');
+      const result2 = sitemapper.isExcluded('https://foo.com/Private/page');
+      result1.should.be.true();
+      result2.should.be.true();
     });
   });
 });
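
Usage note: with the full series applied, exclusion is driven by the `exclusions`
constructor option and the affirmative `isExcluded(url)` helper. A minimal sketch
of the end state, assuming the package's default export and the `fetch(url)`
promise API exercised in the integration tests above; the two patterns here are
illustrative only and are not taken from the patches:

    import Sitemapper from 'sitemapper';

    const sitemapper = new Sitemapper({
      timeout: 15000,
      // Any url matching one of these patterns is dropped from the results.
      // Example patterns only: skip PDFs and anything under /private/.
      exclusions: [/\.pdf$/, /private/],
    });

    sitemapper.fetch('https://wp.seantburke.com/sitemap.xml')
      .then(data => {
        // data.sites arrives already filtered: isExcluded() is applied inside
        // crawl() to both urlset entries and child sitemaps in a sitemapindex.
        console.log(data.sites);
      })
      .catch(error => console.error(error));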