From 2a0a6cb94b24fa84b48c13fd4a57f62f3c74acaa Mon Sep 17 00:00:00 2001 From: Adam Chapman Date: Wed, 16 Oct 2024 08:20:17 +1100 Subject: [PATCH 1/8] test(exclusions): unit tests --- src/tests/test.js | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/src/tests/test.js b/src/tests/test.js index 7c4d0ca..910820c 100644 --- a/src/tests/test.js +++ b/src/tests/test.js @@ -263,4 +263,45 @@ describe('Sitemapper', function () { }); }); }); + + describe('isNotExcluded method', function () { + it('should return true when no exclusions are set', function () { + const result = sitemapper.isNotExcluded('https://foo.com/page1'); + result.should.be.true(); + }); + + it('should return true when url does not match any exclusion patterns', function () { + sitemapper.exclusions = [/\.pdf$/, /private/]; + const result = sitemapper.isNotExcluded('https://foo.com/page1'); + result.should.be.true(); + }); + + it('should return false when url matches an exclusion pattern', function () { + sitemapper.exclusions = [/\.pdf$/, /private/]; + const result = sitemapper.isNotExcluded('https://foo.com/document.pdf'); + result.should.be.false(); + }); + + it('should return false when url matches any of multiple exclusion patterns', function () { + sitemapper.exclusions = [/\.pdf$/, /private/, /temp/]; + const result = sitemapper.isNotExcluded('https://foo.com/private/temp.html'); + result.should.be.false(); + }); + + it('should handle complex regex patterns correctly', function () { + sitemapper.exclusions = [/^https:\/\/foo\.com\/([a-z]{2})\/private/] + const result1 = sitemapper.isNotExcluded('https://foo.com/en/private/page'); + const result2 = sitemapper.isNotExcluded('https://foo.com/en/public/page'); + result1.should.be.false(); + result2.should.be.true(); + }); + + it('should handle case sensitivity correctly', function () { + sitemapper.exclusions = [/private/i]; + const result1 = sitemapper.isNotExcluded('https://foo.com/PRIVATE/page'); + const result2 = sitemapper.isNotExcluded('https://foo.com/Private/page'); + result1.should.be.false(); + result2.should.be.false(); + }); + }); }); From 1eb7c607d595374e5a63697a34d82cbfe87a1f1e Mon Sep 17 00:00:00 2001 From: Adam Chapman Date: Wed, 16 Oct 2024 08:20:39 +1100 Subject: [PATCH 2/8] feat(exclusions): type --- sitemapper.d.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/sitemapper.d.ts b/sitemapper.d.ts index 67dc261..6e6105b 100644 --- a/sitemapper.d.ts +++ b/sitemapper.d.ts @@ -20,6 +20,7 @@ export interface SitemapperOptions { timeout?: number; url?: string; fields?: {[name: string]: boolean}; + exclusions?: RegExp[]; } declare class Sitemapper { From 61dfa447d822b9a2b5768f17c8b5cf6ce9f157fd Mon Sep 17 00:00:00 2001 From: Adam Chapman Date: Wed, 16 Oct 2024 08:21:07 +1100 Subject: [PATCH 3/8] feat(exclusions): implement isNotExcluded --- src/assets/sitemapper.js | 285 ++++++++++++++++++++------------------- 1 file changed, 150 insertions(+), 135 deletions(-) diff --git a/src/assets/sitemapper.js b/src/assets/sitemapper.js index 811f443..1a8a9d3 100644 --- a/src/assets/sitemapper.js +++ b/src/assets/sitemapper.js @@ -28,11 +28,13 @@ export default class Sitemapper { * @params {boolean} [options.rejectUnauthorized] - If true (default), it will throw on invalid certificates, such as expired or self-signed ones. 
* @params {lastmod} [options.lastmod] - the minimum lastmod value for urls * @params {hpagent.HttpProxyAgent|hpagent.HttpsProxyAgent} [options.proxyAgent] - instance of npm "hpagent" HttpProxyAgent or HttpsProxyAgent to be passed to npm "got" + * @params {Array} [options.exclusions] - Array of regex patterns to exclude URLs * * @example let sitemap = new Sitemapper({ * url: 'https://wp.seantburke.com/sitemap.xml', * timeout: 15000, - * lastmod: 1630693759 + * lastmod: 1630693759, + * exclusions: [/foo.com/, /bar.xml/] // Filters out URLs matching these patterns * }); */ constructor(options) { @@ -49,6 +51,7 @@ export default class Sitemapper { settings.rejectUnauthorized === false ? false : true; this.fields = settings.fields || false; this.proxyAgent = settings.proxyAgent || {}; + this.exclusions = settings.exclusions || []; } /** @@ -267,140 +270,141 @@ export default class Sitemapper { * @param {integer} retryIndex - Number of retry attempts fro this URL (e.g. 0 for 1st attempt, 1 for second attempty etc.) * @returns {Promise} */ - async crawl(url, retryIndex = 0) { - try { - const { error, data } = await this.parse(url); - // The promise resolved, remove the timeout - clearTimeout(this.timeoutTable[url]); - - if (error) { - // Handle errors during sitemap parsing / request - // Retry on error until you reach the retry limit set in the settings - if (retryIndex < this.retries) { - if (this.debug) { - console.log( - `(Retry attempt: ${retryIndex + 1} / ${ - this.retries - }) ${url} due to ${data.name} on previous request` - ); - } - return this.crawl(url, retryIndex + 1); - } - - if (this.debug) { - console.error( - `Error occurred during "crawl('${url}')":\n\r Error: ${error}` - ); - } - - // Fail and log error - return { - sites: [], - errors: [ - { - type: data.name, - message: error, - url, - retries: retryIndex, - }, - ], - }; - } else if (data && data.urlset && data.urlset.url) { - // Handle URLs found inside the sitemap - if (this.debug) { - console.debug(`Urlset found during "crawl('${url}')"`); - } - // filter out any urls that are older than the lastmod - const sites = data.urlset.url - .filter((site) => { - if (this.lastmod === 0) return true; - if (site.lastmod === undefined) return false; - const modified = new Date(site.lastmod[0]).getTime(); - - return modified >= this.lastmod; - }) - .map((site) => { - if( !this.fields) { - return site.loc && site.loc[0]; - } else { - let fields = {}; - for (const [field, active] of Object.entries(this.fields)) { - if(active){ - fields[field] = site[field][0] - } - } - return fields; - } - }); - - return { - sites, - errors: [], - }; - } else if (data && data.sitemapindex) { - // Handle child sitemaps found inside the active sitemap - if (this.debug) { - console.debug(`Additional sitemap found during "crawl('${url}')"`); - } - // Map each child url into a promise to create an array of promises - const sitemap = data.sitemapindex.sitemap.map( - (map) => map.loc && map.loc[0] - ); - - // Parse all child urls within the concurrency limit in the settings - const limit = pLimit(this.concurrency); - const promiseArray = sitemap.map((site) => - limit(() => this.crawl(site)) - ); - - // Make sure all the promises resolve then filter and reduce the array - const results = await Promise.all(promiseArray); - const sites = results - .filter((result) => result.errors.length === 0) - .reduce((prev, { sites }) => [...prev, ...sites], []); - const errors = results - .filter((result) => result.errors.length !== 0) - .reduce((prev, { errors }) => [...prev, 
...errors], []); - - return { - sites, - errors, - }; - } - - // Retry on error until you reach the retry limit set in the settings - if (retryIndex < this.retries) { - if (this.debug) { - console.log( - `(Retry attempt: ${retryIndex + 1} / ${ - this.retries - }) ${url} due to ${data.name} on previous request` - ); - } - return this.crawl(url, retryIndex + 1); - } - if (this.debug) { - console.error(`Unknown state during "crawl('${url})'":`, error, data); - } - - // Fail and log error - return { - sites: [], - errors: [ - { - url, - type: data.name || "UnknownStateError", - message: "An unknown error occurred.", - retries: retryIndex, - }, - ], - }; - } catch (e) { - if (this.debug) { - this.debug && console.error(e); - } - } - } + async crawl(url, retryIndex = 0) { + try { + const { error, data } = await this.parse(url); + // The promise resolved, remove the timeout + clearTimeout(this.timeoutTable[url]); + + if (error) { + // Handle errors during sitemap parsing / request + // Retry on error until you reach the retry limit set in the settings + if (retryIndex < this.retries) { + if (this.debug) { + console.log( + `(Retry attempt: ${retryIndex + 1} / ${ + this.retries + }) ${url} due to ${data.name} on previous request` + ); + } + return this.crawl(url, retryIndex + 1); + } + + if (this.debug) { + console.error( + `Error occurred during "crawl('${url}')":\n\r Error: ${error}` + ); + } + + // Fail and log error + return { + sites: [], + errors: [ + { + type: data.name, + message: error, + url, + retries: retryIndex, + }, + ], + }; + } else if (data && data.urlset && data.urlset.url) { + // Handle URLs found inside the sitemap + if (this.debug) { + console.debug(`Urlset found during "crawl('${url}')"`); + } + // filter out any urls that are older than the lastmod + const sites = data.urlset.url + .filter((site) => { + if (this.lastmod === 0) return true; + if (site.lastmod === undefined) return false; + const modified = new Date(site.lastmod[0]).getTime(); + + return modified >= this.lastmod; + }) + .filter(this.isNotExcluded.bind(this)) + .map((site) => { + if( !this.fields) { + return site.loc && site.loc[0]; + } else { + let fields = {}; + for (const [field, active] of Object.entries(this.fields)) { + if(active){ + fields[field] = site[field][0] + } + } + return fields; + } + }); + + return { + sites, + errors: [], + }; + } else if (data && data.sitemapindex) { + // Handle child sitemaps found inside the active sitemap + if (this.debug) { + console.debug(`Additional sitemap found during "crawl('${url}')"`); + } + // Map each child url into a promise to create an array of promises + const sitemap = data.sitemapindex.sitemap.map( + (map) => map.loc && map.loc[0] + ).filter(this.isNotExcluded.bind(this)); + + // Parse all child urls within the concurrency limit in the settings + const limit = pLimit(this.concurrency); + const promiseArray = sitemap.map((site) => + limit(() => this.crawl(site)) + ); + + // Make sure all the promises resolve then filter and reduce the array + const results = await Promise.all(promiseArray); + const sites = results + .filter((result) => result.errors.length === 0) + .reduce((prev, { sites }) => [...prev, ...sites], []); + const errors = results + .filter((result) => result.errors.length !== 0) + .reduce((prev, { errors }) => [...prev, ...errors], []); + + return { + sites, + errors, + }; + } + + // Retry on error until you reach the retry limit set in the settings + if (retryIndex < this.retries) { + if (this.debug) { + console.log( + `(Retry attempt: 
${retryIndex + 1} / ${ + this.retries + }) ${url} due to ${data.name} on previous request` + ); + } + return this.crawl(url, retryIndex + 1); + } + if (this.debug) { + console.error(`Unknown state during "crawl('${url})'":`, error, data); + } + + // Fail and log error + return { + sites: [], + errors: [ + { + url, + type: data.name || "UnknownStateError", + message: "An unknown error occurred.", + retries: retryIndex, + }, + ], + }; + } catch (e) { + if (this.debug) { + this.debug && console.error(e); + } + } + } /** * Gets the sites from a sitemap.xml with a given URL @@ -446,6 +450,17 @@ export default class Sitemapper { }); }); } + + /** + * Checks if a site is not excluded based on the exclusion patterns. + * + * @param {string} urls - The URL to check. + * @returns {boolean} Returns true if the urls is not excluded, false otherwise. + */ + isNotExcluded(urls) { + if (this.exclusions.length === 0) return true; + return !this.exclusions.some((pattern) => pattern.test(urls)); + } } /** From 08e056a2281f33b677a9015b2a9d08f163f2e069 Mon Sep 17 00:00:00 2001 From: Adam Chapman Date: Wed, 16 Oct 2024 22:32:40 +1100 Subject: [PATCH 4/8] fix(exclusions): singularize url param --- src/assets/sitemapper.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/assets/sitemapper.js b/src/assets/sitemapper.js index 1a8a9d3..4f06e6c 100644 --- a/src/assets/sitemapper.js +++ b/src/assets/sitemapper.js @@ -454,12 +454,12 @@ export default class Sitemapper { /** * Checks if a site is not excluded based on the exclusion patterns. * - * @param {string} urls - The URL to check. + * @param {string} url - The URL to check. * @returns {boolean} Returns true if the urls is not excluded, false otherwise. */ - isNotExcluded(urls) { + isNotExcluded(url) { if (this.exclusions.length === 0) return true; - return !this.exclusions.some((pattern) => pattern.test(urls)); + return !this.exclusions.some((pattern) => pattern.test(url)); } } From a47547b073c2b69b9448a0fe9694ef05184ad133 Mon Sep 17 00:00:00 2001 From: Adam Chapman Date: Thu, 17 Oct 2024 08:14:54 +1100 Subject: [PATCH 5/8] fix(exclusions): whitespace fubar --- src/assets/sitemapper.js | 282 +++++++++++++++++++-------------------- 1 file changed, 141 insertions(+), 141 deletions(-) diff --git a/src/assets/sitemapper.js b/src/assets/sitemapper.js index 4f06e6c..81c5acd 100644 --- a/src/assets/sitemapper.js +++ b/src/assets/sitemapper.js @@ -270,141 +270,141 @@ export default class Sitemapper { * @param {integer} retryIndex - Number of retry attempts fro this URL (e.g. 0 for 1st attempt, 1 for second attempty etc.) 
* @returns {Promise} */ - async crawl(url, retryIndex = 0) { - try { - const { error, data } = await this.parse(url); - // The promise resolved, remove the timeout - clearTimeout(this.timeoutTable[url]); - - if (error) { - // Handle errors during sitemap parsing / request - // Retry on error until you reach the retry limit set in the settings - if (retryIndex < this.retries) { - if (this.debug) { - console.log( - `(Retry attempt: ${retryIndex + 1} / ${ - this.retries - }) ${url} due to ${data.name} on previous request` - ); - } - return this.crawl(url, retryIndex + 1); - } - - if (this.debug) { - console.error( - `Error occurred during "crawl('${url}')":\n\r Error: ${error}` - ); - } - - // Fail and log error - return { - sites: [], - errors: [ - { - type: data.name, - message: error, - url, - retries: retryIndex, - }, - ], - }; - } else if (data && data.urlset && data.urlset.url) { - // Handle URLs found inside the sitemap - if (this.debug) { - console.debug(`Urlset found during "crawl('${url}')"`); - } - // filter out any urls that are older than the lastmod - const sites = data.urlset.url - .filter((site) => { - if (this.lastmod === 0) return true; - if (site.lastmod === undefined) return false; - const modified = new Date(site.lastmod[0]).getTime(); - - return modified >= this.lastmod; - }) - .filter(this.isNotExcluded.bind(this)) - .map((site) => { - if( !this.fields) { - return site.loc && site.loc[0]; - } else { - let fields = {}; - for (const [field, active] of Object.entries(this.fields)) { - if(active){ - fields[field] = site[field][0] - } - } - return fields; - } - }); - - return { - sites, - errors: [], - }; - } else if (data && data.sitemapindex) { - // Handle child sitemaps found inside the active sitemap - if (this.debug) { - console.debug(`Additional sitemap found during "crawl('${url}')"`); - } - // Map each child url into a promise to create an array of promises - const sitemap = data.sitemapindex.sitemap.map( - (map) => map.loc && map.loc[0] - ).filter(this.isNotExcluded.bind(this)); - - // Parse all child urls within the concurrency limit in the settings - const limit = pLimit(this.concurrency); - const promiseArray = sitemap.map((site) => - limit(() => this.crawl(site)) - ); - - // Make sure all the promises resolve then filter and reduce the array - const results = await Promise.all(promiseArray); - const sites = results - .filter((result) => result.errors.length === 0) - .reduce((prev, { sites }) => [...prev, ...sites], []); - const errors = results - .filter((result) => result.errors.length !== 0) - .reduce((prev, { errors }) => [...prev, ...errors], []); - - return { - sites, - errors, - }; - } - - // Retry on error until you reach the retry limit set in the settings - if (retryIndex < this.retries) { - if (this.debug) { - console.log( - `(Retry attempt: ${retryIndex + 1} / ${ - this.retries - }) ${url} due to ${data.name} on previous request` - ); - } - return this.crawl(url, retryIndex + 1); - } - if (this.debug) { - console.error(`Unknown state during "crawl('${url})'":`, error, data); - } - - // Fail and log error - return { - sites: [], - errors: [ - { - url, - type: data.name || "UnknownStateError", - message: "An unknown error occurred.", - retries: retryIndex, - }, - ], - }; - } catch (e) { - if (this.debug) { - this.debug && console.error(e); - } - } - } + async crawl(url, retryIndex = 0) { + try { + const { error, data } = await this.parse(url); + // The promise resolved, remove the timeout + clearTimeout(this.timeoutTable[url]); + + if (error) { + // 
Handle errors during sitemap parsing / request + // Retry on error until you reach the retry limit set in the settings + if (retryIndex < this.retries) { + if (this.debug) { + console.log( + `(Retry attempt: ${retryIndex + 1} / ${ + this.retries + }) ${url} due to ${data.name} on previous request` + ); + } + return this.crawl(url, retryIndex + 1); + } + + if (this.debug) { + console.error( + `Error occurred during "crawl('${url}')":\n\r Error: ${error}` + ); + } + + // Fail and log error + return { + sites: [], + errors: [ + { + type: data.name, + message: error, + url, + retries: retryIndex, + }, + ], + }; + } else if (data && data.urlset && data.urlset.url) { + // Handle URLs found inside the sitemap + if (this.debug) { + console.debug(`Urlset found during "crawl('${url}')"`); + } + // filter out any urls that are older than the lastmod + const sites = data.urlset.url + .filter((site) => { + if (this.lastmod === 0) return true; + if (site.lastmod === undefined) return false; + const modified = new Date(site.lastmod[0]).getTime(); + + return modified >= this.lastmod; + }) + .filter(this.isNotExcluded.bind(this)) + .map((site) => { + if( !this.fields) { + return site.loc && site.loc[0]; + } else { + let fields = {}; + for (const [field, active] of Object.entries(this.fields)) { + if(active){ + fields[field] = site[field][0] + } + } + return fields; + } + }); + + return { + sites, + errors: [], + }; + } else if (data && data.sitemapindex) { + // Handle child sitemaps found inside the active sitemap + if (this.debug) { + console.debug(`Additional sitemap found during "crawl('${url}')"`); + } + // Map each child url into a promise to create an array of promises + const sitemap = data.sitemapindex.sitemap + .map((map) => map.loc && map.loc[0]) + .filter(this.isNotExcluded.bind(this)); + + // Parse all child urls within the concurrency limit in the settings + const limit = pLimit(this.concurrency); + const promiseArray = sitemap.map((site) => + limit(() => this.crawl(site)) + ); + + // Make sure all the promises resolve then filter and reduce the array + const results = await Promise.all(promiseArray); + const sites = results + .filter((result) => result.errors.length === 0) + .reduce((prev, { sites }) => [...prev, ...sites], []); + const errors = results + .filter((result) => result.errors.length !== 0) + .reduce((prev, { errors }) => [...prev, ...errors], []); + + return { + sites, + errors, + }; + } + + // Retry on error until you reach the retry limit set in the settings + if (retryIndex < this.retries) { + if (this.debug) { + console.log( + `(Retry attempt: ${retryIndex + 1} / ${ + this.retries + }) ${url} due to ${data.name} on previous request` + ); + } + return this.crawl(url, retryIndex + 1); + } + if (this.debug) { + console.error(`Unknown state during "crawl('${url})'":`, error, data); + } + + // Fail and log error + return { + sites: [], + errors: [ + { + url, + type: data.name || "UnknownStateError", + message: "An unknown error occurred.", + retries: retryIndex, + }, + ], + }; + } catch (e) { + if (this.debug) { + this.debug && console.error(e); + } + } + } /** * Gets the sites from a sitemap.xml with a given URL @@ -452,12 +452,12 @@ export default class Sitemapper { } /** - * Checks if a site is not excluded based on the exclusion patterns. - * - * @param {string} url - The URL to check. - * @returns {boolean} Returns true if the urls is not excluded, false otherwise. - */ - isNotExcluded(url) { + * Checks if a site is not excluded based on the exclusion patterns. 
+   *
+   * @param {string} url - The URL to check.
+   * @returns {boolean} Returns true if the urls is not excluded, false otherwise.
+   */
+  isNotExcluded(url) {
     if (this.exclusions.length === 0) return true;
     return !this.exclusions.some((pattern) => pattern.test(url));
   }
 }
 
 /**

From 97862f034a74e3afe07d68c09098b423d86f2b4c Mon Sep 17 00:00:00 2001
From: Adam Chapman
Date: Thu, 17 Oct 2024 09:12:04 +1100
Subject: [PATCH 6/8] test(exclusions): integration test cases

---
 src/tests/test.js | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/src/tests/test.js b/src/tests/test.js
index 910820c..e456485 100644
--- a/src/tests/test.js
+++ b/src/tests/test.js
@@ -264,6 +264,43 @@ describe('Sitemapper', function () {
     });
   });
 
+  describe('exclusions option', function () {
+    // check for the url that should be excluded in a later test
+    it('should prevent false positive', function (done) {
+      this.timeout(30000);
+      const url = 'https://wp.seantburke.com/sitemap.xml';
+      // exclude video and image sitemap index urls
+      sitemapper.exclusions = [/video/, /image/];
+      sitemapper.fetch(url)
+        .then(data => {
+          data.sites.should.be.Array;
+          data.sites.includes('https://wp.seantburke.com/?page_id=2').should.be.true();
+          done();
+        })
+        .catch(error => {
+          console.error('Test failed');
+          done(error);
+        });
+    });
+
+    it('should filter out page_id urls', function (done) {
+      this.timeout(30000);
+      const url = 'https://wp.seantburke.com/sitemap.xml';
+      // exclude page_id=2
+      sitemapper.exclusions = [/page_id/];
+      sitemapper.fetch(url)
+        .then(data => {
+          data.sites.should.be.Array;
+          data.sites.includes('https://wp.seantburke.com/?page_id=2').should.be.false();
+          done();
+        })
+        .catch(error => {
+          console.error('Test failed');
+          done(error);
+        });
+    });
+  });
+
   describe('isNotExcluded method', function () {
     it('should return true when no exclusions are set', function () {
       const result = sitemapper.isNotExcluded('https://foo.com/page1');

From 5ccfa25aac0a17178cdf7ff017bb84b191f60353 Mon Sep 17 00:00:00 2001
From: Adam Chapman
Date: Thu, 17 Oct 2024 09:13:22 +1100
Subject: [PATCH 7/8] refactor(exclusions): handles different map types

---
 src/assets/sitemapper.js | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/assets/sitemapper.js b/src/assets/sitemapper.js
index 81c5acd..1d51aa8 100644
--- a/src/assets/sitemapper.js
+++ b/src/assets/sitemapper.js
@@ -322,7 +322,9 @@ export default class Sitemapper {
 
             return modified >= this.lastmod;
           })
-          .filter(this.isNotExcluded.bind(this))
+          .filter((site) => {
+            return this.isNotExcluded(site.loc[0]);
+          })
           .map((site) => {
             if( !this.fields) {
               return site.loc && site.loc[0];
@@ -351,7 +351,9 @@ export default class Sitemapper {
         // Map each child url into a promise to create an array of promises
         const sitemap = data.sitemapindex.sitemap
           .map((map) => map.loc && map.loc[0])
-          .filter(this.isNotExcluded.bind(this));
+          .filter((url) => {
+            return this.isNotExcluded(url);
+          });
 
         // Parse all child urls within the concurrency limit in the settings

From ee8887d1e3a8cbe29a3815840a829ca6353f53aa Mon Sep 17 00:00:00 2001
From: Adam Chapman
Date: Thu, 17 Oct 2024 17:21:00 +1100
Subject: [PATCH 8/8] refactor(exclusions): uses affirmative name for isExcluded

---
 src/assets/sitemapper.js | 14 +++++++-------
 src/tests/test.js        | 42 +++++++++++++++++++---------------------
 2 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/src/assets/sitemapper.js b/src/assets/sitemapper.js
index 1d51aa8..9b32939 100644
--- a/src/assets/sitemapper.js
+++ b/src/assets/sitemapper.js
@@ -323,7 +323,7 @@ export default class Sitemapper {
             return modified >= this.lastmod;
           })
           .filter((site) => {
-            return this.isNotExcluded(site.loc[0]);
+            return !this.isExcluded(site.loc[0]);
           })
           .map((site) => {
             if( !this.fields) {
@@ -352,7 +352,7 @@ export default class Sitemapper {
         const sitemap = data.sitemapindex.sitemap
           .map((map) => map.loc && map.loc[0])
           .filter((url) => {
-            return this.isNotExcluded(url);
+            return !this.isExcluded(url);
           });
 
         // Parse all child urls within the concurrency limit in the settings
@@ -456,14 +456,14 @@ export default class Sitemapper {
   }
 
   /**
-   * Checks if a site is not excluded based on the exclusion patterns.
+   * Checks if a URL is excluded based on the exclusion patterns.
    *
    * @param {string} url - The URL to check.
-   * @returns {boolean} Returns true if the urls is not excluded, false otherwise.
+   * @returns {boolean} Returns true if the URL is excluded, false otherwise.
    */
-  isNotExcluded(url) {
-    if (this.exclusions.length === 0) return true;
-    return !this.exclusions.some((pattern) => pattern.test(url));
+  isExcluded(url) {
+    if (this.exclusions.length === 0) return false;
+    return this.exclusions.some((pattern) => pattern.test(url));
   }
 }

diff --git a/src/tests/test.js b/src/tests/test.js
index e456485..77f65c7 100644
--- a/src/tests/test.js
+++ b/src/tests/test.js
@@ -301,44 +301,44 @@ describe('Sitemapper', function () {
     });
   });
 
-  describe('isNotExcluded method', function () {
-    it('should return true when no exclusions are set', function () {
-      const result = sitemapper.isNotExcluded('https://foo.com/page1');
-      result.should.be.true();
+  describe('isExcluded method', function () {
+    it('should return false when no exclusions are set', function () {
+      const result = sitemapper.isExcluded('https://foo.com/page1');
+      result.should.be.false();
     });
 
-    it('should return true when url does not match any exclusion patterns', function () {
+    it('should return false when url does not match any exclusion patterns', function () {
       sitemapper.exclusions = [/\.pdf$/, /private/];
-      const result = sitemapper.isNotExcluded('https://foo.com/page1');
-      result.should.be.true();
+      const result = sitemapper.isExcluded('https://foo.com/page1');
+      result.should.be.false();
     });
 
-    it('should return false when url matches an exclusion pattern', function () {
+    it('should return true when url matches an exclusion pattern', function () {
       sitemapper.exclusions = [/\.pdf$/, /private/];
-      const result = sitemapper.isNotExcluded('https://foo.com/document.pdf');
-      result.should.be.false();
+      const result = sitemapper.isExcluded('https://foo.com/document.pdf');
+      result.should.be.true();
     });
 
-    it('should return false when url matches any of multiple exclusion patterns', function () {
+    it('should return true when url matches any of multiple exclusion patterns', function () {
       sitemapper.exclusions = [/\.pdf$/, /private/, /temp/];
-      const result = sitemapper.isNotExcluded('https://foo.com/private/temp.html');
-      result.should.be.false();
+      const result = sitemapper.isExcluded('https://foo.com/private/temp.html');
+      result.should.be.true();
     });
 
     it('should handle complex regex patterns correctly', function () {
       sitemapper.exclusions = [/^https:\/\/foo\.com\/([a-z]{2})\/private/]
-      const result1 = sitemapper.isNotExcluded('https://foo.com/en/private/page');
-      const result2 = sitemapper.isNotExcluded('https://foo.com/en/public/page');
-      result1.should.be.false();
-      result2.should.be.true();
+      const result1 = sitemapper.isExcluded('https://foo.com/en/private/page');
+      const result2 = sitemapper.isExcluded('https://foo.com/en/public/page');
+      result1.should.be.true();
+      result2.should.be.false();
     });
 
     it('should handle case sensitivity correctly', function () {
       sitemapper.exclusions = [/private/i];
-      const result1 = sitemapper.isNotExcluded('https://foo.com/PRIVATE/page');
-      const result2 = sitemapper.isNotExcluded('https://foo.com/Private/page');
-      result1.should.be.false();
-      result2.should.be.false();
+      const result1 = sitemapper.isExcluded('https://foo.com/PRIVATE/page');
+      const result2 = sitemapper.isExcluded('https://foo.com/Private/page');
+      result1.should.be.true();
+      result2.should.be.true();
     });
   });
 });
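
Usage note: with the full series applied, exclusion is driven by the `exclusions`
constructor option and the affirmative `isExcluded(url)` helper. A minimal sketch
of the end state, assuming the package's default export and the `fetch(url)`
promise API exercised in the integration tests above; the two patterns here are
illustrative only and are not taken from the patches:

    import Sitemapper from 'sitemapper';

    const sitemapper = new Sitemapper({
      timeout: 15000,
      // Any url matching one of these patterns is dropped from the results.
      // Example patterns only: skip PDFs and anything under /private/.
      exclusions: [/\.pdf$/, /private/],
    });

    sitemapper.fetch('https://wp.seantburke.com/sitemap.xml')
      .then(data => {
        // data.sites arrives already filtered: isExcluded() is applied inside
        // crawl() to both urlset entries and child sitemaps in a sitemapindex.
        console.log(data.sites);
      })
      .catch(error => console.error(error));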