Commit 38ed0d6

feat: Loading sitemaps from string (#2496)
- closes #2460
janbuchar authored May 24, 2024
1 parent a5dca80 commit 38ed0d6
Showing 2 changed files with 141 additions and 45 deletions.
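
A quick sketch of the API surface this commit adds, before the diff. The method signatures come from the changes below; the `@crawlee/utils` import path and the example URLs are assumptions for illustration.

```ts
import { Sitemap } from '@crawlee/utils'; // assumed import path for this package

// Existing entry point: fetch one or more sitemaps over HTTP.
const loaded = await Sitemap.load('https://example.com/sitemap.xml');

// New in this commit: parse sitemap XML that is already in memory.
// References to nested sitemaps are still fetched over HTTP.
const parsed = await Sitemap.fromXmlString(
    [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
        '<url><loc>https://example.com/page</loc></url>',
        '</urlset>',
    ].join('\n'),
);

console.log(loaded.urls, parsed.urls);
```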
packages/utils/src/internals/sitemap.ts (133 changes: 88 additions & 45 deletions)
@@ -1,5 +1,5 @@
import type { Duplex } from 'node:stream';
import { Writable } from 'node:stream';
import { Readable, Writable } from 'node:stream';
import { StringDecoder } from 'node:string_decoder';
import { createGunzip } from 'node:zlib';

@@ -8,8 +8,10 @@ import type { SAXStream } from 'sax';
import sax from 'sax';
import MIMEType from 'whatwg-mimetype';

type SitemapSource = { type: 'url'; url: string } | { type: 'raw'; content: string };

class ParsingState {
sitemapUrls: string[] = [];
sources: SitemapSource[] = [];
urls: string[] = [];
visitedSitemapUrls: string[] = [];
context?: 'sitemapindex' | 'urlset';
@@ -107,7 +109,7 @@ export class Sitemap {
if (parsingState.loc) {
if (parsingState.context === 'sitemapindex') {
if (!parsingState.visitedSitemapUrls.includes(text)) {
parsingState.sitemapUrls.push(text);
parsingState.sources.push({ type: 'url', url: text });
}
}
if (parsingState.context === 'urlset') {
@@ -151,60 +153,101 @@ export class Sitemap {
* @param proxyUrl URL of a proxy to be used for fetching sitemap contents
*/
static async load(urls: string | string[], proxyUrl?: string): Promise<Sitemap> {
const { gotScraping } = await import('got-scraping');

const parsingState = new ParsingState();
parsingState.sitemapUrls = Array.isArray(urls) ? urls : [urls];
parsingState.sources = (Array.isArray(urls) ? urls : [urls]).map((url) => ({ type: 'url', url }));

while (parsingState.sitemapUrls.length > 0) {
const sitemapUrl = new URL(parsingState.sitemapUrls.pop()!);
parsingState.visitedSitemapUrls.push(sitemapUrl.toString());
parsingState.resetContext();
return await this.parse(parsingState, proxyUrl);
}

try {
const sitemapStream = await new Promise<ReturnType<typeof gotScraping.stream>>((resolve, reject) => {
const request = gotScraping.stream({ url: sitemapUrl, proxyUrl, method: 'GET' });
request.on('response', () => resolve(request));
request.on('error', reject);
});
/**
* Parse XML sitemap content from a string and return URLs of referenced pages. If the sitemap references other sitemaps, they will be loaded via HTTP.
* @param content XML sitemap content
* @param proxyUrl URL of a proxy to be used for fetching sitemap contents
*/
static async fromXmlString(content: string, proxyUrl?: string): Promise<Sitemap> {
const parsingState = new ParsingState();
parsingState.sources = [{ type: 'raw', content }];

if (sitemapStream.response!.statusCode === 200) {
await new Promise((resolve, reject) => {
let stream: Duplex = sitemapStream;
if (sitemapUrl.pathname.endsWith('.gz')) {
stream = stream.pipe(createGunzip()).on('error', reject);
sitemapUrl.pathname = sitemapUrl.pathname.substring(0, sitemapUrl.pathname.length - 3);
}

const parser = (() => {
const contentType = sitemapStream.response!.headers['content-type'];
let mimeType: MIMEType | null;

try {
mimeType = new MIMEType(contentType ?? '');
} catch (e) {
mimeType = null;
}
return await this.parse(parsingState, proxyUrl);
}

if (mimeType?.isXML() || sitemapUrl.pathname.endsWith('.xml')) {
return Sitemap.createXmlParser(parsingState, () => resolve(undefined), reject);
}
protected static async parse(parsingState: ParsingState, proxyUrl?: string): Promise<Sitemap> {
const { gotScraping } = await import('got-scraping');

if (mimeType?.essence === 'text/plain' || sitemapUrl.pathname.endsWith('.txt')) {
return new SitemapTxtParser(parsingState, () => resolve(undefined));
}
while (parsingState.sources.length > 0) {
const source = parsingState.sources.pop()!;
parsingState.resetContext();

throw new Error('Unsupported sitemap content type');
})();
if (source.type === 'url') {
const sitemapUrl = new URL(source.url);
parsingState.visitedSitemapUrls.push(sitemapUrl.toString());

try {
const sitemapStream = await new Promise<ReturnType<typeof gotScraping.stream>>(
(resolve, reject) => {
const request = gotScraping.stream({ url: sitemapUrl, proxyUrl, method: 'GET' });
request.on('response', () => resolve(request));
request.on('error', reject);
},
);

if (sitemapStream.response!.statusCode === 200) {
await new Promise((resolve, reject) => {
let stream: Duplex = sitemapStream;
if (sitemapUrl.pathname.endsWith('.gz')) {
stream = stream.pipe(createGunzip()).on('error', reject);
sitemapUrl.pathname = sitemapUrl.pathname.substring(0, sitemapUrl.pathname.length - 3);
}

stream.pipe(parser);
});
stream.pipe(
this.createParser(
resolve,
reject,
parsingState,
sitemapStream.response!.headers['content-type'],
sitemapUrl,
),
);
});
}
} catch (e) {
log.warning(`Malformed sitemap content: ${sitemapUrl}`);
}
} catch (e) {
log.warning(`Malformed sitemap content: ${sitemapUrl}`);
}

if (source.type === 'raw') {
await new Promise((resolve, reject) => {
Readable.from([source.content]).pipe(this.createParser(resolve, reject, parsingState, 'text/xml'));
});
}
}

return new Sitemap(parsingState.urls);
}

protected static createParser(
resolve: (value: unknown) => void,
reject: (value: unknown) => void,
parsingState: ParsingState,
contentType: string = '',
url?: URL,
) {
let mimeType: MIMEType | null;

try {
mimeType = new MIMEType(contentType);
} catch (e) {
mimeType = null;
}

if (mimeType?.isXML() || url?.pathname.endsWith('.xml')) {
return Sitemap.createXmlParser(parsingState, () => resolve(undefined), reject);
}

if (mimeType?.essence === 'text/plain' || url?.pathname.endsWith('.txt')) {
return new SitemapTxtParser(parsingState, () => resolve(undefined));
}

throw new Error('Unsupported sitemap content type');
}
}
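
The `raw` branch above works because Node's `Readable.from()` turns an array of string chunks into a readable stream, letting in-memory content flow through the same streaming parsers as an HTTP response body. A self-contained sketch of the pattern, with deliberately simplified `<loc>` handling that is not the actual `createXmlParser` implementation:

```ts
import { Readable } from 'node:stream';
import sax from 'sax';

const urls: string[] = [];
let insideLoc = false;

// A bare-bones SAX pipeline that collects <loc> text nodes.
const parser = sax.createStream(true);
parser.on('opentag', (tag) => {
    insideLoc = tag.name === 'loc';
});
parser.on('closetag', () => {
    insideLoc = false;
});
parser.on('text', (text) => {
    if (insideLoc && text.trim().length > 0) urls.push(text.trim());
});
parser.on('end', () => {
    console.log(urls); // ['https://example.com/']
});

// The same trick parse() uses: wrap the string in a stream and pipe it in.
Readable.from(['<urlset><url><loc>https://example.com/</loc></url></urlset>']).pipe(parser);
```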
packages/utils/test/sitemap.test.ts (53 changes: 53 additions & 0 deletions)
@@ -217,4 +217,57 @@ describe('Sitemap', () => {
]),
);
});

it('loads sitemaps from string', async () => {
const sitemap = await Sitemap.fromXmlString(
[
'<?xml version="1.0" encoding="UTF-8"?>',
'<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
'<url>',
'<loc>http://not-exists.com/catalog?item=80&amp;desc=vacation_turkey</loc>',
'<lastmod>2004-11-23</lastmod>',
'</url>',
'<url>',
'<loc>http://not-exists.com/catalog?item=81&amp;desc=vacation_maledives</loc>',
'<lastmod>2004-11-23</lastmod>',
'</url>',
'</urlset>',
].join('\n'),
);

expect(new Set(sitemap.urls)).toEqual(
new Set([
'http://not-exists.com/catalog?item=80&desc=vacation_turkey',
'http://not-exists.com/catalog?item=81&desc=vacation_maledives',
]),
);
});

it('loads sitemaps that reference other sitemaps from string', async () => {
const sitemap = await Sitemap.fromXmlString(
[
'<?xml version="1.0" encoding="UTF-8"?>',
'<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
'<sitemap>',
'<loc>http://not-exists.com/sitemap_child.xml</loc>',
'<lastmod>2004-12-23</lastmod>',
'</sitemap>',
'<sitemap>',
'<loc>http://not-exists.com/sitemap_child_2.xml?from=94937939985&amp;to=1318570721404</loc>',
'<lastmod>2004-12-23</lastmod>',
'</sitemap>',
'</sitemapindex>',
].join('\n'),
);

expect(new Set(sitemap.urls)).toEqual(
new Set([
'http://not-exists.com/',
'http://not-exists.com/catalog?item=12&desc=vacation_hawaii',
'http://not-exists.com/catalog?item=73&desc=vacation_new_zealand',
'http://not-exists.com/catalog?item=74&desc=vacation_newfoundland',
'http://not-exists.com/catalog?item=83&desc=vacation_usa',
]),
);
});
});
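
Also worth noting from the first file: the URL branch of `parse()` still pipes `.gz` responses through `createGunzip()` and strips the suffix before choosing between the XML and TXT parsers, so compressed sitemaps need no special handling by callers. A hypothetical call (URL and import path assumed, as above):

```ts
import { Sitemap } from '@crawlee/utils';

// Decompression and content-type detection happen inside load().
const sitemap = await Sitemap.load('https://example.com/sitemap.xml.gz');
console.log(sitemap.urls);
```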
