From 38ed0d6ad90a868df9c02632334fec8db9ef29a0 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Fri, 24 May 2024 13:34:45 +0200 Subject: [PATCH] feat: Loading sitemaps from string (#2496) - closes #2460 --- packages/utils/src/internals/sitemap.ts | 133 ++++++++++++++++-------- packages/utils/test/sitemap.test.ts | 53 ++++++++++ 2 files changed, 141 insertions(+), 45 deletions(-) diff --git a/packages/utils/src/internals/sitemap.ts b/packages/utils/src/internals/sitemap.ts index d42162ff4e49..57f7315c3664 100644 --- a/packages/utils/src/internals/sitemap.ts +++ b/packages/utils/src/internals/sitemap.ts @@ -1,5 +1,5 @@ import type { Duplex } from 'node:stream'; -import { Writable } from 'node:stream'; +import { Readable, Writable } from 'node:stream'; import { StringDecoder } from 'node:string_decoder'; import { createGunzip } from 'node:zlib'; @@ -8,8 +8,10 @@ import type { SAXStream } from 'sax'; import sax from 'sax'; import MIMEType from 'whatwg-mimetype'; +type SitemapSource = { type: 'url'; url: string } | { type: 'raw'; content: string }; + class ParsingState { - sitemapUrls: string[] = []; + sources: SitemapSource[] = []; urls: string[] = []; visitedSitemapUrls: string[] = []; context?: 'sitemapindex' | 'urlset'; @@ -107,7 +109,7 @@ export class Sitemap { if (parsingState.loc) { if (parsingState.context === 'sitemapindex') { if (!parsingState.visitedSitemapUrls.includes(text)) { - parsingState.sitemapUrls.push(text); + parsingState.sources.push({ type: 'url', url: text }); } } if (parsingState.context === 'urlset') { @@ -151,60 +153,101 @@ export class Sitemap { * @param proxyUrl URL of a proxy to be used for fetching sitemap contents */ static async load(urls: string | string[], proxyUrl?: string): Promise { - const { gotScraping } = await import('got-scraping'); - const parsingState = new ParsingState(); - parsingState.sitemapUrls = Array.isArray(urls) ? urls : [urls]; + parsingState.sources = (Array.isArray(urls) ? urls : [urls]).map((url) => ({ type: 'url', url })); - while (parsingState.sitemapUrls.length > 0) { - const sitemapUrl = new URL(parsingState.sitemapUrls.pop()!); - parsingState.visitedSitemapUrls.push(sitemapUrl.toString()); - parsingState.resetContext(); + return await this.parse(parsingState, proxyUrl); + } - try { - const sitemapStream = await new Promise>((resolve, reject) => { - const request = gotScraping.stream({ url: sitemapUrl, proxyUrl, method: 'GET' }); - request.on('response', () => resolve(request)); - request.on('error', reject); - }); + /** + * Parse XML sitemap content from a string and return URLs of referenced pages. If the sitemap references other sitemaps, they will be loaded via HTTP. + * @param content XML sitemap content + * @param proxyUrl URL of a proxy to be used for fetching sitemap contents + */ + static async fromXmlString(content: string, proxyUrl?: string): Promise { + const parsingState = new ParsingState(); + parsingState.sources = [{ type: 'raw', content }]; - if (sitemapStream.response!.statusCode === 200) { - await new Promise((resolve, reject) => { - let stream: Duplex = sitemapStream; - if (sitemapUrl.pathname.endsWith('.gz')) { - stream = stream.pipe(createGunzip()).on('error', reject); - sitemapUrl.pathname = sitemapUrl.pathname.substring(0, sitemapUrl.pathname.length - 3); - } - - const parser = (() => { - const contentType = sitemapStream.response!.headers['content-type']; - let mimeType: MIMEType | null; - - try { - mimeType = new MIMEType(contentType ?? ''); - } catch (e) { - mimeType = null; - } + return await this.parse(parsingState, proxyUrl); + } - if (mimeType?.isXML() || sitemapUrl.pathname.endsWith('.xml')) { - return Sitemap.createXmlParser(parsingState, () => resolve(undefined), reject); - } + protected static async parse(parsingState: ParsingState, proxyUrl?: string): Promise { + const { gotScraping } = await import('got-scraping'); - if (mimeType?.essence === 'text/plain' || sitemapUrl.pathname.endsWith('.txt')) { - return new SitemapTxtParser(parsingState, () => resolve(undefined)); - } + while (parsingState.sources.length > 0) { + const source = parsingState.sources.pop()!; + parsingState.resetContext(); - throw new Error('Unsupported sitemap content type'); - })(); + if (source.type === 'url') { + const sitemapUrl = new URL(source.url); + parsingState.visitedSitemapUrls.push(sitemapUrl.toString()); + + try { + const sitemapStream = await new Promise>( + (resolve, reject) => { + const request = gotScraping.stream({ url: sitemapUrl, proxyUrl, method: 'GET' }); + request.on('response', () => resolve(request)); + request.on('error', reject); + }, + ); + + if (sitemapStream.response!.statusCode === 200) { + await new Promise((resolve, reject) => { + let stream: Duplex = sitemapStream; + if (sitemapUrl.pathname.endsWith('.gz')) { + stream = stream.pipe(createGunzip()).on('error', reject); + sitemapUrl.pathname = sitemapUrl.pathname.substring(0, sitemapUrl.pathname.length - 3); + } - stream.pipe(parser); - }); + stream.pipe( + this.createParser( + resolve, + reject, + parsingState, + sitemapStream.response!.headers['content-type'], + sitemapUrl, + ), + ); + }); + } + } catch (e) { + log.warning(`Malformed sitemap content: ${sitemapUrl}`); } - } catch (e) { - log.warning(`Malformed sitemap content: ${sitemapUrl}`); + } + + if (source.type === 'raw') { + await new Promise((resolve, reject) => { + Readable.from([source.content]).pipe(this.createParser(resolve, reject, parsingState, 'text/xml')); + }); } } return new Sitemap(parsingState.urls); } + + protected static createParser( + resolve: (value: unknown) => void, + reject: (value: unknown) => void, + parsingState: ParsingState, + contentType: string = '', + url?: URL, + ) { + let mimeType: MIMEType | null; + + try { + mimeType = new MIMEType(contentType); + } catch (e) { + mimeType = null; + } + + if (mimeType?.isXML() || url?.pathname.endsWith('.xml')) { + return Sitemap.createXmlParser(parsingState, () => resolve(undefined), reject); + } + + if (mimeType?.essence === 'text/plain' || url?.pathname.endsWith('.txt')) { + return new SitemapTxtParser(parsingState, () => resolve(undefined)); + } + + throw new Error('Unsupported sitemap content type'); + } } diff --git a/packages/utils/test/sitemap.test.ts b/packages/utils/test/sitemap.test.ts index d1a2eaa493e1..af403216b1d5 100644 --- a/packages/utils/test/sitemap.test.ts +++ b/packages/utils/test/sitemap.test.ts @@ -217,4 +217,57 @@ describe('Sitemap', () => { ]), ); }); + + it('loads sitemaps from string', async () => { + const sitemap = await Sitemap.fromXmlString( + [ + '', + '', + '', + 'http://not-exists.com/catalog?item=80&desc=vacation_turkey', + '2004-11-23', + '', + '', + 'http://not-exists.com/catalog?item=81&desc=vacation_maledives', + '2004-11-23', + '', + '', + ].join('\n'), + ); + + expect(new Set(sitemap.urls)).toEqual( + new Set([ + 'http://not-exists.com/catalog?item=80&desc=vacation_turkey', + 'http://not-exists.com/catalog?item=81&desc=vacation_maledives', + ]), + ); + }); + + it('loads sitemaps that reference other sitemaps from string', async () => { + const sitemap = await Sitemap.fromXmlString( + [ + '', + '', + '', + 'http://not-exists.com/sitemap_child.xml', + '2004-12-23', + '', + '', + 'http://not-exists.com/sitemap_child_2.xml?from=94937939985&to=1318570721404', + '2004-12-23', + '', + '', + ].join('\n'), + ); + + expect(new Set(sitemap.urls)).toEqual( + new Set([ + 'http://not-exists.com/', + 'http://not-exists.com/catalog?item=12&desc=vacation_hawaii', + 'http://not-exists.com/catalog?item=73&desc=vacation_new_zealand', + 'http://not-exists.com/catalog?item=74&desc=vacation_newfoundland', + 'http://not-exists.com/catalog?item=83&desc=vacation_usa', + ]), + ); + }); });