Commit 38ed0d6

feat: Loading sitemaps from string (#2496)
- closes #2460
janbuchar authored May 24, 2024
1 parent a5dca80 commit 38ed0d6
Showing 2 changed files with 141 additions and 45 deletions.
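
A quick sketch of the API surface this commit adds, before the diff. The method signatures come from the changes below; the `@crawlee/utils` import path and the example URLs are assumptions for illustration.

```ts
import { Sitemap } from '@crawlee/utils'; // assumed import path for this package

// Existing entry point: fetch one or more sitemaps over HTTP.
const loaded = await Sitemap.load('https://example.com/sitemap.xml');

// New in this commit: parse sitemap XML that is already in memory.
// References to nested sitemaps are still fetched over HTTP.
const parsed = await Sitemap.fromXmlString(
    [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
        '<url><loc>https://example.com/page</loc></url>',
        '</urlset>',
    ].join('\n'),
);

console.log(loaded.urls, parsed.urls);
```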
packages/utils/src/internals/sitemap.ts (133 changes: 88 additions & 45 deletions)
@@ -1,5 +1,5 @@
import type { Duplex } from 'node:stream';
import { Writable } from 'node:stream';
import { Readable, Writable } from 'node:stream';
import { StringDecoder } from 'node:string_decoder';
import { createGunzip } from 'node:zlib';

@@ -8,8 +8,10 @@ import type { SAXStream } from 'sax';
import sax from 'sax';
import MIMEType from 'whatwg-mimetype';

type SitemapSource = { type: 'url'; url: string } | { type: 'raw'; content: string };

class ParsingState {
sitemapUrls: string[] = [];
sources: SitemapSource[] = [];
urls: string[] = [];
visitedSitemapUrls: string[] = [];
context?: 'sitemapindex' | 'urlset';
@@ -107,7 +109,7 @@ export class Sitemap {
if (parsingState.loc) {
if (parsingState.context === 'sitemapindex') {
if (!parsingState.visitedSitemapUrls.includes(text)) {
parsingState.sitemapUrls.push(text);
parsingState.sources.push({ type: 'url', url: text });
}
}
if (parsingState.context === 'urlset') {
@@ -151,60 +153,101 @@ export class Sitemap {
* @param proxyUrl URL of a proxy to be used for fetching sitemap contents
*/
static async load(urls: string | string[], proxyUrl?: string): Promise<Sitemap> {
const { gotScraping } = await import('got-scraping');

const parsingState = new ParsingState();
parsingState.sitemapUrls = Array.isArray(urls) ? urls : [urls];
parsingState.sources = (Array.isArray(urls) ? urls : [urls]).map((url) => ({ type: 'url', url }));

while (parsingState.sitemapUrls.length > 0) {
const sitemapUrl = new URL(parsingState.sitemapUrls.pop()!);
parsingState.visitedSitemapUrls.push(sitemapUrl.toString());
parsingState.resetContext();
return await this.parse(parsingState, proxyUrl);
}

try {
const sitemapStream = await new Promise<ReturnType<typeof gotScraping.stream>>((resolve, reject) => {
const request = gotScraping.stream({ url: sitemapUrl, proxyUrl, method: 'GET' });
request.on('response', () => resolve(request));
request.on('error', reject);
});
/**
* Parse XML sitemap content from a string and return URLs of referenced pages. If the sitemap references other sitemaps, they will be loaded via HTTP.
* @param content XML sitemap content
* @param proxyUrl URL of a proxy to be used for fetching sitemap contents
*/
static async fromXmlString(content: string, proxyUrl?: string): Promise<Sitemap> {
const parsingState = new ParsingState();
parsingState.sources = [{ type: 'raw', content }];

if (sitemapStream.response!.statusCode === 200) {
await new Promise((resolve, reject) => {
let stream: Duplex = sitemapStream;
if (sitemapUrl.pathname.endsWith('.gz')) {
stream = stream.pipe(createGunzip()).on('error', reject);
sitemapUrl.pathname = sitemapUrl.pathname.substring(0, sitemapUrl.pathname.length - 3);
}

const parser = (() => {
const contentType = sitemapStream.response!.headers['content-type'];
let mimeType: MIMEType | null;

try {
mimeType = new MIMEType(contentType ?? '');
} catch (e) {
mimeType = null;
}
return await this.parse(parsingState, proxyUrl);
}

if (mimeType?.isXML() || sitemapUrl.pathname.endsWith('.xml')) {
return Sitemap.createXmlParser(parsingState, () => resolve(undefined), reject);
}
protected static async parse(parsingState: ParsingState, proxyUrl?: string): Promise<Sitemap> {
const { gotScraping } = await import('got-scraping');

if (mimeType?.essence === 'text/plain' || sitemapUrl.pathname.endsWith('.txt')) {
return new SitemapTxtParser(parsingState, () => resolve(undefined));
}
while (parsingState.sources.length > 0) {
const source = parsingState.sources.pop()!;
parsingState.resetContext();

throw new Error('Unsupported sitemap content type');
})();
if (source.type === 'url') {
const sitemapUrl = new URL(source.url);
parsingState.visitedSitemapUrls.push(sitemapUrl.toString());

try {
const sitemapStream = await new Promise<ReturnType<typeof gotScraping.stream>>(
(resolve, reject) => {
const request = gotScraping.stream({ url: sitemapUrl, proxyUrl, method: 'GET' });
request.on('response', () => resolve(request));
request.on('error', reject);
},
);

if (sitemapStream.response!.statusCode === 200) {
await new Promise((resolve, reject) => {
let stream: Duplex = sitemapStream;
if (sitemapUrl.pathname.endsWith('.gz')) {
stream = stream.pipe(createGunzip()).on('error', reject);
sitemapUrl.pathname = sitemapUrl.pathname.substring(0, sitemapUrl.pathname.length - 3);
}

stream.pipe(parser);
});
stream.pipe(
this.createParser(
resolve,
reject,
parsingState,
sitemapStream.response!.headers['content-type'],
sitemapUrl,
),
);
});
}
} catch (e) {
log.warning(`Malformed sitemap content: ${sitemapUrl}`);
}
} catch (e) {
log.warning(`Malformed sitemap content: ${sitemapUrl}`);
}

if (source.type === 'raw') {
await new Promise((resolve, reject) => {
Readable.from([source.content]).pipe(this.createParser(resolve, reject, parsingState, 'text/xml'));
});
}
}

return new Sitemap(parsingState.urls);
}

protected static createParser(
resolve: (value: unknown) => void,
reject: (value: unknown) => void,
parsingState: ParsingState,
contentType: string = '',
url?: URL,
) {
let mimeType: MIMEType | null;

try {
mimeType = new MIMEType(contentType);
} catch (e) {
mimeType = null;
}

if (mimeType?.isXML() || url?.pathname.endsWith('.xml')) {
return Sitemap.createXmlParser(parsingState, () => resolve(undefined), reject);
}

if (mimeType?.essence === 'text/plain' || url?.pathname.endsWith('.txt')) {
return new SitemapTxtParser(parsingState, () => resolve(undefined));
}

throw new Error('Unsupported sitemap content type');
}
}
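
The `raw` branch above works because Node's `Readable.from()` turns an array of string chunks into a readable stream, letting in-memory content flow through the same streaming parsers as an HTTP response body. A self-contained sketch of the pattern, with deliberately simplified `<loc>` handling that is not the actual `createXmlParser` implementation:

```ts
import { Readable } from 'node:stream';
import sax from 'sax';

const urls: string[] = [];
let insideLoc = false;

// A bare-bones SAX pipeline that collects <loc> text nodes.
const parser = sax.createStream(true);
parser.on('opentag', (tag) => {
    insideLoc = tag.name === 'loc';
});
parser.on('closetag', () => {
    insideLoc = false;
});
parser.on('text', (text) => {
    if (insideLoc && text.trim().length > 0) urls.push(text.trim());
});
parser.on('end', () => {
    console.log(urls); // ['https://example.com/']
});

// The same trick parse() uses: wrap the string in a stream and pipe it in.
Readable.from(['<urlset><url><loc>https://example.com/</loc></url></urlset>']).pipe(parser);
```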
packages/utils/test/sitemap.test.ts (53 changes: 53 additions & 0 deletions)
@@ -217,4 +217,57 @@ describe('Sitemap', () => {
]),
);
});

it('loads sitemaps from string', async () => {
const sitemap = await Sitemap.fromXmlString(
[
'<?xml version="1.0" encoding="UTF-8"?>',
'<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
'<url>',
'<loc>http://not-exists.com/catalog?item=80&amp;desc=vacation_turkey</loc>',
'<lastmod>2004-11-23</lastmod>',
'</url>',
'<url>',
'<loc>http://not-exists.com/catalog?item=81&amp;desc=vacation_maledives</loc>',
'<lastmod>2004-11-23</lastmod>',
'</url>',
'</urlset>',
].join('\n'),
);

expect(new Set(sitemap.urls)).toEqual(
new Set([
'http://not-exists.com/catalog?item=80&desc=vacation_turkey',
'http://not-exists.com/catalog?item=81&desc=vacation_maledives',
]),
);
});

it('loads sitemaps that reference other sitemaps from string', async () => {
const sitemap = await Sitemap.fromXmlString(
[
'<?xml version="1.0" encoding="UTF-8"?>',
'<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
'<sitemap>',
'<loc>http://not-exists.com/sitemap_child.xml</loc>',
'<lastmod>2004-12-23</lastmod>',
'</sitemap>',
'<sitemap>',
'<loc>http://not-exists.com/sitemap_child_2.xml?from=94937939985&amp;to=1318570721404</loc>',
'<lastmod>2004-12-23</lastmod>',
'</sitemap>',
'</sitemapindex>',
].join('\n'),
);

expect(new Set(sitemap.urls)).toEqual(
new Set([
'http://not-exists.com/',
'http://not-exists.com/catalog?item=12&desc=vacation_hawaii',
'http://not-exists.com/catalog?item=73&desc=vacation_new_zealand',
'http://not-exists.com/catalog?item=74&desc=vacation_newfoundland',
'http://not-exists.com/catalog?item=83&desc=vacation_usa',
]),
);
});
});
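
Also worth noting from the first file: the URL branch of `parse()` still pipes `.gz` responses through `createGunzip()` and strips the suffix before choosing between the XML and TXT parsers, so compressed sitemaps need no special handling by callers. A hypothetical call (URL and import path assumed, as above):

```ts
import { Sitemap } from '@crawlee/utils';

// Decompression and content-type detection happen inside load().
const sitemap = await Sitemap.load('https://example.com/sitemap.xml.gz');
console.log(sitemap.urls);
```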
