From 73e67d72ac678797fbcc49debf86f24586e6db1a Mon Sep 17 00:00:00 2001 From: Nicolas Peltier <1032754+npeltier@users.noreply.github.com> Date: Tue, 9 Jul 2024 14:31:38 +0200 Subject: [PATCH] MWPW-153658 audit script (#25) - parse one or several url or sitemap files (can be added in manifest), - for each url from sitemap, parse page for personalization fragment or fragment, - for each ost link found in html, keep usage, - for each osi, get wcs entry, - output everything as CSV --- misc/README.md | 20 +++ misc/audit-manifest.txt | 186 +++++++++++++++++++++++++++ misc/audit.mjs | 276 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 482 insertions(+) create mode 100644 misc/README.md create mode 100644 misc/audit-manifest.txt create mode 100644 misc/audit.mjs diff --git a/misc/README.md b/misc/README.md new file mode 100644 index 00000000..5c3e2cb7 --- /dev/null +++ b/misc/README.md @@ -0,0 +1,20 @@ +# Miscellaneous + +## Audit script + +script that crawls through the raw EDS HTML versions of pages looking for OST links and outputs a CSV report + + for one or more pages: + `node audit.mjs https://www.adobe.com/page1.html https://www.adobe.com/page2.html` + + for one or more sitemaps: + `node audit.mjs https://www.adobe.com/cc-shared/assets/sitemap.xml` + + in either case the output is written as CSV to /tmp/audit.csv; you can set a different output file with the -f parameter + + for a fully configured set of URLs and/or sitemaps you can use the -m parameter with a manifest composed of URLs/sitemaps separated by newline characters + + for faster execution, the buffer size (number of pages audited in parallel) can be increased with the -b parameter, at the cost of more load on your connection (you can also set it lower than the default, which is 100). 
+ + so typical execution could be + `node audit.mjs -b 50 -f ~/Documents/audit.csv -m ./audit-manifest.txt` \ No newline at end of file diff --git a/misc/audit-manifest.txt b/misc/audit-manifest.txt new file mode 100644 index 00000000..0b115034 --- /dev/null +++ b/misc/audit-manifest.txt @@ -0,0 +1,186 @@ +https://www.adobe.com/cc-shared/assets/sitemap.xml +https://www.adobe.com/au/cc-shared/assets/sitemap.xml +https://www.adobe.com/cn/cc-shared/assets/sitemap.xml +https://www.adobe.com/hk_en/cc-shared/assets/sitemap.xml +https://www.adobe.com/hk_zh/cc-shared/assets/sitemap.xml +https://www.adobe.com/id_en/cc-shared/assets/sitemap.xml +https://www.adobe.com/id_id/cc-shared/assets/sitemap.xml +https://www.adobe.com/in/cc-shared/assets/sitemap.xml +https://www.adobe.com/in_hi/cc-shared/assets/sitemap.xml +https://www.adobe.com/kr/cc-shared/assets/sitemap.xml +https://www.adobe.com/my_en/cc-shared/assets/sitemap.xml +https://www.adobe.com/my_ms/cc-shared/assets/sitemap.xml +https://www.adobe.com/nz/cc-shared/assets/sitemap.xml +https://www.adobe.com/ph_en/cc-shared/assets/sitemap.xml +https://www.adobe.com/ph_fil/cc-shared/assets/sitemap.xml +https://www.adobe.com/sg/cc-shared/assets/sitemap.xml +https://www.adobe.com/th_en/cc-shared/assets/sitemap.xml +https://www.adobe.com/th_th/cc-shared/assets/sitemap.xml +https://www.adobe.com/tw/cc-shared/assets/sitemap.xml +https://www.adobe.com/vn_en/cc-shared/assets/sitemap.xml +https://www.adobe.com/vn_vi/cc-shared/assets/sitemap.xml +https://www.adobe.com/ae_en/cc-shared/assets/sitemap.xml +https://www.adobe.com/ae_ar/cc-shared/assets/sitemap.xml +https://www.adobe.com/africa/cc-shared/assets/sitemap.xml +https://www.adobe.com/at/cc-shared/assets/sitemap.xml +https://www.adobe.com/be_en/cc-shared/assets/sitemap.xml +https://www.adobe.com/be_fr/cc-shared/assets/sitemap.xml +https://www.adobe.com/be_nl/cc-shared/assets/sitemap.xml +https://www.adobe.com/bg/cc-shared/assets/sitemap.xml 
+https://www.adobe.com/ch_de/cc-shared/assets/sitemap.xml +https://www.adobe.com/ch_fr/cc-shared/assets/sitemap.xml +https://www.adobe.com/ch_it/cc-shared/assets/sitemap.xml +https://www.adobe.com/cis_en/cc-shared/assets/sitemap.xml +https://www.adobe.com/cis_ru/cc-shared/assets/sitemap.xml +https://www.adobe.com/cz/cc-shared/assets/sitemap.xml +https://www.adobe.com/de/cc-shared/assets/sitemap.xml +https://www.adobe.com/dk/cc-shared/assets/sitemap.xml +https://www.adobe.com/ee/cc-shared/assets/sitemap.xml +https://www.adobe.com/eg_ar/cc-shared/assets/sitemap.xml +https://www.adobe.com/eg_en/cc-shared/assets/sitemap.xml +https://www.adobe.com/es/cc-shared/assets/sitemap.xml +https://www.adobe.com/fi/cc-shared/assets/sitemap.xml +https://www.adobe.com/fr/cc-shared/assets/sitemap.xml +https://www.adobe.com/gr_el/cc-shared/assets/sitemap.xml +https://www.adobe.com/gr_en/cc-shared/assets/sitemap.xml +https://www.adobe.com/hu/cc-shared/assets/sitemap.xml +https://www.adobe.com/ie/cc-shared/assets/sitemap.xml +https://www.adobe.com/il_en/cc-shared/assets/sitemap.xml +https://www.adobe.com/il_he/cc-shared/assets/sitemap.xml +https://www.adobe.com/iq/cc-shared/assets/sitemap.xml +https://www.adobe.com/is/cc-shared/assets/sitemap.xml +https://www.adobe.com/it/cc-shared/assets/sitemap.xml +https://www.adobe.com/kw_ar/cc-shared/assets/sitemap.xml +https://www.adobe.com/kw_en/cc-shared/assets/sitemap.xml +https://www.adobe.com/lt/cc-shared/assets/sitemap.xml +https://www.adobe.com/lu_de/cc-shared/assets/sitemap.xml +https://www.adobe.com/lu_en/cc-shared/assets/sitemap.xml +https://www.adobe.com/lu_fr/cc-shared/assets/sitemap.xml +https://www.adobe.com/lv/cc-shared/assets/sitemap.xml +https://www.adobe.com/mena_ar/cc-shared/assets/sitemap.xml +https://www.adobe.com/mena_en/cc-shared/assets/sitemap.xml +https://www.adobe.com/ng/cc-shared/assets/sitemap.xml +https://www.adobe.com/nl/cc-shared/assets/sitemap.xml +https://www.adobe.com/no/cc-shared/assets/sitemap.xml 
+https://www.adobe.com/pl/cc-shared/assets/sitemap.xml +https://www.adobe.com/pt/cc-shared/assets/sitemap.xml +https://www.adobe.com/qa_ar/cc-shared/assets/sitemap.xml +https://www.adobe.com/qa_en/cc-shared/assets/sitemap.xml +https://www.adobe.com/ro/cc-shared/assets/sitemap.xml +https://www.adobe.com/ru/cc-shared/assets/sitemap.xml +https://www.adobe.com/sa_en/cc-shared/assets/sitemap.xml +https://www.adobe.com/sa_ar/cc-shared/assets/sitemap.xml +https://www.adobe.com/se/cc-shared/assets/sitemap.xml +https://www.adobe.com/si/cc-shared/assets/sitemap.xml +https://www.adobe.com/sk/cc-shared/assets/sitemap.xml +https://www.adobe.com/tr/cc-shared/assets/sitemap.xml +https://www.adobe.com/ua/cc-shared/assets/sitemap.xml +https://www.adobe.com/uk/cc-shared/assets/sitemap.xml +https://www.adobe.com/za/cc-shared/assets/sitemap.xml +https://www.adobe.com/ar/cc-shared/assets/sitemap.xml +https://www.adobe.com/br/cc-shared/assets/sitemap.xml +https://www.adobe.com/ca/cc-shared/assets/sitemap.xml +https://www.adobe.com/ca_fr/cc-shared/assets/sitemap.xml +https://www.adobe.com/cl/cc-shared/assets/sitemap.xml +https://www.adobe.com/co/cc-shared/assets/sitemap.xml +https://www.adobe.com/cr/cc-shared/assets/sitemap.xml +https://www.adobe.com/ec/cc-shared/assets/sitemap.xml +https://www.adobe.com/gt/cc-shared/assets/sitemap.xml +https://www.adobe.com/la/cc-shared/assets/sitemap.xml +https://www.adobe.com/mx/cc-shared/assets/sitemap.xml +https://www.adobe.com/pe/cc-shared/assets/sitemap.xml +https://www.adobe.com/pr/cc-shared/assets/sitemap.xml +https://www.adobe.com/jp/cc-shared/assets/sitemap.xml +https://www.adobe.com/dc-shared/assets/sitemap.xml +https://www.adobe.com/au/dc-shared/assets/sitemap.xml +https://www.adobe.com/cn/dc-shared/assets/sitemap.xml +https://www.adobe.com/hk_en/dc-shared/assets/sitemap.xml +https://www.adobe.com/hk_zh/dc-shared/assets/sitemap.xml +https://www.adobe.com/id_en/dc-shared/assets/sitemap.xml 
+https://www.adobe.com/id_id/dc-shared/assets/sitemap.xml +https://www.adobe.com/in/dc-shared/assets/sitemap.xml +https://www.adobe.com/in_hi/dc-shared/assets/sitemap.xml +https://www.adobe.com/kr/dc-shared/assets/sitemap.xml +https://www.adobe.com/my_en/dc-shared/assets/sitemap.xml +https://www.adobe.com/my_ms/dc-shared/assets/sitemap.xml +https://www.adobe.com/nz/dc-shared/assets/sitemap.xml +https://www.adobe.com/ph_en/dc-shared/assets/sitemap.xml +https://www.adobe.com/ph_fil/dc-shared/assets/sitemap.xml +https://www.adobe.com/sg/dc-shared/assets/sitemap.xml +https://www.adobe.com/th_en/dc-shared/assets/sitemap.xml +https://www.adobe.com/th_th/dc-shared/assets/sitemap.xml +https://www.adobe.com/tw/dc-shared/assets/sitemap.xml +https://www.adobe.com/vn_en/dc-shared/assets/sitemap.xml +https://www.adobe.com/vn_vi/dc-shared/assets/sitemap.xml +https://www.adobe.com/ae_en/dc-shared/assets/sitemap.xml +https://www.adobe.com/ae_ar/dc-shared/assets/sitemap.xml +https://www.adobe.com/africa/dc-shared/assets/sitemap.xml +https://www.adobe.com/at/dc-shared/assets/sitemap.xml +https://www.adobe.com/be_en/dc-shared/assets/sitemap.xml +https://www.adobe.com/be_fr/dc-shared/assets/sitemap.xml +https://www.adobe.com/be_nl/dc-shared/assets/sitemap.xml +https://www.adobe.com/bg/dc-shared/assets/sitemap.xml +https://www.adobe.com/ch_de/dc-shared/assets/sitemap.xml +https://www.adobe.com/ch_fr/dc-shared/assets/sitemap.xml +https://www.adobe.com/ch_it/dc-shared/assets/sitemap.xml +https://www.adobe.com/cis_en/dc-shared/assets/sitemap.xml +https://www.adobe.com/cis_ru/dc-shared/assets/sitemap.xml +https://www.adobe.com/cz/dc-shared/assets/sitemap.xml +https://www.adobe.com/de/dc-shared/assets/sitemap.xml +https://www.adobe.com/dk/dc-shared/assets/sitemap.xml +https://www.adobe.com/ee/dc-shared/assets/sitemap.xml +https://www.adobe.com/eg_ar/dc-shared/assets/sitemap.xml +https://www.adobe.com/eg_en/dc-shared/assets/sitemap.xml +https://www.adobe.com/es/dc-shared/assets/sitemap.xml 
+https://www.adobe.com/fi/dc-shared/assets/sitemap.xml +https://www.adobe.com/fr/dc-shared/assets/sitemap.xml +https://www.adobe.com/gr_el/dc-shared/assets/sitemap.xml +https://www.adobe.com/gr_en/dc-shared/assets/sitemap.xml +https://www.adobe.com/hu/dc-shared/assets/sitemap.xml +https://www.adobe.com/ie/dc-shared/assets/sitemap.xml +https://www.adobe.com/il_en/dc-shared/assets/sitemap.xml +https://www.adobe.com/il_he/dc-shared/assets/sitemap.xml +https://www.adobe.com/iq/dc-shared/assets/sitemap.xml +https://www.adobe.com/is/dc-shared/assets/sitemap.xml +https://www.adobe.com/it/dc-shared/assets/sitemap.xml +https://www.adobe.com/kw_ar/dc-shared/assets/sitemap.xml +https://www.adobe.com/kw_en/dc-shared/assets/sitemap.xml +https://www.adobe.com/lt/dc-shared/assets/sitemap.xml +https://www.adobe.com/lu_de/dc-shared/assets/sitemap.xml +https://www.adobe.com/lu_en/dc-shared/assets/sitemap.xml +https://www.adobe.com/lu_fr/dc-shared/assets/sitemap.xml +https://www.adobe.com/lv/dc-shared/assets/sitemap.xml +https://www.adobe.com/mena_ar/dc-shared/assets/sitemap.xml +https://www.adobe.com/mena_en/dc-shared/assets/sitemap.xml +https://www.adobe.com/ng/dc-shared/assets/sitemap.xml +https://www.adobe.com/nl/dc-shared/assets/sitemap.xml +https://www.adobe.com/no/dc-shared/assets/sitemap.xml +https://www.adobe.com/pl/dc-shared/assets/sitemap.xml +https://www.adobe.com/pt/dc-shared/assets/sitemap.xml +https://www.adobe.com/qa_ar/dc-shared/assets/sitemap.xml +https://www.adobe.com/qa_en/dc-shared/assets/sitemap.xml +https://www.adobe.com/ro/dc-shared/assets/sitemap.xml +https://www.adobe.com/ru/dc-shared/assets/sitemap.xml +https://www.adobe.com/sa_en/dc-shared/assets/sitemap.xml +https://www.adobe.com/sa_ar/dc-shared/assets/sitemap.xml +https://www.adobe.com/se/dc-shared/assets/sitemap.xml +https://www.adobe.com/si/dc-shared/assets/sitemap.xml +https://www.adobe.com/sk/dc-shared/assets/sitemap.xml +https://www.adobe.com/tr/dc-shared/assets/sitemap.xml 
/*
 * NOTE(review): this file is a `git format-patch` export whose line breaks were
 * collapsed. The last entries of misc/audit-manifest.txt and the diff header of
 * misc/audit.mjs shared a physical line with the start of the script; they are
 * kept verbatim here so that reformatting the script does not drop them.

+https://www.adobe.com/ua/dc-shared/assets/sitemap.xml
+https://www.adobe.com/uk/dc-shared/assets/sitemap.xml
+https://www.adobe.com/za/dc-shared/assets/sitemap.xml
+https://www.adobe.com/ar/dc-shared/assets/sitemap.xml
+https://www.adobe.com/br/dc-shared/assets/sitemap.xml
+https://www.adobe.com/ca/dc-shared/assets/sitemap.xml
+https://www.adobe.com/ca_fr/dc-shared/assets/sitemap.xml
+https://www.adobe.com/cl/dc-shared/assets/sitemap.xml
+https://www.adobe.com/co/dc-shared/assets/sitemap.xml
+https://www.adobe.com/cr/dc-shared/assets/sitemap.xml
+https://www.adobe.com/ec/dc-shared/assets/sitemap.xml
+https://www.adobe.com/gt/dc-shared/assets/sitemap.xml
+https://www.adobe.com/la/dc-shared/assets/sitemap.xml
+https://www.adobe.com/mx/dc-shared/assets/sitemap.xml
+https://www.adobe.com/pe/dc-shared/assets/sitemap.xml
+https://www.adobe.com/pr/dc-shared/assets/sitemap.xml
+https://www.adobe.com/jp/dc-shared/assets/sitemap.xml
\ No newline at end of file
diff --git a/misc/audit.mjs b/misc/audit.mjs
new file mode 100644
index 00000000..d4de0806
--- /dev/null
+++ b/misc/audit.mjs
@@ -0,0 +1,276 @@

 * ---------------------------------------------------------------------------
 * misc/audit.mjs
 *
 * Crawls pages (given directly, through sitemaps, or through a manifest file),
 * follows their fragments and personalization manifests, records every OST
 * link found, resolves each offer selector id (osi) against WCS and writes the
 * whole lot as a CSV report.
 */
import fs from 'fs';

const BUFFER_SIZE = 100;
const EXCERPT_SIZE = 30;
// Upper bound on per-page retries; the retry delay grows linearly with the attempt.
const MAX_RETRIES = 3;
const RETRY_DELAY_MS = 1000;
const BUFFER_ARG = '-b';
const FILE_ARG = '-f';
const MF_ARG = '-m';
// NOTE(review): the capture-group names below are restored from the places that
// destructure them (`url`, `urls`, `left`, `right`, `parameters`). The
// surrounding tags (<loc>, <a>, <meta name="personalization">) had been stripped
// from the text this file was recovered from and are reconstructed; confirm
// them against the upstream script.
const HREF_REGEXP = 'href="(?<url>[^"]+)"';
const LOC_REGEXP = '<loc>(?<url>[^<]+)</loc>';
const PERSO_REGEXP = '<meta name="personalization" content="(?<urls>[^"]+)"';
const LINK_REGEXP = '<a[^>]*' + HREF_REGEXP + '[^>]*>[^<]*</a>';
const PARAMETER_REGEXP = '(?<left>\\w+)=(?<right>[^&]+)';
const DOMAIN_REGEXP = '^https://[^/]+';
const HREF_REGEXPS = {
  fragment: '/fragments/',
  // The "?" is left unescaped as in the original pattern, so the captured
  // parameter string starts with "?"; PARAMETER_REGEXP copes with that.
  ost: 'https://milo.adobe.com/tools/ost?(?<parameters>.+)',
};
const LOCALES = ['au', 'cn', 'hk_en', 'hk_zh', 'id_en', 'id_id', 'in', 'in_hi', 'kr', 'my_en', 'my_ms', 'nz', 'ph_en', 'ph_fil', 'sg', 'th_en', 'th_th', 'tw', 'vn_en', 'vn_vi',
  'ae_en', 'ae_ar', 'africa', 'at', 'be_en', 'be_fr', 'be_nl', 'bg', 'ch_de', 'ch_fr', 'ch_it', 'cis_en', 'cis_ru', 'cz', 'de', 'dk', 'ee', 'eg_ar', 'eg_en', 'es', 'fi', 'fr',
  'gr_el', 'gr_en', 'hu', 'ie', 'il_en', 'il_he', 'iq', 'is', 'it', 'kw_ar', 'kw_en', 'lt', 'lu_de', 'lu_en', 'lu_fr', 'lv', 'mena_ar', 'mena_en', 'ng', 'nl', 'no', 'pl', 'pt', 'qa_ar', 'qa_en', 'ro', 'ru', 'sa_en', 'sa_ar', 'se', 'si', 'sk', 'tr', 'ua', 'uk', 'za',
  'ar', 'br', 'ca', 'ca_fr', 'cl', 'co', 'cr', 'ec', 'gt', 'la', 'mx', 'pe', 'pr', 'jp'];
const wcsUrl = (osi) => `https://wcs.adobe.com/web_commerce_artifact?offer_selector_ids=${osi}&country=US&language=MULT&locale=en_US&api_key=wcms-commerce-ims-ro-user-milo&landscape=PUBLISHED`;
const WCS_KEYS = ['offerId', 'productArrangementCode', 'commitment', 'term', 'customerSegment', 'marketSegments', 'offerType', 'pricePoint'];

// Shared crawl state.
const mapWcs = {}; // osi -> WCS columns, filled in at the end of the crawl
const retries = new Set(); // urls currently waiting for / running a retry
const fetched = new Set(); // urls already fetched (or being fetched)
const ostUsages = []; // one entry per OST link found
const keys = new Set(); // every OST parameter name seen, used as CSV columns
let defaultBufferSize = BUFFER_SIZE;
let file = '/tmp/audit.csv';

/**
 * Picks the fetch implementation: `node-fetch` when it is installed (what the
 * script was written against), otherwise the `fetch` built into Node 18+.
 * @returns {Promise<Function>} a fetch-compatible function
 */
async function resolveFetch() {
  try {
    const { default: nodeFetch } = await import('node-fetch');
    return nodeFetch;
  } catch (error) {
    if (typeof globalThis.fetch === 'function') {
      return globalThis.fetch;
    }
    throw error;
  }
}
const fetch = await resolveFetch();

const isRelative = (url) => url[0] === '/';

/** Resolves after `ms` milliseconds. */
const sleep = (ms) => new Promise((resolve) => {
  setTimeout(resolve, ms);
});

/**
 * Splits an absolute https url into its origin and the rest.
 * @param {string} url absolute https url
 * @returns {{domain: string, uri: string}} e.g. `https://host` and `/path?query`
 * @throws {Error} when `url` does not start with `https://host`
 */
const getUrlParts = (url) => {
  const match = new RegExp(DOMAIN_REGEXP).exec(url);
  if (!match) {
    throw new Error(`cannot extract an https domain from "${url}"`);
  }
  const [domain] = match;
  return {
    domain,
    uri: url.substring(domain.length),
  };
};

/** Logs and starts the download of `url`; resolves to the fetch Response. */
const fetchDocument = (url) => {
  console.log(`fetching ${url}...`);
  return fetch(url);
};

/** Rewrites preview (`.hlx.page`) hosts to their live (`.hlx.live`) counterpart. */
const getLiveUrl = (url) => url.replaceAll('.hlx.page', '.hlx.live');

/**
 * Downloads a personalization manifest and collects every https url found in
 * its `experiences.data` rows.
 * Best effort: any failure is logged and yields the urls collected so far.
 * @param {string} url manifest url
 * @returns {Promise<string[]>} live fragment urls
 */
async function getFragmentsFromManifest(url) {
  const fragments = [];
  try {
    const response = await fetch(getLiveUrl(url));
    // `response.size` is node-fetch's body size *limit* (0 by default), not the
    // payload length, so it cannot be used to detect an empty body.
    if (response.status === 200) {
      const manifest = await response.json();
      manifest?.experiences?.data?.forEach((row) => {
        Object.values(row)
          .map((value) => /^https:\/\/.+/.exec(value)?.[0])
          .filter((value) => value != null)
          .map(getLiveUrl)
          .forEach((fragmentUrl) => fragments.push(fragmentUrl));
      });
    }
  } catch (error) {
    console.log(`Error while fetching manifest: ${error}`);
  }
  return fragments;
}

/**
 * Finds the personalization manifests declared by a page and returns the
 * fragments they reference.
 * @param {string} pageContent raw html of the page
 * @returns {Promise<string[]|null>} de-duplicated fragment urls, or `null`
 *   when the page declares no personalization
 */
async function getPersonnalizationFragments(pageContent) {
  const persoMatch = new RegExp(PERSO_REGEXP).exec(pageContent);
  if (!persoMatch) {
    return null;
  }
  const manifestUrls = persoMatch.groups.urls
    .replace(/\s+/g, '')
    .split(',')
    .filter((manifestUrl) => manifestUrl.length > 0);
  // Wait for every manifest before returning; getFragmentsFromManifest never rejects.
  const perManifest = await Promise.all(
    manifestUrls.map((manifestUrl) => getFragmentsFromManifest(manifestUrl)),
  );
  return [...new Set(perManifest.flat())];
}

/**
 * Scans html for anchors and classifies their href by HREF_REGEXPS.
 * @param {string} pageContent raw html
 * @returns {Object<string, Array<{patternMatch: RegExpExecArray, postExcerpt: string}>>}
 *   per type (`fragment`, `ost`), the href match plus the text that follows the
 *   anchor (EXCERPT_SIZE characters, commas and whitespace removed)
 */
const extractLinks = (pageContent) => {
  const result = {};
  const hrefPatterns = Object.entries(HREF_REGEXPS)
    .map(([type, pattern]) => [type, new RegExp(pattern)]);
  for (const match of pageContent.matchAll(new RegExp(LINK_REGEXP, 'g'))) {
    const { url } = match.groups;
    const indexEnd = match.index + match[0].length;
    const postExcerpt = pageContent
      .substring(indexEnd, indexEnd + EXCERPT_SIZE)
      .replaceAll(/[,\n\s]+/g, '');
    for (const [type, pattern] of hrefPatterns) {
      const patternMatch = pattern.exec(url);
      if (patternMatch) {
        result[type] ??= [];
        result[type].push({ patternMatch, postExcerpt });
      }
    }
  }
  return result;
};

const prefixWcsKey = (key) => `wcs ${key}`;

/**
 * Resolves an offer selector id against WCS.
 * @param {string} osi offer selector id
 * @returns {Promise<Object>} WCS_KEYS of the first resolved offer, each key
 *   prefixed with "wcs "; empty when the lookup yields nothing
 */
async function getCommerceData(osi) {
  const data = {};
  const response = await fetch(wcsUrl(osi));
  if (!response.ok) {
    return data;
  }
  const json = await response.json();
  const offer = json?.resolvedOffers?.[0];
  if (offer) {
    WCS_KEYS.forEach((key) => {
      data[prefixWcsKey(key)] = offer[key];
    });
  }
  return data;
}

/**
 * Records one OST link: parses its query parameters into an entry, registers
 * the parameter names as CSV columns and the osi for the later WCS lookup.
 * @param {Object} ctx audit context (`origin`, optional `fragment`, ...), copied into the entry
 * @param {string} parameterString query string of the OST link
 * @param {string} postExcerpt text following the link on the page
 * @param {Object[]} collection array the entry is appended to
 */
async function extractOstUsage(ctx, parameterString, postExcerpt, collection) {
  const entry = { ...ctx, postExcerpt };
  for (const match of parameterString.matchAll(new RegExp(PARAMETER_REGEXP, 'g'))) {
    const { left, right } = match.groups;
    entry[left] = right;
    keys.add(left);
  }
  if (entry.osi) {
    mapWcs[entry.osi] = {};
  }
  collection.push(entry);
}

/**
 * Downloads a sitemap and lists the urls of its <loc> elements.
 * Best effort: a sitemap that cannot be fetched is logged and yields no url.
 * @param {string} sitemapUrl
 * @returns {Promise<string[]>} de-duplicated urls
 */
async function extractUrlsFromSiteMap(sitemapUrl) {
  try {
    const response = await fetchDocument(sitemapUrl);
    if (!response.ok) {
      console.log(`Error: response status for ${sitemapUrl} is ${response.status}`);
      return [];
    }
    const sitemapContent = await response.text();
    const listedUrls = new Set();
    for (const match of sitemapContent.matchAll(new RegExp(LOC_REGEXP, 'g'))) {
      listedUrls.add(match.groups.url.trim());
    }
    return Array.from(listedUrls);
  } catch (error) {
    console.log(`Error while fetching sitemap ${sitemapUrl}: ${error}`);
    return [];
  }
}

/**
 * Audits one page and, recursively, the fragments it references.
 * @param {Object} ctx audit context; `ctx.origin` is the page the audit started
 *   from. `ctx.localeRewrite` is set here when the origin lives under a locale.
 * @param {string} url page or fragment url (relative urls resolve against www.adobe.com)
 * @param {number} [attempt=0] retry counter, used internally
 */
async function auditPage(ctx, url, attempt = 0) {
  let target = url;
  try {
    if (isRelative(target)) {
      target = `https://www.adobe.com${target}`;
    }
    if (ctx.localeRewrite) {
      const localeToken = `/${ctx.localeRewrite}/`;
      if (!target.includes(localeToken)) {
        const { domain, uri } = getUrlParts(target);
        target = domain + localeToken + uri.substring(1);
      }
    }
    if (fetched.has(target)) {
      // already fetched & parsed (or being fetched by a parallel audit)
      return;
    }
    // Marked before the request so that parallel audits do not download it twice.
    fetched.add(target);
    const response = await fetchDocument(target);
    if (!response.ok) {
      console.log(`Error: response status for ${response.url} is ${response.status}`);
      retries.delete(target);
      return;
    }
    let pageCtx = ctx;
    if (target === ctx.origin) {
      const { uri } = getUrlParts(target);
      const firstToken = uri.substring(1).split('/')[0];
      ctx.localeRewrite = LOCALES.includes(firstToken) ? firstToken : null;
    } else {
      // Fragments get their own context: sibling fragments are audited in
      // parallel and would otherwise overwrite each other's `fragment` value.
      pageCtx = { ...ctx, fragment: target };
    }
    const content = await response.text();
    const links = extractLinks(content);
    for (const { patternMatch, postExcerpt } of links.ost ?? []) {
      await extractOstUsage(pageCtx, patternMatch.groups.parameters, postExcerpt, ostUsages);
    }
    const persoFragments = (await getPersonnalizationFragments(content)) ?? [];
    const fragments = (links.fragment ?? [])
      .map(({ patternMatch }) => patternMatch.input)
      .filter((fragmentUrl) => fragmentUrl !== target)
      .concat(persoFragments);
    await Promise.allSettled(fragments.map((fragmentUrl) => auditPage(pageCtx, fragmentUrl)));
    if (retries.delete(target)) {
      console.log(`retries down to ${retries.size}`);
    }
  } catch (error) {
    // Let the retry (or a later audit) fetch this url again.
    fetched.delete(target);
    if (attempt >= MAX_RETRIES) {
      retries.delete(target);
      console.log(`Error while auditing document ${target}: ${error}, giving up after ${attempt} retries`);
      return;
    }
    const nextAttempt = attempt + 1;
    const delay = nextAttempt * RETRY_DELAY_MS;
    retries.add(target);
    console.log(`Error while auditing document ${target}: ${error}, retrying (#${nextAttempt}) in ${delay / 1000}s...`);
    console.log(`#retries = ${retries.size}`);
    await sleep(delay);
    await auditPage(ctx, target, nextAttempt);
  }
}

/** Splits manifest text into its non-empty, trimmed lines. */
function parseManifest(text) {
  return text
    .split('\n')
    .map((line) => line.trim())
    .filter((line) => line.length > 0);
}

/** Renders one CSV cell, quoting it when it holds a comma, a quote or a line break. */
function toCsvCell(value) {
  if (value == null) {
    return '';
  }
  const text = String(value);
  return /[",\n\r]/.test(text) ? `"${text.replaceAll('"', '""')}"` : text;
}

/** Renders `rows` (plain objects) as CSV text with one column per header. */
function toCsv(headers, rows) {
  return [headers, ...rows.map((row) => headers.map((header) => row[header]))]
    .map((cells) => cells.map(toCsvCell).join(','))
    .join('\n');
}

/** Runs `worker` over `items`, at most `size` at a time, ignoring individual failures. */
async function inBatches(items, size, worker) {
  for (let start = 0; start < items.length; start += size) {
    await Promise.allSettled(items.slice(start, start + size).map((item) => worker(item)));
  }
}

/**
 * Command line entry point.
 * Arguments: page urls and/or sitemap urls, `-b <n>` (parallel pages),
 * `-f <file>` (output csv), `-m <file>` (manifest of urls/sitemaps, one per line).
 */
async function main() {
  const startTime = Date.now();
  if (process.argv.length < 3) {
    console.log("you should provide at least one URL to audit");
    return;
  }
  let args = process.argv.slice(2);
  let urlsToFetch = [];
  while (args.length > 0) {
    const arg = args.shift();
    switch (arg) {
      case BUFFER_ARG: {
        const requested = Number.parseInt(args.shift(), 10);
        if (Number.isInteger(requested) && requested > 0) {
          defaultBufferSize = requested;
          console.log(`will use bufferSize of ${defaultBufferSize}`);
        } else {
          console.log(`invalid buffer size, keeping ${defaultBufferSize}`);
        }
        break;
      }
      case FILE_ARG: {
        file = args.shift() ?? file;
        console.log(`will write output to ${file}`);
        break;
      }
      case MF_ARG: {
        const mf = args.shift();
        args = args.concat(parseManifest(fs.readFileSync(mf, 'utf8')));
        break;
      }
      default: {
        if (arg.endsWith('sitemap.xml')) {
          console.log("looks like a sitemap, will use urls listed there...");
          const sitemapUrls = await extractUrlsFromSiteMap(arg);
          console.log(`collected ${sitemapUrls.length} urls`);
          urlsToFetch = urlsToFetch.concat(sitemapUrls);
        } else if (arg.length > 0) {
          urlsToFetch.push(arg);
        }
        break;
      }
    }
  }
  while (urlsToFetch.length > 0) {
    console.log(`${urlsToFetch.length} remaining...`);
    // next buffer: we remove retries that are being run atm
    const bufferSize = Math.max(defaultBufferSize - retries.size, 1);
    const buffer = urlsToFetch.splice(0, bufferSize);
    await Promise.allSettled(buffer.map((url) => auditPage({ origin: url }, url)));
  }
  console.log(`collected ${ostUsages.length} entries`);
  // collecting related OSI commerce data, with the same parallelism as the crawl
  await inBatches(Object.keys(mapWcs), defaultBufferSize, async (osi) => {
    mapWcs[osi] = await getCommerceData(osi);
  });

  // rendering of collected ostUsages objects together with all collected keys as a CSV
  const headers = [...new Set([
    'origin',
    'fragment',
    ...keys,
    'postExcerpt',
    ...WCS_KEYS.map(prefixWcsKey),
  ])];
  const rows = ostUsages.map((usage) => ({ ...usage, ...mapWcs[usage.osi] }));
  fs.writeFileSync(file, toCsv(headers, rows));
  console.log(`finished in ${(Date.now() - startTime) / 1000}s`);
}

main().catch((error) => {
  console.error(`audit failed: ${error}`);
  process.exitCode = 1;
});