From 43fa92cea04632bdff87a4d0b1e414ea0fed75cd Mon Sep 17 00:00:00 2001 From: Kiko Beats Date: Wed, 24 Oct 2018 12:55:15 +0200 Subject: [PATCH] Add url resolution --- README.md | 6 ++---- package.json | 2 ++ src/index.js | 57 ++++++++++++++++++++++++++++++++++++++------------- test/index.js | 18 +++++++++++++++- 4 files changed, 64 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index cecb4bd..956e133 100644 --- a/README.md +++ b/README.md @@ -43,10 +43,8 @@ $ npm install html-get --save const getHTML = require('html-get') ;(async () => { - const url = 'https://kikobeats.com' - const { html, stats } = await getHTML(url) - console.log(html) - console.log(stats) + const { url, html, stats } = await getHTML('https://kikobeats.com') + console.log(url, stats, html.length) })() ``` diff --git a/package.json b/package.json index ebff272..5f7284c 100644 --- a/package.json +++ b/package.json @@ -35,9 +35,11 @@ "browserless": "~4.1.3", "got": "~9.2.2", "html-encode": "~2.0.1", + "mem": "~4.0.0", "p-cancelable": "~0.5.1", "p-timeout": "~2.0.1", "parse-domain": "~2.1.2", + "reachable-url": "~1.1.6", "time-span": "~2.0.0" }, "devDependencies": { diff --git a/src/index.js b/src/index.js index 9a13f65..9e7e56a 100644 --- a/src/index.js +++ b/src/index.js @@ -1,23 +1,42 @@ 'use strict' const createBrowserless = require('browserless') +const reachableUrl = require('reachable-url') const parseDomain = require('parse-domain') const PCancelable = require('p-cancelable') const htmlEncode = require('html-encode') const timeSpan = require('time-span') const pTimeout = require('p-timeout') +const mem = require('mem') const got = require('got') const autoDomains = require('./auto-domains') -// TODO: This is a hard timeout to ensure prerender mode +const ONE_MIN_MS = 60 * 1000 +const ONE_HOUR_MS = ONE_MIN_MS * 60 +const ONE_DAY_MS = ONE_HOUR_MS * 24 + +// TODO: This is a soft timeout to ensure prerender mode // doesn't take too much time an reach the global timeout. // Currently puppeteer is not handling a global timeout, // need to wait until 2.0 to setup `.defaultTimeout` // https://github.com/GoogleChrome/puppeteer/issues/2079 +const REQ_TIMEOUT = 6500 -const REQ_TIMEOUT = 8000 +// Puppeteer doesn't resolve redirection well. +// We need to ensure we have the right url. +const getUrl = mem( + async targetUrl => { + try { + const { url } = await reachableUrl(targetUrl) + return url + } catch (err) { + return targetUrl + } + }, + { maxAge: ONE_DAY_MS } +) const getDomain = url => (parseDomain(url) || {}).domain @@ -34,6 +53,7 @@ const fetch = (url, { toEncode, reflect = false, ...opts }) => try { const res = await req return resolve({ + url: res.url, html: await toEncode(res.body, res.headers['content-type']), mode: 'fetch' }) @@ -44,19 +64,27 @@ const fetch = (url, { toEncode, reflect = false, ...opts }) => }) const prerender = async ( - url, + targetUrl, { getBrowserless, gotOptions, toEncode, ...opts } ) => { + const url = await getUrl(targetUrl) const fetchReq = fetch(url, { reflect: true, toEncode, ...gotOptions }) + let html = '' + let fetchDataProps = {} + let isFetchRejected = false + try { const browserless = await getBrowserless() - const html = await pTimeout(browserless.html(url, opts), REQ_TIMEOUT) + html = await pTimeout(browserless.html(url, opts), REQ_TIMEOUT) fetchReq.cancel() - return { html, mode: 'prerender' } + return { url, html, mode: 'prerender' } } catch (err) { - const fetchData = await fetchReq - return { html: fetchData.isRejected ? '' : fetchData, mode: 'prerender' } + const { isRejected, ...dataProps } = await fetchReq + isFetchRejected = isRejected + fetchDataProps = dataProps } + + return isFetchRejected ? { url, html, mode: 'prerender' } : fetchDataProps } const FETCH_MODE = { fetch, prerender } @@ -68,7 +96,7 @@ const getFetchMode = (url, { prerender }) => { } module.exports = async ( - url, + targetUrl, { getBrowserless = createBrowserless, encoding = 'utf-8', @@ -79,14 +107,15 @@ module.exports = async ( } = {} ) => { const toEncode = htmlEncode(encoding) - const targetFetchMode = fetchMode(url, { prerender }) - const opts = targetFetchMode === 'fetch' - ? { toEncode, ...gotOptions } - : { toEncode, getBrowserless, gotOptions, ...puppeteerOpts } + const targetFetchMode = fetchMode(targetUrl, { prerender }) + const opts = + targetFetchMode === 'fetch' + ? { toEncode, ...gotOptions } + : { toEncode, getBrowserless, gotOptions, ...puppeteerOpts } const time = timeSpan() - const { html, mode } = await FETCH_MODE[targetFetchMode](url, opts) - return { html, stats: { mode, timing: time() } } + const { url, html, mode } = await FETCH_MODE[targetFetchMode](targetUrl, opts) + return { url, html, stats: { mode, timing: time() } } } module.exports.createBrowserless = createBrowserless diff --git a/test/index.js b/test/index.js index 2f58f58..820d33e 100644 --- a/test/index.js +++ b/test/index.js @@ -10,7 +10,7 @@ test('prerender by default', async t => { t.is(stats.mode, 'prerender') }) -test('disable prerender', async t => { +test('disable prerender explicitly', async t => { const url = 'https://kikobeats.com' const { stats } = await getHTML(url, { prerender: false }) t.is(stats.mode, 'fetch') @@ -21,3 +21,19 @@ test('prerender auto detection', async t => { const { stats } = await getHTML(url) t.is(stats.mode, 'fetch') }) + +test('follow redirect', async t => { + const url = 'https://google.com' + const redirectUrl = 'https://www.google.com/' + t.is((await getHTML(url, { prerender: false })).url, redirectUrl) + t.is((await getHTML(url, { prerender: true })).url, redirectUrl) +}) + +test('prerender error fallback into fetch mode', async t => { + const url = + 'https://www.washingtonpost.com/gdpr-consent/?__twitter_impression=true&destination=%2Fnation%2F2018%2F10%2F24%2Fmega-millions-jackpot-winner-reported-south-carolina-its-not-official%2F%3Futm_term%3D.f26f36d5914d%26tid%3Dsm_tw%26__twitter_impression%3Dtrue&utm_term=.309ab3e98c97' + + const { stats, html } = await getHTML(url) + t.true(!!html) + t.is(stats.mode, 'fetch') +})