Skip to content

Commit

Permalink
Add url resolution
Browse files Browse the repository at this point in the history
  • Loading branch information
Kikobeats committed Oct 24, 2018
1 parent 0f32fad commit 43fa92c
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 19 deletions.
6 changes: 2 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,8 @@ $ npm install html-get --save

const getHTML = require('html-get')
;(async () => {
const url = 'https://kikobeats.com'
const { html, stats } = await getHTML(url)
console.log(html)
console.log(stats)
const { url, html, stats } = await getHTML('https://kikobeats.com')
console.log(url, stats, html.length)
})()
```

Expand Down
2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,11 @@
"browserless": "~4.1.3",
"got": "~9.2.2",
"html-encode": "~2.0.1",
"mem": "~4.0.0",
"p-cancelable": "~0.5.1",
"p-timeout": "~2.0.1",
"parse-domain": "~2.1.2",
"reachable-url": "~1.1.6",
"time-span": "~2.0.0"
},
"devDependencies": {
Expand Down
57 changes: 43 additions & 14 deletions src/index.js
Original file line number Diff line number Diff line change
@@ -1,23 +1,42 @@
'use strict'

const createBrowserless = require('browserless')
const reachableUrl = require('reachable-url')
const parseDomain = require('parse-domain')
const PCancelable = require('p-cancelable')
const htmlEncode = require('html-encode')
const timeSpan = require('time-span')
const pTimeout = require('p-timeout')
const mem = require('mem')

const got = require('got')

const autoDomains = require('./auto-domains')

// Time spans expressed in milliseconds, each derived from the previous one.
const ONE_MIN_MS = 60 * 1000
const ONE_HOUR_MS = ONE_MIN_MS * 60
const ONE_DAY_MS = ONE_HOUR_MS * 24

// TODO: This is a soft timeout to ensure prerender mode
// doesn't take too much time and reach the global timeout.
// Currently puppeteer is not handling a global timeout,
// need to wait until 2.0 to setup `.defaultTimeout`
// https://github.com/GoogleChrome/puppeteer/issues/2079
const REQ_TIMEOUT = 6500

/**
 * Resolve the URL that should be used for `targetUrl`.
 *
 * Puppeteer doesn't resolve redirection well, so the `url` reported by
 * `reachableUrl` is used to ensure we have the right one. If that lookup
 * throws for any reason, the URL is returned exactly as it was given.
 *
 * @param {string} targetUrl - URL requested by the caller.
 * @returns {Promise<string>} The `url` reported by `reachableUrl`, or
 *   `targetUrl` when the lookup fails.
 */
const resolveUrl = async targetUrl => {
  try {
    const response = await reachableUrl(targetUrl)
    return response.url
  } catch (err) {
    // Best effort: fall back to the URL as provided.
    return targetUrl
  }
}

// Memoized so every target URL is resolved at most once per day.
const getUrl = mem(resolveUrl, { maxAge: ONE_DAY_MS })

/**
 * Extract the `domain` field that `parseDomain` reports for a URL.
 *
 * @param {string} url - URL to inspect.
 * @returns {string|undefined} The parsed domain, or `undefined` when
 *   `parseDomain` returns a falsy value for the URL.
 */
const getDomain = url => {
  const parsed = parseDomain(url)
  return parsed ? parsed.domain : undefined
}

Expand All @@ -34,6 +53,7 @@ const fetch = (url, { toEncode, reflect = false, ...opts }) =>
try {
const res = await req
return resolve({
url: res.url,
html: await toEncode(res.body, res.headers['content-type']),
mode: 'fetch'
})
Expand All @@ -44,19 +64,27 @@ const fetch = (url, { toEncode, reflect = false, ...opts }) =>
})

/**
 * Get the HTML of a URL using the headless browser, with a plain fetch
 * request started in parallel as a fallback.
 *
 * The target URL is resolved through `getUrl` first. If the browser
 * produces the markup within `REQ_TIMEOUT`, the fetch request is cancelled
 * and the prerendered markup is returned. If anything in the browser path
 * throws (including the timeout), the result of the fetch request is used
 * instead.
 *
 * @param {string} targetUrl - URL requested by the caller.
 * @param {object} options
 * @param {Function} options.getBrowserless - Async factory returning an
 *   object exposing `.html(url, opts)`.
 * @param {object} [options.gotOptions] - Options spread into the `fetch` call.
 * @param {Function} options.toEncode - Encoder forwarded to `fetch`.
 * @param {...*} options.opts - Remaining options, passed to `browserless.html`.
 * @returns {Promise<object>} `{ url, html, mode: 'prerender' }` when the
 *   browser succeeds or the fallback is flagged as rejected; otherwise the
 *   properties resolved by the fetch request (minus `isRejected`).
 */
const prerender = async (
  targetUrl,
  { getBrowserless, gotOptions, toEncode, ...opts }
) => {
  const url = await getUrl(targetUrl)
  // Started before the browser work so the fallback is already in flight.
  // NOTE(review): presumably `reflect: true` makes the request settle with
  // `{ isRejected, ... }` instead of throwing; confirm against `fetch`.
  const fetchReq = fetch(url, { reflect: true, toEncode, ...gotOptions })
  let html = ''
  let fetchDataProps = {}
  let isFetchRejected = false

  try {
    const browserless = await getBrowserless()
    html = await pTimeout(browserless.html(url, opts), REQ_TIMEOUT)
    // The browser won: the fallback request is no longer needed.
    fetchReq.cancel()
    return { url, html, mode: 'prerender' }
  } catch (err) {
    const { isRejected, ...dataProps } = await fetchReq
    isFetchRejected = isRejected
    fetchDataProps = dataProps
  }

  // Both strategies failed: report whatever markup was collected (normally
  // empty). Otherwise hand back the data produced by the fetch request.
  return isFetchRejected ? { url, html, mode: 'prerender' } : fetchDataProps
}

const FETCH_MODE = { fetch, prerender }
Expand All @@ -68,7 +96,7 @@ const getFetchMode = (url, { prerender }) => {
}

module.exports = async (
url,
targetUrl,
{
getBrowserless = createBrowserless,
encoding = 'utf-8',
Expand All @@ -79,14 +107,15 @@ module.exports = async (
} = {}
) => {
const toEncode = htmlEncode(encoding)
const targetFetchMode = fetchMode(url, { prerender })
const opts = targetFetchMode === 'fetch'
? { toEncode, ...gotOptions }
: { toEncode, getBrowserless, gotOptions, ...puppeteerOpts }
const targetFetchMode = fetchMode(targetUrl, { prerender })
const opts =
targetFetchMode === 'fetch'
? { toEncode, ...gotOptions }
: { toEncode, getBrowserless, gotOptions, ...puppeteerOpts }

const time = timeSpan()
const { html, mode } = await FETCH_MODE[targetFetchMode](url, opts)
return { html, stats: { mode, timing: time() } }
const { url, html, mode } = await FETCH_MODE[targetFetchMode](targetUrl, opts)
return { url, html, stats: { mode, timing: time() } }
}

module.exports.createBrowserless = createBrowserless
18 changes: 17 additions & 1 deletion test/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ test('prerender by default', async t => {
t.is(stats.mode, 'prerender')
})

test('disable prerender', async t => {
test('disable prerender explicitly', async t => {
const url = 'https://kikobeats.com'
const { stats } = await getHTML(url, { prerender: false })
t.is(stats.mode, 'fetch')
Expand All @@ -21,3 +21,19 @@ test('prerender auto detection', async t => {
const { stats } = await getHTML(url)
t.is(stats.mode, 'fetch')
})

// The reported `url` must be the redirect destination in both modes.
test('follow redirect', async t => {
  const redirectUrl = 'https://www.google.com/'
  const url = 'https://google.com'

  const fetched = await getHTML(url, { prerender: false })
  t.is(fetched.url, redirectUrl)

  const prerendered = await getHTML(url, { prerender: true })
  t.is(prerendered.url, redirectUrl)
})

// Expects non-empty HTML reported under fetch mode for this URL.
test('prerender error fallback into fetch mode', async t => {
  const target =
    'https://www.washingtonpost.com/gdpr-consent/?__twitter_impression=true&destination=%2Fnation%2F2018%2F10%2F24%2Fmega-millions-jackpot-winner-reported-south-carolina-its-not-official%2F%3Futm_term%3D.f26f36d5914d%26tid%3Dsm_tw%26__twitter_impression%3Dtrue&utm_term=.309ab3e98c97'

  const result = await getHTML(target)
  t.true(Boolean(result.html))
  t.is(result.stats.mode, 'fetch')
})

0 comments on commit 43fa92c

Please sign in to comment.