Skip to content

Commit

Permalink
feat: Ensure to resolve media URLs
Browse files Browse the repository at this point in the history
BREAKING CHANGE: Rename fetchMode → getMode
  • Loading branch information
Kikobeats committed Dec 16, 2018
1 parent 6451c36 commit 61aee97
Show file tree
Hide file tree
Showing 6 changed files with 237 additions and 24 deletions.
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ Enable or disable prerendering as mechanism for getting the HTML markup explicit

The value `auto` means that an internal whitelist of websites that don't need prerendering is used by default. This list speeds up the process by using `fetch` mode for these websites.

See [fetchMode parameter](#fetchMode) for know more.
See [getMode parameter](#getMode) to know more.

##### getBrowserless

Expand All @@ -89,19 +89,19 @@ Encoding the HTML markup properly from the body response.

It determines the encoding to use, via a Node.js library for converting HTML documents of arbitrary encoding into a target encoding (utf8, utf16, etc).

##### fetchMode
##### getMode

Type: `function`<br>

A function that will be invoked to determine the `mode` used for getting the HTML markup from the target URL.

The default `fetchMode` is:
The default `getMode` is:

```js
const getFetchMode = (url, { prerender }) => {
const getMode = (url, { prerender }) => {
if (prerender === false) return 'fetch'
if (prerender !== 'auto') return 'prerender'
return autoDomains.includes(parseDomain(url).domain) ? 'fetch' : 'prerender'
return autoDomains.includes(getDomain(url)) ? 'fetch' : 'prerender'
}
```

Expand Down
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
"request"
],
"dependencies": {
"@metascraper/helpers": "~4.8.3",
"browserless": "~4.2.1",
"debug": "~4.1.0",
"got": "~9.4.0",
Expand Down
107 changes: 89 additions & 18 deletions src/index.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
'use strict'

const { isMime } = require('@metascraper/helpers')
const createBrowserless = require('browserless')
const reachableUrl = require('reachable-url')
const parseDomain = require('parse-domain')
Expand All @@ -9,6 +10,7 @@ const htmlEncode = require('html-encode')
const timeSpan = require('time-span')
const pTimeout = require('p-timeout')
const { URL } = require('url')
const path = require('path')
const got = require('got')
const mem = require('mem')
const he = require('he')
Expand All @@ -32,12 +34,12 @@ const REQ_TIMEOUT_REACHABLE = REQ_TIMEOUT * 0.25
const getUrl = mem(
async targetUrl => {
try {
const { url } = await reachableUrl(targetUrl, {
const res = await reachableUrl(targetUrl, {
timeout: REQ_TIMEOUT_REACHABLE
})
return url
return res
} catch (err) {
return targetUrl
return { url: targetUrl, headers: {} }
}
},
{ maxAge: ONE_DAY_MS }
Expand Down Expand Up @@ -73,23 +75,18 @@ const fetch = (url, { toEncode, reflect = false, ...opts }) =>
})

const prerender = async (
targetUrl,
url,
{ getBrowserless, gotOptions, toEncode, ...opts }
) => {
let fetchReq
let fetchDataProps = {}
let isFetchRejected = false
let html = ''
let url

try {
debug(`getUrl:resolving`)
url = await getUrl(targetUrl)
debug(`getUrl:resolved ${targetUrl}${url}`)
fetchReq = fetch(url, { reflect: true, toEncode, ...gotOptions })
const browserless = await getBrowserless()
html = await pTimeout(browserless.html(url, opts), REQ_TIMEOUT)

await fetchReq.cancel()
debug('prerender:success')
return { url, html: getHtml(html), mode: 'prerender' }
Expand All @@ -104,38 +101,112 @@ const prerender = async (
return isFetchRejected ? { url, html, mode: 'prerender' } : fetchDataProps
}

const FETCH_MODE = { fetch, prerender }
// Lookup table from a mode name ('fetch' | 'prerender') to the function that
// retrieves the content in that mode; indexed as `modes[mode]`.
const modes = { fetch, prerender }

const getFetchMode = (url, { prerender }) => {
/**
 * Default strategy for choosing how the HTML of `url` is obtained.
 *
 * - `prerender === false`  → always `'fetch'`
 * - `prerender === 'auto'` → `'fetch'` when the URL's domain is listed in
 *   `autoDomains`, otherwise `'prerender'`
 * - any other value        → `'prerender'`
 *
 * @param {string} url - Target URL.
 * @param {{ prerender: (boolean|string) }} options
 * @returns {'fetch'|'prerender'}
 */
const determinateMode = (url, { prerender }) => {
  switch (prerender) {
    case false:
      return 'fetch'
    case 'auto': {
      const isListedDomain = autoDomains.includes(getDomain(url))
      return isListedDomain ? 'fetch' : 'prerender'
    }
    default:
      return 'prerender'
  }
}

/**
 * Build a synthetic HTML document describing a media URL, so that a resource
 * that is not itself HTML can still be handed to an HTML metadata consumer.
 *
 * The `date` and `expires` response headers, when present, are exposed as
 * `article:published_time` / `article:expiration_time` meta tags.
 *
 * Every value interpolated into the markup is HTML-escaped, because the
 * headers come from the remote server and must not be able to break out of
 * the attribute they are written into. The only exception is `html`, which
 * is a markup fragment supplied by the caller and is inserted verbatim.
 *
 * NOTE: the nested `<head>` pair and the two viewport tags are reproduced
 * as-is so the generated markup stays identical for ordinary inputs.
 *
 * @param {string} url - Absolute URL of the media resource.
 * @param {Object} headers - Response headers; `date` and `expires` are read.
 * @param {string} [html=''] - Extra markup appended verbatim inside `<head>`.
 * @returns {{ url: string, mode: 'fetch', html: string }}
 * @throws {TypeError} When `url` cannot be parsed by `URL`.
 */
const baseHtml = (url, headers, html = '') => {
  // Escapes the characters that are significant in HTML text nodes and in
  // double-quoted attribute values. `&` must be replaced first.
  const escapeHtml = value =>
    String(value)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')

  const { hostname } = new URL(url)
  const { date, expires } = headers

  const safeUrl = escapeHtml(url)
  const safeTitle = escapeHtml(path.basename(url))
  const safeHostname = escapeHtml(hostname)

  const publishedMeta = date
    ? `<meta property="article:published_time" content="${escapeHtml(date)}">`
    : ''
  const expirationMeta = expires
    ? `<meta property="article:expiration_time" content="${escapeHtml(expires)}">`
    : ''

  return {
    // The returned `url` is the untouched input; only the markup is escaped.
    url,
    mode: 'fetch',
    html: `
<html lang="en">
<head>
<meta name="viewport" content="width=device-width, minimum-scale=0.1">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0" shrink-to-fit="no">
<title>${safeTitle}</title>
<meta property="og:site_name" content="${safeHostname}">
${publishedMeta}
${expirationMeta}
<meta property="og:locale" content="en">
<meta property="og:url" content="${safeUrl}">
<meta property="og:image" content="${safeUrl}">
${html}
<link rel="canonical" href="${safeUrl}">
</head>
</head>
<body>
<img src="${safeUrl}">
</body>
</html>`.trim()
  }
}

/**
 * Synthetic HTML document for an image URL: the base document plus an
 * `og:image` meta tag pointing at `url`.
 *
 * @param {string} url - Image URL.
 * @param {Object} headers - Response headers forwarded to `baseHtml`.
 * @returns {{ url: string, mode: string, html: string }}
 */
const getImageHtml = (url, headers) => {
  const imageMeta = `<meta property="og:image" content="${url}">`
  return baseHtml(url, headers, imageMeta)
}

/**
 * Synthetic HTML document for a video URL: the base document plus an
 * `og:video` meta tag, or `og:video:secure_url` when the URL is served
 * over HTTPS.
 *
 * @param {string} url - Video URL.
 * @param {Object} headers - Response headers forwarded to `baseHtml`.
 * @returns {{ url: string, mode: string, html: string }}
 */
const getVideoHtml = (url, headers) => {
  const scheme = new URL(url).protocol
  let property = 'og:video'
  if (scheme === 'https:') property += ':secure_url'

  const videoMeta = `<meta property="${property}" content="${url}">`
  return baseHtml(url, headers, videoMeta)
}

/**
 * Synthetic HTML document for an audio URL: the base document plus an
 * `og:audio` meta tag, or `og:audio:secure_url` when the URL is served
 * over HTTPS.
 *
 * @param {string} url - Audio URL.
 * @param {Object} headers - Response headers forwarded to `baseHtml`.
 * @returns {{ url: string, mode: string, html: string }}
 */
const getAudioHtml = (url, headers) => {
  const scheme = new URL(url).protocol
  let property = 'og:audio'
  if (scheme === 'https:') property += ':secure_url'

  const audioMeta = `<meta property="${property}" content="${url}">`
  return baseHtml(url, headers, audioMeta)
}

/**
 * Resolve `encodedUrl` and produce the content for it.
 *
 * When the resolved response's `content-type` header is an image, video or
 * audio MIME type (checked in that order), a synthetic HTML document is
 * returned for the media file. Otherwise the work is delegated to the
 * function registered in `modes` under `mode`.
 *
 * @param {string} encodedUrl - URL to resolve.
 * @param {string} mode - Key into `modes` used for non-media content.
 * @param {Object} opts - Options forwarded to the selected mode function.
 * @returns {Promise<Object>} Result of the media builder or mode function.
 */
const getContent = async (encodedUrl, mode, opts) => {
  const { url, headers } = await getUrl(encodedUrl)
  const resolvedLabel = encodedUrl === url ? url : `${encodedUrl}${url}`
  debug(`getUrl ${resolvedLabel}`)

  const mimeType = headers['content-type']
  const mediaBuilders = [
    ['image', getImageHtml],
    ['video', getVideoHtml],
    ['audio', getAudioHtml]
  ]

  for (const [kind, buildHtml] of mediaBuilders) {
    if (isMime(mimeType, kind)) return buildHtml(url, headers)
  }

  return modes[mode](url, opts)
}

module.exports = async (
targetUrl,
{
getBrowserless = createBrowserless,
encoding = 'utf-8',
fetchMode = getFetchMode,
getMode = determinateMode,
gotOptions,
prerender = 'auto',
puppeteerOpts
} = {}
) => {
const { href: encodedUrl } = new URL(targetUrl)
const toEncode = htmlEncode(encoding)
const targetFetchMode = fetchMode(encodedUrl, { prerender })
const reqMode = getMode(encodedUrl, { prerender })

const opts =
targetFetchMode === 'fetch'
reqMode === 'fetch'
? { toEncode, ...gotOptions }
: { toEncode, getBrowserless, gotOptions, ...puppeteerOpts }

const time = timeSpan()
const { url, html, mode } = await FETCH_MODE[targetFetchMode](
encodedUrl,
opts
)
const { url, html, mode } = await getContent(encodedUrl, reqMode, opts)
return { url, html, stats: { mode, timing: time() } }
}

Expand Down
33 changes: 33 additions & 0 deletions test/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,36 @@ test('unencoded URL', async t => {
'https://medium.com/@Acegikmo/the-ever-so-lovely-b%C3%A9zier-curve-eb27514da3bf'
)
})

// Resolves an audio file URL with prerendering disabled and expects
// non-empty HTML, `fetch` mode, and the URL reported back unchanged.
test('get html from audio url', async t => {
  const targetUrl =
    'https://audiodemos.github.io/vctk_set0/embedadapt_100sample.wav'
  const result = await getHTML(targetUrl, { prerender: false })

  t.true(!!result.html)
  t.is(result.stats.mode, 'fetch')
  t.is(targetUrl, result.url)
})

// Resolves an image file URL with prerendering disabled and expects
// non-empty HTML, `fetch` mode, and the URL reported back unchanged.
test('get html from image url', async t => {
  const targetUrl = 'https://kikobeats.com/images/avatar.jpg'
  const result = await getHTML(targetUrl, { prerender: false })

  t.true(!!result.html)
  t.is(result.stats.mode, 'fetch')
  t.is(targetUrl, result.url)
})

// Resolves a video file URL with prerendering disabled and expects
// non-empty HTML, `fetch` mode, and the URL reported back unchanged.
test('get html from video url', async t => {
  const targetUrl = 'https://microlink.io/preview.mp4'
  const result = await getHTML(targetUrl, { prerender: false })

  t.true(!!result.html)
  t.is(result.stats.mode, 'fetch')
  t.is(targetUrl, result.url)
})
110 changes: 109 additions & 1 deletion test/snapshots/index.js.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,112 @@ Generated by [AVA](https://ava.li).
<a href="https://httpbin-org.herokuapp.com/redirect/3"></a>␊
<a href="https://github.com/kikobeats/splashy"></a>␊
</body>␊
</html>`
</html>

## get htmnl from audio url

> Snapshot 1
`<html lang="en">␊
<head>␊
<meta name="viewport" content="width=device-width, minimum-scale=0.1">␊
<head>␊
<meta charset="utf-8">␊
<meta http-equiv="X-UA-Compatible" content="IE=edge">␊
<meta name="viewport" content="width=device-width, initial-scale=1.0" shrink-to-fit="no">␊
<title>embedadapt_100sample.wav</title>␊
<meta property="og:site_name" content="audiodemos.github.io">␊
<meta property="article:published_time" content="Sun, 16 Dec 2018 18:53:58 GMT">␊
<meta property="article:expiration_time" content="Sun, 16 Dec 2018 19:01:02 GMT">␊
<meta property="og:locale" content="en">␊
<meta property="og:url" content="https://audiodemos.github.io/vctk_set0/embedadapt_100sample.wav">␊
<meta property="og:image" content="https://audiodemos.github.io/vctk_set0/embedadapt_100sample.wav">␊
<meta property="og:audio:secure_url" content="https://audiodemos.github.io/vctk_set0/embedadapt_100sample.wav">␊
<link rel="canonical" href="https://audiodemos.github.io/vctk_set0/embedadapt_100sample.wav">␊
</head>␊
</head>␊
<body>␊
<img src="https://audiodemos.github.io/vctk_set0/embedadapt_100sample.wav">␊
</body>␊
</html>

## get html from audio url

> Snapshot 1
`<html lang="en">␊
<head>␊
<meta name="viewport" content="width=device-width, minimum-scale=0.1">␊
<head>␊
<meta charset="utf-8">␊
<meta http-equiv="X-UA-Compatible" content="IE=edge">␊
<meta name="viewport" content="width=device-width, initial-scale=1.0" shrink-to-fit="no">␊
<title>embedadapt_100sample.wav</title>␊
<meta property="og:site_name" content="audiodemos.github.io">␊
<meta property="article:published_time" content="Sun, 16 Dec 2018 18:54:57 GMT">␊
<meta property="article:expiration_time" content="Sun, 16 Dec 2018 19:01:02 GMT">␊
<meta property="og:locale" content="en">␊
<meta property="og:url" content="https://audiodemos.github.io/vctk_set0/embedadapt_100sample.wav">␊
<meta property="og:image" content="https://audiodemos.github.io/vctk_set0/embedadapt_100sample.wav">␊
<meta property="og:audio:secure_url" content="https://audiodemos.github.io/vctk_set0/embedadapt_100sample.wav">␊
<link rel="canonical" href="https://audiodemos.github.io/vctk_set0/embedadapt_100sample.wav">␊
</head>␊
</head>␊
<body>␊
<img src="https://audiodemos.github.io/vctk_set0/embedadapt_100sample.wav">␊
</body>␊
</html>`

## get html from image url

> Snapshot 1
`<html lang="en">␊
<head>␊
<meta name="viewport" content="width=device-width, minimum-scale=0.1">␊
<head>␊
<meta charset="utf-8">␊
<meta http-equiv="X-UA-Compatible" content="IE=edge">␊
<meta name="viewport" content="width=device-width, initial-scale=1.0" shrink-to-fit="no">␊
<title>avatar.jpg</title>␊
<meta property="og:site_name" content="kikobeats.com">␊
<meta property="article:published_time" content="Sun, 16 Dec 2018 18:54:58 GMT">␊
<meta property="article:expiration_time" content="Fri, 21 Dec 2018 18:54:58 GMT">␊
<meta property="og:locale" content="en">␊
<meta property="og:url" content="https://kikobeats.com/images/avatar.jpg">␊
<meta property="og:image" content="https://kikobeats.com/images/avatar.jpg">␊
<meta property="og:image" content="https://kikobeats.com/images/avatar.jpg">␊
<link rel="canonical" href="https://kikobeats.com/images/avatar.jpg">␊
</head>␊
</head>␊
<body>␊
<img src="https://kikobeats.com/images/avatar.jpg">␊
</body>␊
</html>

## get html from video url

> Snapshot 1
`<html lang="en">␊
<head>␊
<meta name="viewport" content="width=device-width, minimum-scale=0.1">␊
<head>␊
<meta charset="utf-8">␊
<meta http-equiv="X-UA-Compatible" content="IE=edge">␊
<meta name="viewport" content="width=device-width, initial-scale=1.0" shrink-to-fit="no">␊
<title>preview.mp4</title>␊
<meta property="og:site_name" content="microlink.io">␊
<meta property="article:published_time" content="Sun, 16 Dec 2018 18:56:11 GMT">␊
<meta property="og:locale" content="en">␊
<meta property="og:url" content="https://microlink.io/preview.mp4">␊
<meta property="og:image" content="https://microlink.io/preview.mp4">␊
<meta property="og:video:secure_url" content="https://microlink.io/preview.mp4">␊
<link rel="canonical" href="https://microlink.io/preview.mp4">␊
</head>␊
</head>␊
<body>␊
<img src="https://microlink.io/preview.mp4">␊
</body>␊
</html>`
Binary file modified test/snapshots/index.js.snap
Binary file not shown.

0 comments on commit 61aee97

Please sign in to comment.