Skip to content

Commit

Permalink
feat: Improve domains auto detection (#50)
Browse files Browse the repository at this point in the history
feat: Improve domains auto detection
  • Loading branch information
Kikobeats authored May 16, 2019
2 parents f4882b8 + 71f8f55 commit 782446d
Show file tree
Hide file tree
Showing 8 changed files with 88 additions and 84 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ package-lock.json
.idea
*sublime*
nbproject
src/auto-domains.json

############################
# Tests
Expand Down
17 changes: 7 additions & 10 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,16 @@
"got": "~9.6.0",
"he": "~1.2.0",
"html-encode": "~2.1.1",
"lodash": "~4.17.11",
"mem": "~4.3.0",
"mime-types": "~2.1.24",
"p-cancelable": "~2.0.0",
"reachable-url": "~1.1.8",
"require-one-of": "~1.0.9",
"time-span": "~3.1.0",
"tldts": "~4.0.6"
"tldts": "~4.0.6",
"top-sites": "~1.1.8",
"write-json-file": "~3.2.0"
},
"devDependencies": {
"@commitlint/cli": "latest",
Expand All @@ -62,17 +65,14 @@
"git-dirty": "latest",
"husky": "latest",
"lint-staged": "latest",
"lodash": "latest",
"npm-check-updates": "latest",
"nyc": "latest",
"prettier-standard": "latest",
"pretty": "latest",
"puppeteer": "latest",
"standard": "latest",
"standard-markdown": "latest",
"standard-version": "latest",
"top-sites": "latest",
"write-json-file": "latest"
"standard-version": "latest"
},
"engines": {
"node": ">= 8"
Expand All @@ -85,9 +85,10 @@
"clean": "rm -rf node_modules",
"coverage": "nyc report --reporter=text-lcov | coveralls",
"lint": "standard-markdown README.md && standard",
"postinstall": "node scripts/postinstall",
"postrelease": "npm run release:tags && npm run release:github && ci-publish",
"prerelease": "npm run update:check && git-authors-cli",
"pretest": "npm run lint",
"pretest": "npm run lint && npm run postinstall",
"pretty": "prettier-standard index.js {core,test,bin,scripts}/**/*.js --single-quote --print-width 100",
"release": "git add package.json && standard-version -a",
"release:github": "conventional-github-releaser -p angular",
Expand All @@ -114,10 +115,6 @@
"finepack",
"git add"
],
"src/auto-domains.json": [
"node sort",
"git add"
],
"*.js": [
"prettier-standard",
"git add"
Expand Down
56 changes: 56 additions & 0 deletions scripts/postinstall
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/usr/bin/env node

'use strict'

const { compact, reduce, findIndex } = require('lodash')
const writeJsonFile = require('write-json-file')
const topsites = require('top-sites')

const { getDomainWithoutSuffix } = require('../src/tlds')

// Curated domain names, written without their public suffix
// (e.g. 'facebook' rather than 'facebook.com').
//
// Each entry is matched below against `getDomainWithoutSuffix(rootDomain)` of
// the `top-sites` entries, and the resulting ordered list is written to
// `src/auto-domains.json`.
// NOTE(review): presumably these are sites that can be fetched without
// prerendering (the generated list drives `isFetchMode` in src/index.js) —
// confirm before adding new entries.
const domains = [
'facebook',
'twitter',
'instagram',
'pinterest',
'apple',
'vimeo',
'flickr',
'microsoft',
'reddit',
'nytimes',
'github',
'bbc',
'digg',
'yelp',
'imdb',
'huffingtonpost',
'etsy',
'slideshare',
'eventbrite',
'sourceforge',
'telegraph',
'bloomberg',
'medium',
'techcrunch',
'washingtonpost',
'engadget',
'theverge',
'giphy'
]

// Position of `domain` inside `topsites`, comparing against each entry's
// `rootDomain` with its public suffix stripped; -1 when there is no match.
const findRank = domain =>
  findIndex(topsites, site => getDomainWithoutSuffix(site.rootDomain) === domain)

// Partition the curated domains by whether they appear in `topsites`.
// `top` is a sparse array indexed by the position of the matching `topsites`
// entry, so matched domains end up ordered by that position; `rest` collects
// the unmatched domains in the order they are listed.
const { top, rest } = reduce(
  domains,
  (buckets, domain) => {
    const rank = findRank(domain)
    if (rank === -1) {
      buckets.rest.push(domain)
    } else {
      buckets.top[rank] = domain
    }
    return buckets
  },
  { top: new Array(topsites.length), rest: [] }
)

// Persist the ordered list: domains found in `topsites` first (holes in the
// sparse `top` array removed by `compact`), followed by the unmatched ones.
//
// The destination is resolved from this script's location rather than the
// current working directory, so the file lands in `src/` no matter where the
// script is invoked from.
const outputPath = require('path').resolve(__dirname, '../src/auto-domains.json')

writeJsonFile(outputPath, compact(top).concat(rest))
  .then(() => process.exit())
  .catch(err => {
    // `src/index.js` requires the generated file at load time, so a failed
    // write must fail the install instead of being logged and ignored.
    console.error(err)
    process.exit(1)
  })
28 changes: 0 additions & 28 deletions sort.js

This file was deleted.

29 changes: 0 additions & 29 deletions src/auto-domains.json

This file was deleted.

25 changes: 10 additions & 15 deletions src/index.js
Original file line number Diff line number Diff line change
@@ -1,19 +1,17 @@
'use strict'

const { getDomain, getPublicSuffix } = require('tldts')
const { isMediaUrl } = require('@metascraper/helpers')
const requireOneOf = require('require-one-of')
const reachableUrl = require('reachable-url')
const PCancelable = require('p-cancelable')
const debug = require('debug')('html-get')
const htmlEncode = require('html-encode')
const timeSpan = require('time-span')

const { URL } = require('url')
const got = require('got')
const mem = require('mem')
const he = require('he')

const { getDomainWithoutSuffix } = require('./tlds')
const autoDomains = require('./auto-domains')
const addHtml = require('./html')

Expand All @@ -32,13 +30,15 @@ const REQ_TIMEOUT_REACHABLE = REQ_TIMEOUT * 0.25
// Puppeteer doesn't resolve redirection well.
// We need to ensure we have the right url.
const getUrl = mem(
async targetUrl => {
async (targetUrl, opts) => {
try {
const res = await reachableUrl(targetUrl, {
timeout: REQ_TIMEOUT_REACHABLE
timeout: REQ_TIMEOUT_REACHABLE,
...opts
})
return res
} catch (err) {
debug('getUrl:err', err)
return { url: targetUrl, headers: {} }
}
},
Expand All @@ -51,6 +51,7 @@ const fetch = (url, { toEncode, reflect = false, ...opts }) =>
new PCancelable(async (resolve, reject, onCancel) => {
const req = got(url, {
encoding: null,
retry: 0,
timeout: reflect ? REQ_TIMEOUT / 2 : REQ_TIMEOUT,
...opts
})
Expand Down Expand Up @@ -99,11 +100,7 @@ const prerender = async (url, { getBrowserless, gotOptions, toEncode, ...opts })

const modes = { fetch, prerender }

const isFetchMode = mem(url => {
const suffix = getPublicSuffix(url)
const domain = getDomain(url)
return autoDomains.includes(suffix ? domain.replace(`.${suffix}`, '') : domain)
})
const isFetchMode = url => autoDomains.includes(getDomainWithoutSuffix(url))

const determinateMode = (url, { prerender }) => {
if (prerender === false) return 'fetch'
Expand All @@ -113,7 +110,7 @@ const determinateMode = (url, { prerender }) => {
}

const getContent = async (encodedUrl, mode, opts) => {
const { url, headers } = await getUrl(encodedUrl)
const { url, headers } = await getUrl(encodedUrl, opts)
debug(`getUrl ${encodedUrl === url ? url : `${encodedUrl}${url}`}`)
const content = await modes[mode](url, opts)

Expand All @@ -134,16 +131,14 @@ module.exports = async (
puppeteerOpts
} = {}
) => {
const { href: encodedUrl } = new URL(targetUrl)
const toEncode = htmlEncode(encoding)
const reqMode = getMode(encodedUrl, { prerender })
const reqMode = getMode(targetUrl, { prerender })

const opts =
reqMode === 'fetch'
? { toEncode, ...gotOptions }
: { toEncode, getBrowserless, gotOptions, ...puppeteerOpts }

const time = timeSpan()
const { url, html, mode } = await getContent(encodedUrl, reqMode, opts)
const { url, html, mode } = await getContent(targetUrl, reqMode, opts)
return { url, html, stats: { mode, timing: time() } }
}
12 changes: 12 additions & 0 deletions src/tlds.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
'use strict'

const { getDomain, getPublicSuffix } = require('tldts')
const mem = require('mem')

/**
 * Get the registrable domain of a URL with its public suffix removed,
 * e.g. `https://www.github.com/foo` → `github`.
 *
 * Results are memoized with `mem`.
 *
 * @param {string} url - URL or hostname handed to `tldts`.
 * @returns {string|null} The domain without its suffix, or whatever falsy
 *   value `getDomain` produced when no domain could be determined.
 */
const getDomainWithoutSuffix = mem(url => {
  const domain = getDomain(url)
  // `tldts` can report a public suffix while the domain itself is `null`
  // (e.g. `localhost` or a bare suffix such as `co.uk`); calling `.replace`
  // on it would throw, so hand the empty result back to the caller as-is.
  if (!domain) return domain
  const suffix = getPublicSuffix(url)
  return suffix ? domain.replace(`.${suffix}`, '') : domain
})

module.exports = { getDomainWithoutSuffix }
4 changes: 2 additions & 2 deletions test/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ test('unreachable urls', async t => {
<title>notexisturl.dev</title>
<meta property="og:site_name" content="notexisturl.dev">
<meta property="og:locale" content="en">
<meta property="og:url" content="https://notexisturl.dev/">
<link rel="canonical" href="https://notexisturl.dev/">
<meta property="og:url" content="https://notexisturl.dev">
<link rel="canonical" href="https://notexisturl.dev">
</head>
<body>
</body>
Expand Down

0 comments on commit 782446d

Please sign in to comment.