From f60248ef444102ba8a6878d783ecf8792c50522a Mon Sep 17 00:00:00 2001 From: Kiko Beats Date: Tue, 26 Jun 2018 10:02:07 +0200 Subject: [PATCH] First commit --- .bumpedrc | 24 ++++++++++ .editorconfig | 22 +++++++++ .gitattributes | 1 + .gitignore | 34 ++++++++++++++ .npmrc | 4 ++ .travis.yml | 5 ++ LICENSE | 21 +++++++++ README.md | 111 ++++++++++++++++++++++++++++++++++++++++++++ package.json | 79 +++++++++++++++++++++++++++++++ src/auto-domains.js | 30 ++++++++++++ src/index.js | 61 ++++++++++++++++++++++++ test/index.js | 23 +++++++++ 12 files changed, 415 insertions(+) create mode 100644 .bumpedrc create mode 100755 .editorconfig create mode 100755 .gitattributes create mode 100755 .gitignore create mode 100644 .npmrc create mode 100755 .travis.yml create mode 100755 LICENSE create mode 100644 README.md create mode 100644 package.json create mode 100644 src/auto-domains.js create mode 100644 src/index.js create mode 100644 test/index.js diff --git a/.bumpedrc b/.bumpedrc new file mode 100644 index 0000000..4a77d72 --- /dev/null +++ b/.bumpedrc @@ -0,0 +1,24 @@ +files: + - package.json +plugins: + prerelease: + Adding authors: + plugin: bumped-terminal + command: npx git-authors-cli + Linting config files: + plugin: bumped-finepack + postrelease: + Generating CHANGELOG file: + plugin: bumped-changelog + Committing new version: + plugin: bumped-terminal + command: 'git add CHANGELOG.md package.json && git commit -m "Release $newVersion"' + Detecting problems before publish: + plugin: bumped-terminal + command: 'git-dirty && npm test' + Publishing tag to GitHub: + plugin: bumped-terminal + command: 'git tag $newVersion && git push && git push --tags' + Publishing to NPM: + plugin: bumped-terminal + command: npm publish diff --git a/.editorconfig b/.editorconfig new file mode 100755 index 0000000..c3efa59 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,22 @@ +# http://editorconfig.org + +root = true + +[*] +indent_style = space +indent_size = 2 +end_of_line = lf 
+charset = utf-8 +trim_trailing_whitespace = true +insert_final_newline = true +max_line_length = 100 +indent_brace_style = 1TBS +spaces_around_operators = true +quote_type = auto + +[package.json] +indent_style = space +indent_size = 2 + +[*.md] +trim_trailing_whitespace = false diff --git a/.gitattributes b/.gitattributes new file mode 100755 index 0000000..176a458 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +* text=auto diff --git a/.gitignore b/.gitignore new file mode 100755 index 0000000..0b01f76 --- /dev/null +++ b/.gitignore @@ -0,0 +1,34 @@ +############################ +# npm +############################ +node_modules +npm-debug.log +.node_history +yarn.lock +package-lock.json + +############################ +# tmp, editor & OS files +############################ +.tmp +*.swo +*.swp +*.swn +*.swm +.DS_Store +*# +*~ +.idea +*sublime* +nbproject + +############################ +# Tests +############################ +testApp +coverage +.nyc_output + +############################ +# Other +############################ diff --git a/.npmrc b/.npmrc new file mode 100644 index 0000000..e03e941 --- /dev/null +++ b/.npmrc @@ -0,0 +1,4 @@ +unsafe-perm=true +save-prefix=~ +shrinkwrap=false +save=false diff --git a/.travis.yml b/.travis.yml new file mode 100755 index 0000000..d2e90b3 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,5 @@ +language: node_js +node_js: + - "node" + - "lts/*" +after_success: npm run coveralls diff --git a/LICENSE b/LICENSE new file mode 100755 index 0000000..e77f7f8 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright © 2018 Kiko Beats (kikobeats.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom 
the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..6bf1c17 --- /dev/null +++ b/README.md @@ -0,0 +1,111 @@ +# get-html + +![Last version](https://img.shields.io/github/tag/Kikobeats/get-html.svg?style=flat-square) +[![Build Status](https://img.shields.io/travis/Kikobeats/get-html/master.svg?style=flat-square)](https://travis-ci.org/Kikobeats/get-html) +[![Coverage Status](https://img.shields.io/coveralls/Kikobeats/get-html.svg?style=flat-square)](https://coveralls.io/github/Kikobeats/get-html) +[![Dependency status](https://img.shields.io/david/Kikobeats/get-html.svg?style=flat-square)](https://david-dm.org/Kikobeats/get-html) +[![Dev Dependencies Status](https://img.shields.io/david/dev/Kikobeats/get-html.svg?style=flat-square)](https://david-dm.org/Kikobeats/get-html#info=devDependencies) +[![NPM Status](https://img.shields.io/npm/dm/get-html.svg?style=flat-square)](https://www.npmjs.org/package/get-html) +[![Donate](https://img.shields.io/badge/donate-paypal-blue.svg?style=flat-square)](https://paypal.me/Kikobeats) + +> Get the HTML from any website, using prerendering when is necessary. + +## Features + +- Get HTML markup from any website (client side apps as well) +- Prerendering detection based on domains whitelist. +- Speed up process blocking ads trackers. 
+- Encoding body response properly. + +## Install + +```bash +$ npm install get-html --save +``` + +## Usage + +```js +'use strict' + +const getHTML = require('get-html') +;(async () => { + const url = 'https://kikobeats.com' + const { html, stats } = await getHTML(url) + console.log(html) +})() +``` + +## API + +### getHTML(url, [options]) + +#### url + +*Required*
+Type: `string` + +The target URL for getting the HTML markup. + +#### options + +##### prerender + +Type: `boolean|string`
+Default: `'auto'` + +Explicitly enable or disable prerendering as the mechanism for getting the HTML markup. + +The value `auto` means that the library internally uses a whitelist of websites that don't need prerendering. This list is used to speed up the process, using `fetch` mode for these websites. + +See the [fetchMode parameter](#fetchMode) to know more. + +##### browserless + +Type: `object`
+ +A [browserless](https://browserless.js.org/) instance to be used for interacting with puppeteer. If you don't provide one, a browser instance will be created in each library call. + +##### encoding + +Type: `string`
+Default: `'utf-8'` + +Encoding the HTML markup properly from the body response. + +It determines the target encoding passed to `html-encode`, a Node.js library for converting HTML documents of arbitrary encoding into a target encoding (utf8, utf16, etc). + +##### fetchMode + +Type: `function`
+ +A function that will be invoked to determine the `mode` used for getting the HTML markup from the target URL. + +The default `fetchMode` is: + +```js +const getFetchMode = (url, { prerender }) => { + if (prerender === false) return 'fetch' + if (prerender !== 'auto') return 'prerender' + return autoDomains.includes(parseDomain(url).domain) ? 'fetch' : 'prerender' +} +``` + +##### gotOptions + +Type: `object`
+ +Under `mode=fetch`, pass configuration object to [got](https://www.npmjs.com/package/got). + +##### puppeteerOpts + +Type: `object` + +Under non `mode=fetch`, pass configuration object to [puppeteer](https://www.npmjs.com/package/puppeteer). + +## License + +**get-html** © [Kiko Beats](https://kikobeats.com), released under the [MIT](https://github.com/Kikobeats/get-html/blob/master/LICENSE) License.
+Authored and maintained by Kiko Beats with help from [contributors](https://github.com/Kikobeats/get-html/contributors). + +> [kikobeats.com](https://kikobeats.com) · GitHub [Kiko Beats](https://github.com/Kikobeats) · Twitter [@Kikobeats](https://twitter.com/Kikobeats) diff --git a/package.json b/package.json new file mode 100644 index 0000000..b060eef --- /dev/null +++ b/package.json @@ -0,0 +1,79 @@ +{ + "name": "get-html", + "description": "Get the HTML from any website, using prerendering when is necessary.", + "homepage": "https://documentup.com/Kikobeats/get-html", + "version": "0.0.0", + "main": "src/index.js", + "author": { + "email": "josefrancisco.verdu@gmail.com", + "name": "Kiko Beats", + "url": "https://kikobeats.com" + }, + "repository": { + "type": "git", + "url": "git+https://github.com/Kikobeats/get-html.git" + }, + "bugs": { + "url": "https://github.com/Kikobeats/get-html/issues" + }, + "keywords": [ + "fetch", + "get", + "got", + "headless", + "html", + "prerender", + "request" + ], + "dependencies": { + "browserless": "~3.6.1", + "got": "~8.3.1", + "html-encode": "~2.0.1", + "parse-domain": "~2.1.2", + "puppeteer": "~1.5.0", + "time-span": "~2.0.0" + }, + "devDependencies": { + "ava": "latest", + "coveralls": "latest", + "finepack": "latest", + "git-authors-cli": "latest", + "git-dirty": "latest", + "husky": "latest", + "lint-staged": "latest", + "nyc": "latest", + "prettier-standard": "latest", + "standard": "latest", + "standard-markdown": "latest" + }, + "engines": { + "node": ">= 8" + }, + "files": [ + "src" + ], + "scripts": { + "clean": "rm -rf node_modules", + "coveralls": "nyc report --reporter=text-lcov | coveralls", + "lint": "standard-markdown && standard", + "precommit": "lint-staged", + "pretest": "npm run lint", + "pretty": "prettier-standard index.js {core,test,bin,scripts}/**/*.js --single-quote --print-width 100", + "test": "nyc ava" + }, + "license": "MIT", + "lint-staged": { + "package.json": [ + "finepack", + "git add" + ], 
+ "*.js": [ + "prettier-standard", + "git add" + ], + "*.md": [ + "standard-markdown", + "git add" + ] + } +} diff --git a/src/auto-domains.js b/src/auto-domains.js new file mode 100644 index 0000000..769a7ed --- /dev/null +++ b/src/auto-domains.js @@ -0,0 +1,30 @@ +module.exports = [ + 'apple', + 'bbc', + 'bloomberg', + 'digg', + 'engadget', + 'etsy', + 'eventbrite', + 'facebook', + 'flickr', + 'github', + 'gizmodo', + 'huffingtonpost', + 'imdb', + 'instagram', + 'medium', + 'microsoft', + 'nytimes', + 'pinterest', + 'reddit', + 'slideshare', + 'sourceforge', + 'techcrunch', + 'telegraph', + 'theverge', + 'twitter', + 'vimeo', + 'yelp', + 'youtube' +] diff --git a/src/index.js b/src/index.js new file mode 100644 index 0000000..ac051be --- /dev/null +++ b/src/index.js @@ -0,0 +1,61 @@ +'use strict' + +const createBrowserless = require('browserless') +const parseDomain = require('parse-domain') +const htmlEncode = require('html-encode') +const timeSpan = require('time-span') +const got = require('got') + +const autoDomains = require('./auto-domains') + +const fetch = async (url, { toEncode, ...opts }) => { + const res = await got(url, { encoding: null, ...opts }) + return toEncode(res.body, res.headers['content-type']) +} + +const prerender = async ( + url, + { browserless = createBrowserless(), gotOptions, toEncode, ...opts } +) => { + const fetchData = fetch(url, { toEncode, ...gotOptions }) + let html + + try { + html = await browserless.getHTML(url, opts) + fetchData.cancel() + } catch (err) { + html = await fetchData + } + + return html +} + +const FETCH_MODE = { fetch, prerender } + +const getFetchMode = (url, { prerender }) => { + if (prerender === false) return 'fetch' + if (prerender !== 'auto') return 'prerender' + return autoDomains.includes(parseDomain(url).domain) ? 
'fetch' : 'prerender' +} + +module.exports = async ( + url, + { + browserless, + encoding = 'utf-8', + fetchMode = getFetchMode, + gotOptions, + prerender = 'auto', + puppeteerOpts + } = {} +) => { + const toEncode = htmlEncode(encoding) + const mode = fetchMode(url, { prerender }) + const opts = + mode === 'fetch' + ? { toEncode, ...gotOptions } + : { toEncode, browserless, gotOptions, ...puppeteerOpts } + const time = timeSpan() + const html = await FETCH_MODE[mode](url, opts) + return { html, stats: { mode, timing: time() } } +} diff --git a/test/index.js b/test/index.js new file mode 100644 index 0000000..2f58f58 --- /dev/null +++ b/test/index.js @@ -0,0 +1,23 @@ +'use strict' + +const test = require('ava') + +const getHTML = require('..') + +test('prerender by default', async t => { + const url = 'https://kikobeats.com' + const { stats } = await getHTML(url) + t.is(stats.mode, 'prerender') +}) + +test('disable prerender', async t => { + const url = 'https://kikobeats.com' + const { stats } = await getHTML(url, { prerender: false }) + t.is(stats.mode, 'fetch') +}) + +test('prerender auto detection', async t => { + const url = 'https://facebook.com' + const { stats } = await getHTML(url) + t.is(stats.mode, 'fetch') +})