Skip to content

Commit

Permalink
build: use html detection based on headers and content
Browse files Browse the repository at this point in the history
  • Loading branch information
Kikobeats committed May 7, 2019
1 parent c5e099e commit 27d52c6
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 5 deletions.
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,11 @@
"@metascraper/helpers": "~5.2.0",
"cheerio": "~1.0.0-rc.3",
"debug": "~4.1.1",
- "file-type": "~11.0.0",
"got": "~9.6.0",
"he": "~1.2.0",
"html-encode": "~2.1.1",
"mem": "~4.3.0",
+ "mime-types": "~2.1.24",
"p-cancelable": "~2.0.0",
"reachable-url": "~1.1.8",
"require-one-of": "~1.0.3",
Expand Down
3 changes: 2 additions & 1 deletion src/auto-domains.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,6 @@
"medium",
"techcrunch",
"engadget",
- "theverge"
+ "theverge",
+ "giphy"
]
7 changes: 4 additions & 3 deletions src/html.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
'use strict'

const { isMime } = require('@metascraper/helpers')
+ const mimeTypes = require('mime-types')
const { getDomain } = require('tldts')
- const fileType = require('file-type')
const cheerio = require('cheerio')
const { URL } = require('url')
const path = require('path')
Expand Down Expand Up @@ -86,8 +86,9 @@ const htmlTemplate = () => `

module.exports = ({ html, url, headers }) => {
const contentType = headers['content-type']
- const htmlTyle = fileType(Buffer.from(html, 0, fileType.minimumBytes))
- const content = htmlTyle === undefined ? html : htmlTemplate()
+ const isHTML =
+ mimeTypes.extension(contentType) === 'html' && typeof html === 'string' && html.length
+ const content = isHTML ? html : htmlTemplate()

const $ = cheerio.load(content, {
decodeEntities: false,
Expand Down

0 comments on commit 27d52c6

Please sign in to comment.