Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add rewriteHtml #207

Merged
merged 5 commits into from
Oct 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,13 @@ Default: `false`

When `true`, relative CSS/HTML URLs present in the HTML markup are rewritten into absolute URLs.

##### rewriteHtml

Type: `boolean`<br>
Default: `false`

When `true`, it rewrites some common mistakes related to HTML meta tags.

## License

**html-get** © [Microlink](https://microlink.io), released under the [MIT](https://github.com/microlinkhq/html-get/blob/master/LICENSE.md) License.<br>
Expand Down
27 changes: 27 additions & 0 deletions src/html.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
'use strict'

const { get, split, nth, castArray, forEach } = require('lodash')
const debug = require('debug-logfmt')('html-get:rewrite')
const localhostUrl = require('localhost-url-regex')
const { TAGS: URL_TAGS } = require('html-urls')
const isHTML = require('is-html-content')
Expand Down Expand Up @@ -89,6 +90,29 @@ const addBody = ({ url, headers, html }) => {
return `<!DOCTYPE html><html><head></head><body>${element}</body></html>`
}

/**
 * Tell whether a meta attribute value is treated as an Open Graph key,
 * i.e. whether it starts with the `og:` or `fb:` prefix.
 *
 * @param {string} [prop=''] - Attribute value to inspect; `undefined` falls
 *   back to the empty string.
 * @returns {boolean} `true` when the value starts with one of the prefixes.
 */
const isOpenGraph = (prop = '') => {
  for (const prefix of ['og:', 'fb:']) {
    if (prop.startsWith(prefix)) return true
  }
  return false
}

/**
 * Normalize the attribute used to declare the key of every `<meta>` tag.
 *
 * Tags without a truthy `content` attribute are left untouched. Otherwise:
 * - when `name` holds an Open Graph key (per `isOpenGraph`) and differs from
 *   `property`, `name` is removed and its value is stored in `property`;
 * - else, when `property` is set to a non Open Graph key, `property` is
 *   removed and its value is stored in `name`.
 *
 * The document is mutated in place; nothing is returned.
 *
 * @param {object} options
 * @param {Function} options.$ - Cheerio-like instance wrapping the document.
 */
const rewriteMetaTags = ({ $ }) => {
  $('meta').each((_, element) => {
    const tag = $(element)

    // Nothing worth fixing when the tag carries no content.
    const hasContent = Boolean(tag.attr('content'))
    if (!hasContent) return

    const nameValue = tag.attr('name')
    const propertyValue = tag.attr('property')

    // Open Graph key declared through `name`: expose it through `property`.
    const shouldBeProperty =
      propertyValue !== nameValue && isOpenGraph(nameValue)
    if (shouldBeProperty) {
      tag.removeAttr('name')
      tag.attr('property', nameValue)
      debug('og', tag.attr())
      return
    }

    // Non Open Graph key declared through `property`: expose it through `name`.
    const shouldBeName =
      Boolean(propertyValue) && !isOpenGraph(propertyValue)
    if (shouldBeName) {
      tag.removeAttr('property')
      tag.attr('name', propertyValue)
      debug('meta', tag.attr())
    }
  })
}

const rewriteHtmlUrls = ({ $, url }) => {
forEach(URL_TAGS, (tagName, urlAttr) => {
$(tagName.join(',')).each(function () {
Expand Down Expand Up @@ -156,6 +180,7 @@ module.exports = ({
hide,
remove,
rewriteUrls,
rewriteHtml,
scripts,
modules
}) => {
Expand All @@ -167,6 +192,8 @@ module.exports = ({

if (rewriteUrls) rewriteHtmlUrls({ $, url })

if (rewriteHtml) rewriteMetaTags({ $, url })

addHead({ $, url, headers })

if (styles) injectStyle({ $, styles })
Expand Down
8 changes: 6 additions & 2 deletions src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,7 @@ const getContent = PCancelable.fn(
mutoolPath,
puppeteerOpts,
rewriteUrls,
rewriteHtml,
toEncode
},
onCancel
Expand All @@ -224,7 +225,8 @@ const getContent = PCancelable.fn(
const html = addHtml({
...content,
...(isFetchMode ? puppeteerOpts : undefined),
rewriteUrls
rewriteUrls,
rewriteHtml
})

return { ...content, html }
Expand All @@ -245,7 +247,8 @@ module.exports = PCancelable.fn(
mutoolPath = defaultMutoolPath(),
prerender = 'auto',
puppeteerOpts,
rewriteUrls = false
rewriteUrls = false,
rewriteHtml = false
} = {},
onCancel
) => {
Expand All @@ -268,6 +271,7 @@ module.exports = PCancelable.fn(
mutoolPath,
puppeteerOpts,
rewriteUrls,
rewriteHtml,
toEncode
})

Expand Down
154 changes: 154 additions & 0 deletions test/html/rewrite-html.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
'use strict'

const test = require('ava')
const cheerio = require('cheerio')

const { prettyHtml } = require('../util')

const html = require('../../src/html')

/**
 * Build the HTML document used as fixture by the tests in this file.
 *
 * The document has a fixed `<head>` (title, `og:site_name`, canonical link
 * and charset) followed by the provided tags, one per line, and the result
 * is passed through `prettyHtml`.
 *
 * @param {string[]} meta - Raw tags to append to the `<head>`.
 * @returns {*} Whatever `prettyHtml` returns for the composed markup.
 */
const composeHtml = meta => {
  const extraTags = meta.join('\n')
  const page = `
<!DOCTYPE html>
<html>
<head>
<title>kikobeats.com</title>
<meta property="og:site_name" content="kikobeats.com">
<link rel="canonical" href="https://kikobeats.com"><meta charset="utf-8">
${extraTags}
</head>
<body></body>
</html>`
  return prettyHtml(page)
}

test("don't rewrite og if property is already present", async t => {
  const expected = 'This Pin was discovered by NMA Group'

  // The tag declares the same Open Graph key through both attributes.
  const markup = composeHtml([
    '<meta content="This Pin was discovered by NMA Group" data-app="true" name="og:description" property="og:description">'
  ])

  const $ = cheerio.load(
    html({
      rewriteHtml: true,
      url: 'https://kikobeats.com',
      html: markup,
      headers: { 'content-type': 'text/html; charset=utf-8' }
    })
  )

  // Both attributes must survive, so the tag is reachable either way.
  for (const attribute of ['name', 'property']) {
    t.is($(`meta[${attribute}="og:description"]`).attr('content'), expected)
  }
})

test('fb propietary tags should be treat as og', async t => {
  // The same tag declared through `property` and through `name`; in both
  // cases it must end up reachable only through `property`.
  const variants = [
    '<meta content="1234" property="fb:app_id">',
    '<meta content="1234" name="fb:app_id">'
  ]

  for (const meta of variants) {
    const $ = cheerio.load(
      html({
        rewriteHtml: true,
        url: 'https://kikobeats.com',
        html: composeHtml([meta]),
        headers: { 'content-type': 'text/html; charset=utf-8' }
      })
    )

    t.is($('meta[property="fb:app_id"]').attr('content'), '1234')
    t.is($('meta[name="fb:app_id"]').attr('content'), undefined)
  }
})

test("don't rewrite og if content is empty", async t => {
  // Both tags WOULD be rewritten if they had content: the first one declares
  // an Open Graph key through `name`, the second one a non Open Graph key
  // through `property`. A tag that is never rewritten regardless of its
  // content (e.g. `name="twitter:description"`) would make this test pass
  // even without the empty-content guard.
  const output = html({
    rewriteHtml: true,
    url: 'https://kikobeats.com',
    html: composeHtml([
      '<meta content="" name="og:description">',
      '<meta content="" property="twitter:description">'
    ]),
    headers: { 'content-type': 'text/html; charset=utf-8' }
  })

  const $ = cheerio.load(output)

  // Open Graph key declared through `name` stays as it is.
  t.is($('meta[name="og:description"]').attr('content'), '')
  t.is($('meta[property="og:description"]').attr('content'), undefined)

  // Non Open Graph key declared through `property` stays as it is.
  t.is($('meta[property="twitter:description"]').attr('content'), '')
  t.is($('meta[name="twitter:description"]').attr('content'), undefined)
})

test('rewrite multiple og wrong markup', async t => {
  // Open Graph keys (wrongly declared through `name`) and their content.
  const expected = {
    'og:title': 'Kiko Beats',
    'og:description': 'Personal website of Kiko Beats',
    'og:image': 'https://kikobeats.com/image.jpg'
  }

  const metas = Object.entries(expected).map(
    ([key, content]) => `<meta name="${key}" content="${content}">`
  )

  const output = html({
    rewriteHtml: true,
    url: 'https://kikobeats.com',
    html: composeHtml(metas),
    headers: { 'content-type': 'text/html; charset=utf-8' }
  })

  const $ = cheerio.load(output)

  // Every key must now be reachable through `property`.
  for (const [key, content] of Object.entries(expected)) {
    t.is($(`meta[property="${key}"]`).attr('content'), content)
  }
})

test('rewrite multiple meta wrong markup', async t => {
  // Plain meta keys (wrongly declared through `property`) and their content.
  const fixtures = [
    ['title', 'Kiko Beats'],
    ['description', 'Personal website of Kiko Beats'],
    ['image', 'https://kikobeats.com/image.jpg']
  ]

  const document = composeHtml(
    fixtures.map(
      ([key, content]) => `<meta property="${key}" content="${content}">`
    )
  )

  const $ = cheerio.load(
    html({
      rewriteHtml: true,
      url: 'https://kikobeats.com',
      html: document,
      headers: { 'content-type': 'text/html; charset=utf-8' }
    })
  )

  // Every key must now be reachable through `name`.
  fixtures.forEach(([key, content]) => {
    t.is($(`meta[name="${key}"]`).attr('content'), content)
  })
})

test('rewrite multiple twitter wrong markup', async t => {
  // Twitter keys (wrongly declared through `property`) and their content.
  const cards = [
    { key: 'twitter:title', content: 'Kiko Beats' },
    { key: 'twitter:description', content: 'Personal website of Kiko Beats' },
    { key: 'twitter:image', content: 'https://kikobeats.com/image.jpg' }
  ]

  const toMeta = ({ key, content }) =>
    `<meta property="${key}" content="${content}">`

  const output = html({
    rewriteHtml: true,
    url: 'https://kikobeats.com',
    html: composeHtml(cards.map(toMeta)),
    headers: { 'content-type': 'text/html; charset=utf-8' }
  })

  const $ = cheerio.load(output)
  const contentByName = key => $(`meta[name="${key}"]`).attr('content')

  // Every key must now be reachable through `name`.
  for (const { key, content } of cards) {
    t.is(contentByName(key), content)
  }
})