Skip to content

Commit

Permalink
feat: better timestamp inference (#167)
Browse files Browse the repository at this point in the history
  • Loading branch information
Kikobeats authored Aug 5, 2023
1 parent e902866 commit 6973d25
Show file tree
Hide file tree
Showing 7 changed files with 48 additions and 228 deletions.
8 changes: 4 additions & 4 deletions bin/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
'use strict'

const createBrowserless = require('browserless')
const mri = require('mri')
const { URL } = require('url')
const mri = require('mri')

const getHTML = require('..')

Expand All @@ -26,9 +26,9 @@ getHTML(url, { getBrowserless, ...args })
headers: ${
headers
? Object.keys(headers).reduce(
(acc, key) => `${acc}${key}=${headers[key]} `,
''
)
(acc, key) => `${acc}${key}=${headers[key]} `,
''
)
: '-'
}
`)
Expand Down
15 changes: 13 additions & 2 deletions src/html.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,21 @@ const has = el => el.length !== 0

/**
 * Append `item` to `collection` only when `el` is empty (per `has`).
 *
 * Returns `false` when `el` already has content, otherwise whatever
 * `collection.push` returns (the new length for a plain array).
 */
const upsert = (el, collection, item) => {
  if (has(el)) return false
  return collection.push(item)
}

/**
 * Work out a timestamp for the response from its headers.
 *
 * `last-modified` wins, then `date`. When neither is present the value is
 * estimated as "now minus `age`", where `age` is read as seconds and
 * converted to milliseconds.
 *
 * The result is whatever `toDate` yields for the chosen value.
 * NOTE(review): the test suite expects an ISO 8601 string, and `undefined`
 * when no header is usable, which relies on `toDate` rejecting NaN input;
 * confirm against the `toDate` helper.
 */
const getDate = headers => {
  const headerValue = get(headers, 'last-modified') || get(headers, 'date')
  if (headerValue) return toDate(headerValue)

  const now = Date.now()
  const ageInMs = Number(get(headers, 'age')) * 1000
  return toDate(now - ageInMs)
}

const addHead = ({ $, url, headers }) => {
const tags = []
const contentType = get(headers, 'content-type')
const charset = nth(split(contentType, 'charset='), 1)
const timestamp = get(headers, 'last-modified') || get(headers, 'date')
const date = timestamp && toDate(timestamp)
const date = getDate(headers)
const { domain } = parseUrl(url)
const head = $('head')

Expand Down Expand Up @@ -179,3 +188,5 @@ module.exports = ({

return rewriteUrls ? rewriteCssUrls({ html: $.html(), url }) : $.html()
}

// Also expose `getDate` as a property of the exported function so it can be required on its own.
module.exports.getDate = getDate
26 changes: 26 additions & 0 deletions test/html/get-date.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
'use strict'

const test = require('ava')

const { getDate } = require('../../src/html')

// `last-modified` alone is converted to its ISO 8601 representation.
test('from `last-modified`', t => {
  const headers = { 'last-modified': 'Fri, 04 Aug 2023 21:10:56 GMT' }
  t.is(getDate(headers), '2023-08-04T21:10:56.000Z')
})

// With no `last-modified` header present, the `date` header is used instead.
test('from `date`', t => {
  // Pass `date` (not `last-modified`) so the fallback branch is the one under test.
  const date = getDate({ date: 'Sat, 05 Aug 2023 09:43:59 GMT' })
  t.is(date, '2023-08-05T09:43:59.000Z')
})

test('from `age`', t => {
  // An `age` header on its own is enough to produce some timestamp.
  const fromAge = getDate({ age: '1884' })
  t.truthy(fromAge)

  // With no headers at all, no timestamp is produced.
  const fromNothing = getDate({})
  t.is(fromNothing, undefined)
})
6 changes: 3 additions & 3 deletions test/html.js → test/html/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ const path = require('path')
const test = require('ava')
const fs = require('fs')

const { prettyHtml } = require('./util')
const { prettyHtml } = require('../util')

const html = require('../src/html')
const html = require('../../src/html')

test('add minimal html markup', t => {
const output = html({
Expand Down Expand Up @@ -125,7 +125,7 @@ test('`rewriteHtmlUrls` rewrites relative root URLs inside html markup', t => {
rewriteUrls: true,
url: 'https://browserless.js.org',
html: fs.readFileSync(
path.resolve(__dirname, 'fixtures/browserless.html'),
path.resolve(__dirname, '../fixtures/browserless.html'),
'utf8'
),
headers: {
Expand Down
221 changes: 2 additions & 219 deletions test/snapshots/html.js.md → test/html/snapshots/index.js.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Snapshot report for `test/html.js`
# Snapshot report for `test/html/index.js`

The actual snapshot is saved in `html.js.snap`.
The actual snapshot is saved in `index.js.snap`.

Generated by [AVA](https://avajs.dev).

Expand Down Expand Up @@ -413,220 +413,3 @@ Generated by [AVA](https://avajs.dev).
<body>␊
</body>␊
</html>`

## ensure domain is valid

> Snapshot 1
undefined

## `rewriteHtmlUrls` don't modify udnefined attributes

> Snapshot 1
`<!DOCTYPE html>␊
<html lang="en">␊
<head>␊
<title>Document</title>␊
<meta property="og:site_name" content="moovility.me">␊
<link rel="canonical" href="https://moovility.me">␊
<meta charset="UTF-8">␊
</head>␊
<body>␊
<script>␊
console.log('greetings')␊
</script>␊
</body>␊
</html>`

## add markup for json payload

> Snapshot 1
`<!DOCTYPE html>␊
<html>␊
<head>␊
<title>geolocation.microlink.io</title>␊
<meta property="og:site_name" content="microlink.io">␊
<link rel="canonical" href="https://geolocation.microlink.io/">␊
</head>␊
<body><pre>{"origin":"83.46.149.83","city":"Madrid","alpha2":"ES","alpha3":"ESP","callingCodes":["+34"],"currencies":{"EUR":{"name":"Euro","symbol":"€"}},"eeaMember":true,"euMember":true,"flag":"🇪🇸","languages":{"spa":"Spanish"},"numeric":724,"tld":[".es"],"region":"MD","latitude":"40.4163","longitude":"-3.6934","timezone":"Europe/Madrid","headers":{"accept":"*/*","accept-encoding":"gzip","cdn-loop":"cloudflare","cf-connecting-ip":"83.46.149.83","cf-ipcountry":"ES","cf-ray":"73a29be38cdf37c7-MAD","cf-visitor":"{"scheme":"https"}","connection":"Keep-Alive","host":"geolocation.microlink.io","user-agent":"curl/7.79.1","x-forwarded-for":"172.70.57.171","x-forwarded-host":"geolocation.microlink.io","x-forwarded-proto":"https","x-real-ip":"172.70.57.171","x-vercel-edge-region":"dev","x-vercel-id":"cdg1::x96k9-1660405852783-a0083d276cde","x-vercel-ip-city":"Madrid","x-vercel-ip-country":"ES","x-vercel-ip-country-region":"MD","x-vercel-ip-latitude":"40.4163","x-vercel-ip-longitude":"-3.6934","x-vercel-ip-timezone":"Europe/Madrid","x-vercel-proxied-for":"172.70.57.171"}}</pre>␊
</body>␊
</html>`

## `rewriteUrls` rewrites relative root URLs inside html markup

> Snapshot 1
`<!DOCTYPE html>␊
<html lang="en">␊
<head>␊
<!-- Basic -->␊
<meta charset="utf-8">␊
<meta http-equiv="x-ua-compatible" content="ie=edge">␊
<!-- Search Engine -->␊
<meta name="description" content="a puppeter-like Node.js library for interacting with Headless production scenarios.">␊
<meta name="image" content="https://browserless.js.org/static/logo-banner.png">␊
<link rel="canonical" href="https://browserless.js.org">␊
<title>browserless, a puppeter-like Node.js library for interacting with Headless production scenarios.</title>␊
<meta name="viewport" content="width=device-width, user-scalable=no, initial-scale=1.0, maximum-scale=1.0, minimum-scale=1.0">␊
<!-- Schema.org for Google -->␊
<meta itemprop="name" content="browserless, a puppeter-like Node.js library for interacting with Headless production scenarios.">␊
<meta itemprop="description" content="a puppeter-like Node.js library for interacting with Headless production scenarios.">␊
<meta itemprop="image" content="https://browserless.js.org/static/logo-banner.png">␊
<!-- Twitter -->␊
<meta name="twitter:card" content="summary_large_image">␊
<meta name="twitter:title" content="browserless">␊
<meta name="twitter:description" content="a puppeter-like Node.js library for interacting with Headless production scenarios.">␊
<meta name="twitter:image" content="https://browserless.js.org/static/demo.png">␊
<meta name="twitter:label1" value="Installation">␊
<meta name="twitter:data1" value="npm install browserless --save">␊
<!-- Open Graph general (Facebook, Pinterest & Google+) -->␊
<meta property="og:title" content="browserless">␊
<meta property="og:logo" content="https://browserless.js.org/static/logo.png">␊
<meta property="og:description" content="a puppeter-like Node.js library for interacting with Headless production scenarios.">␊
<meta property="og:image" content="https://browserless.js.org/static/demo.png">␊
<meta property="og:url" content="https://browserless.js.org">␊
<meta property="og:site_name" content="browserless.js.org">␊
<meta property="og:type" content="website">␊
<!-- Favicon -->␊
<link rel="icon" type="image/png" href="https://browserless.js.org/static/favicon-32x32.png" sizes="32x32">␊
<link rel="icon" type="image/png" href="https://browserless.js.org/static/favicon-16x16.png" sizes="16x16">␊
<!-- Stylesheet -->␊
<link href="https://fonts.googleapis.com/css?family=Nunito|Nunito+Sans" rel="stylesheet">␊
<link rel="stylesheet" href="https://browserless.js.org/static/style.min.css">␊
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/codecopy/umd/codecopy.min.css">␊
</head>␊
<body>␊
<div id="app"></div>␊
<script src="https://browserless.js.org/static/main.min.js"></script>␊
<script src="https://unpkg.com/docsify/lib/docsify.min.js"></script>␊
<script src="https://unpkg.com/docsify/lib/plugins/ga.min.js"></script>␊
<script src="https://unpkg.com/docsify/lib/plugins/external-script.min.js"></script>␊
<script src="https://unpkg.com/prismjs/components/prism-bash.min.js"></script>␊
<script src="https://unpkg.com/prismjs/components/prism-jsx.min.js"></script>␊
<script src="https://cdn.jsdelivr.net/npm/codecopy/umd/codecopy.min.js"></script>␊
</body>␊
</html>`

## `rewriteUrls` rewrites relative URLs inside html markup

> Snapshot 1
`<!DOCTYPE html>␊
<html>␊
<head>␊
<link rel="apple-touch-icon" href="https://moovility.me/img/icons/MOV/icon2-76.png" sizes="76x76">␊
<title>moovility.me</title>␊
<meta property="og:site_name" content="moovility.me">␊
<link rel="canonical" href="https://moovility.me/">␊
<meta charset="utf-8">␊
</head>␊
<body>␊
</body>␊
</html>`

## `rewriteUrls` rewrites relative URLs inside stylesheet

> Snapshot 1
`<!DOCTYPE html>␊
<html lang="en">␊
<head>␊
<title>kikobeats.com</title>␊
<meta property="og:site_name" content="kikobeats.com">␊
<link rel="canonical" href="https://kikobeats.com">␊
<meta charset="utf-8">␊
</head>␊
<body>␊
<div style="background-image: url(https://kikobeats.com/images/microlink.jpg)"></div>␊
<div style="background-image: url(https://kikobeats.com/images/microlink.jpg)"></div>␊
</body>␊
</html>`

## `rewriteUrls` don't modify inline javascript

> Snapshot 1
`<!DOCTYPE html>␊
<html lang="en">␊
<head>␊
<meta charset="UTF-8">␊
<meta name="viewport" content="width=device-width, initial-scale=1.0">␊
<title>column-muralist-honors-african-americans-killed-by-police</title>␊
<meta property="og:site_name" content="latimes.com">␊
<link rel="canonical" href="https://www.latimes.com/opinion/story/2020-06-07/column-muralist-honors-african-americans-killed-by-police">␊
</head>␊
<body>␊
<a class="ActionLink" data-social-service="print" href="javascript:window.print()"><svg>␊
<use xlink:href="#mono-icon-print"></use>␊
</svg><span>Print</span></a>␊
</body>␊
</html>`

## `rewriteUrls` don't modify non http protocols

> Snapshot 1
`<!DOCTYPE html>␊
<html lang="en">␊
<head>␊
<meta charset="UTF-8">␊
<meta name="viewport" content="width=device-width, initial-scale=1.0">␊
<title>column-muralist-honors-african-americans-killed-by-police</title>␊
<meta property="og:site_name" content="latimes.com">␊
<link rel="canonical" href="https://www.latimes.com/opinion/story/2020-06-07/column-muralist-honors-african-americans-killed-by-police">␊
</head>␊
<body>␊
<a href="mailto:jen@oreilly.com"></a>␊
<a href="ftp://user:password@server/pathname"></a>␊
<a href="file://server/path"></a>␊
<a href="nntp://server:port/newsgroup/article"></a>␊
<a href="telnet://user:password@server:port/"></a>␊
<a href="gopher://docstore.mik.ua/orelly.htm"></a>␊
</body>␊
</html>`

## `rewriteUrls` don't modify data URIs

> Snapshot 1
`<!DOCTYPE html>␊
<html lang="en">␊
<head>␊
<meta charset="UTF-8">␊
<meta name="viewport" content="width=device-width, initial-scale=1.0">␊
<title>example.com</title>␊
<meta property="og:site_name" content="example.com">␊
<link rel="canonical" href="https://example.com">␊
</head>␊
<body>␊
<img src="data:image/gif;base64,R0lGODlhEAAQAMQAAORHHOVSKudfOulrSOp3WOyDZu6QdvCchPGolfO0o/XBs/fNwfjZ0frl3/zy7////wAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACH5BAkAABAALAAAAAAQABAAAAVVICSOZGlCQAosJ6mu7fiyZeKqNKToQGDsM8hBADgUXoGAiqhSvp5QAnQKGIgUhwFUYLCVDFCrKUE1lBavAViFIDlTImbKC5Gm2hB0SlBCBMQiB0UjIQA7" alt="star" width="16" height="16">␊
</body>␊
</html>`

## `rewriteUrls` don't modify udnefined attributes

> Snapshot 1
`<!DOCTYPE html>␊
<html lang="en">␊
<head>␊
<title>Document</title>␊
<meta property="og:site_name" content="moovility.me">␊
<link rel="canonical" href="https://moovility.me">␊
<meta charset="UTF-8">␊
</head>␊
<body>␊
<script>␊
console.log('greetings')␊
</script>␊
</body>␊
</html>`
Binary file added test/html/snapshots/index.js.snap
Binary file not shown.
Binary file removed test/snapshots/html.js.snap
Binary file not shown.

0 comments on commit 6973d25

Please sign in to comment.