-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.js
150 lines (138 loc) · 4.53 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
'use strict'
const { URL } = require('url')
const isUrl = require('is-url-superb')
const scrape = require('html-metadata')
const jp = require('jsonpath')
const arrayToSentence = require('./lib/list')
const fromTwitter = require('./lib/twitter')
// Upper-case the first character of `s` and leave the rest untouched.
// `charAt(0)` yields '' for an empty string (where `s[0]` would be undefined),
// so empty input comes back unchanged instead of throwing.
const upperfirst = s => s.charAt(0).toUpperCase() + s.slice(1)
// Lower-case `s`, then capitalise every space-separated word. Empty "words"
// produced by leading, trailing or doubled spaces pass through as ''.
const toStartCase = s => s.toLowerCase().split(' ').map(upperfirst).join(' ')
// Replace every run of whitespace (spaces, tabs, newlines) with one space.
const collapseWhitespace = s => s.replace(/\s+/g, ' ')
/**
 * Strip any run of punctuation or whitespace from both ends of a string.
 * Characters in the middle of the string are left alone.
 *
 * @param {string} string text to trim
 * @returns {string} the text without leading/trailing punctuation
 */
function trimPunctuation (string) {
  // body of the character class shared by both anchored patterns
  const punctuation = '\\s-–—….,;:?!¡•/\\(){}<>*%@$¢€$#‹›«»‘’“”"\'[\\]'
  const atStart = new RegExp(`^[${punctuation}]+`)
  const atEnd = new RegExp(`[${punctuation}]+$`)
  // leading run first, then whatever trailing run remains
  return string.replace(atStart, '').replace(atEnd, '')
}
/**
 * Cut a string at the first piece of "separator" punctuation and drop
 * everything from there to the end.
 *
 * Two kinds of separator are recognised:
 *  - `- . – , — ' ’` only when preceded by whitespace, so hyphenated names,
 *    initials and apostrophes inside a word survive;
 *  - any of the remaining punctuation characters wherever they appear.
 *
 * Whitespace directly in front of the separator is removed together with it,
 * so the result never ends in a dangling space. A separator that is the very
 * last character is left in place because the pattern requires at least one
 * character after it.
 *
 * @param {string} string text to truncate
 * @returns {string} the text preceding the first separator
 */
function removeTextFollowingPunctuation (string) {
  const punctuationWithLeadingSpace = '-.–,—\'’'
  const punctuation = '…;:?!¡•/\\(){}<>*%@$¢€$#‹›«»‘“”"[\\]'
  // `\s*` on the second alternative swallows the space before e.g. "(" so
  // "John Smith (Editor)" becomes "John Smith" rather than "John Smith ".
  const followingPunctuation = new RegExp(
    '(\\s+[' + punctuationWithLeadingSpace + ']|\\s*[' + punctuation + ']+).+$'
  )
  return string.replace(followingPunctuation, '')
}
/**
 * Remove a leading "by", "von" or "par" (any letter case) together with the
 * whitespace that follows it. The word is only removed when whitespace comes
 * right after it, so names that merely start with those letters are kept.
 *
 * @param {string} string text such as "By Jane Doe"
 * @returns {string} the text without the leading preposition
 */
function stripLeadingPreposition (string) {
  return string.replace(/^(by|von|par)\s+/i, '')
}
/**
 * Normalise a raw author string by running it through the clean-up helpers
 * in a fixed order, then convert it to start case if it contains no
 * lower-case letters.
 *
 * @param {string} string raw author text
 * @returns {string} the cleaned author text
 */
function tidy (string) {
  // order matters: each step works on the output of the previous one
  const steps = [
    collapseWhitespace, // reduce any whitespace to single spaces
    text => text.trim(), // strip leading or trailing whitespace
    trimPunctuation, // strip leading or trailing punctuation
    stripLeadingPreposition, // remove common prepositions from start of string
    removeTextFollowingPunctuation // drop anything after separator punctuation
  ]
  const cleaned = steps.reduce((text, step) => step(text), string)
  // a string equal to its own upper-casing is re-cased as "Start Case"
  return cleaned === cleaned.toUpperCase() ? toStartCase(cleaned) : cleaned
}
/**
 * Read the author from a general-metadata object.
 *
 * @param {?Object} meta object that may carry an `author` string; a missing
 *   (null/undefined) object is tolerated
 * @returns {?string} the tidied author, or null when `meta` is absent or its
 *   `author` is not a non-empty string
 */
function fromAuthor (meta) {
  // Guard the object itself as well as the property: the caller passes
  // `data.general` straight through, and `tidy` only works on strings.
  if (!meta || typeof meta.author !== 'string' || meta.author.length === 0) {
    return null
  }
  return tidy(meta.author)
}
/**
 * Collect raw author strings from a structured-data tree.
 *
 * Nodes matched by the article-body filter are preferred as the search
 * scope; when nothing matches, the whole of `data` is searched. Every
 * `author` value found in that scope contributes:
 *  - the value itself when it is a string;
 *  - otherwise every string found under any nested `name` key;
 *  - otherwise, for an array without names, each string member.
 *
 * Only strings are ever returned, so callers can hand the entries to string
 * helpers without further type checks.
 *
 * @param {Object} data structured data to search
 * @returns {string[]} raw, un-tidied author candidates (may hold duplicates)
 */
function queryStructuredData (data) {
  const results = []
  const pushIfString = value => {
    if (typeof value === 'string') results.push(value)
  }

  // NOTE(review): if the filter follows JavaScript precedence it reads as
  // (author && schema.org/articleBody) || articleBody. The parentheses around
  // a single operand suggest author && (… || articleBody) was intended.
  // Left unchanged here; confirm against real pages before altering it.
  const authorWithArticleBody = jp.query(data, '$..*[?(@["author"] && (@["https://schema.org/articleBody"]) || @["articleBody"])]')
  const dataSource = authorWithArticleBody.length > 0 ? authorWithArticleBody : data

  for (const author of jp.query(dataSource, '$..author')) {
    if (typeof author === 'string') {
      results.push(author)
      continue
    }
    // null, numbers and booleans hold no name; skip them instead of passing a
    // non-object to jp.query (which presumably expects an object — see the
    // jsonpath documentation).
    if (author === null || typeof author !== 'object') continue

    const names = jp.query(author, '$..name')
    if (names.length > 0) {
      for (const name of names) {
        // a name may be a single string or a list of strings
        if (Array.isArray(name)) name.forEach(pushIfString)
        else pushIfString(name)
      }
      continue
    }

    // no names anywhere below: accept a plain list of author strings
    if (Array.isArray(author)) author.forEach(pushIfString)
  }
  return results
}
/**
 * Tell whether a string looks like a file-style slug: one or more word
 * characters, digits or hyphens, a single dot, then zero or more word
 * characters and nothing else (for example "jane-doe.html").
 *
 * @param {string} string candidate text
 * @returns {boolean} true when the whole string has that shape
 */
function isSlug (string) {
  const slugPattern = /^[\d\w-]+\.[\w]*$/
  return slugPattern.test(string)
}
/**
 * Decide whether a candidate is usable as an author name: it must be
 * non-empty, must not be a URL and must not look like a slug. The checks run
 * in that order and stop at the first failure.
 *
 * @param {string} res candidate author text
 * @returns {boolean} true when the candidate passes all three checks
 */
function isValidResult (res) {
  return res.length > 0 && !isUrl(res) && !isSlug(res)
}
/**
 * Turn structured data into a single author string.
 *
 * Candidates from `queryStructuredData` are validated, tidied, validated
 * again (tidying can leave nothing usable) and de-duplicated while keeping
 * their first-seen order.
 *
 * @param {Object} data structured data to search
 * @returns {?string} the only author, several authors joined by
 *   `arrayToSentence`, or null when none qualify
 */
function fromStructuredData (data) {
  const authors = []
  for (const candidate of queryStructuredData(data)) {
    if (!isValidResult(candidate)) continue
    const name = tidy(candidate)
    if (!isValidResult(name) || authors.includes(name)) continue
    authors.push(name)
  }

  if (authors.length === 0) return null
  return authors.length === 1 ? authors[0] : arrayToSentence(authors)
}
module.exports = async function (url) {
// that’s not even a URL (╯°□°)╯︵ ┻━┻
if (!isUrl(url)) return null
// fetch metadata
let data
try {
data = await scrape({
url: url,
headers: { 'User-Agent': 'whorl' }
})
} catch (e) {
return null
}
let author = null
// try to extract author from JSON-LD
if (data.hasOwnProperty('jsonLd')) {
author = fromStructuredData(data.jsonLd)
if (author) return author
}
// try to extract author from schemaOrg
if (data.hasOwnProperty('schemaOrg')) {
author = fromStructuredData(data.schemaOrg)
if (author) return author
}
// try <meta name="author" … >
author = fromAuthor(data.general)
if (author && isValidResult(author)) return author
// handle Twitter
// create parsed URL from metadata URL
let urlObject = new URL(data.general.canonical || url)
author = fromTwitter(urlObject)
if (author) return author
// all is lost
return null
}