-
Notifications
You must be signed in to change notification settings - Fork 58
/
Copy pathindex.ts
197 lines (164 loc) · 6.21 KB
/
index.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
import { Table } from '@cliffy/table'
import { tty } from '@cliffy/ansi/tty'
import { colors } from '@cliffy/ansi/colors'
import './index.d.ts'
import {
url2title,
getPageRecord,
writeRecord,
removeRecord,
} from './analyser/toolkit.ts'
import {
requestMetricsRun,
checkStatus,
retrieveMetrics,
} from './analyser/metrics.ts'
// Verbose logging is opt-in: it is active whenever the DEBUG environment variable is set.
const debug = Deno.env.get('DEBUG') !== undefined
if (!debug) console.debug = () => {} // suppress debug messages

// Wraps a value in one space on each side; shared by all colour helpers below.
const pad = (output: string | number) => ` ${output} `

// Colour helpers. The plain variants colour the text itself, the `Hd` variants
// render bold black text on a coloured background.
const white = (output: string | number) => colors.white(pad(output))
const whiteHd = (output: string | number) => colors.bgWhite.bold.black(pad(output))
const red = (output: string | number) => colors.red(pad(output))
const redHd = (output: string | number) => colors.bgRed.bold.black(pad(output))
const yellow = (output: string | number) => colors.yellow(pad(output))
const yellowHd = (output: string | number) => colors.bgYellow.bold.black(pad(output))
const blue = (output: string | number) => colors.blue(pad(output))
const blueHd = (output: string | number) => colors.bgBlue.bold.black(pad(output))

// First CLI argument: file listing the pages; second: where results are written.
const INPUT_FILE = Deno.args[0] ?? './pages.txt'
const OUTPUT_PATH = Deno.args[1] ?? './content'
// Records older than one week (in milliseconds) are rechecked.
const RECHECK_THRESHOLD = 7 * 24 * 60 * 60 * 1000
// Pages weighing more than 256KB are rejected.
const REJECT_THRESHOLD = 256 * 1024
// Maximum number of YLT jobs per batch.
const PARALLEL_JOBS = 3

// Timestamp of this run, used to judge the age of existing records.
const now = Date.now()
// Queue of all pages still to be processed.
const pages = await getPageList()

// Counters and result lists shown on the status screen and in the final report.
const statistics: {
  total: number,
  checked: number,
  updated: { url: string, weight: number }[],
  rejected: { url: string, weight: number }[],
  errors: string[],
} = {
  total: pages.length,
  checked: 0,
  updated: [],
  rejected: [],
  errors: [],
}
/**
 * Reads the input file and returns the URLs it lists, in file order.
 *
 * Lines are split on LF or CRLF and trimmed, so a file saved with Windows line
 * endings (or with stray surrounding whitespace) does not yield URLs carrying a
 * trailing `\r` or spaces. Every line that does not start with `http` after
 * trimming is skipped.
 *
 * @returns The URLs found in `INPUT_FILE`.
 * @throws Rejects with the error from `Deno.readTextFile` when the file cannot be read.
 */
async function getPageList(): Promise<string[]> {
  const inputContent = await Deno.readTextFile(INPUT_FILE)
  return inputContent
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter((line) => line.startsWith('http'))
}
/**
 * Retrieves the metrics of a finished run and writes, rejects or reports the page accordingly.
 *
 * - When the metrics cannot be retrieved, an error is recorded.
 * - When the page weighs more than `REJECT_THRESHOLD`, it is added to
 *   `statistics.rejected` (weight in KB) and an existing record is removed.
 * - Otherwise a record is written; the original `date` of an existing record is
 *   kept and only `updated` is refreshed.
 *
 * @param runId Id of the metrics run whose results should be stored.
 * @param url The page the run belongs to.
 * @returns `true` only when a record was written successfully; `false` when the
 *   metrics were unavailable, the page was rejected, or writing failed.
 */
async function updateRecord(runId: string, url: string): Promise<boolean> {
  const oldRecord = await getPageRecord(url, OUTPUT_PATH)
  const metrics = await retrieveMetrics(runId)
  if (!metrics) {
    statistics.errors.push(`Failed to retrieve results for ${url} (run id: ${runId})`)
    console.debug(red("failed to retrieve results"), "for", blue(url), runId)
    return false
  }

  // Date part (YYYY-MM-DD) of the current ISO timestamp. Named `today` so it
  // does not shadow the module-level numeric `now`.
  const today = new Date().toISOString().split("T")[0]
  const weight = metrics.metrics.contentLength
  const weightKb = Math.round(weight / 1024)

  if (weight > REJECT_THRESHOLD) {
    statistics.rejected.push({ url, weight: weightKb })
    console.debug(url, red("rejected!"), "Weighs", weightKb, "kb")
    if (oldRecord) {
      console.debug("Removing record at", OUTPUT_PATH)
      // Awaited so a failure is recorded before the final statistics are printed.
      try {
        await removeRecord(url, OUTPUT_PATH)
      } catch {
        // One message per failure; pushing several arguments would add several entries.
        statistics.errors.push(`Failed to remove old record of rejected url ${url} from ${OUTPUT_PATH}`)
        console.debug(red("Failed to remove old record"), "of rejected url", url)
      }
    }
    return false
  }

  const { htmlSize, imageSize, videoSize } = metrics.metrics
  const contentSize = htmlSize + imageSize + videoSize
  const record: PageRecord = {
    title: url2title(url),
    // Keep the first-seen date of an existing record; new pages start today.
    date: oldRecord?.date ?? today,
    updated: today,
    weight,
    extra: {
      source: url,
      // Percentage of the weight made up by html, images and video;
      // guarded so a zero weight does not produce NaN or Infinity.
      ratio: weight > 0 ? Math.round(contentSize / weight * 100) : 0,
      size: weightKb,
    },
  }

  const success = Boolean(await writeRecord(record, url, OUTPUT_PATH))
  if (success) {
    statistics.updated.push({ url, weight })
    console.debug(blue(url), white("successfully updated!"))
  } else {
    statistics.errors.push(`Failed to write record for ${url}`)
    console.debug(blue(url), red("record could not be written!"))
  }
  return success
}
/**
 * Decides whether a page needs (re-)analysis and, if so, requests a metrics run for it.
 *
 * A page needs a check when it has no record, when the record's `updated` value
 * is missing or cannot be parsed as a date, or when that date is more than
 * `RECHECK_THRESHOLD` milliseconds before the start of this run.
 *
 * @param url The page to check.
 * @returns `true` when the existing record is recent enough (counted in
 *   `statistics.checked`), `false` when requesting the run failed (recorded in
 *   `statistics.errors`), otherwise the id of the newly requested run.
 */
async function checkPage(url: string) {
  const record = await getPageRecord(url, OUTPUT_PATH)
  const lastUpdated = Date.parse(record?.updated || "")
  // Date.parse yields NaN for a missing or malformed date, and every comparison
  // with NaN is false. Without the explicit check such a record would be
  // treated as up-to-date forever.
  const needsCheck = !record || Number.isNaN(lastUpdated) || now - lastUpdated > RECHECK_THRESHOLD
  if (!needsCheck) {
    statistics.checked++
    console.debug(blue(url), white("is up-to-date"))
    return true
  }

  const runId = await requestMetricsRun(url)
  if (!runId) {
    statistics.errors.push(`Failed to run metric for ${url}`)
    console.debug(blue(url), red("getting metrics failed!"))
    return false
  }

  console.debug(blue(url), white("new or outdated,"), "runId is", runId)
  return runId
}
/**
 * Pauses an async flow for the given time.
 *
 * @param duration Delay in milliseconds, handed to `setTimeout`.
 * @returns A promise that resolves without a value once the timer fires.
 */
function sleep(duration: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, duration)
  })
}
/**
 * Redraws the progress table: a heading row and a row with the current
 * counters taken from `statistics`.
 */
function updateStatusScreen() {
  const headings = [
    whiteHd('total'),
    whiteHd('checked'),
    blueHd('added/updated'),
    yellowHd('rejected'),
    redHd('errors'),
  ]
  const counters = [
    white(statistics.total),
    white(statistics.checked),
    blue(statistics.updated.length),
    yellow(statistics.rejected.length),
    red(statistics.errors.length),
  ]
  const statusTable = new Table(headings, counters)

  // Move up and clear a line, twice, before printing. Presumably this wipes
  // the two rows of the previously printed table (TODO confirm).
  for (let line = 0; line < 2; line++) {
    tty.cursorLeft.cursorUp.eraseLine()
  }
  console.log(statusTable.toString())
}
/**
 * Prints the final report: first a table with one row per rejected page
 * (url and weight), then a table with one row per recorded error.
 */
function showStatistics() {
  const rejectedRows = statistics.rejected.map(({ url, weight }) => [
    yellowHd('Rejected'),
    url,
    `${red(weight)}kb`,
  ])
  const errorRows = statistics.errors.map((message) => [redHd('Error'), message])

  const rejectedTable = new Table(...rejectedRows)
  console.log(rejectedTable.toString())
  const errorTable = new Table(...errorRows)
  console.log(errorTable.toString())
}
/**
 * Works through the `pages` queue in batches of up to `PARALLEL_JOBS` pages
 * and prints the final statistics once the queue is empty.
 *
 * For each batch, `checkPage` is started for every page at once. Each
 * resulting run is then polled via `checkStatus` until it is "complete" (the
 * record is updated) or "failed" (an error is recorded). Runs that are not
 * finished are put back at the end of the batch's job list and polling resumes
 * after a one second pause.
 *
 * Batches are processed in a loop rather than through an un-awaited recursive
 * call, so the returned promise settles only after the whole queue is done and
 * an error in any batch rejects it instead of becoming an unhandled rejection.
 *
 * @returns A promise that resolves after all pages were handled and the
 *   statistics were printed.
 */
async function handleBatch(): Promise<void> {
  while (pages.length) {
    if (!debug) updateStatusScreen()

    const batch = pages.splice(0, PARALLEL_JOBS)
    const jobs = batch.map((url) => checkPage(url))

    while (jobs.length) {
      // take the first job and check it;
      // if the run is not finished yet, it is added back to the end of the list
      const job = jobs.shift()
      const runId = await job

      // page is up-to-date or YLT has an error
      if (!job || runId === undefined || runId === true || runId === false) continue

      // TODO: handle failures more gracefully
      const { url, status } = await checkStatus(runId)
      if (status === "failed") {
        statistics.errors.push(`YLT analysis failed for ${url} (run id: ${runId})`)
        console.debug(blue(url), red("YLT analysis failed"))
      } else if (status === "complete") {
        console.debug(blue(url), blue("updating record..."))
        await updateRecord(runId, url)
      } else {
        console.debug(blue(url), white("job incomplete, pushing back"))
        // not done yet, add it back
        jobs.push(job)
        // wait a bit before checking again
        await sleep(1000)
      }
    }
  }

  // queue is empty: show the final counters and the report
  if (!debug) updateStatusScreen()
  showStatistics()
}
console.log('Starting...')
// Await the run instead of leaving its promise floating; this module already
// relies on top-level await when loading the page list.
await handleBatch()