-
Notifications
You must be signed in to change notification settings - Fork 5
/
Scrapy.js
102 lines (86 loc) · 2.91 KB
/
Scrapy.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
const puppeteer = require(`puppeteer`)
const ora = require(`ora`)
const chalk = require(`chalk`)
const fs = require(`fs`)
class Scrapy {
  /**
   * Scrapes image thumbnail URLs from an Instagram profile page using a
   * headless Puppeteer browser, then writes them to a JSON file.
   *
   * @param {string} [path=`instagram`] - Profile path appended to the host.
   * @param {string} [host=`https://instagram.com/`] - Base URL; expected to end with `/`.
   */
  constructor(path = `instagram`, host = `https://instagram.com/`) {
    this.path = path
    this.host = host
    this.spinner = ora().start()
  }

  /** @returns {string} Full URL of the page to scrape (host + path). */
  get url() {
    return `${this.host}${this.path}`
  }

  /**
   * Launches the browser, opens the target URL and starts scraping if the
   * page is valid (no `.dialog-404` element is present).
   * Exits the process with a non-zero code when the URL is broken.
   */
  async start() {
    this.spinner.text = chalk.yellow(`Scraping url: ${this.url}`)
    this.browser = await puppeteer.launch()
    this.page = await this.browser.newPage()
    // Force an English locale so the scraped page markup is stable.
    await this.page.setExtraHTTPHeaders({
      'Accept-Language': 'en-US'
    })
    await this.page.goto(this.url, {
      waitUntil: `networkidle0`
    })
    if (await this.page.$(`.dialog-404`)) {
      this.spinner.fail(`The url you followed may be broken`);
      // Exit with a failure code so callers/CI can detect the error
      // (the original exited with the default code 0).
      process.exit(1)
    }
    this.spinner.succeed(chalk.green(`Valid page found`))
    this.spinner.start()
    // Bug fix: await the scrape — the original fired this as a floating
    // promise, so start() resolved early and rejections were unhandled.
    await this.evaluate()
  }

  /**
   * Runs the scroll-and-collect loop, reports the result, writes the JSON
   * output and tears the browser down. Exits non-zero on parse failure.
   */
  async evaluate() {
    try {
      this.items = await this.load(100)
    } catch (error) {
      this.spinner.fail(`There was a problem parsing the page`)
      // Release browser resources before exiting with a failure code.
      await this.browser.close()
      process.exit(1)
    }
    this.spinner.succeed(chalk.green(`Scraped ${this.items.size} posts`))
    this.buildJSON()
    await this.page.close()
    await this.browser.close()
  }

  /**
   * Scrolls the page repeatedly, harvesting thumbnail `src` URLs until
   * `maxItemsSize` unique URLs are collected, or — when `maxItemsSize` is
   * null/undefined — until scrolling stops producing new content.
   *
   * @param {?number} maxItemsSize - Maximum number of URLs to collect, or null for no limit.
   * @returns {Promise<Set<string>>} The unique thumbnail URLs found.
   */
  async load(maxItemsSize) {
    this.maxItemsSize = maxItemsSize
    const page = this.page
    const media = new Set()
    let previousHeight
    let progress = `.`
    while (maxItemsSize == null || media.size < maxItemsSize) {
      try {
        previousHeight = await page.evaluate(`document.body.scrollHeight`)
        await page.evaluate(`window.scrollTo(0, document.body.scrollHeight)`)
        await page.waitForFunction(`document.body.scrollHeight > ${previousHeight}`)
        // Give lazy-loaded images a moment to arrive.
        // NOTE(review): page.waitFor is deprecated in newer Puppeteer
        // releases (waitForTimeout) — kept for the version this project uses.
        await page.waitFor(1000)
        this.spinner.text = chalk.yellow(`Scrolling${progress}`)
        // NOTE(review): `.KL4Bh` is an Instagram-generated class name and
        // may break whenever Instagram redeploys its frontend.
        const nodes = await page.evaluate(() => {
          const images = document.querySelectorAll(`a > div > div.KL4Bh > img`)
          return [].map.call(images, img => img.src)
        })
        for (const src of nodes) {
          // Bug fix: when maxItemsSize is null the original guard
          // `media.size < maxItemsSize` compared against null and was
          // always false, so unbounded mode never added anything and the
          // while-loop above could never terminate normally.
          if (maxItemsSize == null || media.size < maxItemsSize) {
            media.add(src)
          }
        }
        progress = progress + `.`
      }
      catch (error) {
        // waitForFunction timed out — no new content appeared; stop here
        // and return whatever was collected so far.
        console.error(error)
        break
      }
    }
    return media
  }

  /**
   * Serializes the scraped URLs to `../nodes.json`, mirroring Instagram's
   * node shape: `thumbnail_src` plus an (empty) `accessibility_caption`.
   */
  buildJSON() {
    const nodes = [...this.items].map(url => ({
      "thumbnail_src": url,
      "accessibility_caption": ""
    }))
    fs.writeFileSync('../nodes.json', JSON.stringify(nodes));
  }
}
// Expose the scraper class as the module's sole (CommonJS) export.
module.exports = Scrapy