-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: crawler.js
100 lines (81 loc) · 3.47 KB
/
crawler.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
const fs = require('fs');
const sharp = require('sharp');
const pageHrefs = require('./get_page_anchors');
const config = require('./config');
// Optional link filters; only loaded when enabled in config.
// (Was `var` inside the if-block, relying on hoisting to reach module scope,
// followed by a stray `;` — declare at module scope explicitly instead.)
let filters;
if (config.filters === true) {
  filters = require('./filters');
}
const DEPTH = config.depth;
const URL = config.host + config.path; // Entry-point URL the crawl starts from.
const crawledPages = new Map(); // url -> crawled page record, cached for the whole run.
const maxDepth = DEPTH; // Subpage depth to crawl site.
const SCREENSHOTS = process.argv.includes('--screenshots'); // Capture a thumbnail per page when set.
module.exports = {
/**
* Crawls a URL by visiting an url, then recursively visiting any child subpages.
* @param {!Browser} browser
* @param {{url: string, title: string, img?: string, children: !Array<!Object>}} page Current page.
* @param {number=} depth Current subtree depth of crawl.
*/
async crawl(browser, page, depth = 0) {
if (depth > maxDepth) {
return;
}
if (crawledPages.has(page.url)) {
console.log(`Reusing route: ${page.url}`);
const item = crawledPages.get(page.url);
page.title = item.title;
page.img = item.img;
page.children = item.children;
// Fill in the children with details (if they already exist).
page.children.forEach(c => {
const item = crawledPages.get(c.url);
c.title = item ? item.title : '';
c.img = item ? item.img : null;
});
return;
} else {
console.log(`Loading: ${page.url}`);
const newPage = await browser.newPage();
await newPage.goto(page.url, {
waitUntil: 'networkidle2'
});
let anchors = await newPage.evaluate(pageHrefs.collectAllSameOriginAnchorsDeep);
/** optional filters to remove unwanted links from final visualisation **/
if (config.filters === true) {
// general filters applied to all URL's crawled
filters.excludeAnchorsWhichContain.forEach((item) => {
anchors = anchors
.filter(a => a.includes(item) !== true)
})
anchors = anchors
.filter(a => a.endsWith(config.path) !== true) // filter links back to homepage i.e. "service/careers/"
.filter(a => a.includes(config.path) === true)
// subpages filters only
if (depth > 0) {
filters.excludeSubpageAnchorsEndingWith.forEach((item) => {
anchors = anchors.filter(a => a.endsWith(item) !== true);
})
}
}
page.title = await newPage.evaluate('document.title');
page.children = anchors.map(url => ({
url
}));
if (SCREENSHOTS) {
const path = `./${OUT_DIR}/${slugify(page.url)}.png`;
let imgBuff = await newPage.screenshot({
fullPage: false
});
imgBuff = await sharp(imgBuff).resize(null, 150).toBuffer(); // resize image to 150 x auto.
util.promisify(fs.writeFile)(path, imgBuff); // async
page.img = `data:img/png;base64,${imgBuff.toString('base64')}`;
}
crawledPages.set(page.url, page); // cache it.
await newPage.close();
}
// Crawl subpages.
for (const childPage of page.children) {
await this.crawl(browser, childPage, depth + 1);
}
}
};