-
Notifications
You must be signed in to change notification settings - Fork 0
/
clubhouse_scraper.js
131 lines (104 loc) · 4.4 KB
/
clubhouse_scraper.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
const puppeteer = require('puppeteer');
const fs = require("fs");
/**
 * Script entry point.
 *
 * Opens the campusvibe groups page in a visible (non-headless) browser,
 * switches to the list view, and records every JSON response whose URL
 * contains `group?sname` while the list is scrolled. Scrolling continues until
 * the number of rows in `#itemsListTable` reaches the total shown in
 * `span.events-totalNumber`. The collected records are then written to
 * `./clubs.json` as pretty-printed JSON.
 *
 * Any failure is logged and reported through a non-zero exit code; the browser
 * is always closed.
 */
(async () => {
  const GROUPS_URL = 'https://www.campusvibe.ca/campusvibe/groups/cea260f5-8aab-4e11-bccf-fe4a846e62dc#grouptype=4ae737753cd77c806877fb9a848e144e64eb6275';
  const IMAGE_BASE_URL = 'https://www.campusvibe.ca/Skeddy/rest/gem/v1/image/';
  const OUTPUT_PATH = './clubs.json';
  // Give up after this many consecutive scrolls that add no new table rows,
  // so a wrong/stale total cannot keep the loop spinning forever.
  const MAX_STALLED_SCROLLS = 20;

  // Version-independent pause; `page.waitFor` is deprecated and no longer
  // available in newer Puppeteer releases.
  const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  // Map one raw entry of the response's `listT` array to the exported record.
  const toClub = (info) => ({
    name: info.name,
    nickname: info.nickName,
    description: info.description,
    category: info.groupCategory,
    campus: info.campusName,
    faculty: info.programType,
    targetMemberCategory: info.whoShouldJoin,
    memberCount: info.memberCount,
    nationalAffiliationName: info.nationalAffiliationName,
    nationalAffiliationWebsite: info.nationalAffiliationWebsite,
    notificationEmail: info.notificationEmail,
    instagramLink: info.instagramLink,
    twitterLink: info.twitterLink,
    facebookLink: info.facebookLink,
    // A missing id (null/undefined/'') must not yield ".../image/undefined".
    imgLink: info.imageId ? `${IMAGE_BASE_URL}${info.imageId}` : '',
  });

  const browser = await puppeteer.launch({ headless: false });
  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1280, height: 720 });
    await page.goto(GROUPS_URL);
    await sleep(2000);
    await page.click('#btn-list-view');
    await sleep(2000);

    const clubsList = [];
    // Track body parsing so the file is only written once every captured
    // response has been processed.
    const pendingBodies = [];

    // NOTE(review): the listener is attached only after the initial load and
    // the list-view click, so a first batch fetched before this point would
    // not be captured here — confirm against the site's actual request flow.
    page.on('response', (res) => {
      if (!res.url().includes('group?sname')) {
        return;
      }
      const parsed = res
        .json()
        .then((body) => {
          // Iterate over what was actually returned instead of assuming
          // exactly 10 entries per response.
          const items = Array.isArray(body && body.listT) ? body.listT : [];
          for (const info of items) {
            if (info) {
              clubsList.push(toClub(info));
            }
          }
        })
        .catch((err) => {
          console.error(err);
        });
      pendingBodies.push(parsed);
    });

    const totalText = await page.evaluate(() => {
      const totalClubs = document.querySelector('span.events-totalNumber');
      return totalClubs && totalClubs.firstChild
        ? totalClubs.firstChild.textContent
        : null;
    });
    // Strip separators/whitespace so values such as "1,234" parse correctly.
    const totalClubsNum = Number.parseInt(String(totalText).replace(/\D/g, ''), 10);
    if (Number.isNaN(totalClubsNum)) {
      throw new Error(`could not read total number of clubs from page (got ${JSON.stringify(totalText)})`);
    }

    let curNum = 0;
    let stalledScrolls = 0;
    while (curNum < totalClubsNum && stalledScrolls < MAX_STALLED_SCROLLS) {
      const rowCount = await scroll(page);
      stalledScrolls = rowCount > curNum ? 0 : stalledScrolls + 1;
      curNum = rowCount;
      await sleep(500);
    }
    if (curNum < totalClubsNum) {
      console.error(`stopped scrolling at ${curNum} of ${totalClubsNum} rows: no new rows were loaded`);
    }

    // Each entry already handles its own rejection, so this never rejects.
    await Promise.all(pendingBodies);

    // Synchronous write: the data is on disk before the browser is closed,
    // and a write failure propagates to the handler below.
    fs.writeFileSync(OUTPUT_PATH, JSON.stringify(clubsList, null, 2));
  } finally {
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
/**
 * Scroll the page down by one viewport height, then back up by 10 pixels, and
 * report how many rows the clubs table currently holds.
 *
 * (The small upward nudge presumably re-triggers the page's lazy-loading
 * scroll handler — TODO confirm.)
 *
 * @param {object} page - Puppeteer page (anything exposing `evaluate(fn)`).
 * @returns {Promise<number>} Number of elements matching `#itemsListTable tr`.
 */
async function scroll(page) {
  const rowCount = await page.evaluate(() => {
    const viewportHeight = window.innerHeight;
    window.scrollBy(0, viewportHeight);
    window.scrollBy(0, -10);
    const rows = document.querySelectorAll("#itemsListTable tr");
    return rows.length;
  });
  return rowCount;
}
// // Unused alternative: scrape the (more limited) club info directly from the DOM after the page has loaded
// async function getData(page) {
// return page.evaluate(() => {
// let clubs = document.querySelectorAll('#itemsListTable tr');
// let clubList = [];
// if (clubs) {
// for (let club of clubs) {
// let imgDiv = club.cells[0].firstElementChild;
// let imgDivCss = window.getComputedStyle(imgDiv);
// let imgUrl = imgDivCss['background-image'].substring(5);
// imgUrl = imgUrl.split('")')[0];
// let itemDataDiv = imgDiv.nextElementSibling;
// let groupTitleH4 = itemDataDiv.firstElementChild;
// let name = groupTitleH4.firstElementChild.title;
// let groupTypeP = groupTitleH4.nextElementSibling;
// let potentialFaculty = groupTypeP.querySelector("a");
// let groupType = (potentialFaculty)? groupTypeP.lastElementChild.textContent : null;
// let targetP = groupTypeP.nextElementSibling;
// let target = targetP.title.substring(4);
// let shortDescription = targetP.nextElementSibling.textContent;
// let numMembers = club.cells[1].children[1].textContent;
// let clubInfo = {
// name: name,
// numMembers: numMembers,
// targetMemberType: target,
// faculty: groupType,
// truncatedDescription: shortDescription,
// imgUrl: imgUrl
// }
// clubList.push(clubInfo);
// }
// return Promise.resolve(clubList);
// } else {
// Promise.reject(new Error('no clubs list found'));
// }
// });
// }