// forked from lewisdonovan/google-news-scraper
// index.js — Google News scraper entry point
'use strict'
const puppeteer = require('puppeteer');
const cheerio = require('cheerio');
const getTitle = require('./getTitle').default;
const getArticleType = require('./getArticleType').default;
const getPrettyUrl = require('./getPrettyUrl').default;
const buildQueryString = require('./buildQueryString').default;
const getArticleContent = require('./getArticleContent').default;
/**
 * Scrape article metadata (and optionally full content) from Google News.
 *
 * @param {Object} [userConfig] - Options, all optional:
 *   searchTerm {string}    - query text (becomes the `q` query var)
 *   queryVars {Object}     - extra query-string variables
 *   baseUrl {string}       - defaults to https://news.google.com/search
 *   timeframe {string}     - e.g. "7d"; appended as ` when:<timeframe>`
 *   prettyURLs {boolean}   - resolve Google redirect links (default true)
 *   getArticleContent {boolean} - also fetch article bodies (default false)
 *   filterWords {string[]} - passed to getArticleContent
 *   puppeteerArgs {string[]}         - extra Chrome launch args
 *   puppeteerHeadlessMode {boolean}  - headless launch (default true)
 * @returns {Promise<Array<Object>>} Articles with title, link, image, source,
 *   datetime, time and articleType; entries without a title are dropped.
 */
const googleNewsScraper = async (userConfig = {}) => {
  // Merge user options over defaults. Everything below reads `config`;
  // the original sometimes read `userConfig` directly, which bypassed the
  // defaults (e.g. headless mode became `undefined` when not supplied).
  const config = Object.assign({
    prettyURLs: true,
    getArticleContent: false,
    puppeteerArgs: [],
    puppeteerHeadlessMode: true,
  }, userConfig);

  const queryVars = config.queryVars || {};
  if (config.searchTerm) {
    queryVars.q = config.searchTerm;
  }
  // Build the query string whenever any query vars exist — the original only
  // checked `config.queryVars`, silently dropping a bare `searchTerm`.
  const queryString = Object.keys(queryVars).length ? buildQueryString(queryVars) : '';
  const baseUrl = config.baseUrl ?? `https://news.google.com/search`;
  const timeString = config.timeframe ? ` when:${config.timeframe}` : '';
  const url = `${baseUrl}${queryString}${timeString}`;
  console.log(`📰 SCRAPING NEWS FROM: ${url}`);

  // NOTE(review): the original appended placeholder extension args
  // (`--load-extension=/path/to/manifest/folder/`) — copy-paste leftovers
  // pointing at nonexistent paths, removed here.
  const puppeteerConfig = {
    headless: config.puppeteerHeadlessMode,
    args: puppeteer.defaultArgs().concat(config.puppeteerArgs).filter(Boolean),
  };
  const browser = await puppeteer.launch(puppeteerConfig);
  const page = await browser.newPage();
  try {
    await page.setViewport({ width: 1366, height: 768 });
    await page.setUserAgent('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36');
    await page.setRequestInterception(true);
    page.on('request', (request) => {
      if (!request.isNavigationRequest()) {
        request.continue();
        return;
      }
      // Spoof browser-like headers on navigations so Google serves the full page.
      const headers = request.headers();
      headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3';
      headers['Accept-Encoding'] = 'gzip';
      headers['Accept-Language'] = 'en-US,en;q=0.9,es;q=0.8';
      headers['Upgrade-Insecure-Requests'] = "1";
      headers['Referer'] = 'https://www.google.com/';
      request.continue({ headers });
    });
    // Pre-set the consent cookie to skip the GDPR interstitial where possible.
    await page.setCookie({
      name: "CONSENT",
      value: `YES+cb.${new Date().toISOString().split('T')[0].replace(/-/g, '')}-04-p0.en-GB+FX+667`,
      domain: ".google.com"
    });
    await page.goto(url, { waitUntil: 'networkidle2' });

    // Best-effort: dismiss the cookie-consent banner if it still appears.
    // The original ignored the `page.$` result and clicked unconditionally.
    try {
      const rejectButton = await page.$(`[aria-label="Reject all"]`);
      if (rejectButton) {
        await Promise.all([
          page.click(`[aria-label="Reject all"]`),
          page.waitForNavigation({ waitUntil: 'networkidle2' }),
        ]);
      }
    } catch (err) {
      // Consent banner absent or already dismissed — safe to continue.
    }

    const content = await page.content();
    const $ = cheerio.load(content);
    const articles = $('article');
    let results = [];
    const urlChecklist = [];
    $(articles).each(function () {
      // Guard: an <article> without a "./article…" anchor yields `link: false`
      // instead of throwing (the original called .replace on undefined).
      const href = $(this).find('a[href^="./article"]').attr('href');
      const link = href ? href.replace('./', 'https://news.google.com/') : false;
      if (link) urlChecklist.push(link);
      // Prefer the second-to-last srcset candidate; fall back to plain src.
      const srcset = $(this).find('figure').find('img').attr('srcset')?.split(' ');
      const image = srcset && srcset.length
        ? srcset[srcset.length - 2]
        : $(this).find('figure').find('img').attr('src');
      const articleType = getArticleType($, this);
      const title = getTitle($, this, articleType);
      const datetimeAttr = $(this).find('div:last-child time').attr('datetime');
      results.push({
        title,
        "link": link,
        "image": image?.startsWith("/") ? `https://news.google.com${image}` : image,
        "source": $(this).find('div[data-n-tid]').text() || false,
        // `new Date(undefined)` is a truthy Invalid Date, so the original
        // `|| false` fallback never fired; check the attribute explicitly.
        "datetime": datetimeAttr ? new Date(datetimeAttr) : false,
        "time": $(this).find('div:last-child time').text() || false,
        articleType
      });
    });

    if (config.prettyURLs) {
      // Resolve Google redirect URLs to the articles' canonical URLs.
      results = await Promise.all(results.map(article => {
        const prettyUrl = getPrettyUrl(article.link);
        article.link = prettyUrl;
        return article;
      }));
    }
    if (config.getArticleContent) {
      const filterWords = config.filterWords || [];
      results = await getArticleContent(results, browser, filterWords);
    }
    return results.filter(result => result.title);
  } finally {
    // Always release Chromium, even when scraping throws — the original
    // leaked the browser process on any error after launch.
    await page.close();
    await browser.close();
  }
};
module.exports = googleNewsScraper;