-
Notifications
You must be signed in to change notification settings - Fork 24
/
hackerNews.js
44 lines (37 loc) · 976 Bytes
/
hackerNews.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/usr/bin/env node
// Normally, you import roboto like this:
//var roboto = require('roboto');
var roboto = require('../lib/roboto');
var html_strip = require('htmlstrip-native').html_strip;
// Options object passed as the second argument to html_strip when the
// 'body' field is extracted.
var stripOptions = {
  include_script: false,
  include_style: false,
  compact_whitespace: true
};
// Crawler configuration: a single start URL (the Hacker News front page),
// with news.ycombinator.com as the only entry in allowedDomains.
var crawlerOptions = {
  startUrls: ["https://news.ycombinator.com/"],
  allowedDomains: ["news.ycombinator.com"]
  // Note that there is a delay due to the directive 'Crawl-Delay: 30'
  // defined in their robots.txt. The option below is left disabled:
  //obeyRobotsTxt: false
};

var crawler = new roboto.Crawler(crawlerOptions);
// Field 'url': the `url` property of the response being parsed.
crawler.parseField('url', function(response) {
  var pageUrl = response.url;
  return pageUrl;
});
// Field 'title': text of whatever the 'head title' selector matches.
crawler.parseField('title', function(response, $) {
  var titleNode = $('head title');
  return titleNode.text();
});
// Field 'body': the HTML inside <body>, passed through html_strip with
// stripOptions. If $('body').html() yields a falsy value (e.g. null or an
// empty string), the parser returns undefined instead.
crawler.parseField('body', function(response, $) {
  var bodyHtml = $('body').html();
  if (!bodyHtml) {
    return;
  }
  return html_strip(bodyHtml, stripOptions);
});
// Listener for the crawler's 'item' event. It is intentionally a no-op
// placeholder in this example.
crawler.on('item', function onItem(item) {
  // Do something with the item!
});

// Start the crawl.
crawler.crawl();