scraper.rb
require 'scraperwiki'
require 'mechanize'
require 'rest-client'
require 'date'

# TODO: Use https.
# There's a problem with their SSL cert, which prevents the Wayback Machine
# from archiving the site and requires not verifying SSL on our end.
# So for now, get the http version.
BASE_URL = 'http://www.nswmining.com.au'
ORG_NAME = 'NSW Mining'
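
# Ask the Wayback Machine to archive the page and return the URL of the
# snapshot it creates (returns nil if archive.org responds with a Bad Gateway).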
def web_archive(page)
  url = "https://web.archive.org/save/#{page.uri.to_s}"
  begin
    archive_request_response = RestClient.get(url)
    "https://web.archive.org" + archive_request_response.headers[:content_location]
  rescue RestClient::BadGateway => e
    puts "archive.org ping returned error response for #{url}: " + e.to_s
  end
end
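
# Find a <meta> tag whose attribute `key` matches `value` and return its
# content attribute, if any. (Currently unused helper.)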
def find_meta_tag_content(page, key, value)
  tag = page.search(:meta).find do |t|
    t[key] === value
  end
  tag['content'] if tag
end
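
# Return the inner HTML of the article body, trying a couple of known
# page layouts. (Currently unused helper.)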
def extract_article_body(page)
  page.at('.field-name-body > div > div')&.inner_html ||
    page.at('article .content > div > div > div').inner_html
end
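
# Return the full URL of an article's thumbnail image, if it has one.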
def extract_photo(article_item)
  return unless article_item.at('.thumbnail')
  BASE_URL + article_item.at('.thumbnail img')['src']
end
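
# Build an article record from the index listing and the article page,
# then save it to SQLite unless we already have a copy of this URL.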
def save_article(article_item, page)
  photo = extract_photo(article_item)
  summary = article_item.at('.exerpt p:last-child').text
  post = page.at('.newsItemDetail')
  published = Date.parse(post.at('.date').text).to_s
  # Skip if we already have the current version of the article
  saved_article = ScraperWiki.select("* FROM data WHERE url='#{page.uri.to_s}'").last rescue nil
  # TODO: Move the article page request to after this check,
  # to save unnecessary requests for pages we already have.
  # We could then skip a whole index page without any requests.
  if saved_article
    puts "Skipping #{page.uri.to_s}, already saved"
  else
    puts "Saving: #{page.uri.to_s}, #{published}"
    article = {
      'name' => post.at('h1').text,
      'url' => page.uri.to_s,
      'scraped_at' => Time.now.utc.to_s,
      'published' => published,
      'published_raw' => post.at('.date').text,
      'author' => ORG_NAME,
      'summary' => summary,
      'content' => post.inner_html,
      'syndication' => web_archive(page),
      'org' => ORG_NAME
    }
    article['photo'] = photo if photo
    ScraperWiki.save_sqlite(['url'], article)
  end
end
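
# Save every article on the index page, then follow the '>' pagination
# link and recurse until there are no more pages.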
def save_articles_and_click_next_while_articles(agent, index_page)
  web_archive(index_page)
  puts "Collecting articles on #{index_page.uri.to_s}"
  articles = index_page.search('.posts .article')
  if articles.any?
    articles.each do |article_item|
      sleep 1
      save_article(
        article_item,
        agent.get(BASE_URL + article_item.at(:a)['href'])
      )
    end
  end
  next_page_link = index_page.links.select do |link|
    link.text.eql? '>'
  end.pop
  if next_page_link
    puts "Clicking for the next page"
    save_articles_and_click_next_while_articles(
      agent,
      next_page_link.click
    )
  else
    puts "That's the last page my friends, no more articles to collect."
  end
end
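
# Kick off the scrape from the first page of the news index.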
agent = Mechanize.new
initial_index_page = agent.get(BASE_URL + "/menu/media/news?page=1")
save_articles_and_click_next_while_articles(
  agent,
  initial_index_page
)