forked from serpapi/code-challenge
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.rb
75 lines (67 loc) · 2.3 KB
/
scraper.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
require 'nokogiri'
require 'json'
# Pre-steps --------------------------------------------------
# Builds a lookup table of inline images: image id (key) => image data (value).
#
# Every <script> element of the document is inspected. A script contributes an
# entry only when its text contains BOTH a single-id declaration of the form
#   var ii=['<id>'];
# and an image declaration of the form
#   var s='<data>';
# Scripts missing either declaration are ignored. When the same id occurs in
# several scripts, the last one wins.
#
# doc     - parsed document; must respond to css('script') and yield objects
#           that respond to #text.
# returns - Hash mapping id String => raw image String, exactly as it appears
#           in the script source (escape sequences are not decoded here).
#------------------------------------------------------------
def pre_process(doc)
  doc.css('script').each_with_object({}) do |script, images_by_id|
    source = script.text
    image_id = source[/var ii=\['([^']+)'\];/, 1]
    payload = source[/var s='([^']+)';/, 1]
    next unless image_id && payload

    images_by_id[image_id] = payload
  end
end
# Get Image --------------------------------------------------
# Resolves the image value for one result element.
#
# Lookup order:
#   1. If the element's <img> has an id that is present in the hash, the stored
#      inline image is returned with every literal "\x3d" replaced by "=".
#   2. Otherwise (no id, or the id is not in the hash) the <img>'s 'data-src'
#      attribute value is returned.
#   3. If the element has no <img>, or none of the above yields a value, nil
#      is returned instead of raising.
#
# ele     - element that responds to at_css('img'); the returned image node
#           must support attribute lookup via #[] (as Nokogiri nodes do).
# hash    - Hash of image id => inline image String (see pre_process).
# returns - String with the image data / URL, or nil when none is available.
#------------------------------------------------------------
def get_image(ele, hash)
  img = ele.at_css('img')
  return nil unless img

  img_id = img['id']
  inline = hash[img_id] if img_id
  # The stored value still carries "\x3d" where the padding "=" belongs;
  # restore it so the payload is usable as-is.
  return inline.gsub("\\x3d", "=") if inline

  img['data-src']
end
# Main process --------------------------------------------------
# Turns every anchor matched by 'div.iELo6 a' into one result hash.
#
# Keys of each result, in insertion order:
#   'name'       - text of the anchor's 'div.pgNMRc' descendants
#   'extensions' - one-element array holding the text of the anchor's
#                  'div.cxzHyb' descendants; the key is omitted when that
#                  text is empty
#   'link'       - "https://www.google.com" followed by the anchor's href
#   'image'      - whatever get_image returns for the anchor
#
# doc     - parsed document responding to css.
# hash    - image lookup table handed through to get_image.
# returns - Array of result hashes, in document order.
#------------------------------------------------------------
def process_elements(doc, hash)
  base_url = "https://www.google.com"

  doc.css('div.iELo6 a').map do |anchor|
    entry = { 'name' => anchor.css('div.pgNMRc').text }

    extension_text = anchor.css('div.cxzHyb').text
    entry['extensions'] = [extension_text] unless extension_text == ''

    # Plain concatenation (not interpolation) so the href attribute is
    # converted the same way String#+ converts it.
    entry['link'] = base_url + anchor.attributes['href']
    entry['image'] = get_image(anchor, hash)
    entry
  end
end
# Main function --------------------------------------------------
# Scrapes one saved HTML page and writes the extracted results as JSON.
#
# Reads   inputs/<input_name>.html
# Writes  results/<input_name>-result.json containing {"artworks": [...]}
# Prints  "<input_name> scraped!" when done.
#
# input_name - base name of the input file, without directory or extension.
#              Defaults to "van-gogh-paintings"; pass another name
#              (e.g. "picasso-paintings") to scrape a different input.
#
# Raises whatever File.read raises when the input file is missing.
#------------------------------------------------------------
def main(input_name = "van-gogh-paintings")
  input_path = "inputs/#{input_name}.html"
  output_path = "results/#{input_name}-result.json"

  doc = Nokogiri::HTML(File.read(input_path))
  results_array = process_elements(doc, pre_process(doc))

  # Writing fails with Errno::ENOENT if the output directory is absent.
  Dir.mkdir('results') unless Dir.exist?('results')
  File.write(output_path, JSON.pretty_generate({ artworks: results_array }))

  puts "#{input_name} scraped!"
end
# Run the scraper only when this file is executed directly, so that loading it
# from another script (e.g. a test) does not trigger a scrape.
main if __FILE__ == $PROGRAM_NAME