-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawl.rb
68 lines (47 loc) · 2.15 KB
/
crawl.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
require 'csv'
require 'open-uri'
require 'yaml'

require 'geokit'
require 'nokogiri'
# Mix the Geokit geocoder namespace into the top level so GoogleGeocoder can
# be referenced without its module prefix.
include Geokit::Geocoders

# Geocoder settings are read from maps.yml (resolved against the current
# working directory); the file must provide an 'api_key' entry.
# YAML.load_file opens and closes the file itself, so no handle is left open.
MAPS_CONFIG = YAML.load_file('maps.yml')
Geokit::Geocoders::google = MAPS_CONFIG['api_key']

# Index of each field within a row returned by parse_result_node.
COMPANY_NAME = 0
EMAIL = 1
WEBSITE = 2
PHONE = 3
CITY_STATE = 4
CATEGORIES = 5
# Extracts one contact row from a single search-result container.
#
# Multi-valued fields (emails, websites, phone numbers, categories) are joined
# with "; ". Fields that are absent from the node come back as empty strings.
#
# Side effects: geocodes the address text through GoogleGeocoder, prints a
# progress line, and pauses for one second before returning.
#
# @param the_node [#css] result container node
# @return [Array<String>] [company name, emails, websites, phone numbers,
#   "City-State", categories]
def parse_result_node(the_node)
  # `css` yields a (possibly empty) node set, so a missing heading shows up as
  # `first` returning nil rather than as a nil result set.
  name_node = the_node.css('div[@class="addressinfo"]//h2').first
  company_name = name_node ? name_node.content : ""

  email_addresses = the_node.css('a[@class="email"]').map { |node| node.content }.join("; ")

  websites_addresses = the_node.css('a[@class="http"]').map do |node|
    address = node.content
    # Only add a scheme when the link text does not already carry one.
    address =~ %r{\Ahttps?://}i ? address : "http://#{address}"
  end.join("; ")

  phone_numbers = the_node.css('div[@class="phone ar12grey"]').map { |node| node.content }.join("; ")

  city_state = ""
  address_node = the_node.css('div[@class="address"]').first
  if address_node
    geo = GoogleGeocoder.geocode(address_node.content)
    # Geokit signals a failed lookup through `success` being false on the
    # returned object, so a nil check alone would let "-" through.
    city_state = "#{geo.city || geo.district}-#{geo.state}" if geo && geo.success
  end

  categories = the_node.css('h6').css('a').map { |node| node.content }.join("; ")

  puts "Finished parsing #{company_name}..."
  # Pause between records (presumably to throttle geocoder requests).
  sleep(1)

  [company_name, email_addresses, websites_addresses, phone_numbers, city_state, categories]
end
# Crawl every search-results page, convert each result container into a row
# via parse_result_node, then write all rows to contacts.csv in one go.
results = []

# Containers matching either of these selectors are parsed as results.
css_classes = ['div[@class="searchbg_blue"]', 'div[@class="searchbg_white"]']

(1..115).each do |page_number|
  puts "Page #{page_number}"
  url_to_parse = "http://www.photosourcedirectory.com/search_results.php?txtSearch=production&optCountry=45&rowsPerPage=20&optSortBy=relevancy&page=#{page_number}"

  # Kernel#open no longer fetches URLs on Ruby 3.0+; opening through the parsed
  # URI works across Ruby versions, and the block form closes the response IO
  # as soon as the document has been parsed.
  doc = URI.parse(url_to_parse).open { |io| Nokogiri::HTML(io) }

  css_classes.each do |css_class|
    doc.css(css_class).each { |container| results << parse_result_node(container) }
  end
end

CSV.open("contacts.csv", "wb") do |csv|
  csv << ["Company Name", "Emails", "Websites", "Phone Numbers", "City/State", "Categories"]
  # Emit the columns in header order using the named row indices.
  results.each do |row|
    csv << row.values_at(COMPANY_NAME, EMAIL, WEBSITE, PHONE, CITY_STATE, CATEGORIES)
  end
end

puts "Parsing complete."