-
Notifications
You must be signed in to change notification settings - Fork 2
/
scrape_wikipedia_airline_list.rb
147 lines (122 loc) · 3.64 KB
/
scrape_wikipedia_airline_list.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
require 'mechanize'
# Collects the href of every link found in one delimiter-separated section of
# a parsed page.
#
# The children of the page's html > body > div elements are walked in document
# order. A counter starts at 0 and is incremented each time a child whose tag
# name is listed in +delim_tags+ is met. Links are harvested only from
# non-delimiter children seen while that counter equals +index+.
#
# NOTE: change the "html"/"body"/"div" path to the tag which actually contains
# the list being scraped.
#
# @param delim_tags [Array<String>] tag names that separate sections, e.g. ["h2"]
# @param index [Integer] number of delimiters that must precede the wanted section
# @param page a Nokogiri object supporting the `/` path operator
# @return [Array<String>] copies of the href values, in document order
def extract_links(delim_tags, index, page)
  extracted_links = []
  delimiters_seen = 0
  (page / "html" / "body" / "div").children.each do |el|
    if delim_tags.include?(el.name)
      delimiters_seen += 1
      next
    end
    next unless delimiters_seen == index

    # Links are searched for beneath each child of the element.
    el.children.each do |child|
      child.css("a").each do |anchor|
        href = anchor['href']
        # Anchors without an href carry no link to follow; skipping them keeps
        # nil out of the result.
        extracted_links.push(href.clone) if href
      end
    end
  end
  extracted_links
end
# Returns the first IATA code found on the page.
#
# Scans the links inside table header cells (`th a`) for one whose href is the
# Wikipedia "IATA airport code" article and returns the text of the element
# that immediately follows that link.
#
# @param page a Nokogiri object responding to `css`
# @return [String, nil] the text following the first usable IATA link, or nil
#   when the page has no such link (or the link has no following element)
def extract_iata_code(page)
  iata_href = "/wiki/International_Air_Transport_Association_airport_code"
  page.css('th a').each do |link|
    next unless link['href'] == iata_href

    code_node = link.next_element
    return code_node.text if code_node
  end
  # Explicit nil: falling off the loop would otherwise return the node
  # collection itself, which callers would mistake for a code.
  nil
end
# Searches the links inside `div.rellink` elements for the first href ending in
# '_destinations'.
#
# @param page a Nokogiri object responding to `css`
# @return [String] a copy of the first matching href, or an empty string when
#   no link matches
def find_main_destinations(page)
  page.css('div.rellink a').each do |link|
    # to_s guards against anchors that have no href attribute at all.
    href = link['href'].to_s
    return href.dup if href.end_with?("_destinations")
  end
  ""
end
# Parses the page's 'table.toccolours.sortable' tables and returns the link
# targets found in the column whose header text equals +column_name+.
#
# The column position is the index of the first matching header among all
# matched header cells, and the table width is the total number of matched
# header cells. A data cell belongs to the column when its position within its
# row, modulo that width, equals the column position. Links whose href contains
# "redlink=1", and anchors without an href, are left out.
#
# @param page a Nokogiri object responding to `css`
# @param column_name [String] exact header text of the wanted column
# @return [Array<String>] copies of the href values, in document order; empty
#   when no header matches +column_name+
def extract_column_airlines(page, column_name)
  header_cells = page.css('table.toccolours.sortable th')
  airport_index = header_cells.find_index { |th| th.text.eql?(column_name) }
  # Without a matching header there is no column to read, and the modulo below
  # would otherwise divide by a width of zero.
  return [] if airport_index.nil?

  table_width = header_cells.length
  airport_links = []
  page.css('table.toccolours.sortable tr').each do |tr|
    tr.css('td').each_with_index do |td, position|
      next unless position % table_width == airport_index

      td.css('a').each do |a|
        href = a['href']
        next unless href
        next if href.include?("redlink=1")

        airport_links.push(href.clone)
      end
    end
  end
  airport_links
end
# Script body: fetch the Fly Air article, collect the links extracted for its
# "Destinations" section, then visit each linked page and record the IATA code
# found there.
agent = Mechanize.new
begin
  page = agent.get('http://en.wikipedia.org/wiki/Fly_Air')
rescue Mechanize::ResponseCodeError => e
  puts e.to_s
  # Nothing can be scraped without the airline page; stop here instead of
  # failing later on a nil page.
  exit 1
end
airline_parser_object = page.parser
# NOTE: stays 0 when the page has no "Destinations" heading; when several
# headings match, the last one wins.
h2_index = 0
# Search for the destinations section.
airline_parser_object.css('div#mw-content-text h2').each_with_index do |heading, position|
  h2_index = position if heading.css("span.mw-headline").text.eql?("Destinations")
end
destinations = extract_links(["h2"], h2_index, airline_parser_object)
destination_airports = []
destinations.each { |link| puts link }
destinations.each do |link|
  begin
    puts "Querying page: " + link
    puts link
    # Absolute links are fetched as-is; site-relative ones are resolved
    # against Wikipedia.
    url = link.start_with?("http://") ? link : 'http://en.wikipedia.org' + link
    page = agent.get(url)
    # Store that page's IATA code.
    destination_airports.push(extract_iata_code(page.parser))
  rescue StandardError => e
    # Mechanize::ResponseCodeError is a StandardError, so one clause covers
    # both failed requests and parsing problems; keep going with the next link.
    puts "Error fetching airline destination IATA codes: " + e.to_s
  end
end