-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape_mars.py
171 lines (116 loc) · 4.31 KB
/
scrape_mars.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
from splinter import Browser
from bs4 import BeautifulSoup
import pandas as pd
import datetime as dt
import time
import re
def scrape_all():
    """Run every Mars scraper and collect the results in one dictionary.

    A headless Chrome browser is started, shared by the scrapers that need
    one, and always shut down again, even if one of the scrapers raises.

    Returns:
        dict: The keys "news_title", "news_paragraph", "featured_image",
        "hemispheres", "weather", "facts" and "last_modified" (the time
        the scrape finished, from ``datetime.now()``).
    """
    # Initiate headless driver for deployment
    browser = Browser("chrome", executable_path="chromedriver", headless=True)
    try:
        news_title, news_paragraph = mars_news(browser)
        data = {
            "news_title": news_title,
            "news_paragraph": news_paragraph,
            "featured_image": featured_image(browser),
            "hemispheres": hemispheres(browser),
            "weather": twitter_weather(browser),
            "facts": mars_facts(),
            "last_modified": dt.datetime.now(),
        }
    finally:
        # Stop the webdriver no matter what, so a failed scrape does not
        # leave a browser process behind.
        browser.quit()
    return data
def mars_news(browser):
    """Scrape the first news slide from the NASA Mars news page.

    Args:
        browser: Browser object used to load the page.

    Returns:
        tuple: ``(title, teaser)`` text of the first slide, or
        ``(None, None)`` when the expected elements are not found.
    """
    browser.visit("https://mars.nasa.gov/news/")
    # Check for the slide list, waiting up to the given wait_time
    browser.is_element_present_by_css("ul.item_list li.slide", wait_time=0.5)

    soup = BeautifulSoup(browser.html, "html.parser")
    try:
        first_slide = soup.select_one("ul.item_list li.slide")
        title_div = first_slide.find("div", class_="content_title")
        headline = title_div.get_text()
        teaser_div = first_slide.find("div", class_="article_teaser_body")
        teaser = teaser_div.get_text()
    except AttributeError:
        # A lookup came back as None, so the page did not have the
        # expected structure.
        return None, None
    else:
        return headline, teaser
def featured_image(browser):
    """Return the absolute URL of the featured JPL Mars image.

    Args:
        browser: Browser object used to navigate the JPL space images site.

    Returns:
        str | None: The image URL prefixed with ``https://www.jpl.nasa.gov``,
        or None when the image element or its ``src`` attribute is missing.
    """
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)

    # Find and click the full image button
    browser.find_by_id("full_image").click()

    # Find the more info button and click that
    browser.is_element_present_by_text("more info", wait_time=0.5)
    browser.links.find_by_partial_text("more info").click()

    # Parse the resulting html with soup
    img_soup = BeautifulSoup(browser.html, "html.parser")

    # Find the relative image url
    img = img_soup.select_one("figure.lede a img")
    if img is None:
        return None

    img_url_rel = img.get("src")
    if not img_url_rel:
        # Without this guard a missing src would be formatted into the
        # URL as the literal text "None".
        return None

    # Use the base url to create an absolute url
    return f"https://www.jpl.nasa.gov{img_url_rel}"
def hemispheres(browser):
    """Collect title and image data for four Mars hemisphere results.

    Args:
        browser: Browser object used to walk the USGS search results.

    Returns:
        list: One ``scrape_hemisphere`` result per visited page, in the
        order the links appear.
    """
    # Adjacent string literals keep the long URL readable
    search_url = (
        "https://astrogeology.usgs.gov/search/"
        "results?q=hemisphere+enhanced&k1=target&v1=Mars"
    )
    browser.visit(search_url)

    collected = []
    for position in range(4):
        # Look the links up again on every pass to avoid a stale element
        # exception after navigating back.
        links = browser.find_by_css("a.product-item h3")
        links[position].click()

        # Scrape the detail page and keep its hemisphere record
        collected.append(scrape_hemisphere(browser.html))

        # Finally, navigate backwards to the result list
        browser.back()
    return collected
def twitter_weather(browser):
    """Scrape the latest Mars weather text from the marswxreport Twitter page.

    Args:
        browser: Browser object used to load the page.

    Returns:
        str | None: Text of the "Mars Weather" tweet if that markup is
        present; otherwise the text of the first ``span`` matching the
        pattern ``sol``; None when neither is found.
    """
    url = "https://twitter.com/marswxreport?lang=en"
    browser.visit(url)

    # Pause for 5 seconds to let the Twitter page load
    time.sleep(5)

    weather_soup = BeautifulSoup(browser.html, "html.parser")

    # First, find a tweet with the data-name 'Mars Weather'
    tweet_attrs = {"class": "tweet", "data-name": "Mars Weather"}
    mars_weather_tweet = weather_soup.find("div", attrs=tweet_attrs)

    # Next, search for the p tag within the tweet
    try:
        return mars_weather_tweet.find("p", "tweet-text").get_text()
    except AttributeError:
        # The tweet markup was not found; fall through to the span search.
        pass

    pattern = re.compile(r'sol')
    weather_span = weather_soup.find('span', text=pattern)
    if weather_span is None:
        # Neither layout matched: report "no data" instead of raising.
        return None
    return weather_span.text
def scrape_hemisphere(html_text):
    """Extract the title and sample-image link from a hemisphere page.

    Args:
        html_text: HTML source of a single hemisphere detail page.

    Returns:
        dict: ``{"title": ..., "img_url": ...}``. Both values are None
        when either the title heading or the "Sample" link is missing.
    """
    hemi_soup = BeautifulSoup(html_text, "html.parser")
    try:
        title_elem = hemi_soup.find("h2", class_="title").get_text()
        # Match the anchor by its visible text. The keyword must be
        # ``text``; any other keyword is treated as an HTML attribute filter.
        sample_elem = hemi_soup.find("a", text="Sample").get("href")
    except AttributeError:
        # One of the lookups returned None, so the page is not in the
        # expected shape.
        title_elem = None
        sample_elem = None
    return {
        "title": title_elem,
        "img_url": sample_elem,
    }
def mars_facts(url="http://space-facts.com/mars/"):
    """Fetch the Mars facts table and render it as an HTML snippet.

    Args:
        url: Location passed to ``pandas.read_html``. Defaults to the Mars
            page on space-facts.com.

    Returns:
        str | None: The first table found at ``url`` as HTML, with columns
        named "description" and "value", indexed by "description" and
        carrying the CSS classes ``table table-striped``. None when the
        table cannot be read.
    """
    try:
        df = pd.read_html(url)[0]
    except Exception:
        # Best effort: any read or parse failure means "no facts".
        # Catching Exception rather than BaseException lets
        # KeyboardInterrupt and SystemExit propagate.
        return None
    df.columns = ["description", "value"]
    df.set_index("description", inplace=True)
    return df.to_html(classes="table table-striped")
if __name__ == "__main__":
    # When executed as a script, run the full scrape and show the result.
    scraped = scrape_all()
    print(scraped)