# get_wsj_articles.py
"""
Functionality for scraping WSJ's news archive to get
headlines and summaries.
"""
# load libraries and variables
import datetime
import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
import pandas as pd
from config import base_url, headline_class, webdriver_wait_time
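# NOTE: config.py is not shown here; it is assumed to define something like
# the following (illustrative values, not the real ones):
#   base_url = 'https://www.wsj.com/news/archive/'  # archive URL prefix
#   headline_class = 'WSJTheme--headlineText--someHash'  # class of headline divs
#   webdriver_wait_time = 10  # seconds to allow the page to load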
# define functions
def get_sd_url(current_date):
    """
    Get the archive URL for a single date.

    Args:
        current_date (datetime.date): date to build the URL from

    Returns:
        URL string
    """
    current_date_formatted = current_date.strftime('%Y%m%d')
    URL = '{}{}'.format(base_url, current_date_formatted)
    return URL
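# Example: get_sd_url(datetime.date(2020, 4, 26)) yields base_url + '20200426',
# e.g. 'https://www.wsj.com/news/archive/20200426' under the assumed base_url.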
def get_wsj_articles_sd(current_date, logging=False):
    """
    Get WSJ article headlines and summaries for a single date.

    Args:
        current_date (datetime.date): date to grab articles from
        logging (bool): if True, print the page title as it loads

    Returns:
        pandas DataFrame with fields:
            published_date | headline | summary
    """
    # get URL, load page, and wait for it to finish rendering;
    # note that WebDriverWait only waits when given a condition via .until()
    URL = get_sd_url(current_date)
    driver = webdriver.Chrome()
    driver.get(URL)
    WebDriverWait(driver, webdriver_wait_time).until(
        lambda d: d.execute_script('return document.readyState') == 'complete')
    if logging:
        print(driver.title)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    main_sections = soup.find_all('main', id='main', role='main')
    # extract article headlines and summaries; we loop over main_sections
    # just in case, but the page should only contain one
    headlines = []
    summaries = []
    for main_sec in main_sections:
        # get the articles in this section
        articles = main_sec.find_all('article')
        for article in articles:
            # get headline and summary, skipping malformed articles;
            # NOTE - matching the class with a regex, e.g.
            # re.compile('.*listing-col-.*'), would be more robust in case
            # the string at the end of the class name changes
            headline = article.find('div', class_=headline_class)
            summary = article.find('p')
            if headline is None or summary is None:
                continue
            headlines.append(headline.getText())
            summaries.append(summary.getText())
    # shut down the browser, put article info into a DataFrame, and return
    driver.quit()
    article_dat = pd.DataFrame({'published_date': current_date,
                                'headline': headlines,
                                'summary': summaries})
    return article_dat
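# Example (assumes Chrome and a matching chromedriver are available):
#   get_wsj_articles_sd(datetime.date(2020, 4, 26))
# returns one DataFrame of that day's headlines and summaries.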
def get_wsj_articles(start_date, end_date, logging=False):
    """
    Get WSJ article headlines and summaries for the supplied
    date range (inclusive on both ends).

    Args:
        start_date (datetime.date): earliest date to get articles
        end_date (datetime.date): most recent date to get articles
        logging (bool): if True, print each page title as it loads

    Returns:
        pandas DataFrame with fields:
            published_date | headline | summary
    """
    # loop through dates, adding each day's data to a list of DataFrames
    wsj_articles = []
    current_date = start_date
    one_day = datetime.timedelta(days=1)
    while current_date <= end_date:
        # get this date's articles, then advance to the next day
        cd_articles = get_wsj_articles_sd(current_date, logging)
        wsj_articles.append(cd_articles)
        current_date += one_day
    # combine the per-day DataFrames into one and return
    wsj_articles = pd.concat(wsj_articles, ignore_index=True)
    return wsj_articles
# example usage
if __name__ == '__main__':
    # set variables
    start_time = time.time()
    start_date = datetime.date(2020, 4, 26)
    end_date = datetime.date(2020, 4, 28)
    # build table and print
    print(get_wsj_articles(start_date, end_date, logging=True))
    elapsed_time = round((time.time() - start_time) / 60, 2)
    print('------- Runtime: {} minutes -------'.format(elapsed_time))