"""Retrieves PDF documents from https://www.stockplanconnect.com.
These PDF documents can be parsed by extracting the text using `pdftotext`.
This uses the `selenium` Python package in conjunction with `chromedriver` to
scrape the Stockplanconnect website.
Configuration:
==============
The following keys may be specified as part of the configuration dict:
- `credentials`: Required. Must be a `dict` with `'username'` and `'password'`
keys.
- `output_directory`: Required. Must be a `str` that specifies the path on the
local filesystem where the documents will be saved. If the directory does not
exist, it will be created.
- `profile_dir`: Optional. If specified, must be a `str` that specifies the
path to a persistent Chrome browser profile to use. This should be a path
used solely for this single configuration; it should not refer to your normal
browser profile. If not specified, a fresh temporary profile will be used
each time.
- `headless`: Must be set to `False` currently, as this scraper does not work
properly when run with a headless browser.
Output format:
==============
Each document is saved to the `output_directory` with a name like:
2017-02-09.Restricted_Units.Trade_Confirmations.Confirmation.pdf
2017-08-30.Restricted_Units.Trade_Confirmations.Release_Confirmation.pdf
2017-12-31.Other.Tax_Documents.Form_1099.pdf
If there are multiple documents of the same type on the same date, a number is
appended, e.g.:
2018-05-31.Restricted_Units.Trade_Confirmations.Release_Confirmation.pdf
2018-06-28.Restricted_Units.Trade_Confirmations.Release_Confirmation.2.pdf
2018-06-28.Restricted_Units.Trade_Confirmations.Release_Confirmation.3.pdf
If for some reason this data source does not work and you wish to manually
download documents, make sure to use the same name numbering scheme: the first
document listed with a given date, document type, and name should be given no
numeric suffix, the second such document should be given a suffix of `.2`, the
third `.3`, etc.
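
For reference, the numeric suffix amounts to the following (a sketch mirroring
the logic in `get_output_path` below), where `index` counts documents that
share the same date and descriptive fields:

    suffix = '' if index == 1 else '.%d' % index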

Example:
========

    def CONFIG_stockplanconnect():
        return dict(
            module='finance_dl.stockplanconnect',
            credentials={
                'username': 'XXXXXX',
                'password': 'XXXXXX',
            },
            output_directory=os.path.join(data_dir, 'stockplanconnect'),
            headless=False,
        )

Interactive shell:
==================

From the interactive shell, type: `self.run()` to start the scraper.
"""
import urllib.parse
import re
import collections
import time
import logging
import os

import dateutil.parser

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys

from finance_dl import scrape_lib

logger = logging.getLogger('scraper')
# Matches stockplanconnect.com and morganstanley.com, including any
# subdomains. `check_url` below applies this with `re.fullmatch`, so the
# alternation is grouped and the dots escaped rather than relying on anchors.
netloc_re = r'([^.@]+\.)*(stockplanconnect|morganstanley)\.com'


def check_url(url):
    result = urllib.parse.urlparse(url)
    if result.scheme != 'https' or not re.fullmatch(netloc_re, result.netloc):
        raise RuntimeError('Reached invalid URL: %r' % url)
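
# A few illustrative cases for the check above (hypothetical URLs, shown only
# to document the intent of `netloc_re`):
#
#   check_url('https://www.stockplanconnect.com/login')   # passes
#   check_url('https://us.morganstanley.com/some/page')   # passes
#   check_url('http://www.stockplanconnect.com/')         # raises: not HTTPS
#   check_url('https://stockplanconnect.com.evil.net/')   # raises: wrong host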


class Scraper(scrape_lib.Scraper):
    def __init__(self, credentials, output_directory, **kwargs):
        super().__init__(**kwargs)
        self.credentials = credentials
        self.output_directory = output_directory

    def check_after_wait(self):
        check_url(self.driver.current_url)

    def login(self):
        logger.info('Initiating log in')
        self.driver.get('https://www.stockplanconnect.com')
        (username, password), = self.wait_and_return(
            self.find_username_and_password_in_any_frame)
        # The repeated clicks and sleeps below presumably give the login form
        # time to initialize before the credentials are typed in.
        time.sleep(2.0)
        username.click()
        time.sleep(2.0)
        logger.info('Entering username')
        username.send_keys(self.credentials['username'])
        username.click()
        time.sleep(2.0)
        logger.info('Entering password')
        password.click()
        time.sleep(1.0)
        password.send_keys(self.credentials['password'])
        time.sleep(1.0)
        with self.wait_for_page_load():
            password.send_keys(Keys.ENTER)
        logger.info('Logged in')

    def get_output_path(self, parts, index):
        journal_date_format = '%Y-%m-%d'
        date = dateutil.parser.parse(parts[0])

        def sanitize(x):
            x = x.replace(' ', '_')
            x = re.sub('[^a-zA-Z0-9-_.]', '', x)
            return x

        suffix = ''
        if index != 1:
            suffix = '.%d' % index
        return os.path.join(
            self.output_directory,
            '%s.%s.%s.%s%s.pdf' % (date.strftime(journal_date_format),
                                   sanitize(parts[1]), sanitize(parts[2]),
                                   sanitize(parts[3]), suffix))
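
    # For example (hypothetical row text; real values come from the Documents
    # page), parts like
    #     ['02/09/2017', 'Restricted Units', 'Trade Confirmations',
    #      'Confirmation', 'PDF']
    # with index == 1 map to:
    #     <output_directory>/2017-02-09.Restricted_Units.Trade_Confirmations.Confirmation.pdf
    # A second document with identical fields (index == 2) would instead end
    # in `.2.pdf`, matching the scheme described in the module docstring.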

    def get_documents(self):
        logger.info('Looking for documents link')
        documents, = self.wait_and_locate((By.PARTIAL_LINK_TEXT, 'Documents'))
        scrape_lib.retry(lambda: self.click(documents), num_tries=3,
                         retry_delay=5)
        self.download_documents()

    def download_documents(self):
        logger.info('Looking for PDF links')
        links, = self.wait_and_return(
            lambda: self.driver.find_elements(By.LINK_TEXT, 'PDF'))
        links = list(links)[::-1]
        previously_seen_parts = collections.Counter()
        for link in links:
            cur_el = link
            output_path = None
            # Walk up the DOM from the PDF link until we find an ancestor
            # whose visible text splits into exactly 5 lines; the first four
            # (date plus three descriptive fields) determine the output
            # filename.
            while True:
                try:
                    parent = cur_el.find_element(By.XPATH, '..')
                except Exception:
                    break
                if parent == cur_el:
                    break
                full_text = parent.text
                parts = full_text.split('\n')
                if len(parts) == 5:
                    try:
                        key = tuple(parts)
                        index = previously_seen_parts[key] + 1
                        previously_seen_parts[key] += 1
                        output_path = self.get_output_path(parts, index)
                        break
                    except Exception:
                        logger.info('Failed to determine output filename %r',
                                    parts)
                        break
                else:
                    cur_el = parent
            if output_path is None:
                logger.info('skipping link: no matching document row found')
                continue
            if os.path.exists(output_path):
                logger.info('skipping existing file: %r', output_path)
                continue
            self.click(link)
            logger.info('Waiting for download')
            download_result, = self.wait_and_return(self.get_downloaded_file)
            if not os.path.exists(self.output_directory):
                os.makedirs(self.output_directory)
            tmp_path = output_path + '.tmp'
            with open(tmp_path, 'wb') as f:
                download_data = download_result[1]
                f.write(download_data)
            os.rename(tmp_path, output_path)
            logger.info("Wrote %s", output_path)

    def run(self):
        self.login()
        self.get_documents()


def run(**kwargs):
    scrape_lib.run_with_scraper(Scraper, **kwargs)


def interactive(**kwargs):
    return scrape_lib.interact_with_scraper(Scraper, **kwargs)
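

# Example usage outside the normal finance_dl configuration mechanism (a
# sketch, assuming `scrape_lib.run_with_scraper` forwards these keyword
# arguments to the `Scraper` constructor; the keys are the ones documented in
# the module docstring above):
#
#     from finance_dl import stockplanconnect
#     stockplanconnect.run(
#         credentials={'username': 'XXXXXX', 'password': 'XXXXXX'},
#         output_directory='/path/to/data/stockplanconnect',
#         headless=False,
#     )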