scrape.py
"""
sec.gov crawler that downloads exhibit 10 form filings of type 10-K, 10-Q, and 8-K (i.e. material contracts)
sec.gov allows 10 requests per second https://www.sec.gov/privacy.htm#security
Modified from https://github.com/dtuggener/LEDGAR_provision_classification/blob/71c31dad8988ec0cccc73f5fb515576538cc4590/sec_crawler.py
"""
import re
import requests
import os
import time
import zipfile
OUTDIR = './sec_data'
BASE_URL = 'https://www.sec.gov/Archives/'
YEARS = range(2019, 1999, -1)  # Newest first: 2019 down to 2000
QS = ['QTR1', 'QTR2', 'QTR3', 'QTR4']
VALID_FORMS = ['10-K', '10-Q', '8-K']
SLEEP_TIME = 0.15
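# sec.gov expects automated clients to identify themselves via the User-Agent
# header (https://www.sec.gov/os/accessing-edgar-data); anonymous requests may be
# rejected with HTTP 403. The contact string below is a placeholder -- substitute
# your own name and email.
HEADERS = {'User-Agent': 'research-crawler your-name@example.com'}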
def fetch_master_files():
    """Download the quarterly master index files."""
    for year in YEARS:
        year = str(year)
        outdir_year = os.path.join(OUTDIR, year)
        if not os.path.exists(outdir_year):
            os.makedirs(outdir_year)
        for q in QS:
            outdir_year_q = os.path.join(outdir_year, q)
            if not os.path.exists(outdir_year_q):
                os.makedirs(outdir_year_q)
            outdir_year_q_master = os.path.join(outdir_year_q, 'master.zip')
            if not os.path.exists(outdir_year_q_master):
                master_url = BASE_URL + 'edgar/full-index/' + year + '/' + q + '/master.zip'
                print('Downloading', master_url)
                time.sleep(SLEEP_TIME)
                response = requests.get(master_url, headers=HEADERS)
                with open(outdir_year_q_master, 'wb') as f:
                    f.write(response.content)
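# master.idx is a pipe-delimited listing, one filing per line, preceded by a
# header block. Data lines have the form (example values are illustrative):
#   CIK|Company Name|Form Type|Date Filed|Filename
#   1000045|NICHOLAS FINANCIAL INC|10-Q|2019-02-14|edgar/data/1000045/0001193125-19-039489.txt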
def crawl_master_files():
    """Get crawlable URLs from master files and download contracts."""
    for year in YEARS:
        print(year)
        year = str(year)
        outdir_year = os.path.join(OUTDIR, year)
        for q in QS:
            print(q)
            outdir_year_q = os.path.join(outdir_year, q)
            outdir_year_q_master = os.path.join(outdir_year_q, 'master.zip')
            try:
                z = zipfile.ZipFile(outdir_year_q_master)  # Fails for non-existent Qs, e.g. 2019 Q3
            except (FileNotFoundError, zipfile.BadZipFile):
                continue
            with z.open('master.idx') as f:
                for line in f:
                    line = line.decode('utf8', errors='ignore')
                    if line[0].isdigit():  # Data lines start with the CIK number
                        line = line.split('|')
                        if line[2] in VALID_FORMS:
                            filing_txt = line[4].strip().split('/')[-1]
                            filing_id = filing_txt.replace('-', '').replace('.txt', '')
                            filing_dir = os.path.join(outdir_year_q, filing_id)
                            if not os.path.exists(filing_dir):
                                os.makedirs(filing_dir)
                            filing_index = os.path.join(filing_dir, filing_txt.replace('.txt', '') + '-index.html')
                            if not os.path.exists(filing_index):  # Skip if we already downloaded the filing index
                                index_url = BASE_URL + 'edgar/data/' + filing_id + '/' + filing_txt.replace('.txt', '') + '-index.html'
                                print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), year, q, 'Downloading index', index_url)
                                time.sleep(SLEEP_TIME)
                                try:
                                    index_html = requests.get(index_url, headers=HEADERS)
                                except requests.RequestException:
                                    print("skipping", index_url)
                                    continue
                                with open(filing_index, 'w') as f_out:
                                    f_out.write(index_html.text)
                            # Parse the index page and download any exhibit-10 attachments
                            index_html = open(filing_index).read()
                            trs = re.findall('<tr[^>]*>(.*?)</tr>', index_html, re.S)
                            for row in trs:
                                if '<td' not in row:
                                    continue
                                tds = re.split('</?td[^>]*>', row)
                                if len(tds) > 7 and tds[7].startswith('EX-10'):
                                    match = re.search('"(.+)"', tds[5])
                                    if match is None:
                                        continue
                                    file_name = match.group(1)
                                    file_url = 'https://www.sec.gov' + file_name
                                    if file_url.endswith(('htm', 'html')):
                                        filing_file = os.path.join(filing_dir, file_name.split('/')[-1])
                                        if not os.path.exists(filing_file):
                                            print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), year, q, 'Downloading contract', file_url)
                                            time.sleep(SLEEP_TIME)
                                            try:
                                                filing_html = requests.get(file_url, headers=HEADERS)
                                                with open(filing_file, 'w') as f_out:
                                                    f_out.write(filing_html.text)
                                            except requests.RequestException:
                                                print("skipping", filing_file)
                                                continue
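# A filing's -index.html lists attachments in a table with columns
# Seq | Description | Document | Type | Size. After the re.split above, tds[5]
# holds the <a href="..."> document cell and tds[7] the type, e.g. "EX-10.1".
# A row looks roughly like this (illustrative markup, simplified from real EDGAR pages):
#   <tr><td>2</td><td>MATERIAL CONTRACT</td><td><a href="/Archives/...">ex10-1.htm</a></td><td>EX-10.1</td><td>12345</td></tr>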
if __name__ == '__main__':
    print('Fetching master files')
    fetch_master_files()
    print('Fetching contracts')
    crawl_master_files()
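# The crawl is resumable: every download is guarded by an os.path.exists check,
# so an interrupted run can simply be restarted and will skip files it already has.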