forked from stefan-jansen/machine-learning-for-trading
-
Notifications
You must be signed in to change notification settings - Fork 1.1k
/
Copy pathsa_selenium.py
116 lines (101 loc) · 3.62 KB
/
sa_selenium.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'Stefan Jansen'
import re
from pathlib import Path
from random import random
from time import sleep
from urllib.parse import urljoin
import pandas as pd
from bs4 import BeautifulSoup
from furl import furl
from selenium import webdriver
transcript_path = Path('transcripts')
def store_result(meta, participants, content):
"""Save parse content to csv"""
path = transcript_path / 'parsed' / meta['symbol']
if not path.exists():
path.mkdir(parents=True, exist_ok=True)
pd.DataFrame(content, columns=['speaker', 'q&a', 'content']).to_csv(path / 'content.csv', index=False)
pd.DataFrame(participants, columns=['type', 'name']).to_csv(path / 'participants.csv', index=False)
pd.Series(meta).to_csv(path / 'earnings.csv')
def parse_html(html):
"""Main html parser function"""
date_pattern = re.compile(r'(\d{2})-(\d{2})-(\d{2})')
quarter_pattern = re.compile(r'(\bQ\d\b)')
soup = BeautifulSoup(html, 'lxml')
meta, participants, content = {}, [], []
h1 = soup.find('h1', itemprop='headline')
if h1 is None:
return
h1 = h1.text
meta['company'] = h1[:h1.find('(')].strip()
meta['symbol'] = h1[h1.find('(') + 1:h1.find(')')]
title = soup.find('div', class_='title')
if title is None:
return
title = title.text
print(title)
match = date_pattern.search(title)
if match:
m, d, y = match.groups()
meta['month'] = int(m)
meta['day'] = int(d)
meta['year'] = int(y)
match = quarter_pattern.search(title)
if match:
meta['quarter'] = match.group(0)
qa = 0
speaker_types = ['Executives', 'Analysts']
for header in [p.parent for p in soup.find_all('strong')]:
text = header.text.strip()
if text.lower().startswith('copyright'):
continue
elif text.lower().startswith('question-and'):
qa = 1
continue
elif any([type in text for type in speaker_types]):
for participant in header.find_next_siblings('p'):
if participant.find('strong'):
break
else:
participants.append([text, participant.text])
else:
p = []
for participant in header.find_next_siblings('p'):
if participant.find('strong'):
break
else:
p.append(participant.text)
content.append([header.text, qa, '\n'.join(p)])
return meta, participants, content
SA_URL = 'https://seekingalpha.com/'
TRANSCRIPT = re.compile('Earnings Call Transcript')
next_page = True
page = 1
driver = webdriver.Firefox()
while next_page:
print(f'Page: {page}')
url = f'{SA_URL}/earnings/earnings-call-transcripts/{page}'
driver.get(urljoin(SA_URL, url))
sleep(8 + (random() - .5) * 2)
response = driver.page_source
page += 1
soup = BeautifulSoup(response, 'lxml')
links = soup.find_all(name='a', string=TRANSCRIPT)
if len(links) == 0:
next_page = False
else:
for link in links:
transcript_url = link.attrs.get('href')
article_url = furl(urljoin(SA_URL, transcript_url)).add({'part': 'single'})
driver.get(article_url.url)
html = driver.page_source
result = parse_html(html)
if result is not None:
meta, participants, content = result
meta['link'] = link
store_result(meta, participants, content)
sleep(8 + (random() - .5) * 2)
driver.close()
# pd.Series(articles).to_csv('articles.csv')