forked from vincentjs/oshaughnessey
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Scraper.py
148 lines (108 loc) · 3.9 KB
/
Scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import re
from urllib.request import urlopen
from Stock import Stock
def importHtml(url):
"Scrapes the HTML file from the given URL"
response = urlopen(url, data = None)
html = response.read().decode('utf-8').split('\n')
return html
def importFinvizPage(html, stocks):
"Imports data from a FINVIZ HTML page"
isFound = False
for line in html:
if line[0:15] == '<td height="10"':
isFound = True
# Import data line into stock database
_readFinvizLine(line, stocks)
if isFound and len(line) < 10:
break
return
def _readFinvizLine(line, stocks):
"Imports stock metrics from the data line and stores it in the database"
# Parse html
(stkraw, dl) = _parseHtml(line)
# Create new stock object
stock = Stock()
# Get ticker symbol
stock.tick = stkraw[dl[1] + 1: dl[2]]
# Get company name
stock.name = stkraw[dl[2] + 1 : dl[3]]
# Get market cap multiplier (either MM or BB)
if stkraw[dl[4] - 1] == 'B':
capmult = 1000000000
else:
capmult = 1000000
# Get market cap
stock.mktcap = capmult * _toFloat(stkraw[dl[3] + 1 : dl[4] - 1])
# Get P/E ratio
stock.pe = _toFloat(stkraw[dl[4] + 1 : dl[5]])
# Get P/S ratio
stock.ps = _toFloat(stkraw[dl[5] + 1 : dl[6]])
# Get P/B ratio
stock.pb = _toFloat(stkraw[dl[6] + 1 : dl[7]])
# Get P/FCF ratio
stock.pfcf = _toFloat(stkraw[dl[7] + 1 : dl[8]])
# Get Dividend Yield
stock.div = _toFloat(stkraw[dl[8] + 1 : dl[9] - 1])
# Get 6-mo Relative Price Strength
stock.mom = _toFloat(stkraw[dl[9] + 1 : dl[10] - 1])
# Get Current Stock Price
stock.price = _toFloat(stkraw[dl[11] + 1 : dl[12]])
# Append stock to list of stocks
stocks.append(stock)
return
def _toFloat(line):
"Converts a string to a float"
try:
num = float(line)
except:
num = float('NaN')
return num
def readYahooEVEBITDA(line):
"Imports EV/EBITDA data from Yahoo! Finance"
# Parse html
(stkraw, dl) = _parseHtml(line)
for i in range(0, len(dl)):
if (stkraw[dl[i] + 1 : dl[i] + 24] == 'Enterprise Value/EBITDA'):
evebitda = stkraw[dl[i + 1] + 1 : dl[i + 2]]
break
return _toFloat(evebitda)
def readYahooBBY(line):
"Imports BBY data from Yahoo! Finance"
# Line also contains Borrowings details - Remove it all
if 'Net Borrowings' in line:
# Remove extra data
line = line[:line.find('Net Borrowings')]
# Trim prior data
line = line[line.find('Sale Purchase of Stock'):]
# Determine if buys or sells, replace open parantheses:
# (#,###) -> -#,###
line = re.sub(r'[(]', '-', line)
# Eliminate commas and close parantheses: -#,### -> -####
line = re.sub(r'[,|)]', '', line)
# Remove HTML data and markup, replacing with commas
line = re.sub(r'[<.*?>|]', ',', line)
line = re.sub(' ', ',', line)
# Locate the beginnings of each quarterly Sale Purchase points
starts = [m.start() for m in re.finditer(',\d+,|,.\d+', line)]
# Locate the ends of each quarterly Sale Purchase points
ends = [m.start() for m in re.finditer('\d,', line)]
# Sum all buys and sells across year
tot = 0
for i in range(0, len(starts)):
# x1000 because all numbers are in thousands
tot = tot + float(line[starts[i] + 1 : ends[i] + 1]) * 1000
return tot
def _parseHtml(line):
"Parses the HTML line by </td> breaks and returns the delimited string"
# Replace </td> breaks with placeholder, '`'
ph = '`'
rem = re.sub('</td>', ph, line)
# The ticker symbol initial delimiter is different
# Remove all other remaining HTML data
stkraw = re.sub('<.*?>', '', rem)
# Replace unbalanced HTML
stkraw = re.sub('">', '`', stkraw)
# Find the placeholders
dl = [m.start() for m in re.finditer(ph, stkraw)]
return (stkraw, dl)