# scrape.py
import bs4
import psycopg2
import time, sys, threading, requests
from configparser import ConfigParser
from datetime import datetime
from itertools import cycle
# takes in a results link, finds the table rows, and returns that list
# returns None if the results table isn't found
# super simple, just finds the list of rows
def scrapePerformances(resultsLink):
r = requests.get(resultsLink)
soup = bs4.BeautifulSoup(r.content, "html5lib")
outer_div = soup.find('div', class_= 'panel-body frame-loading-hide')
if outer_div:
rowList = outer_div.find_all('tr', class_='allRows')
return rowList
return None
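# a minimal usage sketch (the URL below is illustrative, not a real list link):
#   rows = scrapePerformances("https://tf.tfrrs.org/lists/example_list.html")
#   if rows:
#       print(f"found {len(rows)} performance rows")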
# gets the link to the page containing all the top marks for each season
# year is just the year, seasonStr is either indoor or outdoor
# CURRENTLY ONLY: results for NCAA D1 qualifiers
# TODO: D2, D3, JUCO, Christian, NAIA
def getResultsLink(year,seasonStr):
# depending on older results, might add a gender spec later
url = "https://tf.tfrrs.org/college_archives_tab.html?"+seasonStr+"=1&year="+year
r = requests.get(url)
soup = bs4.BeautifulSoup(r.content, 'html5lib')
# just gets the first div in the row, the first list in that div, and the link from the first a tag
# should be the same for every year; returns the link when found
# this looks ugly but it's how you do it in bs4
row_div = soup.find('div', class_= 'row')
if row_div:
first_div_inside_row = row_div.find('div')
if first_div_inside_row:
ul_inside_first_div = first_div_inside_row.find('ul')
if ul_inside_first_div:
first_li_inside_ul = ul_inside_first_div.find('li')
if first_li_inside_ul:
a_tag = first_li_inside_ul.find('a')
if a_tag:
return a_tag.get('href')
return None
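# example: getResultsLink("2023", "outdoor") should return the href of the first
# archive link on the 2023 outdoor page, or None if the page layout differs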
# config parse function
# reads connection settings from the database.ini file
# found out about ConfigParser, decided to try it out
def config(filename='database.ini', section='postgresql'):
# create a parser
parser = ConfigParser()
# read config file
parser.read(filename)
# get section, default to postgresql
db = {}
if parser.has_section(section):
params = parser.items(section)
for param in params:
db[param[0]] = param[1]
else:
raise Exception('Section {0} not found in the {1} file'.format(section, filename))
return db
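# a sample database.ini this function expects (values here are placeholders,
# not the real config; the keys map straight to psycopg2.connect kwargs):
#   [postgresql]
#   host=localhost
#   database=tfrrs
#   user=postgres
#   password=secret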
# progress bar function. scraping can take a while so i thought this would look cool :)
def progressbar(current_value,total_value):
if total_value == 0:
total_value = 1
progress = (current_value/total_value) * 10
boxes = ("■" * int(abs(progress)))
spaces = " " * (10-len(boxes))
loadbar = f" [{boxes+spaces}]{int(abs(progress)*10)}%"
print(loadbar, end ='\r')
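# e.g. progressbar(5, 10) renders " [■■■■■     ]50%" and redraws in place via \r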
# convert time format for the rows to match SQL time type
# this is so all events use same format for their result times
def convertTime(time_str):
try:
if ':' in time_str:
# If the time format includes minutes and seconds (e.g., 10:56.66)
time_format = "%M:%S.%f"
else:
# If the time format is in seconds only (e.g., 6.94)
time_format = "%S.%f"
# Parse the time string to a timedelta object
time_delta = datetime.strptime(time_str, time_format) - datetime(1900, 1, 1)
# Convert the timedelta to seconds with 2 decimal places
seconds = time_delta.total_seconds()
# Round to 2 decimal places
seconds = round(seconds, 2)
return seconds
except ValueError:
print("Invalid time format. Please use either seconds (e.g., 6.94) or minutes:seconds.milliseconds (e.g., 10:56.66).")
return None
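# worked examples (minutes convert to seconds so every event compares numerically):
#   convertTime("10:56.66") -> 656.66   (10*60 + 56.66)
#   convertTime("6.94")     -> 6.94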
# remove the m at the end, or just keep the mark the same
# literally just checks if the last char is an m and slices it off. that's it
def convertMark(mark):
# if a meter distance (guard against empty marks)
if mark and mark[-1] == 'm':
return mark[:-1]
return mark
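# e.g. convertMark("7.92m") -> "7.92"; a multi-event score like "6211" passes through unchanged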
# MAIN FUNCTION:
# connects to db, scrapes all, stores them in postgreSQL
def scrapeAllYears():
if len(sys.argv) < 3:
print("Please provide a valid start and stop year")
print("Example usage: python3 scrape.py 2023 2024")
return 0
# connect
conn = None
# counter for rows skipped
errcount = 0
# wrapped all this in a try because it can run into weird one-off rows that mess it up
try:
params = config()
# print('Connecting to the PostgreSQL database...', end ='\r')
conn = psycopg2.connect(**params)
cur = conn.cursor()
# begin operations in DB: make table
cur.execute("""DROP TABLE IF EXISTS TopPerformances""")
cur.execute("""CREATE TABLE IF NOT EXISTS TopPerformances (
entryid SERIAL PRIMARY KEY,
event varchar(255),
rank int NOT NULL,
gender varchar(255),
athlete varchar(255),
team varchar(255),
result float,
meet varchar(255),
season varchar(255),
meetdate date)
""")
# loop through seasons and scrape!!!!
# works from 2012 through the latest year
#START
# pull the year range from the command line (validated above)
startYear = int(sys.argv[1])
endYear = int(sys.argv[2])
year = datetime.today().year
# linkYear is just the year we plug into the results link as we advance
linkYear = startYear
# starts at indoor season, switches to outdoor within each year
linkSeason = "indoor"
# gets first link
curLink = getResultsLink(str(linkYear), linkSeason)
# event sets, so we can check how to store each event's mark in the DB
timeEvents = { '400Hurdles', '110Hurdles', '100Hurdles', '10000Meters', '1500Meters', '100Meters', '60Meters', '200Meters',
'400Meters', '800Meters', '4x400Relay', '4x100Relay', '3000Steeplechase', 'Mile', '3000Meters', '5000Meters',
'60Hurdles', 'DistanceMedleyRelay'}
meterEvents = {'Javelin', 'Hammer', 'HighJump', 'PoleVault', 'LongJump', 'TripleJump', 'ShotPut', 'Discus', 'WeightThrow'}
multiEvents = {'Decathlon', 'Pentathlon', 'Heptathlon'}
#------------------------- MAIN OUTER LOOP!!!!!!!!!!!!------------------------------
# print(f"Beginning scrape from {startYear} to {endYear}", end ='\r')
# loop through the years/seasons
while curLink and linkYear <= endYear:
progressbar(linkYear-startYear, year-startYear)
#scrape indoor
# print(f"{linkYear}, {linkSeason}, {curLink}")
# gets rows
rows = scrapePerformances(curLink)
# INNER LOOP!!!!!!!!!!!!!!!
# loop through the rows in each season
for row in rows:
try:
# not proud of this here, but i found out while making the backend queries that i don't even
# have a gender field. this is how i decided to fix that. works,
# and surprisingly doesn't slow the program to a halt.
# you can index parents in bs4 (parent[3] maybe, i'll make it look nicer later)
if (row.parent.parent.parent.parent.get("class")[1][-1]) == 'f':
gender = 'female'
else:
gender = 'male'
# get data row object, extract the actual data from the row
data = row.find_all('a')
rank = data[0].text
meetdate = row.find('td', class_='tablesaw-priority-2').text
meetdate = datetime.strptime(meetdate, '%b %d, %Y').strftime('%Y-%m-%d')
# this cell doesn't exist in relay rows, so i use it to check
checkRelay = row.find('td', class_='tablesaw-priority-1')
# gather data depending on relay or nah
# IF A RELAY!
if not checkRelay:
team = data[1].text
result = data[2].text
# MULTIPLE athletes are stored as one for the time being, don't need to make another table
athlete = f"{data[3].text}, {data[4].text}, {data[5].text}, {data[6].text}"
meet = data[7].text
event = data[2].get("href").split("/")[-1].replace("-","")
# if rank == "1":
# print(f"rank:{rank} team:{team} time: {time}athletes:{athlete} meet:{meet} date:{meetdate}")
else:
athlete = data[1].text
team = data[2].text
result = data[3].text
meet = data[4].text
event = data[3].get("href").split("/")[-1].replace("-","")
# if rank == "1":
# print(f"rank:{rank} athlete:{athlete} team:{team} time:{time} meet:{meet} date:{meetdate} ")
#RESULT INSERT depending on a field event, multi, or run
# convert to seconds
# uses hashed event sets from above
if event in timeEvents:
result = convertTime(result)
elif event in meterEvents:
result = convertMark(result)
meet = data[5].text
elif event in multiEvents:
result = convertMark(result)
else:
# unknown event: count it and skip the row instead of aborting the whole scrape
print("ERROR: BAD EVENT DETECTED")
errcount+=1
continue
# finally insert
cur.execute("""insert into topperformances (event, rank, gender, athlete, team, result, meet, season, meetdate)
values (%s, %s, %s, %s, %s, %s, %s, %s, %s)""",
(event, int(rank), gender, athlete, team, float(result), meet, linkSeason, meetdate))
# IF A BAD ROW: don't crash
# example of a bad row: a relay with only 3 people running (idk how that even happens, but it happened once in like 2012 lol)
except (Exception, psycopg2.DatabaseError) as error:
errcount+=1
# print(error)
continue
# set for next season
if linkSeason == "indoor":
linkSeason = "outdoor"
else:
linkYear+=1
linkSeason = "indoor"
# get link for the next season
curLink = getResultsLink(str(linkYear), linkSeason)
progressbar(1, 1)
print("\nSuccessfully completed scrape")
# run clean up queries. fixes some weird data
# combines some teams so the backend queries can run
# i don't want 2 teams named miami ohio or two named miami florida
# this is the only inconsistency i could find.
# also the first query fixes (A, B, C) squads flooding the entries, so they aren't counted as different schools in the database
cur.execute("""UPDATE topperformances
SET team = LEFT(team, LENGTH(team) - 4)
WHERE team LIKE '%(_)'""")
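# e.g. hypothetical 'Oregon (A)' and 'Oregon (B)' entries both collapse to 'Oregon':
# LIKE '%(_)' matches a trailing "(X)" and LEFT(..., LENGTH - 4) drops the 4-char " (X)" suffix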
cur.execute("""UPDATE topperformances
SET team = 'Miami (Ohio)'
WHERE team = 'Miami (OH)'""")
cur.execute("""UPDATE topperformances
SET team = 'Miami (Fla.)'
WHERE team = 'Miami'""")
cur.close()
conn.commit()
except (Exception, psycopg2.DatabaseError) as error:
print('\n')
# print(athlete)
# print(event)
print(error)
finally:
if conn is not None:
conn.close()
print('Database connection closed.')
return errcount
def animate():
# cool little animation looks like npm lol
for c in cycle(["⢿", "⣻", "⣽", "⣾", "⣷", "⣯", "⣟", "⡿"]):
if done:
break
sys.stdout.write('\rScraping ' + c + " ")
sys.stdout.flush()
time.sleep(0.1)
if __name__ == "__main__":
# currently works 2012-current
# doesn't work earlier (yet) because:
# the lists before 2012 have weird imgs in them and things get wonky
# can be adapted for other divisions easily, will do it in the future
# happy scraping :)
done = False
t = threading.Thread(target=animate)
t.start()
count = scrapeAllYears()
done = True
t.join()
print("Skipped entries: " + str(count))