-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape_trainingpeaks.py
143 lines (110 loc) · 4.46 KB
/
scrape_trainingpeaks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
from config import credentials, rider_mapping, DATA_PATH
import pyderman as dr
from selenium import webdriver
from tqdm import tqdm
import pandas as pd
import time
class Scraper(object):
    """Selenium-driven browser session that downloads workout files from TrainingPeaks.

    Typical call order (see ``main``): ``open_driver`` -> ``login`` ->
    ``click_calendar`` -> ``get_athletes`` -> per athlete: ``check_ethics``,
    ``click_athlete``, ``click_workouts``, ``scrape_athlete``.

    The methods use the ``find_element_by_*`` helpers and pass the driver path
    positionally to ``webdriver.Chrome``.
    NOTE(review): those helpers were removed in newer Selenium 4 releases --
    confirm the Selenium version this project pins.

    Attributes set along the way:
        dates: sequence of datetime-like objects; consecutive pairs form the
            date windows that are scraped.
        restart: True until the first athlete has been scraped; controls
            whether the workout filter is (re)applied in ``click_workouts``.
        date_format: ``strftime`` pattern used when typing dates into the UI.
        driver: the Chrome WebDriver (set by ``open_driver``).
        athletes: list of athlete elements (set by ``get_athletes``).
        d, j: index of the current date window / activity (set by the loops in
            ``scrape_athlete`` / ``scrape_activities``).
    """

    # Value handed to ``driver.set_page_load_timeout`` after UI clicks.
    # NOTE(review): set_page_load_timeout only configures how long the driver
    # waits for a page load to complete; it does not pause the script. If a
    # pause was intended after these clicks, replace the calls with an
    # explicit wait -- confirm intent before changing.
    PAGE_LOAD_TIMEOUT = 10

    def __init__(self, dates, month_first=True):
        """Store the date windows and choose the date format typed into the UI.

        Args:
            dates: sequence of datetime-like objects supporting ``strftime``.
            month_first: if True use ``%m/%d/%Y``, otherwise ``%d/%m/%Y``.
        """
        self.dates = dates
        self.restart = True
        self.date_format = '%m/%d/%Y' if month_first else '%d/%m/%Y'

    def open_driver(self, download_path, driver_version='96.0.4664.45'):
        """Install chromedriver via pyderman and start a maximized Chrome window.

        Args:
            download_path: directory set as Chrome's default download directory.
            driver_version: chromedriver version to install; defaults to the
                version this script was originally pinned to.
        """
        driver_path = dr.install(browser=dr.chrome, file_directory='./lib/', verbose=True, chmod=True,
                                 overwrite=False, version=driver_version, filename=None, return_info=False)

        options = webdriver.ChromeOptions()
        options.add_experimental_option("prefs", {"download.default_directory": download_path})

        self.driver = webdriver.Chrome(driver_path, options=options)
        self.driver.maximize_window()

    def login(self, url_login, credentials):
        """Open the login page, submit the credentials and wait 15 seconds.

        Args:
            url_login: URL of the login page.
            credentials: mapping with ``'username'`` and ``'password'`` keys.
        """
        self.driver.get(url_login)
        self.driver.find_element_by_name('Username').send_keys(credentials['username'])
        self.driver.find_element_by_name('Password').send_keys(credentials['password'])
        self.driver.find_element_by_name('submit').click()
        time.sleep(15)

    def click_calendar(self):
        """Click the element with class ``calendar`` and wait 5 seconds."""
        self.driver.find_element_by_class_name('calendar').click()
        time.sleep(5)

    def get_athletes(self):
        """Open the athlete dropdown and store its entries in ``self.athletes``."""
        self.driver.find_element_by_class_name('groupAndAthleteSelector').click()  # dropdown menu
        time.sleep(5)
        self.athletes = self.driver.find_elements_by_class_name('athleteOption')  # list of athletes

    def check_ethics(self, i):
        """Return True if the i-th athlete may be scraped.

        An athlete is included when their displayed name without its first
        whitespace-separated token (lower-cased) is a key of ``rider_mapping``,
        or when the full lower-cased name equals ``'kusztor peter'``.
        Prints the name and the decision.

        Args:
            i: index into ``self.athletes``.
        """
        name = self.athletes[i].text
        print(name)

        name_without_first_token = ' '.join(name.split()[1:]).lower()
        included = name_without_first_token in rider_mapping or name.lower() == 'kusztor peter'

        print("include" if included else "exclude")
        return included

    def click_athlete(self, i):
        """Click the i-th entry of ``self.athletes`` and wait 5 seconds."""
        self.athletes[i].click()  # click on ith athlete
        time.sleep(5)

    def click_workouts(self):
        """Open the workout search (list layout); on the first visit also filter on bike."""
        # go to list layout
        self.driver.find_element_by_class_name('workoutSearch').click()
        time.sleep(5)

        # select only bike training (only first time visiting website)
        if self.restart:
            self.driver.find_element_by_class_name('filter').click()
            self.driver.set_page_load_timeout(self.PAGE_LOAD_TIMEOUT)
            self.driver.find_element_by_xpath("//*[@id='main']/div[1]/div/div/div[3]/div[3]/div/div[2]/div[5]/div[4]/div[2]/label[2]").click()  # select bike
            self.driver.set_page_load_timeout(self.PAGE_LOAD_TIMEOUT)

    def select_dates(self):
        """Type the current date window (``self.dates[self.d]`` to ``self.dates[self.d+1]``) into the search.

        The end date is entered before the start date, and a final newline is
        sent to the end-date field afterwards.
        NOTE(review): this ordering presumably works around the form's own
        validation -- confirm before reordering. Consecutive windows share a
        boundary date, so an activity on that date may appear in two windows.
        """
        start_date = self.dates[self.d]
        end_date = self.dates[self.d+1]

        # The fields are looked up again for every action, as in the original flow.
        self.driver.find_element_by_class_name('endDate').clear()
        self.driver.find_element_by_class_name('endDate').send_keys(end_date.strftime(self.date_format)+'\n')
        time.sleep(5)

        self.driver.find_element_by_class_name('startDate').clear()
        self.driver.find_element_by_class_name('startDate').send_keys(start_date.strftime(self.date_format)+'\n')
        self.driver.find_element_by_class_name('endDate').send_keys('\n')
        time.sleep(25)

    @staticmethod
    def _parse_hit_count(text):
        """Return the leading integer of a hit-count label such as ``'12 results'``.

        Thousands separators (``,``) are ignored.

        Raises:
            ValueError: if the label does not start with a number.
        """
        label = text.strip().replace(',', '')
        # Keep only the leading run of digits. The previous
        # ``text.strip(' results')`` stripped a *set of characters*, not the
        # suffix, and only worked because digits are not in that set.
        digits = label[:len(label) - len(label.lstrip('0123456789'))]
        return int(digits)

    def scrape_activities(self, j_min=0):
        """Open each listed activity from ``j_min`` onwards and download its file.

        The number of activities is read from the ``totalHits`` label. For each
        one the quick view is opened; unless the upload area reads exactly
        ``'Upload'`` (presumably meaning no file is attached -- confirm), the
        file is downloaded. The quick view is then closed.

        Args:
            j_min: index of the first activity to process.
        """
        activities = self.driver.find_elements_by_class_name("activity")
        total_hits = self._parse_hit_count(self.driver.find_element_by_class_name('totalHits').text)

        for self.j in range(j_min, total_hits):
            activities[self.j].click()
            time.sleep(1)

            if self.driver.find_element_by_id('quickViewFileUploadDiv').text != 'Upload':
                self.driver.find_element_by_id('quickViewFileUploadDiv').click()
                self.driver.set_page_load_timeout(self.PAGE_LOAD_TIMEOUT)

                # download
                self.driver.find_element_by_class_name('download').click()
                self.driver.set_page_load_timeout(self.PAGE_LOAD_TIMEOUT)

            self.driver.find_element_by_id('closeIcon').click()
            self.driver.set_page_load_timeout(self.PAGE_LOAD_TIMEOUT)

    def scrape_athlete(self, d_min=0):
        """Scrape every date window for the currently selected athlete.

        Iterates over consecutive pairs of ``self.dates`` starting at window
        ``d_min``, then clicks the ``closeIcon`` element and clears
        ``self.restart`` so the filter is not applied again.

        Args:
            d_min: index of the first date window to process.
        """
        for self.d in tqdm(range(d_min, len(self.dates)-1)):
            self.select_dates()
            self.scrape_activities()

        self.driver.find_element_by_class_name('closeIcon').click()
        self.driver.set_page_load_timeout(self.PAGE_LOAD_TIMEOUT)
        self.restart = False
def main():
    """Scrape TrainingPeaks workout files for every included athlete.

    Selection:
        - workout type = bike (filter applied by ``Scraper.click_workouts``)
        - date range = 2014-01-01 until 2021-12-31, split into 30-day windows
          (a trailing partial window shorter than 30 days is not scraped)

    Files are downloaded to ``DATA_PATH + 'TrainingPeaks/export/'``. The
    browser is always closed when the function exits, including on errors.
    """
    # ISO dates give the same range as '01-01-2014' / '31-12-2021' without
    # relying on day-first inference.
    dates = pd.date_range(start='2014-01-01', end='2021-12-31', freq='30D')

    scraper = Scraper(dates=dates)
    scraper.open_driver(download_path=DATA_PATH+'TrainingPeaks/export/')

    try:
        scraper.login(url_login='https://home.trainingpeaks.com/login', credentials=credentials['TP']['pro'])
        scraper.click_calendar()
        scraper.get_athletes()

        # Index-based on purpose: scraper.athletes is replaced inside the loop.
        for i in range(len(scraper.athletes)):
            if not scraper.check_ethics(i):
                continue

            scraper.click_athlete(i)
            scraper.click_workouts()
            scraper.scrape_athlete()
            # Re-open the dropdown and refresh the athlete elements before
            # moving on to the next index.
            scraper.get_athletes()

        # Give the last download a moment to finish before the browser closes.
        time.sleep(15)
    finally:
        scraper.driver.quit()


if __name__ == '__main__':
    main()