-
Notifications
You must be signed in to change notification settings - Fork 0
/
pickle_reader.py
330 lines (254 loc) · 12 KB
/
pickle_reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
import csv
import json
import os
import re

import pandas as pd
def load_links_from_csv(filename):
    """
    Load a list of links from a CSV file.

    Only the first column of each row is read. Blank lines, which the csv
    reader yields as empty rows, are skipped instead of raising IndexError.

    :param filename: The name of the CSV file to load the links from.
    :return: A list of links (strings) loaded from the CSV file.
    """
    # newline='' lets the csv module do its own line-ending handling
    with open(filename, 'r', newline='') as file:
        # `if row` drops the empty lists produced by blank lines
        return [row[0] for row in csv.reader(file) if row]
def save_links_to_csv(links, filename):
    """
    Write each link to a CSV file, one single-column row per link.

    :param links: A list of links (strings) to be saved.
    :param filename: The name of the CSV file to save the links.
    """
    # newline='' leaves line-ending control to the csv writer
    with open(filename, 'w', newline='') as out_file:
        # Wrap every link in a one-element row and emit them in one call
        csv.writer(out_file).writerows([entry] for entry in links)
def load_from_json(file_name):
    """
    Load data from a JSON file.

    The file is decoded as UTF-8 (the JSON interchange encoding) rather than
    with the platform's locale encoding.

    :param file_name: The name of the JSON file to load the data from.
    :return: The data loaded from the JSON file.
    :raises json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(file_name, 'r', encoding='utf-8') as file:
        return json.load(file)
def save_to_json(data, file_name):
    """
    Save data to a JSON file.

    The file is written as UTF-8 so the result does not depend on the
    platform's locale encoding.

    :param data: The data to be saved in JSON format.
    :param file_name: The name of the JSON file to save the data to.
    :raises TypeError: If ``data`` contains objects that json cannot serialize.
    """
    with open(file_name, 'w', encoding='utf-8') as file:
        json.dump(data, file)
def get_rid_of_cookies_popup(driver):
    """
    Close the cookies pop-up on the BattleBots website.

    Uses the generic ``find_element(by, value)`` locator call, because the
    ``find_element_by_*`` helpers were removed in Selenium 4.3.

    :param driver: The WebDriver instance navigating the website.
    """
    # "class name" is the locator strategy string behind Selenium's By.CLASS_NAME
    cookies_button = driver.find_element("class name", "cmplz-close")
    # Click the close button to get rid of the cookies pop-up
    cookies_button.click()
def get_season_links():
    """
    Scrapes the BattleBots website to get the links for each season.

    The browser is always closed, even if scraping fails part-way. Elements
    are located with the generic ``find_element(s)(by, value)`` calls because
    the ``find_element(s)_by_*`` helpers were removed in Selenium 4.3.

    NOTE(review): ``webdriver`` is expected to be Selenium's ``webdriver``
    module; no import for it is visible in this file - confirm it is in scope.

    :return: A list of links for each season.
    :raises IndexError: If the page has fewer "row" elements or anchors than
        the hard-coded positions below expect.
    """
    base_link = "https://battlebots.com/robots/"
    driver = webdriver.Chrome()
    try:
        driver.get(base_link)
        # All elements with class "row"; the season rows are picked by position
        row_elements = driver.find_elements("class name", "row")
        links = []
        # Rows 4 - 6 contribute two links each: the anchors at index 1 and 2
        for row_index in range(4, 7):
            anchors = row_elements[row_index].find_elements("tag name", "a")
            for anchor_index in (1, 2):
                links.append(anchors[anchor_index].get_attribute("href"))
        # Row 7 contributes only its first anchor
        last_anchor = row_elements[7].find_element("tag name", "a")
        links.append(last_anchor.get_attribute("href"))
    finally:
        # Close the scraper browser whether or not scraping succeeded
        driver.quit()
    return links
def get_season_robot_links(season_link):
    """
    Given a link to a BattleBots season page, return a list of links to all the robots
    listed on that page.

    The page is loaded once (the original loaded it twice) and the browser is
    always closed, even if scraping fails. Elements are located with the
    generic ``find_element(s)(by, value)`` calls because the
    ``find_element(s)_by_*`` helpers were removed in Selenium 4.3.

    NOTE(review): ``webdriver`` is expected to be Selenium's ``webdriver``
    module; no import for it is visible in this file - confirm it is in scope.

    :param season_link: A string representing the URL of the BattleBots season page.
    :return: A list of strings with the ``href`` of the ``a.button`` element found
        inside each ``.portfolio-item-wrapper`` element on the page.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(season_link)
        # One wrapper element per robot thumbnail
        wrappers = driver.find_elements("css selector", ".portfolio-item-wrapper")
        # Each wrapper holds an `a.button` anchor whose href is the robot page
        links = [
            wrapper.find_element("css selector", "a.button").get_attribute("href")
            for wrapper in wrappers
        ]
    finally:
        # Close the scraper browser whether or not scraping succeeded
        driver.quit()
    return links
def get_all_season_robot_links_dict():
    """
    Collect the robot links of every season found by ``get_season_links``.

    Each key is the second-to-last '/'-separated component of the season
    URL; each value is the list returned by ``get_season_robot_links`` for
    that URL.

    :return: A dictionary containing season names as keys and lists of robot links as values.
    """
    return {
        season_url.split('/')[-2]: get_season_robot_links(season_url)
        for season_url in get_season_links()
    }
def read_pickles_in_directory(directory_path):
    """
    Read the robot pickle files in a directory into a nested dictionary.

    Only files named exactly ``<robot_name>_match_history.pkl`` or
    ``<robot_name>_stats.pkl`` are read; names with anything after ``.pkl``
    (e.g. ``foo_stats.pkl.bak``) are ignored. Pickles whose DataFrame is empty
    are skipped. Files are processed in sorted name order so the result is
    deterministic.

    NOTE: unpickling can execute arbitrary code - only point this at
    directories whose contents you trust.

    :param directory_path: The path of the directory containing the pickle files.
    :return: A dictionary containing robot names as keys and a nested dictionary with
        'match_history' and/or 'stats' DataFrames as values.
    """
    # Group 1 is the robot name, group 2 the table type; fullmatch anchors both ends
    filename_pattern = re.compile(r'(.+)_(match_history|stats)\.pkl')
    robot_dict = {}
    for filename in sorted(os.listdir(directory_path)):
        match = filename_pattern.fullmatch(filename)
        if match is None:
            continue
        robot_name, table_type = match.groups()
        df = pd.read_pickle(os.path.join(directory_path, filename))
        if df.empty:
            continue
        # Create the robot's nested dict on first sight, then store the table
        robot_dict.setdefault(robot_name, {})[table_type] = df
    return robot_dict
def create_long_format_stats_df(robot_dict):
    """
    Given a dictionary of robots containing their stats, create a long format DataFrame.

    For every robot that has a 'stats' table, rows whose 'Stats' value is
    'index' are dropped, a 'RobotName' column is added, and all remaining
    columns are melted into 'Year'/'Value' pairs. The input DataFrames are not
    modified. If no robot has a 'stats' table, an empty DataFrame with the
    output columns is returned instead of raising.

    :param robot_dict: A dictionary containing robot names as keys and nested dictionaries
        with 'stats' DataFrames as values.
    :return: A DataFrame in long format with columns 'RobotName', 'Stats', 'Year', 'Value'.
    """
    output_columns = ['RobotName', 'Stats', 'Year', 'Value']
    long_format_dfs = []
    for robot_name, nested_dict in robot_dict.items():
        if "stats" not in nested_dict:
            continue
        stats_df = nested_dict["stats"]
        long_df = (
            stats_df[stats_df['Stats'] != 'index']
            # assign() returns a new frame, avoiding a write into a filtered slice
            .assign(RobotName=robot_name)
            .melt(id_vars=['RobotName', 'Stats'], var_name='Year', value_name='Value')
        )
        long_format_dfs.append(long_df)
    # pd.concat raises on an empty list, so handle "no stats at all" explicitly
    if not long_format_dfs:
        return pd.DataFrame(columns=output_columns)
    return pd.concat(long_format_dfs, ignore_index=True)
def _add_match_history_suffix(column, suffix='_match_history'):
    """Return ``column`` ending in exactly one ``suffix``; 'Season' and 'RobotName' are left as-is."""
    if column in ('Season', 'RobotName'):
        return column
    # Strip any suffixes already present so the result never carries it twice
    while column.endswith(suffix):
        column = column[:-len(suffix)]
    return column + suffix


def create_long_format_matchup_history_df(robot_dict):
    """
    Takes a dictionary of robot information, and returns a Pandas DataFrame containing
    the match history for all robots in long format.

    The input DataFrames are not modified: the 'RobotName' column and the
    '_match_history' column suffix are applied to copies, so the function can
    be called repeatedly on the same ``robot_dict``. If no robot has a
    'match_history' table, an empty DataFrame with the output columns is
    returned instead of raising.

    Parameters:
    robot_dict (dict): A dictionary where the keys are robot names and the values
    are dictionaries containing information about the robot.

    Returns:
    all_match_history_df_melt (Pandas DataFrame): A DataFrame in long format with
    columns 'Year', 'RobotName', 'Stats', 'Value'. Rows with a null 'Year' or
    'Value' are dropped.

    Raises:
    KeyError: If the combined match history lacks 'Season' or any of the
    Round/Matchup/Results/Opponent columns.
    """
    match_history_dfs = []
    for robot_name, tables_dict in robot_dict.items():
        if 'match_history' not in tables_dict:
            continue
        # assign() and rename() both return new frames, leaving the caller's data intact
        match_history_df = (
            tables_dict['match_history']
            .assign(RobotName=robot_name)
            .rename(columns=_add_match_history_suffix)
        )
        match_history_dfs.append(match_history_df)

    # pd.concat raises on an empty list, so handle "no match history" explicitly
    if not match_history_dfs:
        return pd.DataFrame(columns=['Year', 'RobotName', 'Stats', 'Value'])

    all_match_history_df = pd.concat(match_history_dfs)
    # Melt the four match columns into Stats/Value pairs
    all_match_history_df_melt = pd.melt(
        all_match_history_df,
        id_vars=['Season', 'RobotName'],
        value_vars=['Round_match_history', 'Matchup_match_history', 'Results_match_history', 'Opponent_match_history'],
        var_name='Stats',
        value_name='Value'
    )
    all_match_history_df_melt = all_match_history_df_melt.rename(columns={'Season': 'Year'})
    # Concatenating frames with differing columns introduces nulls; drop those rows
    all_match_history_df_melt = all_match_history_df_melt.dropna(subset=['Year', 'Value'])
    return all_match_history_df_melt
def get_all_wiki_robot_links():
    """
    Scrapes the Battlebots wiki category page for competitor list templates and
    collects the link of every listed member.

    The browser is always closed, even if scraping fails. Elements are located
    with the generic ``find_elements(by, value)`` call because the
    ``find_elements_by_*`` helpers were removed in Selenium 4.3.

    NOTE(review): ``webdriver`` is expected to be Selenium's ``webdriver``
    module; no import for it is visible in this file - confirm it is in scope.

    :return: A list of links to all competitor list templates.
    """
    link = "https://battlebots.fandom.com/wiki/Category:Competitor_list_templates"
    driver = webdriver.Chrome()
    try:
        driver.get(link)
        # Wrapper elements that group the category members on the page
        member_elements = driver.find_elements("class name", "category-page__members-wrapper")
        # Collect the href of every member link inside every wrapper
        links = [
            a_element.get_attribute("href")
            for member_element in member_elements
            for a_element in member_element.find_elements("css selector", ".category-page__member-link")
        ]
    finally:
        # Close the scraper browser whether or not scraping succeeded
        driver.quit()
    return links