-
Notifications
You must be signed in to change notification settings - Fork 0
/
cluster_preprocessing.py
297 lines (250 loc) · 16.9 KB
/
cluster_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
"""This module provides functions for preprocessing data for clustering."""
# Import pandas as pd
import pandas as pd
# preprocessing_groups_institutions receives the groups dataframe as the
# parameter "data" and returns one summary row per institution.
def preprocessing_groups_institutions(data):
    """Summarise the research groups of each institution for clustering.

    Args:
        data: Groups dataframe. The columns "instituciones", "Estado" and
            "Año y mes de formación" are read. The dataframe itself is not
            modified.

    Returns:
        A dataframe indexed by institution whose columns are, in order:
        "instituciones" (a copy of the index), "total_groups" (number of
        rows per institution), one count column per distinct cleaned
        "Estado" value (with "A", "A1", "B", "C" and "Grupo reconocido"
        renamed to "Group_A", "Group_A1", "Group_B", "Group_C" and
        "Group_no_category"), and "anio" (the smallest four-character
        prefix of "Año y mes de formación" found for the institution).
    """
    # Work on a private copy of the needed columns so that the caller's
    # dataframe keeps its original "Estado" and formation-date values.
    groups = data[["instituciones", "Estado", "Año y mes de formación"]].copy()

    # "Categoría A" -> "A"; the code "00" stands for a recognised group
    # without a category.
    groups["Estado"] = (
        groups["Estado"]
        .str.replace("Categoría ", "", regex=False)
        .replace("00", "Grupo reconocido")
    )
    # Keep only the first four characters of the formation date.
    groups["anio"] = groups["Año y mes de formación"].str[:4]

    # Number of rows (groups) per institution.
    total_groups = groups["instituciones"].value_counts().rename("total_groups")
    # Institution x category matrix of group counts.
    category_counts = pd.crosstab(groups["instituciones"], groups["Estado"])
    # Smallest "anio" value per institution.
    first_year = groups.groupby("instituciones")[["anio"]].min()

    institutions_cleaned = (
        total_groups.to_frame()
        .merge(category_counts, left_index=True, right_index=True)
        .merge(first_year, left_index=True, right_index=True)
        .rename(
            columns={
                "A": "Group_A",
                "A1": "Group_A1",
                "B": "Group_B",
                "C": "Group_C",
                "Grupo reconocido": "Group_no_category",
            }
        )
    )
    # Expose the institution name as the first regular column as well.
    institutions_cleaned.insert(0, "instituciones", institutions_cleaned.index)
    return institutions_cleaned
# preprocessing_groups_papers receives the dataframes "papers_raw" and
# "groups_raw" as parameters and returns the paper counts per
# institution.
def preprocessing_groups_papers(papers_raw, groups_raw):
    """Count papers per institution, broken down by "Publindex" value.

    Args:
        papers_raw: Papers dataframe; the columns "codigo_grupo" and
            "Publindex" are read.
        groups_raw: Groups dataframe; the columns "Código del grupo" and
            "instituciones" are read.

    Returns:
        A dataframe indexed by "instituciones" with one "Paper_<value>"
        column per distinct "Publindex" value plus "Paper_total", the row
        sum of those columns. Groups that are missing from either input do
        not contribute (inner join).
    """
    # Group x Publindex matrix of paper counts.
    papers_by_group = pd.crosstab(papers_raw["codigo_grupo"], papers_raw["Publindex"])

    # One institution per group code: the first row found for each code wins.
    group_institution = (
        groups_raw[["Código del grupo", "instituciones"]]
        .rename(columns={"Código del grupo": "codigo_grupo"})
        .drop_duplicates(subset=["codigo_grupo"])
    )

    # "codigo_grupo" is a column on the left and the index of the crosstab on
    # the right; merge resolves the name on both sides.
    papers_by_institution = (
        pd.merge(group_institution, papers_by_group, on="codigo_grupo")
        .drop(columns=["codigo_grupo"])
        .groupby(["instituciones"])
        .sum()
    )

    papers_by_institution["total"] = papers_by_institution.sum(axis=1)
    papers_by_institution.columns = [
        f"Paper_{col}" for col in papers_by_institution.columns
    ]
    return papers_by_institution
def preprocessing_groups_researchers(researchers_raw, groups_raw):
    """Count researchers per institution, in total and by degree level.

    Args:
        researchers_raw: Researchers dataframe; the columns "codigo_grupo"
            and "Posgrado" are read.
        groups_raw: Groups dataframe; the columns "Código del grupo" and
            "instituciones" are read.

    Returns:
        A dataframe with one row per institution that has at least one
        researcher with a recognised degree. It holds the institution
        name, one count column per degree label present in the data
        ("magister", "doctorate", "specialization",
        "medical_specialization", "undergrad"; missing combinations are
        filled with 0) and "researchers_total", the number of researcher
        rows attached to the institution.
    """
    # One institution per group code. Without the de-duplication a group
    # code that appears on several rows of groups_raw would multiply every
    # one of its researchers in the merge below; the other preprocessing
    # functions in this module de-duplicate the same way.
    group_institution = groups_raw.rename(
        columns={"Código del grupo": "codigo_grupo"}
    )[["codigo_grupo", "instituciones"]].drop_duplicates(subset=["codigo_grupo"])

    # Researchers whose group is unknown get a missing institution and are
    # left out by the group-bys below.
    researchers = researchers_raw.merge(
        group_institution, on="codigo_grupo", how="left"
    )

    # Total researcher rows per institution.
    totals = (
        researchers.groupby(["instituciones"])
        .size()
        .reset_index(name="researchers_total")
    )

    # Normalise the spelling variants of each degree to one English label.
    degree_labels = {
        "Maestría/magister": "magister",
        "Maestría/Magister": "magister",
        "Doctorado": "doctorate",
        "Especialización": "specialization",
        "Especialidad médica": "medical_specialization",
        "Especialidad Médica": "medical_specialization",
        "Pregrado/universitario": "undergrad",
        "Pregrado/Universitario": "undergrad",
    }
    degrees = researchers[["Posgrado", "instituciones"]].replace(
        {"Posgrado": degree_labels}
    )
    # Discard every row whose degree is not one of the recognised labels.
    degrees = degrees[degrees["Posgrado"].isin(set(degree_labels.values()))]

    # Institution x degree matrix of researcher counts.
    degree_counts = (
        degrees.groupby(["instituciones", "Posgrado"])
        .size()
        .reset_index(name="counts")
    )
    degree_matrix = degree_counts.pivot(
        index="instituciones", columns="Posgrado", values="counts"
    ).fillna(0)

    # NOTE(review): "instituciones" is the index of degree_matrix and a
    # column of totals; confirm the layout pandas produces for this mixed
    # merge is the one callers expect.
    institutions_researchers = degree_matrix.merge(
        totals, on="instituciones", how="left"
    )
    return institutions_researchers
# create a function called "preprocessing_groups_capitulos" with the code below
def preprocessing_groups_capitulos(groups_raw, capitulos_raw):
    """Count book chapters per institution.

    Args:
        groups_raw: Groups dataframe; the columns "Código del grupo" and
            "instituciones" are read.
        capitulos_raw: Chapters dataframe; the column "codigo_grupo" is
            read, one row per chapter.

    Returns:
        A dataframe indexed by "instituciones" with the single column
        "capitulos_totales". Groups that are missing from either input do
        not contribute (inner join).
    """
    # Number of chapter rows per group code.
    chapters_by_group = (
        capitulos_raw.groupby("codigo_grupo")
        .size()
        .reset_index(name="capitulos_totales")
    )

    # One institution per group code: the first row found for each code wins.
    group_institution = (
        groups_raw[["Código del grupo", "instituciones"]]
        .rename(columns={"Código del grupo": "codigo_grupo"})
        .drop_duplicates(subset=["codigo_grupo"])
    )

    # Attach the institution to each group, then add up per institution.
    institutions_capitulos = (
        pd.merge(group_institution, chapters_by_group, on="codigo_grupo")
        .drop(columns=["codigo_grupo"])
        .groupby(["instituciones"])
        .sum()
    )
    return institutions_capitulos
def preprocessing_groups_innovations(groups_raw, innovations_raw):
    """Count innovations per institution.

    Args:
        groups_raw: Groups dataframe; the columns "Código del grupo" and
            "instituciones" are read.
        innovations_raw: Innovations dataframe; the column "codigo_grupo"
            is read, one row per innovation.

    Returns:
        A dataframe indexed by "instituciones" with the single column
        "innovation_totals". Groups that are missing from either input do
        not contribute (inner join).
    """
    # Number of innovation rows per group code.
    innovations_by_group = (
        innovations_raw.groupby("codigo_grupo")
        .size()
        .reset_index(name="innovation_totals")
    )

    # One institution per group code: the first row found for each code wins.
    group_institution = (
        groups_raw[["Código del grupo", "instituciones"]]
        .rename(columns={"Código del grupo": "codigo_grupo"})
        .drop_duplicates(subset=["codigo_grupo"])
    )

    # Attach the institution to each group, then add up per institution.
    institutions_innovations = (
        pd.merge(group_institution, innovations_by_group, on="codigo_grupo")
        .drop(columns=["codigo_grupo"])
        .groupby(["instituciones"])
        .sum()
    )
    return institutions_innovations
def preprocessing_groups_books(groups_raw, books_raw):
    """Add up, per institution, the number of groups_raw rows of its groups.

    Args:
        groups_raw: Groups dataframe; the columns "Código del grupo" and
            "instituciones" are read.
        books_raw: Not read by this function.

    Returns:
        A dataframe indexed by "instituciones" with the single column
        "institutions_totals": for every group code, the number of rows it
        has in groups_raw, summed over the groups assigned to the
        institution.
    """
    # NOTE(review): books_raw is never used, although the function name
    # suggests a per-institution book count. The counts below come from
    # groups_raw only - confirm whether books_raw was meant to be grouped
    # here instead.
    rows_by_group = (
        groups_raw.groupby("Código del grupo")
        .size()
        .reset_index(name="institutions_totals")
        .rename(columns={"Código del grupo": "codigo_grupo"})
    )

    # One institution per group code: the first row found for each code wins.
    group_institution = (
        groups_raw[["Código del grupo", "instituciones"]]
        .rename(columns={"Código del grupo": "codigo_grupo"})
        .drop_duplicates(subset=["codigo_grupo"])
    )

    # Attach the institution to each group, then add up per institution.
    institutions_groups = (
        pd.merge(group_institution, rows_by_group, on="codigo_grupo")
        .drop(columns=["codigo_grupo"])
        .groupby(["instituciones"])
        .sum()
    )
    return institutions_groups