-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathaux_functions.py
56 lines (40 loc) · 1.27 KB
/
aux_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import numpy as np
import requests
import lxml.html as html
from bs4 import BeautifulSoup
def get_soup(url):
    """Download *url* and parse the response body with BeautifulSoup.

    The request is sent with a browser-like ``User-Agent`` header and a
    5 second timeout. The body is decoded as UTF-8, falling back to latin1
    when the bytes are not valid UTF-8.

    Args:
        url: Address of the page to fetch.

    Returns:
        ``(soup, status_code)`` on success, where ``soup`` is the
        ``BeautifulSoup`` tree built with the ``"html.parser"`` backend and
        ``status_code`` is the HTTP status of the response.
        ``(1, 1)`` if the request times out or any other error occurs;
        callers must check for this sentinel before using the result.
    """
    try:
        response = requests.get(
            url, headers={"User-Agent": "Mozilla/5.0"}, timeout=5
        )
        try:
            home = response.content.decode("utf-8")
        except UnicodeDecodeError:
            # latin1 maps every byte value, so this fallback cannot fail.
            home = response.content.decode("latin1")
        soup = BeautifulSoup(home, "html.parser")
        return soup, response.status_code
    except requests.exceptions.Timeout:
        print("timeout :(")
        return 1, 1
    except Exception:
        # Deliberate best-effort: any failure yields the (1, 1) sentinel.
        # Catching Exception rather than using a bare ``except`` lets
        # KeyboardInterrupt / SystemExit propagate so a long scraping run
        # can still be interrupted.
        return 1, 1
def get_clean_text(input):
    """Return a lower-cased copy of *input* without basic punctuation.

    The characters ``.``, ``:``, ``;``, ``,`` and ``-`` are removed, and any
    run of two or more spaces (typically left behind by the removed
    punctuation, e.g. ``"a - b"``) is collapsed into a single space.

    Args:
        input: Object exposing the ``.str`` string accessor used below
            (presumably a pandas Series of strings -- confirm with callers).

    Returns:
        A new object of the same kind with the cleaned text. *input* itself
        is not modified, because every ``.str`` method returns a new object.
    """
    lowered = input.str.lower()
    # ``regex`` is passed explicitly: its default changed between pandas
    # versions, and "." is a regex metacharacter, so relying on the default
    # makes the result version-dependent.
    without_punctuation = lowered.str.replace(r"[.:;,\-]", "", regex=True)
    # Collapse runs of spaces in a single pass instead of chaining several
    # fixed-width replacements.
    return without_punctuation.str.replace(r" {2,}", " ", regex=True)
def get_edad(input, i):
    """Return the token that precedes the first "años" in ``input[i]``.

    Args:
        input: Indexable collection whose ``i``-th entry is a sequence of
            tokens (presumably the words of one text -- confirm with
            callers).
        i: Index of the entry to inspect.

    Returns:
        The element located immediately before the first token equal to
        ``"años"``, or ``None`` when no such token exists or when it is the
        very first token (nothing precedes it).
    """
    tokens = input[i]
    # Compute the match positions once; flatnonzero also copes with the
    # comparison degenerating to a scalar.
    matches = np.flatnonzero(np.asarray(tokens) == "años")
    if matches.size == 0:
        return None
    first_match = int(matches[0])
    if first_match == 0:
        # Without this guard the index would be -1 and silently wrap around
        # to the LAST token of the sequence.
        return None
    return tokens[first_match - 1]