-
Notifications
You must be signed in to change notification settings - Fork 0
/
Funcion palabras.R
45 lines (38 loc) · 1.35 KB
/
Funcion palabras.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# Keyword-matching functions
# Functions
# Look up the indicator whose keywords match a free-text description.
#
# Reads 'Inputs/keywords_help.xlsx' on every call, keeps the rows belonging to
# the requested division, and then keeps the rows whose comma-separated
# KEYWORDS contain a term found in `text` (both sides are lower-cased first).
#
# Args:
#   text:     Character string to search for keywords.
#   division: Division value used to restrict the keyword table.
#
# Returns:
#   The INDICATOR value when exactly one row matches; NA otherwise (no match,
#   or an ambiguous match on several rows).
nearest_result <- function(text, division) {
  # ToDo() - Basic NLP clean encoding - remove special characters, etc
  text <- tolower(text)
  matches <- read_xlsx('Inputs/keywords_help.xlsx') %>%
    # `.env$division` is the function argument. A bare `division` on both
    # sides would compare a name with itself and never filter anything.
    # NOTE(review): assumes the sheet has a `division` column - confirm.
    filter(division == .env$division) %>%
    # Todo() create vector of similarity
    # Replace every comma (not just the first) so the whole keyword list
    # becomes a regex alternation.
    filter(str_detect(text, str_to_lower(str_replace_all(KEYWORDS, ',', '|')))) %>%
    select(INDICATOR) %>%
    pull()
  # Case without match (or with more than one match): report NA
  if (length(matches) != 1) {
    matches <- NA
  }
  matches
}
# Normalise text for keyword comparison.
#
# Lower-cases the input, trims leading/trailing whitespace and transliterates
# it with the "Latin-ASCII" transform.
#
# Args:
#   text: Character vector to normalise.
#
# Returns:
#   The normalised character vector, returned visibly (the function used to
#   end on an assignment, which made the result invisible).
clean_text <- function(text) {
  text <- str_trim(str_to_lower(text))
  stri_trans_general(text, id = "Latin-ASCII")
}
# Turn a comma-separated keyword list into a regex alternation pattern.
#
# Steps: drop digits and the standalone word " y ", normalise the whitespace
# around commas, turn commas into "|", normalise the text with clean_text(),
# collapse runs of whitespace, and finally remove empty alternatives.
#
# Args:
#   text: Character vector of comma-separated keywords.
#
# Returns:
#   Character vector of patterns such as "agua|luz", suitable as the
#   `pattern` argument of str_detect().
clean_detect <- function(text) {
  # NOTE(review): " y " is replaced by "" (not by a space), which joins the
  # neighbouring words; kept as-is - confirm this is intended.
  text <- str_replace_all(text, regex("([0-9]+)|(\\sy\\s)"), "")
  # Strip whitespace on BOTH sides of every comma ("a , b" -> "a,b").
  text <- str_replace_all(text, regex("\\s*,\\s*"), ",")
  text <- str_replace_all(text, ",", "|")
  text <- clean_text(text)
  text <- str_replace_all(text, regex("\\s+"), " ")
  # An empty alternative ("a||b", "a|") matches every string, so drop the
  # empties left by trailing commas or by keywords that were only digits.
  text <- str_replace_all(text, regex("\\|{2,}"), "|")
  text <- str_replace_all(text, regex("^\\||\\|$"), "")
  text
}
# Return the values of `column` for keyword rows that match a text.
#
# Reads 'Inputs/keywords_help.xlsx' on every call, keeps the rows of the
# requested division, then the rows whose cleaned KEYWORDS pattern
# (clean_detect) is found in the cleaned `text` (clean_text).
#
# Args:
#   division: Division value; it is injected with {{ }} and compared with the
#             name `division` evaluated inside the table.
#             NOTE(review): presumably a `division` column of the sheet -
#             confirm.
#   text:     Character string to search for keywords.
#   column:   Unquoted column to extract from the matching rows.
#
# Returns:
#   Vector with the selected column for every matching row, or NA when no
#   row matches.
nearest_match <- function(division, text, column) {
  # ToDo() - Basic NLP clean encoding - remove special characters, etc
  keyword_table <- read_xlsx('Inputs/keywords_help.xlsx')

  # Keep the two filters separate so keyword patterns are only evaluated on
  # rows of the requested division.
  found <- keyword_table %>%
    filter({{ division }} == division) %>%
    # Todo() create vector of similarity
    filter(str_detect(clean_text(text), clean_detect(KEYWORDS))) %>%
    select({{ column }}) %>%
    pull()

  # Case without match
  if (length(found) == 0L) {
    return(NA)
  }
  found
}