-
Notifications
You must be signed in to change notification settings - Fork 0
/
2 - GeraOutlier.R
51 lines (35 loc) · 1.51 KB
/
2 - GeraOutlier.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# IDENTIFICATION OF POSSIBLE OUTLIERS
# Load the required libraries
library(tidyverse)
library(magrittr)
# Format the data: give both input tables their working column names.
# Names are assigned by position, so the column order of NBS and EMPRESAS
# must match the vectors below.
NBS <- setNames(NBS, c("cd_nbs", "desc_nbs", "vlr_tot_nbs"))
EMPRESAS <- setNames(
  EMPRESAS,
  c(
    "cd_nbs", "desc_nbs", "cpf_cnpj", "cpf_cnpj/nome",
    "vlr_tot_emp", "vlr_dia_med_emp"
  )
)
# Add the following per-NBS columns to NBS, computed from EMPRESAS:
#   vlr_dia_med_nbs - mean of vlr_dia_med_emp within each cd_nbs
#   desv_pdr_nbs    - standard deviation of vlr_dia_med_emp within each cd_nbs
# NA values are deliberately not removed, so a group containing an NA
# yields NA statistics.
nbs <- summarise(
  group_by(EMPRESAS, cd_nbs),
  vlr_dia_med_nbs = mean(vlr_dia_med_emp, na.rm = FALSE),
  desv_pdr_nbs = sd(vlr_dia_med_emp, na.rm = FALSE)
)
# merge() with its defaults keeps only the cd_nbs codes found in both tables.
NBS <- merge(x = NBS, y = nbs, by = "cd_nbs")
# Drop the temporary summary table.
rm(nbs)
# Add the following columns to EMPRESAS:
#   vlr_tot_nbs, vlr_dia_med_nbs, desv_pdr_nbs - brought in from NBS by cd_nbs
#   z_norm     = (vlr_dia_med_emp - vlr_dia_med_nbs) / desv_pdr_nbs
#   part       = vlr_tot_emp / vlr_tot_nbs
#   maior_0.10 = part > 0.1 & vlr_tot_nbs > 1000000
#   avalia     = part > 0.9 | z_norm > 2
# The right join keeps every EMPRESAS row. desc_nbs is dropped from NBS first
# because EMPRESAS already carries a desc_nbs column of its own.
EMPRESAS <- NBS %>%
  select(-desc_nbs) %>%
  right_join(EMPRESAS, by = "cd_nbs") %>%
  mutate(
    # Positive z_norm means the company is above its NBS group mean.
    z_norm = (vlr_dia_med_emp - vlr_dia_med_nbs) / desv_pdr_nbs,
    part = vlr_tot_emp / vlr_tot_nbs,
    # The comparisons below are already logical vectors and propagate NA the
    # same way if_else(cond, TRUE, FALSE) would, so no wrapper is needed.
    maior_0.10 = part > 0.1 & vlr_tot_nbs > 1000000,
    avalia = part > 0.9 | z_norm > 2
  )
# Select probable outliers: rows flagged by both avalia and maior_0.10.
# filter() drops rows where the combined condition is NA.
provaveis_outliers <- filter(EMPRESAS, avalia & maior_0.10)