-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathPCA.R
98 lines (75 loc) · 3.88 KB
/
PCA.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#
# PCA.R
# Obesity Risk
#
# Created by Diego Revilla on 26/04/24
# Copyright � 2024 . All Rights reserved
#
library(FactoMineR)
library(plotrix)
#Choose the six variables to do Principal Component Analysis
pca_dataset <- obesity_data %>% select(Age, Height, Weight, FAF, FCVC, NCP)
#Normalize the dataset
pca_dataset <- scale(pca_dataset)
#multiply the dataset data to match the corrected standar deviation with the uncorrected standard deviation
#, with a difference of sqrt(5/6)
pca_dataset <- pca_dataset * sqrt(5/6)
#calculate the correlation matrix
correlation_matrix <- cor(pca_dataset)
#compute the eigenvectors of the correlation matrix
eigen_vec <- eigen(correlation_matrix)
# The relative variance associated to each eigenvector is the eigenvalue divided by the sum of eigenvalues
variance_table <- cbind(eigen_vec$values,eigen_vec$values/sum(eigen_vec$values),
cumsum(eigen_vec$values/sum(eigen_vec$values)))
#We note that the variable that contributes the less is the lst one, with only a 7% of description of the total variance
#Compute the coordinates for all variables
coordinates <- t(solve(eigen_vec$vectors)%*%t(pca_dataset))
plot(coordinates[,1],coordinates[,2],xlab="Component 1 (27,89%)",ylab="Component 2 (21,78%)",
xlim=c(-4,4),ylim=c(-2.5,2.5),pch=16)
text(coordinates[,1],coordinates[,2]+0.4,labels = row.names(pca_dataset),cex=1)
abline(h=0,v=0,lty=2,col="lightgray")
# Calculate correlations between coordinates and each column of pca_dataset
correlations <- sapply(1:ncol(pca_dataset), function(j) {
c(cor(coordinates[, 1], pca_dataset[, j]), cor(coordinates[, 2], pca_dataset[, j]))
})
# Convert correlations to a matrix and assign column names
correlations <- as.data.frame(t(correlations))
colnames(correlations) <- c("Correlation_Coord1", "Correlation_Coord2")
# Assign row names to correlations
rownames(correlations) <- colnames(pca_dataset)
plot(correlations,xlim=c(-1,1),ylim=c(-1,1),pch=16,cex=0.3,
xlab="Component 1 (27,89%)",ylab="Component 2 (21,78%)",asp=1)
draw.circle(0,0,radius=1)
arrows(x0 = 0,y0 = 0,x1 = correlations[,1],y1=correlations[,2],length=0.1)
abline(h=0,v=0,lty=2)
text(correlations[,1]+0.1,correlations[,2]+0.1,labels = row.names(correlations),cex=1)
# Overlay the scatter plot of individual data points
plot(correlations, xlim = c(-3, 3), ylim = c(-2, 2), pch = 16, cex = 0.3,
xlab = "Component 1 (67.77%)", ylab = "Component 2 (19.05%)", asp = 1)
points(coordinates[, 1], coordinates[, 2], pch = 16, col = "lightgray")
text(coordinates[, 1], coordinates[, 2] + 0.4, labels = row.names(pca_dataset), cex = 0.8)
abline(h = 0, v = 0, lty = 2, col = "lightgray")
# Create the scatter plot with vectors
draw.circle(0, 0, radius = 1)
arrows(x0 = 0, y0 = 0, x1 = correlations[, 1], y1 = correlations[, 2], length = 0.1)
abline(h = 0, v = 0, lty = 2)
text(correlations[, 1] + 0.1, correlations[, 2] + 0.1, labels = row.names(correlations), cex = 0.6)
# Optionally, add a title
title(main = "PCA Analysis: Individuals and Vectors")
# Influential #
# Influential individuals or variables can be detected by computing their contribution to the coordinates and correlations
# Individual contribution
indcontr1 <- (coordinates[,1]^2)/sum(coordinates[,1]^2)
indcontr2 <- (coordinates[,2]^2)/sum(coordinates[,2]^2)
# Variable contribution
varcontr1 <- (correlations[,1]^2)/sum(correlations[,1]^2)
varcontr2 <- (correlations[,2]^2)/sum(correlations[,2]^2)
pca1 <- PCA(pca_dataset,scale.unit = TRUE)
variable_correlation <- (pca1$var$cor)^2
plot(variable_correlation,xlim=c(-1,1),ylim=c(-1,1),pch=16,cex=0.3,
xlab="Component 1 (27,89%)",ylab="Component 2 (21,78%)",asp=1)
draw.circle(0,0,radius=1)
arrows(x0 = 0,y0 = 0,x1 = variable_correlation[,1],y1=variable_correlation[,2],length=0.1)
abline(h=0,v=0,lty=2)
text(variable_correlation[,1]+0.1,variable_correlation[,2]+0.1,labels = row.names(variable_correlation),cex=1)
title(main = "PCA Analysis: PCA Final Analysis")