-
Notifications
You must be signed in to change notification settings - Fork 0
/
sysCriterion_utils.R
223 lines (177 loc) · 7.56 KB
/
sysCriterion_utils.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
############################################################################################################
# SYSTEMATIC TUNING CRITERION UTILS
# Set of functions used to quantify the systematic tuning criterion. Specifically, these functions run
# the bootstrapping analyses to determine that probability that a factor with a given number of consecutive
# loadings over a set threshold could have emerged by chance. An algorithm for finding a stastically
# significant solution for a given dataset is also included.
#
# 1. sysFactors <- runs the bootstrapping analysis to determine that probability that a factor with a given
# number of consecutive loadings over a set threshold could have emerged by chance
#
# 2. sysSolution <- algorithm that finds a stastically significant solution for a given dataset
#
#############################################################################################################
# PARAMETERS (sysFactors):
#
# n <- # of samples
# p <- # of variables
# fm <- method of estimation for factor model
# rotate <- type of rotation for factor model
# nF <- # of factors to extract
# ldgTh <- loading threshold
# runTh < # of consecutive loadings
# iterations <- # of iteration for bootstrapping
#
# NOTE: (n,p,fm,rotate) should match the original dataset as closely as possible
sysFactors <- function(n,p,fm,rotate,nF,ldgTh,runTh,iterations=1000){
allSys = as.numeric()
for (i in 1:iterations){
# correlation matrix of random data
corMat <- cor(matrix(rnorm(n*p),n),use = "pairwise.complete.obs")
# factor analysis
if (fm == 'pc'){
factorSolution <- principal(corMat,nfactors=nF,residuals=TRUE,rotate=rotate,n.obs=n,eps=1e-14)
} else {
factorSolution = fa(corMat,nF,n.obs=n,fm=fm,rotate=rotate,residuals=TRUE,scores="regression")
}
sig_loadings = matrix(as.numeric(abs(factorSolution$loadings) > ldgTh),ncol=nF)
#take a count of the number of 'systematic factors'
sysFactors = as.numeric()
for (factor in 1:nF){
#find runs of loadings greater than threshold
runs = rle(sig_loadings[,factor])
vals = as.numeric(which((runs$values == 1) == TRUE))
lens = as.numeric(which((runs$lengths >= runTh) == TRUE))
ans = sum(as.numeric(vals %in% lens))
#check runs
if (ans > 0){
runIDX = lens[lens %in% vals]
initIDX = sum(runs$lengths[1:runIDX-1])+1
runLoadings = factorSolution$loadings[initIDX:(initIDX+runs$lengths[runIDX]-1),factor]
#redo calc of runs based on sign of loadings
sign_runs = rle(sign(runLoadings))
sr = sum(sign_runs$lengths >= runTh)
if (sr > 0){
sysFactors[factor] = 1
} else {
sysFactors[factor] = 0
}
} else{
sysFactors[factor] = 0
}
}
allSys[i] = sum(sysFactors)
}
# calculate and return p-value for the given loading and consecutive loading thresholds
sysFreq = sum(allSys)/(iterations*nF)
return(sysFreq)
}
######################################################################################################################
# PARAMETERS (sysSolution):
#
# n <- # of samples
# faMat <- data correlation/covariance matrix
# fm <- method of estimation for factor model
# rotate <- type of rotation for factor model
# initNF <- initial number of factors (Kaiser rule unless otherwise specified)
# runLim <- limit for number of consecutive loadings to search for in solution
# iterations <- # of iteration for bootstrapping
#
# NOTE: (n,fm,rotate) should match the original solution
sysSolution <- function(n,faMat,fm,rotate="varimax",initF = 0,runLim=4,iterations=1000){
#unless otherwise specified, nF will be initialized as the number of eigenvalues greater than 1
# (aka greater than average)
if (initF == 0){
eigs <- eigen(faMat)
nLambda <- sum(eigs$values > 1)
nF <- nLambda
} else {
nF = initF
}
runTh = 2
pSys = 1
while (pSys >= 0.05){
#compute factor model with given parameters
if (fm == 'pc'){
if (rotate == "varimax"){
faNF <- principal(faMat,nfactors=nF,residuals=TRUE,rotate=rotate,scores=FALSE,n.obs=n,eps=1e-14)
} else {
faNF <- principal(faMat,nfactors=nF,residuals=TRUE,rotate=rotate,n.obs=n,scores=FALSE)
}
} else if (fm == 'mle'){
faNF = fa(faMat,nF,n.obs=n,fm=fm,rotate=rotate,residuals=TRUE)
}
#find highest systematic loadings per factor
sysLoadings = matrix(list(),nrow=nF,ncol=1)
for (factor in 1:nF) {
ldgs = faNF$loadings[,factor]
maxn = sort(abs(ldgs),decreasing=TRUE)
idx = 1
while(sum(sysLoadings[[factor]]) == 0){
max_loadings = (ldgs >= maxn[idx]) + (ldgs <= -maxn[idx])
runs = rle(max_loadings)
#check if high loadings are part of systematic sequence
if (sum(runs$lengths[which(runs$values == 1)] >= runTh) > 0) {
cumIDX = cumsum(runs$lengths)
runIDX = which(runs$values == 1)[runs$lengths[which(runs$values == 1)] >= runTh]
#find systematic sequence with max loadings
eval = matrix(list(),nrow=length(runIDX),ncol=1)
for (r in 1:length(runIDX)){
if (runIDX[r] > 1){
eval[[r]] = abs(ldgs[(cumIDX[runIDX[r]-1]+1):cumIDX[runIDX[r]]])
} else {
eval[[r]] = abs(ldgs[1:cumIDX[1]])
}
}
whichsys = matrix(0,nrow=1,ncol=r)
for (r in 1:length(eval)){
if (sum(eval[[r]] == max(unlist(eval)))){
whichsys[r] = 1
}
}
sysIDX = which(whichsys == 1)
if (runIDX[sysIDX] > 1){
sysLoadings[[factor]] = ldgs[(cumIDX[runIDX[sysIDX]-1]+1):cumIDX[runIDX[sysIDX]]]
} else {
sysLoadings[[factor]] = ldgs[1:cumIDX[1]]
}
} else {
idx = idx + 1
}
}
}
#evaluate the chance probability of the loading and run thresholds that characterize the current model
#as long as runs for all factors exist, otherwise automatically search for different model
if (sum(which(lengths(sysLoadings) < runTh)) == 0) {
ldgTh = min(abs(unlist(sysLoadings)))
pSys = sysFactors(dim(X)[1],dim(X)[2],fm,rotate,nF,ldgTh,runTh,iterations)
} else {
pSys = 1
}
#if this prob is too high, add a run until limit, then decrease number of factors
if (pSys > 0.05){
runTh = runTh + 1
if (runTh > runLim){
runTh = 2
nF = nF - 1
}
}
}
output = matrix(list(),nrow=5,ncol=1)
output[[1]] = round(nF)
output[[2]] = pSys
output[[3]] = ldgTh
output[[4]] = round(runTh)
output[[5]] = sysLoadings
output = data.frame(output,row.names=c('# of factors', 'pval', 'loading thresh','consecutive loadings','sys loadings'))
return(output)
}
# OUTPUT structure describing the final and statistically significant solution
#
# output[[1,1]] = # of factors
# output[[2,1]] = p-value
# output[[3,1]] = loading threshold
# output[[4,1]] = consecutive loading threshold
# output[[5,1]] = loadings for each factor that met the systematic tuning criterion
#
##############################################################################################################