clean up and extra unit tests

dexter-psychometrics · Jul 3, 2024 · 616cb99 · 616cb99
1 parent 5a784f6
commit 616cb99
Show file tree

Hide file tree

Showing 25 changed files with 266 additions and 189 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -12,4 +12,3 @@
 ^LICENSE$
 ^CRAN-SUBMISSION$
 ^smoke_testing
-aaa.R$
diff --git a/.gitignore b/.gitignore
@@ -5,7 +5,10 @@
 *.o
 *.so
 *.dll
+*.gcda
+*.gcno
 /revdep/.cache.rds
 tmp
 dexter.Rproj
 docs
+smoke_testing
diff --git a/NEWS.md b/NEWS.md
@@ -1,8 +1,10 @@
 # dexter 1.5.0
 
-* bugfix: previous dexter versions could place the wrong domain name with the profile categories if not all booklets had the same domains
+* bugfix in `profiles()`: correct domain names when different booklets don't have the same domains
 
-* lower autocorrelation for `fit_enorm(..., method='Bayes')`
+* improvements leading to lower autocorrelation for `fit_enorm(..., method='Bayes')`
+
+* `plausible_scores` can now return scores per item to generate complete partially imputed datasets
 
 
 # dexter 1.4.2

diff --git a/R/data_selection.R b/R/data_selection.R
@@ -1,6 +1,3 @@
-# TO DO: rlang has an is_reference method, don't have to use home baked one in cpp
-# test and use
-
 
 #' Selecting data
 #' 

diff --git a/R/dif.R b/R/dif.R
@@ -4,17 +4,19 @@
 PairDIF_ = function(beta1, beta2, acov.beta1, acov.beta2)
 {
   labs = rownames(beta1)
-  DR = kronecker(beta2,t(beta2),FUN="-")-kronecker(beta1,t(beta1),FUN="-") 
+  beta1=drop(beta1);beta2=drop(beta2)
+  DR = outer(beta2,beta2,FUN='-') - outer(beta1,beta1,FUN='-')
   var1 = diag(acov.beta1)
   var2 = diag(acov.beta2)
-  S = (kronecker(var1,t(var1),FUN="+")-2*acov.beta1)+(kronecker(var2,t(var2),FUN="+")-2*acov.beta2)
+  S = outer(var1,var1,FUN='+') - 2*acov.beta1 + outer(var2,var2,FUN='+') -2*acov.beta2
   diag(S) = 1
   D = DR/sqrt(S)
-  colnames(D) = labs; rownames(D)=labs
-  colnames(DR) = labs; rownames(DR)=labs
+  colnames(D) = rownames(D) = labs
+  colnames(DR) = rownames(DR) = labs
   return(list(D=D, Delta_R=DR))
 }
 
+
 ## produces a statistics for overall-DIF
 # beta1 and beta1 are both mean centered and do not contain the zero category. 
 # In general, this works if both sets of parameters have the same normalization.
@@ -177,8 +179,12 @@ print.DIF_stats <- function(x, ...)
 #' The statistics are standard normal deviates and colored to distinguish significant from non-significant values.
 #' If there is no DIF, a proportion alpha off the cells will be colored significant by chance alone.
 #'      
+# experimental, currenlty testing, can use some more finetuning
 plot.DIF_stats = function(x, items = NULL, itemsX = items, itemsY = items, alpha =.05,...)
 {
+  oldpar = par(no.readonly = TRUE)
+  on.exit({par(oldpar)},add=TRUE)
+
   if(is.null(itemsX)) itemsX = sort(unique(x$items$item_id))
   if(is.null(itemsY)) itemsY = sort(unique(x$items$item_id))
 
@@ -215,14 +221,7 @@ plot.DIF_stats = function(x, items = NULL, itemsX = items, itemsY = items, alpha
   }
 
   max_ = max(x$DIF_pair)
-  default.args = list(main = paste(x$group_labels[1],'vs.',x$group_labels[2]),
-                      axes=FALSE, zlim=c(0,max_),xlab='',ylab='',useRaster=TRUE)
-
-  oldpar = par(no.readonly = TRUE)
-  on.exit({par(oldpar)},add=TRUE)
-
-  graphics::layout(matrix(c(1,1,2,0),2,2), widths=c(7,1))
-
+
   qn = qnorm(1-alpha/2)
 
   breaks = seq(0, qn, .05)
@@ -237,11 +236,15 @@ plot.DIF_stats = function(x, items = NULL, itemsX = items, itemsY = items, alpha
 
   # Reverse Y axis
   # yLabels <- rev(yLabels)
-  xLabels <- rev(xLabels)
-  DIF_pair <- DIF_pair[nrow(DIF_pair) : 1,]
+  xLabels = rev(xLabels)
+  DIF_pair = DIF_pair[nrow(DIF_pair) : 1,]
+
+  mgp = par('mgp')
+  mgp[2] = .6 * mgp[2]
+  par(plt = oldpar$plt-c(0,.08,0,0), mgp=mgp, tck=coalesce(par('tck'),-.01))
 
-  # Data Map
-  par(mar = c(6,8,2.5,2))
+  default.args = list(main = paste(x$group_labels[1],'vs.',x$group_labels[2]),
+                      axes=FALSE, zlim=c(0,max_),xlab='',ylab='',useRaster=TRUE)
 
   user.args = list(...)
   do.call(image,
@@ -250,23 +253,25 @@ plot.DIF_stats = function(x, items = NULL, itemsX = items, itemsY = items, alpha
                                          col=col,breaks=breaks),
                          default = default.args))
 
-  cex.axis = coalesce(user.args$cex.axis, 0.6)
+  cex.axis = coalesce(user.args$cex.axis, 0.5)
   axis(1, at=1:length(yLabels), labels=yLabels, las=3, cex.axis=cex.axis, hadj=1,padj=0.5)
   axis(2, at=1:length(xLabels), labels=xLabels, las=1, cex.axis=cex.axis, hadj=1,padj=0.5)
 
   #Color Scale
-  try({
-    par(mar=c(2,2,2.5,2))
-    image(1, seq(0, max(qn,min(20,max_)), by=.05),
+
+  par(new = TRUE, pty = "m", plt = c(.93,.97,.5,oldpar$plt[4]), err = -1,tck=NA)
+
+
+  image(1, seq(0, max(qn,min(20,max_)), by=.05),
           matrix(seq(0, max(qn,min(20,max_)), by=.05),nrow=1),
           col=col,
           breaks=breaks,
           xlab="",ylab="",
-          xaxt="n", axes=FALSE)
-    axis(2, at=0:min(20,max_),lwd=0,lwd.ticks=1,las=2,cex.axis=0.8)
-    
-  }, silent=TRUE)
+          xaxt="n",yaxt="n", axes=FALSE)
+
+  axis(2, at=0:min(20,max_),lwd=0,lwd.ticks=1,las=2,cex.axis=coalesce(user.args$cex.axis, 0.5) * 1.4)
+
 
-  graphics::layout(1)
   invisible(NULL)
 }
+
diff --git a/R/fit_enorm.R b/R/fit_enorm.R
@@ -79,7 +79,6 @@ fit_enorm_ = function(dataSrc, qtpredicate = NULL, fixed_params = NULL, method=c
     } else
     {
       # transform the fixed params to the b parametrization dexter uses internally
-      # TO DO: check that it is ungrouped
       fixed_params = transform.df.parms(fixed_params, out.format = 'b') 
     }  
 
@@ -551,7 +550,6 @@ calibrate_Bayes = function(ss,  nIter, fixed_params=NULL,
     } else
     {
       # different but slightly underdispersed start locations
-      # to do: check this again, 0 madness
       sample_beta = rmvnorm(nchains,cml$beta, cml$acov.beta/2)
 
       b = apply(sample_beta,1,function(beta){ beta2b(ss$ssIS$item_score,beta,ss$ssI$first, ss$ssI$last)})

diff --git a/R/individual_differences.R b/R/individual_differences.R
@@ -87,7 +87,7 @@ theta_score_distribution = function(b,a,first,last,scoretab)
 ## Get the score distribution of a booklet from fit_enorm
 #  based on a polynomial smoothing of the log-lambda's
 #  Currently only implemented for CML
-# TO DO: Implement for Bayes.
+# to~do: not yet for Bayes
 # Check e.g., plot(0:48,log(lambda),col="green"); lines(0:48,log_l_pr)
 # beta = as.numeric(qr$coefficients)[-1]
 # n.obs is the exact observed score distributions if CML

diff --git a/R/latent_cor.R b/R/latent_cor.R
@@ -49,10 +49,7 @@ latent_cor = function(dataSrc, item_property, predicate=NULL, nDraws=500, hpd=0.
   {
     stop("a matrix datasrc is not yet implemented for this function")
   }
-  # to do: this is a bit tricky, we will often need to merge over persons, e.g. if booklets are administered
-  # per subject. But if the same person makes multiple tests for the same subject, this is not good.
-  # Still have to decide on a solution
-  # why not check which is the case?
+  # merge_within_persons might also be user set. But TRUE will be the most useful in nearly all cases
 
   respData = get_resp_data(dataSrc, qtpredicate, env=env, extra_columns=item_property,
                            merge_within_persons=TRUE)

diff --git a/R/misc.R b/R/misc.R
@@ -221,7 +221,7 @@ check_dataSrc = function(x)
   if(is_db(x))
   {
     if(dbIsValid(x)) return(NULL)
-    stop_('your database connection is no longer valid, you need to reconnect. see: ?open_project for details')
+    stop_('your database connection is no longer valid, you need to reconnect. See: ?open_project for details')
   }
 
 

diff --git a/R/misc_anon.R b/R/misc_anon.R
@@ -4,11 +4,6 @@
 # @param setA, setB: two mutually exclusive subsets of items as indexes in first/last
 # @return         a score-by-score matrix of probabilities:
 #                 P(X^A_+=s_a, X^B_+=s_b|X_+=s) where s=s_a+s_b
-
-
-#to do: this has probably changed, does it rely on this?
-# @details        NA's indicate that a total scores was not possible given the weights
-
 # if cIM is not null, the interaction model will be used
 # cIM should be per score
 SSTable = function(b, a, first, last,setA, setB, cIM_score=NULL)

diff --git a/R/plausible_scores.R b/R/plausible_scores.R
@@ -18,7 +18,7 @@
 #' @param items vector of item_id's, this specifies the itemset to generate the testscores for. If \code{items} is \code{NULL} 
 #' all items occurring in \code{dataSrc} are used.
 #' @param covariates name or a vector of names of the variables to group the population, used to update the prior.
-#' A covariate must be a discrete person covariate (e.g. not a float) that indicates nominal categories, e.g. gender or school
+#' A covariate must be a discrete person covariate that indicates nominal categories, e.g. gender or school
 #' If dataSrc is a data.frame, it must contain the covariate.
 #' @param keep.observed If responses to one or more of the items have been observed,
 #' the user can choose to keep these observations or generate new ones. 
@@ -104,9 +104,19 @@ plausible_scores = function(dataSrc, parms=NULL, predicate=NULL, items=NULL, par
     bstep = 0L
   }
 
-
   if(by_item)
   {
+    if(merge_within_persons)
+    {
+      pp = tibble(old_person_id=pv$person_id,booklet_id=pv$booklet_id, person_id=1:nrow(pv))
+      pv$person_id = pp$person_id
+
+      respData$x = respData$x |> 
+        rename(old_person_id='person_id') |>
+        inner_join(pp,by=c('booklet_id','old_person_id')) |>
+        select(-'old_person_id')
+    }
+
     pv = droplevels(pv)
     if(is.unsorted(pv$person_id)) pv = arrange(pv,.data$person_id)
 
@@ -141,7 +151,7 @@ plausible_scores = function(dataSrc, parms=NULL, predicate=NULL, items=NULL, par
           mutate(present=coalesce(.data$present,FALSE)) |>
           arrange(.data$item_id) |>
           mutate(cnt=cumsum(as.integer(!.data$present))*np) |>
-          filter(present)
+          filter(.data$present)
 
         m = as.integer(m[unlist(w)])
 
@@ -153,11 +163,19 @@ plausible_scores = function(dataSrc, parms=NULL, predicate=NULL, items=NULL, par
         x[[sprintf('PS%i',i)]][w] = m
       }
     }
+
     if(!is.null(covariates))
       pv = inner_join(select(pv,all_of(c('person_id',covariates))), x, by='person_id')
     else
       pv = x
 
+    if(merge_within_persons)
+    {
+      pv = inner_join(pv,pp,by='person_id') |>
+        select(-'person_id') |>
+        mutate(person_id='old_person_id') 
+    }
+
   } else
   {
     if(keep.observed && any(respData$design$item_id %in% items))
@@ -207,7 +225,7 @@ plausible_scores = function(dataSrc, parms=NULL, predicate=NULL, items=NULL, par
       }
     }
   }
-  
+
   pv |>
     select(-any_of('booklet_score')) |>
     rename_with(gsub, pattern='^PV(?=\\d+$)',replacement='PS', perl=TRUE)  |>

diff --git a/R/plausible_values.R b/R/plausible_values.R
@@ -297,11 +297,8 @@ plausible_values_ = function(dataSrc, parms=NULL, qtpredicate=NULL, covariates=N
 
   } else
   {
-    # TO DO: ik vind dit onleesbaar: is de check nou dat alle parms er in moetne zitten of dat
-    # er geen scores zijn die neit door parms gedekt worden? Tweede toch?
     if(inherits(parms,'data.frame'))
     {
-      #parms = transform.df.parms(parms,'b')
       pcheck = parms[,c('item_id','item_score')]
     } else
     {

diff --git a/R/resp_data.R b/R/resp_data.R
@@ -1,5 +1,4 @@
 
-
 # faster factor, does not handle or check for NA values
 ffactor = function (x, levels=NULL, as_int=FALSE) 
 {