diff --git a/.Rbuildignore b/.Rbuildignore index eae8d05..78d9077 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -8,3 +8,5 @@ vignettes Makefile inst/simulation-study/ ^\.github$ +^docker$ +^\.vscode$ diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json new file mode 100644 index 0000000..1793a56 --- /dev/null +++ b/.vscode/c_cpp_properties.json @@ -0,0 +1,20 @@ +{ + "configurations": [ + { + "name": "Linux", + "includePath": [ + "${workspaceFolder}/**", + "/usr/lib/R/site-library/cpp11/include", + "/usr/lib/R/site-library/Rcpp/include", + "/usr/share/R/include" + ], + "defines": [], + "compilerPath": "/usr/bin/gcc", + "cStandard": "c17", + "cppStandard": "gnu++17", + "intelliSenseMode": "linux-gcc-x64", + "configurationProvider": "ms-vscode.makefile-tools" + } + ], + "version": 4 +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..2727130 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,56 @@ +{ + "files.associations": { + "array": "cpp", + "atomic": "cpp", + "bit": "cpp", + "*.tcc": "cpp", + "bitset": "cpp", + "cctype": "cpp", + "clocale": "cpp", + "cmath": "cpp", + "compare": "cpp", + "concepts": "cpp", + "cstdarg": "cpp", + "cstddef": "cpp", + "cstdint": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "cwctype": "cpp", + "deque": "cpp", + "map": "cpp", + "unordered_map": "cpp", + "vector": "cpp", + "exception": "cpp", + "algorithm": "cpp", + "functional": "cpp", + "iterator": "cpp", + "memory": "cpp", + "memory_resource": "cpp", + "numeric": "cpp", + "optional": "cpp", + "random": "cpp", + "regex": "cpp", + "string": "cpp", + "string_view": "cpp", + "system_error": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "utility": "cpp", + "initializer_list": "cpp", + "iosfwd": "cpp", + "iostream": "cpp", + "istream": "cpp", + "limits": "cpp", + "new": "cpp", + "numbers": "cpp", + "ostream": "cpp", + "ranges": "cpp", + "sstream": "cpp", + "stdexcept": "cpp", + "streambuf": "cpp", + "typeinfo": "cpp" + } +} \ No newline at end of file diff --git a/NAMESPACE b/NAMESPACE index 201d45a..6a7a69a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,6 +8,7 @@ export(add_geese) export(conditional_prob) export(geese_mcmc) export(geese_mle) +export(get_annotations) export(get_probabilities) export(get_sequence) export(get_support) diff --git a/R/RcppExports.R b/R/RcppExports.R index 397bc58..6eb3aa0 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -174,6 +174,11 @@ get_support <- function(p) { .Call(`_geese_get_support`, p) } +#' @export +get_annotations <- function(p) { + .Call(`_geese_get_annotations_cpp`, p) +} + #' @title Evolutionary terms #' @description Model terms for both [geese] and [flock] objects. #' @export diff --git a/README.md b/README.md index bf8cfc6..27d882c 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,12 @@ annotations <- replicate(n * 2 - 1, c(9, 9), simplify = FALSE) set.seed(31) tree <- aphylo::sim_tree(n)$edge - 1L -duplication <- rep(TRUE, n * 2 - 1) +# Sorting by the second column +tree <- tree[order(tree[, 2]), ] + +duplication <- sample.int( + n = 2, size = n * 2 - 1, replace = TRUE, prob = c(.4, .6) + ) == 1 # Reading the data in amodel <- new_geese( @@ -72,32 +77,53 @@ amodel <- new_geese( ) # Preparing the model -term_gains(amodel, 0:1) -term_loss(amodel, 0:1) -term_maxfuns(amodel, 1, 1) +term_gains(amodel, 0:1, duplication = 1) +term_loss(amodel, 0:1, duplication = 1) +term_gains(amodel, 0:1, duplication = 0) +term_loss(amodel, 0:1, duplication = 0) +term_maxfuns(amodel, 0, 1, duplication = 2) init_model(amodel) #> Initializing nodes in Geese (this could take a while)... #> ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| done. # Testing params <- c( - # Gains + # Gains spe 2, 1.5, # Loss -2, -1.5, + # Gains spe + -2, -1, + # Loss spe + -4, -4, # Max funs 2, # Root probabilities -10, -10 ) -names(params) <- c("gain0", "gain1", "loss0", "loss1", "onefun", "root0", "root1") +names(params) <- c( + "gain0 dupl", "gain1 dupl", + "loss0 dupl", "loss1 dupl", + "gain0 spe", "gain1 spe", + "loss0 spe", "loss1 spe", + "onefun", + "root0", "root1" + ) likelihood(amodel, params*0) # Equals 1 b/c all missings #> [1] 1 # Simulating data -fake1 <- sim_geese(p = amodel, par = params, seed = 1110) +fake1 <- sim_geese(p = amodel, par = params, seed = 212) fake2 <- sim_geese(p = amodel, par = params) + +# Removing interior node data +is_interior <- which(tree[,2] %in% tree[,1]) +is_leaf <- which(!tree[,2] %in% tree[,1]) +# for (i in is_interior) { +# fake1[[i]] <- rep(9, 2) +# fake2[[i]] <- rep(9, 2) +# } ``` We can now visualize either of the annotations using the @@ -132,11 +158,11 @@ amodel <- new_geese( ) # Adding the model terms -term_gains(amodel, 0:1) -term_loss(amodel, 0:1) -term_maxfuns(amodel, 1, 1) - -# We need to initialize to do all the accounting +term_gains(amodel, 0:1, duplication = 1) +term_loss(amodel, 0:1, duplication = 1) +term_gains(amodel, 0:1, duplication = 0) +term_loss(amodel, 0:1, duplication = 0) +term_maxfuns(amodel, 0, 1, duplication = 2) init_model(amodel) #> Initializing nodes in Geese (this could take a while)... #> ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| done. @@ -146,33 +172,92 @@ print(amodel) #> INFO ABOUT PHYLOGENY #> # of functions : 2 #> # of nodes [int; leaf] : [99; 100] -#> # of ann. [zeros; ones] : [72; 128] -#> # of events [dupl; spec] : [99; 0] +#> # of ann. [zeros; ones] : [83; 117] +#> # of events [dupl; spec] : [43; 56] #> Largest polytomy : 2 #> #> INFO ABOUT THE SUPPORT #> Num. of Arrays : 396 -#> Support size : 4 +#> Support size : 8 #> Support size range : [10, 10] #> Transform. Fun. : no -#> Model terms (5) : +#> Model terms (9) : #> - Gains 0 at duplication #> - Gains 1 at duplication #> - Loss 0 at duplication #> - Loss 1 at duplication -#> - Genes with [1, 1] funs at duplication +#> - Gains 0 at speciation +#> - Gains 1 at speciation +#> - Loss 0 at speciation +#> - Loss 1 at speciation +#> - Genes with [0, 1] funs # Finding MLE ans_mle <- geese_mle(amodel, hessian = TRUE) +ans_mle +#> $par +#> [1] 2.327179 1.553591 -1.729575 -1.833682 -1.590516 -1.119200 -3.823851 +#> [8] -2.864298 1.982499 -1.465843 4.366549 +#> +#> $value +#> [1] -109.7751 +#> +#> $counts +#> function gradient +#> 1002 NA +#> +#> $convergence +#> [1] 1 +#> +#> $message +#> NULL +#> +#> $hessian +#> [,1] [,2] [,3] [,4] [,5] +#> [1,] -4.206819071 0.5959524394 0.8862856191 -1.721987651 -1.503185e-01 +#> [2,] 0.595952439 -5.1501119600 -2.3668333888 2.589829847 2.739261e-02 +#> [3,] 0.886285619 -2.3668333888 -6.9892574608 1.273369396 9.894125e-03 +#> [4,] -1.721987651 2.5898298475 1.2733693957 -5.950797128 -3.604817e-02 +#> [5,] -0.150318497 0.0273926144 0.0098941246 -0.036048174 -1.372080e+00 +#> [6,] 0.020065547 -0.0867748664 -0.0605347061 0.373968106 4.557307e-02 +#> [7,] 0.238633328 -0.0203662864 -0.2858568173 0.088855117 -5.867636e-02 +#> [8,] -0.169421696 0.5298916008 0.1330584354 -0.704884567 2.255319e-01 +#> [9,] 2.314439284 4.1601766227 -3.0270645492 -5.257577271 6.883251e-01 +#> [10,] -0.020862576 -0.0004507292 -0.0234848407 0.008509286 1.834480e-02 +#> [11,] 0.000175195 -0.0036292320 -0.0001882725 -0.003219606 2.817835e-05 +#> [,6] [,7] [,8] [,9] [,10] +#> [1,] 0.0200655474 0.238633328 -0.1694216962 2.314439e+00 -2.086258e-02 +#> [2,] -0.0867748664 -0.020366286 0.5298916008 4.160177e+00 -4.507292e-04 +#> [3,] -0.0605347061 -0.285856817 0.1330584354 -3.027065e+00 -2.348484e-02 +#> [4,] 0.3739681063 0.088855117 -0.7048845667 -5.257577e+00 8.509286e-03 +#> [5,] 0.0455730742 -0.058676356 0.2255319007 6.883251e-01 1.834480e-02 +#> [6,] -1.7555584648 0.187628157 0.5698203367 1.306991e+00 2.208491e-04 +#> [7,] 0.1876281566 -1.111934470 0.0777368676 -1.058568e+00 -1.325889e-02 +#> [8,] 0.5698203367 0.077736868 -2.5204264773 -2.774906e+00 7.558960e-03 +#> [9,] 1.3069908000 -1.058567779 -2.7749056741 -1.941377e+01 -1.233878e-02 +#> [10,] 0.0002208491 -0.013258886 0.0075589597 -1.233878e-02 -6.093654e-03 +#> [11,] -0.0005919283 -0.000109976 0.0001258655 4.454019e-04 -3.267786e-05 +#> [,11] +#> [1,] 1.751950e-04 +#> [2,] -3.629232e-03 +#> [3,] -1.882725e-04 +#> [4,] -3.219606e-03 +#> [5,] 2.817835e-05 +#> [6,] -5.919283e-04 +#> [7,] -1.099760e-04 +#> [8,] 1.258655e-04 +#> [9,] 4.454019e-04 +#> [10,] -3.267786e-05 +#> [11,] -9.352519e-04 # Prob of each gene gaining a single function transition_prob( amodel, - params = c(-1, -1, -2, -2, -.5), + params = rep(0, nterms(amodel) - nfuns(amodel)), duplication = TRUE, state = c(FALSE, FALSE), array = matrix(c(1, 0, 0, 1), ncol=2) ) -#> [1] 0.01990333 +#> [1] 0.0625 ``` ## Model fitting MCMC @@ -182,14 +267,26 @@ set.seed(122) ans_mcmc <- geese_mcmc( amodel, nsteps = 20000, - kernel = fmcmc::kernel_ram(warmup = 2000), - prior = function(p) dlogis(p, scale = 2, log = TRUE) - ) + kernel = fmcmc::kernel_ram(warmup = 5000), + prior = function(p) c( + dlogis( + p, + scale = 4, + location = c( + rep(0, nterms(amodel) - nfuns(amodel)), + rep(-5, nfuns(amodel)) + ), + log = TRUE + ) + )) ``` We can take a look at the results like this: + + #> @@ -201,65 +298,82 @@ style="width:100.0%" /> #> 1. Empirical mean and standard deviation for each variable, #> plus standard error of the mean: #> - #> Mean SD Naive SE Time-series SE - #> Gains 0 at duplication 2.5163 1.5272 0.021595 0.17972 - #> Gains 1 at duplication 2.0925 1.2076 0.017077 0.13709 - #> Loss 0 at duplication -1.7518 0.6079 0.008596 0.05062 - #> Loss 1 at duplication -1.4358 0.6546 0.009257 0.04587 - #> Genes with [1, 1] funs at duplication 1.8845 0.4698 0.006643 0.03823 - #> Root 1 0.2000 4.5845 0.064828 0.84884 - #> Root 2 -0.4791 3.3166 0.046900 0.49277 + #> Mean SD Naive SE Time-series SE + #> Gains 0 at duplication 2.9015 0.8051 0.011385 0.09034 + #> Gains 1 at duplication 1.6914 0.5653 0.007994 0.04934 + #> Loss 0 at duplication -2.0287 0.5349 0.007563 0.05280 + #> Loss 1 at duplication -1.8866 0.6442 0.009110 0.08533 + #> Gains 0 at speciation -12.1932 3.5435 0.050107 1.15176 + #> Gains 1 at speciation -0.1454 0.6609 0.009345 0.06815 + #> Loss 0 at speciation -2.9909 0.5184 0.007331 0.04458 + #> Loss 1 at speciation -5.1655 1.9408 0.027444 0.39515 + #> Genes with [0, 1] funs 2.2578 0.4569 0.006461 0.06265 + #> Root 1 -1.0470 3.0807 0.043564 0.94842 + #> Root 2 -4.2756 4.2474 0.060061 1.59284 #> #> 2. Quantiles for each variable: #> - #> 2.5% 25% 50% 75% 97.5% - #> Gains 0 at duplication 0.3490 1.484 2.22730 3.187 6.9267 - #> Gains 1 at duplication 0.3154 1.258 1.89930 2.630 5.2817 - #> Loss 0 at duplication -2.9349 -2.191 -1.72650 -1.306 -0.6860 - #> Loss 1 at duplication -2.7509 -1.892 -1.44436 -0.984 -0.1482 - #> Genes with [1, 1] funs at duplication 1.0690 1.534 1.86367 2.195 2.8358 - #> Root 1 -8.7570 -2.336 0.03266 2.579 10.9341 - #> Root 2 -6.9796 -2.679 -0.45639 1.848 5.8369 + #> 2.5% 25% 50% 75% 97.5% + #> Gains 0 at duplication 1.4054 2.3030 2.8777 3.4337 4.5624 + #> Gains 1 at duplication 0.5451 1.3327 1.7001 2.0905 2.7559 + #> Loss 0 at duplication -3.0657 -2.3764 -2.0460 -1.6762 -0.9765 + #> Loss 1 at duplication -3.1944 -2.3389 -1.8797 -1.4119 -0.6868 + #> Gains 0 at speciation -18.2113 -14.9130 -12.1597 -10.1648 -3.6030 + #> Gains 1 at speciation -1.5472 -0.5998 -0.1365 0.3416 1.0736 + #> Loss 0 at speciation -4.0181 -3.3470 -2.9738 -2.6539 -2.0354 + #> Loss 1 at speciation -9.4815 -6.5157 -4.8115 -3.6121 -2.3045 + #> Genes with [0, 1] funs 1.4263 1.9483 2.2481 2.5599 3.2238 + #> Root 1 -5.9435 -3.5719 -1.4757 1.4858 4.7924 + #> Root 2 -14.2253 -5.9892 -3.8179 -1.5920 3.3555 ``` r -par_estimates <- colMeans(window(ans_mcmc, start = 20000)) -ans_pred <- predict_geese(amodel, par_estimates, leave_one_out = TRUE) -ans_pred <- do.call(rbind, ans_pred) +par_estimates <- colMeans( + window(ans_mcmc, start = end(ans_mcmc)*3/4) + ) +ans_pred <- predict_geese( + amodel, par_estimates, + leave_one_out = TRUE, + only_annotated = TRUE + ) |> do.call(what = "rbind") +# ans_pred_sim <- replicate(1e4, { +# do.call(rbind, sim_geese(amodel, par_estimates)) +# }, simplify = "array") + +# ans_pred_avg <- ans_pred_sim[,,1] +# for (i in 2:dim(ans_pred_sim)[3]) { +# ans_pred_avg <- ans_pred_avg + ans_pred_sim[,,i] +# } + +# ans_pred_avg <- ans_pred_avg / dim(ans_pred_sim)[3] + +# ans_pred_sim <- predict_geese_simulate( +# amodel, params, nsim = 100 +# ) |> +# do.call(what = "rbind") # Preparing annotations ann_obs <- do.call(rbind, fake1) -# Mean Absolute Error -hist(abs(ans_pred - ann_obs)) -``` - - - -``` r - # AUC -(ans <- aphylo::prediction_score( - cbind(as.vector(ans_pred)), - cbind(as.vector(ann_obs)) - )) +(ans <- prediction_score(ans_pred, ann_obs)) #> Prediction score (H0: Observed = Random) #> -#> N obs. : 398 -#> alpha(0, 1) : 0.37, 0.63 -#> Observed : 0.55 -#> Random : 0.53 -#> P( N obs. : 199 +#> alpha(0, 1) : 0.40, 0.60 +#> Observed : 0.68 *** +#> Random : 0.52 +#> P( -------------------------------------------------------------------------------- #> Values scaled to range between 0 and 1, 1 being best. #> #> Significance levels: *** p < .01, ** p < .05, * p < .10 -#> AUC 0.51. -#> MAE 0.45. +#> AUC 0.80. +#> MAE 0.32. -plot(ans$auc) +plot(ans$auc, xlim = c(0,1), ylim = c(0,1)) ``` - + @@ -294,8 +408,11 @@ add_geese( # Persistence to preserve parent state term_gains(flock, 0:1, duplication = 1) -term_gains(flock, 0:1, duplication = 1) -term_maxfuns(flock, 1, 1, duplication = 1) +term_loss(flock, 0:1, duplication = 1) +term_gains(flock, 0:1, duplication = 0) +term_loss(flock, 0:1, duplication = 0) +term_maxfuns(flock, 0, 1, duplication = 2) + # We need to initialize to do all the accountintg init_model(flock) @@ -307,21 +424,25 @@ print(flock) #> INFO ABOUT THE PHYLOGENIES #> # of phylogenies : 2 #> # of functions : 2 -#> # of ann. [zeros; ones] : [145; 255] -#> # of events [dupl; spec] : [198; 0] +#> # of ann. [zeros; ones] : [165; 235] +#> # of events [dupl; spec] : [86; 112] #> Largest polytomy : 2 #> #> INFO ABOUT THE SUPPORT #> Num. of Arrays : 792 -#> Support size : 4 -#> Support size range : [3, 10] +#> Support size : 8 +#> Support size range : [10, 10] #> Transform. Fun. : no -#> Model terms (5) : +#> Model terms (9) : #> - Gains 0 at duplication #> - Gains 1 at duplication -#> - Gains 0 at duplication -#> - Gains 1 at duplication -#> - Genes with [1, 1] funs at duplication +#> - Loss 0 at duplication +#> - Loss 1 at duplication +#> - Gains 0 at speciation +#> - Gains 1 at speciation +#> - Loss 0 at speciation +#> - Loss 1 at speciation +#> - Genes with [0, 1] funs ``` We can use the same program to fit the MCMC @@ -351,11 +472,16 @@ for (i in 1:ncol(ans_mcmc2)) { ) abline(h = params[i], lty=3, lwd=4, col = "red") } -par(op) ``` +``` r +par(op) +``` + + + ``` r summary(window(ans_mcmc2, start = 10000)) #> @@ -367,32 +493,97 @@ summary(window(ans_mcmc2, start = 10000)) #> 1. Empirical mean and standard deviation for each variable, #> plus standard error of the mean: #> -#> Mean SD Naive SE Time-series SE -#> Gains 0 at duplication 1.2650 2.2088 0.022087 0.222041 -#> Gains 1 at duplication 0.6656 2.5350 0.025348 0.445193 -#> Gains 0 at duplication 1.5533 2.3386 0.023385 0.333928 -#> Gains 1 at duplication 0.6474 2.5415 0.025414 0.394829 -#> Genes with [1, 1] funs at duplication 0.8878 0.1446 0.001446 0.004478 -#> Root 1 0.5259 3.5929 0.035927 0.363434 -#> Root 2 0.1195 3.6405 0.036403 0.413736 +#> Mean SD Naive SE Time-series SE +#> Gains 0 at duplication 2.39204 0.4707 0.004707 0.03019 +#> Gains 1 at duplication 1.85804 0.4925 0.004925 0.02789 +#> Loss 0 at duplication -2.15114 0.4451 0.004451 0.03310 +#> Loss 1 at duplication -1.50477 0.4427 0.004427 0.03176 +#> Gains 0 at speciation -4.10744 2.9954 0.029952 0.76564 +#> Gains 1 at speciation -0.84969 0.8242 0.008241 0.09520 +#> Loss 0 at speciation -3.16554 0.6535 0.006535 0.05307 +#> Loss 1 at speciation -4.88115 2.0161 0.020160 0.32971 +#> Genes with [0, 1] funs 2.09933 0.3703 0.003702 0.02921 +#> Root 1 0.02501 2.6487 0.026486 0.45210 +#> Root 2 -1.07238 2.9197 0.029195 0.56841 #> #> 2. Quantiles for each variable: #> -#> 2.5% 25% 50% 75% 97.5% -#> Gains 0 at duplication -3.3453 -0.16720 1.28576 2.7768 5.455 -#> Gains 1 at duplication -4.5441 -1.03143 0.92544 2.4571 5.112 -#> Gains 0 at duplication -2.4602 -0.07041 1.44898 2.8152 6.893 -#> Gains 1 at duplication -3.9060 -1.09242 0.33373 2.3327 5.772 -#> Genes with [1, 1] funs at duplication 0.6098 0.78839 0.89160 0.9865 1.166 -#> Root 1 -7.0003 -1.66898 0.41966 2.6846 8.177 -#> Root 2 -7.0876 -2.23377 -0.03671 2.3143 7.521 +#> 2.5% 25% 50% 75% 97.5% +#> Gains 0 at duplication 1.5050 2.068 2.37614 2.7239 3.3368 +#> Gains 1 at duplication 0.9237 1.511 1.84256 2.2029 2.8299 +#> Loss 0 at duplication -3.0413 -2.451 -2.14564 -1.8533 -1.2836 +#> Loss 1 at duplication -2.3961 -1.809 -1.51894 -1.1984 -0.6178 +#> Gains 0 at speciation -11.2547 -5.414 -2.91312 -1.9486 -0.9131 +#> Gains 1 at speciation -3.2320 -1.183 -0.72227 -0.3283 0.3280 +#> Loss 0 at speciation -4.7209 -3.510 -3.08984 -2.7347 -2.0557 +#> Loss 1 at speciation -10.5227 -5.326 -4.19469 -3.5823 -2.7532 +#> Genes with [0, 1] funs 1.3738 1.842 2.07762 2.3515 2.8303 +#> Root 1 -4.7967 -1.873 -0.04377 1.5864 6.0565 +#> Root 2 -6.5355 -3.147 -1.08668 1.1586 4.6030 ``` +Are we doing better in AUCs? + +``` r +par_estimates <- colMeans( + window(ans_mcmc2, start = end(ans_mcmc2)*3/4) + ) + +ans_pred <- predict_flock( + flock, par_estimates, + leave_one_out = TRUE, + only_annotated = TRUE + ) |> + lapply(do.call, what = "rbind") |> + do.call(what = rbind) +# ans_pred_sim <- replicate(1e4, { +# do.call(rbind, sim_geese(amodel, par_estimates)) +# }, simplify = "array") + +# ans_pred_avg <- ans_pred_sim[,,1] +# for (i in 2:dim(ans_pred_sim)[3]) { +# ans_pred_avg <- ans_pred_avg + ans_pred_sim[,,i] +# } + +# ans_pred_avg <- ans_pred_avg / dim(ans_pred_sim)[3] + +# ans_pred_sim <- predict_geese_simulate( +# amodel, params, nsim = 100 +# ) |> +# do.call(what = "rbind") + +# Preparing annotations +ann_obs <- rbind( + do.call(rbind, fake1), + do.call(rbind, fake2) +) + +# AUC +(ans <- prediction_score(ans_pred, ann_obs)) +#> Prediction score (H0: Observed = Random) +#> +#> N obs. : 398 +#> alpha(0, 1) : 0.42, 0.58 +#> Observed : 0.72 *** +#> Random : 0.51 +#> P( -------------------------------------------------------------------------------- +#> Values scaled to range between 0 and 1, 1 being best. +#> +#> Significance levels: *** p < .01, ** p < .05, * p < .10 +#> AUC 0.86. +#> MAE 0.28. +plot(ans$auc) +``` + + + ## Limiting the support In this example, we use the function `rule_limit_changes()` to apply a -constraint to the support of the model. This takes the first 2 term (0 -and 1 since the index is in C++), and restricts the support to states +constraint to the support of the model. This takes the first two terms +(0 and 1 since the index is in C++) and restricts the support to states where there are between $[0, 2]$ changes, at most. This should be useful when dealing with multiple functions or @@ -423,7 +614,7 @@ init_model(amodel_limited) # Is limiting the support any useful? support_size(amodel_limited) -#> [1] 196 +#> [1] 224 ``` Since we added the constraint based on the term @@ -455,27 +646,27 @@ style="width:100.0%" /> #> 1. Empirical mean and standard deviation for each variable, #> plus standard error of the mean: #> - #> Mean SD Naive SE Time-series SE - #> Gains 0 at duplication 1.4575 0.6083 0.008602 0.04453 - #> Gains 1 at duplication 1.1337 0.6868 0.009711 0.05444 - #> Loss 0 at duplication -1.8499 0.4843 0.006849 0.04207 - #> Loss 1 at duplication -1.7043 0.4894 0.006920 0.03454 - #> Genes with [1, 1] funs at duplication 1.6845 0.4270 0.006038 0.03512 - #> Overall changes at duplication 0.0000 0.0000 0.000000 0.00000 - #> Root 1 -0.4058 4.0884 0.057813 0.76638 - #> Root 2 0.4454 3.8899 0.055006 0.78296 + #> Mean SD Naive SE Time-series SE + #> Gains 0 at duplication 1.06329 0.8555 0.012097 0.06474 + #> Gains 1 at duplication 1.00857 0.7727 0.010927 0.04945 + #> Loss 0 at duplication -1.44630 0.7529 0.010647 0.05664 + #> Loss 1 at duplication -0.65287 0.7342 0.010383 0.04529 + #> Genes with [1, 1] funs at duplication 1.04183 0.3736 0.005283 0.02301 + #> Overall changes at duplication 0.00000 0.0000 0.000000 0.00000 + #> Root 1 -0.05519 3.1452 0.044476 0.35121 + #> Root 2 -0.20215 3.2415 0.045837 0.41755 #> #> 2. Quantiles for each variable: #> - #> 2.5% 25% 50% 75% 97.5% - #> Gains 0 at duplication 0.40725 1.0206 1.42499 1.832 2.8323 - #> Gains 1 at duplication -0.04293 0.6465 1.09431 1.593 2.6107 - #> Loss 0 at duplication -2.81021 -2.1509 -1.82709 -1.512 -0.9524 - #> Loss 1 at duplication -2.65462 -1.9963 -1.68462 -1.377 -0.8145 - #> Genes with [1, 1] funs at duplication 0.93538 1.3850 1.65966 1.936 2.6215 - #> Overall changes at duplication 0.00000 0.0000 0.00000 0.000 0.0000 - #> Root 1 -10.37064 -2.6885 -0.09387 2.250 6.7433 - #> Root 2 -6.89559 -1.9881 0.26227 2.743 9.1986 + #> 2.5% 25% 50% 75% 97.5% + #> Gains 0 at duplication -0.5104 0.5096 1.07974 1.5870 2.75348 + #> Gains 1 at duplication -0.3511 0.4883 0.97593 1.4741 2.72087 + #> Loss 0 at duplication -3.0046 -1.9420 -1.39766 -0.9289 -0.05484 + #> Loss 1 at duplication -2.0463 -1.1631 -0.65509 -0.2187 0.87313 + #> Genes with [1, 1] funs at duplication 0.3743 0.7911 1.01242 1.2674 1.88310 + #> Overall changes at duplication 0.0000 0.0000 0.00000 0.0000 0.00000 + #> Root 1 -6.4868 -2.1595 0.08435 2.1941 5.72248 + #> Root 2 -6.6845 -2.0668 -0.14747 1.7791 6.08394 # Code of Conduct diff --git a/README.qmd b/README.qmd index b2fa5e0..1a08ff6 100644 --- a/README.qmd +++ b/README.qmd @@ -68,7 +68,7 @@ tree <- aphylo::sim_tree(n)$edge - 1L tree <- tree[order(tree[, 2]), ] duplication <- sample.int( - n = 2, size = n * 2 - 1, replace = TRUE, prob = c(.1, .9) + n = 2, size = n * 2 - 1, replace = TRUE, prob = c(.4, .6) ) == 1 # Reading the data in @@ -80,26 +80,36 @@ amodel <- new_geese( ) # Preparing the model -term_gains(amodel, 0:1) -term_loss(amodel, 0:1) -term_maxfuns(amodel, 0, 1) -term_overall_changes(amodel, duplication = 0) +term_gains(amodel, 0:1, duplication = 1) +term_loss(amodel, 0:1, duplication = 1) +term_gains(amodel, 0:1, duplication = 0) +term_loss(amodel, 0:1, duplication = 0) +term_maxfuns(amodel, 0, 1, duplication = 2) init_model(amodel) # Testing params <- c( - # Gains + # Gains spe 2, 1.5, # Loss -2, -1.5, + # Gains spe + -2, -1, + # Loss spe + -4, -4, # Max funs 2, - # Overall changes - -5, # Root probabilities -10, -10 ) -names(params) <- c("gain0", "gain1", "loss0", "loss1", "onefun", "root0", "root1") +names(params) <- c( + "gain0 dupl", "gain1 dupl", + "loss0 dupl", "loss1 dupl", + "gain0 spe", "gain1 spe", + "loss0 spe", "loss1 spe", + "onefun", + "root0", "root1" + ) likelihood(amodel, params*0) # Equals 1 b/c all missings @@ -143,11 +153,11 @@ amodel <- new_geese( ) # Adding the model terms -term_gains(amodel, 0:1) -term_loss(amodel, 0:1) -term_maxfuns(amodel, 0, 1) -term_overall_changes(amodel, duplication = 0) -# We need to initialize to do all the accounting +term_gains(amodel, 0:1, duplication = 1) +term_loss(amodel, 0:1, duplication = 1) +term_gains(amodel, 0:1, duplication = 0) +term_loss(amodel, 0:1, duplication = 0) +term_maxfuns(amodel, 0, 1, duplication = 2) init_model(amodel) print(amodel) @@ -159,7 +169,7 @@ ans_mle # Prob of each gene gaining a single function transition_prob( amodel, - params = c(-1, -1, -2, -2, -2, -.5), + params = rep(0, nterms(amodel) - nfuns(amodel)), duplication = TRUE, state = c(FALSE, FALSE), array = matrix(c(1, 0, 0, 1), ncol=2) ) @@ -176,7 +186,7 @@ ans_mcmc <- geese_mcmc( prior = function(p) c( dlogis( p, - scale = 2, + scale = 4, location = c( rep(0, nterms(amodel) - nfuns(amodel)), rep(-5, nfuns(amodel)) @@ -213,37 +223,31 @@ par_estimates <- colMeans( window(ans_mcmc, start = end(ans_mcmc)*3/4) ) ans_pred <- predict_geese( - amodel, ans_mle$par, + amodel, par_estimates, leave_one_out = TRUE, only_annotated = TRUE ) |> do.call(what = "rbind") -ans_pred_sim <- replicate(1e4, { - do.call(rbind, sim_geese(amodel, params)) -}, simplify = "array") +# ans_pred_sim <- replicate(1e4, { +# do.call(rbind, sim_geese(amodel, par_estimates)) +# }, simplify = "array") -ans_pred_avg <- ans_pred_sim[,,1] -for (i in 2:dim(ans_pred_sim)[3]) { - ans_pred_avg <- ans_pred_avg + ans_pred_sim[,,i] -} +# ans_pred_avg <- ans_pred_sim[,,1] +# for (i in 2:dim(ans_pred_sim)[3]) { +# ans_pred_avg <- ans_pred_avg + ans_pred_sim[,,i] +# } -ans_pred_avg <- ans_pred_avg / dim(ans_pred_sim)[3] +# ans_pred_avg <- ans_pred_avg / dim(ans_pred_sim)[3] -ans_pred_sim <- predict_geese_simulate( - amodel, params, nsim = 100 - ) |> - do.call(what = "rbind") +# ans_pred_sim <- predict_geese_simulate( +# amodel, params, nsim = 100 +# ) |> +# do.call(what = "rbind") # Preparing annotations ann_obs <- do.call(rbind, fake1) -# Mean Absolute Error -hist(abs(ans_pred - ann_obs)) - # AUC -(ans <- aphylo::prediction_score( - cbind(as.vector(ans_pred)), - cbind(as.vector(ann_obs)) - )) +(ans <- prediction_score(ans_pred, ann_obs)) plot(ans$auc, xlim = c(0,1), ylim = c(0,1)) ``` @@ -284,8 +288,11 @@ add_geese( # Persistence to preserve parent state term_gains(flock, 0:1, duplication = 1) -term_gains(flock, 0:1, duplication = 1) -term_maxfuns(flock, 1, 1, duplication = 1) +term_loss(flock, 0:1, duplication = 1) +term_gains(flock, 0:1, duplication = 0) +term_loss(flock, 0:1, duplication = 0) +term_maxfuns(flock, 0, 1, duplication = 2) + # We need to initialize to do all the accountintg init_model(flock) @@ -324,6 +331,47 @@ par(op) summary(window(ans_mcmc2, start = 10000)) ``` +Are we doing better in AUCs? + +```{r} +par_estimates <- colMeans( + window(ans_mcmc2, start = end(ans_mcmc2)*3/4) + ) + +ans_pred <- predict_flock( + flock, par_estimates, + leave_one_out = TRUE, + only_annotated = TRUE + ) |> + lapply(do.call, what = "rbind") |> + do.call(what = rbind) +# ans_pred_sim <- replicate(1e4, { +# do.call(rbind, sim_geese(amodel, par_estimates)) +# }, simplify = "array") + +# ans_pred_avg <- ans_pred_sim[,,1] +# for (i in 2:dim(ans_pred_sim)[3]) { +# ans_pred_avg <- ans_pred_avg + ans_pred_sim[,,i] +# } + +# ans_pred_avg <- ans_pred_avg / dim(ans_pred_sim)[3] + +# ans_pred_sim <- predict_geese_simulate( +# amodel, params, nsim = 100 +# ) |> +# do.call(what = "rbind") + +# Preparing annotations +ann_obs <- rbind( + do.call(rbind, fake1), + do.call(rbind, fake2) +) + +# AUC +(ans <- prediction_score(ans_pred, ann_obs)) +plot(ans$auc) +``` + ## Limiting the support In this example, we use the function `rule_limit_changes()` to apply a diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 0000000..6582e92 --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,21 @@ +FROM rocker/r-devel-san + +RUN apt update && apt install --no-install-suggests -y libssl-dev + +RUN Rscript --vanilla -e \ + 'install.packages(c("Rcpp", "fmcmc", "tinytest"), repos = "https://cloud.r-project.org")' + +RUN install2.r aphylo --deps=TRUE + +RUN mkdir ~/.R && \ + echo "CXXFLAGS=-g -O0" > ~/.R/Makevars && \ + echo "CXX14FLAGS=-g -O0" >> ~/.R/Makevars && \ + echo "CXX17FLAGS=-g -O0" >> ~/.R/Makevars && \ + echo "CXX11FLAGS=-g -O0" >> ~/.R/Makevars && \ + echo "SAFE_FLAGS=-g -O0" >> ~/.R/Makevars + + +CMD ["bash"] + + + diff --git a/docker/Dockerfile.clang b/docker/Dockerfile.clang new file mode 100644 index 0000000..a2f1422 --- /dev/null +++ b/docker/Dockerfile.clang @@ -0,0 +1,20 @@ +FROM rocker/r-devel-ubsan-clang + +RUN apt update && apt install --no-install-suggests -y libssl-dev + +RUN Rscript --vanilla -e \ + 'install.packages(c("Rcpp", "fmcmc", "tinytest"), repos = "https://cloud.r-project.org")' + +RUN install2.r aphylo --deps=TRUE + +RUN mkdir ~/.R && \ + echo "CXXFLAGS=-g -O0" > ~/.R/Makevars && \ + echo "CXX14FLAGS=-g -O0" >> ~/.R/Makevars && \ + echo "CXX17FLAGS=-g -O0" >> ~/.R/Makevars && \ + echo "CXX11FLAGS=-g -O0" >> ~/.R/Makevars && \ + echo "SAFE_FLAGS=-g -O0" >> ~/.R/Makevars + +CMD ["bash"] + + + diff --git a/docker/Makefile b/docker/Makefile new file mode 100644 index 0000000..8cd7edc --- /dev/null +++ b/docker/Makefile @@ -0,0 +1,16 @@ +build: Dockerfile + docker build -t ghcr.io/uscbiostats/geese:latest . + +build-clang: Dockerfile.clang + docker build -f Dockerfile.clang -t ghcr.io/uscbiostats/geese:clang . + +run: + docker run --rm -ti -v $(PWD)/..:/mnt -w/mnt ghcr.io/uscbiostats/geese:latest bash + + +run-clang: + docker run --rm -ti -v $(PWD)/..:/mnt -w/mnt ghcr.io/uscbiostats/geese:clang bash + +push: + docker push ghcr.io/uscbiostats/geese:clang; \ + docker push ghcr.io/uscbiostats/geese:latest diff --git a/inst/include/barry/barray-bones.hpp b/inst/include/barry/barray-bones.hpp index 3523cce..0fb6849 100644 --- a/inst/include/barry/barray-bones.hpp +++ b/inst/include/barry/barray-bones.hpp @@ -211,6 +211,7 @@ class BArray { // Misc void print(const char * fmt = nullptr, ...) const; + void print_n(size_t nrow, size_t ncol, const char * fmt = nullptr, ...) const; /** * @name Arithmetic operators diff --git a/inst/include/barry/barray-meat.hpp b/inst/include/barry/barray-meat.hpp index d8b5ee4..ea35005 100644 --- a/inst/include/barry/barray-meat.hpp +++ b/inst/include/barry/barray-meat.hpp @@ -10,13 +10,6 @@ class Cell_const; #ifndef BARRY_BARRAY_MEAT_HPP #define BARRY_BARRAY_MEAT_HPP -#define BARRAY_TYPE() BArray - -#define BARRAY_TEMPLATE_ARGS() - -#define BARRAY_TEMPLATE(a,b) \ - template BARRAY_TEMPLATE_ARGS() inline a BARRAY_TYPE()::b - #define ROW(a) this->el_ij[a] #define COL(a) this->el_ji[a] @@ -26,7 +19,7 @@ Cell BArray::Cell_default = Cell(stat // Edgelist with data -BARRAY_TEMPLATE(,BArray) ( +template inline BArray::BArray ( size_t N_, size_t M_, const std::vector< size_t > & source, const std::vector< size_t > & target, @@ -68,7 +61,8 @@ BARRAY_TEMPLATE(,BArray) ( } // Edgelist with data -BARRAY_TEMPLATE(,BArray) ( +template +inline BArray::BArray ( size_t N_, size_t M_, const std::vector< size_t > & source, const std::vector< size_t > & target, @@ -130,7 +124,8 @@ BARRAY_TEMPLATE(,BArray) ( } -BARRAY_TEMPLATE(,BArray) ( +template +inline BArray::BArray ( const BArray & Array_, bool copy_data ) : N(Array_.N), M(Array_.M) @@ -178,7 +173,8 @@ BARRAY_TEMPLATE(,BArray) ( } -BARRAY_TEMPLATE(BARRAY_TYPE() &, operator=) ( +template +inline BArray & BArray:: operator= ( const BArray & Array_ ) { @@ -227,8 +223,8 @@ BARRAY_TEMPLATE(BARRAY_TYPE() &, operator=) ( } -BARRAY_TEMPLATE(,BArray) ( - BARRAY_TYPE() && x +template inline BArray::BArray ( + BArray && x ) noexcept : N(0u), M(0u), NCells(0u), data(nullptr), @@ -269,8 +265,8 @@ BARRAY_TEMPLATE(,BArray) ( } -BARRAY_TEMPLATE(BARRAY_TYPE() &, operator=) ( - BARRAY_TYPE() && x +template inline BArray & BArray:: operator= ( + BArray && x ) noexcept { // Clearing @@ -318,8 +314,8 @@ BARRAY_TEMPLATE(BARRAY_TYPE() &, operator=) ( } -BARRAY_TEMPLATE(bool, operator==) ( - const BARRAY_TYPE() & Array_ +template inline bool BArray:: operator== ( + const BArray & Array_ ) { // Dimension and number of cells used @@ -336,7 +332,7 @@ BARRAY_TEMPLATE(bool, operator==) ( return true; } -BARRAY_TEMPLATE(,~BArray) () { +template inline BArray::~BArray () { if (delete_data && (data != nullptr)) delete data; @@ -344,7 +340,7 @@ BARRAY_TEMPLATE(,~BArray) () { return; } -BARRAY_TEMPLATE(void, set_data) ( +template inline void BArray:: set_data ( Data_Type * data_, bool delete_data_ ) { @@ -358,7 +354,7 @@ BARRAY_TEMPLATE(void, set_data) ( } -BARRAY_TEMPLATE(Data_Type *, D_ptr) () +template inline Data_Type * BArray:: D_ptr () { return this->data; } @@ -369,7 +365,7 @@ inline const Data_Type * BArray::D_ptr() const return this->data; } -BARRAY_TEMPLATE(Data_Type &, D) () +template inline Data_Type & BArray:: D () { return *this->data; } @@ -396,7 +392,7 @@ inline void BArray::flush_data() } -BARRAY_TEMPLATE(void, out_of_range) ( +template inline void BArray:: out_of_range ( size_t i, size_t j ) const { @@ -409,7 +405,7 @@ BARRAY_TEMPLATE(void, out_of_range) ( } -BARRAY_TEMPLATE(Cell_Type, get_cell) ( +template inline Cell_Type BArray:: get_cell ( size_t i, size_t j, bool check_bounds @@ -432,7 +428,7 @@ BARRAY_TEMPLATE(Cell_Type, get_cell) ( } -BARRAY_TEMPLATE(std::vector< Cell_Type >, get_row_vec) ( +template inline std::vector< Cell_Type > BArray:: get_row_vec ( size_t i, bool check_bounds ) const { @@ -449,7 +445,7 @@ BARRAY_TEMPLATE(std::vector< Cell_Type >, get_row_vec) ( return ans; } -BARRAY_TEMPLATE(void, get_row_vec) ( +template inline void BArray:: get_row_vec ( std::vector< Cell_Type > * x, size_t i, bool check_bounds @@ -464,7 +460,7 @@ BARRAY_TEMPLATE(void, get_row_vec) ( } -BARRAY_TEMPLATE(std::vector< Cell_Type >, get_col_vec) ( +template inline std::vector< Cell_Type > BArray:: get_col_vec ( size_t i, bool check_bounds ) const { @@ -481,7 +477,7 @@ BARRAY_TEMPLATE(std::vector< Cell_Type >, get_col_vec) ( } -BARRAY_TEMPLATE(void, get_col_vec) ( +template inline void BArray:: get_col_vec ( std::vector * x, size_t i, bool check_bounds @@ -496,7 +492,7 @@ BARRAY_TEMPLATE(void, get_col_vec) ( } -BARRAY_TEMPLATE(const Row_type< Cell_Type > &, row) ( +template inline const Row_type< Cell_Type > & BArray:: row ( size_t i, bool check_bounds ) const { @@ -508,7 +504,7 @@ BARRAY_TEMPLATE(const Row_type< Cell_Type > &, row) ( } -BARRAY_TEMPLATE(const Col_type< Cell_Type > &, col) ( +template inline const Col_type< Cell_Type > & BArray:: col ( size_t i, bool check_bounds ) const { @@ -520,7 +516,7 @@ BARRAY_TEMPLATE(const Col_type< Cell_Type > &, col) ( } -BARRAY_TEMPLATE(Entries< Cell_Type >, get_entries) () const { +template inline Entries< Cell_Type > BArray:: get_entries () const { Entries res(NCells); @@ -539,7 +535,7 @@ BARRAY_TEMPLATE(Entries< Cell_Type >, get_entries) () const { return res; } -BARRAY_TEMPLATE(bool, is_empty) ( +template inline bool BArray:: is_empty ( size_t i, size_t j, bool check_bounds @@ -561,25 +557,25 @@ BARRAY_TEMPLATE(bool, is_empty) ( } -BARRAY_TEMPLATE(size_t, nrow) () const noexcept { +template inline size_t BArray:: nrow () const noexcept { return N; } -BARRAY_TEMPLATE(size_t, ncol) () const noexcept { +template inline size_t BArray:: ncol () const noexcept { return M; } -BARRAY_TEMPLATE(size_t, nnozero) () const noexcept { +template inline size_t BArray:: nnozero () const noexcept { return NCells; } -BARRAY_TEMPLATE(Cell< Cell_Type >, default_val) () const { +template inline Cell< Cell_Type > BArray:: default_val () const { return this->Cell_default; } -BARRAY_TEMPLATE(BARRAY_TYPE() &, operator+=) ( +template inline BArray & BArray:: operator+= ( const std::pair & coords ) { @@ -594,7 +590,7 @@ BARRAY_TEMPLATE(BARRAY_TYPE() &, operator+=) ( } -BARRAY_TEMPLATE(BARRAY_TYPE() &, operator-=) ( +template inline BArray & BArray:: operator-= ( const std::pair & coords ) { @@ -608,8 +604,8 @@ BARRAY_TEMPLATE(BARRAY_TYPE() &, operator-=) ( } -template BARRAY_TEMPLATE_ARGS() -inline BArrayCell BARRAY_TYPE()::operator()( +template +inline BArrayCell BArray::operator()( size_t i, size_t j, bool check_bounds @@ -619,8 +615,8 @@ inline BArrayCell BARRAY_TYPE()::operator()( } -template BARRAY_TEMPLATE_ARGS() -inline const Cell_Type BARRAY_TYPE()::operator() ( +template +inline const Cell_Type BArray::operator() ( size_t i, size_t j, bool check_bounds @@ -630,7 +626,7 @@ inline const Cell_Type BARRAY_TYPE()::operator() ( } -BARRAY_TEMPLATE(void, rm_cell) ( +template inline void BArray:: rm_cell ( size_t i, size_t j, bool check_bounds, @@ -665,7 +661,7 @@ BARRAY_TEMPLATE(void, rm_cell) ( } -BARRAY_TEMPLATE(void, insert_cell) ( +template inline void BArray:: insert_cell ( size_t i, size_t j, const Cell< Cell_Type> & v, @@ -712,7 +708,7 @@ BARRAY_TEMPLATE(void, insert_cell) ( } -BARRAY_TEMPLATE(void, insert_cell) ( +template inline void BArray:: insert_cell ( size_t i, size_t j, Cell< Cell_Type> && v, @@ -759,7 +755,7 @@ BARRAY_TEMPLATE(void, insert_cell) ( } -BARRAY_TEMPLATE(void, insert_cell) ( +template inline void BArray:: insert_cell ( size_t i, size_t j, Cell_Type v, @@ -771,7 +767,7 @@ BARRAY_TEMPLATE(void, insert_cell) ( } -BARRAY_TEMPLATE(void, swap_cells) ( +template inline void BArray:: swap_cells ( size_t i0, size_t j0, size_t i1, size_t j1, bool check_bounds, @@ -874,7 +870,7 @@ BARRAY_TEMPLATE(void, swap_cells) ( return; } -BARRAY_TEMPLATE(void, toggle_cell) ( +template inline void BArray:: toggle_cell ( size_t i, size_t j, bool check_bounds, @@ -907,7 +903,7 @@ BARRAY_TEMPLATE(void, toggle_cell) ( } -BARRAY_TEMPLATE(void, swap_rows) ( +template inline void BArray:: swap_rows ( size_t i0, size_t i1, bool check_bounds @@ -953,7 +949,7 @@ BARRAY_TEMPLATE(void, swap_rows) ( } // This swapping is more expensive overall -BARRAY_TEMPLATE(void, swap_cols) ( +template inline void BArray:: swap_cols ( size_t j0, size_t j1, bool check_bounds @@ -1024,7 +1020,7 @@ BARRAY_TEMPLATE(void, swap_cols) ( return; } -BARRAY_TEMPLATE(void, zero_row) ( +template inline void BArray:: zero_row ( size_t i, bool check_bounds ) { @@ -1045,7 +1041,7 @@ BARRAY_TEMPLATE(void, zero_row) ( } -BARRAY_TEMPLATE(void, zero_col) ( +template inline void BArray:: zero_col ( size_t j, bool check_bounds ) { @@ -1066,7 +1062,7 @@ BARRAY_TEMPLATE(void, zero_col) ( } -BARRAY_TEMPLATE(void, transpose) () { +template inline void BArray:: transpose () { // Start by flipping the switch visited = !visited; @@ -1127,7 +1123,7 @@ BARRAY_TEMPLATE(void, transpose) () { } -BARRAY_TEMPLATE(void, clear) ( +template inline void BArray:: clear ( bool hard ) { @@ -1152,7 +1148,7 @@ BARRAY_TEMPLATE(void, clear) ( } -BARRAY_TEMPLATE(void, resize) ( +template inline void BArray:: resize ( size_t N_, size_t M_ ) { @@ -1183,7 +1179,8 @@ BARRAY_TEMPLATE(void, resize) ( } -BARRAY_TEMPLATE(void, reserve) () { +template +inline void BArray:: reserve () { #ifdef BARRAY_USE_UNORDERED_MAP for (size_t i = 0u; i < N; i++) ROW(i).reserve(M); @@ -1195,17 +1192,42 @@ BARRAY_TEMPLATE(void, reserve) () { } -BARRAY_TEMPLATE(void, print) ( +template +inline void BArray:: print ( const char * fmt, ... ) const { + + std::va_list args; + va_start(args, fmt); + print_n(N, M, fmt, args); + va_end(args); + + return; + +} + +template +inline void BArray:: print_n ( + size_t nrow, + size_t ncol, + const char * fmt, + ... +) const { + + if (nrow > N) + nrow = N; + + if (ncol > M) + ncol = M; + std::va_list args; va_start(args, fmt); printf_barry(fmt, args); va_end(args); - for (size_t i = 0u; i < N; ++i) + for (size_t i = 0u; i < nrow; ++i) { #ifdef BARRY_DEBUG_LEVEL @@ -1215,7 +1237,7 @@ BARRAY_TEMPLATE(void, print) ( #else printf_barry("[%3i,] ", i); #endif - for (size_t j = 0u; j < M; ++j) { + for (size_t j = 0u; j < ncol; ++j) { if (this->is_empty(i, j, false)) printf_barry(" . "); else @@ -1226,6 +1248,15 @@ BARRAY_TEMPLATE(void, print) ( printf_barry("\n"); } + + if (nrow < N) + printf_barry("Skipping %lu rows. ", N - nrow); + + if (ncol < M) + printf_barry("Skipping %lu columns. ", M - ncol); + + if (nrow < N || ncol < M) + printf_barry("\n"); return; @@ -1235,9 +1266,5 @@ BARRAY_TEMPLATE(void, print) ( #undef ROW #undef COL -#undef BARRAY_TYPE -#undef BARRAY_TEMPLATE_ARGS -#undef BARRAY_TEMPLATE - #endif diff --git a/inst/include/barry/barraydense-meat.hpp b/inst/include/barry/barraydense-meat.hpp index 4725b60..aedd5bc 100644 --- a/inst/include/barry/barraydense-meat.hpp +++ b/inst/include/barry/barraydense-meat.hpp @@ -20,13 +20,6 @@ template class BArrayDenseCell; -#define BDENSE_TYPE() BArrayDense - -#define BDENSE_TEMPLATE_ARGS() - -#define BDENSE_TEMPLATE(a,b) \ - template BDENSE_TEMPLATE_ARGS() inline a BDENSE_TYPE()::b - #define ROW(a) this->el_ij[a] #define COL(a) this->el_ji[a] #define POS(a,b) (b)*N + (a) @@ -38,7 +31,8 @@ Cell_Type BArrayDense::Cell_default = static_cast< Cell_Typ #define ZERO_CELL static_cast(0.0) // Edgelist with data -BDENSE_TEMPLATE(,BArrayDense)( +template +inline BArrayDense::BArrayDense( size_t N_, size_t M_, const std::vector< size_t > & source, @@ -96,7 +90,8 @@ BDENSE_TEMPLATE(,BArrayDense)( } // Edgelist without data -BDENSE_TEMPLATE(, BArrayDense)( +template +inline BArrayDense:: BArrayDense( size_t N_, size_t M_, const std::vector< size_t > & source, const std::vector< size_t > & target, @@ -151,8 +146,9 @@ BDENSE_TEMPLATE(, BArrayDense)( } -BDENSE_TEMPLATE(, BArrayDense)( - const BDENSE_TYPE() & Array_, +template +inline BArrayDense:: BArrayDense( + const BArrayDense & Array_, bool copy_data ) : N(Array_.N), M(Array_.M){ @@ -191,8 +187,9 @@ BDENSE_TEMPLATE(, BArrayDense)( } -BDENSE_TEMPLATE(BDENSE_TYPE() &, operator=) ( - const BDENSE_TYPE() & Array_ +template +inline BArrayDense & BArrayDense::operator=( + const BArrayDense & Array_ ) { // Clearing @@ -237,8 +234,9 @@ BDENSE_TEMPLATE(BDENSE_TYPE() &, operator=) ( } -BDENSE_TEMPLATE(, BArrayDense)( - BDENSE_TYPE() && x +template +inline BArrayDense:: BArrayDense( + BArrayDense && x ) noexcept : N(std::move(x.N)), M(std::move(x.M)), // NCells(std::move(x.NCells)), @@ -254,8 +252,9 @@ BDENSE_TEMPLATE(, BArrayDense)( } -BDENSE_TEMPLATE(BDENSE_TYPE() &, operator=)( - BDENSE_TYPE() && x +template +inline BArrayDense & BArrayDense::operator=( + BArrayDense && x ) noexcept { // Clearing @@ -297,8 +296,9 @@ BDENSE_TEMPLATE(BDENSE_TYPE() &, operator=)( } -BDENSE_TEMPLATE(bool, operator==) ( - const BDENSE_TYPE() & Array_ +template +inline bool BArrayDense::operator== ( + const BArrayDense & Array_ ) { // Dimension and number of cells used @@ -315,7 +315,8 @@ BDENSE_TEMPLATE(bool, operator==) ( return true; } -BDENSE_TEMPLATE(, ~BArrayDense) () { +template +inline BArrayDense::~BArrayDense () { if (delete_data && (data != nullptr)) delete data; @@ -323,7 +324,8 @@ BDENSE_TEMPLATE(, ~BArrayDense) () { return; } -BDENSE_TEMPLATE(void, set_data) ( +template +inline void BArrayDense::set_data ( Data_Type * data_, bool delete_data_ ) { @@ -338,23 +340,28 @@ BDENSE_TEMPLATE(void, set_data) ( } -BDENSE_TEMPLATE(Data_Type *, D_ptr) () { +template +inline Data_Type * BArrayDense::D_ptr () { return this->data; } -BDENSE_TEMPLATE(const Data_Type *, D_ptr) () const { +template +inline const Data_Type * BArrayDense::D_ptr () const { return this->data; } -BDENSE_TEMPLATE(Data_Type &, D) () { +template + inline Data_Type & BArrayDense::D () { return *this->data; } -BDENSE_TEMPLATE(const Data_Type &, D) () const { +template +inline const Data_Type & BArrayDense::D () const { return *this->data; } -BDENSE_TEMPLATE(void, out_of_range) ( +template +inline void BArrayDense::out_of_range ( size_t i, size_t j ) const { @@ -374,7 +381,8 @@ BDENSE_TEMPLATE(void, out_of_range) ( } -BDENSE_TEMPLATE(Cell_Type, get_cell) ( +template +inline Cell_Type BArrayDense::get_cell ( size_t i, size_t j, bool check_bounds @@ -388,7 +396,8 @@ BDENSE_TEMPLATE(Cell_Type, get_cell) ( } -BDENSE_TEMPLATE(std::vector< Cell_Type >, get_row_vec) ( +template +inline std::vector< Cell_Type > BArrayDense::get_row_vec ( size_t i, bool check_bounds ) const { @@ -405,7 +414,7 @@ BDENSE_TEMPLATE(std::vector< Cell_Type >, get_row_vec) ( } -BDENSE_TEMPLATE(void, get_row_vec) ( +template inline void BArrayDense:: get_row_vec ( std::vector * x, size_t i, bool check_bounds @@ -420,7 +429,7 @@ BDENSE_TEMPLATE(void, get_row_vec) ( } -BDENSE_TEMPLATE(std::vector< Cell_Type >, get_col_vec)( +template inline std::vector< Cell_Type > BArrayDense:: get_col_vec( size_t i, bool check_bounds ) const { @@ -437,7 +446,7 @@ BDENSE_TEMPLATE(std::vector< Cell_Type >, get_col_vec)( } -BDENSE_TEMPLATE(void, get_col_vec) ( +template inline void BArrayDense:: get_col_vec ( std::vector * x, size_t i, bool check_bounds @@ -452,7 +461,7 @@ BDENSE_TEMPLATE(void, get_col_vec) ( } template -inline const BArrayDenseRow_const BDENSE_TYPE()::row( +inline const BArrayDenseRow_const BArrayDense::row( size_t i, bool check_bounds ) const { @@ -465,7 +474,7 @@ inline const BArrayDenseRow_const BDENSE_TYPE()::row( } template -inline BArrayDenseRow & BDENSE_TYPE()::row( +inline BArrayDenseRow & BArrayDense::row( size_t i, bool check_bounds ) { @@ -505,7 +514,7 @@ BArrayDense::col( } -BDENSE_TEMPLATE(Entries< Cell_Type >, get_entries)() const { +template inline Entries< Cell_Type > BArrayDense:: get_entries() const { size_t nzero = this->nnozero(); @@ -534,7 +543,7 @@ BDENSE_TEMPLATE(Entries< Cell_Type >, get_entries)() const { } -BDENSE_TEMPLATE(bool, is_empty)( +template inline bool BArrayDense:: is_empty( size_t i, size_t j, bool check_bounds @@ -547,15 +556,15 @@ BDENSE_TEMPLATE(bool, is_empty)( } -BDENSE_TEMPLATE(size_t, nrow)() const noexcept { +template inline size_t BArrayDense:: nrow() const noexcept { return N; } -BDENSE_TEMPLATE(size_t, ncol)() const noexcept { +template inline size_t BArrayDense:: ncol() const noexcept { return M; } -BDENSE_TEMPLATE(size_t, nnozero)() const noexcept { +template inline size_t BArrayDense:: nnozero() const noexcept { size_t nzero = 0u; for (auto & v : el) @@ -565,11 +574,13 @@ BDENSE_TEMPLATE(size_t, nnozero)() const noexcept { return nzero; } -BDENSE_TEMPLATE(Cell< Cell_Type>, default_val)() const { +template +inline Cell< Cell_Type> BArrayDense::default_val() const { return this->Cell_default; } -BDENSE_TEMPLATE(BDENSE_TYPE() &, operator+=)( +template +inline BArrayDense & BArrayDense::operator+=( const std::pair & coords ) { @@ -587,7 +598,8 @@ BDENSE_TEMPLATE(BDENSE_TYPE() &, operator+=)( } -BDENSE_TEMPLATE(BDENSE_TYPE() &, operator-=)( +template +inline BArrayDense & BArrayDense::operator-=( const std::pair & coords ) { @@ -606,8 +618,8 @@ BDENSE_TEMPLATE(BDENSE_TYPE() &, operator-=)( } -template BDENSE_TEMPLATE_ARGS() -inline BArrayDenseCell BDENSE_TYPE()::operator()( +template +inline BArrayDenseCell BArrayDense::operator()( size_t i, size_t j, bool check_bounds @@ -617,8 +629,8 @@ inline BArrayDenseCell BDENSE_TYPE()::operator()( } -template BDENSE_TEMPLATE_ARGS() -inline const Cell_Type BDENSE_TYPE()::operator()( +template +inline const Cell_Type BArrayDense::operator()( size_t i, size_t j, bool check_bounds @@ -631,7 +643,8 @@ inline const Cell_Type BDENSE_TYPE()::operator()( } -BDENSE_TEMPLATE(void, rm_cell) ( +template +inline void BArrayDense::rm_cell ( size_t i, size_t j, bool check_bounds, @@ -653,7 +666,8 @@ BDENSE_TEMPLATE(void, rm_cell) ( } -BDENSE_TEMPLATE(void, insert_cell) ( +template +inline void BArrayDense::insert_cell ( size_t i, size_t j, const Cell< Cell_Type> & v, @@ -689,7 +703,7 @@ BDENSE_TEMPLATE(void, insert_cell) ( } -BDENSE_TEMPLATE(void, insert_cell)( +template inline void BArrayDense:: insert_cell( size_t i, size_t j, Cell_Type v, @@ -722,7 +736,7 @@ BDENSE_TEMPLATE(void, insert_cell)( } -BDENSE_TEMPLATE(void, swap_cells) ( +template inline void BArrayDense:: swap_cells ( size_t i0, size_t j0, size_t i1, size_t j1, bool check_bounds, @@ -759,7 +773,7 @@ BDENSE_TEMPLATE(void, swap_cells) ( } -BDENSE_TEMPLATE(void, toggle_cell) ( +template inline void BArrayDense:: toggle_cell ( size_t i, size_t j, bool check_bounds, @@ -778,7 +792,7 @@ BDENSE_TEMPLATE(void, toggle_cell) ( } -BDENSE_TEMPLATE(void, swap_rows) ( +template inline void BArrayDense:: swap_rows ( size_t i0, size_t i1, bool check_bounds @@ -806,7 +820,7 @@ BDENSE_TEMPLATE(void, swap_rows) ( } // This swapping is more expensive overall -BDENSE_TEMPLATE(void, swap_cols) ( +template inline void BArrayDense:: swap_cols ( size_t j0, size_t j1, bool check_bounds @@ -833,7 +847,7 @@ BDENSE_TEMPLATE(void, swap_cols) ( return; } -BDENSE_TEMPLATE(void, zero_row) ( +template inline void BArrayDense:: zero_row ( size_t i, bool check_bounds ) { @@ -852,7 +866,7 @@ BDENSE_TEMPLATE(void, zero_row) ( } -BDENSE_TEMPLATE(void, zero_col) ( +template inline void BArrayDense:: zero_col ( size_t j, bool check_bounds ) { @@ -871,7 +885,7 @@ BDENSE_TEMPLATE(void, zero_col) ( } -BDENSE_TEMPLATE(void, transpose) () { +template inline void BArrayDense:: transpose () { // if (NCells == 0u) // { @@ -899,7 +913,7 @@ BDENSE_TEMPLATE(void, transpose) () { } -BDENSE_TEMPLATE(void, clear) ( +template inline void BArrayDense:: clear ( bool hard ) { @@ -913,7 +927,7 @@ BDENSE_TEMPLATE(void, clear) ( } -BDENSE_TEMPLATE(void, resize) ( +template inline void BArrayDense:: resize ( size_t N_, size_t M_ ) { @@ -949,7 +963,7 @@ BDENSE_TEMPLATE(void, resize) ( } -BDENSE_TEMPLATE(void, reserve) () { +template inline void BArrayDense:: reserve () { el.reserve(N * M); el_rowsums.reserve(N); @@ -958,7 +972,7 @@ BDENSE_TEMPLATE(void, reserve) () { } -BDENSE_TEMPLATE(void, print) ( +template inline void BArrayDense:: print ( const char * fmt, ... ) const @@ -992,17 +1006,17 @@ BDENSE_TEMPLATE(void, print) ( } -BDENSE_TEMPLATE(const std::vector< Cell_Type > &, get_data)() const +template inline const std::vector< Cell_Type > & BArrayDense:: get_data() const { return el; } -BDENSE_TEMPLATE(const Cell_Type, rowsum)(size_t i) const +template inline const Cell_Type BArrayDense:: rowsum(size_t i) const { return el_rowsums[i]; } -BDENSE_TEMPLATE(const Cell_Type, colsum)(size_t j) const +template inline const Cell_Type BArrayDense:: colsum(size_t j) const { return el_colsums[j]; } @@ -1012,9 +1026,6 @@ BDENSE_TEMPLATE(const Cell_Type, colsum)(size_t j) const #undef POS #undef POS_N -#undef BDENSE_TYPE -#undef BDENSE_TEMPLATE_ARGS -#undef BDENSE_TEMPLATE #undef ZERO_CELL #endif diff --git a/inst/include/barry/barraydensecell-meat.hpp b/inst/include/barry/barraydensecell-meat.hpp index e2a9fa4..137f31d 100644 --- a/inst/include/barry/barraydensecell-meat.hpp +++ b/inst/include/barry/barraydensecell-meat.hpp @@ -11,10 +11,17 @@ inline BArrayDenseCell& BArrayDenseCell(other); + #ifdef BARRY_DEBUG + Cell_Type old = dat->el.at(POS(i,j)); + dat->el.at(POS(i,j)) = val; + dat->el_rowsums.at(i) += (val - old); + dat->el_colsums.at(j) += (val - old); + #else Cell_Type old = dat->el[POS(i,j)]; dat->el[POS(i,j)] = val; dat->el_rowsums[i] += (val - old); dat->el_colsums[j] += (val - old); + #endif return *this; @@ -23,48 +30,81 @@ inline BArrayDenseCell& BArrayDenseCell inline void BArrayDenseCell::operator=(const Cell_Type & val) { + #ifdef BARRY_DEBUG + Cell_Type old = dat->el.at(POS(i,j)); + dat->el.at(POS(i,j)) = val; + dat->el_rowsums.at(i) += (val - old); + dat->el_colsums.at(j) += (val - old); + #else Cell_Type old = dat->el[POS(i,j)]; dat->el[POS(i,j)] = val; dat->el_rowsums[i] += (val - old); dat->el_colsums[j] += (val - old); + #endif } template inline void BArrayDenseCell::operator+=(const Cell_Type & val) { + #ifdef BARRY_DEBUG + dat->el.at(POS(i,j)) += val; + dat->el_rowsums.at(i) += val; + dat->el_colsums.at(j) += val; + #else dat->el[POS(i,j)] += val; dat->el_rowsums[i] += val; dat->el_colsums[j] += val; + #endif } template inline void BArrayDenseCell::operator-=(const Cell_Type & val) { + #ifdef BARRY_DEBUG + dat->el.at(POS(i,j)) -= val; + dat->el_rowsums.at(i) -= val; + dat->el_colsums.at(j) -= val; + #else dat->el[POS(i,j)] -= val; dat->el_rowsums[i] -= val; dat->el_colsums[j] -= val; + #endif } template inline void BArrayDenseCell::operator*=(const Cell_Type & val) { + #ifdef BARRY_DEBUG + Cell_Type old = dat->el.at(POS(i,j)); + dat->el_colsums.at(j) += (old * val - old); + dat->el_rowsums.at(i) += (old * val - old); + dat->el.at(POS(i,j)) *= val; + #else Cell_Type old = dat->el[POS(i,j)]; dat->el_colsums[j] += (old * val - old); dat->el_rowsums[i] += (old * val - old); dat->el[POS(i,j)] *= val; + #endif } template inline void BArrayDenseCell::operator/=(const Cell_Type & val) { + #ifdef BARRY_DEBUG + Cell_Type old = dat->el.at(POS(i,j)); + dat->el_rowsums.at(i) += (old/val - old); + dat->el_colsums.at(j) += (old/val - old); + dat->el.at(POS(i,j)) /= val; + #else Cell_Type old = dat->el[POS(i,j)]; dat->el_rowsums[i] += (old/val - old); dat->el_colsums[j] += (old/val - old); dat->el[POS(i,j)] /= val; + #endif } diff --git a/inst/include/barry/model-meat.hpp b/inst/include/barry/model-meat.hpp index f9f670d..371f6d0 100644 --- a/inst/include/barry/model-meat.hpp +++ b/inst/include/barry/model-meat.hpp @@ -19,7 +19,11 @@ inline double update_normalizing_constant( #ifdef __OPENMP #pragma omp simd reduction(+:res) #else - #pragma GCC ivdep + #ifdef __GNUC__ + #ifndef __clang__ + #pragma GCC ivdep + #endif + #endif #endif for (size_t i = 0u; i < n; ++i) { @@ -73,7 +77,11 @@ inline double likelihood_( #ifdef __OPENMP #pragma omp simd reduction(+:numerator) #else - #pragma GCC ivdep + #ifdef __GNUC__ + #ifndef __clang__ + #pragma GCC ivdep + #endif + #endif #endif for (size_t j = 0u; j < params.size(); ++j) numerator += *(stats_target + j) * params[j]; diff --git a/inst/include/barry/models/defm/defm-bones.hpp b/inst/include/barry/models/defm/defm-bones.hpp index 1aa7c15..c96b786 100644 --- a/inst/include/barry/models/defm/defm-bones.hpp +++ b/inst/include/barry/models/defm/defm-bones.hpp @@ -11,9 +11,14 @@ class DEFM : public DEFMModel { * @brief Model data */ ///@{ - const int * Y = nullptr; ///< Outcome variable - const int * ID = nullptr; ///< Individual ids - const double * X = nullptr; ///< Covariates + int * Y = nullptr; ///< Outcome variable + int * ID = nullptr; ///< Individual ids + double * X = nullptr; ///< Covariates + + // In case we need a copy of the data + std::shared_ptr> Y_shared; ///< Outcome variable + std::shared_ptr> ID_shared; ///< Individual ids + std::shared_ptr> X_shared;///< Covariates size_t N; ///< Number of agents/individuals size_t ID_length; ///< Length of the vector IDs @@ -32,17 +37,27 @@ class DEFM : public DEFMModel { public: DEFM( - const int * id, - const int * y, - const double * x, + int * id, + int * y, + double * x, size_t id_length, size_t y_ncol, size_t x_ncol, - size_t m_order + size_t m_order, + bool copy_data = true ); // ~DEFM() { + + // if (n_owners-- == 1) + // { + // delete[] Y; + // delete[] ID; + // delete[] X; + // } + // DEFMModel::~Model(); + // }; DEFMModel & get_model() { diff --git a/inst/include/barry/models/defm/defm-meat.hpp b/inst/include/barry/models/defm/defm-meat.hpp index 85f141c..0d7361c 100644 --- a/inst/include/barry/models/defm/defm-meat.hpp +++ b/inst/include/barry/models/defm/defm-meat.hpp @@ -102,19 +102,44 @@ inline void DEFM::simulate( } inline DEFM::DEFM( - const int * id, - const int * y, - const double * x, + int * id, + int * y, + double * x, size_t id_length, size_t y_ncol, size_t x_ncol, - size_t m_order + size_t m_order, + bool copy_data ) { // Pointers - ID = id; - Y = y; - X = x; + if (copy_data) + { + + ID_shared = std::make_shared< std::vector >(id_length); + Y_shared = std::make_shared< std::vector >(id_length * y_ncol); + X_shared = std::make_shared< std::vector >(id_length * x_ncol); + + for (size_t i = 0u; i < id_length; ++i) + ID_shared->at(i) = *(id + i); + + for (size_t i = 0u; i < (id_length * y_ncol); ++i) + Y_shared->at(i) = *(y + i); + + for (size_t i = 0u; i < (id_length * x_ncol); ++i) + X_shared->at(i) = *(x + i); + + ID = &ID_shared->at(0u); + Y = &Y_shared->at(0u); + X = &X_shared->at(0u); + + } else { + + ID = id; + Y = y; + X = x; + + } // Overall dimmensions ID_length = id_length; diff --git a/inst/include/barry/models/geese/geese-bones.hpp b/inst/include/barry/models/geese/geese-bones.hpp index 53c20fa..9daafc4 100644 --- a/inst/include/barry/models/geese/geese-bones.hpp +++ b/inst/include/barry/models/geese/geese-bones.hpp @@ -327,10 +327,10 @@ class Geese { * @return `get_support_fun()` returns the computed support of the model. */ ///@{ - std::mt19937 * get_rengine(); - PhyloCounters * get_counters(); - PhyloModel * get_model(); - PhyloSupport * get_support_fun(); + std::mt19937 * get_rengine(); + PhyloCounters * get_counters(); + PhyloModel * get_model(); + PhyloSupport * get_support_fun(); ///@} /** @@ -342,7 +342,8 @@ class Geese { * @return std::vector< std::vector< bool > > of length `2^P`. */ std::vector< std::vector< bool > > get_states() const; - std::vector< size_t > get_annotated_nodes() const; ///< Returns the ids of the nodes with at least one annotation + std::vector< size_t > get_annotated_nodes() const; ///< Returns the ids of the nodes with at least one annotation + std::vector< size_t > get_annotations() const; ///< Returns the annotations of the nodes with at least one annotation }; diff --git a/inst/include/barry/models/geese/geese-meat-constructors.hpp b/inst/include/barry/models/geese/geese-meat-constructors.hpp index f43ea30..5e75a2e 100644 --- a/inst/include/barry/models/geese/geese-meat-constructors.hpp +++ b/inst/include/barry/models/geese/geese-meat-constructors.hpp @@ -159,11 +159,11 @@ inline Geese::Geese( if (node.ord == std::numeric_limits< size_t >::max()) { - const char *fmt = "Node id %i was not included in geneid."; - int sz = std::snprintf(nullptr, 0, fmt, node.id); - std::vector buf(sz + 1); - std::snprintf(&buf[0], buf.size(), fmt, node.id); - throw std::logic_error(&buf[0]); + std::string msg = "Node id " + + std::to_string(node.id) + + " does not have an ord."; + + throw std::logic_error(msg); } @@ -171,11 +171,11 @@ inline Geese::Geese( if (node.duplication != duplication[node.ord]) { - const char *fmt = "Node id %i's duplication was not properly recorded."; - int sz = std::snprintf(nullptr, 0, fmt, node.id); - std::vector buf(sz + 1); - std::snprintf(&buf[0], buf.size(), fmt, node.id); - throw std::logic_error(&buf[0]); + std::string msg = "Node id " + + std::to_string(node.id) + + "'s duplication was not properly recorded."; + + throw std::logic_error(msg); } @@ -205,11 +205,10 @@ inline Geese::Geese( if (++ord_count[node.ord] > 1u) { - const char *fmt = "Node id %i's ord was repeated."; - int sz = std::snprintf(nullptr, 0, fmt, node.id); - std::vector buf(sz + 1); - std::snprintf(&buf[0], buf.size(), fmt, node.id); - throw std::logic_error(&buf[0]); + std::string msg = "Node id " + + std::to_string(node.id) + + "'s ord was repeated."; + throw std::logic_error(msg); } diff --git a/inst/include/barry/models/geese/geese-meat.hpp b/inst/include/barry/models/geese/geese-meat.hpp index be49ed8..8d5ba49 100644 --- a/inst/include/barry/models/geese/geese-meat.hpp +++ b/inst/include/barry/models/geese/geese-meat.hpp @@ -789,5 +789,33 @@ inline std::vector< size_t > Geese::get_annotated_nodes() const { } +inline std::vector< size_t > Geese::get_annotations() const { + + // Makeing space for the annotations + std::vector< size_t > ann(this->nfuns() * this->nnodes(), 9u); + size_t nrows = this->nnodes(); + for (auto & n : nodes) + { + + // Getting the location + size_t row = n.second.ord; + + // Counting non-9 annotations + for (size_t f = 0u; f < nfuns(); ++f) + { + // If it has one non-9, then add it to the list + // and continue to the next node. + if (n.second.annotations[f] != 9u) { + ann[f * nrows + row] = n.second.annotations[f]; + } + } + + + } + + return ann; + +} + #endif diff --git a/inst/include/barry/typedefs.hpp b/inst/include/barry/typedefs.hpp index e8a108c..bb46787 100644 --- a/inst/include/barry/typedefs.hpp +++ b/inst/include/barry/typedefs.hpp @@ -270,7 +270,11 @@ inline T vec_inner_prod( #ifdef __OPENM #pragma omp simd reduction(+:res) #else - #pragma GCC ivdep + #ifdef __GNUC__ + #ifndef __clang__ + #pragma GCC ivdep + #endif + #endif #endif for (size_t i = 0u; i < n; ++i) res += (*(a + i) * *(b + i)); @@ -293,7 +297,11 @@ inline double vec_inner_prod( #ifdef __OPENMP #pragma omp simd reduction(+:res) #else - #pragma GCC ivdep + #ifdef __GNUC__ + #ifndef __clang__ + #pragma GCC ivdep + #endif + #endif #endif for (size_t i = 0u; i < n; ++i) res += (*(a + i) * *(b + i)); diff --git a/inst/tinytest/test-aphylo.R b/inst/tinytest/test-aphylo.R index dc5c874..4554873 100644 --- a/inst/tinytest/test-aphylo.R +++ b/inst/tinytest/test-aphylo.R @@ -98,21 +98,31 @@ res <- geese_mle(amodel, lower = -10, upper = 10, method = "L-BFGS-B") plogis(res$par) coef(ans_aphylo_mle) -ans_geese <- predict_geese(amodel, res$par, only_annotated = TRUE) |> +ans_geese <- predict_geese(amodel, res$par, only_annotated = TRUE, leave_one_out = TRUE) |> do.call(what = rbind) -# predict_geese_simulate(amodel, res$par, seed = 212, nsim = 1000) |> -# do.call(what = rbind) +predict_geese_simulate(amodel, res$par, seed = 212, nsim = 100000) |> + do.call(what = rbind) # The absolute difference should be below 0.01 differences <- abs(head(plogis(res$par), -2) - head(coef(ans_aphylo_mle), -1))/ head(coef(ans_aphylo_mle), -1) -# prediction_score(ans_aphylo_mle) +prediction_score(ans_aphylo_mle) + +prediction_score(x = ans_geese, expected = do.call(rbind, fake1)) -# prediction_score( -# x = ans_geese, -# expected = do.call(rbind, fake1) -# ) +# plot(sort(predict(ans_aphylo_mle)[1:99,]) -sort(ans_geese[1:99,])) +aphylo_pred <- predict(ans_aphylo_mle, loo = FALSE) +(aphylo_pred[c(tree[, 2] + 1, n ), ] - + do.call(rbind, fake1)) |> abs() |> mean(na.rm = TRUE) + +all( + range(do.call(rbind, fake1) - get_annotations(amodel), na.rm = TRUE) == 0 + ) +# All equal +(aphylo_pred[c(tree[, 2] + 1, n ), ] - ans_geese) |> + abs() |> + hist() diff --git a/man/figures/README-mcmc-analysis-1.png b/man/figures/README-mcmc-analysis-1.png index cfc97be..884b23b 100644 Binary files a/man/figures/README-mcmc-analysis-1.png and b/man/figures/README-mcmc-analysis-1.png differ diff --git a/man/figures/README-mcmc-analysis-2.png b/man/figures/README-mcmc-analysis-2.png new file mode 100644 index 0000000..9634d4c Binary files /dev/null and b/man/figures/README-mcmc-analysis-2.png differ diff --git a/man/figures/README-mcmc-analysis-limited-1.png b/man/figures/README-mcmc-analysis-limited-1.png index fb7aeb3..6afdf33 100644 Binary files a/man/figures/README-mcmc-analysis-limited-1.png and b/man/figures/README-mcmc-analysis-limited-1.png differ diff --git a/man/figures/README-prediction-1.png b/man/figures/README-prediction-1.png index 57ca0e0..9ee6b47 100644 Binary files a/man/figures/README-prediction-1.png and b/man/figures/README-prediction-1.png differ diff --git a/man/figures/README-unnamed-chunk-2-1.png b/man/figures/README-unnamed-chunk-2-1.png new file mode 100644 index 0000000..6a77a0f Binary files /dev/null and b/man/figures/README-unnamed-chunk-2-1.png differ diff --git a/man/figures/README-viz-flock-1.png b/man/figures/README-viz-flock-1.png index 40659c5..39d7556 100644 Binary files a/man/figures/README-viz-flock-1.png and b/man/figures/README-viz-flock-1.png differ diff --git a/man/figures/README-viz-flock-2.png b/man/figures/README-viz-flock-2.png new file mode 100644 index 0000000..79a0ed8 Binary files /dev/null and b/man/figures/README-viz-flock-2.png differ diff --git a/man/figures/README-viz-with-aphylo-1.png b/man/figures/README-viz-with-aphylo-1.png index 05a33d1..781d4a1 100644 Binary files a/man/figures/README-viz-with-aphylo-1.png and b/man/figures/README-viz-with-aphylo-1.png differ diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 8052008..b71b892 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -264,6 +264,16 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// get_annotations_cpp +IntegerMatrix get_annotations_cpp(SEXP p); +RcppExport SEXP _geese_get_annotations_cpp(SEXP pSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::traits::input_parameter< SEXP >::type p(pSEXP); + rcpp_result_gen = Rcpp::wrap(get_annotations_cpp(p)); + return rcpp_result_gen; +END_RCPP +} // rule_limit_changes int rule_limit_changes(SEXP p, int term_pos, int lb, int ub, size_t duplication); RcppExport SEXP _geese_rule_limit_changes(SEXP pSEXP, SEXP term_posSEXP, SEXP lbSEXP, SEXP ubSEXP, SEXP duplicationSEXP) { @@ -279,7 +289,7 @@ BEGIN_RCPP END_RCPP } // predict_geese -std::vector< std::vector< double > > predict_geese(SEXP p, const std::vector< double >& par, bool leave_one_out, bool use_reduced_sequence, bool only_annotated); +std::vector< NumericVector > predict_geese(SEXP p, const std::vector< double >& par, bool leave_one_out, bool use_reduced_sequence, bool only_annotated); RcppExport SEXP _geese_predict_geese(SEXP pSEXP, SEXP parSEXP, SEXP leave_one_outSEXP, SEXP use_reduced_sequenceSEXP, SEXP only_annotatedSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; @@ -601,11 +611,11 @@ BEGIN_RCPP END_RCPP } // new_geese -SEXP new_geese(std::vector< std::vector< size_t > >& annotations, std::vector< size_t >& geneid, std::vector< int >& parent, std::vector< bool >& duplication); +SEXP new_geese(ListOf< IntegerVector >& annotations, std::vector< size_t >& geneid, std::vector< int >& parent, std::vector< bool >& duplication); RcppExport SEXP _geese_new_geese(SEXP annotationsSEXP, SEXP geneidSEXP, SEXP parentSEXP, SEXP duplicationSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; - Rcpp::traits::input_parameter< std::vector< std::vector< size_t > >& >::type annotations(annotationsSEXP); + Rcpp::traits::input_parameter< ListOf< IntegerVector >& >::type annotations(annotationsSEXP); Rcpp::traits::input_parameter< std::vector< size_t >& >::type geneid(geneidSEXP); Rcpp::traits::input_parameter< std::vector< int >& >::type parent(parentSEXP); Rcpp::traits::input_parameter< std::vector< bool >& >::type duplication(duplicationSEXP); @@ -638,6 +648,7 @@ static const R_CallMethodDef CallEntries[] = { {"_geese_print_geese", (DL_FUNC) &_geese_print_geese, 1}, {"_geese_print_nodes_cpp", (DL_FUNC) &_geese_print_nodes_cpp, 1}, {"_geese_get_support", (DL_FUNC) &_geese_get_support, 1}, + {"_geese_get_annotations_cpp", (DL_FUNC) &_geese_get_annotations_cpp, 1}, {"_geese_rule_limit_changes", (DL_FUNC) &_geese_rule_limit_changes, 5}, {"_geese_predict_geese", (DL_FUNC) &_geese_predict_geese, 5}, {"_geese_predict_flock", (DL_FUNC) &_geese_predict_flock, 5}, diff --git a/src/geese-common.cpp b/src/geese-common.cpp index 6e4b0e1..9fa8352 100644 --- a/src/geese-common.cpp +++ b/src/geese-common.cpp @@ -547,3 +547,29 @@ std::vector< NumericMatrix > get_support(SEXP p) } + +//' @export +// [[Rcpp::export(name = "get_annotations", rng = false)]] +IntegerMatrix get_annotations_cpp(SEXP p) { + + Rcpp::XPtr ptr(p); + + auto annotations = ptr->get_annotations(); + size_t nnodes = ptr->nnodes(); + size_t nfuns = ptr->nfuns(); + + IntegerMatrix ans(nnodes, nfuns); + + for (size_t i = 0u; i < nnodes; ++i) + { + for (size_t j = 0u; j < nfuns; ++j) + { + int tmp = static_cast(annotations[j * nnodes + i]); + ans(i, j) = tmp == 9 ? NA_INTEGER : tmp; + + } + } + + return ans; + +} \ No newline at end of file diff --git a/src/geese-predict.cpp b/src/geese-predict.cpp index a9beb90..0531298 100644 --- a/src/geese-predict.cpp +++ b/src/geese-predict.cpp @@ -8,7 +8,7 @@ using namespace Rcpp; //' @export //' @rdname geese-common // [[Rcpp::export(rng = false)]] -std::vector< std::vector< double > > predict_geese( +std::vector< NumericVector > predict_geese( SEXP p, const std::vector< double > & par, bool leave_one_out = false, @@ -17,15 +17,33 @@ std::vector< std::vector< double > > predict_geese( ) { // Preparing output - std::vector< std::vector< double > > res(0u); + std::vector< NumericVector > res; IF_GEESE(p) { Rcpp::XPtrptr(p); - res = ptr->predict( + auto res0 = ptr->predict( par, nullptr, leave_one_out, only_annotated, use_reduced_sequence ); + res.resize(res0.size()); + + for (const auto n: ptr->nodes) { + + res[n.second.ord] = wrap(res0[n.second.ord]); + + if (!only_annotated) + continue; + + for (size_t f = 0u; f < ptr->nfuns(); ++f) + { + if (n.second.annotations[f] == 9u) + res[n.second.ord][f] = NA_REAL; + } + + } + + } IF_FLOCK(p) { stop("Use -predict_flock- instead."); diff --git a/src/geese.cpp b/src/geese.cpp index 9043ade..ab8ba3a 100644 --- a/src/geese.cpp +++ b/src/geese.cpp @@ -16,14 +16,36 @@ using namespace Rcpp; //' @aliases geese // [[Rcpp::export(rng = false)]] SEXP new_geese( - std::vector< std::vector< size_t > > & annotations, + ListOf< IntegerVector > & annotations, std::vector< size_t > & geneid, std::vector< int > & parent, std::vector< bool > & duplication ) { + // Preprocessing annotations. (checking for data types) + std::vector< std::vector< size_t > > annotations2; + for (IntegerVector entry : annotations) { + + std::vector< size_t > tmp; + for (size_t i = 0u; i < entry.size(); ++i) { + if (Rcpp::traits::is_na(entry[i]) || entry[i] == 9) + tmp.push_back(9); + else if (entry[i] != 0 && entry[i] != 1) + stop("Invalid annotation value: %d", entry[i]); + else + tmp.push_back(entry[i]); + + } + + // Rcpp::print(wrap(tmp)); + + annotations2.push_back(tmp); + } + + // Rcpp::print(wrap(annotations2)); + Rcpp::XPtr dat( - new geese::Geese(annotations, geneid, parent, duplication + new geese::Geese(annotations2, geneid, parent, duplication )); dat.attr("class") = "geese";