diff --git a/R/gen_tibble.R b/R/gen_tibble.R index 38eec8e7..eff5275c 100644 --- a/R/gen_tibble.R +++ b/R/gen_tibble.R @@ -116,11 +116,42 @@ gen_tibble_bed_rds <- function(x, ..., indiv_meta <- list(id = bigsnp_obj$fam$sample.ID, population = bigsnp_obj$fam$family.ID) + # TODO check if other elements are informative; if they are, bring them over + indiv_meta$genotypes <- new_vctrs_bigsnp(bigsnp_obj, bigsnp_file = bigsnp_path, indiv_id = bigsnp_obj$fam$sample.ID) + # transfer some of the fam info to the metadata table if it is not missing (0 is the default missing value) + fam_info <- .gt_get_bigsnp(indiv_meta)$fam + if(!all(fam_info$paternal.id==0)){ + indiv_meta$paternal_ID <- fam_info$paternal.id + indiv_meta$paternal_ID[indiv_meta$paternal_ID==0]<-NA + } + if(!all(fam_info$maternal.id==0)){ + indiv_meta$maternal_ID <- fam_info$maternal.id + indiv_meta$maternal_ID[indiv_meta$maternal_ID==0]<-NA + } + if(!all(fam_info$sex==0)){ + indiv_meta$sex <- dplyr::case_match( + fam_info$sex, + 1 ~ "male", + 2 ~ "female", + .default = NA, + .ptype = factor(levels = c("female", "male")) + ) + } + if(!all(fam_info$affection %in% c(0,-9))){ + indiv_meta$phenotype <- dplyr::case_match( + fam_info$affection, + 1 ~ "control", + 2 ~ "case", + -9 ~ NA, + .default = NA, + .ptype = factor(levels = c("control", "case")) + ) + } new_gen_tbl <- tibble::new_tibble( indiv_meta, class = "gen_tbl" diff --git a/R/gen_tibble_ped.R b/R/gen_tibble_ped.R index 3401a625..b596a072 100644 --- a/R/gen_tibble_ped.R +++ b/R/gen_tibble_ped.R @@ -39,10 +39,10 @@ read.pedfile <- function(file, n, snps, which, split="\t| +", sep=".", # r2 <- as.raw(2) # r3 <- as.raw(3) - r0 <- 3 - r1 <- 2 + r0 <- NA_integer_ + r1 <- 0 r2 <- 1 - r3 <- 0 + r3 <- 2 ## Input file con <- gzfile(file) diff --git a/R/gt_as_plink.R b/R/gt_as_plink.R index f4a2f16a..461b5e3f 100644 --- a/R/gt_as_plink.R +++ b/R/gt_as_plink.R @@ -1,38 +1,169 @@ #' Export a `gen_tibble` object to PLINK bed format #' #' This function exports all the information of a `gen_tibble` object into -#' a PLINK bed file. +#' a PLINK bed, ped or raw file (and associated files, i.e. .bim and .fam for .bed; +#' .fam for .ped). #' #' @param x a [`gen_tibble`] object -#' @param bedfile a character string giving the path to output bed file. +#' @param file a character string giving the path to output file. If left to NULL, +#' the output file will have the same path and prefix of the backingfile. +#' @param type one of "bed", "ped" or "raw" #' @param overwrite boolean whether to overwrite the file. -#' @returns TRUE if successful +#' @returns the path of the saved file #' @export -gt_as_plink <- function(x, bedfile = NULL, +gt_as_plink <- function(x, file = NULL, type = c("bed","ped","raw"), overwrite = TRUE){ - if (is.null(bedfile)){ - bedfile <- bigstatsr::sub_bk(attr(x$genotypes,"bigsnp")$genotypes$backingfile,".bed") + type <- match.arg(type) + + if (is.null(file)){ + file <- bigstatsr::sub_bk(attr(x$genotypes,"bigsnp")$genotypes$backingfile,paste0(".",type)) + } + if (file_ext(file)!=type){ + file <- paste0(file,".",type) } - if (file_ext(bedfile)!="bed"){ - bedfile <- paste0(bedfile,".bed") + + if (type=="bed"){ + all_files <- c(file, + bigsnpr::sub_bed(file,".bim"), + bigsnpr::sub_bed(file,".fam") + ) + } else if (type=="ped"){ + all_files <- c(file, + gsub(".ped",".fam",file)) + } else if (type=="raw"){ + all_files <- file } - if (file.exists(bedfile)){ + + if (any(file.exists(all_files))){ if (overwrite){ - file.remove(bedfile) - file.remove(bigsnpr::sub_bed(bedfile,".bim")) - file.remove(bigsnpr::sub_bed(bedfile,".fam")) - } else { - stop(bedfile," already exists; remove if first or set 'overwrite' = TRUE") + file.remove(all_files[file.exists(all_files)]) + } else { + stop("at least one of", all_files," already exists; remove if first or set 'overwrite' = TRUE") } } + if (type == "bed"){ + gt_write_bed(x, file) + } else { + gt_write_ped_raw(x,file,type) + } +} + + +gt_write_bed <- function(x, file) { bed_path <- bigsnpr::snp_writeBed(attr(x$genotypes,"bigsnp"), - bedfile = bedfile, + bedfile = file, ind.row = vctrs::vec_data(x$genotypes), ind.col = show_loci(x)$big_index) # the bim and fam file only contain the original information in the bigSNP object # TODO we should update them with the info from the gentibble bed_path } + + + +gt_write_ped_raw <- function(x, file, plink_format = c("raw","ped"), chunk_size = 10000){ + + # loci information + loci <- show_loci(x) + # replace missing value with zero + loci$allele_alt[is.na(loci$allele_alt)]<-"0" + + # create col names to use in the raw file + raw_col_names<- c("FID", "IID", "PAT", "MAT", "SEX", "PHENOTYPE", + paste0(loci$name,"_",toupper(loci$allele_alt),"(/",toupper(loci$allele_ref),")")) + + # create the info for the fam file + indiv_meta <- tibble(population = x$population, + id = x$id, + pat = pull_NA(x, "pat"), + mat = pull_NA(x, "mat"), + sex = pull_NA(x, "sex"), + phenotype = pull_NA(x, "phenotype") + ) + # recode some variables + indiv_meta$sex<- dplyr::case_match(as.character(indiv_meta$sex),'female'~'2','male'~'1',.default="0") + indiv_meta$pat[is.na(indiv_meta$pat)]<-0 + indiv_meta$mat[is.na(indiv_meta$mat)]<-0 + indiv_meta$phenotype<-dplyr::case_match(as.character(indiv_meta$phenotype), + 'control'~"1",'case'~"2", .default=indiv_meta$phenotype ) + indiv_meta$phenotype[is.na(indiv_meta$phenotype)]<--9 + + # create chunks + n_ind <- nrow(x) + chunks <- split(1:n_ind, ceiling(seq_along(1:n_ind)/chunk_size)) + + # loop over to write + for (i_chunk in seq_len(length(chunks))){ + chunk <- chunks[[i_chunk]] + raw_table <- cbind(indiv_meta[chunk,], + show_genotypes(x[chunk,])) + + # now recode the genotypes with letters if raw + if (plink_format=="ped"){ + for (i in 1:(ncol(raw_table)-6)){ + raw_table[,i+6]<-recode_genotype(raw_table[,i+6], loci$allele_ref[i], loci$allele_alt[i]) + } + } + + colnames(raw_table) <- raw_col_names + # append column names only the first time, when the file does not exist + utils::write.table(raw_table, + file=file, + sep = " ", + row.names = FALSE, + col.names=(!file.exists(file) & plink_format=="raw"), + append = file.exists(file), + quote = FALSE) + + } + + ## create info for the map file + loci_meta <- show_loci(x) + map_file <- paste0(substr(file, 1, nchar(file)-4),".map") + map_table <- tibble(chrom = pull_NA(loci_meta,"chromosome"), + id = pull_NA(loci_meta,"name"), + cM = pull_NA(loci_meta,"cM"), + position = pull_NA(loci_meta,"position")) + map_table[is.na(map_table)]<-0 + + utils::write.table( + map_table, + file = map_file, + row.names = FALSE, + col.names = FALSE, + quote = FALSE + ) + return(file) +} + + +# internal function returning either a vector from a given column or NA if it does not exist +pull_NA <- function(.x, .col_name) { + if (.col_name %in% names(.x)){ + return(.x %>% pull(.col_name)) + } else { + return(rep(NA,nrow(.x))) + } +} + +# recode dosage as letter genotypes +recode_genotype <- function(x, allele_ref, allele_alt){ + x <-as.character(x) + genotypes <- c(paste(allele_ref, allele_ref), + paste(allele_ref, allele_alt), + paste(allele_alt, allele_alt)) + x <- dplyr::case_match( + x, + "0" ~ genotypes[1], + "1" ~ genotypes[2], + "2" ~ genotypes[3] + ) + x[is.na(x)]<-c("0 0") + x + +} + + diff --git a/inst/extdata/pop_a.bk b/inst/extdata/pop_a.bk new file mode 100644 index 00000000..57b68c9f Binary files /dev/null and b/inst/extdata/pop_a.bk differ diff --git a/inst/extdata/pop_a.ped b/inst/extdata/pop_a.ped new file mode 100644 index 00000000..a48c2989 --- /dev/null +++ b/inst/extdata/pop_a.ped @@ -0,0 +1,5 @@ +pop_a GRC14300079 0 0 1 -9 A A A A G G A A G G T G G G C C T T C C T T G G T T A G T T T T +pop_a GRC14300142 0 0 1 -9 A A A A G G A A G G G G G G C C T T G C A T G G T T G G C T T T +pop_a GRC14300159 0 0 1 -9 A A G A A G A A A A T G G G C C T T C C T T G G T T A G T T T T +pop_a GRC14300286 0 0 1 -9 A A A A G G A A G G G G G G C C T T G C T T G G T T A G C T T T +pop_a GRC14300305 0 0 1 -9 A A A A A G A A A G T T G G C C T T C C T T G G T T A G T T T T diff --git a/inst/extdata/pop_a.rds b/inst/extdata/pop_a.rds new file mode 100644 index 00000000..2710662d Binary files /dev/null and b/inst/extdata/pop_a.rds differ diff --git a/man/gt_as_plink.Rd b/man/gt_as_plink.Rd index f2665bcf..5bbae9d2 100644 --- a/man/gt_as_plink.Rd +++ b/man/gt_as_plink.Rd @@ -4,19 +4,23 @@ \alias{gt_as_plink} \title{Export a \code{gen_tibble} object to PLINK bed format} \usage{ -gt_as_plink(x, bedfile = NULL, overwrite = TRUE) +gt_as_plink(x, file = NULL, type = c("bed", "ped", "raw"), overwrite = TRUE) } \arguments{ \item{x}{a \code{\link{gen_tibble}} object} -\item{bedfile}{a character string giving the path to output bed file.} +\item{file}{a character string giving the path to output file. If left to NULL, +the output file will have the same path and prefix of the backingfile.} + +\item{type}{one of "bed", "ped" or "raw"} \item{overwrite}{boolean whether to overwrite the file.} } \value{ -TRUE if successful +the path of the saved file } \description{ This function exports all the information of a \code{gen_tibble} object into -a PLINK bed file. +a PLINK bed, ped or raw file (and associated files, i.e. .bim and .fam for .bed; +.fam for .ped). } diff --git a/src/snp_as.cpp b/src/snp_as.cpp index 18d7c861..2ae71a7e 100644 --- a/src/snp_as.cpp +++ b/src/snp_as.cpp @@ -41,8 +41,8 @@ inline arma::mat FBM_RW2arma(Rcpp::Environment BM) { size_t n = macc.nrow(); size_t m = macc.ncol(); - for (int j = 0; j < m; j++) { - for (int i = 0; i < n; i++) { + for (unsigned int j = 0; j < m; j++) { + for (unsigned int i = 0; i < n; i++) { int value = (macc(i,j)); if (value <3){ dos_mat(i, j) = value-1; // note that we want the matrix of (dos-1) @@ -53,10 +53,10 @@ inline arma::mat FBM_RW2arma(Rcpp::Environment BM) { }} // fill the rest with 0s (should be 1 column max) - int m2 = dos_mat.n_cols; + size_t m2 = dos_mat.n_cols; if (m2 > m) { myassert(m2 == (m + 1), ERROR_BUG); - for (int i = 0; i < n; i++) { + for (unsigned int i = 0; i < n; i++) { dos_mat(i, m) = 1; na_mat(i, m) = 0; } diff --git a/src/snp_ibs.cpp b/src/snp_ibs.cpp index 606fca43..d741a2ce 100644 --- a/src/snp_ibs.cpp +++ b/src/snp_ibs.cpp @@ -42,8 +42,8 @@ inline arma::mat FBM_RW2arma(Rcpp::Environment BM) { size_t n = macc.nrow(); size_t m = macc.ncol(); - for (int j = 0; j < m; j++) { - for (int i = 0; i < n; i++) { + for (unsigned int j = 0; j < m; j++) { + for (unsigned int i = 0; i < n; i++) { int value = (macc(i,j)); if (value == 0){ genotype0(i, j) = 1; @@ -55,10 +55,10 @@ inline arma::mat FBM_RW2arma(Rcpp::Environment BM) { }} // fill the rest with 0s (should be 1 column max) - int m2 = genotype0.n_cols; + size_t m2 = genotype0.n_cols; if (m2 > m) { myassert(m2 == (m + 1), ERROR_BUG); - for (int i = 0; i < n; i++) { + for (unsigned int i = 0; i < n; i++) { genotype0(i, m) = 0; genotype1(i, m) = 0; genotype2(i, m) = 0; diff --git a/src/snp_king.cpp b/src/snp_king.cpp index 1d51418a..2b8d542f 100644 --- a/src/snp_king.cpp +++ b/src/snp_king.cpp @@ -42,8 +42,8 @@ inline arma::mat FBM_RW2arma(Rcpp::Environment BM) { size_t n = macc.nrow(); size_t m = macc.ncol(); - for (int j = 0; j < m; j++) { - for (int i = 0; i < n; i++) { + for (unsigned int j = 0; j < m; j++) { + for (unsigned int i = 0; i < n; i++) { int value = (macc(i,j)); if (value == 0){ genotype0(i, j) = 1; @@ -58,10 +58,10 @@ inline arma::mat FBM_RW2arma(Rcpp::Environment BM) { }} // fill the rest with 0s (should be 1 column max) - int m2 = genotype0.n_cols; + size_t m2 = genotype0.n_cols; if (m2 > m) { myassert(m2 == (m + 1), ERROR_BUG); - for (int i = 0; i < n; i++) { + for (unsigned int i = 0; i < n; i++) { genotype0(i, m) = 0; genotype1(i, m) = 0; genotype2(i, m) = 0; diff --git a/tests/testthat/test_gen_tibble.R b/tests/testthat/test_gen_tibble.R index 2cf81aa5..5b2460b0 100644 --- a/tests/testthat/test_gen_tibble.R +++ b/tests/testthat/test_gen_tibble.R @@ -14,7 +14,7 @@ test_loci <- data.frame(name=paste0("rs",1:6), test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE) # this also tests show_genotypes and show_loci -test_that("create gen_tibble from bed",{ +test_that("create gen_tibble from dfs",{ expect_true(inherits(test_gt,"gen_tbl")) # we can extract the genotypes correctly extracted_genotypes <- test_gt %>% show_genotypes() @@ -32,17 +32,6 @@ test_that("create gen_tibble from bed",{ }) -# now create it directly from the dfs -test_that("create gen_tibble from dfs",{ - test_dfs_gt <- gen_tibble(test_genotypes, indiv_meta = test_indiv_meta, - loci = test_loci, quiet = TRUE) - # because of the different backing file info, we cannot use identical on the whole object - expect_true(identical(show_genotypes(test_gt), show_genotypes(test_dfs_gt))) - expect_true(identical(show_loci(test_gt), show_loci(test_dfs_gt))) - expect_true(identical(test_gt %>% select(-genotypes), - test_dfs_gt %>% select(-genotypes))) -}) - test_genotypes_c <- rbind(c("1","1","0","1","1","0"), c("2","1","0","0","0","0"), c("2","2","0","0","1","1")) @@ -81,4 +70,17 @@ test_that("gen_tibble catches invalid alleles",{ }) +test_that("gen_tibble from a bed file",{ + bed_path <- system.file("extdata/pop_a.bed", package = "tidypopgen") + pop_a_gt <- gen_tibble(bed_path, quiet=TRUE, backingfile = tempfile()) + # now read the dosages created by plink when saving in raw format + raw_file_pop_a <- read.table(system.file("extdata/pop_a.raw", package = "tidypopgen"), header= TRUE) + mat <- as.matrix(raw_file_pop_a[,7:ncol(raw_file_pop_a)]) + mat <- unname(mat) + expect_true(all.equal(mat,show_genotypes(pop_a_gt))) + # now read in the ped file + ped_path <- system.file("extdata/pop_a.ped", package = "tidypopgen") + pop_a_ped_gt <- gen_tibble(ped_path, quiet=TRUE,backingfile = tempfile()) + all.equal(show_genotypes(pop_a_gt),show_genotypes(pop_a_ped_gt)) +}) diff --git a/tests/testthat/test_gt_as_plink.R b/tests/testthat/test_gt_as_plink.R index c43c1dfd..ca49ecb3 100644 --- a/tests/testthat/test_gt_as_plink.R +++ b/tests/testthat/test_gt_as_plink.R @@ -3,7 +3,7 @@ test_indiv_meta <- data.frame (id=c("a","b","c"), population = c("pop1","pop1","pop2")) test_genotypes <- rbind(c(1,1,0,1,1,0), c(2,1,0,0,0,0), - c(2,2,0,0,1,1)) + c(2,2,0,0,NA,1)) test_loci <- data.frame(name=paste0("rs",1:6), chromosome=paste0("chr",c(1,1,1,1,2,2)), position=as.integer(c(3,5,65,343,23,456)), @@ -16,7 +16,7 @@ test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_in # this also tests show_genotypes and show_loci test_that("write a bed file",{ - bed_path <- gt_as_plink(test_gt, bedfile = paste0(tempfile(),".bed")) + bed_path <- gt_as_plink(test_gt, file = paste0(tempfile(),".bed")) # now read the file back in test_gt2 <- gen_tibble(bed_path, quiet=TRUE) ## continue here @@ -30,5 +30,18 @@ test_that("write a bed file",{ expect_true(is.na(show_loci(test_gt2)$allele_alt[3])) + # now write it as a ped + ped_path <- gt_as_plink(test_gt, file = paste0(tempfile(),".ped"), type = "ped") + test_gt3 <- gen_tibble(ped_path, quiet=TRUE) + # the gen tibble from the bed and ped should contain the same information + expect_true(all.equal(show_loci(test_gt3),show_loci(test_gt2), check.attributes=FALSE)) + expect_true(all.equal(show_genotypes(test_gt3),show_genotypes(test_gt2))) + + # write it as raw + raw_path <- gt_as_plink(test_gt,file=tempfile(),type="raw") + raw_file_test <- read.table(raw_path, header= TRUE) + mat <- as.matrix(raw_file_test[,7:ncol(raw_file_test)]) + mat <- unname(mat) + expect_true(all.equal(mat,show_genotypes(test_gt))) }) diff --git a/vignettes/example_clustering_and_dapc.Rmd b/vignettes/example_clustering_and_dapc.Rmd index 49923c39..8a69afcd 100644 --- a/vignettes/example_clustering_and_dapc.Rmd +++ b/vignettes/example_clustering_and_dapc.Rmd @@ -3,7 +3,7 @@ title: "example workflow with tidypopgen" output: rmarkdown::html_vignette #pdf_document vignette: > - %\VignetteIndexEntry{example_workflow} + %\VignetteIndexEntry{example workflow with tidypopgen} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- diff --git a/vignettes/overview.Rmd b/vignettes/overview.Rmd index c1fb692e..7d8e3925 100644 --- a/vignettes/overview.Rmd +++ b/vignettes/overview.Rmd @@ -367,12 +367,14 @@ write a file in the appropriate format, and return the name of that file on comp For example, to export to a PLINK .bed file, we simply use: ```{r} -gt_as_plink(example_gt, bedfile = tempfile("new_bed_")) +gt_as_plink(example_gt, file = tempfile("new_bed_")) ``` This will also write a .bim and .fam file and save them together with the .bed file. Note that, from the main tibble, only on `id`, `population` and `sex` will be preserved in the .fam file. +It is also possible to write .ped and .raw files by specifying `type="ped"` or +`type="raw"` in `gt_as_plink()` (see the help page for `gt_as_plink()` for details). # Merging data