Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Write ped #34

Merged
merged 5 commits into from
May 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions R/gen_tibble.R
Original file line number Diff line number Diff line change
Expand Up @@ -116,11 +116,42 @@ gen_tibble_bed_rds <- function(x, ...,

indiv_meta <- list(id = bigsnp_obj$fam$sample.ID,
population = bigsnp_obj$fam$family.ID)
# TODO check if other elements are informative; if they are, bring them over


indiv_meta$genotypes <- new_vctrs_bigsnp(bigsnp_obj,
bigsnp_file = bigsnp_path,
indiv_id = bigsnp_obj$fam$sample.ID)

# transfer some of the fam info to the metadata table if it is not missing (0 is the default missing value)
fam_info <- .gt_get_bigsnp(indiv_meta)$fam
if(!all(fam_info$paternal.id==0)){
indiv_meta$paternal_ID <- fam_info$paternal.id
indiv_meta$paternal_ID[indiv_meta$paternal_ID==0]<-NA
}
if(!all(fam_info$maternal.id==0)){
indiv_meta$maternal_ID <- fam_info$maternal.id
indiv_meta$maternal_ID[indiv_meta$maternal_ID==0]<-NA
}
if(!all(fam_info$sex==0)){
indiv_meta$sex <- dplyr::case_match(
fam_info$sex,
1 ~ "male",
2 ~ "female",
.default = NA,
.ptype = factor(levels = c("female", "male"))
)
}
if(!all(fam_info$affection %in% c(0,-9))){
indiv_meta$phenotype <- dplyr::case_match(
fam_info$affection,
1 ~ "control",
2 ~ "case",
-9 ~ NA,
.default = NA,
.ptype = factor(levels = c("control", "case"))
)
}
new_gen_tbl <- tibble::new_tibble(
indiv_meta,
class = "gen_tbl"
Expand Down
6 changes: 3 additions & 3 deletions R/gen_tibble_ped.R
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,10 @@ read.pedfile <- function(file, n, snps, which, split="\t| +", sep=".",
# r2 <- as.raw(2)
# r3 <- as.raw(3)

r0 <- 3
r1 <- 2
r0 <- NA_integer_
r1 <- 0
r2 <- 1
r3 <- 0
r3 <- 2

## Input file
con <- gzfile(file)
Expand Down
161 changes: 146 additions & 15 deletions R/gt_as_plink.R
Original file line number Diff line number Diff line change
@@ -1,38 +1,169 @@
#' Export a `gen_tibble` object to PLINK bed format
#'
#' This function exports all the information of a `gen_tibble` object into
#' a PLINK bed file.
#' a PLINK bed, ped or raw file (and associated files, i.e. .bim and .fam for .bed;
#' .fam for .ped).
#'
#' @param x a [`gen_tibble`] object
#' @param bedfile a character string giving the path to output bed file.
#' @param file a character string giving the path to output file. If left to NULL,
#' the output file will have the same path and prefix of the backingfile.
#' @param type one of "bed", "ped" or "raw"
#' @param overwrite boolean whether to overwrite the file.
#' @returns TRUE if successful
#' @returns the path of the saved file
#' @export


gt_as_plink <- function(x, bedfile = NULL,
gt_as_plink <- function(x, file = NULL, type = c("bed","ped","raw"),
overwrite = TRUE){
if (is.null(bedfile)){
bedfile <- bigstatsr::sub_bk(attr(x$genotypes,"bigsnp")$genotypes$backingfile,".bed")
type <- match.arg(type)

if (is.null(file)){
file <- bigstatsr::sub_bk(attr(x$genotypes,"bigsnp")$genotypes$backingfile,paste0(".",type))
}
if (file_ext(file)!=type){
file <- paste0(file,".",type)
}
if (file_ext(bedfile)!="bed"){
bedfile <- paste0(bedfile,".bed")

if (type=="bed"){
all_files <- c(file,
bigsnpr::sub_bed(file,".bim"),
bigsnpr::sub_bed(file,".fam")
)
} else if (type=="ped"){
all_files <- c(file,
gsub(".ped",".fam",file))
} else if (type=="raw"){
all_files <- file
}
if (file.exists(bedfile)){

if (any(file.exists(all_files))){
if (overwrite){
file.remove(bedfile)
file.remove(bigsnpr::sub_bed(bedfile,".bim"))
file.remove(bigsnpr::sub_bed(bedfile,".fam"))
} else {
stop(bedfile," already exists; remove if first or set 'overwrite' = TRUE")
file.remove(all_files[file.exists(all_files)])
} else {
stop("at least one of", all_files," already exists; remove if first or set 'overwrite' = TRUE")
}
}

if (type == "bed"){
gt_write_bed(x, file)
} else {
gt_write_ped_raw(x,file,type)
}
}


gt_write_bed <- function(x, file) {
bed_path <- bigsnpr::snp_writeBed(attr(x$genotypes,"bigsnp"),
bedfile = bedfile,
bedfile = file,
ind.row = vctrs::vec_data(x$genotypes),
ind.col = show_loci(x)$big_index)
# the bim and fam file only contain the original information in the bigSNP object
# TODO we should update them with the info from the gentibble
bed_path
}



gt_write_ped_raw <- function(x, file, plink_format = c("raw","ped"), chunk_size = 10000){

# loci information
loci <- show_loci(x)
# replace missing value with zero
loci$allele_alt[is.na(loci$allele_alt)]<-"0"

# create col names to use in the raw file
raw_col_names<- c("FID", "IID", "PAT", "MAT", "SEX", "PHENOTYPE",
paste0(loci$name,"_",toupper(loci$allele_alt),"(/",toupper(loci$allele_ref),")"))

# create the info for the fam file
indiv_meta <- tibble(population = x$population,
id = x$id,
pat = pull_NA(x, "pat"),
mat = pull_NA(x, "mat"),
sex = pull_NA(x, "sex"),
phenotype = pull_NA(x, "phenotype")
)
# recode some variables
indiv_meta$sex<- dplyr::case_match(as.character(indiv_meta$sex),'female'~'2','male'~'1',.default="0")
indiv_meta$pat[is.na(indiv_meta$pat)]<-0
indiv_meta$mat[is.na(indiv_meta$mat)]<-0
indiv_meta$phenotype<-dplyr::case_match(as.character(indiv_meta$phenotype),
'control'~"1",'case'~"2", .default=indiv_meta$phenotype )
indiv_meta$phenotype[is.na(indiv_meta$phenotype)]<--9

# create chunks
n_ind <- nrow(x)
chunks <- split(1:n_ind, ceiling(seq_along(1:n_ind)/chunk_size))

# loop over to write
for (i_chunk in seq_len(length(chunks))){
chunk <- chunks[[i_chunk]]
raw_table <- cbind(indiv_meta[chunk,],
show_genotypes(x[chunk,]))

# now recode the genotypes with letters if raw
if (plink_format=="ped"){
for (i in 1:(ncol(raw_table)-6)){
raw_table[,i+6]<-recode_genotype(raw_table[,i+6], loci$allele_ref[i], loci$allele_alt[i])
}
}

colnames(raw_table) <- raw_col_names
# append column names only the first time, when the file does not exist
utils::write.table(raw_table,
file=file,
sep = " ",
row.names = FALSE,
col.names=(!file.exists(file) & plink_format=="raw"),
append = file.exists(file),
quote = FALSE)

}

## create info for the map file
loci_meta <- show_loci(x)
map_file <- paste0(substr(file, 1, nchar(file)-4),".map")
map_table <- tibble(chrom = pull_NA(loci_meta,"chromosome"),
id = pull_NA(loci_meta,"name"),
cM = pull_NA(loci_meta,"cM"),
position = pull_NA(loci_meta,"position"))
map_table[is.na(map_table)]<-0

utils::write.table(
map_table,
file = map_file,
row.names = FALSE,
col.names = FALSE,
quote = FALSE
)
return(file)
}


# internal function returning either a vector from a given column or NA if it does not exist
pull_NA <- function(.x, .col_name) {
if (.col_name %in% names(.x)){
return(.x %>% pull(.col_name))
} else {
return(rep(NA,nrow(.x)))
}
}

# recode dosage as letter genotypes
recode_genotype <- function(x, allele_ref, allele_alt){
x <-as.character(x)
genotypes <- c(paste(allele_ref, allele_ref),
paste(allele_ref, allele_alt),
paste(allele_alt, allele_alt))
x <- dplyr::case_match(
x,
"0" ~ genotypes[1],
"1" ~ genotypes[2],
"2" ~ genotypes[3]
)
x[is.na(x)]<-c("0 0")
x

}


Binary file added inst/extdata/pop_a.bk
Binary file not shown.
5 changes: 5 additions & 0 deletions inst/extdata/pop_a.ped
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pop_a GRC14300079 0 0 1 -9 A A A A G G A A G G T G G G C C T T C C T T G G T T A G T T T T
pop_a GRC14300142 0 0 1 -9 A A A A G G A A G G G G G G C C T T G C A T G G T T G G C T T T
pop_a GRC14300159 0 0 1 -9 A A G A A G A A A A T G G G C C T T C C T T G G T T A G T T T T
pop_a GRC14300286 0 0 1 -9 A A A A G G A A G G G G G G C C T T G C T T G G T T A G C T T T
pop_a GRC14300305 0 0 1 -9 A A A A A G A A A G T T G G C C T T C C T T G G T T A G T T T T
Binary file added inst/extdata/pop_a.rds
Binary file not shown.
12 changes: 8 additions & 4 deletions man/gt_as_plink.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 4 additions & 4 deletions src/snp_as.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ inline arma::mat FBM_RW2arma(Rcpp::Environment BM) {
size_t n = macc.nrow();
size_t m = macc.ncol();

for (int j = 0; j < m; j++) {
for (int i = 0; i < n; i++) {
for (unsigned int j = 0; j < m; j++) {
for (unsigned int i = 0; i < n; i++) {
int value = (macc(i,j));
if (value <3){
dos_mat(i, j) = value-1; // note that we want the matrix of (dos-1)
Expand All @@ -53,10 +53,10 @@ inline arma::mat FBM_RW2arma(Rcpp::Environment BM) {
}}

// fill the rest with 0s (should be 1 column max)
int m2 = dos_mat.n_cols;
size_t m2 = dos_mat.n_cols;
if (m2 > m) {
myassert(m2 == (m + 1), ERROR_BUG);
for (int i = 0; i < n; i++) {
for (unsigned int i = 0; i < n; i++) {
dos_mat(i, m) = 1;
na_mat(i, m) = 0;
}
Expand Down
8 changes: 4 additions & 4 deletions src/snp_ibs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ inline arma::mat FBM_RW2arma(Rcpp::Environment BM) {
size_t n = macc.nrow();
size_t m = macc.ncol();

for (int j = 0; j < m; j++) {
for (int i = 0; i < n; i++) {
for (unsigned int j = 0; j < m; j++) {
for (unsigned int i = 0; i < n; i++) {
int value = (macc(i,j));
if (value == 0){
genotype0(i, j) = 1;
Expand All @@ -55,10 +55,10 @@ inline arma::mat FBM_RW2arma(Rcpp::Environment BM) {
}}

// fill the rest with 0s (should be 1 column max)
int m2 = genotype0.n_cols;
size_t m2 = genotype0.n_cols;
if (m2 > m) {
myassert(m2 == (m + 1), ERROR_BUG);
for (int i = 0; i < n; i++) {
for (unsigned int i = 0; i < n; i++) {
genotype0(i, m) = 0;
genotype1(i, m) = 0;
genotype2(i, m) = 0;
Expand Down
8 changes: 4 additions & 4 deletions src/snp_king.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ inline arma::mat FBM_RW2arma(Rcpp::Environment BM) {
size_t n = macc.nrow();
size_t m = macc.ncol();

for (int j = 0; j < m; j++) {
for (int i = 0; i < n; i++) {
for (unsigned int j = 0; j < m; j++) {
for (unsigned int i = 0; i < n; i++) {
int value = (macc(i,j));
if (value == 0){
genotype0(i, j) = 1;
Expand All @@ -58,10 +58,10 @@ inline arma::mat FBM_RW2arma(Rcpp::Environment BM) {
}}

// fill the rest with 0s (should be 1 column max)
int m2 = genotype0.n_cols;
size_t m2 = genotype0.n_cols;
if (m2 > m) {
myassert(m2 == (m + 1), ERROR_BUG);
for (int i = 0; i < n; i++) {
for (unsigned int i = 0; i < n; i++) {
genotype0(i, m) = 0;
genotype1(i, m) = 0;
genotype2(i, m) = 0;
Expand Down
26 changes: 14 additions & 12 deletions tests/testthat/test_gen_tibble.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ test_loci <- data.frame(name=paste0("rs",1:6),
test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE)

# this also tests show_genotypes and show_loci
test_that("create gen_tibble from bed",{
test_that("create gen_tibble from dfs",{
expect_true(inherits(test_gt,"gen_tbl"))
# we can extract the genotypes correctly
extracted_genotypes <- test_gt %>% show_genotypes()
Expand All @@ -32,17 +32,6 @@ test_that("create gen_tibble from bed",{

})

# now create it directly from the dfs
test_that("create gen_tibble from dfs",{
test_dfs_gt <- gen_tibble(test_genotypes, indiv_meta = test_indiv_meta,
loci = test_loci, quiet = TRUE)
# because of the different backing file info, we cannot use identical on the whole object
expect_true(identical(show_genotypes(test_gt), show_genotypes(test_dfs_gt)))
expect_true(identical(show_loci(test_gt), show_loci(test_dfs_gt)))
expect_true(identical(test_gt %>% select(-genotypes),
test_dfs_gt %>% select(-genotypes)))
})

test_genotypes_c <- rbind(c("1","1","0","1","1","0"),
c("2","1","0","0","0","0"),
c("2","2","0","0","1","1"))
Expand Down Expand Up @@ -81,4 +70,17 @@ test_that("gen_tibble catches invalid alleles",{

})

test_that("gen_tibble from a bed file",{
bed_path <- system.file("extdata/pop_a.bed", package = "tidypopgen")
pop_a_gt <- gen_tibble(bed_path, quiet=TRUE, backingfile = tempfile())
# now read the dosages created by plink when saving in raw format
raw_file_pop_a <- read.table(system.file("extdata/pop_a.raw", package = "tidypopgen"), header= TRUE)
mat <- as.matrix(raw_file_pop_a[,7:ncol(raw_file_pop_a)])
mat <- unname(mat)
expect_true(all.equal(mat,show_genotypes(pop_a_gt)))
# now read in the ped file
ped_path <- system.file("extdata/pop_a.ped", package = "tidypopgen")
pop_a_ped_gt <- gen_tibble(ped_path, quiet=TRUE,backingfile = tempfile())
all.equal(show_genotypes(pop_a_gt),show_genotypes(pop_a_ped_gt))

})
Loading