Skip to content

Commit

Permalink
Merge pull request #57 from EvolEcolGroup/write_vcf
Browse files Browse the repository at this point in the history
Write vcf
  • Loading branch information
dramanica authored Oct 25, 2024
2 parents 1a19203 + bee3aba commit 0d4e1f3
Show file tree
Hide file tree
Showing 31 changed files with 94 additions and 50 deletions.
2 changes: 1 addition & 1 deletion R/gen_tibble.R
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,7 @@ gt_write_bigsnp_from_dfs <- function(genotypes, indiv_meta, loci,

map <- tibble(chromosome = loci$chromosome,
marker.ID = loci$name,
genetic.dist = loci$genetic_dist,
genetic.dist = as.double(loci$genetic_dist), ## make sure that genetic.dist is double
physical.pos = loci$position,
allele1 = loci$allele_alt,
allele2 = loci$allele_ref)
Expand Down
15 changes: 8 additions & 7 deletions R/gt_as_vcf.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,12 @@ gt_as_vcf <- function(x, file = NULL, chunk_size = NULL, overwrite = FALSE){
}

# set up chunks
chunks_vec <- c(
rep(chunk_size, floor(count_loci(x) / chunk_size)),
count_loci(x) %% chunk_size
)
chunks_vec_index <- c(1,cumsum(chunks_vec))
chunks_vec <- rep(chunk_size, floor(count_loci(x) / chunk_size))
if (count_loci(x) %% chunk_size != 0){
chunks_vec <- c(chunks_vec, count_loci(x) %% chunk_size)
}
chunks_vec_index <- c(0,cumsum(chunks_vec))


# generate the header
vcf_header <- c("##fileformat=VCFv4.3",
Expand Down Expand Up @@ -68,13 +69,13 @@ gt_as_vcf <- function(x, file = NULL, chunk_size = NULL, overwrite = FALSE){
for (chunk_i in seq_along(chunks_vec)) {
genotypes_matrix <- t(show_genotypes(x,
loci_indices =
chunks_vec_index[chunk_i]:chunks_vec_index[chunk_i+1]))
(chunks_vec_index[chunk_i]+1):chunks_vec_index[chunk_i+1]))
genotypes_matrix[genotypes_matrix==0] <- "0/0"
genotypes_matrix[genotypes_matrix==1] <- "0/1"
genotypes_matrix[genotypes_matrix==2] <- "1/1"
genotypes_matrix[is.na(genotypes_matrix)] <- "./."
# subset loci to this chunk
loci_sub <- show_loci(x)[chunks_vec_index[chunk_i]:chunks_vec_index[chunk_i+1],]
loci_sub <- show_loci(x)[(chunks_vec_index[chunk_i]+1):chunks_vec_index[chunk_i+1],]
# add the other columns needed for the
loci_cols <- c("chromosome", "position", "name", "allele_ref", "allele_alt")
loci_sub <- loci_sub %>% select(any_of(loci_cols)) %>%
Expand Down
2 changes: 1 addition & 1 deletion data-raw/ideas/allele_sharing.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ test_indiv_meta <- data.frame (id=c("a","b","c"),
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down
2 changes: 1 addition & 1 deletion data-raw/ideas/fst_gt_compare_to_hier.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ test_indiv_meta <- data.frame (id=c("a","b","c","d","e","f","g"),
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down
2 changes: 1 addition & 1 deletion data-raw/ideas/fst_hier.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ test_indiv_meta <- data.frame (id=c("a","b","c","d","e","f","g"),
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down
2 changes: 1 addition & 1 deletion data-raw/ideas/fst_wc.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ test_indiv_meta <- data.frame (id=c("a","b","c","d","e","f","g"),
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down
2 changes: 1 addition & 1 deletion data-raw/ideas/fst_wc2.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ test_indiv_meta <- data.frame (id=c("a","b","c","d","e","f","g"),
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down
2 changes: 1 addition & 1 deletion data-raw/worked_comparisons/compare_fst.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ test_indiv_meta <- data.frame (id=c("a","b","c","d","e","f","g"),
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))
Expand Down
14 changes: 7 additions & 7 deletions tests/testthat/test_gen_tibble.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ test_genotypes <- rbind(c(1,1,0,1,1,0),
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down Expand Up @@ -112,7 +112,7 @@ test_that("gen_tibble loci is dataframe or tbl",{
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))
wrong_loci_matrix <- as.matrix(test_loci)
Expand Down Expand Up @@ -150,7 +150,7 @@ test_that("gen_tibble identifies wrong loci table columns",{
wrong_loci <- data.frame(a=paste0("rs",1:6),
b=paste0("chr",c(1,1,1,1,2,2)),
c=as.integer(c(3,5,65,343,23,456)),
d = as.integer(rep(0,6)),
d = as.double(rep(0,6)),
e = c("A","T","C","G","C","T"),
f = c("T","C", NA,"C","G","A"))
expect_error(test_dfs_gt <- gen_tibble(test_genotypes, indiv_meta = test_indiv_meta,
Expand Down Expand Up @@ -437,7 +437,7 @@ test_genotypes <- rbind(c(1,1,0,1,1,0),
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down Expand Up @@ -492,7 +492,7 @@ test_that("chr_int is always an integer",{
test_loci_fac <- data.frame(name=paste0("rs",1:6),
chromosome=as.factor(paste0("chr",c(1,1,1,1,2,2))),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))
test_gt <- gen_tibble(x = test_genotypes, loci = test_loci_fac, indiv_meta = test_indiv_meta, quiet = TRUE)
Expand Down Expand Up @@ -531,7 +531,7 @@ test_genotypes <- rbind(c(1,1,0,1,1,0),
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down Expand Up @@ -590,7 +590,7 @@ test_that("additional vcf tests with larger file",{
# test_loci <- data.frame(name=paste0("rs",1:6),
# chromosome=paste0("chr",c(1,1,1,1,2,2)),
# position=as.integer(c(3,5,65,343,23,456)),
# genetic_dist = as.integer(rep(0,6)),
# genetic_dist = as.double(rep(0,6)),
# allele_ref = c("A","T","C","G","C","T"),
# allele_alt = c("T","C", NA,"C","G","A"))
# test_loci_wrong <- test_loci
Expand Down
2 changes: 1 addition & 1 deletion tests/testthat/test_gen_tibble_save_load.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ test_genotypes <- rbind(c(1,1,0,1,1,0),
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down
2 changes: 1 addition & 1 deletion tests/testthat/test_group_by.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ test_genotypes <- rbind(c(1,1,0,1,1,0),
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down
2 changes: 1 addition & 1 deletion tests/testthat/test_gt_as_.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ test_that("show_loci gets and sets information",{
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down
2 changes: 1 addition & 1 deletion tests/testthat/test_gt_as_plink.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ test_genotypes <- rbind(c(1,1,0,1,1,0),
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down
45 changes: 44 additions & 1 deletion tests/testthat/test_gt_as_vcf.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ test_genotypes <- rbind(c(1,1,0,1,1,0),
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down Expand Up @@ -43,3 +43,46 @@ test_that("test reading and writing is equivalent",{
expect_true(identical(show_genotypes(pop_a_vcf), show_genotypes(pop_a_vcf_rewrite)))
expect_true(identical(show_loci(pop_a_vcf), show_loci(pop_a_vcf_rewrite)))
})


test_that("test reading and writing with chunking is equivalent cpp",{
# write out to vcf with chunk size - smaller chunk size = more duplicates
vcf_path_chunked <- gt_as_vcf(test_gt, file = paste0(tempfile(),".vcf"), chunk_size = 2)
# read vcf back in with cpp
test_gt_chunked <- gen_tibble(vcf_path_chunked, quiet=TRUE, parser = "cpp", backingfile = tempfile("anolis_chunk_"))
# check they are the same
expect_true(identical(show_genotypes(test_gt), show_genotypes(test_gt_chunked)))
expect_true(identical(show_loci(test_gt), show_loci(test_gt_chunked)))
expect_equal(count_loci(test_gt_chunked), count_loci(test_gt))

# now do the same with chunk size that is not an exact divisor of the number of loci
vcf_path_chunked <- gt_as_vcf(test_gt, file = paste0(tempfile(),".vcf"), chunk_size = 4)
# read vcf back in with cpp
test_gt_chunked <- gen_tibble(vcf_path_chunked, quiet=TRUE, parser = "cpp", backingfile = tempfile("anolis_chunk_"))
# check they are the same
expect_true(identical(show_genotypes(test_gt), show_genotypes(test_gt_chunked)))
expect_true(identical(show_loci(test_gt), show_loci(test_gt_chunked)))
expect_equal(count_loci(test_gt_chunked), count_loci(test_gt))


})


test_that("test reading and writing with chunking is equivalent rvcf",{
# write out to vcf with chunk size
vcf_path_chunked <- gt_as_vcf(test_gt, file = paste0(tempfile(),".vcf"), chunk_size = 2)
# read vcf back in with vcfR
test_gt_chunked <- gen_tibble(vcf_path_chunked, quiet=TRUE, parser = "vcfR", backingfile = tempfile("anolis_chunk_"))
# check they are the same
expect_true(identical(show_genotypes(test_gt), show_genotypes(test_gt_chunked)))
expect_true(identical(show_loci(test_gt), show_loci(test_gt_chunked)))
expect_equal(count_loci(test_gt_chunked), count_loci(test_gt))
})








2 changes: 1 addition & 1 deletion tests/testthat/test_gt_extract_f2.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ test_indiv_meta <- data.frame (id=c("a","b","c","d","e","f","g"),
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down
4 changes: 2 additions & 2 deletions tests/testthat/test_indiv_het_obs.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ test_that("indiv_het_obs computes correctly",{
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=c(1,1,1,1,2,2),
position=c(3,5,65,343,23,456),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand All @@ -33,7 +33,7 @@ test_that("indiv_het_obs returns 0's when all genotypes are homozygous", {
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=c(1,1,1,1,2,2),
position=c(3,5,65,343,23,456),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down
2 changes: 1 addition & 1 deletion tests/testthat/test_indiv_missingness.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ test_genotypes <- rbind(c(1,1,0,1,1,2),
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=c(1,1,1,1,2,2),
position=c(3,5,65,343,23,456),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down
4 changes: 2 additions & 2 deletions tests/testthat/test_loci_freq.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ test_that("loci_alt_freq and loci_maf computes correctly",{
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=c(1,1,1,1,2,2),
position=c(3,5,65,343,23,456),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))
test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE)
Expand Down Expand Up @@ -77,7 +77,7 @@ test_that("loci_alt_freq and loci_maf on grouped tibbles",{
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down
4 changes: 2 additions & 2 deletions tests/testthat/test_loci_ld_clump.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ test_genotypes <- rbind(c(2,2,0,1,1,2),
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=c(1,1,1,1,2,2),
position=c(3,5,65,343,23,456),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down Expand Up @@ -99,7 +99,7 @@ test_that("loci order",{
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=c("chr2","chr1","chr1","chr1","chr2","chr1"),
position=c(3,5,65,343,23,456),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down
4 changes: 2 additions & 2 deletions tests/testthat/test_loci_missingness.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ test_that("loci_missingness",{
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=c(1,1,1,1,2,2),
position=c(3,5,65,343,23,456),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down Expand Up @@ -48,7 +48,7 @@ test_that("loci_missingness on grouped tibble",{
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down
4 changes: 2 additions & 2 deletions tests/testthat/test_loci_transversions_transitions.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ test_that("find transitions and transversions",{
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=c(1,1,1,1,2,2),
position=c(3,5,65,343,23,456),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand All @@ -26,7 +26,7 @@ test_that("check warning message for different alleles",{
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=c(1,1,1,1,2,2),
position=c(3,5,65,343,23,456),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("a","t","c","g","c","t"),
allele_alt = c("t","c", NA,"c","g","a"))

Expand Down
2 changes: 1 addition & 1 deletion tests/testthat/test_pairwise_allele_sharing.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ test_indiv_meta <- data.frame (id=c("a","b","c"),
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down
Loading

0 comments on commit 0d4e1f3

Please sign in to comment.