Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Write vcf #57

Merged
merged 3 commits into from
Oct 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion R/gen_tibble.R
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,7 @@ gt_write_bigsnp_from_dfs <- function(genotypes, indiv_meta, loci,

map <- tibble(chromosome = loci$chromosome,
marker.ID = loci$name,
genetic.dist = loci$genetic_dist,
genetic.dist = as.double(loci$genetic_dist), ## make sure that genetic.dist is double
physical.pos = loci$position,
allele1 = loci$allele_alt,
allele2 = loci$allele_ref)
Expand Down
15 changes: 8 additions & 7 deletions R/gt_as_vcf.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,12 @@ gt_as_vcf <- function(x, file = NULL, chunk_size = NULL, overwrite = FALSE){
}

# set up chunks
chunks_vec <- c(
rep(chunk_size, floor(count_loci(x) / chunk_size)),
count_loci(x) %% chunk_size
)
chunks_vec_index <- c(1,cumsum(chunks_vec))
chunks_vec <- rep(chunk_size, floor(count_loci(x) / chunk_size))
if (count_loci(x) %% chunk_size != 0){
chunks_vec <- c(chunks_vec, count_loci(x) %% chunk_size)
}
chunks_vec_index <- c(0,cumsum(chunks_vec))


# generate the header
vcf_header <- c("##fileformat=VCFv4.3",
Expand Down Expand Up @@ -68,13 +69,13 @@ gt_as_vcf <- function(x, file = NULL, chunk_size = NULL, overwrite = FALSE){
for (chunk_i in seq_along(chunks_vec)) {
genotypes_matrix <- t(show_genotypes(x,
loci_indices =
chunks_vec_index[chunk_i]:chunks_vec_index[chunk_i+1]))
(chunks_vec_index[chunk_i]+1):chunks_vec_index[chunk_i+1]))
genotypes_matrix[genotypes_matrix==0] <- "0/0"
genotypes_matrix[genotypes_matrix==1] <- "0/1"
genotypes_matrix[genotypes_matrix==2] <- "1/1"
genotypes_matrix[is.na(genotypes_matrix)] <- "./."
# subset loci to this chunk
loci_sub <- show_loci(x)[chunks_vec_index[chunk_i]:chunks_vec_index[chunk_i+1],]
loci_sub <- show_loci(x)[(chunks_vec_index[chunk_i]+1):chunks_vec_index[chunk_i+1],]
# add the other columns needed for the
loci_cols <- c("chromosome", "position", "name", "allele_ref", "allele_alt")
loci_sub <- loci_sub %>% select(any_of(loci_cols)) %>%
Expand Down
2 changes: 1 addition & 1 deletion data-raw/ideas/allele_sharing.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ test_indiv_meta <- data.frame (id=c("a","b","c"),
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down
2 changes: 1 addition & 1 deletion data-raw/ideas/fst_gt_compare_to_hier.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ test_indiv_meta <- data.frame (id=c("a","b","c","d","e","f","g"),
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down
2 changes: 1 addition & 1 deletion data-raw/ideas/fst_hier.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ test_indiv_meta <- data.frame (id=c("a","b","c","d","e","f","g"),
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down
2 changes: 1 addition & 1 deletion data-raw/ideas/fst_wc.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ test_indiv_meta <- data.frame (id=c("a","b","c","d","e","f","g"),
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down
2 changes: 1 addition & 1 deletion data-raw/ideas/fst_wc2.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ test_indiv_meta <- data.frame (id=c("a","b","c","d","e","f","g"),
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down
2 changes: 1 addition & 1 deletion data-raw/worked_comparisons/compare_fst.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ test_indiv_meta <- data.frame (id=c("a","b","c","d","e","f","g"),
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down
14 changes: 7 additions & 7 deletions tests/testthat/test_gen_tibble.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ test_genotypes <- rbind(c(1,1,0,1,1,0),
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down Expand Up @@ -112,7 +112,7 @@ test_that("gen_tibble loci is dataframe or tbl",{
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))
wrong_loci_matrix <- as.matrix(test_loci)
Expand Down Expand Up @@ -150,7 +150,7 @@ test_that("gen_tibble identifies wrong loci table columns",{
wrong_loci <- data.frame(a=paste0("rs",1:6),
b=paste0("chr",c(1,1,1,1,2,2)),
c=as.integer(c(3,5,65,343,23,456)),
d = as.integer(rep(0,6)),
d = as.double(rep(0,6)),
e = c("A","T","C","G","C","T"),
f = c("T","C", NA,"C","G","A"))
expect_error(test_dfs_gt <- gen_tibble(test_genotypes, indiv_meta = test_indiv_meta,
Expand Down Expand Up @@ -437,7 +437,7 @@ test_genotypes <- rbind(c(1,1,0,1,1,0),
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down Expand Up @@ -492,7 +492,7 @@ test_that("chr_int is always an integer",{
test_loci_fac <- data.frame(name=paste0("rs",1:6),
chromosome=as.factor(paste0("chr",c(1,1,1,1,2,2))),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))
test_gt <- gen_tibble(x = test_genotypes, loci = test_loci_fac, indiv_meta = test_indiv_meta, quiet = TRUE)
Expand Down Expand Up @@ -531,7 +531,7 @@ test_genotypes <- rbind(c(1,1,0,1,1,0),
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down Expand Up @@ -590,7 +590,7 @@ test_that("additional vcf tests with larger file",{
# test_loci <- data.frame(name=paste0("rs",1:6),
# chromosome=paste0("chr",c(1,1,1,1,2,2)),
# position=as.integer(c(3,5,65,343,23,456)),
# genetic_dist = as.integer(rep(0,6)),
# genetic_dist = as.double(rep(0,6)),
# allele_ref = c("A","T","C","G","C","T"),
# allele_alt = c("T","C", NA,"C","G","A"))
# test_loci_wrong <- test_loci
Expand Down
2 changes: 1 addition & 1 deletion tests/testthat/test_gen_tibble_save_load.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ test_genotypes <- rbind(c(1,1,0,1,1,0),
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down
2 changes: 1 addition & 1 deletion tests/testthat/test_group_by.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ test_genotypes <- rbind(c(1,1,0,1,1,0),
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down
2 changes: 1 addition & 1 deletion tests/testthat/test_gt_as_.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ test_that("show_loci gets and sets information",{
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down
2 changes: 1 addition & 1 deletion tests/testthat/test_gt_as_plink.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ test_genotypes <- rbind(c(1,1,0,1,1,0),
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down
45 changes: 44 additions & 1 deletion tests/testthat/test_gt_as_vcf.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ test_genotypes <- rbind(c(1,1,0,1,1,0),
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down Expand Up @@ -43,3 +43,46 @@ test_that("test reading and writing is equivalent",{
expect_true(identical(show_genotypes(pop_a_vcf), show_genotypes(pop_a_vcf_rewrite)))
expect_true(identical(show_loci(pop_a_vcf), show_loci(pop_a_vcf_rewrite)))
})


test_that("test reading and writing with chunking is equivalent cpp",{
# write out to vcf with chunk size - smaller chunk size = more duplicates
vcf_path_chunked <- gt_as_vcf(test_gt, file = paste0(tempfile(),".vcf"), chunk_size = 2)
# read vcf back in with cpp
test_gt_chunked <- gen_tibble(vcf_path_chunked, quiet=TRUE, parser = "cpp", backingfile = tempfile("anolis_chunk_"))
# check they are the same
expect_true(identical(show_genotypes(test_gt), show_genotypes(test_gt_chunked)))
expect_true(identical(show_loci(test_gt), show_loci(test_gt_chunked)))
expect_equal(count_loci(test_gt_chunked), count_loci(test_gt))

# now do the same with chunk size that is not an exact divisor of the number of loci
vcf_path_chunked <- gt_as_vcf(test_gt, file = paste0(tempfile(),".vcf"), chunk_size = 4)
# read vcf back in with cpp
test_gt_chunked <- gen_tibble(vcf_path_chunked, quiet=TRUE, parser = "cpp", backingfile = tempfile("anolis_chunk_"))
# check they are the same
expect_true(identical(show_genotypes(test_gt), show_genotypes(test_gt_chunked)))
expect_true(identical(show_loci(test_gt), show_loci(test_gt_chunked)))
expect_equal(count_loci(test_gt_chunked), count_loci(test_gt))


})


test_that("test reading and writing with chunking is equivalent rvcf",{
# write out to vcf with chunk size
vcf_path_chunked <- gt_as_vcf(test_gt, file = paste0(tempfile(),".vcf"), chunk_size = 2)
# read vcf back in with vcfR
test_gt_chunked <- gen_tibble(vcf_path_chunked, quiet=TRUE, parser = "vcfR", backingfile = tempfile("anolis_chunk_"))
# check they are the same
expect_true(identical(show_genotypes(test_gt), show_genotypes(test_gt_chunked)))
expect_true(identical(show_loci(test_gt), show_loci(test_gt_chunked)))
expect_equal(count_loci(test_gt_chunked), count_loci(test_gt))
})








2 changes: 1 addition & 1 deletion tests/testthat/test_gt_extract_f2.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ test_indiv_meta <- data.frame (id=c("a","b","c","d","e","f","g"),
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down
4 changes: 2 additions & 2 deletions tests/testthat/test_indiv_het_obs.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ test_that("indiv_het_obs computes correctly",{
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=c(1,1,1,1,2,2),
position=c(3,5,65,343,23,456),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand All @@ -33,7 +33,7 @@ test_that("indiv_het_obs returns 0's when all genotypes are homozygous", {
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=c(1,1,1,1,2,2),
position=c(3,5,65,343,23,456),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down
2 changes: 1 addition & 1 deletion tests/testthat/test_indiv_missingness.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ test_genotypes <- rbind(c(1,1,0,1,1,2),
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=c(1,1,1,1,2,2),
position=c(3,5,65,343,23,456),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down
4 changes: 2 additions & 2 deletions tests/testthat/test_loci_freq.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ test_that("loci_alt_freq and loci_maf computes correctly",{
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=c(1,1,1,1,2,2),
position=c(3,5,65,343,23,456),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))
test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE)
Expand Down Expand Up @@ -77,7 +77,7 @@ test_that("loci_alt_freq and loci_maf on grouped tibbles",{
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down
4 changes: 2 additions & 2 deletions tests/testthat/test_loci_ld_clump.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ test_genotypes <- rbind(c(2,2,0,1,1,2),
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=c(1,1,1,1,2,2),
position=c(3,5,65,343,23,456),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down Expand Up @@ -99,7 +99,7 @@ test_that("loci order",{
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=c("chr2","chr1","chr1","chr1","chr2","chr1"),
position=c(3,5,65,343,23,456),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down
4 changes: 2 additions & 2 deletions tests/testthat/test_loci_missingness.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ test_that("loci_missingness",{
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=c(1,1,1,1,2,2),
position=c(3,5,65,343,23,456),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down Expand Up @@ -48,7 +48,7 @@ test_that("loci_missingness on grouped tibble",{
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down
4 changes: 2 additions & 2 deletions tests/testthat/test_loci_transversions_transitions.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ test_that("find transitions and transversions",{
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=c(1,1,1,1,2,2),
position=c(3,5,65,343,23,456),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand All @@ -26,7 +26,7 @@ test_that("check warning message for different alleles",{
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=c(1,1,1,1,2,2),
position=c(3,5,65,343,23,456),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("a","t","c","g","c","t"),
allele_alt = c("t","c", NA,"c","g","a"))

Expand Down
2 changes: 1 addition & 1 deletion tests/testthat/test_pairwise_allele_sharing.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ test_indiv_meta <- data.frame (id=c("a","b","c"),
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
genetic_dist = as.double(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

Expand Down
Loading
Loading