Skip to content

Commit

Permalink
zUMIs2.5 improve BC error correction
Browse files Browse the repository at this point in the history
  • Loading branch information
cziegenhain committed Jul 23, 2019
1 parent 6b55fc8 commit aae3813
Show file tree
Hide file tree
Showing 11 changed files with 184 additions and 82 deletions.
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ We provide a script to convert zUMIs output into loom file automatically based o
To convert zUMIs output to loom, simply run `Rscript rds2loom.R myRun.yaml`.

## Changelog
23 Jun 2019: [zUMIs2.5.0 released](https://github.com/sdparekh/zUMIs/releases/tag/2.5.0).
Updated the behavior related to barcode detection: cell BC detection now occurs at the end of the filtering step. Additionally, cell BC correction by hamming distance is dramatically improved and its output stored in the bam file ("BC" tag) produced by zUMIs for downstream processing. Raw barcode sequences are stored in a new tag "BX". We recommend to always use this option from here on. Output bam files are now ordered by cell barcode. Bugfix related to downsampling potentially crashing when chunk-wise processing occurs to save memory.

30 Apr 2019: zUMIs2.4.5: Bugfix: Prevent crashing of statistics for large datasets.

29 Apr 2019: zUMIs2.4.4: Faster mapping by piping STAR SAM output into threaded samtools BAM compression.
Expand All @@ -22,7 +25,7 @@ To convert zUMIs output to loom, simply run `Rscript rds2loom.R myRun.yaml`.

23 Apr 2019: zUMIs2.4.2: chunk-wise samtools processing while counting is now running in parallel

22 Apr 2019: zUMI2.4.1: Various bugfixes; Initial splitting of fastq files is now much faster, especially for large datasets by estimating the number of reads instead of explicitly counting the lines of the input files.
22 Apr 2019: zUMI2.4.1: Various bugfixes; Initial splitting of fastq files is now much faster, especially for large datasets by estimating the number of reads instead of explicitly counting the lines of the input files.

29 Mar 2019: [zUMIs2.4.0 released](https://github.com/sdparekh/zUMIs/releases/tag/2.4.0).
Improved stats to support protocols without UMIs. Creation of stats now also supports read group-chunking to reduce RAM usage. Rsubread::featureCounts multimapping settings were corrected. zUMIs does not create the intermediate "postmap" YAML file anymore - all options are stored in the user-provided YAML. zUMIs can now run RNA velocity for you (set option velocyto to "yes" in the YAML file). We assume velocyto is installed in path. Implemented a check for correct YAML formatting to prevent runs with bad config files. Barcode detection has been refined and now supports automatic detection by zUMIs guided by a whitelist of possible barcodes (eg. for 10xGenomics data). Thus we have introduced a new flag in the barcode section of the YAML file which controls the automatic barcode detection (eg. automatic: yes).
Expand Down
54 changes: 25 additions & 29 deletions UMIstuffFUN.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ splitRG<-function(bccount,mem){
}
if( (maxR > 2e+09 & opt$read_layout == "SE") | (maxR > 1e+09 & opt$read_layout == "PE") ){
maxR <- ifelse(opt$read_layout == "SE",2e+09,1e+09)
}
}
print(paste(maxR,"Reads per chunk"))
nc<-nrow(bccount)
cs=0
Expand Down Expand Up @@ -51,53 +51,49 @@ prep_samtools <- function(featfiles,bccount,cores,samtoolsexc){
nfiles=length(featfiles)
nchunks <- length(unique(bccount$chunkID))
all_rgfiles <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".RGgroup.",1:nchunks,".txt")


for(i in unique(bccount$chunkID)){
rgfile <- all_rgfiles[i]
chunks <- bccount[chunkID==i]$XC
if(opt$barcodes$BarcodeBinning > 0){
write.table(file=rgfile,c(chunks,binmap[,falseBC]),col.names = F,quote = F,row.names = F)
}else{
write.table(file=rgfile,chunks,col.names = F,quote = F,row.names = F)
}
write.table(file=rgfile,chunks,col.names = F,quote = F,row.names = F)
}

headerXX <- paste( c(paste0("V",1:3)) ,collapse="\t")
write(headerXX,"freadHeader")

headercommand <- "cat freadHeader > "
samcommand <- paste(samtoolsexc," view -x NH -x AS -x nM -x HI -x IH -x NM -x uT -x MD -x jM -x jI -x XN -x XS -@")
samcommand <- paste(samtoolsexc," view -x BX -x NH -x AS -x nM -x HI -x IH -x NM -x uT -x MD -x jM -x jI -x XN -x XS -@")
grepcommand <- " | cut -f12,13,14 | sed 's/BC:Z://' | sed 's/UB:Z://' | sed 's/XT:Z://' | grep -F -f "

outfiles_ex <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".ex.",1:nchunks,".txt")
system(paste(headercommand,outfiles_ex,collapse = "; "))

if(length(featfiles)==1){
cpusperchunk <- round(cores/nchunks,0)
ex_cmd <- paste(samcommand,cpusperchunk,featfiles[1],grepcommand,all_rgfiles,">>",outfiles_ex," & ",collapse = " ")

system(paste(ex_cmd,"wait"))
}else{
cpusperchunk <- round(cores/(2*nchunks),0)
ex_cmd <- paste(samcommand,cpusperchunk,featfiles[1],grepcommand,all_rgfiles,">>",outfiles_ex," & ",collapse = " ")

outfiles_in <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".in.",1:nchunks,".txt")
system(paste(headercommand,outfiles_in,collapse = "; "))

in_cmd <- paste(samcommand,cpusperchunk,featfiles[2],grepcommand,all_rgfiles,">>",outfiles_in," & ",collapse = " ")

system(paste(ex_cmd,in_cmd,"wait"))
}
system("rm freadHeader")
system(paste("rm",all_rgfiles))

return(outfiles_ex)
}

#reads2genes <- function(featfiles,chunks,rgfile,cores,samtoolsexc){
reads2genes <- function(featfiles,chunkID){

#nfiles=length(featfiles)
#if(opt$barcodes$BarcodeBinning > 0){
# write.table(file=rgfile,c(chunks,binmap[,falseBC]),col.names = F,quote = F,row.names = F)
Expand All @@ -109,7 +105,7 @@ reads2genes <- function(featfiles,chunkID){
#write(headerXX,"freadHeader")
#samcommand<-paste("cat freadHeader; ",samtoolsexc," view -x NH -x AS -x nM -x HI -x IH -x NM -x uT -x MD -x jM -x jI -x XN -x XS -@",cores)
samfile_ex <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".ex.",chunkID,".txt")

if(length(featfiles)==1){
#reads<-data.table::fread(paste(samcommand,featfiles[1],"| cut -f12,13,14 | sed 's/BC:Z://' | sed 's/UB:Z://' | sed 's/XT:Z://' | grep -F -f ",rgfile), na.strings=c(""),
reads<-data.table::fread(samfile_ex, na.strings=c(""),
Expand All @@ -133,14 +129,14 @@ reads2genes <- function(featfiles,chunkID){
}
#system("rm freadHeader")
system(paste("rm",samfile_ex))

if(opt$read_layout == "PE"){
reads <- reads[ seq(1,nrow(reads),2) ]
}
if(opt$barcodes$BarcodeBinning > 0){
reads[RG %in% binmap[,falseBC], RG := binmap[match(RG,binmap[,falseBC]),trueBC]]
}
#if(opt$barcodes$BarcodeBinning > 0){
# reads[RG %in% binmap[,falseBC], RG := binmap[match(RG,binmap[,falseBC]),trueBC]]
#}

setkey(reads,RG)

return( reads[GE!="NA"] )
Expand Down Expand Up @@ -175,7 +171,7 @@ ham_helper_fun <- function(x){
tempdf <- x[
,list(umicount = hammingFilter(UB[!is.na(UB)],edit = opt$counting_opts$Ham_Dist,gbcid=paste(RG,GE,sep="_")),
readcount = .N), by=c("RG","GE")]

return(tempdf)
}

Expand Down Expand Up @@ -231,7 +227,7 @@ umiCollapseHam<-function(reads,bccount, nmin=0,nmax=Inf,ftype=c("intron","exon")
out <- mclapply(readsamples_list,ham_helper_fun, mc.cores = opt$num_threads, mc.preschedule = TRUE)
#out <- parLapply(cl=cl,readsamples_list,ham_helper_fun) #calculate hammings in parallel
df <- data.table::rbindlist(out) #bind list into single DT

print("Finished multi-threaded hamming distances")
#stopCluster(cl)
gc()
Expand All @@ -241,13 +237,13 @@ umiFUNs<-list(umiCollapseID=umiCollapseID, umiCollapseHam=umiCollapseHam)

check_nonUMIcollapse <- function(seqfiles){
#decide wether to run in UMI or no-UMI mode
UMI_check <- lapply(seqfiles,
UMI_check <- lapply(seqfiles,
function(x) {
if(!is.null(x$base_definition)) {
if(any(grepl("^UMI",x$base_definition))) return("UMI method detected.")
}
})

umi_decision <- ifelse(length(unlist(UMI_check))>0,"UMI","nonUMI")
return(umi_decision)
}
Expand Down
54 changes: 54 additions & 0 deletions correct_BCtag.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/usr/bin/perl
use warnings;

if(@ARGV != 4)
{
print
"\n#####################################################################################
Usage: perl $0 <inbam> <outbam> <BCbinmap> <samtools-executable> \n
Please drop your suggestions and clarifications to <christoph.ziegenhain\@ki.se>\n
######################################################################################\n\n";
exit;
}
BEGIN{
$inbam=$ARGV[0];
$outbam=$ARGV[1];
$binmap=$ARGV[2];
$samtoolsexc=$ARGV[3];
}

open(INBAM, "$samtoolsexc view $inbam | sed 's/BC:Z://' | " ) || die "Couldn't open file $inbam. Check permissions!\n Check if it is a bam file and it exists\n\n";
open(BCBAM,"| $samtoolsexc view -b -o $outbam -");


my %bcmap;
open(DATA, "cat $binmap | sed 's/,/\t/g' | cut -f1,3 | grep -v 'falseBC' | ") || die "Can't open $binmap ! \n";
while (<DATA>) {
my ($raw, @fixedBC) = split(/\t/);
$bcmap{$raw} = \@fixedBC;
}
close DATA;


while (<INBAM>) {
$read=$_;
chomp($read);
@read = split(/\t/,$read);
$thisBC = $read[11];

if (defined($bcmap{$thisBC})) {
#print "BC is in hash\n";
$correctBC = $bcmap{$thisBC}[0];
chomp($correctBC);
}
else {
#print "BC is not in hash\n";
$correctBC = $thisBC;
}

print BCBAM $read[0],"\t",$read[1],"\t",$read[2],"\t",$read[3],"\t",$read[4],"\t",$read[5],"\t",$read[6],"\t",$read[7],"\t",$read[8],"\t",
$read[9],"\t",$read[10],"\t","BX:Z:",$thisBC,"\t","BC:Z:",$correctBC,"\t",$read[12],"\n";

}
close INBAM;
close BCBAM;
2 changes: 1 addition & 1 deletion fqfilter_countUMI.pl
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
}
close BC;

open BAMF, "samtools view -@ 2 -x NH -x AS -x nM -x HI -x IH -x NM -x uT -x MD -x jM -x jI -x XN $bam | " || die "Couldn't open file $bam. Check permissions!\n Check if it is a bam file and it exists\n\n";
open BAMF, "samtools view -@ 2 -x BX -x NH -x AS -x nM -x HI -x IH -x NM -x uT -x MD -x jM -x jI -x XN $bam | " || die "Couldn't open file $bam. Check permissions!\n Check if it is a bam file and it exists\n\n";

%bccount=();
while(<BAMF>){
Expand Down
2 changes: 1 addition & 1 deletion merge_bbmap_alignment.pl
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ BEGIN
$pigz=$ARGV[4];
}

open STARBAMF, "$samtoolsexc view -@ 2 -x NH -x AS -x nM -x HI -x IH -x NM -x uT -x MD -x jM -x jI -x XN $starbam | cut -f12,13 | " || die "Couldn't open file $starbam. Check permissions!\n Check if it is a bam file and it exists\n\n";
open STARBAMF, "$samtoolsexc view -@ 2 -x BX -x NH -x AS -x nM -x HI -x IH -x NM -x uT -x MD -x jM -x jI -x XN $starbam | cut -f12,13 | " || die "Couldn't open file $starbam. Check permissions!\n Check if it is a bam file and it exists\n\n";
open BBBAMF, "$pigz -dc $bbmap | $samtoolsexc view -x XT -x AM - | " || die "Couldn't open file $bbmap. Check permissions!\n Check if it is differently zipped then .gz\n\n";

open(BCBAM,"| $samtoolsexc view -b -o $outbam -");
Expand Down
49 changes: 23 additions & 26 deletions statsFUN.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ splitRG_stats<-function(bccount,mem){
}
if( (maxR > 1e+09 & opt$read_layout == "SE") | (maxR > 5e+08 & opt$read_layout == "PE") ){
maxR <- ifelse(opt$read_layout == "SE",1e+09,5e+08)
}
}
print(paste(maxR,"Reads per chunk"))
nc<-nrow(bccount)
cs=0
Expand All @@ -37,47 +37,44 @@ prep_samtools_stats <- function(featfiles,bccount,cores,samtoolsexc){
nfiles=length(featfiles)
nchunks <- length(unique(bccount$chunkID))
all_rgfiles <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".RGgroup.",1:nchunks,".txt")


for(i in unique(bccount$chunkID)){
rgfile <- all_rgfiles[i]
chunks <- bccount[chunkID==i]$XC
if(opt$barcodes$BarcodeBinning > 0){
write.table(file=rgfile,c(chunks,binmap[,falseBC]),col.names = F,quote = F,row.names = F)
}else{
write.table(file=rgfile,chunks,col.names = F,quote = F,row.names = F)
}
write.table(file=rgfile,chunks,col.names = F,quote = F,row.names = F)

}

headerXX <- paste( c(paste0("V",1:3)) ,collapse="\t")
write(headerXX,"freadHeader")

headercommand <- "cat freadHeader > "
samcommand <- paste(samtoolsexc," view -x NH -x AS -x nM -x HI -x IH -x NM -x uT -x MD -x jM -x jI -x XN -x UB -@")
samcommand <- paste(samtoolsexc," view -x BX -x NH -x AS -x nM -x HI -x IH -x NM -x uT -x MD -x jM -x jI -x XN -x UB -@")
grepcommand <- " | cut -f12,13,14 | sed 's/BC:Z://' | sed 's/XS:Z://' | sed 's/XT:Z://' | grep -F -f "

outfiles_ex <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".ex.",1:nchunks,".txt")
system(paste(headercommand,outfiles_ex,collapse = "; "))

if(length(featfiles)==1){
cpusperchunk <- round(cores/nchunks,0)
ex_cmd <- paste(samcommand,cpusperchunk,featfiles[1],grepcommand,all_rgfiles,">>",outfiles_ex," & ",collapse = " ")

system(paste(ex_cmd,"wait"))
}else{
cpusperchunk <- round(cores/(2*nchunks),0)
ex_cmd <- paste(samcommand,cpusperchunk,featfiles[1],grepcommand,all_rgfiles,">>",outfiles_ex," & ",collapse = " ")

outfiles_in <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".in.",1:nchunks,".txt")
system(paste(headercommand,outfiles_in,collapse = "; "))

in_cmd <- paste(samcommand,cpusperchunk,featfiles[2],grepcommand,all_rgfiles,">>",outfiles_in," & ",collapse = " ")

system(paste(ex_cmd,in_cmd,"wait"))
}
system("rm freadHeader")
system(paste("rm",all_rgfiles))

return(outfiles_ex)
}

Expand All @@ -90,19 +87,19 @@ sumstatBAM <- function(featfiles,cores,outdir,user_seq,bc,outfile,samtoolsexc){
bccount <- bccount[XC %in% bc$XC]
}
bccount <- splitRG_stats(bccount=bccount, mem= opt$mem_limit)

samouts <- prep_samtools_stats(featfiles = featfiles,
bccount = bccount,
cores = opt$num_threads,
samtoolsexc=samtoolsexc)


for(i in unique(bccount$chunkID)){
print(paste("Working on chunk",i))
#rgfile = paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".currentRGgroup.txt")
#chunks = bccount[chunkID==i]$XC
#write.table(file=rgfile,chunks,col.names = F,quote = F,row.names = F)

## minifunction for string operations
#headerXX<-paste( c(paste0("V",1:3)) ,collapse="\t")
#write(headerXX,paste(outdir,"freadHeader",sep="/"))
Expand All @@ -113,8 +110,8 @@ sumstatBAM <- function(featfiles,cores,outdir,user_seq,bc,outfile,samtoolsexc){
}else{
samfile_in <- samfile_ex
}


#tmp<-data.table::fread(paste(samcommand,featfiles[1],"| cut -f12,13,14 | sed 's/BC:Z://' | sed 's/XS:Z://' | sed 's/XT:Z://' | grep -F -f ",rgfile), na.strings=c(""),
tmp <- data.table::fread(samfile_ex, na.strings=c(""),
select=c(1,2,3),header=T,fill=T,colClasses = "character" , col.names = c("RG","XS","GE") )[
Expand All @@ -133,7 +130,7 @@ sumstatBAM <- function(featfiles,cores,outdir,user_seq,bc,outfile,samtoolsexc){
][,type:=.rmUnassigned(XS)
][type=="NoFeatures",type:="Intergenic"
][,XS:=NULL]

if(i==1){
mapCount<-tmp
}else{
Expand Down Expand Up @@ -161,7 +158,7 @@ countGenes <- function(cnt,threshold=1,user_seq){
# avoid integer overflow issues in large matrices
# see https://github.com/sdparekh/zUMIs/issues/105
cnt <- (cnt>=threshold)

samples<-as.data.frame(Matrix::colSums(cnt[!(rownames(cnt) %in% user_seq),]))
colnames(samples) <- c("Count")
samples[,"SampleID"] <- as.factor(row.names(samples))
Expand Down
42 changes: 42 additions & 0 deletions zUMIs-BCdetection.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/usr/bin/env Rscript
library(methods)
library(data.table)
library(yaml)
library(ggplot2)

##########################
myYaml <- commandArgs(trailingOnly = T)

opt <-read_yaml(myYaml)
setwd(opt$out_dir)
source(paste0(opt$zUMIs_directory,"/barcodeIDFUN.R"))
options(datatable.fread.input.cmd.message=FALSE)
data.table::setDTthreads(threads=opt$num_threads)

#######################################################################
#######################################################################
##### Barcode handling & chunking

#read file with barcodecounts
# bc is the vector of barcodes to keep
bccount<-cellBC(bcfile = opt$barcodes$barcode_file,
bcnum = opt$barcodes$barcode_num,
bcauto = opt$barcodes$automatic,
bccount_file= paste0(opt$out_dir,"/", opt$project, ".BCstats.txt"),
outfilename = paste0(opt$out_dir,"/zUMIs_output/stats/",opt$project,".detected_cells.pdf"))

fwrite(bccount,file=paste0(opt$out_dir,"/zUMIs_output/",opt$project,"kept_barcodes.txt"))

#check if binning of adjacent barcodes should be run
if(opt$barcodes$BarcodeBinning > 0){
binmap <- BCbin(bccount_file = paste0(opt$out_dir,"/", opt$project, ".BCstats.txt"),
bc_detected = bccount)
fwrite(binmap,file=paste0(opt$out_dir,"/zUMIs_output/",opt$project,".BCbinning.txt"))
#update the number reads in BCcount table
binmap_additional <- binmap[, .(addtl = sum(n)), by = trueBC]
bccount[match(binmap_additional$trueBC,XC),n := n + binmap_additional$addtl]
fwrite(bccount,file=paste0(opt$out_dir,"/zUMIs_output/",opt$project,"kept_barcodes_binned.txt"))
}

##############################################################
q()
Loading

0 comments on commit aae3813

Please sign in to comment.