Skip to content

Latest commit

 

History

History
69 lines (53 loc) · 2.34 KB

benchmark2.md

File metadata and controls

69 lines (53 loc) · 2.34 KB

Benchmarking NextPolish2 using data from Arabidopsis thaliana (Col-XJTU)

Requirement

Software commands

  1. Download reads
#HiFi
wget https://download.cncb.ac.cn/gsa/CRA004538/CRR302668/CRR302668.fastq.gz

# ngs
wget https://download.cncb.ac.cn/gsa/CRA004538/CRR302670/CRR302670_f1.fastq.gz
wget https://download.cncb.ac.cn/gsa/CRA004538/CRR302670/CRR302670_r2.fastq.gz
  1. Downsample HiFi reads
fxTools -s 4.68g CRR302668.fastq.gz > CRR302668.subsample.35x.fastq
  1. Assemble reference
hifiasm -o CRR302668.asm -t 30 CRR302668.subsample.35x.fastq
awk '{if ($1~/^S/ && length($3)>=1000000){print ">"$2;print $3}}' CRR302668.asm.bp.p_ctg.gfa > CRR302668.asm.bp.p_ctg.gfa.fa
  1. Polishing with Racon + Merfin
k=19
thread=5
# Construct k-mer db
meryl count k=${k} threads=${thread} CRR302670*.fastq.gz output reads.meryl

# Collect histogram for GenomeScope
meryl histogram reads.meryl > reads.hist

# Exclude frequency = 1 k-mers
meryl greater-than 1 reads.meryl output reads.gt1.meryl

bash automated-polishing.sh ${thread} 1 CRR302668.asm.bp.p_ctg.gfa.fasta CRR302668.subsample.35x.fastq reads.gt1.meryl racon.meryl
  1. Polishing with NextPolish2
thread=5
yak count -t ${thread} -k 21 -b 37 -o sr.k21.yak <(zcat CRR302670*.fastq.gz) <(zcat CRR302670*.fastq.gz)
yak count -t ${thread} -k 31 -b 37 -o sr.k31.yak <(zcat CRR302670*.fastq.gz) <(zcat CRR302670*.fastq.gz)

HiFi_mapping_file=racon.meryl.iter_1.winnowmap.bam # output by `Racon + Merfin`
samtools index ${HiFi_mapping_file}

nextPolish2 -t ${thread} -r ${HiFi_mapping_file} CRR302668.asm.bp.p_ctg.gfa.fa sr.k21.yak sr.k31.yak -o CRR302668.asm.bp.p_ctg.gfa.np2.fa
  1. Performance assessment
genome=CRR302668.asm.bp.p_ctg.gfa.fa #CRR302668.asm.bp.p_ctg.gfa.np2.fa, racon.meryl.iter_1.consensus.fasta
sh $MERQURY/merqury.sh reads.meryl ${genome} out