- Download reads
#HiFi
wget https://download.cncb.ac.cn/gsa/CRA004538/CRR302668/CRR302668.fastq.gz
# ngs
wget https://download.cncb.ac.cn/gsa/CRA004538/CRR302670/CRR302670_f1.fastq.gz
wget https://download.cncb.ac.cn/gsa/CRA004538/CRR302670/CRR302670_r2.fastq.gz
- Downsample HiFi reads
fxTools -s 4.68g CRR302668.fastq.gz > CRR302668.subsample.35x.fastq
- Assemble reference
hifiasm -o CRR302668.asm -t 30 CRR302668.subsample.35x.fastq
awk '{if ($1~/^S/ && length($3)>=1000000){print ">"$2;print $3}}' CRR302668.asm.bp.p_ctg.gfa > CRR302668.asm.bp.p_ctg.gfa.fa
- Polishing with Racon + Merfin
k=19
thread=5
# Construct k-mer db
meryl count k=${k} threads=${thread} CRR302670*.fastq.gz output reads.meryl
# Collect histogram for GenomeScope
meryl histogram reads.meryl > reads.hist
# Exclude frequency = 1 k-mers
meryl greater-than 1 reads.meryl output reads.gt1.meryl
bash automated-polishing.sh ${thread} 1 CRR302668.asm.bp.p_ctg.gfa.fasta CRR302668.subsample.35x.fastq reads.gt1.meryl racon.meryl
- Polishing with NextPolish2
thread=5
yak count -t ${thread} -k 21 -b 37 -o sr.k21.yak <(zcat CRR302670*.fastq.gz) <(zcat CRR302670*.fastq.gz)
yak count -t ${thread} -k 31 -b 37 -o sr.k31.yak <(zcat CRR302670*.fastq.gz) <(zcat CRR302670*.fastq.gz)
HiFi_mapping_file=racon.meryl.iter_1.winnowmap.bam # output by `Racon + Merfin`
samtools index ${HiFi_mapping_file}
nextPolish2 -t ${thread} -r ${HiFi_mapping_file} CRR302668.asm.bp.p_ctg.gfa.fa sr.k21.yak sr.k31.yak -o CRR302668.asm.bp.p_ctg.gfa.np2.fa
- Performance assessment
genome=CRR302668.asm.bp.p_ctg.gfa.fa #CRR302668.asm.bp.p_ctg.gfa.np2.fa, racon.meryl.iter_1.consensus.fasta
sh $MERQURY/merqury.sh reads.meryl ${genome} out