Skip to content

Commit

Permalink
new minimap2 parameters for ONT HQ and HiFi
Browse files Browse the repository at this point in the history
  • Loading branch information
mikolmogorov committed Feb 20, 2023
1 parent 943176c commit 432002d
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 43 deletions.
2 changes: 1 addition & 1 deletion flye/__build__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__build__ = 1783
__build__ = 1784
5 changes: 2 additions & 3 deletions flye/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,8 +277,7 @@ def run(self):
logger.info("Running Minimap2")
out_alignment = os.path.join(self.consensus_dir, "minimap.bam")
aln.make_alignment(self.in_contigs, self.args.reads, self.args.threads,
self.consensus_dir, self.args.platform, out_alignment,
reference_mode=True, sam_output=True)
self.args.platform, self.args.read_type, out_alignment)

contigs_info = aln.get_contigs_info(self.in_contigs)
logger.info("Computing consensus")
Expand Down Expand Up @@ -326,7 +325,7 @@ def run(self):
self.out_files["stats"], self.out_files["contigs"])
pol.generate_polished_edges(self.in_graph_edges, self.in_graph_gfa,
self.out_files["contigs"],
self.polishing_dir, self.args.platform,
self.polishing_dir, self.args.platform, self.args.read_type,
stats, self.args.threads)
os.remove(contigs)
if os.path.getsize(self.out_files["contigs"]) == 0:
Expand Down
61 changes: 27 additions & 34 deletions flye/polishing/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,20 +39,9 @@ def check_binaries():


def make_alignment(reference_file, reads_file, num_proc,
work_dir, platform, out_alignment, reference_mode,
sam_output):
"""
Runs minimap2 and sorts its output
"""
minimap_ref_mode = {False: "ava", True: "map"}
minimap_reads_mode = {"nano": "ont", "pacbio": "pb"}
mode = minimap_ref_mode[reference_mode] + "-" + minimap_reads_mode[platform]

_run_minimap(reference_file, reads_file, num_proc, mode,
out_alignment, sam_output)

#if sam_output:
# preprocess_sam(out_alignment, work_dir)
platform, read_type, out_alignment):
mode = platform + "-" + read_type
_run_minimap(reference_file, reads_file, num_proc, mode, out_alignment)


def get_contigs_info(contigs_file):
Expand Down Expand Up @@ -235,18 +224,30 @@ def name_split(h):
return out_dict


def _run_minimap(reference_file, reads_files, num_proc, mode, out_file,
sam_output):
def _run_minimap(reference_file, reads_files, num_proc, reads_type, out_file):
#SAM_HEADER = "\'@PG|@HD|@SQ|@RG|@CO\'"
work_dir = os.path.dirname(out_file)
stderr_file = os.path.join(work_dir, "minimap.stderr")
SORT_THREADS = "4"
SORT_MEM = "4G" if os.path.getsize(reference_file) > 100 * 1024 * 1024 else "1G"
BATCH = "5G" if os.path.getsize(reference_file) > 100 * 1024 * 1024 else "1G"

mode = None
extra_args = []
if reads_type in ["pacbio-raw", "pacbio-corrected", "pacbio-subasm"]:
mode = "map-pb"
if reads_type == "pacbio-hifi":
mode = "map-hifi"
elif reads_type in ["nano-raw", "nano-corrected"]:
mode = "map-ont"
elif reads_type == "nano-nano_hq":
mode = "map-ont"
extra_args = ["-k", "17"]

cmdline = [MINIMAP_BIN, "'" + reference_file + "'"]
cmdline.extend(["'" + read_file + "'" for read_file in reads_files])
cmdline.extend(["-x", mode, "-t", str(num_proc)])
cmdline.extend(extra_args)

#Produces gzipped SAM sorted by reference name. Since it's not sorted by
#read name anymore, it's important that all reads have SEQ.
Expand All @@ -257,22 +258,14 @@ def _run_minimap(reference_file, reads_files, num_proc, mode, out_file,
#--secondary-seq = custom option to output SEQ for seqcondary alignment with hard clipping
#-L: move CIGAR strings for ultra-long reads to the separate tag
#-Q don't output fastq quality
if sam_output:
tmp_prefix = os.path.join(os.path.dirname(out_file),
"sort_" + datetime.datetime.now().strftime("%y%m%d_%H%M%S"))
cmdline.extend(["-a", "-p", "0.5", "-N", "10", "--sam-hit-only", "-L", "-K", BATCH,
"-z", "1000", "-Q", "--secondary-seq", "-I", "64G"])
cmdline.extend(["|", SAMTOOLS_BIN, "view", "-T", "'" + reference_file + "'", "-u", "-"])
cmdline.extend(["|", SAMTOOLS_BIN, "sort", "-T", "'" + tmp_prefix + "'", "-O", "bam",
"-@", SORT_THREADS, "-l", "1", "-m", SORT_MEM])
cmdline.extend(["-o", "'" + out_file + "'"])
else:
pass #paf output enabled by default

#cmdline.extend(["|", "grep", "-Ev", SAM_HEADER]) #removes headers
#cmdline.extend(["|", "sort", "-k", "3,3", "-T", work_dir,
# "--parallel=8", "-S", "4G"])
#cmdline.extend(["|", "gzip", "-1"])
tmp_prefix = os.path.join(os.path.dirname(out_file),
"sort_" + datetime.datetime.now().strftime("%y%m%d_%H%M%S"))
cmdline.extend(["-a", "-p", "0.5", "-N", "10", "--sam-hit-only", "-L", "-K", BATCH,
"-z", "1000", "-Q", "--secondary-seq", "-I", "64G"])
cmdline.extend(["|", SAMTOOLS_BIN, "view", "-T", "'" + reference_file + "'", "-u", "-"])
cmdline.extend(["|", SAMTOOLS_BIN, "sort", "-T", "'" + tmp_prefix + "'", "-O", "bam",
"-@", SORT_THREADS, "-l", "1", "-m", SORT_MEM])
cmdline.extend(["-o", "'" + out_file + "'"])

#logger.debug("Running: " + " ".join(cmdline))
try:
Expand All @@ -281,11 +274,11 @@ def _run_minimap(reference_file, reads_files, num_proc, mode, out_file,
"set -eo pipefail; " + " ".join(cmdline)],
stderr=open(stderr_file, "w"),
stdout=open(os.devnull, "w"))
if sam_output:
subprocess.check_call(SAMTOOLS_BIN + " index -@ 4 " + "'" + out_file + "'", shell=True)
subprocess.check_call(SAMTOOLS_BIN + " index -@ 4 " + "'" + out_file + "'", shell=True)
#os.remove(stderr_file)

except (subprocess.CalledProcessError, OSError) as e:
logger.error("Error running minimap2, terminating. See the alignment error log "
" for details: " + stderr_file)
logger.error("Cmd: " + " ".join(cmdline))
raise AlignmentException(str(e))
8 changes: 3 additions & 5 deletions flye/polishing/polish.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,7 @@ def polish(contig_seqs, read_seqs, work_dir, num_iters, num_threads, read_platfo
logger.info("Running minimap2")
alignment_file = os.path.join(work_dir, "minimap_{0}.bam".format(i + 1))
make_alignment(prev_assembly, read_seqs, num_threads,
work_dir, read_platform, alignment_file,
reference_mode=True, sam_output=True)
read_platform, read_type, alignment_file)
else:
logger.info("Polishing with provided bam")
alignment_file = read_seqs[0]
Expand Down Expand Up @@ -144,7 +143,7 @@ def polish(contig_seqs, read_seqs, work_dir, num_iters, num_threads, read_platfo


def generate_polished_edges(edges_file, gfa_file, polished_contigs, work_dir,
error_mode, polished_stats, num_threads):
platform, read_type, polished_stats, num_threads):
"""
Generate polished graph edges sequences by extracting them from
polished contigs
Expand All @@ -163,8 +162,7 @@ def generate_polished_edges(edges_file, gfa_file, polished_contigs, work_dir,
alignment_file = os.path.join(work_dir, "edges_aln.bam")
polished_dict = fp.read_sequence_dict(polished_contigs)
make_alignment(polished_contigs, [edges_file], num_threads,
work_dir, error_mode, alignment_file,
reference_mode=True, sam_output=True)
platform, read_type, alignment_file)
aln_reader = SynchronizedSamReader(alignment_file,
polished_dict, multiprocessing.Manager(),
cfg.vals["max_read_coverage"])
Expand Down

0 comments on commit 432002d

Please sign in to comment.