Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace usearch with diamond in protein search workflows #292

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
readme = f.read()


def mcl_data_files():
def aux_data_files():
"""
Automatically detects the platform and sets the appropriate mcl binary
"""
Expand All @@ -23,12 +23,15 @@ def mcl_data_files():
"trifusion/data/resources/mcl/windows/64bit/mcl64.exe",
"trifusion/data/resources/mcl/windows/64bit/cygwin1.dll",
"trifusion/data/resources/mcl/windows/32bit/mcl32.exe",
"trifusion/data/resources/mcl/windows/32bit/cygwin1.dll"]
"trifusion/data/resources/mcl/windows/32bit/cygwin1.dll",
"trifusion/data/resources/diamond/linux/diamond",
"trifusion/data/resources/diamond/windows/diamond.exe"
]

return data_file


mcl_file = mcl_data_files()
aux_files = aux_data_files()

setup(
name="trifusion",
Expand Down Expand Up @@ -74,7 +77,7 @@ def mcl_data_files():
"Programming Language :: Python",
"Programming Language :: Python :: 2.7",
"Topic :: Scientific/Engineering :: Bio-Informatics"],
data_files=mcl_file,
data_files=aux_files,
entry_points={
"gui_scripts": [
"TriFusion = trifusion.TriFusion:gui_exec"
Expand Down
194 changes: 81 additions & 113 deletions trifusion/app.py

Large diffs are not rendered by default.

45 changes: 23 additions & 22 deletions trifusion/data/resources/background_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,10 +355,10 @@ def background_export_groups(f, nm, a):


def orto_execution(nm, temp_dir, proteome_files, protein_min_len,
protein_max_stop, usearch_file, usearch_evalue,
usearch_threads, usearch_output, mcl_file, mcl_inflation,
protein_max_stop, diamond_path, evalue,
diamond_threads, search_output, mcl_file, mcl_inflation,
ortholog_prefix, group_prefix, orto_max_gene,
orto_min_sp, sqldb, ortho_dir, usearch_db):
orto_min_sp, sqldb, ortho_dir, protein_db):
"""Execution of the orthology search pipeline.


Expand All @@ -379,13 +379,13 @@ def orto_execution(nm, temp_dir, proteome_files, protein_min_len,
Minimum lenght of protein sequences.
protein_max_stop : int
Maximum percentage of stop codons allowed.
usearch_file : str
diamond_path : str
Path to usearch executbale.
usearch_evalue: int or float
Evalue for usearch execution.
usearch_threads : int
Number of threads used by usearch execution.
usearch_output : str
evalue: int or float
Evalue for search execution.
diamond_threads : int
Number of threads used by diamond search execution.
search_output : str
Name of usearch's output file.
mcl_file : str
Path to the mcl executable.
Expand All @@ -406,7 +406,7 @@ def orto_execution(nm, temp_dir, proteome_files, protein_min_len,
Path to the sqlite database.
ortho_dir : str
Path to the directory where the results will be generated.
usearch_db : str
protein_db : str
Name of the file used as database for usearch.
"""

Expand All @@ -429,41 +429,42 @@ def orto_execution(nm, temp_dir, proteome_files, protein_min_len,

nm.task = "filter"
ortho_pipe.filter_fasta(protein_min_len, protein_max_stop,
usearch_db, ortho_dir, nm)
protein_db, ortho_dir, nm)
nm.finished_tasks = ["schema", "adjust", "filter"]

if nm.stop:
raise KillByUser("")

nm.task = "usearch"
ortho_pipe.allvsall_usearch(usearch_db, usearch_evalue, ortho_dir,
usearch_threads, usearch_output,
usearch_bin=usearch_file, nm=nm)
nm.finished_tasks = ["schema", "adjust", "filter", "usearch"]
nm.task = "diamond"
ortho_pipe.make_diamond_db(ortho_dir, protein_db, diamond_path, nm=nm)
ortho_pipe.allvsall_search(protein_db, evalue, ortho_dir,
diamond_threads, search_output,
diamond_bin=diamond_path, nm=nm)
nm.finished_tasks = ["schema", "adjust", "filter", "diamond"]

if nm.stop:
raise KillByUser("")

nm.task = "parse"
ortho_pipe.blast_parser(usearch_output, ortho_dir,
ortho_pipe.blast_parser(search_output, ortho_dir,
db_dir=temp_dir, nm=nm)
nm.finished_tasks = ["schema", "adjust", "filter", "usearch", "parse"]
nm.finished_tasks = ["schema", "adjust", "filter", "diamond", "parse"]

if nm.stop:
raise KillByUser("")

nm.task = "pairs"
ortho_pipe.pairs(temp_dir, nm=nm)
ortho_pipe.dump_pairs(temp_dir, ortho_dir, nm=nm)
nm.finished_tasks = ["schema", "adjust", "filter", "usearch", "parse",
nm.finished_tasks = ["schema", "adjust", "filter", "diamond", "parse",
"pairs"]

if nm.stop:
raise KillByUser("")

nm.task = "mcl"
ortho_pipe.mcl(mcl_inflation, ortho_dir, mcl_file=mcl_file, nm=nm)
nm.finished_tasks = ["schema", "adjust", "filter", "usearch", "parse",
nm.finished_tasks = ["schema", "adjust", "filter", "diamond", "parse",
"pairs", "mcl"]

if nm.stop:
Expand All @@ -472,7 +473,7 @@ def orto_execution(nm, temp_dir, proteome_files, protein_min_len,
nm.task = "dump"
ortho_pipe.mcl_groups(mcl_inflation, ortholog_prefix, "1000",
group_prefix, ortho_dir, nm=nm)
nm.finished_tasks = ["schema", "adjust", "filter", "usearch", "parse",
nm.finished_tasks = ["schema", "adjust", "filter", "diamond", "parse",
"pairs", "mcl", "dump"]

if nm.stop:
Expand All @@ -485,7 +486,7 @@ def orto_execution(nm, temp_dir, proteome_files, protein_min_len,
orto_max_gene,
orto_min_sp,
sqldb + "_out",
join(ortho_dir, "backstage_files", usearch_db),
join(ortho_dir, "backstage_files", protein_db),
temp_dir,
ortho_dir, nm=nm)
nm.finished_tasks = ["schema", "adjust", "filter", "usearch", "parse",
Expand Down
2 changes: 1 addition & 1 deletion trifusion/data/resources/custom_widgets.py
Original file line number Diff line number Diff line change
Expand Up @@ -1283,7 +1283,7 @@ class DialogMCLFix(BoxLayout):
cancel = ObjectProperty(None)


class DialogUSEARCHFix(BoxLayout):
class DialogDiamondFix(BoxLayout):
cancel = ObjectProperty(None)


Expand Down
Binary file added trifusion/data/resources/diamond/linux/diamond
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file removed trifusion/data/resources/usearch/64bit/vcomp100.dll
Binary file not shown.
4 changes: 2 additions & 2 deletions trifusion/data/screens/Orthology.kv
Original file line number Diff line number Diff line change
Expand Up @@ -118,12 +118,12 @@ ShowcaseScreen:
Label:
id: output_label
markup: True
text: "[size=18][b]Threads[/b][/size]\n[size=13]Number of threads (CPU's) to use during USEARCH[/size]"
text: "[size=18][b]Threads[/b][/size]\n[size=13]Number of threads (CPU's) to use during orthology search[/size]"
halign: "left"
valign: "middle"
text_size: self.size
ProcessBt:
id: usearch_threads
id: search_threads
size_hint_x: None
width: gl_orto_search.bt_wdt
text: "1"
Expand Down
42 changes: 27 additions & 15 deletions trifusion/ortho/protein2dna.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def translate(sequence):
return aa_sequence


def create_db(f_list, dest="./", ns=None):
def create_db(f_list, diamond_bin, dest="./", ns=None):
"""
Creates a fasta database file containing the translated protein sequences
from the cds files. The final transcripts.fas file will be use
Expand All @@ -97,7 +97,8 @@ def create_db(f_list, dest="./", ns=None):
:param f_list. List, containing the file names of the transcript files
"""

output_handle = open(join(dest, "transcripts.fas"), "w")
transcript_file = join(dest, "transcripts.fas")
output_handle = open(transcript_file, "w")
id_dic = {}

if ns:
Expand Down Expand Up @@ -136,6 +137,18 @@ def create_db(f_list, dest="./", ns=None):

output_handle.close()

# Make diamond database
diamong_mkdb = [
diamond_bin,
"makedb",
"--in",
transcript_file,
"-d",
join(dest, "transcripts")
]

subprocess.Popen(diamong_mkdb).wait()

return id_dic


Expand Down Expand Up @@ -185,26 +198,25 @@ def create_query_from_dict(protein_dict):
return query_db


def pair_search(usearch_bin, dest="./"):
def pair_search(diamond_bin, dest="./"):
"""
This will use USEARCH to search for translated transcript sequences
identical to the original protein files
"""

query_path = join(dest, "query.fas")
db_path = join(dest, "transcripts.fas")
db_path = join(dest, "transcripts")
out_path = join(dest, "pairs.out")

subprocess.Popen([usearch_bin,
"-usearch_global",
subprocess.Popen([diamond_bin,
"blastp",
"-q",
query_path,
"-db",
"--db",
db_path,
"-id",
"1",
"-maxaccepts",
".9",
"-blast6out",
"--id",
"100",
"--out",
out_path]).wait()


Expand Down Expand Up @@ -279,7 +291,7 @@ def convert_protein_file(pairs, group_obj, id_db, output_dir, shared_ns):


def convert_group(sqldb, cds_file_list, protein_db, group_sequences,
usearch_bin, output_dir, shared_namespace=None):
diamond_bin, output_dir, shared_namespace=None):
"""
Convenience function that wraps all required operations to convert protein
to nucleotide files from a Group object
Expand All @@ -290,7 +302,7 @@ def convert_group(sqldb, cds_file_list, protein_db, group_sequences,
shared_namespace.missed = 0
shared_namespace.good = 0
# Create database
id_db = create_db(cds_file_list, output_dir, shared_namespace)
id_db = create_db(cds_file_list, diamond_bin, output_dir, shared_namespace)

if shared_namespace:
shared_namespace.act = "Creating query"
Expand All @@ -312,7 +324,7 @@ def convert_group(sqldb, cds_file_list, protein_db, group_sequences,
# Execute search
if shared_namespace:
shared_namespace.act = "Performing search"
pair_search(usearch_bin, output_dir)
pair_search(diamond_bin, output_dir)

if shared_namespace:
# Kill switch
Expand Down
78 changes: 55 additions & 23 deletions trifusion/orthomcl_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def check_unique_field(proteome_file, verbose=False, nm=None):
def prep_fasta(proteome_file, code, unique_id, dest, verbose=False, nm=None):

if verbose:
print_col("\t Preparing file for USEARCH", GREEN, 1)
print_col("\t Preparing file for protein search", GREEN, 1)

# Storing header list to check for duplicates
header_list = []
Expand Down Expand Up @@ -245,34 +245,64 @@ def filter_fasta(min_len, max_stop, db, dest, nm=None):
FilterFasta.orthomcl_filter_fasta(cp_dir, min_len, max_stop, db, dest, nm)


def allvsall_usearch(goodproteins, evalue, dest, cpus, usearch_outfile,
usearch_bin="usearch", nm=None):
def make_diamond_db(dest, goodproteins, diamond_bin="diamond", nm=None):

print_col("Perfoming USEARCH All-vs-All (may take a while...)", GREEN, 1)
print_col("Creating Diamond database", GREEN, 1)

# FNULL = open(os.devnull, "w")
usearch_cmd = [usearch_bin,
"-ublast",
join(dest, "backstage_files", goodproteins),
"-db",
join(dest, "backstage_files", goodproteins),
"-blast6out",
join(dest, "backstage_files", usearch_outfile),
"-evalue", str(evalue),
"--maxaccepts",
"0",
"-threads",
str(cpus)]
diamong_mkdb = [
diamond_bin,
"makedb",
"--in",
join(dest, "backstage_files", goodproteins),
"-d",
join(dest, "backstage_files", "database")
]

if nm:
# The subprocess.Popen handler cannot be passed directly in Windows
# due to pickling issues. So I pass the pid of the process instead.
subp = subprocess.Popen(usearch_cmd)
subp = subprocess.Popen(diamong_mkdb)
nm.subp = subp.pid
subp.wait()
nm.subp = None
else:
_ = subprocess.Popen(usearch_cmd).wait()
_ = subprocess.Popen(diamong_mkdb).wait()


def allvsall_search(goodproteins, evalue, dest, cpus, usearch_outfile,
diamond_bin="diamond", nm=None):

print_col("Perfoming Diamond All-vs-All (may take a while...)", GREEN, 1)

diamond_cmd = [
diamond_bin,
"blastp",
"-d",
join(dest, "backstage_files", "database"),
"-q",
join(dest, "backstage_files", goodproteins),
"-o",
join(dest, "backstage_files", usearch_outfile),
"--sensitive",
"--threads",
str(cpus),
"--evalue",
str(evalue)
]

print_col("\tCommand: {}".format(" ".join(diamond_cmd)), GREEN, 1)

if nm:
# The subprocess.Popen handler cannot be passed directly in Windows
# due to pickling issues. So I pass the pid of the process instead.
# subp = subprocess.Popen(usearch_cmd)
subp = subprocess.Popen(diamond_cmd)
nm.subp = subp.pid
subp.wait()
nm.subp = None
else:
# _ = subprocess.Popen(usearch_cmd).wait()
_ = subprocess.Popen(diamond_cmd).wait()


def blast_parser(usearch_ouput, dest, db_dir, nm):
Expand Down Expand Up @@ -610,8 +640,9 @@ def main():
adjust_fasta(proteome_files, output_dir)
filter_fasta(min_length, max_percent_stop, database_name,
output_dir)
allvsall_usearch(database_name, evalue_cutoff, output_dir, cpus,
usearch_out_name, usearch_bin=usearch_bin)
make_diamond_db(output_dir, database_name)
allvsall_search(database_name, evalue_cutoff, output_dir, cpus,
usearch_out_name, usearch_bin=usearch_bin)
blast_parser(usearch_out_name, output_dir, tmp_dir, None)
pairs(tmp_dir)
dump_pairs(tmp_dir, output_dir)
Expand All @@ -628,8 +659,9 @@ def main():
install_schema(tmp_dir)
filter_fasta(min_length, max_percent_stop, database_name,
output_dir)
allvsall_usearch(database_name, evalue_cutoff, output_dir, cpus,
usearch_out_name, usearch_bin=usearch_bin)
make_diamond_db(output_dir, database_name)
allvsall_search(database_name, evalue_cutoff, output_dir, cpus,
usearch_out_name, usearch_bin=usearch_bin)
blast_parser(usearch_out_name, output_dir, tmp_dir, None)
pairs(tmp_dir)
dump_pairs(tmp_dir, output_dir)
Expand Down
Loading