ODiogoSilva · ODiogoSilva · Mar 31, 2018 · Apr 1, 2018 · Apr 1, 2018
diff --git a/setup.py b/setup.py
@@ -13,7 +13,7 @@
     readme = f.read()
 
 
-def mcl_data_files():
+def aux_data_files():
     """
     Automatically detects the platform and sets the appropriate mcl binary
     """
@@ -23,12 +23,15 @@ def mcl_data_files():
                  "trifusion/data/resources/mcl/windows/64bit/mcl64.exe",
                  "trifusion/data/resources/mcl/windows/64bit/cygwin1.dll",
                  "trifusion/data/resources/mcl/windows/32bit/mcl32.exe",
-                 "trifusion/data/resources/mcl/windows/32bit/cygwin1.dll"]
+                 "trifusion/data/resources/mcl/windows/32bit/cygwin1.dll",
+                 "trifusion/data/resources/diamond/linux/diamond",
+                 "trifusion/data/resources/diamond/windows/diamond.exe"
+                 ]
 
     return data_file
 
 
-mcl_file = mcl_data_files()
+aux_files = aux_data_files()
 
 setup(
     name="trifusion",
@@ -74,7 +77,7 @@ def mcl_data_files():
                  "Programming Language :: Python",
                  "Programming Language :: Python :: 2.7",
                  "Topic :: Scientific/Engineering :: Bio-Informatics"],
-    data_files=mcl_file,
+    data_files=aux_files,
     entry_points={
         "gui_scripts": [
             "TriFusion = trifusion.TriFusion:gui_exec"

diff --git a/trifusion/app.py b/trifusion/app.py
diff --git a/trifusion/data/resources/background_tasks.py b/trifusion/data/resources/background_tasks.py
@@ -355,10 +355,10 @@ def background_export_groups(f, nm, a):
 
 
 def orto_execution(nm, temp_dir, proteome_files, protein_min_len,
-                   protein_max_stop, usearch_file, usearch_evalue,
-                   usearch_threads, usearch_output, mcl_file, mcl_inflation,
+                   protein_max_stop, diamond_path, evalue,
+                   diamond_threads, search_output, mcl_file, mcl_inflation,
                    ortholog_prefix, group_prefix, orto_max_gene,
-                   orto_min_sp, sqldb, ortho_dir, usearch_db):
+                   orto_min_sp, sqldb, ortho_dir, protein_db):
     """Execution of the orthology search pipeline.
 
 
@@ -379,13 +379,13 @@ def orto_execution(nm, temp_dir, proteome_files, protein_min_len,
         Minimum lenght of protein sequences.
     protein_max_stop : int
         Maximum percentage of stop codons allowed.
-    usearch_file : str
+    diamond_path : str
         Path to usearch executbale.
-    usearch_evalue: int or float
-        Evalue for usearch execution.
-    usearch_threads : int
-        Number of threads used by usearch execution.
-    usearch_output : str
+    evalue: int or float
+        Evalue for search execution.
+    diamond_threads : int
+        Number of threads used by diamond search execution.
+    search_output : str
         Name of usearch's output file.
     mcl_file : str
         Path to the mcl executable.
@@ -406,7 +406,7 @@ def orto_execution(nm, temp_dir, proteome_files, protein_min_len,
         Path to the sqlite database.
     ortho_dir : str
         Path to the directory where the results will be generated.
-    usearch_db : str
+    protein_db : str
         Name of the file used as database for usearch.
     """
 
@@ -429,41 +429,42 @@ def orto_execution(nm, temp_dir, proteome_files, protein_min_len,
 
         nm.task = "filter"
         ortho_pipe.filter_fasta(protein_min_len, protein_max_stop,
-                                usearch_db, ortho_dir, nm)
+                                protein_db, ortho_dir, nm)
         nm.finished_tasks = ["schema", "adjust", "filter"]
 
         if nm.stop:
             raise KillByUser("")
 
-        nm.task = "usearch"
-        ortho_pipe.allvsall_usearch(usearch_db, usearch_evalue, ortho_dir,
-                                    usearch_threads, usearch_output,
-                                    usearch_bin=usearch_file, nm=nm)
-        nm.finished_tasks = ["schema", "adjust", "filter", "usearch"]
+        nm.task = "diamond"
+        ortho_pipe.make_diamond_db(ortho_dir, protein_db, diamond_path, nm=nm)
+        ortho_pipe.allvsall_search(protein_db, evalue, ortho_dir,
+                                   diamond_threads, search_output,
+                                   diamond_bin=diamond_path, nm=nm)
+        nm.finished_tasks = ["schema", "adjust", "filter", "diamond"]
 
         if nm.stop:
             raise KillByUser("")
 
         nm.task = "parse"
-        ortho_pipe.blast_parser(usearch_output, ortho_dir,
+        ortho_pipe.blast_parser(search_output, ortho_dir,
                                 db_dir=temp_dir, nm=nm)
-        nm.finished_tasks = ["schema", "adjust", "filter", "usearch", "parse"]
+        nm.finished_tasks = ["schema", "adjust", "filter", "diamond", "parse"]
 
         if nm.stop:
             raise KillByUser("")
 
         nm.task = "pairs"
         ortho_pipe.pairs(temp_dir, nm=nm)
         ortho_pipe.dump_pairs(temp_dir, ortho_dir, nm=nm)
-        nm.finished_tasks = ["schema", "adjust", "filter", "usearch", "parse",
+        nm.finished_tasks = ["schema", "adjust", "filter", "diamond", "parse",
                              "pairs"]
 
         if nm.stop:
             raise KillByUser("")
 
         nm.task = "mcl"
         ortho_pipe.mcl(mcl_inflation, ortho_dir, mcl_file=mcl_file, nm=nm)
-        nm.finished_tasks = ["schema", "adjust", "filter", "usearch", "parse",
+        nm.finished_tasks = ["schema", "adjust", "filter", "diamond", "parse",
                              "pairs", "mcl"]
 
         if nm.stop:
@@ -472,7 +473,7 @@ def orto_execution(nm, temp_dir, proteome_files, protein_min_len,
         nm.task = "dump"
         ortho_pipe.mcl_groups(mcl_inflation, ortholog_prefix, "1000",
                               group_prefix, ortho_dir, nm=nm)
-        nm.finished_tasks = ["schema", "adjust", "filter", "usearch", "parse",
+        nm.finished_tasks = ["schema", "adjust", "filter", "diamond", "parse",
                              "pairs", "mcl", "dump"]
 
         if nm.stop:
@@ -485,7 +486,7 @@ def orto_execution(nm, temp_dir, proteome_files, protein_min_len,
             orto_max_gene,
             orto_min_sp,
             sqldb + "_out",
-            join(ortho_dir, "backstage_files", usearch_db),
+            join(ortho_dir, "backstage_files", protein_db),
             temp_dir,
             ortho_dir, nm=nm)
         nm.finished_tasks = ["schema", "adjust", "filter", "usearch", "parse",

diff --git a/trifusion/data/resources/custom_widgets.py b/trifusion/data/resources/custom_widgets.py
@@ -1283,7 +1283,7 @@ class DialogMCLFix(BoxLayout):
     cancel = ObjectProperty(None)
 
 
-class DialogUSEARCHFix(BoxLayout):
+class DialogDiamondFix(BoxLayout):
     cancel = ObjectProperty(None)
 
 

diff --git a/trifusion/data/resources/diamond/linux/diamond b/trifusion/data/resources/diamond/linux/diamond
diff --git a/trifusion/data/resources/diamond/src/v0.9.19.tar.gz b/trifusion/data/resources/diamond/src/v0.9.19.tar.gz
diff --git a/trifusion/data/resources/diamond/windows/diamond.exe b/trifusion/data/resources/diamond/windows/diamond.exe
diff --git a/trifusion/data/resources/usearch/64bit/vcomp100.dll b/trifusion/data/resources/usearch/64bit/vcomp100.dll
diff --git a/trifusion/data/screens/Orthology.kv b/trifusion/data/screens/Orthology.kv
@@ -118,12 +118,12 @@ ShowcaseScreen:
                                 Label:
                                     id: output_label
                                     markup: True
-                                    text: "[size=18][b]Threads[/b][/size]\n[size=13]Number of threads (CPU's) to use during USEARCH[/size]"
+                                    text: "[size=18][b]Threads[/b][/size]\n[size=13]Number of threads (CPU's) to use during orthology search[/size]"
                                     halign: "left"
                                     valign: "middle"
                                     text_size: self.size
                                 ProcessBt:
-                                    id: usearch_threads
+                                    id: search_threads
                                     size_hint_x: None
                                     width: gl_orto_search.bt_wdt
                                     text: "1"

diff --git a/trifusion/ortho/protein2dna.py b/trifusion/ortho/protein2dna.py
@@ -86,7 +86,7 @@ def translate(sequence):
     return aa_sequence
 
 
-def create_db(f_list, dest="./", ns=None):
+def create_db(f_list, diamond_bin, dest="./", ns=None):
     """
     Creates a fasta database file containing the translated protein sequences
     from the cds files. The final transcripts.fas file will be use
@@ -97,7 +97,8 @@ def create_db(f_list, dest="./", ns=None):
     :param f_list. List, containing the file names of the transcript files
     """
 
-    output_handle = open(join(dest, "transcripts.fas"), "w")
+    transcript_file = join(dest, "transcripts.fas")
+    output_handle = open(transcript_file, "w")
     id_dic = {}
 
     if ns:
@@ -136,6 +137,18 @@ def create_db(f_list, dest="./", ns=None):
 
     output_handle.close()
 
+    # Make diamond database
+    diamong_mkdb = [
+        diamond_bin,
+        "makedb",
+        "--in",
+        transcript_file,
+        "-d",
+        join(dest, "transcripts")
+    ]
+
+    subprocess.Popen(diamong_mkdb).wait()
+
     return id_dic
 
 
@@ -185,26 +198,25 @@ def create_query_from_dict(protein_dict):
     return query_db
 
 
-def pair_search(usearch_bin, dest="./"):
+def pair_search(diamond_bin, dest="./"):
     """
     This will use USEARCH to search for translated transcript sequences
     identical to the original protein files
     """
 
     query_path = join(dest, "query.fas")
-    db_path = join(dest, "transcripts.fas")
+    db_path = join(dest, "transcripts")
     out_path = join(dest, "pairs.out")
 
-    subprocess.Popen([usearch_bin,
-                      "-usearch_global",
+    subprocess.Popen([diamond_bin,
+                      "blastp",
+                      "-q",
                       query_path,
-                      "-db",
+                      "--db",
                       db_path,
-                      "-id",
-                      "1",
-                      "-maxaccepts",
-                      ".9",
-                      "-blast6out",
+                      "--id",
+                      "100",
+                      "--out",
                       out_path]).wait()
 
 
@@ -279,7 +291,7 @@ def convert_protein_file(pairs, group_obj, id_db, output_dir, shared_ns):
 
 
 def convert_group(sqldb, cds_file_list, protein_db, group_sequences,
-                usearch_bin, output_dir, shared_namespace=None):
+                  diamond_bin, output_dir, shared_namespace=None):
     """
     Convenience function that wraps all required operations to convert protein
     to nucleotide files from a Group object
@@ -290,7 +302,7 @@ def convert_group(sqldb, cds_file_list, protein_db, group_sequences,
         shared_namespace.missed = 0
         shared_namespace.good = 0
     # Create database
-    id_db = create_db(cds_file_list, output_dir, shared_namespace)
+    id_db = create_db(cds_file_list, diamond_bin, output_dir, shared_namespace)
 
     if shared_namespace:
         shared_namespace.act = "Creating query"
@@ -312,7 +324,7 @@ def convert_group(sqldb, cds_file_list, protein_db, group_sequences,
     # Execute search
     if shared_namespace:
         shared_namespace.act = "Performing search"
-    pair_search(usearch_bin, output_dir)
+    pair_search(diamond_bin, output_dir)
 
     if shared_namespace:
         # Kill switch

diff --git a/trifusion/orthomcl_pipeline.py b/trifusion/orthomcl_pipeline.py
@@ -125,7 +125,7 @@ def check_unique_field(proteome_file, verbose=False, nm=None):
 def prep_fasta(proteome_file, code, unique_id, dest, verbose=False, nm=None):
 
     if verbose:
-        print_col("\t Preparing file for USEARCH", GREEN, 1)
+        print_col("\t Preparing file for protein search", GREEN, 1)
 
     # Storing header list to check for duplicates
     header_list = []
@@ -245,34 +245,64 @@ def filter_fasta(min_len, max_stop, db, dest, nm=None):
     FilterFasta.orthomcl_filter_fasta(cp_dir, min_len, max_stop, db, dest, nm)
 
 
-def allvsall_usearch(goodproteins, evalue, dest, cpus, usearch_outfile,
-                     usearch_bin="usearch", nm=None):
+def make_diamond_db(dest, goodproteins, diamond_bin="diamond", nm=None):
 
-    print_col("Perfoming USEARCH All-vs-All (may take a while...)", GREEN, 1)
+    print_col("Creating Diamond database", GREEN, 1)
 
-    # FNULL = open(os.devnull, "w")
-    usearch_cmd = [usearch_bin,
-                   "-ublast",
-                   join(dest, "backstage_files", goodproteins),
-                   "-db",
-                   join(dest, "backstage_files", goodproteins),
-                   "-blast6out",
-                   join(dest, "backstage_files", usearch_outfile),
-                   "-evalue", str(evalue),
-                   "--maxaccepts",
-                   "0",
-                   "-threads",
-                   str(cpus)]
+    diamong_mkdb = [
+        diamond_bin,
+        "makedb",
+        "--in",
+        join(dest, "backstage_files", goodproteins),
+        "-d",
+        join(dest, "backstage_files", "database")
+    ]
 
     if nm:
         # The subprocess.Popen handler cannot be passed directly in Windows
         # due to pickling issues. So I pass the pid of the process instead.
-        subp = subprocess.Popen(usearch_cmd)
+        subp = subprocess.Popen(diamong_mkdb)
         nm.subp = subp.pid
         subp.wait()
         nm.subp = None
     else:
-        _ = subprocess.Popen(usearch_cmd).wait()
+        _ = subprocess.Popen(diamong_mkdb).wait()
+
+
+def allvsall_search(goodproteins, evalue, dest, cpus, usearch_outfile,
+                    diamond_bin="diamond", nm=None):
+
+    print_col("Perfoming Diamond All-vs-All (may take a while...)", GREEN, 1)
+
+    diamond_cmd = [
+        diamond_bin,
+        "blastp",
+        "-d",
+        join(dest, "backstage_files", "database"),
+        "-q",
+        join(dest, "backstage_files", goodproteins),
+        "-o",
+        join(dest, "backstage_files", usearch_outfile),
+        "--sensitive",
+        "--threads",
+        str(cpus),
+        "--evalue",
+        str(evalue)
+    ]
+
+    print_col("\tCommand: {}".format(" ".join(diamond_cmd)), GREEN, 1)
+
+    if nm:
+        # The subprocess.Popen handler cannot be passed directly in Windows
+        # due to pickling issues. So I pass the pid of the process instead.
+        # subp = subprocess.Popen(usearch_cmd)
+        subp = subprocess.Popen(diamond_cmd)
+        nm.subp = subp.pid
+        subp.wait()
+        nm.subp = None
+    else:
+        # _ = subprocess.Popen(usearch_cmd).wait()
+        _ = subprocess.Popen(diamond_cmd).wait()
 
 
 def blast_parser(usearch_ouput, dest, db_dir, nm):
@@ -610,8 +640,9 @@ def main():
             adjust_fasta(proteome_files, output_dir)
             filter_fasta(min_length, max_percent_stop, database_name,
                          output_dir)
-            allvsall_usearch(database_name, evalue_cutoff, output_dir, cpus,
-                             usearch_out_name, usearch_bin=usearch_bin)
+            make_diamond_db(output_dir, database_name)
+            allvsall_search(database_name, evalue_cutoff, output_dir, cpus,
+                            usearch_out_name, usearch_bin=usearch_bin)
             blast_parser(usearch_out_name, output_dir, tmp_dir, None)
             pairs(tmp_dir)
             dump_pairs(tmp_dir, output_dir)
@@ -628,8 +659,9 @@ def main():
             install_schema(tmp_dir)
             filter_fasta(min_length, max_percent_stop, database_name,
                          output_dir)
-            allvsall_usearch(database_name, evalue_cutoff, output_dir, cpus,
-                             usearch_out_name, usearch_bin=usearch_bin)
+            make_diamond_db(output_dir, database_name)
+            allvsall_search(database_name, evalue_cutoff, output_dir, cpus,
+                            usearch_out_name, usearch_bin=usearch_bin)
             blast_parser(usearch_out_name, output_dir, tmp_dir, None)
             pairs(tmp_dir)
             dump_pairs(tmp_dir, output_dir)