Release updates v0.1.2

FOI-Bioinformatics · Sep 20, 2019 · 54a31c3 · 54a31c3
1 parent 010bcfb
commit 54a31c3
Show file tree

Hide file tree

Showing 7 changed files with 64 additions and 40 deletions.
diff --git a/README.md b/README.md
@@ -149,8 +149,7 @@ flextaxd-create
     --skip "taxid"                      ## exclude genomes in taxid see details below
 ```
 
-
-## Remove branches, the --skip parameter was implemented for benchmarking purposes as an option to remove branches by taxid
-## all children of the given taxid will be excluded.
+### Exclude genomes in taxid on database creation
+Remove branches, the --skip parameter was implemented for benchmarking purposes as an option to remove branches by taxid, all children of the given taxid will be excluded.
 
 flextaxd-create --skip "taxid,taxid2"
diff --git a/custom_taxonomy/custom_taxonomy_databases.py b/custom_taxonomy/custom_taxonomy_databases.py
@@ -20,7 +20,7 @@
 each column (not <tab>|<tab>).
 '''
 
-__version__ = "0.1.1"
+__version__ = "0.1.2"
 __author__ = "David Sundell"
 __credits__ = ["David Sundell"]
 __license__ = "GPLv3"
@@ -29,7 +29,7 @@
 __date__ = "2019-09-11"
 __status__ = "Beta"
 __pkgname__="custom_taxonomy_databases"
-__github__="https://github.com/davve2/custom-taxonomy-databases"
+__github__="https://github.com/davve2/flextaxd"
 __programs_supported__ = ["kraken2", "krakenuniq","ganon","centrifuge"]
 
 ###################################--system imports--####################################
@@ -102,6 +102,7 @@ def __init__(self,  message,expression=""):
     basic.add_argument('-v','--verbose', action='store_true',   help="verbose output")
     basic.add_argument('--log', metavar="", default = None,   help="use a log file instead of stdout")
     basic.add_argument("--dump", action='store_true', help="Write database to names.dmp and nodes.dmp")
+    basic.add_argument('--dump_mini', action='store_true', help="Dump minimal file with tab as separator")
     basic.add_argument("--force", action='store_true', help="use when script is implemented in pipeline to avoid security questions on overwrite!")
 
     rmodules = get_read_modules()
@@ -115,16 +116,14 @@ def __init__(self,  message,expression=""):
     mod_opts.add_argument('-md', '--mod_database', metavar="",default=False, help="Database file containing modifications")
     mod_opts.add_argument('-gt', '--genomeid2taxid', metavar="", default=False, help="File that lists which node a genome should be assigned to")
     mod_opts.add_argument('-gp', '--genomes_path', metavar="",default=None,  help='Path to genome folder is required when using NCBI_taxonomy as source')
-    mod_opts.add_argument('-p', '--parent',metavar="", default=False, help="Parent from which to replace branch")
-    mod_opts.add_argument('--replace', metavar="", default=False, help="Add if existing children of parents should be removed!")
+    mod_opts.add_argument('-p', '--parent',metavar="", default=False, help="Parent from which to add (replace see below) branch")
+    mod_opts.add_argument('--replace', action='store_true', help="Add if existing children of parents should be removed!")
 
     out_opts = parser.add_argument_group('output_opts', "Output options")
     out_opts.add_argument('--dbprogram', metavar="", default=False,choices=__programs_supported__, help="Adjust output file to certain output specifications ["+", ".join(__programs_supported__)+"]")
     out_opts.add_argument("--dump_prefix", metavar="", default="names,nodes", help="change dump prefix reqires two names default(names,nodes)")
     out_opts.add_argument('--dump_sep', metavar="", default="\t|\t", help="Set output separator default(NCBI) also adds extra trailing columns for kraken")
-    out_opts.add_argument('--dump_mini', metavar="", default=False, help="Dump minimal file with tab as separator")
     out_opts.add_argument('--dump_descriptions', action='store_true', default=False, help="Dump description names instead of database integers")
-    out_opts.add_argument("--taxDB", action='store_true',help="This file is required when running krakenuniq (is it?)") ## added of krakenuniq
 
     parser.add_argument("--version", action='store_true', help=argparse.SUPPRESS)
 
@@ -175,8 +174,8 @@ def __init__(self,  message,expression=""):
         logfile = open(args.log, "w")
         sys.stdout = logfile
 
-    if force:
-        '''Remove database if force is turned on'''
+    if force and args.taxonomy_file:
+        '''Remove database if force is turned on and new source file is given'''
         if os.path.exists(args.database):
             os.system("rm {database}".format(database=args.database))
 
@@ -204,30 +203,34 @@ def __init__(self,  message,expression=""):
             read_obj.parse_taxonomy()                                                           ## Parse taxonomy file
 
             '''Parse genome2taxid file'''                                                       ## Fix at some point only one function should be needed
-            if args.taxonomy_type == "NCBI" and args.genomeid2taxid:
+            if not args.genomeid2taxid:
+                print("Warning no genomeid2taxid file given!")
+            elif args.taxonomy_type == "NCBI" and args.genomeid2taxid:
                 read_obj.parse_genomeid2taxid(args.genomes_path,args.genomeid2taxid)
-            if args.taxonomy_type == "CanSNPer":
+            elif args.taxonomy_type == "CanSNPer":
                 read_obj.parse_genomeid2taxid(args.genomeid2taxid)
 
             if verbose: print("Nodes in taxonomy tree {n} number of taxonomies {k}".format(n=read_obj.length, k=read_obj.ids))
             if verbose: current_time = report_time(current_time)
 
     ''' 1. Modify database, if datasource for modification of current database is supplied process this data'''
     if args.mod_file or args.mod_database:
+        if args.mod_file and not args.genomeid2taxid:
+            exit("No genomeid2taxid file given!")
         if verbose: print("Loading module: ModifyTree")
         modify_module = dynamic_import("modules", "ModifyTree")
-        modify_obj = modify_module(database=args.database, mod_file=args.mod_file, mod_database= args.mod_database,verbose=verbose,parent=args.parent)
+        modify_obj = modify_module(database=args.database, mod_file=args.mod_file, mod_database= args.mod_database,verbose=verbose,parent=args.parent,replace=args.replace)
         modify_obj.update_database()
         if args.mod_file:
             if verbose: current_time = report_time(current_time)
             modify_obj.update_annotations(genomeid2taxid=args.genomeid2taxid)
         if verbose: current_time = report_time(current_time)
 
     ''' 2. Dump custom taxonomy database into NCBI/kraken readable format)'''
-    if args.dump:
+    if args.dump or args.dump_mini:
         '''Check if datase exists if it does make sure the user intends to overwrite the file'''
         nameprefix,nodeprefix = args.dump_prefix.split(",")
-        if os.path.exists(args.outdir.rstrip("/")+"/"+nameprefix+".dmp") or os.path.exists(args.outdir.rstrip("/")+"/"+nameprefix+".dmp"):
+        if (os.path.exists(args.outdir.rstrip("/")+"/"+nameprefix+".dmp") or os.path.exists(args.outdir.rstrip("/")+"/"+nameprefix+".dmp")) and not force:
             if args.log: sys.stdout = original_sysout ## If log file is used, make sure the input reaches the user terminal
             ans = input("Warning: {names} and/or {nodes} already exists, overwrite? (y/n): ")
             if ans not in ["y","Y","yes", "Yes"]:
@@ -244,7 +247,7 @@ def __init__(self,  message,expression=""):
             write_obj.set_minimal()
         write_obj.nodes()
         write_obj.names()
-        if args.taxDB:
+        if False: #args.taxDB:
             write_obj.set_separator("\t")
             write_obj.set_prefix("names,taxDB")
             write_obj.set_order(True)

diff --git a/custom_taxonomy/modules/CreateKrakenDatabase.py b/custom_taxonomy/modules/CreateKrakenDatabase.py
@@ -42,8 +42,10 @@ def __init__(self, database, kraken_database, genomes_path, outdir,verbose=False
 		self.krakendb= kraken_database
 		self.ncbi_rewrite_speed = "fastest"
 		self.verbose = verbose
-		self.skiptax = parse_skip(skip.split(","))  ## if node should be skipd this must be true, otherwise nodes in modfile are added to existing database
-
+		if skip:
+			self.skiptax = parse_skip(skip.split(","))  ## if node should be skipd this must be true, otherwise nodes in modfile are added to existing database
+		else:
+			self.skiptax = []
 		if verbose: print("{krakendb}".format(outdir = self.outdir, krakendb=self.krakendb))
 		if not os.path.exists("{krakendb}".format(outdir = self.outdir, krakendb=self.krakendb)):
 			os.system("mkdir -p {krakendb}".format(outdir = self.outdir, krakendb=self.krakendb))

diff --git a/custom_taxonomy/modules/ModifyTree.py b/custom_taxonomy/modules/ModifyTree.py
@@ -16,7 +16,7 @@ def __init__(self, message):
 
 class ModifyTree(object):
 	"""docstring for ModifyTree."""
-	def __init__(self, database=".taxonomydb", mod_database=False, mod_file=False, separator="\t",verbose=False,parent=False):
+	def __init__(self, database=".taxonomydb", mod_database=False, mod_file=False, separator="\t",verbose=False,parent=False,replace=False):
 		super(ModifyTree, self).__init__()
 		self.verbose = verbose
 		if self.verbose: print("Modify Tree")
@@ -25,19 +25,19 @@ def __init__(self, database=".taxonomydb", mod_database=False, mod_file=False, s
 		self.names = {}
 
 		self.parent=parent
-
+		self.replace = replace
 		self.ncbi_order = True
 		self.mod_genomes =False
 
 		### Connect to or create database
-		self.taxonomydb = ModifyFunctions(database)
+		self.taxonomydb = ModifyFunctions(database,verbose=verbose)
 		self.rank= self.taxonomydb.get_rank(col=2)
 		## Save all nodes in the current database
 		self.nodeDict = self.taxonomydb.get_nodes()
 
 		self.taxid_base = self.taxonomydb.get_taxid_base()
 		if mod_database:
-			self.modsource = self.parse_mod_database(ModifyFunctions(mod_database))
+			self.modsource = self.parse_mod_database(ModifyFunctions(mod_database,verbose=verbose))
 		elif mod_file:
 			self.modsource = self.parse_mod_file(mod_file)
 		else:
@@ -101,24 +101,24 @@ def parse_mod_database(self, database):
 		self.new_nodes = set()
 		### Get links from modified database
 		mod_links = database.get_links(self.dbmod_annotation.keys())
-		parent_link = self.taxonomydb.get_parent(self.taxonomydb.get_id(self.parent))
+		self.parent_link = self.taxonomydb.get_parent(self.taxonomydb.get_id(self.parent))
 
 		self.old_nodes = self.taxonomydb.get_children(set([self.taxonomydb.get_id(self.parent)])) - set([self.taxonomydb.get_id(self.parent)])
 		### Translate node ids between databases and add non existing nodes into current database
 		for link in mod_links:
 			link = list(link)
 			if link[0] == link[1]: ### This should only occur if the root node of the mod database is used link replace with
-				self.new_links.add(parent_link)
+				self.new_links.add(self.parent_link)
 			else:
 				parent,child,rank = self.dbmod_annotation[link[0]].strip(),self.dbmod_annotation[link[1]].strip(),link[2]
 				#print([parent,child,rank])
 				self._parse_new_links(parent,child,rank)
 		## remove any overlapping nodes, they don't need new indexnumber but can keep the old ones
 		self.old_nodes -= self.new_nodes
 		### get links from current database
-		self.existing_links = self.taxonomydb.get_links(self.old_nodes)
+		self.existing_links = set(self.taxonomydb.get_links(self.old_nodes))
 		### Check which links are overlapping and may need to be updated
-		self.modified_links = set(self.existing_links) - self.new_links
+		self.modified_links = self.existing_links - self.new_links
 		'''Get all genomes annotated to new nodes in existing database'''
 		self.mod_genomes = database.get_genomes()
 		return self.modified_links
@@ -127,7 +127,9 @@ def parse_mod_file(self, modfile):
 		'''Parse mod_file information'''
 		self.new_links = set()
 		self.new_nodes = set()
-		self.old_nodes = self.taxonomydb.get_children(set([self.taxonomydb.get_id(self.parent)])) - set([self.taxonomydb.get_id(self.parent)])
+		parent = self.taxonomydb.get_id(self.parent)
+		self.parent_link = self.taxonomydb.get_parent(self.taxonomydb.get_id(self.parent))
+		self.old_nodes = self.taxonomydb.get_children(set([parent])) - set([parent])
 		with open(modfile, "r") as f:
 			headers = f.readline().strip().split(self.sep)
 			if len(headers) < 2 or len(headers)>3:
@@ -145,7 +147,6 @@ def parse_mod_file(self, modfile):
 				if len(line) == 2:
 					line +=["-"] ## Add no specified rank to line
 				parent,child,rank = line
-				print(parent,child,rank)
 				try:
 					rank = self.rank[rank]
 				except:
@@ -157,9 +158,9 @@ def parse_mod_file(self, modfile):
 		## remove any overlapping nodes, they don't need new indexnumber but can keep the old ones
 		self.old_nodes -= self.new_nodes
 		### get links from current database
-		self.existing_links = self.taxonomydb.get_links(self.new_nodes,swap=swap)
+		self.existing_links = set(self.taxonomydb.get_links(self.old_nodes,swap=swap))
 		### Check which links are overlapping and may need to be updated
-		self.modified_links = set(self.existing_links) & self.new_links
+		self.modified_links = self.existing_links & self.new_links
 		return self.modified_links
 
 	def update_annotations(self, genomeid2taxid):
@@ -226,14 +227,15 @@ def update_genomes(self):
 
 	def update_database(self):
 		'''Update the database file'''
-		if self.skip:
+		if self.replace:
 			self.taxonomydb.delete_links(self.modified_links)
+			self.taxonomydb.delete_links(self.existing_links-set(self.parent_link))  ## delete existing links from old nodes, except parent
 			self.taxonomydb.delete_nodes(self.old_nodes)
 			if self.verbose: print("Deleted nodes {nodes}".format(nodes=self.old_nodes))
 		added,nodes = self.taxonomydb.add_links(self.new_links)
 		if added + len(nodes) + len(self.modified_links) > 0:
-			if self.skip:
-				print("Deleting {n} links and {n2} nodes that are no longer valid".format(n=len(self.modified_links),n2=len(self.old_nodes)))
+			if self.replace:
+				print("Deleting {n} links and {n2} nodes that are no longer valid".format(n=len(self.modified_links & self.existing_links),n2=len(self.old_nodes)))
 			print("Adding {n} new nodes".format(n=len(nodes)))
 			print("Adding {n} updated and/or new links".format(n=added))
 

diff --git a/custom_taxonomy/modules/WriteTaxonomy.py b/custom_taxonomy/modules/WriteTaxonomy.py
@@ -19,11 +19,17 @@ def __init__(self, path, database=".taxonomydb",verbose=False,separator="\t|\t",
 		### Allows a minimal output file with only nessesary fields default is NCBI id | name | empty | scientific name
 		self.minimal = minimal
 		if self.minimal:
-			self.separator = self.minimal
-			if self.separator =="\\t":
+			if dbprogram:
+				print("# WARNING: dbprogram cannot be used in combination with dump_mini, parameter ignored! (use --dump)")
+			self.dbprogram = False
+			if separator != "\t|\t":
+				self.separator = separator
+			else:
 				self.separator = "\t"
+		else:
+			self.dbprogram  = dbprogram
 		self.parent = False ## Default print is NCBI structure with child in the first column
-		self.dbprogram  = dbprogram
+
 
 	def set_separator(self,sep):
 		self.separator=sep
@@ -77,7 +83,7 @@ def names(self):
 		'''Write node annotations to names.dmp'''
 		if self.verbose: print('Write annotations to: {}{}.dmp'.format(self.path,self.prefix[0]))
 		end = "\n"
-		if self.dbprogram == "ganon":
+		if self.dbprogram == "ganon" or self.dbprogram == "krakenuniq":
 			end = "\t|\n"
 		with open('{}{}.dmp'.format(self.path,self.prefix[0]),"w") as outputfile:
 			## Retrieve all nodes that exists in the database

diff --git a/custom_taxonomy/modules/database/DatabaseConnection.py b/custom_taxonomy/modules/database/DatabaseConnection.py
@@ -8,6 +8,12 @@ def __init__(self, value):
 	def __str__(self):
 		return repr(self.value)
 
+class NameError(Exception):
+	def __init__(self, value):
+		self.value = value
+	def __str__(self):
+		return repr(self.value)
+
 class DatabaseConnection(object):
 	"""docstring for DatabaseConnection"""
 	def __init__(self, database, verbose=False):
@@ -254,6 +260,7 @@ def add_nodes(self,nodes, table="tree",hold=False):
 	def delete_links(self,links, table="tree",hold=False):
 		'''This function deletes all links given in links'''
 		QUERY = "DELETE FROM {table} WHERE parent = {parent} AND child = {child}"
+		if self.verbose: print(QUERY.format(table=table,parent="",child=""), links)
 		for parent,child,rank in links:
 			res = self.query(QUERY.format(table=table, parent=parent, child=child))
 
@@ -264,6 +271,7 @@ def delete_links(self,links, table="tree",hold=False):
 	def delete_nodes(self, nodes, table="nodes",hold=False):
 		'''This function deletes all nodes given in nodes'''
 		QUERY = "DELETE FROM {table} WHERE id = {node}"
+		if self.verbose: print(QUERY.format(table=table,node=""), nodes)
 		for node in nodes:
 			res = self.query(QUERY.format(table=table, node=node))
 		## Commit changes
@@ -312,7 +320,11 @@ def get_parent(self,name):
 	def get_id(self,name):
 		'''get node id from name'''
 		QUERY = '''SELECT id FROM nodes WHERE name = "{node}"'''.format(node=name)
-		res = self.query(QUERY).fetchone()[0]
+		try:
+			res = self.query(QUERY).fetchone()[0]
+		except TypeError:
+			print(QUERY)
+			raise NameError("Name not found in the database! {name}".format(name=name))
 		return res
 
 

diff --git a/setup.py b/setup.py
@@ -16,7 +16,7 @@
     long_description = f.read()
 
 setup(
-    name='FlexTaxD',
+    name='flextaxd',
     ##Global version, does not nessesarily follow script versions
     version=__version__,