From 54a31c336e7e0af05348255f10782937609233a1 Mon Sep 17 00:00:00 2001 From: David Sundell Date: Fri, 20 Sep 2019 15:20:55 +0200 Subject: [PATCH] Release updates v0.1.2 --- README.md | 5 ++- custom_taxonomy/custom_taxonomy_databases.py | 31 ++++++++++-------- .../modules/CreateKrakenDatabase.py | 6 ++-- custom_taxonomy/modules/ModifyTree.py | 32 ++++++++++--------- custom_taxonomy/modules/WriteTaxonomy.py | 14 +++++--- .../modules/database/DatabaseConnection.py | 14 +++++++- setup.py | 2 +- 7 files changed, 64 insertions(+), 40 deletions(-) diff --git a/README.md b/README.md index 53d7519..e238868 100644 --- a/README.md +++ b/README.md @@ -149,8 +149,7 @@ flextaxd-create --skip "taxid" ## exclude genomes in taxid see details below ``` - -## Remove branches, the --skip parameter was implemented for benchmarking purposes as an option to remove branches by taxid -## all children of the given taxid will be excluded. +### Exclude genomes in taxid on database creation +Remove branches, the --skip parameter was implemented for benchmarking purposes as an option to remove branches by taxid, all children of the given taxid will be excluded. flextaxd-create --skip "taxid,taxid2" diff --git a/custom_taxonomy/custom_taxonomy_databases.py b/custom_taxonomy/custom_taxonomy_databases.py index 544a26b..b56cf05 100644 --- a/custom_taxonomy/custom_taxonomy_databases.py +++ b/custom_taxonomy/custom_taxonomy_databases.py @@ -20,7 +20,7 @@ each column (not |). ''' -__version__ = "0.1.1" +__version__ = "0.1.2" __author__ = "David Sundell" __credits__ = ["David Sundell"] __license__ = "GPLv3" @@ -29,7 +29,7 @@ __date__ = "2019-09-11" __status__ = "Beta" __pkgname__="custom_taxonomy_databases" -__github__="https://github.com/davve2/custom-taxonomy-databases" +__github__="https://github.com/davve2/flextaxd" __programs_supported__ = ["kraken2", "krakenuniq","ganon","centrifuge"] ###################################--system imports--#################################### @@ -102,6 +102,7 @@ def __init__(self, message,expression=""): basic.add_argument('-v','--verbose', action='store_true', help="verbose output") basic.add_argument('--log', metavar="", default = None, help="use a log file instead of stdout") basic.add_argument("--dump", action='store_true', help="Write database to names.dmp and nodes.dmp") + basic.add_argument('--dump_mini', action='store_true', help="Dump minimal file with tab as separator") basic.add_argument("--force", action='store_true', help="use when script is implemented in pipeline to avoid security questions on overwrite!") rmodules = get_read_modules() @@ -115,16 +116,14 @@ def __init__(self, message,expression=""): mod_opts.add_argument('-md', '--mod_database', metavar="",default=False, help="Database file containing modifications") mod_opts.add_argument('-gt', '--genomeid2taxid', metavar="", default=False, help="File that lists which node a genome should be assigned to") mod_opts.add_argument('-gp', '--genomes_path', metavar="",default=None, help='Path to genome folder is required when using NCBI_taxonomy as source') - mod_opts.add_argument('-p', '--parent',metavar="", default=False, help="Parent from which to replace branch") - mod_opts.add_argument('--replace', metavar="", default=False, help="Add if existing children of parents should be removed!") + mod_opts.add_argument('-p', '--parent',metavar="", default=False, help="Parent from which to add (replace see below) branch") + mod_opts.add_argument('--replace', action='store_true', help="Add if existing children of parents should be removed!") out_opts = parser.add_argument_group('output_opts', "Output options") out_opts.add_argument('--dbprogram', metavar="", default=False,choices=__programs_supported__, help="Adjust output file to certain output specifications ["+", ".join(__programs_supported__)+"]") out_opts.add_argument("--dump_prefix", metavar="", default="names,nodes", help="change dump prefix reqires two names default(names,nodes)") out_opts.add_argument('--dump_sep', metavar="", default="\t|\t", help="Set output separator default(NCBI) also adds extra trailing columns for kraken") - out_opts.add_argument('--dump_mini', metavar="", default=False, help="Dump minimal file with tab as separator") out_opts.add_argument('--dump_descriptions', action='store_true', default=False, help="Dump description names instead of database integers") - out_opts.add_argument("--taxDB", action='store_true',help="This file is required when running krakenuniq (is it?)") ## added of krakenuniq parser.add_argument("--version", action='store_true', help=argparse.SUPPRESS) @@ -175,8 +174,8 @@ def __init__(self, message,expression=""): logfile = open(args.log, "w") sys.stdout = logfile - if force: - '''Remove database if force is turned on''' + if force and args.taxonomy_file: + '''Remove database if force is turned on and new source file is given''' if os.path.exists(args.database): os.system("rm {database}".format(database=args.database)) @@ -204,9 +203,11 @@ def __init__(self, message,expression=""): read_obj.parse_taxonomy() ## Parse taxonomy file '''Parse genome2taxid file''' ## Fix at some point only one function should be needed - if args.taxonomy_type == "NCBI" and args.genomeid2taxid: + if not args.genomeid2taxid: + print("Warning no genomeid2taxid file given!") + elif args.taxonomy_type == "NCBI" and args.genomeid2taxid: read_obj.parse_genomeid2taxid(args.genomes_path,args.genomeid2taxid) - if args.taxonomy_type == "CanSNPer": + elif args.taxonomy_type == "CanSNPer": read_obj.parse_genomeid2taxid(args.genomeid2taxid) if verbose: print("Nodes in taxonomy tree {n} number of taxonomies {k}".format(n=read_obj.length, k=read_obj.ids)) @@ -214,9 +215,11 @@ def __init__(self, message,expression=""): ''' 1. Modify database, if datasource for modification of current database is supplied process this data''' if args.mod_file or args.mod_database: + if args.mod_file and not args.genomeid2taxid: + exit("No genomeid2taxid file given!") if verbose: print("Loading module: ModifyTree") modify_module = dynamic_import("modules", "ModifyTree") - modify_obj = modify_module(database=args.database, mod_file=args.mod_file, mod_database= args.mod_database,verbose=verbose,parent=args.parent) + modify_obj = modify_module(database=args.database, mod_file=args.mod_file, mod_database= args.mod_database,verbose=verbose,parent=args.parent,replace=args.replace) modify_obj.update_database() if args.mod_file: if verbose: current_time = report_time(current_time) @@ -224,10 +227,10 @@ def __init__(self, message,expression=""): if verbose: current_time = report_time(current_time) ''' 2. Dump custom taxonomy database into NCBI/kraken readable format)''' - if args.dump: + if args.dump or args.dump_mini: '''Check if datase exists if it does make sure the user intends to overwrite the file''' nameprefix,nodeprefix = args.dump_prefix.split(",") - if os.path.exists(args.outdir.rstrip("/")+"/"+nameprefix+".dmp") or os.path.exists(args.outdir.rstrip("/")+"/"+nameprefix+".dmp"): + if (os.path.exists(args.outdir.rstrip("/")+"/"+nameprefix+".dmp") or os.path.exists(args.outdir.rstrip("/")+"/"+nameprefix+".dmp")) and not force: if args.log: sys.stdout = original_sysout ## If log file is used, make sure the input reaches the user terminal ans = input("Warning: {names} and/or {nodes} already exists, overwrite? (y/n): ") if ans not in ["y","Y","yes", "Yes"]: @@ -244,7 +247,7 @@ def __init__(self, message,expression=""): write_obj.set_minimal() write_obj.nodes() write_obj.names() - if args.taxDB: + if False: #args.taxDB: write_obj.set_separator("\t") write_obj.set_prefix("names,taxDB") write_obj.set_order(True) diff --git a/custom_taxonomy/modules/CreateKrakenDatabase.py b/custom_taxonomy/modules/CreateKrakenDatabase.py index 7117875..80403c6 100644 --- a/custom_taxonomy/modules/CreateKrakenDatabase.py +++ b/custom_taxonomy/modules/CreateKrakenDatabase.py @@ -42,8 +42,10 @@ def __init__(self, database, kraken_database, genomes_path, outdir,verbose=False self.krakendb= kraken_database self.ncbi_rewrite_speed = "fastest" self.verbose = verbose - self.skiptax = parse_skip(skip.split(",")) ## if node should be skipd this must be true, otherwise nodes in modfile are added to existing database - + if skip: + self.skiptax = parse_skip(skip.split(",")) ## if node should be skipd this must be true, otherwise nodes in modfile are added to existing database + else: + self.skiptax = [] if verbose: print("{krakendb}".format(outdir = self.outdir, krakendb=self.krakendb)) if not os.path.exists("{krakendb}".format(outdir = self.outdir, krakendb=self.krakendb)): os.system("mkdir -p {krakendb}".format(outdir = self.outdir, krakendb=self.krakendb)) diff --git a/custom_taxonomy/modules/ModifyTree.py b/custom_taxonomy/modules/ModifyTree.py index db3160f..72d6a87 100644 --- a/custom_taxonomy/modules/ModifyTree.py +++ b/custom_taxonomy/modules/ModifyTree.py @@ -16,7 +16,7 @@ def __init__(self, message): class ModifyTree(object): """docstring for ModifyTree.""" - def __init__(self, database=".taxonomydb", mod_database=False, mod_file=False, separator="\t",verbose=False,parent=False): + def __init__(self, database=".taxonomydb", mod_database=False, mod_file=False, separator="\t",verbose=False,parent=False,replace=False): super(ModifyTree, self).__init__() self.verbose = verbose if self.verbose: print("Modify Tree") @@ -25,19 +25,19 @@ def __init__(self, database=".taxonomydb", mod_database=False, mod_file=False, s self.names = {} self.parent=parent - + self.replace = replace self.ncbi_order = True self.mod_genomes =False ### Connect to or create database - self.taxonomydb = ModifyFunctions(database) + self.taxonomydb = ModifyFunctions(database,verbose=verbose) self.rank= self.taxonomydb.get_rank(col=2) ## Save all nodes in the current database self.nodeDict = self.taxonomydb.get_nodes() self.taxid_base = self.taxonomydb.get_taxid_base() if mod_database: - self.modsource = self.parse_mod_database(ModifyFunctions(mod_database)) + self.modsource = self.parse_mod_database(ModifyFunctions(mod_database,verbose=verbose)) elif mod_file: self.modsource = self.parse_mod_file(mod_file) else: @@ -101,14 +101,14 @@ def parse_mod_database(self, database): self.new_nodes = set() ### Get links from modified database mod_links = database.get_links(self.dbmod_annotation.keys()) - parent_link = self.taxonomydb.get_parent(self.taxonomydb.get_id(self.parent)) + self.parent_link = self.taxonomydb.get_parent(self.taxonomydb.get_id(self.parent)) self.old_nodes = self.taxonomydb.get_children(set([self.taxonomydb.get_id(self.parent)])) - set([self.taxonomydb.get_id(self.parent)]) ### Translate node ids between databases and add non existing nodes into current database for link in mod_links: link = list(link) if link[0] == link[1]: ### This should only occur if the root node of the mod database is used link replace with - self.new_links.add(parent_link) + self.new_links.add(self.parent_link) else: parent,child,rank = self.dbmod_annotation[link[0]].strip(),self.dbmod_annotation[link[1]].strip(),link[2] #print([parent,child,rank]) @@ -116,9 +116,9 @@ def parse_mod_database(self, database): ## remove any overlapping nodes, they don't need new indexnumber but can keep the old ones self.old_nodes -= self.new_nodes ### get links from current database - self.existing_links = self.taxonomydb.get_links(self.old_nodes) + self.existing_links = set(self.taxonomydb.get_links(self.old_nodes)) ### Check which links are overlapping and may need to be updated - self.modified_links = set(self.existing_links) - self.new_links + self.modified_links = self.existing_links - self.new_links '''Get all genomes annotated to new nodes in existing database''' self.mod_genomes = database.get_genomes() return self.modified_links @@ -127,7 +127,9 @@ def parse_mod_file(self, modfile): '''Parse mod_file information''' self.new_links = set() self.new_nodes = set() - self.old_nodes = self.taxonomydb.get_children(set([self.taxonomydb.get_id(self.parent)])) - set([self.taxonomydb.get_id(self.parent)]) + parent = self.taxonomydb.get_id(self.parent) + self.parent_link = self.taxonomydb.get_parent(self.taxonomydb.get_id(self.parent)) + self.old_nodes = self.taxonomydb.get_children(set([parent])) - set([parent]) with open(modfile, "r") as f: headers = f.readline().strip().split(self.sep) if len(headers) < 2 or len(headers)>3: @@ -145,7 +147,6 @@ def parse_mod_file(self, modfile): if len(line) == 2: line +=["-"] ## Add no specified rank to line parent,child,rank = line - print(parent,child,rank) try: rank = self.rank[rank] except: @@ -157,9 +158,9 @@ def parse_mod_file(self, modfile): ## remove any overlapping nodes, they don't need new indexnumber but can keep the old ones self.old_nodes -= self.new_nodes ### get links from current database - self.existing_links = self.taxonomydb.get_links(self.new_nodes,swap=swap) + self.existing_links = set(self.taxonomydb.get_links(self.old_nodes,swap=swap)) ### Check which links are overlapping and may need to be updated - self.modified_links = set(self.existing_links) & self.new_links + self.modified_links = self.existing_links & self.new_links return self.modified_links def update_annotations(self, genomeid2taxid): @@ -226,14 +227,15 @@ def update_genomes(self): def update_database(self): '''Update the database file''' - if self.skip: + if self.replace: self.taxonomydb.delete_links(self.modified_links) + self.taxonomydb.delete_links(self.existing_links-set(self.parent_link)) ## delete existing links from old nodes, except parent self.taxonomydb.delete_nodes(self.old_nodes) if self.verbose: print("Deleted nodes {nodes}".format(nodes=self.old_nodes)) added,nodes = self.taxonomydb.add_links(self.new_links) if added + len(nodes) + len(self.modified_links) > 0: - if self.skip: - print("Deleting {n} links and {n2} nodes that are no longer valid".format(n=len(self.modified_links),n2=len(self.old_nodes))) + if self.replace: + print("Deleting {n} links and {n2} nodes that are no longer valid".format(n=len(self.modified_links & self.existing_links),n2=len(self.old_nodes))) print("Adding {n} new nodes".format(n=len(nodes))) print("Adding {n} updated and/or new links".format(n=added)) diff --git a/custom_taxonomy/modules/WriteTaxonomy.py b/custom_taxonomy/modules/WriteTaxonomy.py index 4f4009f..e02329b 100644 --- a/custom_taxonomy/modules/WriteTaxonomy.py +++ b/custom_taxonomy/modules/WriteTaxonomy.py @@ -19,11 +19,17 @@ def __init__(self, path, database=".taxonomydb",verbose=False,separator="\t|\t", ### Allows a minimal output file with only nessesary fields default is NCBI id | name | empty | scientific name self.minimal = minimal if self.minimal: - self.separator = self.minimal - if self.separator =="\\t": + if dbprogram: + print("# WARNING: dbprogram cannot be used in combination with dump_mini, parameter ignored! (use --dump)") + self.dbprogram = False + if separator != "\t|\t": + self.separator = separator + else: self.separator = "\t" + else: + self.dbprogram = dbprogram self.parent = False ## Default print is NCBI structure with child in the first column - self.dbprogram = dbprogram + def set_separator(self,sep): self.separator=sep @@ -77,7 +83,7 @@ def names(self): '''Write node annotations to names.dmp''' if self.verbose: print('Write annotations to: {}{}.dmp'.format(self.path,self.prefix[0])) end = "\n" - if self.dbprogram == "ganon": + if self.dbprogram == "ganon" or self.dbprogram == "krakenuniq": end = "\t|\n" with open('{}{}.dmp'.format(self.path,self.prefix[0]),"w") as outputfile: ## Retrieve all nodes that exists in the database diff --git a/custom_taxonomy/modules/database/DatabaseConnection.py b/custom_taxonomy/modules/database/DatabaseConnection.py index 1526d07..fdcc534 100644 --- a/custom_taxonomy/modules/database/DatabaseConnection.py +++ b/custom_taxonomy/modules/database/DatabaseConnection.py @@ -8,6 +8,12 @@ def __init__(self, value): def __str__(self): return repr(self.value) +class NameError(Exception): + def __init__(self, value): + self.value = value + def __str__(self): + return repr(self.value) + class DatabaseConnection(object): """docstring for DatabaseConnection""" def __init__(self, database, verbose=False): @@ -254,6 +260,7 @@ def add_nodes(self,nodes, table="tree",hold=False): def delete_links(self,links, table="tree",hold=False): '''This function deletes all links given in links''' QUERY = "DELETE FROM {table} WHERE parent = {parent} AND child = {child}" + if self.verbose: print(QUERY.format(table=table,parent="",child=""), links) for parent,child,rank in links: res = self.query(QUERY.format(table=table, parent=parent, child=child)) @@ -264,6 +271,7 @@ def delete_links(self,links, table="tree",hold=False): def delete_nodes(self, nodes, table="nodes",hold=False): '''This function deletes all nodes given in nodes''' QUERY = "DELETE FROM {table} WHERE id = {node}" + if self.verbose: print(QUERY.format(table=table,node=""), nodes) for node in nodes: res = self.query(QUERY.format(table=table, node=node)) ## Commit changes @@ -312,7 +320,11 @@ def get_parent(self,name): def get_id(self,name): '''get node id from name''' QUERY = '''SELECT id FROM nodes WHERE name = "{node}"'''.format(node=name) - res = self.query(QUERY).fetchone()[0] + try: + res = self.query(QUERY).fetchone()[0] + except TypeError: + print(QUERY) + raise NameError("Name not found in the database! {name}".format(name=name)) return res diff --git a/setup.py b/setup.py index bfa6424..eeee004 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ long_description = f.read() setup( - name='FlexTaxD', + name='flextaxd', ##Global version, does not nessesarily follow script versions version=__version__,