Skip to content

Commit

Permalink
Release updates v0.1.2
Browse files Browse the repository at this point in the history
  • Loading branch information
davve2 committed Sep 20, 2019
1 parent 010bcfb commit 54a31c3
Show file tree
Hide file tree
Showing 7 changed files with 64 additions and 40 deletions.
5 changes: 2 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -149,8 +149,7 @@ flextaxd-create
--skip "taxid" ## exclude genomes in taxid see details below
```


## Remove branches, the --skip parameter was implemented for benchmarking purposes as an option to remove branches by taxid
## all children of the given taxid will be excluded.
### Exclude genomes in taxid on database creation
Remove branches, the --skip parameter was implemented for benchmarking purposes as an option to remove branches by taxid, all children of the given taxid will be excluded.

flextaxd-create --skip "taxid,taxid2"
31 changes: 17 additions & 14 deletions custom_taxonomy/custom_taxonomy_databases.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
each column (not <tab>|<tab>).
'''

__version__ = "0.1.1"
__version__ = "0.1.2"
__author__ = "David Sundell"
__credits__ = ["David Sundell"]
__license__ = "GPLv3"
Expand All @@ -29,7 +29,7 @@
__date__ = "2019-09-11"
__status__ = "Beta"
__pkgname__="custom_taxonomy_databases"
__github__="https://github.com/davve2/custom-taxonomy-databases"
__github__="https://github.com/davve2/flextaxd"
__programs_supported__ = ["kraken2", "krakenuniq","ganon","centrifuge"]

###################################--system imports--####################################
Expand Down Expand Up @@ -102,6 +102,7 @@ def __init__(self, message,expression=""):
basic.add_argument('-v','--verbose', action='store_true', help="verbose output")
basic.add_argument('--log', metavar="", default = None, help="use a log file instead of stdout")
basic.add_argument("--dump", action='store_true', help="Write database to names.dmp and nodes.dmp")
basic.add_argument('--dump_mini', action='store_true', help="Dump minimal file with tab as separator")
basic.add_argument("--force", action='store_true', help="use when script is implemented in pipeline to avoid security questions on overwrite!")

rmodules = get_read_modules()
Expand All @@ -115,16 +116,14 @@ def __init__(self, message,expression=""):
mod_opts.add_argument('-md', '--mod_database', metavar="",default=False, help="Database file containing modifications")
mod_opts.add_argument('-gt', '--genomeid2taxid', metavar="", default=False, help="File that lists which node a genome should be assigned to")
mod_opts.add_argument('-gp', '--genomes_path', metavar="",default=None, help='Path to genome folder is required when using NCBI_taxonomy as source')
mod_opts.add_argument('-p', '--parent',metavar="", default=False, help="Parent from which to replace branch")
mod_opts.add_argument('--replace', metavar="", default=False, help="Add if existing children of parents should be removed!")
mod_opts.add_argument('-p', '--parent',metavar="", default=False, help="Parent from which to add (replace see below) branch")
mod_opts.add_argument('--replace', action='store_true', help="Add if existing children of parents should be removed!")

out_opts = parser.add_argument_group('output_opts', "Output options")
out_opts.add_argument('--dbprogram', metavar="", default=False,choices=__programs_supported__, help="Adjust output file to certain output specifications ["+", ".join(__programs_supported__)+"]")
out_opts.add_argument("--dump_prefix", metavar="", default="names,nodes", help="change dump prefix reqires two names default(names,nodes)")
out_opts.add_argument('--dump_sep', metavar="", default="\t|\t", help="Set output separator default(NCBI) also adds extra trailing columns for kraken")
out_opts.add_argument('--dump_mini', metavar="", default=False, help="Dump minimal file with tab as separator")
out_opts.add_argument('--dump_descriptions', action='store_true', default=False, help="Dump description names instead of database integers")
out_opts.add_argument("--taxDB", action='store_true',help="This file is required when running krakenuniq (is it?)") ## added of krakenuniq

parser.add_argument("--version", action='store_true', help=argparse.SUPPRESS)

Expand Down Expand Up @@ -175,8 +174,8 @@ def __init__(self, message,expression=""):
logfile = open(args.log, "w")
sys.stdout = logfile

if force:
'''Remove database if force is turned on'''
if force and args.taxonomy_file:
'''Remove database if force is turned on and new source file is given'''
if os.path.exists(args.database):
os.system("rm {database}".format(database=args.database))

Expand Down Expand Up @@ -204,30 +203,34 @@ def __init__(self, message,expression=""):
read_obj.parse_taxonomy() ## Parse taxonomy file

'''Parse genome2taxid file''' ## Fix at some point only one function should be needed
if args.taxonomy_type == "NCBI" and args.genomeid2taxid:
if not args.genomeid2taxid:
print("Warning no genomeid2taxid file given!")
elif args.taxonomy_type == "NCBI" and args.genomeid2taxid:
read_obj.parse_genomeid2taxid(args.genomes_path,args.genomeid2taxid)
if args.taxonomy_type == "CanSNPer":
elif args.taxonomy_type == "CanSNPer":
read_obj.parse_genomeid2taxid(args.genomeid2taxid)

if verbose: print("Nodes in taxonomy tree {n} number of taxonomies {k}".format(n=read_obj.length, k=read_obj.ids))
if verbose: current_time = report_time(current_time)

''' 1. Modify database, if datasource for modification of current database is supplied process this data'''
if args.mod_file or args.mod_database:
if args.mod_file and not args.genomeid2taxid:
exit("No genomeid2taxid file given!")
if verbose: print("Loading module: ModifyTree")
modify_module = dynamic_import("modules", "ModifyTree")
modify_obj = modify_module(database=args.database, mod_file=args.mod_file, mod_database= args.mod_database,verbose=verbose,parent=args.parent)
modify_obj = modify_module(database=args.database, mod_file=args.mod_file, mod_database= args.mod_database,verbose=verbose,parent=args.parent,replace=args.replace)
modify_obj.update_database()
if args.mod_file:
if verbose: current_time = report_time(current_time)
modify_obj.update_annotations(genomeid2taxid=args.genomeid2taxid)
if verbose: current_time = report_time(current_time)

''' 2. Dump custom taxonomy database into NCBI/kraken readable format)'''
if args.dump:
if args.dump or args.dump_mini:
'''Check if datase exists if it does make sure the user intends to overwrite the file'''
nameprefix,nodeprefix = args.dump_prefix.split(",")
if os.path.exists(args.outdir.rstrip("/")+"/"+nameprefix+".dmp") or os.path.exists(args.outdir.rstrip("/")+"/"+nameprefix+".dmp"):
if (os.path.exists(args.outdir.rstrip("/")+"/"+nameprefix+".dmp") or os.path.exists(args.outdir.rstrip("/")+"/"+nameprefix+".dmp")) and not force:
if args.log: sys.stdout = original_sysout ## If log file is used, make sure the input reaches the user terminal
ans = input("Warning: {names} and/or {nodes} already exists, overwrite? (y/n): ")
if ans not in ["y","Y","yes", "Yes"]:
Expand All @@ -244,7 +247,7 @@ def __init__(self, message,expression=""):
write_obj.set_minimal()
write_obj.nodes()
write_obj.names()
if args.taxDB:
if False: #args.taxDB:
write_obj.set_separator("\t")
write_obj.set_prefix("names,taxDB")
write_obj.set_order(True)
Expand Down
6 changes: 4 additions & 2 deletions custom_taxonomy/modules/CreateKrakenDatabase.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,10 @@ def __init__(self, database, kraken_database, genomes_path, outdir,verbose=False
self.krakendb= kraken_database
self.ncbi_rewrite_speed = "fastest"
self.verbose = verbose
self.skiptax = parse_skip(skip.split(",")) ## if node should be skipd this must be true, otherwise nodes in modfile are added to existing database

if skip:
self.skiptax = parse_skip(skip.split(",")) ## if node should be skipd this must be true, otherwise nodes in modfile are added to existing database
else:
self.skiptax = []
if verbose: print("{krakendb}".format(outdir = self.outdir, krakendb=self.krakendb))
if not os.path.exists("{krakendb}".format(outdir = self.outdir, krakendb=self.krakendb)):
os.system("mkdir -p {krakendb}".format(outdir = self.outdir, krakendb=self.krakendb))
Expand Down
32 changes: 17 additions & 15 deletions custom_taxonomy/modules/ModifyTree.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def __init__(self, message):

class ModifyTree(object):
"""docstring for ModifyTree."""
def __init__(self, database=".taxonomydb", mod_database=False, mod_file=False, separator="\t",verbose=False,parent=False):
def __init__(self, database=".taxonomydb", mod_database=False, mod_file=False, separator="\t",verbose=False,parent=False,replace=False):
super(ModifyTree, self).__init__()
self.verbose = verbose
if self.verbose: print("Modify Tree")
Expand All @@ -25,19 +25,19 @@ def __init__(self, database=".taxonomydb", mod_database=False, mod_file=False, s
self.names = {}

self.parent=parent

self.replace = replace
self.ncbi_order = True
self.mod_genomes =False

### Connect to or create database
self.taxonomydb = ModifyFunctions(database)
self.taxonomydb = ModifyFunctions(database,verbose=verbose)
self.rank= self.taxonomydb.get_rank(col=2)
## Save all nodes in the current database
self.nodeDict = self.taxonomydb.get_nodes()

self.taxid_base = self.taxonomydb.get_taxid_base()
if mod_database:
self.modsource = self.parse_mod_database(ModifyFunctions(mod_database))
self.modsource = self.parse_mod_database(ModifyFunctions(mod_database,verbose=verbose))
elif mod_file:
self.modsource = self.parse_mod_file(mod_file)
else:
Expand Down Expand Up @@ -101,24 +101,24 @@ def parse_mod_database(self, database):
self.new_nodes = set()
### Get links from modified database
mod_links = database.get_links(self.dbmod_annotation.keys())
parent_link = self.taxonomydb.get_parent(self.taxonomydb.get_id(self.parent))
self.parent_link = self.taxonomydb.get_parent(self.taxonomydb.get_id(self.parent))

self.old_nodes = self.taxonomydb.get_children(set([self.taxonomydb.get_id(self.parent)])) - set([self.taxonomydb.get_id(self.parent)])
### Translate node ids between databases and add non existing nodes into current database
for link in mod_links:
link = list(link)
if link[0] == link[1]: ### This should only occur if the root node of the mod database is used link replace with
self.new_links.add(parent_link)
self.new_links.add(self.parent_link)
else:
parent,child,rank = self.dbmod_annotation[link[0]].strip(),self.dbmod_annotation[link[1]].strip(),link[2]
#print([parent,child,rank])
self._parse_new_links(parent,child,rank)
## remove any overlapping nodes, they don't need new indexnumber but can keep the old ones
self.old_nodes -= self.new_nodes
### get links from current database
self.existing_links = self.taxonomydb.get_links(self.old_nodes)
self.existing_links = set(self.taxonomydb.get_links(self.old_nodes))
### Check which links are overlapping and may need to be updated
self.modified_links = set(self.existing_links) - self.new_links
self.modified_links = self.existing_links - self.new_links
'''Get all genomes annotated to new nodes in existing database'''
self.mod_genomes = database.get_genomes()
return self.modified_links
Expand All @@ -127,7 +127,9 @@ def parse_mod_file(self, modfile):
'''Parse mod_file information'''
self.new_links = set()
self.new_nodes = set()
self.old_nodes = self.taxonomydb.get_children(set([self.taxonomydb.get_id(self.parent)])) - set([self.taxonomydb.get_id(self.parent)])
parent = self.taxonomydb.get_id(self.parent)
self.parent_link = self.taxonomydb.get_parent(self.taxonomydb.get_id(self.parent))
self.old_nodes = self.taxonomydb.get_children(set([parent])) - set([parent])
with open(modfile, "r") as f:
headers = f.readline().strip().split(self.sep)
if len(headers) < 2 or len(headers)>3:
Expand All @@ -145,7 +147,6 @@ def parse_mod_file(self, modfile):
if len(line) == 2:
line +=["-"] ## Add no specified rank to line
parent,child,rank = line
print(parent,child,rank)
try:
rank = self.rank[rank]
except:
Expand All @@ -157,9 +158,9 @@ def parse_mod_file(self, modfile):
## remove any overlapping nodes, they don't need new indexnumber but can keep the old ones
self.old_nodes -= self.new_nodes
### get links from current database
self.existing_links = self.taxonomydb.get_links(self.new_nodes,swap=swap)
self.existing_links = set(self.taxonomydb.get_links(self.old_nodes,swap=swap))
### Check which links are overlapping and may need to be updated
self.modified_links = set(self.existing_links) & self.new_links
self.modified_links = self.existing_links & self.new_links
return self.modified_links

def update_annotations(self, genomeid2taxid):
Expand Down Expand Up @@ -226,14 +227,15 @@ def update_genomes(self):

def update_database(self):
'''Update the database file'''
if self.skip:
if self.replace:
self.taxonomydb.delete_links(self.modified_links)
self.taxonomydb.delete_links(self.existing_links-set(self.parent_link)) ## delete existing links from old nodes, except parent
self.taxonomydb.delete_nodes(self.old_nodes)
if self.verbose: print("Deleted nodes {nodes}".format(nodes=self.old_nodes))
added,nodes = self.taxonomydb.add_links(self.new_links)
if added + len(nodes) + len(self.modified_links) > 0:
if self.skip:
print("Deleting {n} links and {n2} nodes that are no longer valid".format(n=len(self.modified_links),n2=len(self.old_nodes)))
if self.replace:
print("Deleting {n} links and {n2} nodes that are no longer valid".format(n=len(self.modified_links & self.existing_links),n2=len(self.old_nodes)))
print("Adding {n} new nodes".format(n=len(nodes)))
print("Adding {n} updated and/or new links".format(n=added))

Expand Down
14 changes: 10 additions & 4 deletions custom_taxonomy/modules/WriteTaxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,17 @@ def __init__(self, path, database=".taxonomydb",verbose=False,separator="\t|\t",
### Allows a minimal output file with only nessesary fields default is NCBI id | name | empty | scientific name
self.minimal = minimal
if self.minimal:
self.separator = self.minimal
if self.separator =="\\t":
if dbprogram:
print("# WARNING: dbprogram cannot be used in combination with dump_mini, parameter ignored! (use --dump)")
self.dbprogram = False
if separator != "\t|\t":
self.separator = separator
else:
self.separator = "\t"
else:
self.dbprogram = dbprogram
self.parent = False ## Default print is NCBI structure with child in the first column
self.dbprogram = dbprogram


def set_separator(self,sep):
self.separator=sep
Expand Down Expand Up @@ -77,7 +83,7 @@ def names(self):
'''Write node annotations to names.dmp'''
if self.verbose: print('Write annotations to: {}{}.dmp'.format(self.path,self.prefix[0]))
end = "\n"
if self.dbprogram == "ganon":
if self.dbprogram == "ganon" or self.dbprogram == "krakenuniq":
end = "\t|\n"
with open('{}{}.dmp'.format(self.path,self.prefix[0]),"w") as outputfile:
## Retrieve all nodes that exists in the database
Expand Down
14 changes: 13 additions & 1 deletion custom_taxonomy/modules/database/DatabaseConnection.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@ def __init__(self, value):
def __str__(self):
return repr(self.value)

class NameError(Exception):
def __init__(self, value):
self.value = value
def __str__(self):
return repr(self.value)

class DatabaseConnection(object):
"""docstring for DatabaseConnection"""
def __init__(self, database, verbose=False):
Expand Down Expand Up @@ -254,6 +260,7 @@ def add_nodes(self,nodes, table="tree",hold=False):
def delete_links(self,links, table="tree",hold=False):
'''This function deletes all links given in links'''
QUERY = "DELETE FROM {table} WHERE parent = {parent} AND child = {child}"
if self.verbose: print(QUERY.format(table=table,parent="",child=""), links)
for parent,child,rank in links:
res = self.query(QUERY.format(table=table, parent=parent, child=child))

Expand All @@ -264,6 +271,7 @@ def delete_links(self,links, table="tree",hold=False):
def delete_nodes(self, nodes, table="nodes",hold=False):
'''This function deletes all nodes given in nodes'''
QUERY = "DELETE FROM {table} WHERE id = {node}"
if self.verbose: print(QUERY.format(table=table,node=""), nodes)
for node in nodes:
res = self.query(QUERY.format(table=table, node=node))
## Commit changes
Expand Down Expand Up @@ -312,7 +320,11 @@ def get_parent(self,name):
def get_id(self,name):
'''get node id from name'''
QUERY = '''SELECT id FROM nodes WHERE name = "{node}"'''.format(node=name)
res = self.query(QUERY).fetchone()[0]
try:
res = self.query(QUERY).fetchone()[0]
except TypeError:
print(QUERY)
raise NameError("Name not found in the database! {name}".format(name=name))
return res


Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
long_description = f.read()

setup(
name='FlexTaxD',
name='flextaxd',
##Global version, does not nessesarily follow script versions
version=__version__,

Expand Down

0 comments on commit 54a31c3

Please sign in to comment.