forked from qunfengdong/BLCA
-
Notifications
You must be signed in to change notification settings - Fork 0
/
1.subset_db_gg.py
114 lines (91 loc) · 3.78 KB
/
1.subset_db_gg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/env python
import sys
import math
import glob
import re
import os
import getopt
import subprocess
__author__ = "Xiang Gao, Huaiying Lin, Qunfeng Dong"
__copyright__ = "Copyright 2016, Bayesian-based LCA taxonomic classification"
__credits__ = ["Xiang Gao", "Huaiying Lin","Qunfeng Dong"]
__license__ = "GPL"
__version__ = "1.2"
__maintainer__ = "Huaiying Lin"
__email__ = "ying.eddi2008@gmail.com"
__status__ = "Production"
'''
Get a subset of lineage information from NCBI 16S Microbial database.
'''
def usage():
print "\n<< Bayesian-based LCA taxonomic classification method >>\n\n Please make sure the following softwares are in your PATH:\n\t1.muscle (http://www.drive5.com/muscle/downloads.htm), muscle should be the program's name.\n\t2.ncbi-blast suite (ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/)\n\t3.Biopython is installed locally.\n"
print " This is the utility script to format Greengene Database before running the BLCA taxonomy profiling.\n"
print 'Usage: python '+sys.argv[0]+'\n'
print "Arguments:\n - Optional:"
print "\t-d\t\tThe database file that you want to format. Default: gg_13_5.fasta.gz.\n\t-t\t\tThe taxonomy database link from Greengenes. Default: gg_13_5_taxonomy.txt.gz. \n - Other:"
print "\t-h\t\tShow program usage and quit"
dbfsafile='gg_13_5.fasta.gz'
dbtaxfile='gg_13_5_taxonomy.txt.gz'
#### Get options ####
opts, args=getopt.getopt(sys.argv[1:],"d:t:u:h",['Database','fsadb','taxdb','help'])
for o,a in opts:
if o == "-d":
dbfsafile=a
elif o == "-t":
dbtaxfile=a
elif o in ('-h','--help'):
print usage()
sys.exit()
else:
assert False, 'unhandle option'
################ Function ##############
def check_program(prgname):
'''Check whether a program has been installed and put in the PATH'''
import os
path=os.popen("which "+prgname).read().rstrip()
if len(path) > 0 and os.path.exists(path):
print prgname+" is located in your PATH!"
else:
print "ERROR: "+prgname+" is NOT in your PATH, please set up "+prgname+"!"
sys.exit(1)
def format_gg_taxfile(taxfile,folder):
''' Read in Greengenes taxonomy file, and format it and write out into BLCA compatiable format '''
leveldic={'c':'class','g':'genus','k':'superkingdom','f':'family','o':'order','p':'phylum','s':'species'}
taxin=open(folder+'/'+taxfile)
outtaxname=taxfile.split(".")
outtaxfile=''.join(outtaxname[:len(outtaxname)-1:])+".taxonomy"
if os.path.isfile(outtaxfile):
os.remove(outtaxfile)
taxout=open(folder+'/'+outtaxfile,'aw')
for l in taxin:
ln=l.rstrip().replace(" ","").split("\t")
taxout.write(ln[0]+"\t")
tmp=dict(x.split("__") for x in ln[1].split(";"))
for k in tmp.keys():
tmp[leveldic[k]]=tmp.pop(k)
for key, val in tmp.items():
taxout.write(key+':'+val+";")
taxout.write("\n")
taxin.close()
taxout.close()
print ">> "+taxfile+" has been formatted and outputted!"
def make_blastdb(dbfsa,folder):
''' Format downloaded Greengenes sequence fasta files into blast compatiable format '''
check_program('makeblastdb')
dbname=folder+'/'+dbfsa.rstrip('.fasta')
subprocess.call(['makeblastdb','-dbtype',"nucl",'-in',folder+'/'+dbfsa,'-parse_seqids','-out',dbname])
print dbfsa+" has been formatted!"
def ungz(file,folder):
fname=file.rstrip('.gz')
os.system("gzip -dc "+file+" > "+folder+"/"+fname)
print fname+" has been unzipped!"
######## MAIN ###########
foldername='gg'
os.system('rm -fr '+foldername)
os.system('mkdir '+foldername)
ungz(dbfsafile,foldername)
ungz(dbtaxfile,foldername)
taxfile=dbtaxfile.rstrip('.gz')
dbfsa=dbfsafile.rstrip('.gz')
format_gg_taxfile(taxfile,foldername)
make_blastdb(dbfsa,foldername)