-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmu-convert-fungene-ids-to-taxastring.py
92 lines (80 loc) · 3.82 KB
/
mu-convert-fungene-ids-to-taxastring.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/usr/bin/env python
import sys
import re
## This script is designed to take a list of node hits that were obtained from a vsearch of fungene dna or protein seqs
## and convert the names to the taxonomic string associated with the assigned species id in the header. Here are the steps to
## generate the necessary infiles.
##
# 1. Run vsearch on the MED output NODE-REPRESENTATIVES.fasta ( note , you need to remove the "-" pads on the seqs prior to vsearch
# vsearch --usearch_global NODE-REPRESENTATIVES.fa --db ~/Dropbox/oxygen_gradient/dsrB-references/dsrB-nuclotide.fa --blast6out NODE-HITS.txt --id 0.6
#
# 2. Return the NODE-HITS using the following unix commands
# cut -f 1 -d "|" NODE-HITS.txt > NODE-HITS-1.txt
# cut -f 2 NODE-HITS.txt > NODE-HITS-2.txt
# paste NODE-HITS-1.txt NODE-HITS-2.txt > NODE-HITS-1-2.txt
# rm NODE-HITS-1.txt
# rm NODE-HITS-2.txt
# mv NODE-HITS-1-2.txt NODE-HITS.txt
#
# file looks like this
# NODE-ID fungene-id
# 000008777 CP007518
# 000008635 LZFM01000013
# 000001533 CP000142
# 000002489 AKFT01000013
#
#
# 3. Use taxize (an R library) to determine the string of taxonomy associated with each species in the header
# Here is an example to use taxize in R. The variable "d" (below) was generated using unix and text editing to return
# just the genus name.
#
# library("taxize")
# d <- c("Acetonema","Acidobacteria","Actinobacillus","Actinomyces")
# work = tax_name(d, get = c("phylum","class","order","family","genus"), db = "ncbi")
# write.table(work, "~/Dropbox/oxygen_gradient/nrfA-references/raw_taxit.txt")
#
# needed to fix the table a bit
# sed 's/"//g' raw_taxit.txt | sed 's/ / /g' > fix
# mv fix raw_taxit.txt
#
#
# 4. Now we come to the utility of this baby script. we need to create a table where the file contains the following fields
#
# node Domain Phylum Class Order Family Genus Species
#
#. %^ * ~ ~ . %^ * ~ ~ . LETS GET DOWN TO BUSINESS AND MAKE THIS HAPPEN . %^ * ~ ~ . %^ * ~ ~ .
# Read in the node hits
node_hits = open(sys.argv[1], 'rU')
# Read in the fungene reference file - headers for each of the fungene reference headers. It was created like this
# grep ">" nrfA-fungene.fa | sed 's/\>//g' > fungene_ids.txt
node_refs = open(sys.argv[2], 'rU')
# Read in the taxize table
taxa = open(sys.argv[3], 'rU')
hits_dict = {}
for node in node_hits:
x = node.strip().split('\t')
hits_dict[x[0]] = x[1]
taxa_dict = {}
for id in taxa:
x = id.strip().split(' ')
#genus is the key
taxa_dict[x[2]] = x[2],x[3],x[4],x[5],x[6]
for key in taxa_dict.keys():
print taxa_dict[key][1]
refs_dict = {}
for ref in node_refs:
x = ref.strip()
x = re.split(',| |=', x)
refs_dict[x[0]] = x[5],x[6]
hits_refs_dict = {}
for key in hits_dict.keys():
hits_refs_dict[key] = hits_dict[key], refs_dict[hits_dict[key]][0], refs_dict[hits_dict[key]][1]
output = open('fungene.tax', 'w')
#output.write('node'+";"+'Phylum'+";"+'Class'+";"+'Order'+";"+'Family'+";"+'Genus'+";"+'Species'+";"+"\n")
for key in hits_refs_dict.keys():
print hits_refs_dict[key], key
if hits_refs_dict[key][1] in taxa_dict.keys():
# output.write(key+';'+taxa_dict[hits_refs_dict[key][1]][0]+';'+taxa_dict[hits_refs_dict[key][1]][1]+';'+taxa_dict[hits_refs_dict[key][1]][2]+';'+taxa_dict[hits_refs_dict[key][1]][3]+';'+taxa_dict[hits_refs_dict[key][1]][4]+';'+hits_refs_dict[key][2]+'\n')
output.write(hits_refs_dict[key][0]+'\t'+taxa_dict[hits_refs_dict[key][1]][1]+';'+taxa_dict[hits_refs_dict[key][1]][2]+';'+taxa_dict[hits_refs_dict[key][1]][3]+';'+taxa_dict[hits_refs_dict[key][1]][4]+';'+taxa_dict[hits_refs_dict[key][1]][0]+';'+hits_refs_dict[key][2]+'\n')
else:
output.write(hits_refs_dict[key][0]+'\t'+"unknown"+';'+"unknown"+';'+"unknown"+';'+"unknown"+';'+"unknown"+';'+"unknown"+'\n')