-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmap_gene.py
123 lines (105 loc) · 3.7 KB
/
map_gene.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import argparse, re
# Get input arguments
parser = argparse.ArgumentParser()
parser.add_argument('--gene', help='gene id')
parser.add_argument('--genes', help='list of gene ids')
parser.add_argument('--col', help='column', type=int, default=0)
parser.add_argument('--replace', help='replace?', default=False, action='store_true')
parser.add_argument('--mouse', help='use mouse target & db', default=False, action='store_true')
parser.add_argument('--multi', help='multiple mappings', default=False, action='store_true')
parser.add_argument('--full_na', help='full NAs', default=False, action='store_true')
parser.add_argument('--skip_na', help='skip NAs', default=False, action='store_true')
parser.add_argument('--db', help='gene info database', default='/home/unix/csmillie/aviv/db/human.gene_info.txt')
parser.add_argument('--target', help='target list', default='/home/unix/csmillie/aviv/db/human_genes.txt')
args = parser.parse_args()
# Get mouse files
if args.mouse:
args.db = '/home/unix/csmillie/aviv/db/mouse.gene_info.txt'
args.target = '/home/unix/csmillie/aviv/db/mouse_genes.txt'
# Fix case
tcase = {} # map gene.upper() -> gene
qcase = {} # map gene.upper() -> gene
def fix_gene(gene):
gene = gene.upper()
return gene
# Get target genes
target = {}
for line in open(args.target):
gene = line.rstrip()
GENE = fix_gene(gene)
target[GENE] = 1
tcase[GENE] = gene
# Get query genes
query = {}
if args.gene:
gene = args.gene
GENE = fix_gene(gene)
query[GENE] = []
qcase[GENE] = gene
if args.genes:
for line in open(args.genes):
gene = line.rstrip().split()[args.col]
GENE = fix_gene(gene)
query[GENE] = []
qcase[GENE] = gene
# Populate query
for GENE in query:
if GENE in target:
query[GENE] = [GENE]
# Read gene info
for line in open(args.db):
if line.startswith('#'):
continue
line = line.rstrip().split('\t')
genes = [line[2]] + line[4].split('|')
GENES = [fix_gene(gene) for gene in genes]
for QGENE in GENES:
if QGENE in query:
if args.multi == False and len(query[QGENE]) > 0:
continue
for TGENE in GENES:
if TGENE in target:
query[QGENE].append(TGENE)
if args.multi == False:
break
# Print map
if args.gene:
qgene = args.gene
QGENE = fix_gene(args.gene)
if len(query[QGENE]) > 0:
TGENES = query[QGENE]
tgenes = ','.join([tcase[TGENE] for TGENE in TGENES])
print '%s\t%s' %(qgene, tgenes)
else:
if args.skip_na == False:
if args.full_na == True:
tgene = '%s:NA' %(qgene)
else:
tgene = 'NA'
print '%s\t%s' %(qgene, tgene)
if args.genes:
for line in open(args.genes):
line = line.rstrip().split()
qgene = line[args.col]
QGENE = fix_gene(qgene)
if len(query[QGENE]) > 0:
TGENES = query[QGENE]
tgenes = ','.join(set([tcase[TGENE] for TGENE in TGENES]))
if args.replace == True:
line[args.col] = tgenes
else:
line.insert(args.col+1, tgenes)
else:
if args.skip_na == True:
continue
if args.full_na == True:
if args.replace == True:
line[args.col] = '%s:NA' %(qgene)
else:
line.insert(args.col+1, '%s:NA' %(qgene))
else:
if args.replace == True:
line[args.col] = 'NA'
else:
line.insert(args.col+1, 'NA')
print '\t'.join(line)