-
Notifications
You must be signed in to change notification settings - Fork 0
/
script-20090715a.py
124 lines (113 loc) · 3.89 KB
/
script-20090715a.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
"""
This is a junk script.
The idea is to read some data and newick trees,
then give nested gene clusters within each data set,
then give clusters that are the same between data sets.
I hope that each experiment was done with the same set of genes.
The newick trees do not represent the phylogenetic relationship among the genes;
rather they represent a kind of inferred correlation structure.
"""
import sys
import logging
from khatrisvd import treebuilder
from khatrisvd import mtree
def get_gene_names(data_filename):
"""
@param data_filename: the name of the data file
@return: a list of gene names
"""
# read the gene names from the file
fin = open(data_filename)
lines = fin.readlines()
fin.close()
# strip the whitespace from the ends of the lines
lines = [line.strip() for line in lines]
# remove empty lines
lines = [line for line in lines if line]
# remove the first line of headers
lines = lines[1:]
# split lines by commas
rows = [[s.strip() for s in line.split(',')] for line in lines]
# get only the first element of each line
names = [row[0] for row in rows]
# remove the quotes from the gene names
names = [name[1:-1].strip() for name in names]
# return the list
return names
def summarize_data():
"""
The first two sys.argv arguments are meant to be the two data files.
"""
# get the first gene names
data_filename = sys.argv[1]
names_a = get_gene_names(data_filename)
print len(names_a), 'names'
# get the second gene names
data_filename = sys.argv[2]
names_b = get_gene_names(data_filename)
print len(names_b), 'names'
print
# compare the names
for a, b in zip(names_a, names_b)[:10]:
print a
print b
print
# compare all of the names
nequal = 0
ndifferent = 0
for a, b in zip(names_a, names_b):
if a == b:
nequal += 1
else:
ndifferent += 1
print nequal, 'equal'
print ndifferent, 'different'
def get_tree(tree_filename):
fin = open(tree_filename)
multistring = fin.read()
fin.close()
return mtree.newick_to_mtree(multistring)
def analyze_clusters():
"""
First arg is data file, second arg is newick file.
"""
logging.debug('parse the gene names')
data_filename = sys.argv[1]
names = get_gene_names(data_filename)
logging.debug('parse the tree')
tree_filename = sys.argv[2]
root = get_tree(tree_filename)
logging.debug('center and sort the tree')
root = treebuilder.center_and_sort_tree(root)
logging.debug('get the clusters')
clusters = treebuilder.get_clusters(root)
logging.debug('show the corresponding genes')
length_cluster_pairs = list(sorted((len(c), list(c)) for c in clusters))
for l, c in length_cluster_pairs:
#print len(c)
print ', '.join(names[x] for x in c)
def analyze_consensus_clusters():
logging.debug('parse the gene names')
data_filename = sys.argv[1]
names = get_gene_names(data_filename)
logging.debug('get clusters from the first tree')
tree_filename = sys.argv[2]
root = get_tree(tree_filename)
root = treebuilder.center_and_sort_tree(root)
first_clusters = treebuilder.get_clusters(root)
logging.debug('get clusters from the second tree')
tree_filename = sys.argv[3]
root = get_tree(tree_filename)
root = treebuilder.center_and_sort_tree(root)
second_clusters = treebuilder.get_clusters(root)
logging.debug('get consensus clusters')
consensus = set(frozenset(c) for c in first_clusters) & set(frozenset(c) for c in second_clusters)
logging.debug('show the corresponding genes')
length_cluster_pairs = list(sorted((len(c), list(c)) for c in consensus))
for l, c in length_cluster_pairs:
#print len(c)
print ', '.join(names[x] for x in c)
def main():
analyze_clusters()
if __name__ == '__main__':
main()