-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmethreport.py
executable file
·165 lines (132 loc) · 4.76 KB
/
methreport.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#!/usr/bin/env python
## (c) 2014, Alberto Riva (ariva@ufl.edu)
## DiBiG, ICBR Bioinformatics, University of Florida
import sys
from Bio import SeqIO
import Script
# Script object
def usage():
sys.stderr.write("""methreport.py - Report methylation rate at CG and GC positions.
Usage: methreport.py [-gcg] infile [outfile]
`Infile' should be a multi-FASTA file in which the first sequence is assumed to
be the reference. All other sequences should have the same length as the reference
and be aligned to it. This program will identify all CG and GC positions in the
reference sequence (including GCG positions if the -gcg option is specified) and
report to the output the nummber and fraction of unconverted Cs at each position.
Output is written to `outfile', is specified, or to standard output. Entries for
CG positions are written first, followed by those for GC positions. The output is
in tab-delimited format with three columns: position, number of unconverted Cs,
fraction of unconverted Cs (over total number of sequences examined).
Options:
-gcg | Do not exclude GCG positions from analysis.
""")
P = Script.Script("methreport.py", version="1.0", usage=usage)
EXCLGCG=False
# Utility classes
class refDesc():
"""A class containing the reference sequence, its length, and a list of CG and GC positions."""
sequence = None
length = 0
CGpositions = []
GCpositions = []
numCGs = 0
numGCs = 0
global EXCLGCG
def __init__(self, ref):
length = len(ref)
self.sequence = ref
self.length = length
self.CGpositions = detectCG(ref, length, EXCLGCG)
self.GCpositions = detectGC(ref, length, EXCLGCG)
self.numCGs = len(self.CGpositions)
self.numGCs = len(self.GCpositions)
### General
def loadSequences(filename):
return SeqIO.parse(filename, "fasta")
def detectCG(seq, length, excludeGCG=False):
"""Returns the list of C positions in CG dinucleotides in sequence `seq'.
If `excludeGCG' is True, ignores GCG positions."""
result = []
candidate = False
for i in range(length):
if (seq[i] == 'C'):
candidate = i
elif (seq[i] == 'G') and candidate:
if excludeGCG:
if (i < 2) or seq[i-2] != 'G':
result.append(candidate)
candidate = False
else:
result.append(candidate)
candidate = False
else:
candidate = False
return result
def detectGC(seq, length, excludeGCG=False):
"""Returns the list of C positions in GC dinucleotides in sequence `seq'.
If `excludeGCG' is True, ignores GCG positions."""
result = []
candidate = False
for i in range(1, length):
if (seq[i] == 'C') and (seq[i-1] == 'G'):
# This is a GC position. Now check position i+1 if excluding GCGs
if excludeGCG:
if (i == length-1) or seq[i+1] != 'G':
result.append(i)
# Otherwise, simply add the position
else:
result.append(i)
return result
def formatTabDelim(stream, l):
stream.write("\t".join(l) + "\n")
def main():
global EXCLGCG
infile = None
outfile = ""
# Parse arguments
args = sys.argv[1:]
P.standardOpts(args)
for arg in args:
if arg == "-gcg":
EXCLGCG = True
elif infile == "":
infile = P.isFile(arg)
else:
outfile = arg
if not infile:
P.errmsg(P.NOFILE)
nreads = 0
seqs = loadSequences(infile)
rd = refDesc(seqs.next()) # reference sequence
print("Reference sequence loaded from file `{}'.".format(infile))
print("{}bp, {} CG positions, {} GC positions.".format(rd.length, rd.numCGs, rd.numGCs))
CGarr = [ [p, 0] for p in rd.CGpositions ]
GCarr = [ [p, 0] for p in rd.GCpositions ]
print("Reading sequences...")
for s in seqs:
nreads += 1
for p in CGarr:
if s[p[0]] == 'C':
p[1] += 1
for p in GCarr:
if s[p[0]] == 'C':
p[1] += 1
if outfile:
out = open(outfile, "w")
else:
out = sys.stdout
try:
formatTabDelim(out, ["#CG", "Sites:", str(len(CGarr))])
formatTabDelim(out, ["CG pos", "Num C", "Perc C"])
for pair in CGarr:
formatTabDelim(out, [str(pair[0]), str(pair[1]), str(1.0 * pair[1] / nreads)])
out.write("\n")
formatTabDelim(out, ["#GC", "Sites:", str(len(GCarr))])
formatTabDelim(out, ["GC pos", "Num C", "Perc C"])
for pair in GCarr:
formatTabDelim(out, [str(pair[0]), str(pair[1]), str(1.0 * pair[1] / nreads)])
finally:
if outfile:
out.close()
if __name__ == "__main__":
main()