-
Notifications
You must be signed in to change notification settings - Fork 5
/
Barley_Parts_to_Pseudomolecules.py
executable file
·209 lines (195 loc) · 7.16 KB
/
Barley_Parts_to_Pseudomolecules.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#!/usr/bin/env python3
"""A simple script to convert the pseudomolecule parts into full
pseudomolecules."""
import argparse
import gzip
# Store the lengths of the barley pseudomolecule parts in a dictionary. We only
# really need to store the first parts, since we will just add the part2
# positions to it.
# Define the length of the part1 pieces as a dictionary constant.
# Morex v1 sizes
PARTS_SIZES_V1 = {
'chr1H': 312837513,
'chr2H': 393532674,
'chr3H': 394310633,
'chr4H': 355061206,
'chr5H': 380865482,
'chr6H': 294822070,
'chr7H': 325797516,
'chrUn': 249774706,
'Pt': 999999999999
}
# Morex v2 sizes including plastids
PARTS_SIZES_V2 = {
'chr1H': 205502676,
'chr2H': 305853815,
'chr3H': 271947776,
'chr4H': 282386439,
'chr5H': 205989812,
'chr6H': 260041240,
'chr7H': 328847863,
'chrUn': 85026395,
'EF115541.1': 136462,
'AP017301.1': 525599,
'Pt': 999999999999
}
# Morex v3 sizes
PARTS_SIZES_V3 = {
'chr1H': 206486643,
'chr2H': 301293086,
'chr3H': 267852507,
'chr4H': 276149121,
'chr5H': 204878572,
'chr6H': 256319444,
'chr7H': 328847192,
'chrUn': 29110253,
'EF115541.1': 136462,
'AP017301.1': 525599,
'Pt': 999999999999
}
def parse_args():
"""Set up an argument parser, and actually parse the args."""
parser = argparse.ArgumentParser(
description='Convert parts coordinates to pseudomolecule coordinates.',
add_help=True
)
# Add mutually exclusive group for the file format. This is so that only
# one format can be specified.
fmt = parser.add_mutually_exclusive_group(required=True)
fmt.add_argument(
'--vcf',
'-v',
action='store_true',
help='Input file is VCF or VCF.gz'
)
fmt.add_argument(
'--bed',
'-b',
action='store_true',
help='Input file is BED'
)
parser.add_argument(
'intervals',
metavar='file',
help='File with intervals to convert'
)
parser.add_argument(
'ref_version',
metavar='reference_version',
help='Input is one of the following: morex_v1, morex_v2, morex_v3')
a = parser.parse_args()
return a
def set_format(args):
"""Set the format that is being used for conversion."""
arg_dict = vars(args)
if arg_dict['vcf']:
return 'VCF'
elif arg_dict['bed']:
return 'BED'
else:
return None
def vcf_conv(intervals, parts_sizes):
"""Convert VCF. Remember that VCF is 1-based, which makes the math a lot nicer.
Handles both VCF and gzipped VCFs."""
if '.gz' in intervals:
# If lines start with '#', print them
with gzip.open(intervals, 'rt') as f:
for line in f:
if line.startswith('#'):
print(line.strip())
else:
tmp = line.strip().split()
# modify the chromosome to not have the part
chrom = tmp[0].split('_')[0]
# And check if we have to modify the position. If the chromosome
# name has '_part2' in it, then we have to modify it.
if '_part2' in tmp[0]:
#offset = parts_sizes[chrom + '_part1']
offset = parts_sizes[chrom]
newpos = str(int(tmp[1]) + offset)
else:
newpos = tmp[1]
# then print out the VCF line with the new position and chromosome
# name
toprint = [chrom, newpos] + tmp[2:]
print('\t'.join(toprint))
else:
# If lines start with '#', print them
with open(intervals, 'r') as f:
for line in f:
if line.startswith('#'):
print(line.strip())
else:
tmp = line.strip().split()
# modify the chromosome to not have the part
chrom = tmp[0].split('_')[0]
# And check if we have to modify the position. If the chromosome
# name has '_part2' in it, then we have to modify it.
if '_part2' in tmp[0]:
#offset = parts_sizes[chrom + '_part1']
offset = parts_sizes[chrom]
newpos = str(int(tmp[1]) + offset)
else:
newpos = tmp[1]
# then print out the VCF line with the new position and chromosome
# name
toprint = [chrom, newpos] + tmp[2:]
print('\t'.join(toprint))
def bed_conv(intervals, parts_sizes):
"""Convert BED. Remember that BED is 0-based (i.e., the first base is 0)."""
# If lines start with '#', print them
with open(intervals, 'r') as f:
for line in f:
if line.startswith('#'):
print(line.strip())
continue
else:
tmp = line.strip().split()
# modify the chromosome to not have th part
chrom = tmp[0].split('_')[0]
# And check if we have to modify the position. If the chromosome
# name has '_part2' in it, then we have to modify it.
if '_part2' in tmp[0]:
#offset = parts_sizes[chrom + '_part1']
offset = parts_sizes[chrom]
newstartpos = str(int(tmp[1]) + offset)
newendpos = str(int(tmp[2]) + offset)
else:
newstartpos = tmp[1]
newendpos = tmp[2]
# Check if standard BED file format with 3 columns
if len(tmp) == 3:
# Then print out BED line with chromosome name and new start and end positions
toprint = [chrom, newstartpos, newendpos]
print('\t'.join(toprint))
elif len(tmp) > 3:
# Print out BED line with chromosome name and new start and end positions
# along with remaining columns
toprint = [chrom, newstartpos, newendpos] + tmp[3:]
print('\t'.join(toprint))
else:
print('Too few columns, please check input file is in BED format.')
exit(1)
def main():
"""Driver function."""
args = parse_args()
f = set_format(args)
# Check which version of the Morex reference we are using
if args.ref_version == "morex_v1":
PARTS_SIZES = PARTS_SIZES_V1
elif args.ref_version == "morex_v2":
PARTS_SIZES = PARTS_SIZES_V2
elif args.ref_version == "morex_v3":
PARTS_SIZES = PARTS_SIZES_V3
else:
print('Invalid reference version provided, valid options are: morex_v1, morex_v2, morex_v3')
exit(1)
# Then, use the format and the supplied file to convert the coordinates
if f == 'VCF':
vcf_conv(args.intervals, PARTS_SIZES)
elif f == 'BED':
bed_conv(args.intervals, PARTS_SIZES)
else:
print('Unrecognized format, weird.')
exit(1)
main() # Run the program