-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathrs_primer.py
185 lines (146 loc) · 9.94 KB
/
rs_primer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Author: WangYaheng
Date: Sep-17-2017 13:20
"""
import re, sys, os, argparse, csv, urllib.request, urllib.parse, urllib.error, urllib.request, urllib.error, urllib.parse, primer3
from bs4 import BeautifulSoup as bs
def get_opt():
'''Check and parsing the opts'''
parser = argparse.ArgumentParser(
prog = 'get_rs_fasta',
description = 'get_rs_fasta: A program to get rs-number fasta.',
usage = '%(prog)s.py [options] -r rs_num -o [results]\n %(prog)s.py [options] -r rs28934571 -o rs28934571'
)
parser.add_argument('-r','--rs_nums', nargs='?',type=str,default=sys.stdin,help="A string of rs_num. [Required]",required=True)
parser.add_argument('-o','--outprefix',nargs='?',type=str,default=sys.stdin,help="Output file prefix name. default: 'results'. [Optional]",required=False)
thermo_group=parser.add_argument_group('Primer3 picks primers for PCR reaction settings', 'set these arguments for real experimental values.')
thermo_group.add_argument('-mono_conc','--PRIMER_SALT_MONOVALENT',nargs='?',type=float,default=50,help='The millimolar (mM) concentration of monovalent salt cations (usually KCl) in the PCR. default=50.0',required=False)
thermo_group.add_argument('-diva_conc','--PRIMER_SALT_DIVALENT',nargs='?',type=float,default=1.5,help='The millimolar (mM) concentration of divalent salt cations (usually MgCl^(2+)) in the PCR. default=1.5',required=False)
thermo_group.add_argument('-oligo_conc','--PRIMER_DNA_CONC',nargs='?',type=float,default=50,help='A value to use as nanomolar (nM) concentration of each annealing oligo over the course the PCR. default=50.0',required=False)
thermo_group.add_argument('-dntp_conc','--PRIMER_DNTP_CONC',nargs='?',type=float,default=0.6,help='The millimolar (mM) concentration of the sum of all deoxyribonucleotide triphosphates. dafault=0.6',required=False)
ampli_group=parser.add_argument_group('PCR product settings','set these argument for your expected multi-PCR products.')
ampli_group.add_argument('--PRIMER_PICK_LEFT_PRIMER',nargs='?',type=int,default=1,help="If the associated value = 1 (non-0), then primer3 will attempt to pick left primers. default=1",required=False)
ampli_group.add_argument('--PRIMER_PICK_RIGHT_PRIMER',nargs='?',type=int,default=1,help="If the associated value = 1 (non-0), then primer3 will attempt to pick left primers. default=1",required=False)
ampli_group.add_argument('--PRIMER_PICK_INTERNAL_PRIMER',nargs='?',type=int,default=0,help="If the associated value = 1 (non-0), then primer3 will attempt to pick an internal oligo (hybridization probe to detect the PCR product). default=0",required=False)
ampli_group.add_argument('--PRIMER_MAX_SIZE',nargs='?',type=int,default=21,help='Maximum acceptable length (in bases) of a primer. Currently this parameter cannot be larger than 35. default=27',required=False)
ampli_group.add_argument('--PRIMER_MIN_SIZE',nargs='?',type=int,default=19,help='Minimum acceptable length of a primer. Must be greater than 0 and less than or equal to PRIMER_MAX_SIZE. default=18',required=False)
ampli_group.add_argument('--PRIMER_OPT_SIZE',nargs='?',type=int,default=20,help='Optimum length (in bases) of a primer. default=20',required=False)
ampli_group.add_argument('--PRIMER_MAX_GC',nargs='?',type=float,default=80,help='Maximum allowable percentage of Gs and Cs in any primer generated by Primer. default=80.0',required=False)
ampli_group.add_argument('--PRIMER_MIN_GC',nargs='?',type=float,default=20,help='Minimum allowable percentage of Gs and Cs in any primer. default=20.0',required=False)
ampli_group.add_argument('--PRIMER_OPT_GC_PERCENT',nargs='?',type=float,default=50,help='Optimum GC percent. default=65.0',required=False)
ampli_group.add_argument('--PRIMER_MAX_TM',nargs='?',type=float,default=70,help='Maximum acceptable melting temperature (Celsius) for a primer oligo. default=80.0',required=False)
ampli_group.add_argument('--PRIMER_MIN_TM',nargs='?',type=float,default=50,help='Minimum acceptable melting temperature (Celsius) for a primer oligo. default=55.0',required=False)
ampli_group.add_argument('--PRIMER_OPT_TM',nargs='?',type=float,default=60,help='Optimum melting temperature (Celsius) for a primer. default=65.0',required=False)
ampli_group.add_argument('--PRIMER_MAX_DIFF_TM',nargs='?',type=float,default=100,help='Maximum acceptable (unsigned) difference between the melting temperatures of the left and right primers. default=100.0',required=False)
ampli_group.add_argument('-n','--PRIMER_NUM_RETURN',nargs='?',type=int,default=2,help='The maximum number of primer (pairs) to return. Primer pairs returned are sorted by their "quality", in other words by the value of the objective function (where a lower number indicates a better primer pair). default=5',required=False)
ampli_group.add_argument('--PRIMER_PICK_ANYWAY',nargs='?',type=int,default=1,help='Return primer even if it violates specific constraints. default=1',required=False)
ampli_group.add_argument('--PRIMER_MIN_LEFT_THREE_PRIME_DISTANCE',nargs='?',type=int,default=8,help="When returning multiple primer pairs, the minimum number of base pairs between the 3' ends of any two left primers. default=-1",required=False)
ampli_group.add_argument('--PRIMER_MIN_RIGHT_THREE_PRIME_DISTANCE',nargs='?',type=int,default=8,help="When returning multiple primer pairs, the minimum number of base pairs between the 3' ends of any two right primers. default=-1",required=False)
ampli_group.add_argument('--PRIMER_PRODUCT_SIZE_RANGE',nargs='?',type=str,default='80-250',help="The associated values specify the lengths of the product that the user wants the primers to create, and is a space separated list of elements of the form. default='200-225,225-250'", required=False)
args=parser.parse_args()
return args
def paser_html(rs_num, path, base_url = 'https://www.ncbi.nlm.nih.gov/projects/SNP/snp_ref.cgi?rs='):
html = urllib.request.urlopen(base_url+rs_num).read()
soup = bs(html, 'html5lib')
seq_tmp = soup.find('div', id='Fasta').get_text().split()
name_tmp = [content.split('=') for content in seq_tmp[0].replace('>', '').split('|')]
alleles = name_tmp[7][-1].replace('\'', '')
mol = name_tmp[8][-1]
rs_num = name_tmp[2][0]
allePos = int(name_tmp[3][-1])
totalLen= float(name_tmp[4][-1]) - 1
seq = ''.join(seq_tmp[1:]).upper()
sub_seq = seq[allePos - 31 : allePos - 1] + seq[allePos : allePos + 30]
sub_gc_percent = 0.0
for i in sub_seq:
sub_gc_percent += 1 if i == 'G' or i == 'C' else 0
sub_gc_percent = sub_gc_percent / len(sub_seq)
total_gc_percent=0
for i in seq:
total_gc_percent +=1 if i == 'G' or i == 'C' else 0
total_gc_percent = total_gc_percent / totalLen
outname = '>%s %s %s: null GC:%.3f SNP_GC:%.2f\n'%(rs_num, mol, alleles, total_gc_percent, sub_gc_percent)
outseq = seq.replace(re.findall(re.compile('[RYWSMKBHDVN]'), seq)[0], alleles)+'\n'
return rs_num, seq, allePos, outname, outseq
def _thermo_args(args):
thermo_args = {}
[thermo_args.setdefault(k, v) for k, v in args._get_kwargs() if k != 'rs_num' and k != 'outprefix']
thermo_args['PRIMER_PRODUCT_SIZE_RANGE'] = [[int(y) for y in x.split('-')] for x in args.PRIMER_PRODUCT_SIZE_RANGE.strip(',').split(',')]
return thermo_args
def _seq_args(seq_info):
rs_num, seq, allePos = seq_info
seq_args = {}
seq_args.setdefault('SEQUENCE_ID', str(rs_num))
seq_args.setdefault('SEQUENCE_TEMPLATE', str(seq.replace(re.findall(re.compile('[RYWSMKBHDVN]'), seq)[0], 'N')))
seq_args.setdefault('SEQUENCE_TARGET', [allePos-26, 50])
return seq_args
def _s_primer(seq_args, thermo_args):
p3 = primer3.bindings.designPrimers(seq_args, thermo_args)
p3.setdefault('SEQUENCE_ID', seq_args['SEQUENCE_ID'])
p3.setdefault('SEQUENCE_TEMPLATE', seq_args['SEQUENCE_TEMPLATE'])
return p3
def parser_p3out(p3_out):
p3dict = {}
seq_id = []
for record in p3_out:
id = record['SEQUENCE_ID']
seq = record['SEQUENCE_TEMPLATE']
seq_id.append(id)
ok = record['PRIMER_PAIR_EXPLAIN']
if re.search('ok 0\s*$', ok) or re.search('ok 1\s*$', ok):
msg = "No acceptable primer pairs were found for %s. Try relaxing various parameters, including the max and min primer melting temperatures, GC content, product size range or primer size." % id
print(msg)
exit()
tmp_dict = {}
for key, value in list(record.items()):
r = re.compile('PRIMER_LEFT_(\d+)?(_)?SEQUENCE$')
m = r.search(key)
if m:
fp = value
sn = int(m.group(1)) if m.group(1) else 0
if sn not in tmp_dict:
tmp_dict[sn] = {}
tmp_dict[sn]['fp'] = fp
r = re.compile('PRIMER_RIGHT_(\d+)?(_)?SEQUENCE$')
m = r.search(key)
if m:
rp = value
sn = int(m.group(1)) if m.group(1) else 0
if sn not in tmp_dict:
tmp_dict[sn] = {}
tmp_dict[sn]['rp'] = rp
# Product size
r = re.compile('PRIMER_PAIR_(\d+)?(_)?PRODUCT_SIZE$')
m = r.search(key)
if m:
size = value
sn = int(m.group(1)) if m.group(1) else 0
if sn not in tmp_dict:
tmp_dict[sn] = {}
tmp_dict[sn]['size'] = int(size)
p3dict[id] = tmp_dict
return p3dict, seq_id
if __name__ == '__main__':
args = get_opt()
path = os.path.dirname(os.path.abspath(__file__))
thermo_args = _thermo_args(args)
rs_nums = [rs_num.strip() for rs_num in args.rs_nums.strip(',').split(',')]
seq_infos =[]
for rs_num in rs_nums:
tmp = paser_html(rs_num, path)
seq_infos.append(tmp[:3])
with open(os.path.join(path, 'results.doc'), 'a') as file:
file.write(tmp[3])
file.write(tmp[4])
print("rs_num: %s fasta seq have been writen in file %s\\results.doc"%(rs_num, path))
seq_argss = [_seq_args(seq_info) for seq_info in seq_infos]
p3_out = [_s_primer(seq_args, thermo_args) for seq_args in seq_argss]
p3dict, seq_id = parser_p3out(p3_out)
with open(os.path.join(path, 'primers.csv'), 'w', newline ='') as file:
csvfile = csv.writer(file, dialect='excel')
csvfile.writerow(['rs_num', 'f_primer', 'r_primer', 'product_size'])
for id in seq_id:
csvfile.writerow([id, p3dict[id][0]['fp'], p3dict[id][0]['rp'], p3dict[id][0]['size']])
print("rs_num: %s , designed primers have been writen in file %s\\primers.csv"%(id, path))