forked from henryjuho/sal_enhancers
-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_read_group_info.py
87 lines (60 loc) · 2.68 KB
/
get_read_group_info.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import sys
import time
import gzip
def sample_names(sample_csv, download_info):
accession2sample = {x.split(',')[-1].replace(' ', '').rstrip(): x.split(',')[3].replace(' ', '').rstrip() for x in open(sample_csv) if not x.startswith('Pop')}
run_accession2sample = {}
for line in open(download_info):
if line.startswith('study'):
continue
line = line.rstrip().split('\t')
sample_accession = line[2]
run_accession = line[4]
try:
sample = accession2sample[sample_accession]
except KeyError:
sample = accession2sample[run_accession]
run_accession2sample[run_accession] = sample
return run_accession2sample
def main():
"""
from Padraic's: https://github.com/padraicc/Corcoran_et_al_2017/blob/master/great_tit/get_read_group_info_bgi.py
read group tag info: https://gatkforums.broadinstitute.org/gatk/discussion/6472/read-groups
also here: https://samtools.github.io/hts-specs/SAMv1.pdf
"""
# generate time for dt read group tag
# should be date of sequencing, but just use current
# date to fill in this tag, so that the file produced
# works with the bwa_mem_real.sh script for mapping
date = time.localtime()
dt = str(date[0]) + '-' + str(date[1]) + '-' + str(date[2])
get_sample = sample_names('barson_tableS5.csv', 'PRJEB10744.txt')
f1 = open('fastq_and_rg.txt', 'w')
for i in sys.stdin:
i = i.rstrip()
col = i.strip().split('/')
run_accession = col[-1].split('_')[0]
sm = get_sample[run_accession] # sample acquired from csv file
# get the flowcell and lane id from the first record of fastq 1
fq_1 = i
fq_2 = i.replace('1_val_1.fq.gz', '2_val_2.fq.gz')
fq = gzip.open(fq_1, 'rb')
fq_header = fq.readline().decode('utf-8')
fq.close()
fq_fields = fq_header.split(':')
flowcell = fq_fields[2]
lane = fq_fields[3]
pl = 'illumina' # platform
pu = flowcell + '_' + lane # platform unit
lb = run_accession + '_' + 'lib1' # library identifier
pi = '500' # Predicted median insert size
ds = 'paired_end' # description
cn = 'NORWEGIAN_UNIVERSITY_OF_LIFE_SCIENCES' # Name of sequencing center producing the read.
rg_id = lb + '_' + lane # read group id
# print fq_1 + '\t' + fq_2 + '\t' + rg_id + '\t' + pl \
# + '\t' + pu + '\t' + lb + '\t' + pi + '\t' + \
# ds + '\t' + dt + '\t' + sm + '\t' + cn
f1.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
(fq_1, fq_2, rg_id, pl, pu, lb, pi, ds, dt, sm, cn))
if __name__ == '__main__':
main()