import pysam
import pandas as pd
import re
import math
import gzip
import pyranges as pr
import numpy as np
import scanpy as sc
import swan_vis as swan


def process_meta(meta_fname):
    """Read the mouse metadata TSV and normalize the genotype and age
    columns so the values are safe to use in file and dataset names."""
    meta = pd.read_csv(meta_fname, sep='\t')
meta['genotype'] = meta.genotype.str.replace(' ', '_')
meta['genotype'] = meta.genotype.str.replace('/', '_')
meta['age'] = meta.age.str.replace(' ', '_')
dupe_ids = meta.loc[meta.mouse_id.duplicated(), 'mouse_id'].tolist()
if len(dupe_ids) > 0:
raise ValueError(f'Found duplicated mouse ids {dupe_ids} in mouse metadata')
return meta
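
# Example usage (a minimal sketch; 'metadata.tsv' is a hypothetical path and
# the columns referenced are the ones the function above expects):
# meta = process_meta('metadata.tsv')
# print(meta[['mouse_id', 'genotype', 'age']].head())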


def parse_config_file(fname,
                      meta_fname,
                      datasets_per_run,
                      auto_dedupe=True):
    """
    Parameters:
        fname (str): Path to config file fname. One line per input fastq.
        meta_fname (str): Path to file with metadata information.
        datasets_per_run (int): Number of datasets to process in each TALON run
        auto_dedupe (bool): Automatically deduplicate duplicate fastqs that result
            from successive Porechop rounds

    Returns:
        df (pandas DataFrame): DF w/ pipeline information; one line per fastq
        dataset_df (pandas DataFrame): DF w/ dataset information; one line per mouse
    """
df = pd.read_csv(fname, sep='\t')

    ############ Basename + fname
df['basename'] = df.fname.str.rsplit('/', n=1, expand=True)[1]
df['path'] = df.fname.str.rsplit('/', n=1, expand=True)[0]

    ############ Dataset + flowcell df
# get flowcell
    exp = r'.*\/[\w-]+_(\d+)(?:_t\d+)?\.fastq(?:\.gz)?'
df['flowcell'] = df.fname.str.extract(exp)
# check to make sure the same file stem isn't there more than once
# (can happen if different flow cells needed different amounts of chopping)
# df['file_stem'] = df.basename.str.rsplit('_', n=1, expand=True)[0]
    exp = r'.*\/([\w-]+_\d+)(?:_t\d+)?\.fastq(?:\.gz)?'
df['file_stem'] = df.fname.str.extract(exp)
df['chop_num'] = df.basename.str.rsplit('.fastq', expand=True)[0].str.rsplit('_t', expand=True)[1].astype(float)
if df.file_stem.duplicated().any():
dupe_stems = df.loc[df.file_stem.duplicated(keep=False), 'basename'].tolist()
if not auto_dedupe:
raise ValueError(f'Files {dupe_stems} seem to be duplicated. Check config file.')
else:
print(f'Files {dupe_stems} seem to be duplicated. Automatically removing lower chop numbers')
df = df.sort_values(by='chop_num', ascending=False)
df = df.drop_duplicates(subset='file_stem', keep='first')
# extract the sample name
    temp = df.basename.str.split('_', expand=True)[[0, 1]]
df['sample_temp'] = temp[0]+'_'+temp[1]
# extract the mouse id
df['mouse_id'] = df['sample_temp'].str.split('_', expand=True)[1]
# extract the "study" name
exp = '^(ad[0-9]+)'
df['study'] = df.basename.str.extract(exp)
# merge in metadata
meta = process_meta(meta_fname)
df['mouse_id'] = df['mouse_id'].astype('int')
df = df.merge(meta, how='left', on='mouse_id')
    # get tech rep numbers -- each mouse was sequenced on multiple
    # flow cells, which are therefore technical reps
df['flowcell'] = df.sort_values(['genotype', 'mouse_id'],
ascending=[True, True])\
.groupby(['mouse_id']) \
.cumcount() + 1
# sample should be the genotype + age + sex + tissue
df['sample'] = df.genotype+'_'+ \
df.sex+'_'+ \
df.age+'_'+ \
df.tissue
# get biorep numbers -- each mouse_id is a different mouse
# and therefore a different biorep
temp = df[['sample', 'mouse_id']].drop_duplicates()
temp.reset_index(inplace=True, drop=True)
temp['biorep_num'] = temp.sort_values(['sample', 'mouse_id'],
ascending=[True, True])\
.groupby(['sample']) \
.cumcount()+1
df = df.merge(temp, how='left',
on=['sample', 'mouse_id'])
# talon dataset should be sample + bio rep
df['dataset'] = df['sample']+'_'+df['biorep_num'].astype(str)
    # # dataset should be sample + bio rep + flow cell
# df['dataset'] = df['talon_dataset']+'_'+df['flowcell'].astype(str)

    ############ TALON dataset df
# create a dataset-level df that will represent the aggregate
cols = ['sample', 'mouse_id', 'genotype', 'sex', 'study', \
'age', 'tissue', 'biorep_num', 'dataset', 'platform']
dataset_df = df[cols].drop_duplicates()
# get the talon run number these will go into
dataset_df = dataset_df.sort_values(by='study', ascending=False)
dataset_df = dataset_df.reset_index(drop=True)
dataset_df['talon_run_num'] = np.nan
for ind, entry in dataset_df.iterrows():
curr_study = entry.study
# first entry
if ind == 0:
run_num = 0
study_ind = 0
prev_study = curr_study
# when we find a new study
if curr_study != prev_study:
run_num = 0
study_ind = 0
# when we hit the max. # datasets / run
if study_ind % datasets_per_run == 0:
run_num += 1
# actual assignment of number
dataset_df.loc[ind, 'talon_run_num'] = run_num
# keep track of last study that was assigned a number
prev_study = curr_study
# inc study ind
study_ind += 1
# clean up some typing
dataset_df['talon_run_num'] = dataset_df.talon_run_num.astype(int)
df['flowcell'] = df.flowcell.astype(str)
return df, dataset_df
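
# Example usage (a minimal sketch; both file paths are hypothetical):
# df, dataset_df = parse_config_file('config.tsv', 'metadata.tsv',
#                                    datasets_per_run=4)
# df has one row per input fastq; dataset_df has one row per dataset,
# each with its assigned talon_run_num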


def rev_comp(seq):
    """ Returns the reverse complement of a DNA sequence,
        retaining the case of each letter"""
    comp_map = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N',
                'a': 't', 't': 'a', 'g': 'c', 'c': 'g', 'n': 'n',
                '*': '*'}
    complement = ""
    for base in seq:
        if base in comp_map:
            complement += comp_map[base]
        else:
            # unknown bases are kept as-is, with a warning
            complement += base
            print(f"Warning: reverse complement function encountered unknown base '{base}'")
    return complement[::-1]
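
# Example (sanity check; case is preserved per base):
# rev_comp('ATCGn')  # -> 'nCGAT'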


def flip_fastq(infile, outfile):
    """Reverse complement reads that begin with a poly-T run so that all
    reads in the output fastq share the same orientation."""
    reverse = False
    if infile.endswith('.gz'):
        f = gzip.open(infile, 'rt')
    else:
        f = open(infile, 'r')
    o = open(outfile, 'w')
    for line in f:
        line = line.strip()
        # if line starts with @, write identifier to output file
        # (heuristic match on header length)
        if re.match('^@.{19,49}', line):
            o.write(line + "\n")
        elif re.match('^[ATCG].*', line):
            seq = line
            # if there is a poly-T repeat at the beginning of the read,
            # reverse complement the sequence, then write to output file
            if re.match('TTTTTTTTTTTTTTTTTT*', seq):
                line = rev_comp(seq)
                o.write(line + '\n')
                reverse = True
            # if there is no poly-T repeat at the beginning of the read,
            # write to output file unchanged
            else:
                o.write(line + '\n')
                reverse = False
        # the '+' separator and the quality score line both fall through to
        # here; reversing the one-character separator is a no-op
        else:
            qual = line
            # if the sequence line was reversed, reverse the quality score
            # line before writing to output
            if reverse:
                line = qual[::-1]
                o.write(line + '\n')
            # if the sequence line was not reversed, write the quality
            # score line to output unchanged
            else:
                o.write(line + '\n')
    o.close()
    f.close()
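
# Example usage (a minimal sketch; paths are hypothetical):
# flip_fastq('reads.fastq.gz', 'reads_flipped.fastq')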


def reverse_alignment(infile, outfile, threads=1):
    """Flip the SAM strand flag of reads whose 'ts' (transcript strand)
    tag is '-', so that downstream tools see all reads as 'ts:+'."""
    reverse_strand = {0: 16, 16: 0}
    if infile.endswith('.bam'):
        in_mode = 'rb'
    else:
        in_mode = 'r'
    if outfile.endswith('.bam'):
        out_mode = 'wb'
    else:
        out_mode = 'w'
    in_sam = pysam.AlignmentFile(infile, in_mode, threads=threads)
    out_sam = pysam.AlignmentFile(outfile, out_mode, template=in_sam, threads=threads)
    for read in in_sam:
        if read.has_tag('ts') and read.flag in reverse_strand:
            if read.get_tag('ts') == '-':
                read.flag = reverse_strand[read.flag]
                read.set_tag('ts', '+')
        out_sam.write(read)
    in_sam.close()
    out_sam.close()
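
# Example usage (a minimal sketch; paths are hypothetical):
# reverse_alignment('mapped.bam', 'mapped_ts_plus.bam', threads=4)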


################################################################################
############################ LAPA file post-proc ###############################
################################################################################
# def gtf_add_rescue_ism_cat(gtf):
# """
# Update LAPA GTF w/ ISM rescue category for those that were assigned new
# ends from LAPA
# """
# gtf = pr.read_gtf(input.lapa_gtf, as_df=True)
# gtf.loc[(gtf.transcript_id.str.contains('#'))&(gtf.ISM_transcript=='TRUE'),
# 'transcript_novelty'] = 'ISM_rescue'
# gtf = pr.PyRanges(gtf)
# return gtf


def ab_add_rescue_ism_cat(ab):
    """
    Update LAPA abundance w/ ISM rescue category for transcripts that were
    assigned new ends from LAPA
    """
    df = pd.read_csv(ab, sep='\t')
    df.loc[(df.annot_transcript_id.str.contains('#')) &
           (df.transcript_novelty == 'ISM'),
           'transcript_novelty'] = 'ISM_rescue'
    return df
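
# Example usage (a minimal sketch; the abundance path is hypothetical):
# ab_df = ab_add_rescue_ism_cat('talon_abundance.tsv')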


def filter_lapa_on_nov(df,
                       t_novs=['Known', 'NIC', 'NNC', 'ISM_rescue'],
                       g_novs=['Known']):
    """
    Filter LAPA output based on gene and transcript novelty.

    Input:
        df (pandas df): Abundance table from LAPA
        t_novs (list of str): Transcript novelty categories to keep
        g_novs (list of str): Gene novelty categories to keep

    Returns:
        filt_df (pandas df): DataFrame of gene id, transcript id passing filt
    """
df = df.loc[df.transcript_novelty.isin(t_novs)]
df = df.loc[df.gene_novelty.isin(g_novs)]
filt_df = df[['annot_gene_id', 'annot_transcript_id']].drop_duplicates()
filt_df = filt_df.rename({'annot_gene_id':'gid',
'annot_transcript_id': 'tid'}, axis=1)
return filt_df
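
# Example usage (a minimal sketch, reusing ab_df from the example above):
# pass_df = filter_lapa_on_nov(ab_df, t_novs=['Known', 'ISM_rescue'],
#                              g_novs=['Known'])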


def filter_spikes(gtf):
"""
Filter LAPA output based on SIRV / ERCC status
Input:
gtf (str): GTF path from LAPA
Returns:
filt_df (pandas df): DataFrame of gene id, transcript id passing filt
"""
df = pr.read_gtf(gtf, as_df=True)
df = df.loc[~df.Chromosome.str.contains('SIRV')]
df = df.loc[~df.Chromosome.str.contains('ERCC')]
df = df.loc[df.Feature == 'transcript']
filt_df = df[['gene_id', 'transcript_id']].drop_duplicates()
filt_df = filt_df.rename({'gene_id':'gid',
'transcript_id':'tid'}, axis=1)
return filt_df
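
# Example usage (a minimal sketch; the GTF path is hypothetical):
# spike_free = filter_spikes('lapa_corrected.gtf')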


def get_ids_from_pass_list(filt_list):
    """Read a TALON-style pass list TSV and return its unique gids and tids."""
filt_df = pd.read_csv(filt_list, sep='\t')
gids = filt_df.gid.unique().tolist()
tids = filt_df.tid.unique().tolist()
return gids, tids


def filt_lapa_ab(ab, filt_list):
"""
Filter LAPA abundance using a TALON-style pass list
"""
df = pd.read_csv(ab, sep='\t')
gids, tids = get_ids_from_pass_list(filt_list)
df = df.loc[(df.annot_gene_id.isin(gids))&(df.annot_transcript_id.isin(tids))]
return df
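
# Example usage (a minimal sketch; both paths are hypothetical):
# filt_ab = filt_lapa_ab('talon_abundance.tsv', 'pass_list.tsv')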


def filt_lapa_gtf(gtf, filt_list):
"""
Filter LAPA GTF using a TALON-style pass list
"""
gtf = pr.read_gtf(gtf).as_df()
gids, tids = get_ids_from_pass_list(filt_list)
# first filter on tids
gtf = gtf.loc[(gtf.transcript_id.isin(tids))|(gtf.Feature=='gene')]
# then filter on gids
gtf = gtf.loc[(gtf.gene_id.isin(gids))]
gtf = pr.PyRanges(gtf)
return gtf
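
# Example usage (a minimal sketch; both paths are hypothetical):
# filt_gtf = filt_lapa_gtf('lapa_corrected.gtf', 'pass_list.tsv')
# filt_gtf.to_gtf('filtered.gtf')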


def save_swan_adata(swan_file,
                    ofile,
                    how='iso'):
    """
    Save anndata obj from Swan.

    Input:
        swan_file (str): Input SwanGraph file
        ofile (str): Output AnnData file
        how (str): {'iso', 'gene'}; 'tss', 'tes', 'ic', 'edge', and 'loc'
            are not yet implemented
    """
    sg = swan.read(swan_file)
    if how == 'gene':
        adata = sg.gene_adata
    elif how == 'iso':
        adata = sg.adata
    else:
        raise ValueError(f"how='{how}' is not yet implemented.")
    adata.write(ofile)
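
# Example usage (a minimal sketch; paths are hypothetical):
# save_swan_adata('swan_graph.p', 'swan_iso.h5ad', how='iso')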