Skip to content

Commit

Permalink
fixed smalle edge case
Browse files Browse the repository at this point in the history
  • Loading branch information
fairliereese committed Jan 17, 2023
1 parent 2bcc49f commit 9b6783f
Showing 1 changed file with 19 additions and 5 deletions.
24 changes: 19 additions & 5 deletions cerberus/cerberus.py
Original file line number Diff line number Diff line change
Expand Up @@ -754,27 +754,43 @@ def merge_ends(ends, ref, mode):
direction = 'upstream'
elif mode == 'tes':
direction = 'downstream'
# pdb.set_trace()

# limit to relevant columns
ends = ends[['Chromosome', 'Start', 'End', 'Strand',
'gene_id', 'transcript_id']]



# get only the relevant columns and deduplicate
ends = ends.df
t_ends = ends.copy(deep=True)
ends.drop('transcript_id', axis=1, inplace=True)
ends.drop_duplicates(inplace=True)
ends['temp_id'] = [i for i in range(len(ends.index))] # id to track which ends get assigned
ends_back = ends.copy(deep=True)
end_ids = ends.temp_id.unique().tolist()
ends = pr.PyRanges(ends)

# find closest interval in ref
ends = ends.nearest(ref,
strandedness=None,
how=direction)
# pdb.set_trace()

# fix the ends with mismatching gene ids - this part can be slow :(
ends = ends.df

# fix ends that didn't get assigned for whatever reason above
# only fix monoexonic guys?
end_ids_2 = ends.temp_id.unique().tolist()
missing_ends = list(set(end_ids)-set(end_ids_2))
print(f'# missing ends: {len(missing_ends)}')
ends_back = ends_back.loc[ends_back.temp_id.isin(missing_ends)]
ends_back = pr.PyRanges(ends_back)
ends_back = ends_back.nearest(ref,
strandedness=None) # find the nearest end w/o dir.
ends_back = ends_back.as_df()
ends = pd.concat([ends, ends_back])

# fix the ends with mismatching gene ids - this part can be slow :(
fix_ends = ends.loc[ends.gene_id != ends.gene_id_b]
fix_ends = fix_ends[['Chromosome', 'Start', 'End', 'Strand',
'gene_id']]
Expand All @@ -793,7 +809,6 @@ def merge_ends(ends, ref, mode):

if i % 100 == 0:
print('Processed {} / {} genes'.format(i, len(fix_ends.gene_id.unique().tolist())))
# pdb.set_trace()

# merge back in to get transcript ids
t_ends = t_ends.merge(ends, how='left',
Expand Down Expand Up @@ -2014,7 +2029,6 @@ def assign_triplets(gtf_df, tss, ic, tes, gene_source, t_map):
# for beep in ['tss', 'tes', 'ic']:
# print('# affected transcripts w/ null {}: {}'.format(beep,len(df.loc[df[beep].isnull()].index)))
# df = df.loc[~df[beep].isnull()]

df['transcript_triplet'] = '['+df.tss.astype(int).astype(str)+','+\
df.ic.astype(int).astype(str)+','+\
df.tes.astype(int).astype(str)+']'
Expand Down

0 comments on commit 9b6783f

Please sign in to comment.