Skip to content

Commit

Permalink
Do not parse pdm
Browse files Browse the repository at this point in the history
  • Loading branch information
Rafael Barrero Rodríguez committed Mar 1, 2024
1 parent 63422ac commit 2ad5226
Show file tree
Hide file tree
Showing 14 changed files with 41,234 additions and 87 deletions.
23 changes: 19 additions & 4 deletions 4_qTableReport/qReportMaker.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,9 @@ def generateFreqTable(config, sign_i, fdr_i, rep, contrast):

rep_i = rep[boolean]

rep_i = rep_i[[pdmCol, pdmFreq]].droplevel(1, axis=1)
rep_i = rep_i[[
pdmCol, pdmFreq,tuple(config['pCol']),tuple(config['gCol']), tuple(config['aCol']), tuple(config['mCol'])
]].droplevel(1, axis=1)

# If no pdm is filtered return empty list
if rep_i.shape[0] == 0:
Expand All @@ -125,13 +127,19 @@ def generateFreqTable(config, sign_i, fdr_i, rep, contrast):
'infile': rep_i,
'outfile': None,
'peptidoform_column': pdmCol[0],
'peptide_column': config['pCol'][0],
'modifcation_column': config['gCol'][0],
'modified_residue_column': config['aCol'][0],
'modified_position_column': config['mCol'][0],
'show_unassigned': False,
'x': config['x'],
'peakorph_column': None,
'scanfreq_column': pdmFreq[0],
'binom': config['binom'],
'q_thr': config['q_thr'],
'values_pivot': config['values_pivot']
})


outFolder = os.path.join(config['outfolder'], 'FreqTables', contrast, f"{config['qvalue_dNM'][1]}-{fdr_i}")
if not os.path.exists(outFolder):
Expand All @@ -142,6 +150,7 @@ def generateFreqTable(config, sign_i, fdr_i, rep, contrast):
biPivot.to_excel(writer, sheet_name=f'PIVOT-{config["binom"]}-{config["q_thr"]}-{config["values_pivot"]}')

ptm = bi[bi[config['binom']]<config['q_thr']]
ptm = ptm.rename(columns={config['aCol'][0]:'a', config['gCol'][0]:'d'})
ptm = list(zip(ptm.a, ptm.d))
return ptm

Expand Down Expand Up @@ -380,7 +389,7 @@ def qReportDesign(config, quan, qTableD, contrast):
q2info.columns = pd.MultiIndex.from_tuples([qTableD.columns[0] if n==0 else (i,'','') for n,i in enumerate(q2info.columns)])
qTableD = pd.merge(q2info, qTableD, how='right', on=[qTableD.columns[0]])

if config['plotFolder']:
if config['plotFolder'] and os.path.exists(config['plotFolder']):
plotted_q = [os.path.splitext(i)[0] for i in os.listdir(config['plotFolder'])]
qTableD[qTableD.columns[0]] = \
[f"=HYPERLINK(\"{os.path.join(config['plotFolder'], i)}.html\", \"{i}\")" if i in plotted_q else i for i in qTableD.iloc[:, 0]]
Expand Down Expand Up @@ -437,7 +446,9 @@ def qReportContrast(rep0, config, contrast):
ptmCol = ('PTM', 'REL')

# Get required report fraction
rep = rep0.loc[:, list(set([pdmCol, qCol, pdmFreq, qFreq, sign, signNM, FDRdNM, FDRNM, qdCol, ptmCol]))].drop_duplicates()
rep = rep0.loc[:, list(set([
pdmCol, qCol, pdmFreq, qFreq, sign, signNM, FDRdNM, FDRNM, qdCol, ptmCol,
tuple(config['pCol']),tuple(config['gCol']), tuple(config['aCol']), tuple(config['mCol'])]))].drop_duplicates()


# Extract NM elements from report
Expand Down Expand Up @@ -590,8 +601,12 @@ def main(config, file=None):


ptmCol = ('PTM', 'REL')
rep[ptmCol] = [
(None, None) if np.isnan(k) else (i,j)
for i,j, k in zip(rep[tuple(config['aCol'])], rep[tuple(config['gCol'])], rep[tuple(config['mCol'])])
]
pdmCol = tuple(config['pdmCol'])
rep[ptmCol] = getPTMCol(rep, config)
#rep[ptmCol] = getPTMCol(rep, config)
rep = rep[~rep[pdmCol].duplicated()]

_ = getBasalQReport(rep, tuple(config['qCol']), tuple(config['qDescCol']), tuple(config['pdmFreq']), ptmCol)
Expand Down
34 changes: 24 additions & 10 deletions 4_qTableReport/qReportMaker.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,10 @@
#

# Input file (Report generated by FDRoptimizer)
infile: D:\CNIC\Scripts\ReportStats\test\FDR_LIMMA_NM_qfq_table_pgmfreq.tsv
infile: path\to\infile.tsv

# Output folder
outfolder: D:\CNIC\Scripts\ReportStats\test\

# Path to file relating protein Uniprot ID (first col) to other information (e.g. Category)
q2info: S:\U_Proteomica\UNIDAD\NextCloud\1_PTM_Analysis\Heteroplasmia_Heart\ReportStats-v0.6\myMitocarta.tsv
outfolder: path\to\outfolder

# FDR threshold used
qvThr:
Expand All @@ -31,10 +28,23 @@ groups:
# First row column name
# Second row column name

# Format of pdm:
# 1 --> PEP[MOD]TIDE
# 2 --> PEPTIDE;MOD;POSITION
pdmColFormat: 2
# Column name containing the modification group
gCol:
- g2
- REL


# Column name containing the modified amino acid
aCol:
- a
- REL

# Column name containing peptide position of modification
mCol:
- m
- REL




#
Expand Down Expand Up @@ -169,4 +179,8 @@ values_pivot: x-PSM
#
plotFolder: D:\CNIC\Scripts\ReportStats\test\Plots

n_cpu: 8
n_cpu: 8


# Path to file relating protein Uniprot ID (first col) to other information (e.g. Category)
q2info:
89 changes: 53 additions & 36 deletions 4_qTableReport/utils/BinomialSiteListMaker.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,6 @@

PEAK = 'PEAK'

# args = {
# 'infile': r"S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Proteomics\GroupTools\BinomialResMod\test\test1\heteroplasmy_psm_table.txt",
# 'outfile': None,
# 'peptidoform_column': 'TrunkSequence',
# 'x': 5,
# 'include_nm': False,
# 'peakorph_column': 'New_PeakAssignation',
# 'scanfreq_column': None
# }

#
# Local Functions
Expand Down Expand Up @@ -78,13 +69,13 @@ def getBinom(wdf, col):
fdf['p2'] = [afreq[i] for i in fdf[a]]

d_size = wdf[d].value_counts().to_frame().reset_index()#.rename(columns={'index': 'd', 'd': 'n2'})
d_size.columns = ['d', 'n2']
d_size.columns = [d, 'n2']

fdf = pd.merge(
fdf,
d_size,
how='left',
on='d'
on=d
)

# binom = P( Bi(n,p) >= x )
Expand Down Expand Up @@ -121,7 +112,9 @@ def main(args):
'''

# Set column names
pdm, p, d, a, m, x = args['peptidoform_column'], 'p', 'd', 'a', 'm', args['x']
pdm, p, d, a, m, x = args['peptidoform_column'], args['peptide_column'], \
args['modifcation_column'], args['modified_residue_column'], \
args['modified_position_column'], args['x']

# Read infile
if type(args['infile']) == pd.DataFrame:
Expand All @@ -136,30 +129,49 @@ def main(args):
df[args['scanfreq_column']])].reset_index(drop=True)

# Build working df
pdmList = df[pdm].tolist()

if args['peakorph_column']:
logging.info(f"Filtering NM based on {args['peakorph_column']}")
pdmList = df.loc[df[args['peakorph_column']] == PEAK, pdm].tolist()

# if not args['include_nm']:
# logging.info("Excluding NM (pdm without [Mod])")
pdmList = [i for i in pdmList if '[' in i]

logging.info("Obtaining working dataframe")
wdf = [
(i, re.search(r'(.)\[([^]]+)\]', i))
for i in pdmList
]

wdf = [
# (i, *i.split('_'), 'U', int(len(i)/2)) if j == None else
(i, re.sub(r'\[[^]]+\]', '', i), j.groups()[1],
j.groups()[0], i.index('[')-1) # m index is 0-based
for i, j in wdf
]
if d=='' or p=='' or a=='' or m=='':
p, d, a, m = 'p', 'd', 'a', 'm'

pdmList = df[pdm].tolist()

if args['peakorph_column']:
logging.info(f"Filtering NM based on {args['peakorph_column']}")
pdmList = df.loc[df[args['peakorph_column']] == PEAK, pdm].tolist()

# if not args['include_nm']:
# logging.info("Excluding NM (pdm without [Mod])")

pdmListNM = [i for i in pdmList if '[' not in i]
unassigned = pd.Series([i.split('_')[1] for i in pdmListNM]).value_counts().to_frame()
unassigned.columns = ['Unnasigned']

pdmList = [i for i in pdmList if '[' in i]

logging.info("Obtaining working dataframe")
wdf = [
(i, re.search(r'(.)\[([^]]+)\]', i))
for i in pdmList
]

wdf = [
# (i, *i.split('_'), 'U', int(len(i)/2)) if j == None else
(i, re.sub(r'\[[^]]+\]', '', i), j.groups()[1],
j.groups()[0], i.index('[')-1) # m index is 0-based
for i, j in wdf
]


wdf = pd.DataFrame(wdf, columns=[pdm, p, d, a, m])

else:
wdf = pd.DataFrame(df, columns=[pdm, p, d, a, m])
unassigned = wdf[wdf.m.isna()][[d]].value_counts().to_frame()
unassigned.columns = ['Unnasigned']
wdf = wdf[~wdf.m.isna()]

wdf = pd.DataFrame(wdf, columns=[pdm, p, d, a, m])
if wdf.shape[0] == 0:
logging.error('No modified peptidoform was detected. Exiting program...')
return None, None


logging.info("Calculating binomial pvalues at PSM level")
Expand Down Expand Up @@ -188,7 +200,11 @@ def main(args):
q_thr = float(args['q_thr']) # 0.01
values_pivot = args['values_pivot'] #'x-PSM'

biPivot = pd.pivot_table(bi[bi[binom]<q_thr], index='d', columns='a', values=values_pivot)
biPivot = pd.pivot_table(bi[bi[binom]<q_thr], index=d, columns=a, values=values_pivot)

if args['show_unassigned']:
biPivot = pd.concat([biPivot, unassigned])

biPivot['total'] = biPivot.sum(axis=1)
biPivot = biPivot.sort_values('total', ascending=False)

Expand Down Expand Up @@ -238,6 +254,7 @@ def main(args):
config.read(args.config)
params = dict(config.items('Params'))
params['x'] = int(params['x'])
params['show_unassigned'] = params['show_unassigned'].lower() == 'true'

else:
params = args.__dict__
Expand Down
18 changes: 17 additions & 1 deletion BinomialSiteListMaker/BinomialSiteListMaker.ini
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,25 @@ infile = Path/To/Input.tsv
# Path to output file (.xlsx file)
outfile = Path/To/Output.xlsx

# Name of the column containing peptideform in the format: PEP[mod]TIDE
# Name of the column containing peptidoform in the format: PEP[mod]TIDE
peptidoform_column = New_Assigned_Sequence

# Name of the column containing plain peptide (if not specified it will be obtained from pdm)
peptide_column =

# Column containing modification name (if not specified it will be obtained from pdm)
modifcation_column =


# Column containing modified residue (if not specified it will be obtained from pdm)
modified_residue_column =

# Column containing peptide position of modification (if not specified it will be obtained from pdm)
modified_position_column =

# Whether to append the unassigned counts to the pivot table (True or False)
show_unassigned = False

# Window size used to estimate Probability(amino acid)
x = 5

Expand Down
Loading

0 comments on commit 2ad5226

Please sign in to comment.