Do not parse pdm

CNIC-Proteomics · Mar 1, 2024 · 2ad5226 · 2ad5226
1 parent 63422ac
commit 2ad5226
Show file tree

Hide file tree

Showing 14 changed files with 41,234 additions and 87 deletions.
diff --git a/4_qTableReport/qReportMaker.py b/4_qTableReport/qReportMaker.py
@@ -114,7 +114,9 @@ def generateFreqTable(config, sign_i, fdr_i, rep, contrast):
 
     rep_i = rep[boolean]
 
-    rep_i = rep_i[[pdmCol, pdmFreq]].droplevel(1, axis=1)
+    rep_i = rep_i[[
+        pdmCol, pdmFreq,tuple(config['pCol']),tuple(config['gCol']), tuple(config['aCol']), tuple(config['mCol'])
+        ]].droplevel(1, axis=1)
 
     # If no pdm is filtered return empty list
     if rep_i.shape[0] == 0:
@@ -125,13 +127,19 @@ def generateFreqTable(config, sign_i, fdr_i, rep, contrast):
         'infile': rep_i,
         'outfile': None,
         'peptidoform_column': pdmCol[0],
+        'peptide_column': config['pCol'][0],
+        'modifcation_column': config['gCol'][0],
+        'modified_residue_column': config['aCol'][0],
+        'modified_position_column': config['mCol'][0],
+        'show_unassigned': False,
         'x': config['x'],
         'peakorph_column': None,
         'scanfreq_column': pdmFreq[0],
         'binom': config['binom'],
         'q_thr': config['q_thr'],
         'values_pivot': config['values_pivot']
         })
+
 
     outFolder = os.path.join(config['outfolder'], 'FreqTables', contrast, f"{config['qvalue_dNM'][1]}-{fdr_i}")
     if not os.path.exists(outFolder):
@@ -142,6 +150,7 @@ def generateFreqTable(config, sign_i, fdr_i, rep, contrast):
         biPivot.to_excel(writer, sheet_name=f'PIVOT-{config["binom"]}-{config["q_thr"]}-{config["values_pivot"]}')
 
     ptm = bi[bi[config['binom']]<config['q_thr']]
+    ptm = ptm.rename(columns={config['aCol'][0]:'a', config['gCol'][0]:'d'})
     ptm = list(zip(ptm.a, ptm.d))
     return ptm
 
@@ -380,7 +389,7 @@ def qReportDesign(config, quan, qTableD, contrast):
         q2info.columns = pd.MultiIndex.from_tuples([qTableD.columns[0] if n==0 else (i,'','') for n,i in enumerate(q2info.columns)])
         qTableD = pd.merge(q2info, qTableD, how='right', on=[qTableD.columns[0]])
 
-    if config['plotFolder']:
+    if config['plotFolder'] and os.path.exists(config['plotFolder']):
         plotted_q = [os.path.splitext(i)[0] for i in os.listdir(config['plotFolder'])]
         qTableD[qTableD.columns[0]] = \
             [f"=HYPERLINK(\"{os.path.join(config['plotFolder'], i)}.html\", \"{i}\")" if i in plotted_q else i for i in qTableD.iloc[:, 0]]
@@ -437,7 +446,9 @@ def qReportContrast(rep0, config, contrast):
     ptmCol = ('PTM', 'REL')
 
     # Get required report fraction
-    rep = rep0.loc[:, list(set([pdmCol, qCol, pdmFreq, qFreq, sign, signNM, FDRdNM, FDRNM, qdCol, ptmCol]))].drop_duplicates()
+    rep = rep0.loc[:, list(set([
+        pdmCol, qCol, pdmFreq, qFreq, sign, signNM, FDRdNM, FDRNM, qdCol, ptmCol, 
+        tuple(config['pCol']),tuple(config['gCol']), tuple(config['aCol']), tuple(config['mCol'])]))].drop_duplicates()
 
 
     # Extract NM elements from report
@@ -590,8 +601,12 @@ def main(config, file=None):
 
 
     ptmCol = ('PTM', 'REL')
+    rep[ptmCol] = [
+        (None, None) if np.isnan(k) else (i,j) 
+        for i,j, k in zip(rep[tuple(config['aCol'])], rep[tuple(config['gCol'])], rep[tuple(config['mCol'])])
+        ]
     pdmCol = tuple(config['pdmCol'])
-    rep[ptmCol] = getPTMCol(rep, config)
+    #rep[ptmCol] = getPTMCol(rep, config)
     rep = rep[~rep[pdmCol].duplicated()]
 
     _ = getBasalQReport(rep, tuple(config['qCol']), tuple(config['qDescCol']), tuple(config['pdmFreq']), ptmCol)

diff --git a/4_qTableReport/qReportMaker.yaml b/4_qTableReport/qReportMaker.yaml
@@ -3,13 +3,10 @@
 #
 
 # Input file (Report generated by FDRoptimizer)
-infile: D:\CNIC\Scripts\ReportStats\test\FDR_LIMMA_NM_qfq_table_pgmfreq.tsv
+infile: path\to\infile.tsv
 
 # Output folder
-outfolder: D:\CNIC\Scripts\ReportStats\test\
-
-# Path to file relating protein Uniprot ID (first col) to other information (e.g. Category)
-q2info: S:\U_Proteomica\UNIDAD\NextCloud\1_PTM_Analysis\Heteroplasmia_Heart\ReportStats-v0.6\myMitocarta.tsv
+outfolder: path\to\outfolder
 
 # FDR threshold used
 qvThr:
@@ -31,10 +28,23 @@ groups:
 # First row column name
 # Second row column name
 
-# Format of pdm:
-# 1 --> PEP[MOD]TIDE
-# 2 --> PEPTIDE;MOD;POSITION
-pdmColFormat: 2
+# Column name containing the modification group
+gCol: 
+    - g2
+    - REL
+
+
+# Column name containing the modified amino acid
+aCol: 
+    - a
+    - REL
+
+# Column name containing the position of the modification within the peptide
+mCol: 
+    - m
+    - REL
+
+
 
 
 #
@@ -169,4 +179,8 @@ values_pivot: x-PSM
 #
 plotFolder: D:\CNIC\Scripts\ReportStats\test\Plots
 
-n_cpu: 8 
+n_cpu: 8 
+
+
+# Path to file relating protein Uniprot ID (first col) to other information (e.g. Category)
+q2info: 
diff --git a/4_qTableReport/utils/BinomialSiteListMaker.py b/4_qTableReport/utils/BinomialSiteListMaker.py
@@ -25,15 +25,6 @@
 
 PEAK = 'PEAK'
 
-# args = {
-#         'infile': r"S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Proteomics\GroupTools\BinomialResMod\test\test1\heteroplasmy_psm_table.txt",
-#         'outfile': None,
-#         'peptidoform_column': 'TrunkSequence',
-#         'x': 5,
-#         'include_nm': False,
-#         'peakorph_column': 'New_PeakAssignation',
-#         'scanfreq_column': None
-#         }
 
 #
 # Local Functions
@@ -78,13 +69,13 @@ def getBinom(wdf, col):
     fdf['p2'] = [afreq[i] for i in fdf[a]]
 
     d_size = wdf[d].value_counts().to_frame().reset_index()#.rename(columns={'index': 'd', 'd': 'n2'})
-    d_size.columns = ['d', 'n2']
+    d_size.columns = [d, 'n2']
 
     fdf = pd.merge(
         fdf,
         d_size,
         how='left',
-        on='d'
+        on=d
     )
 
     # binom = P( Bi(n,p) >= x )
@@ -121,7 +112,9 @@ def main(args):
     '''
 
     # Set column names
-    pdm, p, d, a, m, x = args['peptidoform_column'], 'p', 'd', 'a', 'm', args['x']
+    pdm, p, d, a, m, x = args['peptidoform_column'], args['peptide_column'], \
+        args['modifcation_column'], args['modified_residue_column'], \
+            args['modified_position_column'], args['x']
 
     # Read infile
     if type(args['infile']) == pd.DataFrame:
@@ -136,30 +129,49 @@ def main(args):
             df[args['scanfreq_column']])].reset_index(drop=True)
 
     # Build working df
-    pdmList = df[pdm].tolist()
-
-    if args['peakorph_column']:
-        logging.info(f"Filtering NM based on {args['peakorph_column']}")
-        pdmList = df.loc[df[args['peakorph_column']] == PEAK, pdm].tolist()
-
-    # if not args['include_nm']:
-        # logging.info("Excluding NM (pdm without [Mod])")
-    pdmList = [i for i in pdmList if '[' in i]
-
-    logging.info("Obtaining working dataframe")
-    wdf = [
-        (i, re.search(r'(.)\[([^]]+)\]', i))
-        for i in pdmList
-    ]
-
-    wdf = [
-        # (i, *i.split('_'), 'U', int(len(i)/2)) if j == None else
-        (i, re.sub(r'\[[^]]+\]', '', i), j.groups()[1],
-         j.groups()[0], i.index('[')-1)  # m index is 0-based
-        for i, j in wdf
-    ]
+    if d=='' or p=='' or a=='' or m=='':
+        p, d, a, m = 'p', 'd', 'a', 'm'
+
+        pdmList = df[pdm].tolist()
+
+        if args['peakorph_column']:
+            logging.info(f"Filtering NM based on {args['peakorph_column']}")
+            pdmList = df.loc[df[args['peakorph_column']] == PEAK, pdm].tolist()
+
+        # if not args['include_nm']:
+            # logging.info("Excluding NM (pdm without [Mod])")
+
+        pdmListNM = [i for i in pdmList if '[' not in i]
+        unassigned = pd.Series([i.split('_')[1] for i in pdmListNM]).value_counts().to_frame()
+        unassigned.columns = ['Unnasigned']
+
+        pdmList = [i for i in pdmList if '[' in i]
+
+        logging.info("Obtaining working dataframe")
+        wdf = [
+            (i, re.search(r'(.)\[([^]]+)\]', i))
+            for i in pdmList
+        ]
+
+        wdf = [
+            # (i, *i.split('_'), 'U', int(len(i)/2)) if j == None else
+            (i, re.sub(r'\[[^]]+\]', '', i), j.groups()[1],
+             j.groups()[0], i.index('[')-1)  # m index is 0-based
+            for i, j in wdf
+        ]
+
+
+        wdf = pd.DataFrame(wdf, columns=[pdm, p, d, a, m])
+
+    else:
+        wdf = pd.DataFrame(df, columns=[pdm, p, d, a, m])
+        unassigned = wdf[wdf.m.isna()][[d]].value_counts().to_frame()
+        unassigned.columns = ['Unnasigned']
+        wdf = wdf[~wdf.m.isna()]
 
-    wdf = pd.DataFrame(wdf, columns=[pdm, p, d, a, m])
+    if wdf.shape[0] == 0:
+        logging.error('No modified peptidoform was detected. Exiting program...')
+        return None, None
 
 
     logging.info("Calculating binomial pvalues at PSM level")
@@ -188,7 +200,11 @@ def main(args):
     q_thr = float(args['q_thr']) # 0.01
     values_pivot = args['values_pivot'] #'x-PSM'
 
-    biPivot = pd.pivot_table(bi[bi[binom]<q_thr], index='d', columns='a', values=values_pivot)
+    biPivot = pd.pivot_table(bi[bi[binom]<q_thr], index=d, columns=a, values=values_pivot)
+
+    if args['show_unassigned']:
+        biPivot = pd.concat([biPivot, unassigned])
+
     biPivot['total'] = biPivot.sum(axis=1)
     biPivot = biPivot.sort_values('total', ascending=False)
 
@@ -238,6 +254,7 @@ def main(args):
         config.read(args.config)
         params = dict(config.items('Params'))
         params['x'] = int(params['x'])
+        params['show_unassigned'] = params['show_unassigned'].lower() == 'true'
 
     else:
         params = args.__dict__

diff --git a/BinomialSiteListMaker/BinomialSiteListMaker.ini b/BinomialSiteListMaker/BinomialSiteListMaker.ini
@@ -6,9 +6,25 @@ infile = Path/To/Input.tsv
 # Path to output file (.xlsx file)
 outfile = Path/To/Output.xlsx
 
-# Name of the column containing peptideform in the format: PEP[mod]TIDE
+# Name of the column containing peptidoform in the format: PEP[mod]TIDE
 peptidoform_column = New_Assigned_Sequence
 
+# Name of the column containing the plain peptide sequence (if not specified, it is derived from the peptidoform column)
+peptide_column = 
+
+# Column containing modification name (if not specified it will be obtained from pdm)
+modifcation_column =
+
+
+# Column containing modified residue (if not specified it will be obtained from pdm)
+modified_residue_column =
+
+# Column containing peptide position of modification (if not specified it will be obtained from pdm)
+modified_position_column =
+
+# Whether to append the counts of unassigned entries to the pivot table (True or False)
+show_unassigned = False
+
 # Window size used to estimate Probability(aminoacid)
 x = 5