From e47baf5310c7fcae3fd5a670cdb41b9782120876 Mon Sep 17 00:00:00 2001
From: MrBones1102 <jamespopoli@gmail.com>
Date: Wed, 12 Jun 2024 16:31:38 -0700
Subject: [PATCH] Successful generation of pearson correlation graph of
 cytokines plasma vs serum from recreate file using getSetup() from common.py

---
 .gitignore                        |   1 +
 tfac/figures/james_S1_recreate.py | 137 ++++++++++++++++++++++++------
 2 files changed, 112 insertions(+), 26 deletions(-)

diff --git a/.gitignore b/.gitignore
index 48a025f..a8e84a3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -130,3 +130,4 @@ JamesIterfigureS2.png
 JamesIterfigureS3.png
 JamesIterfigureS4.png
 JamesIterfigureS5.png
+JamesS1ByHand.png
diff --git a/tfac/figures/james_S1_recreate.py b/tfac/figures/james_S1_recreate.py
index 4fc9076..e2464b2 100644
--- a/tfac/figures/james_S1_recreate.py
+++ b/tfac/figures/james_S1_recreate.py
@@ -5,9 +5,11 @@
 import pandas as pd
 import numpy as np
 from scipy import sparse
-# plot 1 is the pearson coefficients of serum and plasma cytokine levels
 
-#start by importing cytokine data
+from scipy.stats import pearsonr
+import seaborn as sns
+
+from tfac.figures.common import getSetup
 
 """
 Since we will be grabbing data from files present in tfac-mrsa,
@@ -16,12 +18,33 @@
 PATH_HERE = dirname(dirname(dirname(abspath(__file__))))
 OPTIMAL_SCALING = 2 ** 7.0
 
-def import_cytokines(scale_cyto=True):
+def import_patient_metadata():
+    """
+    Returns patient meta data, including cohort and outcome.
+
+    Returns:
+        patient_data (pandas.DataFrame): Patient outcomes and cohorts
+    """
+    patient_data = pd.read_csv(
+        join(PATH_HERE, 'tfac', 'data', 'mrsa', 'patient_metadata.txt'),
+        delimiter=',',
+        index_col=0
+    )
+
+    # Drop patients with only RNAseq
+    patient_data = patient_data.loc[patient_data["type"] != "2RNAseq", :]
+
+    return patient_data
+# #debug
+# print(import_patient_metadata())
+
+def import_cytokines(scale_cyto=True, transpose=True):
     """
     Return 2 matrices containing 1) plasma_ctyo data and 2) serum_cyto data
     
     Parameters:
-        scale default:True log scale the concentrations
+        scale_cyto default:True | log scale the concentrations
+        transpose default:True | transpose so cytokine labels are the index (axis=0)
     
     Returns:
         plasma_ctyo (pandas.DataFrame)
@@ -58,30 +81,28 @@ def import_cytokines(scale_cyto=True):
         serum_cyto = serum_cyto.transform(np.log)
         serum_cyto -= serum_cyto.mean(axis=0)
 
-    return plasma_cyto, serum_cyto
+    """
+    If sample isn't represented in patients, remove it from cytokines.
+    Do this by reindexing cyto data by the intersection of patient index
+    and cyto index
+    """
+    patients = set(import_patient_metadata().index)
+    plasma_cyto = plasma_cyto.reindex(set(plasma_cyto.index).intersection(patients))
+    serum_cyto = serum_cyto.reindex(set(serum_cyto.index).intersection(patients))
+    print(f"plasma_cyto shape post-importation, pre-transpose: {plasma_cyto.shape}")
+    print(f"serum_cyto shape post-importation, pre-transpose: {serum_cyto.shape}")
 
-# debug line
-plasma_cyto, serum_cyto = import_cytokines()
 
-def import_patient_metadata():
-    """
-    Returns patient meta data, including cohort and outcome.
+    # transpose by default so that we expect cytokine labels as indices (axis=0)
+    if transpose:
+        plasma_cyto = plasma_cyto.T
+        serum_cyto = serum_cyto.T
 
-    Returns:
-        patient_data (pandas.DataFrame): Patient outcomes and cohorts
-    """
-    patient_data = pd.read_csv(
-        join(PATH_HERE, 'tfac', 'data', 'mrsa', 'patient_metadata.txt'),
-        delimiter=',',
-        index_col=0
-    )
+    return plasma_cyto, serum_cyto
 
-    # Drop patients with only RNAseq
-    patient_data = patient_data.loc[patient_data["type"] != "2RNAseq", :]
+# # debug line
+# plasma_cyto, serum_cyto = import_cytokines()
 
-    return patient_data
-#debug
-print(import_patient_metadata())
 
 def form_tensor(variance_scaling: float = OPTIMAL_SCALING):
     """
@@ -95,7 +116,7 @@ def form_tensor(variance_scaling: float = OPTIMAL_SCALING):
         tensor (numpy.array): tensor of cytokine data
     """
     #import relevant datasets, not doing RNA yet
-    plasma_cyto, serum_cyto = import_cytokines()
+    plasma_cyto, serum_cyto = import_cytokines(transpose=False)
     patient_data = import_patient_metadata()
     print(f"plasma_cyto shape before reindex: {plasma_cyto.shape} (Patient, Cytokine)")
     print(f"serum_cyto shape before reindex: {serum_cyto.shape} (Patient, Cytokine)")
@@ -122,7 +143,71 @@ def form_tensor(variance_scaling: float = OPTIMAL_SCALING):
     return np.copy(tensor), patient_data
 
 def fig_S1_setup():
+    # form_tensor() gives us the cytokine tensor and the matching patient metadata
     tensor, patInfo = form_tensor()
+    plasma, _ = import_cytokines() # we just need cytokine list
+
+    # collect cytokines from the labels of the plasma data and the number
+    cytokines = plasma.index
+    n_cytokines = len(cytokines)
+
+    print(f"tensor is shape {tensor.shape} after creation")
+    tensor = tensor.T
+    patInfo = patInfo.T
+    serum_slice = tensor[0, :, :]
+    plasma_slice = tensor[1, :, :]
+    print(f"tensor is shape {tensor.shape} during slice collection (mode 1)")
+
+    # concatenate the serum and plasma slices across the index (37,177)+(37,177)=(74, 177)
+    test = pd.concat([pd.DataFrame(serum_slice), pd.DataFrame(plasma_slice)])
+    print(f"concatenated serum+plasma slices shape: {test.shape}")
+
+    # drop any patients (axis=1) that only have either serum or plasma cytokine (axis=0) data
+    test = test.dropna(axis=1)
+    # we are trying to generate the pearson coefficients between serum and plasma
+
+    """
+    Set up a pearson list, then for each of the n_cytokines cytokines append the
+    cytokine string and pearsonr calc of test row i (index i) with
+    test row i+n_cytokines (+n_cyto to get to plasma concat section). Do
+    this while ensuring everything is a numpy float in the pears list.
+    Turn pears back into a pandas DataFrame
+    """
+    pears = []
+    for i in range(n_cytokines):
+        pears.append([cytokines[i], pearsonr(test.iloc[i, :].to_numpy(dtype=float), test.iloc[i + n_cytokines, :].to_numpy(dtype=float))[0]])
+    pears = pd.DataFrame(pears).sort_values(1) # sort by increasing pearson correlation (column 1)
+    print(f"shape of pears: {pears.shape}")
+
+    return pears, serum_slice, plasma_slice, cytokines, patInfo
+
+# #debug
+# fig_S1_setup()
+
+def makeFigure():
+    """Skipping boxplot section for now"""
+
+    # list of axis objects (plots)
+    fig_size = (8, 3)
+    layout = {
+        "ncols": 1,
+        "nrows": 1
+    } #single plot for now
+
+    # cheating by using getSetup without understanding it for now
+    ax, f, _ = getSetup(
+        fig_size,
+        layout
+    )
+
+    pears, serum_slice, plasma_slice, cytokines, patIfno = fig_S1_setup()
+    a = sns.pointplot(data=pears, x=0, y=1, join=False, ax=ax[0])
+    a.set_xticklabels(a.get_xticklabels(), rotation=30, ha="right")
+    a.set_xlabel("Cytokine")
+    a.set_ylabel("Pearson's correlation")
+    a.set_title("Serum-Plasma Cytokine Level Correlation")
+
+    return f
 
-#debug
-form_tensor()
+fig = makeFigure()
+fig.savefig("./JamesS1ByHand.png")