From 7ab8574a7825e11010a0312a0878cc9f250dfa70 Mon Sep 17 00:00:00 2001
From: Armaan Abraham <armaanabraham@g.ucla.edu>
Date: Fri, 14 Jun 2024 13:17:17 -0600
Subject: [PATCH] add comments to remaining files

---
 bicytok/distanceMetricFuncs.py | 281 +++------------------------------
 bicytok/figures/figure3.py     |  17 +-
 bicytok/figures/figure4.py     |   5 +
 bicytok/figures/figure5.py     |   4 +-
 bicytok/figures/figure6.py     |   2 +
 bicytok/figures/figure7.py     |   2 +-
 bicytok/figures/figure8.py     |   2 +-
 bicytok/selectivityFuncs.py    |   2 +-
 general_comments.md            |   1 +
 9 files changed, 55 insertions(+), 261 deletions(-)

diff --git a/bicytok/distanceMetricFuncs.py b/bicytok/distanceMetricFuncs.py
index 8e4d863..1264d2d 100644
--- a/bicytok/distanceMetricFuncs.py
+++ b/bicytok/distanceMetricFuncs.py
@@ -14,12 +14,15 @@
 path_here = dirname(dirname(__file__))
 
 
+# figure 2
 def KL_EMD_1D(ax, targCell, numFactors, offTargState=0) -> pd.DataFrame:
     """
     Finds markers which have average greatest difference (EMD and KL) from other cells
     :param ax: Axes to plot on
     :param targCell: Target cell type for analysis
     :param numFactors: Number of top factors to consider
+    Armaan: offTargState values should be defined as constant integers or
+    strings so that this code is more readable.
     :param offTargState: State of off-target comparison (0 for all non-memory Tregs,
         1 for all non-Tregs, 2 for naive Tregs)
     :return:
@@ -31,6 +34,7 @@ def KL_EMD_1D(ax, targCell, numFactors, offTargState=0) -> pd.DataFrame:
     for marker in CITE_DF.loc[
         :,
         (
+            # Armaan: I think you can use df.isin here
             (CITE_DF.columns != "CellType1")
             & (CITE_DF.columns != "CellType2")
             & (CITE_DF.columns != "CellType3")
@@ -42,17 +46,21 @@ def KL_EMD_1D(ax, targCell, numFactors, offTargState=0) -> pd.DataFrame:
             targCellMark = (
                 CITE_DF.loc[CITE_DF["CellType3"] == targCell][marker].values / markAvg
             )
+            # Armaan: isn't this comment outdated? I think it's comparing to all
+            # cell types that are not targCell
             # Compare to all non-memory Tregs
             if offTargState == 0:
                 offTargCellMark = (
                     CITE_DF.loc[CITE_DF["CellType3"] != targCell][marker].values
                     / markAvg
                 )
+            # Armaan: what if the targcell is in the not Treg category?
             # Compare to all non-Tregs
             elif offTargState == 1:
                 offTargCellMark = (
                     CITE_DF.loc[CITE_DF["CellType2"] != "Treg"][marker].values / markAvg
                 )
+            # Armaan: what if the targcell is Treg Naive?
             # Compare to naive Tregs
             elif offTargState == 2:
                 offTargCellMark = (
@@ -90,12 +98,18 @@ def KL_EMD_1D(ax, targCell, numFactors, offTargState=0) -> pd.DataFrame:
                 )
 
     corrsDF = pd.DataFrame()
+
     for i, distance in enumerate(["Wasserstein Distance", "KL Divergence"]):
         ratioDF = markerDF.sort_values(by=distance)
         posCorrs = ratioDF.tail(numFactors).Marker.values
         corrsDF = pd.concat(
             [corrsDF, pd.DataFrame({"Distance": distance, "Marker": posCorrs})]
         )
+        # Armaan: I'm not sure why you are filtering markerDF here. Aren't you
+        # plotting the top numFactors markers here independently for each
+        # distance metric? Additionally, since the return value of this function
+        # isn't used anywhere, you can get rid of all of the corrsDF stuff,
+        # calculate ratioDF once, and plot it.
         markerDF = markerDF.loc[markerDF["Marker"].isin(posCorrs)]
         sns.barplot(
             data=ratioDF.tail(numFactors), y="Marker", x=distance, ax=ax[i], color="k"
@@ -106,27 +120,7 @@ def KL_EMD_1D(ax, targCell, numFactors, offTargState=0) -> pd.DataFrame:
     return corrsDF
 
 
-def get_conversion_factor(weightDF, receptor_name):
-    """
-    conversion factors used for citeseq dataset
-    :param weightDF: DataFrame containing weight information for receptors
-    :param receptor_name: Name of the receptor for which the conversion factor is needed
-    :return:
-        conversion_factor: Conversion factor for the specified receptor
-    """
-    IL2Rb_factor = weightDF.loc[weightDF["Receptor"] == "IL2Rb", "Weight"].values[0]
-    IL7Ra_factor = weightDF.loc[weightDF["Receptor"] == "IL7Ra", "Weight"].values[0]
-    IL2Ra_factor = weightDF.loc[weightDF["Receptor"] == "IL2Ra", "Weight"].values[0]
-    if receptor_name == "CD122":
-        return IL2Rb_factor
-    elif receptor_name == "CD25":
-        return IL2Ra_factor
-    elif receptor_name == "CD127":
-        return IL7Ra_factor
-    else:
-        return (IL7Ra_factor + IL2Ra_factor + IL2Rb_factor) / 3
-
-
+# figure 5,7
 def EMD_2D(dataset, signal_receptor, target_cells, special_receptor, ax):
     """
     returns list of descending EMD values for specified target cell (2 receptors)
@@ -264,244 +258,14 @@ def EMD_2D(dataset, signal_receptor, target_cells, special_receptor, ax):
     return sorted_results
 
 
-def EMD_3D(dataset1, target_cells, ax=None):
-    """
-    returns list of descending EMD values for specified target cell (3 receptors)
-    returns list of descending EMD values for specified target cell (3 receptors)
-    :param dataset1: DataFrame containing the dataset
-    :param target_cells: Target cell type for analysis
-    :param ax: Matplotlib Axes object for plotting (optional)
-    :return:
-    List of tuples (format: (recep1, recep2, recep 3, OT value) containing
-        optimal transport distances and receptor information for 3D analysis
-    """
-    CITE_DF = importCITE()
-
-    weightDF = convFactCalc(CITE_DF)
-    exclude_columns = ["CellType1", "CellType2", "CellType3", "Cell"]
-    threshold_multiplier = 5
-    # Calculate the mean and standard deviation for each numeric column
-    numeric_columns = [col for col in dataset1.columns if col not in exclude_columns]
-    column_means = dataset1[numeric_columns].mean()
-    column_stddevs = dataset1[numeric_columns].std()
-    # Identify outliers for each numeric column
-    outliers = {}
-    for column in numeric_columns:
-        threshold = column_means[column] + threshold_multiplier * column_stddevs[column]
-        outliers[column] = dataset1[column] > threshold
-    # Create a mask to filter rows with outliers
-    outlier_mask = pd.DataFrame(outliers)
-    dataset = dataset1[~outlier_mask.any(axis=1)]
-    # receptor_names = [col for col in dataset.columns if col not in exclude_columns]
-    results = []
-    target_cells_df = dataset[
-        (dataset["CellType3"] == target_cells)
-        | (dataset["CellType2"] == target_cells)
-        | (dataset["CellType1"] == target_cells)
-    ]
-    off_target_cells_df = dataset[
-        ~(
-            (dataset["CellType3"] == target_cells)
-            | (dataset["CellType2"] == target_cells)
-            | (dataset["CellType1"] == target_cells)
-        )
-    ]
-    receptor_names = [
-        col
-        for col in dataset.columns
-        if col not in exclude_columns
-        and np.mean(target_cells_df[col]) > np.mean(off_target_cells_df[col])
-        and np.mean(target_cells_df[col]) > 5
-    ]
-
-    for receptor1_name in receptor_names:
-        for receptor2_name in receptor_names:
-            for receptor3_name in receptor_names:
-                if (
-                    receptor1_name != receptor2_name
-                    and receptor1_name != receptor3_name
-                    and receptor2_name != receptor3_name
-                ):
-                    # Get on and off-target counts for receptor1 2 and 3
-                    receptor1_on_target_counts = target_cells_df[receptor1_name].values
-                    receptor1_off_target_counts = off_target_cells_df[
-                        receptor1_name
-                    ].values
-                    receptor2_on_target_counts = target_cells_df[receptor2_name].values
-                    receptor2_off_target_counts = off_target_cells_df[
-                        receptor2_name
-                    ].values
-                    receptor3_on_target_counts = target_cells_df[receptor3_name].values
-                    receptor3_off_target_counts = off_target_cells_df[
-                        receptor3_name
-                    ].values
-                    conversion_factor_receptor1 = get_conversion_factor(
-                        weightDF, receptor1_name
-                    )
-                    conversion_factor_receptor2 = get_conversion_factor(
-                        weightDF, receptor2_name
-                    )
-                    conversion_factor_receptor3 = get_conversion_factor(
-                        weightDF, receptor3_name
-                    )
-                    # Apply the conversion factors to the counts
-                    receptor1_on_target_counts = (
-                        receptor1_on_target_counts * conversion_factor_receptor1
-                    )
-                    receptor1_off_target_counts = (
-                        receptor1_off_target_counts * conversion_factor_receptor1
-                    )
-                    receptor2_on_target_counts = (
-                        receptor2_on_target_counts * conversion_factor_receptor2
-                    )
-                    receptor2_off_target_counts = (
-                        receptor2_off_target_counts * conversion_factor_receptor2
-                    )
-                    receptor3_on_target_counts = (
-                        receptor3_on_target_counts * conversion_factor_receptor3
-                    )
-                    receptor3_off_target_counts = (
-                        receptor3_off_target_counts * conversion_factor_receptor3
-                    )
-                    average_receptor_counts_1_on = np.mean(receptor1_on_target_counts)
-                    average_receptor_counts_1_off = np.mean(receptor1_off_target_counts)
-                    average_receptor_counts_2_on = np.mean(receptor2_on_target_counts)
-                    average_receptor_counts_2_off = np.mean(receptor2_off_target_counts)
-                    average_receptor_counts_3_on = np.mean(receptor3_on_target_counts)
-                    average_receptor_counts_3_off = np.mean(receptor3_off_target_counts)
-                    average_receptor_counts_1 = np.mean(
-                        np.concatenate(
-                            (receptor1_on_target_counts, receptor1_off_target_counts)
-                        ),
-                        axis=0,
-                    )
-                    average_receptor_counts_2 = np.mean(
-                        np.concatenate(
-                            (receptor2_on_target_counts, receptor2_off_target_counts)
-                        ),
-                        axis=0,
-                    )
-                    average_receptor_counts_3 = np.mean(
-                        np.concatenate(
-                            (receptor3_on_target_counts, receptor3_off_target_counts)
-                        ),
-                        axis=0,
-                    )
-                    if (
-                        average_receptor_counts_1_on > 5
-                        and average_receptor_counts_2_on > 5
-                        and average_receptor_counts_3_on > 5
-                        and average_receptor_counts_1_on > average_receptor_counts_1_off
-                        and average_receptor_counts_2_on > average_receptor_counts_2_off
-                        and average_receptor_counts_3_on > average_receptor_counts_3_off
-                    ):
-                        receptor1_on_target_counts = (
-                            receptor1_on_target_counts.astype(float)
-                            / average_receptor_counts_1
-                        )
-                        receptor1_off_target_counts = (
-                            receptor1_off_target_counts.astype(float)
-                            / average_receptor_counts_1
-                        )
-                        receptor2_on_target_counts = (
-                            receptor2_on_target_counts.astype(float)
-                            / average_receptor_counts_2
-                        )
-                        receptor2_off_target_counts = (
-                            receptor2_off_target_counts.astype(float)
-                            / average_receptor_counts_2
-                        )
-                        receptor3_on_target_counts = (
-                            receptor3_on_target_counts.astype(float)
-                            / average_receptor_counts_3
-                        )
-                        receptor3_off_target_counts = (
-                            receptor3_off_target_counts.astype(float)
-                            / average_receptor_counts_3
-                        )
-                        # Calculate the EMD between on-target and off-target counts for
-                        # both receptors # change this so its two [||]
-                        on_target_counts = np.concatenate(
-                            (
-                                receptor1_on_target_counts[:, np.newaxis],
-                                receptor2_on_target_counts[:, np.newaxis],
-                                receptor3_on_target_counts[:, np.newaxis],
-                            ),
-                            axis=1,
-                        )
-                        off_target_counts = np.concatenate(
-                            (
-                                receptor1_off_target_counts[:, np.newaxis],
-                                receptor2_off_target_counts[:, np.newaxis],
-                                receptor3_off_target_counts[:, np.newaxis],
-                            ),
-                            axis=1,
-                        )
-                        average_receptor_counts = np.mean(
-                            np.concatenate((on_target_counts, off_target_counts)),
-                            axis=0,
-                        )
-                        on_target_counts = (
-                            on_target_counts.astype(float) / average_receptor_counts
-                        )
-                        off_target_counts = (
-                            off_target_counts.astype(float) / average_receptor_counts
-                        )
-                        M = ot.dist(on_target_counts, off_target_counts)
-                        a = (
-                            np.ones(on_target_counts.shape[0])
-                            / on_target_counts.shape[0]
-                        )
-                        b = (
-                            np.ones(off_target_counts.shape[0])
-                            / off_target_counts.shape[0]
-                        )
-                        optimal_transport = ot.emd2(a, b, M, numItermax=10000000)
-                        results.append(
-                            (
-                                optimal_transport,
-                                receptor1_name,
-                                receptor2_name,
-                                receptor3_name,
-                            )
-                        )
-                        print("ot:", optimal_transport)
-                    else:
-                        results.append(
-                            (0, receptor1_name, receptor2_name, receptor3_name)
-                        )
-
-    sorted_results = sorted(results, reverse=True)
-    top_receptor_info = [
-        (receptor1_name, receptor2_name, receptor3_name, optimal_transport)
-        for optimal_transport, receptor1_name, receptor2_name, receptor3_name in sorted_results[
-            :10
-        ]
-    ]
-    print("top 10 dist:", top_receptor_info)
-    receptor_pairs = [(info[0], info[1], info[2]) for info in top_receptor_info]
-    distances = [info[3] for info in top_receptor_info]
-    if ax is not None:
-        ax.bar(range(len(receptor_pairs)), distances)
-        ax.set_xlabel("Receptor Pair", fontsize=14)
-        ax.set_ylabel("Distance", fontsize=14)
-        ax.set_title(
-            f"Top 10 Receptor Pair Distances (3D) for {target_cells}", fontsize=14
-        )
-        ax.set_xticks(range(len(receptor_pairs)))
-        ax.set_xticklabels(
-            [f"{pair[0]} - {pair[1]} - {pair[2]}" for pair in receptor_pairs],
-            rotation="vertical",
-            fontsize=14,
-        )
-    return sorted_results
-
-
 def calculate_kl_divergence_2D(targCellMark, offTargCellMark):
     """
     calculates the Kullback-Leibler (KL) divergence between two
         probability distributions
     *used in combination with 1D or 2D KL functions
+    Armaan: I think this docstring is outdated, as it says that this function is
+    used for both 1d and 2d KLs. It actually seems like you can reuse this
+    function for 1d and 2d, as KernelDensity.fit() takes in 1d and 2d data.
     :param targCellMark: Target cell marker data
     :param offTargCellMark: Off-target cell marker data
     :return:
@@ -524,12 +288,15 @@ def calculate_kl_divergence_2D(targCellMark, offTargCellMark):
     return KL_div
 
 
+# figure 5,8
 def KL_divergence_2D(dataset, signal_receptor, target_cells, special_receptor, ax):
     """
+    Armaan: does this return EMD or just KL? I think either the function name or
+    docstring should be changed.
     returns list of descending EMD values for specified target cell (2 receptors)
     :param dataset: DataFrame containing the dataset
     :param signal_receptor: Name of the signal receptor
-    :param target_cells: Target cell type for analysis
+    :param target_cells: Target cell types for analysis
     :param special_receptor: Special receptor to consider
         (optional, used for just calculating distance for 2 receptors)
     :param ax: Matplotlib Axes object for plotting (optional)
@@ -539,6 +306,8 @@ def KL_divergence_2D(dataset, signal_receptor, target_cells, special_receptor, a
     CITE_DF = importCITE()
     weightDF = convFactCalc(CITE_DF)
 
+    # Armaan: just declare one idx and then use it and its negation to filter
+    # the df so you don't repeat code.
     target_cells_df = dataset[
         (dataset["CellType3"] == target_cells) | (dataset["CellType2"] == target_cells)
     ]
@@ -579,6 +348,7 @@ def KL_divergence_2D(dataset, signal_receptor, target_cells, special_receptor, a
         off_target_receptor_counts = off_target_cells_df[
             [signal_receptor, receptor_name]
         ].values
+        # Armaan: can you declare these mappings once and then use them throughout?
         if receptor_name == "CD122":
             conversion_factor = weightDF.loc[
                 weightDF["Receptor"] == "IL2Rb", "Weight"
@@ -637,6 +407,7 @@ def KL_divergence_2D(dataset, signal_receptor, target_cells, special_receptor, a
     return sorted_results
 
 
+# figure 5
 def correlation(cell_type, relevant_epitopes):
     """Calculates the Pearson correlation between two celltypes receptor counts"""
     epitopesList = pd.read_csv("./bicytok/data/epitopeList.csv")
diff --git a/bicytok/figures/figure3.py b/bicytok/figures/figure3.py
index 73ce661..a126562 100644
--- a/bicytok/figures/figure3.py
+++ b/bicytok/figures/figure3.py
@@ -3,18 +3,23 @@
 import seaborn as sns
 
 from ..selectivityFuncs import (
-    get_cell_bindings,
     calcReceptorAbundances,
+    get_cell_bindings,
 )
 from .common import getSetup
 
-
 """SECONDARY: signaling receptor
 EPITOPE: additional targeting receptor
+Armaan: why call it 'starting' affinity? I don't think you're fitting any
+affinities in this figure, so maybe state this explicitly and just call them
+affinities. Also, are you sure you shouldn't be optimizing the affinities here?
+This seems to be the case for the other figures.
 SECONDARY_AFF: starting affinity of ligand and secondary receptor"""
 
+# Armaan: Why call it secondary? Can we use a better name?
 SECONDARY = "CD122"
 EPITOPE = "CD278"
+# Armaan: how did we pick this affinity?
 SECONDARY_AFF = 6.0
 VALENCY = 4
 
@@ -33,12 +38,18 @@
 
 
 def makeFigure():
+    # Armaan: I think you should move all of the figure descriptions to the top
+    # of the file, before you declare the constants, so that the reader has
+    # context for the constants.
     """Figure file to generate bar plots for amount of signal receptor
     bound to each given cell type"""
     ax, f = getSetup((8, 3), (1, 2))
 
+    # Armaan: how are the 8.5s chosen here? I think it would be best to declare
+    # a lot of these literals at the top of the file.
     affs = np.array([SECONDARY_AFF, 8.5, 8.5])
 
+    # Armaan: use os.path.join or pathlib.Path here
     epitopesList = pd.read_csv("./bicytok/data/epitopeList.csv")
     epitopes = list(epitopesList["Epitope"].unique())
 
@@ -47,6 +58,8 @@ def makeFigure():
     bindings = get_cell_bindings(
         epitopesDF,
         SECONDARY,
+        # Armaan: Why is CD25 declared down here while EPITOPE is declared at
+        # the top of the file?
         ["CD25", EPITOPE],
         affs,
         0.1,
diff --git a/bicytok/figures/figure4.py b/bicytok/figures/figure4.py
index 245c19d..501c77c 100644
--- a/bicytok/figures/figure4.py
+++ b/bicytok/figures/figure4.py
@@ -42,6 +42,8 @@ def makeFigure():
 
     offTCells = CELLS[CELLS != targCell]
 
+    # Armaan: use join on the second segment of this path too instead of the /
+    # in the string
     epitopesList = pd.read_csv(join(path_here, "data/epitopeList.csv"))
     epitopes = list(epitopesList["Epitope"].unique())
     epitopesDF = calcReceptorAbundances(epitopes, CELLS)
@@ -57,6 +59,9 @@ def makeFigure():
     for i, target1 in enumerate(targets):
         for j, target2 in enumerate(targets):
             if i == j:
+                # Armaan: shouldn't the molecule in this case include 1 subunit
+                # targeting SIGNAL and 2 subunits corresponding to target1?
+                # Right now it's just 1 subunit for target1.
                 targetsBoth = [target1]
                 optAffs = [8.0, 8.0]
                 valenciesBoth = np.array([[SIGNAL[1], valencies[i]]])
diff --git a/bicytok/figures/figure5.py b/bicytok/figures/figure5.py
index 6e187cd..a8b2976 100644
--- a/bicytok/figures/figure5.py
+++ b/bicytok/figures/figure5.py
@@ -34,7 +34,8 @@
 
 def makeFigure():
     """Figure file to generate plots of bispecific ligand selectivity for
-    combinations of different KL divergences, EMDs, and anti-correlations."""
+    combinations of different KL divergences, EMDs, and anti-correlations.
+    """
     ax, f = getSetup((9, 3), (1, 3))
 
     CITE_DF = importCITE()
@@ -42,6 +43,7 @@ def makeFigure():
 
     offTCells = CELLS[CELLS != TARG_CELL]
 
+    # Armaan: use path.join for the second segment too.
     epitopesList = pd.read_csv(join(path_here, "data/epitopeList.csv"))
     epitopes = list(epitopesList["Epitope"].unique())
     epitopesDF = calcReceptorAbundances(epitopes, CELLS, numCells=1000)
diff --git a/bicytok/figures/figure6.py b/bicytok/figures/figure6.py
index c97370e..d3b114a 100644
--- a/bicytok/figures/figure6.py
+++ b/bicytok/figures/figure6.py
@@ -48,6 +48,8 @@ def makeFigure():
 
     for targCell in cells:
         markerDF = pd.DataFrame(columns=["Marker", "KL", "EMD"])
+        # Armaan: you do this in multiple places, maybe create a function which gets
+        # columns that are not cell types?
         for marker in CITE_DF.loc[
             :,
             (
diff --git a/bicytok/figures/figure7.py b/bicytok/figures/figure7.py
index ab8729e..ae97e1b 100644
--- a/bicytok/figures/figure7.py
+++ b/bicytok/figures/figure7.py
@@ -5,7 +5,6 @@
 from ..imports import importCITE
 from .common import getSetup
 
-
 TARGET_CELL = "Treg"
 
 
@@ -16,6 +15,7 @@ def makeFigure():
 
     markerDF = importCITE()
     new_df = markerDF.head(1000)
+    # Armaan: you are creating a list of receptors and then overwriting it right after.
     receptors = []
     for column in new_df.columns:
         if column not in ["CellType1", "CellType2", "CellType3", "Cell"]:
diff --git a/bicytok/figures/figure8.py b/bicytok/figures/figure8.py
index 07a9df7..f5d7096 100644
--- a/bicytok/figures/figure8.py
+++ b/bicytok/figures/figure8.py
@@ -5,7 +5,6 @@
 from ..imports import importCITE
 from .common import getSetup
 
-
 TARGET_CELL = "Treg"
 
 
@@ -15,6 +14,7 @@ def makeFigure():
 
     markerDF = importCITE()
     new_df = markerDF.head(1000)
+    # Armaan: you are creating a list of receptors and then overwriting it right after.
     receptors = []
     for column in new_df.columns:
         if column not in ["CellType1", "CellType2", "CellType3", "Cell"]:
diff --git a/bicytok/selectivityFuncs.py b/bicytok/selectivityFuncs.py
index 7e6e874..b8178bb 100644
--- a/bicytok/selectivityFuncs.py
+++ b/bicytok/selectivityFuncs.py
@@ -341,7 +341,7 @@ def get_rec_vecs(
 
 
 # Armaan: Can you refactor this and minOffTargSelec to use the same logic for
-# infering the number of bound signaling receptors? These two function share a
+# inferring the number of bound signaling receptors? These two functions share a
 # lot of the same functionality. One reason to avoid this is if this function is
 # a lot slower, and you don't want to call it during optimization, but it
 # doesn't seem obvious that it would be.
diff --git a/general_comments.md b/general_comments.md
index d92378e..981ee31 100644
--- a/general_comments.md
+++ b/general_comments.md
@@ -16,3 +16,4 @@ used (e.g. binding model paper uses R_eq).
 are unused.
 6. Instead of implementing binding model locally, import it from
 https://github.com/meyer-lab/valentBind.
+7. In figure files, move docstrings describing the whole figure to the top of the file, not the makeFigure function.
\ No newline at end of file