From 7ab8574a7825e11010a0312a0878cc9f250dfa70 Mon Sep 17 00:00:00 2001 From: Armaan Abraham Date: Fri, 14 Jun 2024 13:17:17 -0600 Subject: [PATCH] add comments to remaining files --- bicytok/distanceMetricFuncs.py | 281 +++------------------------------ bicytok/figures/figure3.py | 17 +- bicytok/figures/figure4.py | 5 + bicytok/figures/figure5.py | 4 +- bicytok/figures/figure6.py | 2 + bicytok/figures/figure7.py | 2 +- bicytok/figures/figure8.py | 2 +- bicytok/selectivityFuncs.py | 2 +- general_comments.md | 1 + 9 files changed, 55 insertions(+), 261 deletions(-) diff --git a/bicytok/distanceMetricFuncs.py b/bicytok/distanceMetricFuncs.py index 8e4d863..1264d2d 100644 --- a/bicytok/distanceMetricFuncs.py +++ b/bicytok/distanceMetricFuncs.py @@ -14,12 +14,15 @@ path_here = dirname(dirname(__file__)) +# figure 2 def KL_EMD_1D(ax, targCell, numFactors, offTargState=0) -> pd.DataFrame: """ Finds markers which have average greatest difference (EMD and KL) from other cells :param ax: Axes to plot on :param targCell: Target cell type for analysis :param numFactors: Number of top factors to consider + Armaan: offTargState values should be defined as constant integers or + strings so that this code is more readable. :param offTargState: State of off-target comparison (0 for all non-memory Tregs, 1 for all non-Tregs, 2 for naive Tregs) :return: @@ -31,6 +34,7 @@ def KL_EMD_1D(ax, targCell, numFactors, offTargState=0) -> pd.DataFrame: for marker in CITE_DF.loc[ :, ( + # Armaan: I think you can use df.isin here (CITE_DF.columns != "CellType1") & (CITE_DF.columns != "CellType2") & (CITE_DF.columns != "CellType3") @@ -42,17 +46,21 @@ def KL_EMD_1D(ax, targCell, numFactors, offTargState=0) -> pd.DataFrame: targCellMark = ( CITE_DF.loc[CITE_DF["CellType3"] == targCell][marker].values / markAvg ) + # Armaan: isn't this comment outdated? I think it's comparing to all + # cell types that are not targCell # Compare to all non-memory Tregs if offTargState == 0: offTargCellMark = ( CITE_DF.loc[CITE_DF["CellType3"] != targCell][marker].values / markAvg ) + # Armaan: what if the targcell is in the not Treg category? # Compare to all non-Tregs elif offTargState == 1: offTargCellMark = ( CITE_DF.loc[CITE_DF["CellType2"] != "Treg"][marker].values / markAvg ) + # Armaan: what if the targcell is Treg Naive? # Compare to naive Tregs elif offTargState == 2: offTargCellMark = ( @@ -90,12 +98,18 @@ def KL_EMD_1D(ax, targCell, numFactors, offTargState=0) -> pd.DataFrame: ) corrsDF = pd.DataFrame() + for i, distance in enumerate(["Wasserstein Distance", "KL Divergence"]): ratioDF = markerDF.sort_values(by=distance) posCorrs = ratioDF.tail(numFactors).Marker.values corrsDF = pd.concat( [corrsDF, pd.DataFrame({"Distance": distance, "Marker": posCorrs})] ) + # Armaan: I'm not sure why you are filtering markerDF here. Aren't you + # plotting the top numFactors markers here independently for each + # distance metric? Additionally, since the return value of this function + # isn't used anywhere, you can get rid of all of the corrsDF stuff, + # calculate ratioDF once, and plot it. markerDF = markerDF.loc[markerDF["Marker"].isin(posCorrs)] sns.barplot( data=ratioDF.tail(numFactors), y="Marker", x=distance, ax=ax[i], color="k" @@ -106,27 +120,7 @@ def KL_EMD_1D(ax, targCell, numFactors, offTargState=0) -> pd.DataFrame: return corrsDF -def get_conversion_factor(weightDF, receptor_name): - """ - conversion factors used for citeseq dataset - :param weightDF: DataFrame containing weight information for receptors - :param receptor_name: Name of the receptor for which the conversion factor is needed - :return: - conversion_factor: Conversion factor for the specified receptor - """ - IL2Rb_factor = weightDF.loc[weightDF["Receptor"] == "IL2Rb", "Weight"].values[0] - IL7Ra_factor = weightDF.loc[weightDF["Receptor"] == "IL7Ra", "Weight"].values[0] - IL2Ra_factor = weightDF.loc[weightDF["Receptor"] == "IL2Ra", "Weight"].values[0] - if receptor_name == "CD122": - return IL2Rb_factor - elif receptor_name == "CD25": - return IL2Ra_factor - elif receptor_name == "CD127": - return IL7Ra_factor - else: - return (IL7Ra_factor + IL2Ra_factor + IL2Rb_factor) / 3 - - +# figure 5,7 def EMD_2D(dataset, signal_receptor, target_cells, special_receptor, ax): """ returns list of descending EMD values for specified target cell (2 receptors) @@ -264,244 +258,14 @@ def EMD_2D(dataset, signal_receptor, target_cells, special_receptor, ax): return sorted_results -def EMD_3D(dataset1, target_cells, ax=None): - """ - returns list of descending EMD values for specified target cell (3 receptors) - returns list of descending EMD values for specified target cell (3 receptors) - :param dataset1: DataFrame containing the dataset - :param target_cells: Target cell type for analysis - :param ax: Matplotlib Axes object for plotting (optional) - :return: - List of tuples (format: (recep1, recep2, recep 3, OT value) containing - optimal transport distances and receptor information for 3D analysis - """ - CITE_DF = importCITE() - - weightDF = convFactCalc(CITE_DF) - exclude_columns = ["CellType1", "CellType2", "CellType3", "Cell"] - threshold_multiplier = 5 - # Calculate the mean and standard deviation for each numeric column - numeric_columns = [col for col in dataset1.columns if col not in exclude_columns] - column_means = dataset1[numeric_columns].mean() - column_stddevs = dataset1[numeric_columns].std() - # Identify outliers for each numeric column - outliers = {} - for column in numeric_columns: - threshold = column_means[column] + threshold_multiplier * column_stddevs[column] - outliers[column] = dataset1[column] > threshold - # Create a mask to filter rows with outliers - outlier_mask = pd.DataFrame(outliers) - dataset = dataset1[~outlier_mask.any(axis=1)] - # receptor_names = [col for col in dataset.columns if col not in exclude_columns] - results = [] - target_cells_df = dataset[ - (dataset["CellType3"] == target_cells) - | (dataset["CellType2"] == target_cells) - | (dataset["CellType1"] == target_cells) - ] - off_target_cells_df = dataset[ - ~( - (dataset["CellType3"] == target_cells) - | (dataset["CellType2"] == target_cells) - | (dataset["CellType1"] == target_cells) - ) - ] - receptor_names = [ - col - for col in dataset.columns - if col not in exclude_columns - and np.mean(target_cells_df[col]) > np.mean(off_target_cells_df[col]) - and np.mean(target_cells_df[col]) > 5 - ] - - for receptor1_name in receptor_names: - for receptor2_name in receptor_names: - for receptor3_name in receptor_names: - if ( - receptor1_name != receptor2_name - and receptor1_name != receptor3_name - and receptor2_name != receptor3_name - ): - # Get on and off-target counts for receptor1 2 and 3 - receptor1_on_target_counts = target_cells_df[receptor1_name].values - receptor1_off_target_counts = off_target_cells_df[ - receptor1_name - ].values - receptor2_on_target_counts = target_cells_df[receptor2_name].values - receptor2_off_target_counts = off_target_cells_df[ - receptor2_name - ].values - receptor3_on_target_counts = target_cells_df[receptor3_name].values - receptor3_off_target_counts = off_target_cells_df[ - receptor3_name - ].values - conversion_factor_receptor1 = get_conversion_factor( - weightDF, receptor1_name - ) - conversion_factor_receptor2 = get_conversion_factor( - weightDF, receptor2_name - ) - conversion_factor_receptor3 = get_conversion_factor( - weightDF, receptor3_name - ) - # Apply the conversion factors to the counts - receptor1_on_target_counts = ( - receptor1_on_target_counts * conversion_factor_receptor1 - ) - receptor1_off_target_counts = ( - receptor1_off_target_counts * conversion_factor_receptor1 - ) - receptor2_on_target_counts = ( - receptor2_on_target_counts * conversion_factor_receptor2 - ) - receptor2_off_target_counts = ( - receptor2_off_target_counts * conversion_factor_receptor2 - ) - receptor3_on_target_counts = ( - receptor3_on_target_counts * conversion_factor_receptor3 - ) - receptor3_off_target_counts = ( - receptor3_off_target_counts * conversion_factor_receptor3 - ) - average_receptor_counts_1_on = np.mean(receptor1_on_target_counts) - average_receptor_counts_1_off = np.mean(receptor1_off_target_counts) - average_receptor_counts_2_on = np.mean(receptor2_on_target_counts) - average_receptor_counts_2_off = np.mean(receptor2_off_target_counts) - average_receptor_counts_3_on = np.mean(receptor3_on_target_counts) - average_receptor_counts_3_off = np.mean(receptor3_off_target_counts) - average_receptor_counts_1 = np.mean( - np.concatenate( - (receptor1_on_target_counts, receptor1_off_target_counts) - ), - axis=0, - ) - average_receptor_counts_2 = np.mean( - np.concatenate( - (receptor2_on_target_counts, receptor2_off_target_counts) - ), - axis=0, - ) - average_receptor_counts_3 = np.mean( - np.concatenate( - (receptor3_on_target_counts, receptor3_off_target_counts) - ), - axis=0, - ) - if ( - average_receptor_counts_1_on > 5 - and average_receptor_counts_2_on > 5 - and average_receptor_counts_3_on > 5 - and average_receptor_counts_1_on > average_receptor_counts_1_off - and average_receptor_counts_2_on > average_receptor_counts_2_off - and average_receptor_counts_3_on > average_receptor_counts_3_off - ): - receptor1_on_target_counts = ( - receptor1_on_target_counts.astype(float) - / average_receptor_counts_1 - ) - receptor1_off_target_counts = ( - receptor1_off_target_counts.astype(float) - / average_receptor_counts_1 - ) - receptor2_on_target_counts = ( - receptor2_on_target_counts.astype(float) - / average_receptor_counts_2 - ) - receptor2_off_target_counts = ( - receptor2_off_target_counts.astype(float) - / average_receptor_counts_2 - ) - receptor3_on_target_counts = ( - receptor3_on_target_counts.astype(float) - / average_receptor_counts_3 - ) - receptor3_off_target_counts = ( - receptor3_off_target_counts.astype(float) - / average_receptor_counts_3 - ) - # Calculate the EMD between on-target and off-target counts for - # both receptors # change this so its two [||] - on_target_counts = np.concatenate( - ( - receptor1_on_target_counts[:, np.newaxis], - receptor2_on_target_counts[:, np.newaxis], - receptor3_on_target_counts[:, np.newaxis], - ), - axis=1, - ) - off_target_counts = np.concatenate( - ( - receptor1_off_target_counts[:, np.newaxis], - receptor2_off_target_counts[:, np.newaxis], - receptor3_off_target_counts[:, np.newaxis], - ), - axis=1, - ) - average_receptor_counts = np.mean( - np.concatenate((on_target_counts, off_target_counts)), - axis=0, - ) - on_target_counts = ( - on_target_counts.astype(float) / average_receptor_counts - ) - off_target_counts = ( - off_target_counts.astype(float) / average_receptor_counts - ) - M = ot.dist(on_target_counts, off_target_counts) - a = ( - np.ones(on_target_counts.shape[0]) - / on_target_counts.shape[0] - ) - b = ( - np.ones(off_target_counts.shape[0]) - / off_target_counts.shape[0] - ) - optimal_transport = ot.emd2(a, b, M, numItermax=10000000) - results.append( - ( - optimal_transport, - receptor1_name, - receptor2_name, - receptor3_name, - ) - ) - print("ot:", optimal_transport) - else: - results.append( - (0, receptor1_name, receptor2_name, receptor3_name) - ) - - sorted_results = sorted(results, reverse=True) - top_receptor_info = [ - (receptor1_name, receptor2_name, receptor3_name, optimal_transport) - for optimal_transport, receptor1_name, receptor2_name, receptor3_name in sorted_results[ - :10 - ] - ] - print("top 10 dist:", top_receptor_info) - receptor_pairs = [(info[0], info[1], info[2]) for info in top_receptor_info] - distances = [info[3] for info in top_receptor_info] - if ax is not None: - ax.bar(range(len(receptor_pairs)), distances) - ax.set_xlabel("Receptor Pair", fontsize=14) - ax.set_ylabel("Distance", fontsize=14) - ax.set_title( - f"Top 10 Receptor Pair Distances (3D) for {target_cells}", fontsize=14 - ) - ax.set_xticks(range(len(receptor_pairs))) - ax.set_xticklabels( - [f"{pair[0]} - {pair[1]} - {pair[2]}" for pair in receptor_pairs], - rotation="vertical", - fontsize=14, - ) - return sorted_results - - def calculate_kl_divergence_2D(targCellMark, offTargCellMark): """ calculates the Kullback-Leibler (KL) divergence between two probability distributions *used in combination with 1D or 2D KL functions + Armaan: I think this docstring is outdated, as it says that this function is + used for both 1d and 2d KLs. It actually seems like you can reuse this + function for 1d and 2d, as KernelDensity.fit() takes in 1d and 2d data. :param targCellMark: Target cell marker data :param offTargCellMark: Off-target cell marker data :return: @@ -524,12 +288,15 @@ def calculate_kl_divergence_2D(targCellMark, offTargCellMark): return KL_div +# figure 5,8 def KL_divergence_2D(dataset, signal_receptor, target_cells, special_receptor, ax): """ + Armaan: does this return EMD or just KL? I think either the function name or + docstring should be changed. returns list of descending EMD values for specified target cell (2 receptors) :param dataset: DataFrame containing the dataset :param signal_receptor: Name of the signal receptor - :param target_cells: Target cell type for analysis + :param target_cells: Target cell types for analysis :param special_receptor: Special receptor to consider (optional, used for just calculating distance for 2 receptors) :param ax: Matplotlib Axes object for plotting (optional) @@ -539,6 +306,8 @@ def KL_divergence_2D(dataset, signal_receptor, target_cells, special_receptor, a CITE_DF = importCITE() weightDF = convFactCalc(CITE_DF) + # Armaan: just declare one idx and then use it and its negation to filter + # the df so you don't repeat code. target_cells_df = dataset[ (dataset["CellType3"] == target_cells) | (dataset["CellType2"] == target_cells) ] @@ -579,6 +348,7 @@ def KL_divergence_2D(dataset, signal_receptor, target_cells, special_receptor, a off_target_receptor_counts = off_target_cells_df[ [signal_receptor, receptor_name] ].values + # Armaan: can you declare these mappings once and then use them throughout? if receptor_name == "CD122": conversion_factor = weightDF.loc[ weightDF["Receptor"] == "IL2Rb", "Weight" @@ -637,6 +407,7 @@ def KL_divergence_2D(dataset, signal_receptor, target_cells, special_receptor, a return sorted_results +# figure 5 def correlation(cell_type, relevant_epitopes): """Calculates the Pearson correlation between two celltypes receptor counts""" epitopesList = pd.read_csv("./bicytok/data/epitopeList.csv") diff --git a/bicytok/figures/figure3.py b/bicytok/figures/figure3.py index 73ce661..a126562 100644 --- a/bicytok/figures/figure3.py +++ b/bicytok/figures/figure3.py @@ -3,18 +3,23 @@ import seaborn as sns from ..selectivityFuncs import ( - get_cell_bindings, calcReceptorAbundances, + get_cell_bindings, ) from .common import getSetup - """SECONDARY: signaling receptor EPITOPE: additional targeting receptor +Armaan: why call it 'starting' affinity? I don't think you're fitting any +affinities in this figure, so maybe state this explicitly and just call them +affinities. Also, are you sure you shouldn't be optimizing the affinities here? +This seems to be the case for the other figures. SECONDARY_AFF: starting affinity of ligand and secondary receptor""" +# Armaan: Why call it secondary? Can we use a better name? SECONDARY = "CD122" EPITOPE = "CD278" +# Armaan: how did we pick this affinity? SECONDARY_AFF = 6.0 VALENCY = 4 @@ -33,12 +38,18 @@ def makeFigure(): + # Armaan: I think you should move all of the figure descriptions to the top + # of the file, before you declare the constants, so that the reader has + # context for the constants. """Figure file to generate bar plots for amount of signal receptor bound to each given cell type""" ax, f = getSetup((8, 3), (1, 2)) + # Armaan: how are the 8.5s chosen here? I think it would be best to declare + # a lot of these literals at the top of the file. affs = np.array([SECONDARY_AFF, 8.5, 8.5]) + # Armaan: use os.path.join or pathlib.Path here epitopesList = pd.read_csv("./bicytok/data/epitopeList.csv") epitopes = list(epitopesList["Epitope"].unique()) @@ -47,6 +58,8 @@ def makeFigure(): bindings = get_cell_bindings( epitopesDF, SECONDARY, + # Armaan: Why is CD25 declared down here while EPITOPE is declared at + # the top of the file? ["CD25", EPITOPE], affs, 0.1, diff --git a/bicytok/figures/figure4.py b/bicytok/figures/figure4.py index 245c19d..501c77c 100644 --- a/bicytok/figures/figure4.py +++ b/bicytok/figures/figure4.py @@ -42,6 +42,8 @@ def makeFigure(): offTCells = CELLS[CELLS != targCell] + # Armaan: use join on the second segment of this path too instead of the / + # in the string epitopesList = pd.read_csv(join(path_here, "data/epitopeList.csv")) epitopes = list(epitopesList["Epitope"].unique()) epitopesDF = calcReceptorAbundances(epitopes, CELLS) @@ -57,6 +59,9 @@ def makeFigure(): for i, target1 in enumerate(targets): for j, target2 in enumerate(targets): if i == j: + # Armaan: shouldn't the molecule in this case include 1 subunit + # targeting SIGNAL and 2 subunits corresponding to target1? + # Right now it's just 1 subunit for target1. targetsBoth = [target1] optAffs = [8.0, 8.0] valenciesBoth = np.array([[SIGNAL[1], valencies[i]]]) diff --git a/bicytok/figures/figure5.py b/bicytok/figures/figure5.py index 6e187cd..a8b2976 100644 --- a/bicytok/figures/figure5.py +++ b/bicytok/figures/figure5.py @@ -34,7 +34,8 @@ def makeFigure(): """Figure file to generate plots of bispecific ligand selectivity for - combinations of different KL divergences, EMDs, and anti-correlations.""" + combinations of different KL divergences, EMDs, and anti-correlations. + """ ax, f = getSetup((9, 3), (1, 3)) CITE_DF = importCITE() @@ -42,6 +43,7 @@ def makeFigure(): offTCells = CELLS[CELLS != TARG_CELL] + # Armaan: use path.join for the second segment too. epitopesList = pd.read_csv(join(path_here, "data/epitopeList.csv")) epitopes = list(epitopesList["Epitope"].unique()) epitopesDF = calcReceptorAbundances(epitopes, CELLS, numCells=1000) diff --git a/bicytok/figures/figure6.py b/bicytok/figures/figure6.py index c97370e..d3b114a 100644 --- a/bicytok/figures/figure6.py +++ b/bicytok/figures/figure6.py @@ -48,6 +48,8 @@ def makeFigure(): for targCell in cells: markerDF = pd.DataFrame(columns=["Marker", "KL", "EMD"]) + # Armaan: you do this in multiple places, maybe create a function which gets + # columns that are not cell types? for marker in CITE_DF.loc[ :, ( diff --git a/bicytok/figures/figure7.py b/bicytok/figures/figure7.py index ab8729e..ae97e1b 100644 --- a/bicytok/figures/figure7.py +++ b/bicytok/figures/figure7.py @@ -5,7 +5,6 @@ from ..imports import importCITE from .common import getSetup - TARGET_CELL = "Treg" @@ -16,6 +15,7 @@ def makeFigure(): markerDF = importCITE() new_df = markerDF.head(1000) + # Armaan: you are creating a list of receptors and then overwriting it right after. receptors = [] for column in new_df.columns: if column not in ["CellType1", "CellType2", "CellType3", "Cell"]: diff --git a/bicytok/figures/figure8.py b/bicytok/figures/figure8.py index 07a9df7..f5d7096 100644 --- a/bicytok/figures/figure8.py +++ b/bicytok/figures/figure8.py @@ -5,7 +5,6 @@ from ..imports import importCITE from .common import getSetup - TARGET_CELL = "Treg" @@ -15,6 +14,7 @@ def makeFigure(): markerDF = importCITE() new_df = markerDF.head(1000) + # Armaan: you are creating a list of receptors and then overwriting it right after. receptors = [] for column in new_df.columns: if column not in ["CellType1", "CellType2", "CellType3", "Cell"]: diff --git a/bicytok/selectivityFuncs.py b/bicytok/selectivityFuncs.py index 7e6e874..b8178bb 100644 --- a/bicytok/selectivityFuncs.py +++ b/bicytok/selectivityFuncs.py @@ -341,7 +341,7 @@ def get_rec_vecs( # Armaan: Can you refactor this and minOffTargSelec to use the same logic for -# infering the number of bound signaling receptors? These two function share a +# inferring the number of bound signaling receptors? These two functions share a # lot of the same functionality. One reason to avoid this is if this function is # a lot slower, and you don't want to call it during optimization, but it # doesn't seem obvious that it would be. diff --git a/general_comments.md b/general_comments.md index d92378e..981ee31 100644 --- a/general_comments.md +++ b/general_comments.md @@ -16,3 +16,4 @@ used (e.g. binding model paper uses R_eq). are unused. 6. Instead of implementing binding model locally, import it from https://github.com/meyer-lab/valentBind. +7. In figure files, move docstrings describing the whole figure to the top of the file, not the makeFigure function. \ No newline at end of file