From d9643f86e3db4918ba564639b2d8d38aa1246822 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wen=20Wang=20=28=E7=8E=8B=E6=96=87=29?= Date: Wed, 30 Aug 2023 17:15:25 -0400 Subject: [PATCH 01/10] Create --- inst/python/g2bento.py | 41 ++++++++ inst/python/python_bento_analysis.py | 148 +++++++++++++++++++++++++++ 2 files changed, 189 insertions(+) create mode 100644 inst/python/g2bento.py create mode 100644 inst/python/python_bento_analysis.py diff --git a/inst/python/g2bento.py b/inst/python/g2bento.py new file mode 100644 index 000000000..2ab4d9ae4 --- /dev/null +++ b/inst/python/g2bento.py @@ -0,0 +1,41 @@ +from calendar import c +import os + +import anndata as ad +import bento as bt +import geopandas as gpd +import numpy as np +import pandas as pd +import scipy +from anndata import AnnData +from geopandas import GeoDataFrame +from shapely.geometry import MultiPolygon, Point, Polygon + + +def create_seg_df(vertices_df: pd.DataFrame, x: str = 'x', y: str = 'y', cell_id: str = 'cell_id') -> pd.DataFrame: + """ + Create a dataframe with cell_id and geometry columns from a dataframe with vertices + :param vertices_df: a dataframe with columns: cell_id, x, y + :param x: the column name of x coordinates + :param y: the column name of y coordinates + :param cell_id: the column name of cell id + :param bounds: the bounds of the area (minx, miny, maxx, maxy) + :return: a dataframe with cell_id and geometry columns + """ + # --- create polygons --- + polygons = vertices_df.groupby(cell_id).apply(lambda group: Polygon(zip(group[x], group[y]))) # type: ignore + seg_df = gpd.GeoDataFrame(polygons, columns=['geometry']) + # --- correct invalid polygons --- + corrected_seg_df = seg_df.copy(deep=True) + for i in range(seg_df.shape[0]): + if not seg_df.iloc[i, 0].is_valid: # type: ignore + corrected_seg_df.iloc[i, 0] = seg_df.iloc[i, 0].buffer(0) # type: ignore + if isinstance(corrected_seg_df.iloc[i, 0], MultiPolygon): + corrected_seg_df.iloc[i, 0] = max(corrected_seg_df.iloc[i, 0], key=lambda x: x.area) # type: ignore + return corrected_seg_df + + +def create_AnnData(trainscripts: pd.DataFrame, cell_shape: GeoDataFrame, nucleus_shape) -> AnnData: + cell_seg = create_seg_df(cell_shape, x='x', y='y', cell_id='cell') + nucleus_seg = create_seg_df(nucleus_shape, x='x', y='y', cell_id='nucleus') + return bt.io.prepare(molecules=trainscripts, cell_seg=cell_seg, other_seg={'nucleus': nucleus_seg}) # type: ignore diff --git a/inst/python/python_bento_analysis.py b/inst/python/python_bento_analysis.py new file mode 100644 index 000000000..697af368a --- /dev/null +++ b/inst/python/python_bento_analysis.py @@ -0,0 +1,148 @@ +from typing import List, Optional + +import geopandas as gpd +import numpy as np +import pandas as pd +from anndata import AnnData +from scipy.sparse import csc_matrix +from shapely.geometry import MultiPolygon, Point, Polygon + + +def create_seg_df(vertices_df: pd.DataFrame, x: str = 'x', y: str = 'y', cell_id: str = 'cell_id') -> pd.DataFrame: + """ + Create a dataframe with cell_id and geometry columns from a dataframe with vertices + :param vertices_df: a dataframe with columns: cell_id, x, y + :param x: the column name of x coordinates + :param y: the column name of y coordinates + :param cell_id: the column name of cell id + :param bounds: the bounds of the area (minx, miny, maxx, maxy) + :return: a dataframe with cell_id and geometry columns + """ + # --- create polygons --- + polygons = vertices_df.groupby(cell_id).apply(lambda group: Polygon(zip(group[x], group[y]))) # type: ignore + seg_df = gpd.GeoDataFrame(polygons, columns=['geometry']) + # --- correct invalid polygons --- + corrected_seg_df = seg_df.copy(deep=True) + for i in range(seg_df.shape[0]): + if not seg_df.iloc[i, 0].is_valid: # type: ignore + corrected_seg_df.iloc[i, 0] = seg_df.iloc[i, 0].buffer(0) # type: ignore + if isinstance(corrected_seg_df.iloc[i, 0], MultiPolygon): + corrected_seg_df.iloc[i, 0] = max(corrected_seg_df.iloc[i, 0], key=lambda x: x.area) # type: ignore + return corrected_seg_df + + +def valid_transcript(row, adata: AnnData, legal_cells: List[str]) -> bool: + """ + Check if the transcript is valid + :param adata: anndata object + :param row: a row of adata.uns['points'] + :return: True if the transcript is valid + """ + return str(row['cell']) in legal_cells and adata.obs.loc[str(row['cell']), 'cell_shape'].contains( # type: ignore + Point(row['x'], row['y'])) + + +def adata_X_handle(adata: AnnData) -> None: + """ + Convert adata.X from sparse matrix to dense matrix + :param adata: anndata object + :return: None + """ + if isinstance(adata.X, csc_matrix): # in case of duplicate handling + adata.X = np.array(adata.X.todense()) + + +def adata_var_handle(adata: AnnData) -> None: + """ + Set gene as index of adata.var + :param adata: anndata object + :return: None + """ + if adata.var.shape[1] == 1: # in case of duplicate handling + adata.var.set_index('gene', inplace=True) + + +def adata_obs_handle(adata: AnnData, + shape_names: Optional[List[str]] = None, + bounds: Optional[List[float]] = None) -> None: + """ + Handle adata.obs to make sure the data type is correct + :param adata: anndata object + :param shape_names: the names of shapes that need to handle + :param bounds: the bounds of the area (minx, miny, maxx, maxy) + :return: None + """ + # --- shapes need to handle --- + if shape_names is None: + shape_names = ['cell_shape', 'nucleus_shape'] # we may have other shapes (mitochondria etc) in the future + # --- create GEODataFrame --- + for shape in shape_names: + if shape not in adata.obs.columns: + seg_df = create_seg_df(vertices_df=adata.uns[shape]) + adata.obs[shape] = seg_df['geometry'] + del adata.uns[shape] + # --- remove cells outside the bounds --- + if bounds is not None: + area_shape = Polygon([(bounds[0], bounds[1]), (bounds[0], bounds[3]), (bounds[2], bounds[3]), + (bounds[2], bounds[1])]) + adata.uns['legal_cells'] = adata.obs['cell_shape'].apply(lambda x: area_shape.contains(x)) + else: + adata.uns['legal_cells'] = np.ones(adata.obs.shape[0], dtype=bool) + # --- add batch column --- + if 'batch' not in adata.obs.columns: + adata.obs['batch'] = 0 + + +def adata_uns_points_handle(adata: AnnData) -> None: + """ + Handle adata.uns['points'] to make sure the data type is correct + :param adata: anndata object + :return: None + """ + # --- correct the value of nucleus --- + if adata.uns['points']['nucleus'].min() == 0: # in case of duplicate handling + points = adata.uns['points'] + points['nucleus'] = points['cell'].values * points['nucleus'].values + points.loc[points['nucleus'] == 0, 'nucleus'] = -1 + # --- remove unused transcripts --- + legal_cells = adata.obs.index[adata.uns['legal_cells']] + legal_transcripts = adata.uns['points'].apply(valid_transcript, axis=1, args=(adata, legal_cells)) + adata.uns['points'] = adata.uns['points'][legal_transcripts] + # --- setup data type --- + dtypes = {'x': 'float', 'y': 'float', 'gene': 'str', 'cell': 'str', 'nucleus': 'str', 'batch': 'str'} + adata.uns['points'] = adata.uns['points'].astype(dtypes) + adata.uns['points']['gene'] = adata.uns['points']['gene'].astype('category') + adata.uns['points']['cell'] = adata.uns['points']['cell'].astype('category') + adata.uns['points']['nucleus'] = adata.uns['points']['nucleus'].astype('category') + adata.uns['points']['batch'] = adata.uns['points']['batch'].astype('category') + # --- add legal_genes in adata.uns --- + adata.uns['legal_genes'] = [x in adata.uns['points']['gene'].tolist() for x in adata.var.index] + + +def adata_handle(adata: AnnData, bounds: Optional[List[float]] = None) -> AnnData: + """ + Handle anndata object to make sure the data type is correct + Process: + 1. processing adata.X: format + 2. processing adata.var: format + 3. processing adata.obs + - create GEODataFrame & delete adata.uns['cell_shape'] and adata.uns['nucleus_shape'] + - remove cells outside the bounds & add legal_cells in adata.uns + - add batch column + 4. processing adata.uns['points'] + - correct the value of nucleus + - remove unused transcripts + - setup data type + - add legal_genes in adata.uns + 5. subset adata + :param adata: AnnData object + :return: Filtered Anndata object + """ + adata_X_handle(adata) + adata_var_handle(adata) + adata_obs_handle(adata, bounds=bounds) + adata_uns_points_handle(adata) + + return adata[adata.uns['legal_cells'], adata.uns['legal_genes']] + + From a0bf628b6bdb8cca118c5fbb00a7080752231e0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wen=20Wang=20=28=E7=8E=8B=E6=96=87=29?= Date: Fri, 8 Sep 2023 10:45:55 -0400 Subject: [PATCH 02/10] Ready for install --- R/python_bento.R | 29 +++ inst/python/g2bento.py | 79 ++++++-- inst/python/log.py | 41 +++++ inst/python/python_bento_analysis.py | 265 +++++++++++++-------------- 4 files changed, 267 insertions(+), 147 deletions(-) create mode 100644 R/python_bento.R create mode 100644 inst/python/log.py diff --git a/R/python_bento.R b/R/python_bento.R new file mode 100644 index 000000000..8ef17c613 --- /dev/null +++ b/R/python_bento.R @@ -0,0 +1,29 @@ +#' @title Create bento adata object from gobject +#' @name createBentoAdata +#' @description Create bento adata object from gobject +#' @param gobject Giotto object +#' @return bento_adata bento adata object +#' @export +createBentoAdata <- function(gobject){ + # Transcripts + transcripts_df <- as.data.frame(sf::st_as_sf(gobject@feat_info$rna@spatVector)) + coordinates_df <- sf::st_coordinates(transcripts_df['geometry']) + t_df <- cbind(coordinates_df, transcripts_df[c("feat_ID")]) + colnames(t_df) <- c('x','y','gene') + t_df <- t_df[t_df$cell != '-1',] # Remove extracellular points, required by Bento + + # Cell shapes + # TODO: Add batch information based on? + cell_poly <- spatVector_to_dt(gobject@spatial_info$cell@spatVector) + cell_poly <- data.frame(cell_id = cell_poly$poly_ID, x = cell_poly$x, y = cell_poly$y, batch = 0L) + + # Nuclei shapes + # TODO: Add batch information based on? + nucleus_poly <- spatVector_to_dt(gobject@spatial_info$nucleus@spatVector) + nucleus_poly <- data.frame(cell_id = nucleus_poly$poly_ID, x = nucleus_poly$x, y = nucleus_poly$y, batch = 0L) + + # Create AnnData object + bento_adata <- Giotto::create_AnnData(trainscripts = t_df, cell_shape = cell_poly, nucleus_shape = nucleus_poly) + + return(bento_adata) +} diff --git a/inst/python/g2bento.py b/inst/python/g2bento.py index 2ab4d9ae4..783b33cf0 100644 --- a/inst/python/g2bento.py +++ b/inst/python/g2bento.py @@ -1,16 +1,10 @@ -from calendar import c -import os - -import anndata as ad import bento as bt import geopandas as gpd -import numpy as np import pandas as pd -import scipy from anndata import AnnData -from geopandas import GeoDataFrame -from shapely.geometry import MultiPolygon, Point, Polygon +from shapely.geometry import MultiPolygon, Polygon +from log import debug, warning, info def create_seg_df(vertices_df: pd.DataFrame, x: str = 'x', y: str = 'y', cell_id: str = 'cell_id') -> pd.DataFrame: """ @@ -35,7 +29,68 @@ def create_seg_df(vertices_df: pd.DataFrame, x: str = 'x', y: str = 'y', cell_id return corrected_seg_df -def create_AnnData(trainscripts: pd.DataFrame, cell_shape: GeoDataFrame, nucleus_shape) -> AnnData: - cell_seg = create_seg_df(cell_shape, x='x', y='y', cell_id='cell') - nucleus_seg = create_seg_df(nucleus_shape, x='x', y='y', cell_id='nucleus') - return bt.io.prepare(molecules=trainscripts, cell_seg=cell_seg, other_seg={'nucleus': nucleus_seg}) # type: ignore +def add_batch(adata: AnnData, cell_shape: pd.DataFrame): + """ + Add batch information to an AnnData object + If cell_seg have batch information, add batch information to adata, else all batch will be set to 0 + :param adata: an AnnData object + :param cell_seg: the name of the cell segmentation + :return: an AnnData object with batch information + """ + if 'batch' in cell_shape.columns: + info('Batch information found in cell_shape, adding batch information to adata') + adata.obs['batch'] = [cell_shape.loc[cell_shape['cell_id']==cell,'batch'].values[0] for cell in adata.obs_names] # type: ignore + adata.uns['points']['batch'] = [cell_shape.loc[cell_shape['cell_id']==cell,'batch'].values[0] for cell in adata.uns['points']['cell']] # type: ignore + else: # Interim measures, batch information may not transfered to cell_shape + warning('Batch information not found in cell_shape, all batch will be set to 0') + adata.obs['batch'] = 0 + adata.uns['points']['batch'] = 0 + adata.obs['batch'] = adata.obs['batch'].astype('category') + adata.uns['points']['batch'] = adata.uns['points']['batch'].astype('category') + + +def create_AnnData(trainscripts, cell_shape, nucleus_shape, x_max = None, x_min = None, y_max = None, y_min = None) -> AnnData: + # --- processing input --- + trainscripts = pd.DataFrame(trainscripts) + cell_shape = pd.DataFrame(cell_shape) + cell_shape['cell_id'] = cell_shape['cell_id'].astype('category') + if 'batch' in cell_shape.columns: + cell_shape['batch'] = cell_shape['batch'].astype('category') + nucleus_shape = pd.DataFrame(nucleus_shape) + nucleus_shape['cell_id'] = nucleus_shape['cell_id'].astype('category') + + # --- create shape --- + cell_seg = create_seg_df(cell_shape, x='x', y='y', cell_id='cell_id') + nucleus_seg = create_seg_df(nucleus_shape, x='x', y='y', cell_id='cell_id') + if cell_seg.shape[0] > 500: + warning('cell_seg has more than 500 cells, processing may take a long time.') + + # --- filter cells --- + # Interim measures + # subsetGiottoLocs don't give wanted result, so we use a workaround here + if x_max is not None and x_min is not None and y_max is not None and y_min is not None: + area_shape = Polygon([(x_min, y_min), (x_max, y_min), (x_max, y_max), (x_min, y_max)]) + legal_cells = cell_seg['geometry'].apply(lambda x: area_shape.contains(x)) + else: + legal_cells = pd.Series([True] * cell_seg.shape[0]) + + # --- create AnnData --- + adata: AnnData = bt.io.prepare(molecules=trainscripts, cell_seg=cell_seg, other_seg={'nucleus': nucleus_seg}) # type: ignore + add_batch(adata, cell_shape) + + # --- filter genes --- + # Interim measures + # subsetGiottoLocs don't give wanted result, so we use a workaround here + legal_genes = adata.var_names.isin(set(adata.uns['points']['gene'].values)) + filtered_adata = adata[legal_cells,legal_genes] # type: ignore + filtered_adata.uns['points']['gene'] = filtered_adata.uns['points']['gene'].cat.remove_unused_categories() + + # Interim measures + # subsetted adata don't have _X property, which will cause unexpect error for adata.__sizeof__ + # when create object in R, reticulate will call sys.getsizeof to get the size of the object, which will call adata.__sizeof__ + # https://github.com/rstudio/reticulate/issues/1332 + # https://github.com/rstudio/rstudio/issues/13491 + filtered_adata._X = filtered_adata.X + return filtered_adata + + # return adata diff --git a/inst/python/log.py b/inst/python/log.py new file mode 100644 index 000000000..e6c3f2d1c --- /dev/null +++ b/inst/python/log.py @@ -0,0 +1,41 @@ +import sys +import time + + +def get_current_time() -> str: + return time.strftime('%H:%M:%S', time.localtime()) + + +def write_direct_message(message: str): + curr_time_str = get_current_time() + sys.stdout.write(f'{curr_time_str} --- {message}\n') + sys.stdout.flush() + + +def debug(message: str): + write_direct_message(f'DEBUG: {message}') + + +def info(message: str): + write_direct_message(f'INFO: {message}') + + +def write_direct_message_err(message: str): + curr_time_str = get_current_time() + sys.stderr.write(f'{curr_time_str} --- {message}\n') + sys.stderr.flush() + + +def warning(message: str): + write_direct_message_err(f'WARNING: {message}') + + +def error(message: str): + write_direct_message_err(f'ERROR: {message}') + + +def critical(message: str): + write_direct_message_err(f'CRITICAL: {message}') + + +__all__ = ['debug', 'info', 'warning', 'error', 'critical'] diff --git a/inst/python/python_bento_analysis.py b/inst/python/python_bento_analysis.py index 697af368a..45cf0e055 100644 --- a/inst/python/python_bento_analysis.py +++ b/inst/python/python_bento_analysis.py @@ -1,148 +1,143 @@ from typing import List, Optional - -import geopandas as gpd -import numpy as np -import pandas as pd +import bento as bt from anndata import AnnData -from scipy.sparse import csc_matrix -from shapely.geometry import MultiPolygon, Point, Polygon +import emoji +from bento._utils import track +from bento.tools._colocation import _colocation_tensor +from bento.tools import decompose +from kneed import KneeLocator + +import matplotlib as mpl +import matplotlib.pyplot as plt +import seaborn as sns + +from log import warning, info + + +# --------------------------------- +# modified bento and dependencies functions/classes +# --------------------------------- +@track +def colocation( + data: AnnData, + ranks: List[int], + fname: str, + iterations: int = 3, + plot_error: bool = True, + copy: bool = False, +): + """Decompose a tensor of pairwise colocalization quotients into signatures. + + Parameters + ---------- + adata : AnnData + Spatial formatted AnnData object. + ranks : list + List of ranks to decompose the tensor. + iterations : int + Number of iterations to run the decomposition. + plot_error : bool + Whether to plot the error of the decomposition. + copy : bool + Whether to return a copy of the AnnData object. Default False. + Returns + ------- + adata : AnnData + .uns['factors']: Decomposed tensor factors. + .uns['factors_error']: Decomposition error. + """ + adata = data.copy() if copy else data + print("Preparing tensor...") + _colocation_tensor(adata, copy=copy) -def create_seg_df(vertices_df: pd.DataFrame, x: str = 'x', y: str = 'y', cell_id: str = 'cell_id') -> pd.DataFrame: - """ - Create a dataframe with cell_id and geometry columns from a dataframe with vertices - :param vertices_df: a dataframe with columns: cell_id, x, y - :param x: the column name of x coordinates - :param y: the column name of y coordinates - :param cell_id: the column name of cell id - :param bounds: the bounds of the area (minx, miny, maxx, maxy) - :return: a dataframe with cell_id and geometry columns - """ - # --- create polygons --- - polygons = vertices_df.groupby(cell_id).apply(lambda group: Polygon(zip(group[x], group[y]))) # type: ignore - seg_df = gpd.GeoDataFrame(polygons, columns=['geometry']) - # --- correct invalid polygons --- - corrected_seg_df = seg_df.copy(deep=True) - for i in range(seg_df.shape[0]): - if not seg_df.iloc[i, 0].is_valid: # type: ignore - corrected_seg_df.iloc[i, 0] = seg_df.iloc[i, 0].buffer(0) # type: ignore - if isinstance(corrected_seg_df.iloc[i, 0], MultiPolygon): - corrected_seg_df.iloc[i, 0] = max(corrected_seg_df.iloc[i, 0], key=lambda x: x.area) # type: ignore - return corrected_seg_df - - -def valid_transcript(row, adata: AnnData, legal_cells: List[str]) -> bool: - """ - Check if the transcript is valid - :param adata: anndata object - :param row: a row of adata.uns['points'] - :return: True if the transcript is valid - """ - return str(row['cell']) in legal_cells and adata.obs.loc[str(row['cell']), 'cell_shape'].contains( # type: ignore - Point(row['x'], row['y'])) + tensor = adata.uns["tensor"] + print(emoji.emojize(":running: Decomposing tensor...")) + factors, errors = decompose(tensor, ranks, iterations=iterations) -def adata_X_handle(adata: AnnData) -> None: - """ - Convert adata.X from sparse matrix to dense matrix - :param adata: anndata object - :return: None - """ - if isinstance(adata.X, csc_matrix): # in case of duplicate handling - adata.X = np.array(adata.X.todense()) + if plot_error and errors.shape[0] > 1: + kl = KneeLocator(errors["rank"], errors["rmse"], direction="decreasing", curve="convex") + if kl.knee is None: + warning('No knee found, please extend the ranks range.\nCurrent ranks range: [{ranks[0]},{ranks[-1]}]') + else: + info(f'Knee found at rank {kl.knee}') + sns.lineplot(data=errors, x="rank", y="rmse", ci=95, marker="o") # type: ignore + plt.axvline(kl.knee, linestyle="--") + plt.savefig(fname) + info(f"Saved to {fname}") + adata.uns["factors"] = factors + adata.uns["factors_error"] = errors -def adata_var_handle(adata: AnnData) -> None: - """ - Set gene as index of adata.var - :param adata: anndata object - :return: None - """ - if adata.var.shape[1] == 1: # in case of duplicate handling - adata.var.set_index('gene', inplace=True) + print(emoji.emojize(":heavy_check_mark: Done.")) + return adata if copy else None -def adata_obs_handle(adata: AnnData, - shape_names: Optional[List[str]] = None, - bounds: Optional[List[float]] = None) -> None: - """ - Handle adata.obs to make sure the data type is correct - :param adata: anndata object - :param shape_names: the names of shapes that need to handle - :param bounds: the bounds of the area (minx, miny, maxx, maxy) - :return: None - """ - # --- shapes need to handle --- - if shape_names is None: - shape_names = ['cell_shape', 'nucleus_shape'] # we may have other shapes (mitochondria etc) in the future - # --- create GEODataFrame --- - for shape in shape_names: - if shape not in adata.obs.columns: - seg_df = create_seg_df(vertices_df=adata.uns[shape]) - adata.obs[shape] = seg_df['geometry'] - del adata.uns[shape] - # --- remove cells outside the bounds --- - if bounds is not None: - area_shape = Polygon([(bounds[0], bounds[1]), (bounds[0], bounds[3]), (bounds[2], bounds[3]), - (bounds[2], bounds[1])]) - adata.uns['legal_cells'] = adata.obs['cell_shape'].apply(lambda x: area_shape.contains(x)) - else: - adata.uns['legal_cells'] = np.ones(adata.obs.shape[0], dtype=bool) - # --- add batch column --- - if 'batch' not in adata.obs.columns: - adata.obs['batch'] = 0 - - -def adata_uns_points_handle(adata: AnnData) -> None: - """ - Handle adata.uns['points'] to make sure the data type is correct - :param adata: anndata object - :return: None - """ - # --- correct the value of nucleus --- - if adata.uns['points']['nucleus'].min() == 0: # in case of duplicate handling - points = adata.uns['points'] - points['nucleus'] = points['cell'].values * points['nucleus'].values - points.loc[points['nucleus'] == 0, 'nucleus'] = -1 - # --- remove unused transcripts --- - legal_cells = adata.obs.index[adata.uns['legal_cells']] - legal_transcripts = adata.uns['points'].apply(valid_transcript, axis=1, args=(adata, legal_cells)) - adata.uns['points'] = adata.uns['points'][legal_transcripts] - # --- setup data type --- - dtypes = {'x': 'float', 'y': 'float', 'gene': 'str', 'cell': 'str', 'nucleus': 'str', 'batch': 'str'} - adata.uns['points'] = adata.uns['points'].astype(dtypes) - adata.uns['points']['gene'] = adata.uns['points']['gene'].astype('category') - adata.uns['points']['cell'] = adata.uns['points']['cell'].astype('category') - adata.uns['points']['nucleus'] = adata.uns['points']['nucleus'].astype('category') - adata.uns['points']['batch'] = adata.uns['points']['batch'].astype('category') - # --- add legal_genes in adata.uns --- - adata.uns['legal_genes'] = [x in adata.uns['points']['gene'].tolist() for x in adata.var.index] - - -def adata_handle(adata: AnnData, bounds: Optional[List[float]] = None) -> AnnData: - """ - Handle anndata object to make sure the data type is correct - Process: - 1. processing adata.X: format - 2. processing adata.var: format - 3. processing adata.obs - - create GEODataFrame & delete adata.uns['cell_shape'] and adata.uns['nucleus_shape'] - - remove cells outside the bounds & add legal_cells in adata.uns - - add batch column - 4. processing adata.uns['points'] - - correct the value of nucleus - - remove unused transcripts - - setup data type - - add legal_genes in adata.uns - 5. subset adata - :param adata: AnnData object - :return: Filtered Anndata object - """ - adata_X_handle(adata) - adata_var_handle(adata) - adata_obs_handle(adata, bounds=bounds) - adata_uns_points_handle(adata) +# --------------------------------- +# bento wrapper functions +# --------------------------------- + + +def analysis_shape_features(adata: AnnData, feature_names: Optional[List[str]] = None) -> None: + if feature_names is None: + feature_names = list(bt.tl.list_shape_features().keys()) + bt.tl.obs_stats(adata, feature_names=feature_names) + + +def plot_shape_features_analysis_results(adata: AnnData, fname: str): + bt.pl.shapes(adata, fname=fname) + + +def analysis_points_features(adata: AnnData, + shapes_names: Optional[List[str]] = None, + feature_names: Optional[List[str]] = None) -> None: + if shapes_names is None: + shapes_names = ["cell_shape", "nucleus_shape"] + if feature_names is None: + feature_names = list(bt.tl.list_point_features().keys()) + bt.tl.analyze_points(adata, shape_names=shapes_names, feature_names=feature_names, groupby='gene') + + +def plot_points_features_analysis_results(adata: AnnData, fname: str) -> None: + bt.pl.points(adata, fname=fname) + + +def analysis_rna_forest(adata: AnnData) -> None: + bt.tl.lp(adata) + bt.tl.lp_stats(adata) + + +def plot_rna_forest_analysis_results(adata: AnnData, fname1: str, fname2: str) -> None: + bt.pl.lp_genes(adata, fname=fname1) + bt.pl.lp_dist(adata, fname=fname2) + + +def analysis_colocalization(adata: AnnData, fname: str, ranks: Optional[List[int]] = None) -> None: + if ranks is None: + ranks = list(range(1, 6)) + + # Cytoplasm = cell - nucleus + adata.obs["cytoplasm_shape"] = bt.geo.get_shape(adata, "cell_shape") - bt.geo.get_shape(adata, "nucleus_shape") + + # Create point index + adata.uns["points"]["cytoplasm"] = (adata.uns["points"]["nucleus"].astype(int) < 0).astype(int) + + bt.tl.coloc_quotient(adata, shapes=["cytoplasm_shape", "nucleus_shape"]) + + colocation(adata, ranks=ranks, fname=fname) + - return adata[adata.uns['legal_cells'], adata.uns['legal_genes']] +def plot_colocalization_analysis_results(adata: AnnData, fname: str, rank: int) -> None: + bt.pl.colocation(adata, rank=rank, fname=fname) +def chekc_genes_number(adata: AnnData) -> None: + print(f'adata shape: {adata.shape}') + print(f'adata points genes: {len(adata.uns["points"]["gene"].unique())}') + print(f'adata cell_gene_features genes: {len(adata.uns["cell_gene_features"]["gene"].unique())}') + diff_set = set(adata.uns["points"]["gene"]) - set(adata.uns["cell_gene_features"]["gene"]) + print(diff_set) + for g in diff_set: + print(f'{g}') + print(adata.uns['points'][adata.uns['points']['gene'] == g]) From 5a94590cde406d97870029c358f8d617dfb59d0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wen=20Wang=20=28=E7=8E=8B=E6=96=87=29?= Date: Fri, 8 Sep 2023 12:27:15 -0400 Subject: [PATCH 03/10] Remove --- R/python_bento.R | 29 ----------------------------- 1 file changed, 29 deletions(-) delete mode 100644 R/python_bento.R diff --git a/R/python_bento.R b/R/python_bento.R deleted file mode 100644 index 8ef17c613..000000000 --- a/R/python_bento.R +++ /dev/null @@ -1,29 +0,0 @@ -#' @title Create bento adata object from gobject -#' @name createBentoAdata -#' @description Create bento adata object from gobject -#' @param gobject Giotto object -#' @return bento_adata bento adata object -#' @export -createBentoAdata <- function(gobject){ - # Transcripts - transcripts_df <- as.data.frame(sf::st_as_sf(gobject@feat_info$rna@spatVector)) - coordinates_df <- sf::st_coordinates(transcripts_df['geometry']) - t_df <- cbind(coordinates_df, transcripts_df[c("feat_ID")]) - colnames(t_df) <- c('x','y','gene') - t_df <- t_df[t_df$cell != '-1',] # Remove extracellular points, required by Bento - - # Cell shapes - # TODO: Add batch information based on? - cell_poly <- spatVector_to_dt(gobject@spatial_info$cell@spatVector) - cell_poly <- data.frame(cell_id = cell_poly$poly_ID, x = cell_poly$x, y = cell_poly$y, batch = 0L) - - # Nuclei shapes - # TODO: Add batch information based on? - nucleus_poly <- spatVector_to_dt(gobject@spatial_info$nucleus@spatVector) - nucleus_poly <- data.frame(cell_id = nucleus_poly$poly_ID, x = nucleus_poly$x, y = nucleus_poly$y, batch = 0L) - - # Create AnnData object - bento_adata <- Giotto::create_AnnData(trainscripts = t_df, cell_shape = cell_poly, nucleus_shape = nucleus_poly) - - return(bento_adata) -} From 038bff5bdf292ac0d4d455aea271e67387876b35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wen=20Wang=20=28=E7=8E=8B=E6=96=87=29?= Date: Fri, 8 Sep 2023 14:15:20 -0400 Subject: [PATCH 04/10] Add: wrapup createBentoAdata --- NAMESPACE | 1 + R/python_bento.R | 31 +++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) create mode 100644 R/python_bento.R diff --git a/NAMESPACE b/NAMESPACE index 93d1d4589..3c92cca1f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -86,6 +86,7 @@ export(comparePolygonExpression) export(convertEnsemblToGeneSymbol) export(convertGiottoLargeImageToMG) export(createArchRProj) +export(createBentoAdata) export(createCellMetaObj) export(createCrossSection) export(createDimObj) diff --git a/R/python_bento.R b/R/python_bento.R new file mode 100644 index 000000000..7ce06ff2c --- /dev/null +++ b/R/python_bento.R @@ -0,0 +1,31 @@ +#' @title Create bento adata object from gobject +#' @name createBentoAdata +#' @description Create bento adata object from gobject +#' @param gobject Giotto object +#' @return bento_adata bento adata object +#' @export +createBentoAdata <- function(gobject){ + # Transcripts + transcripts_df <- as.data.frame(sf::st_as_sf(gobject@feat_info$rna@spatVector)) + coordinates_df <- sf::st_coordinates(transcripts_df['geometry']) + t_df <- cbind(coordinates_df, transcripts_df[c("feat_ID")]) + colnames(t_df) <- c('x','y','gene') + t_df <- t_df[t_df$cell != '-1',] # Remove extracellular points, required by Bento + + # Cell shapes + # TODO: Add batch information based on? + cell_poly <- spatVector_to_dt(gobject@spatial_info$cell@spatVector) + cell_poly <- data.frame(cell_id = cell_poly$poly_ID, x = cell_poly$x, y = cell_poly$y, batch = 0L) + + # Nuclei shapes + # TODO: Add batch information based on? + nucleus_poly <- spatVector_to_dt(gobject@spatial_info$nucleus@spatVector) + nucleus_poly <- data.frame(cell_id = nucleus_poly$poly_ID, x = nucleus_poly$x, y = nucleus_poly$y, batch = 0L) + + # Create AnnData object + g2bento_path <- system.file("python","g2bento.py",package="Giotto") + reticulate::source_python(g2bento_path) + bento_adata <- create_AnnData(trainscripts=t_df, cell_shape=cell_poly, nucleus_shape=nucleus_poly) + + return(bento_adata) +} From 69347ad7a5305688849ace8cb923232a0b6c3c99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wen=20Wang=20=28=E7=8E=8B=E6=96=87=29?= Date: Fri, 8 Sep 2023 14:15:31 -0400 Subject: [PATCH 05/10] Add: install bento --- R/python_environment.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R/python_environment.R b/R/python_environment.R index 84a5a8af8..044a15066 100644 --- a/R/python_environment.R +++ b/R/python_environment.R @@ -194,6 +194,10 @@ install_giotto_environment_specific = function(packages_to_install = c('pandas', python_version = python_version) } + # reticulate don't support installation from github yet + # using system call instead + config <- reticulate::py_discover_config(use_environment='giotto_env') + system2(config$python, c("-m", "pip", "install", "git+https://github.com/wwang-chcn/bento-tools.git@giotto_install")) } From 89b84802c64a0713fd5035d199f0289c9ba6da73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wen=20Wang=20=28=E7=8E=8B=E6=96=87=29?= Date: Fri, 8 Sep 2023 15:17:43 -0400 Subject: [PATCH 06/10] Update: remove cell filter process --- inst/python/g2bento.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/inst/python/g2bento.py b/inst/python/g2bento.py index 783b33cf0..f5f570f5b 100644 --- a/inst/python/g2bento.py +++ b/inst/python/g2bento.py @@ -49,7 +49,7 @@ def add_batch(adata: AnnData, cell_shape: pd.DataFrame): adata.uns['points']['batch'] = adata.uns['points']['batch'].astype('category') -def create_AnnData(trainscripts, cell_shape, nucleus_shape, x_max = None, x_min = None, y_max = None, y_min = None) -> AnnData: +def create_AnnData(trainscripts, cell_shape, nucleus_shape) -> AnnData: # --- processing input --- trainscripts = pd.DataFrame(trainscripts) cell_shape = pd.DataFrame(cell_shape) @@ -66,13 +66,8 @@ def create_AnnData(trainscripts, cell_shape, nucleus_shape, x_max = None, x_min warning('cell_seg has more than 500 cells, processing may take a long time.') # --- filter cells --- - # Interim measures - # subsetGiottoLocs don't give wanted result, so we use a workaround here - if x_max is not None and x_min is not None and y_max is not None and y_min is not None: - area_shape = Polygon([(x_min, y_min), (x_max, y_min), (x_max, y_max), (x_min, y_max)]) - legal_cells = cell_seg['geometry'].apply(lambda x: area_shape.contains(x)) - else: - legal_cells = pd.Series([True] * cell_seg.shape[0]) + # Let Giotto perform the filtering + legal_cells = pd.Series([True] * cell_seg.shape[0]) # --- create AnnData --- adata: AnnData = bt.io.prepare(molecules=trainscripts, cell_seg=cell_seg, other_seg={'nucleus': nucleus_seg}) # type: ignore From e43de1657caf9e3a34c21ca7593c2febc1324999 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wen=20Wang=20=28=E7=8E=8B=E6=96=87=29?= Date: Fri, 8 Sep 2023 15:18:31 -0400 Subject: [PATCH 07/10] Update: comments --- inst/python/g2bento.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/inst/python/g2bento.py b/inst/python/g2bento.py index f5f570f5b..27b36d5f8 100644 --- a/inst/python/g2bento.py +++ b/inst/python/g2bento.py @@ -81,10 +81,11 @@ def create_AnnData(trainscripts, cell_shape, nucleus_shape) -> AnnData: filtered_adata.uns['points']['gene'] = filtered_adata.uns['points']['gene'].cat.remove_unused_categories() # Interim measures - # subsetted adata don't have _X property, which will cause unexpect error for adata.__sizeof__ + # subsetted adata (_is_view == True) don't have _X property, which will cause unexpect error for adata.__sizeof__ # when create object in R, reticulate will call sys.getsizeof to get the size of the object, which will call adata.__sizeof__ # https://github.com/rstudio/reticulate/issues/1332 # https://github.com/rstudio/rstudio/issues/13491 + # wait for AnnData to fix this issue filtered_adata._X = filtered_adata.X return filtered_adata From 8391c92d97aadd31772b8ea8430bd508431ba227 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wen=20Wang=20=28=E7=8E=8B=E6=96=87=29?= Date: Fri, 8 Sep 2023 16:18:56 -0400 Subject: [PATCH 08/10] Update: add python_bento.R --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index 8fcd0beb4..83c8479a5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -143,6 +143,7 @@ Collate: 'interactivity.R' 'interoperability.R' 'poly_influence.R' + 'python_bento.R' 'python_environment.R' 'python_hmrf.R' 'python_scrublet.R' From 0bd85f3503a281db557bb99aa14d41ba3fed1511 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wen=20Wang=20=28=E7=8E=8B=E6=96=87=29?= Date: Thu, 14 Sep 2023 15:39:48 -0400 Subject: [PATCH 09/10] Fix: typo --- R/python_bento.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/python_bento.R b/R/python_bento.R index 7ce06ff2c..702340d42 100644 --- a/R/python_bento.R +++ b/R/python_bento.R @@ -7,7 +7,7 @@ createBentoAdata <- function(gobject){ # Transcripts transcripts_df <- as.data.frame(sf::st_as_sf(gobject@feat_info$rna@spatVector)) - coordinates_df <- sf::st_coordinates(transcripts_df['geometry']) + coordinates_df <- lapply(transcripts_df['geometry'], sf::st_coordinates)$geometry t_df <- cbind(coordinates_df, transcripts_df[c("feat_ID")]) colnames(t_df) <- c('x','y','gene') t_df <- t_df[t_df$cell != '-1',] # Remove extracellular points, required by Bento From a178c63127611c3fcea6e89a9fc1d011e438ff1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wen=20Wang=20=28=E7=8E=8B=E6=96=87=29?= Date: Thu, 14 Sep 2023 16:06:18 -0400 Subject: [PATCH 10/10] Fix: createBentoAdata --- R/python_bento.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/python_bento.R b/R/python_bento.R index 702340d42..773b39d91 100644 --- a/R/python_bento.R +++ b/R/python_bento.R @@ -8,9 +8,8 @@ createBentoAdata <- function(gobject){ # Transcripts transcripts_df <- as.data.frame(sf::st_as_sf(gobject@feat_info$rna@spatVector)) coordinates_df <- lapply(transcripts_df['geometry'], sf::st_coordinates)$geometry - t_df <- cbind(coordinates_df, transcripts_df[c("feat_ID")]) + t_df <- as.data.frame(cbind(coordinates_df, transcripts_df[c("feat_ID")])) colnames(t_df) <- c('x','y','gene') - t_df <- t_df[t_df$cell != '-1',] # Remove extracellular points, required by Bento # Cell shapes # TODO: Add batch information based on?