From d9643f86e3db4918ba564639b2d8d38aa1246822 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wen=20Wang=20=28=E7=8E=8B=E6=96=87=29?=
 <wwang.bio@gmail.com>
Date: Wed, 30 Aug 2023 17:15:25 -0400
Subject: [PATCH 01/10] Create

---
 inst/python/g2bento.py               |  41 ++++++++
 inst/python/python_bento_analysis.py | 148 +++++++++++++++++++++++++++
 2 files changed, 189 insertions(+)
 create mode 100644 inst/python/g2bento.py
 create mode 100644 inst/python/python_bento_analysis.py

diff --git a/inst/python/g2bento.py b/inst/python/g2bento.py
new file mode 100644
index 000000000..2ab4d9ae4
--- /dev/null
+++ b/inst/python/g2bento.py
@@ -0,0 +1,41 @@
+from calendar import c
+import os
+
+import anndata as ad
+import bento as bt
+import geopandas as gpd
+import numpy as np
+import pandas as pd
+import scipy
+from anndata import AnnData
+from geopandas import GeoDataFrame
+from shapely.geometry import MultiPolygon, Point, Polygon
+
+
+def create_seg_df(vertices_df: pd.DataFrame, x: str = 'x', y: str = 'y', cell_id: str = 'cell_id') -> pd.DataFrame:
+    """
+    Create a dataframe with cell_id and geometry columns from a dataframe with vertices
+    :param vertices_df: a dataframe with columns: cell_id, x, y
+    :param x: the column name of x coordinates
+    :param y: the column name of y coordinates
+    :param cell_id: the column name of cell id
+    :param bounds: the bounds of the area (minx, miny, maxx, maxy)
+    :return: a dataframe with cell_id and geometry columns
+    """
+    # --- create polygons ---
+    polygons = vertices_df.groupby(cell_id).apply(lambda group: Polygon(zip(group[x], group[y])))  # type: ignore
+    seg_df = gpd.GeoDataFrame(polygons, columns=['geometry'])
+    # --- correct invalid polygons ---
+    corrected_seg_df = seg_df.copy(deep=True)
+    for i in range(seg_df.shape[0]):
+        if not seg_df.iloc[i, 0].is_valid:  # type: ignore
+            corrected_seg_df.iloc[i, 0] = seg_df.iloc[i, 0].buffer(0)  # type: ignore
+        if isinstance(corrected_seg_df.iloc[i, 0], MultiPolygon):
+            corrected_seg_df.iloc[i, 0] = max(corrected_seg_df.iloc[i, 0], key=lambda x: x.area)  # type: ignore
+    return corrected_seg_df
+
+
+def create_AnnData(trainscripts: pd.DataFrame, cell_shape: GeoDataFrame, nucleus_shape) -> AnnData:
+    cell_seg = create_seg_df(cell_shape, x='x', y='y', cell_id='cell')
+    nucleus_seg = create_seg_df(nucleus_shape, x='x', y='y', cell_id='nucleus')
+    return bt.io.prepare(molecules=trainscripts, cell_seg=cell_seg, other_seg={'nucleus': nucleus_seg})  # type: ignore
diff --git a/inst/python/python_bento_analysis.py b/inst/python/python_bento_analysis.py
new file mode 100644
index 000000000..697af368a
--- /dev/null
+++ b/inst/python/python_bento_analysis.py
@@ -0,0 +1,148 @@
+from typing import List, Optional
+
+import geopandas as gpd
+import numpy as np
+import pandas as pd
+from anndata import AnnData
+from scipy.sparse import csc_matrix
+from shapely.geometry import MultiPolygon, Point, Polygon
+
+
+def create_seg_df(vertices_df: pd.DataFrame, x: str = 'x', y: str = 'y', cell_id: str = 'cell_id') -> pd.DataFrame:
+    """
+    Create a dataframe with cell_id and geometry columns from a dataframe with vertices
+    :param vertices_df: a dataframe with columns: cell_id, x, y
+    :param x: the column name of x coordinates
+    :param y: the column name of y coordinates
+    :param cell_id: the column name of cell id
+    :param bounds: the bounds of the area (minx, miny, maxx, maxy)
+    :return: a dataframe with cell_id and geometry columns
+    """
+    # --- create polygons ---
+    polygons = vertices_df.groupby(cell_id).apply(lambda group: Polygon(zip(group[x], group[y])))  # type: ignore
+    seg_df = gpd.GeoDataFrame(polygons, columns=['geometry'])
+    # --- correct invalid polygons ---
+    corrected_seg_df = seg_df.copy(deep=True)
+    for i in range(seg_df.shape[0]):
+        if not seg_df.iloc[i, 0].is_valid:  # type: ignore
+            corrected_seg_df.iloc[i, 0] = seg_df.iloc[i, 0].buffer(0)  # type: ignore
+        if isinstance(corrected_seg_df.iloc[i, 0], MultiPolygon):
+            corrected_seg_df.iloc[i, 0] = max(corrected_seg_df.iloc[i, 0], key=lambda x: x.area)  # type: ignore
+    return corrected_seg_df
+
+
+def valid_transcript(row, adata: AnnData, legal_cells: List[str]) -> bool:
+    """
+    Check if the transcript is valid
+    :param adata: anndata object
+    :param row: a row of adata.uns['points']
+    :return: True if the transcript is valid
+    """
+    return str(row['cell']) in legal_cells and adata.obs.loc[str(row['cell']), 'cell_shape'].contains(  # type: ignore
+        Point(row['x'], row['y']))
+
+
+def adata_X_handle(adata: AnnData) -> None:
+    """
+    Convert adata.X from sparse matrix to dense matrix
+    :param adata: anndata object
+    :return: None
+    """
+    if isinstance(adata.X, csc_matrix):  # in case of duplicate handling
+        adata.X = np.array(adata.X.todense())
+
+
+def adata_var_handle(adata: AnnData) -> None:
+    """
+    Set gene as index of adata.var
+    :param adata: anndata object
+    :return: None
+    """
+    if adata.var.shape[1] == 1:  # in case of duplicate handling
+        adata.var.set_index('gene', inplace=True)
+
+
+def adata_obs_handle(adata: AnnData,
+                     shape_names: Optional[List[str]] = None,
+                     bounds: Optional[List[float]] = None) -> None:
+    """
+    Handle adata.obs to make sure the data type is correct
+    :param adata: anndata object
+    :param shape_names: the names of shapes that need to handle
+    :param bounds: the bounds of the area (minx, miny, maxx, maxy)
+    :return: None
+    """
+    # --- shapes need to handle ---
+    if shape_names is None:
+        shape_names = ['cell_shape', 'nucleus_shape']  # we may have other shapes (mitochondria etc) in the future
+    # --- create GEODataFrame ---
+    for shape in shape_names:
+        if shape not in adata.obs.columns:
+            seg_df = create_seg_df(vertices_df=adata.uns[shape])
+            adata.obs[shape] = seg_df['geometry']
+            del adata.uns[shape]
+    # --- remove cells outside the bounds ---
+    if bounds is not None:
+        area_shape = Polygon([(bounds[0], bounds[1]), (bounds[0], bounds[3]), (bounds[2], bounds[3]),
+                              (bounds[2], bounds[1])])
+        adata.uns['legal_cells'] = adata.obs['cell_shape'].apply(lambda x: area_shape.contains(x))
+    else:
+        adata.uns['legal_cells'] = np.ones(adata.obs.shape[0], dtype=bool)
+    # --- add batch column ---
+    if 'batch' not in adata.obs.columns:
+        adata.obs['batch'] = 0
+
+
+def adata_uns_points_handle(adata: AnnData) -> None:
+    """
+    Handle adata.uns['points'] to make sure the data type is correct
+    :param adata: anndata object
+    :return: None
+    """
+    # --- correct the value of nucleus ---
+    if adata.uns['points']['nucleus'].min() == 0:  # in case of duplicate handling
+        points = adata.uns['points']
+        points['nucleus'] = points['cell'].values * points['nucleus'].values
+        points.loc[points['nucleus'] == 0, 'nucleus'] = -1
+    # --- remove unused transcripts ---
+    legal_cells = adata.obs.index[adata.uns['legal_cells']]
+    legal_transcripts = adata.uns['points'].apply(valid_transcript, axis=1, args=(adata, legal_cells))
+    adata.uns['points'] = adata.uns['points'][legal_transcripts]
+    # --- setup data type ---
+    dtypes = {'x': 'float', 'y': 'float', 'gene': 'str', 'cell': 'str', 'nucleus': 'str', 'batch': 'str'}
+    adata.uns['points'] = adata.uns['points'].astype(dtypes)
+    adata.uns['points']['gene'] = adata.uns['points']['gene'].astype('category')
+    adata.uns['points']['cell'] = adata.uns['points']['cell'].astype('category')
+    adata.uns['points']['nucleus'] = adata.uns['points']['nucleus'].astype('category')
+    adata.uns['points']['batch'] = adata.uns['points']['batch'].astype('category')
+    # --- add legal_genes in adata.uns ---
+    adata.uns['legal_genes'] = [x in adata.uns['points']['gene'].tolist() for x in adata.var.index]
+
+
+def adata_handle(adata: AnnData, bounds: Optional[List[float]] = None) -> AnnData:
+    """
+    Handle anndata object to make sure the data type is correct
+    Process:
+        1. processing adata.X: format
+        2. processing adata.var: format
+        3. processing adata.obs
+            - create GEODataFrame & delete adata.uns['cell_shape'] and adata.uns['nucleus_shape']
+            - remove cells outside the bounds & add legal_cells in adata.uns
+            - add batch column
+        4. processing adata.uns['points']
+            - correct the value of nucleus
+            - remove unused transcripts
+            - setup data type
+            - add legal_genes in adata.uns
+        5. subset adata
+    :param adata: AnnData object
+    :return: Filtered Anndata object
+    """
+    adata_X_handle(adata)
+    adata_var_handle(adata)
+    adata_obs_handle(adata, bounds=bounds)
+    adata_uns_points_handle(adata)
+
+    return adata[adata.uns['legal_cells'], adata.uns['legal_genes']]
+
+

From a0bf628b6bdb8cca118c5fbb00a7080752231e0d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wen=20Wang=20=28=E7=8E=8B=E6=96=87=29?=
 <wwang.bio@gmail.com>
Date: Fri, 8 Sep 2023 10:45:55 -0400
Subject: [PATCH 02/10] Ready for install

---
 R/python_bento.R                     |  29 +++
 inst/python/g2bento.py               |  79 ++++++--
 inst/python/log.py                   |  41 +++++
 inst/python/python_bento_analysis.py | 265 +++++++++++++--------------
 4 files changed, 267 insertions(+), 147 deletions(-)
 create mode 100644 R/python_bento.R
 create mode 100644 inst/python/log.py

diff --git a/R/python_bento.R b/R/python_bento.R
new file mode 100644
index 000000000..8ef17c613
--- /dev/null
+++ b/R/python_bento.R
@@ -0,0 +1,29 @@
+#' @title Create bento adata object from gobject
+#' @name createBentoAdata
+#' @description Create bento adata object from gobject
+#' @param gobject Giotto object
+#' @return bento_adata bento adata object
+#' @export
+createBentoAdata <- function(gobject){
+  # Transcripts
+  transcripts_df <- as.data.frame(sf::st_as_sf(gobject@feat_info$rna@spatVector))
+  coordinates_df <- sf::st_coordinates(transcripts_df['geometry'])
+  t_df <- cbind(coordinates_df, transcripts_df[c("feat_ID")])
+  colnames(t_df) <- c('x','y','gene')
+  t_df <- t_df[t_df$cell != '-1',]  # Remove extracellular points, required by Bento
+
+  # Cell shapes
+  # TODO: Add batch information based on?
+  cell_poly <- spatVector_to_dt(gobject@spatial_info$cell@spatVector)
+  cell_poly <- data.frame(cell_id = cell_poly$poly_ID, x = cell_poly$x, y = cell_poly$y, batch = 0L)
+
+  # Nuclei shapes
+  # TODO: Add batch information based on?
+  nucleus_poly <- spatVector_to_dt(gobject@spatial_info$nucleus@spatVector)
+  nucleus_poly <- data.frame(cell_id = nucleus_poly$poly_ID, x = nucleus_poly$x, y = nucleus_poly$y, batch = 0L)
+
+  # Create AnnData object
+  bento_adata <- Giotto::create_AnnData(trainscripts = t_df, cell_shape = cell_poly, nucleus_shape = nucleus_poly)
+  
+  return(bento_adata)
+}
diff --git a/inst/python/g2bento.py b/inst/python/g2bento.py
index 2ab4d9ae4..783b33cf0 100644
--- a/inst/python/g2bento.py
+++ b/inst/python/g2bento.py
@@ -1,16 +1,10 @@
-from calendar import c
-import os
-
-import anndata as ad
 import bento as bt
 import geopandas as gpd
-import numpy as np
 import pandas as pd
-import scipy
 from anndata import AnnData
-from geopandas import GeoDataFrame
-from shapely.geometry import MultiPolygon, Point, Polygon
+from shapely.geometry import MultiPolygon, Polygon
 
+from log import debug, warning, info
 
 def create_seg_df(vertices_df: pd.DataFrame, x: str = 'x', y: str = 'y', cell_id: str = 'cell_id') -> pd.DataFrame:
     """
@@ -35,7 +29,68 @@ def create_seg_df(vertices_df: pd.DataFrame, x: str = 'x', y: str = 'y', cell_id
     return corrected_seg_df
 
 
-def create_AnnData(trainscripts: pd.DataFrame, cell_shape: GeoDataFrame, nucleus_shape) -> AnnData:
-    cell_seg = create_seg_df(cell_shape, x='x', y='y', cell_id='cell')
-    nucleus_seg = create_seg_df(nucleus_shape, x='x', y='y', cell_id='nucleus')
-    return bt.io.prepare(molecules=trainscripts, cell_seg=cell_seg, other_seg={'nucleus': nucleus_seg})  # type: ignore
+def add_batch(adata: AnnData, cell_shape: pd.DataFrame):
+    """
+    Add batch information to an AnnData object
+        If cell_seg have batch information, add batch information to adata, else all batch will be set to 0
+    :param adata: an AnnData object
+    :param cell_seg: the name of the cell segmentation
+    :return: an AnnData object with batch information
+    """
+    if 'batch' in cell_shape.columns:
+        info('Batch information found in cell_shape, adding batch information to adata')
+        adata.obs['batch'] = [cell_shape.loc[cell_shape['cell_id']==cell,'batch'].values[0] for cell in adata.obs_names] # type: ignore
+        adata.uns['points']['batch'] = [cell_shape.loc[cell_shape['cell_id']==cell,'batch'].values[0] for cell in adata.uns['points']['cell']] # type: ignore
+    else:  # Interim measures, batch information may not transfered to cell_shape
+        warning('Batch information not found in cell_shape, all batch will be set to 0')
+        adata.obs['batch'] = 0
+        adata.uns['points']['batch'] = 0
+    adata.obs['batch'] = adata.obs['batch'].astype('category')
+    adata.uns['points']['batch'] = adata.uns['points']['batch'].astype('category')
+
+
+def create_AnnData(trainscripts, cell_shape, nucleus_shape, x_max = None, x_min = None, y_max = None, y_min = None) -> AnnData:
+    # --- processing input ---
+    trainscripts = pd.DataFrame(trainscripts)
+    cell_shape = pd.DataFrame(cell_shape)
+    cell_shape['cell_id'] = cell_shape['cell_id'].astype('category')
+    if 'batch' in cell_shape.columns:
+        cell_shape['batch'] = cell_shape['batch'].astype('category')
+    nucleus_shape = pd.DataFrame(nucleus_shape)
+    nucleus_shape['cell_id'] = nucleus_shape['cell_id'].astype('category')
+
+    # --- create shape ---
+    cell_seg = create_seg_df(cell_shape, x='x', y='y', cell_id='cell_id')
+    nucleus_seg = create_seg_df(nucleus_shape, x='x', y='y', cell_id='cell_id')
+    if cell_seg.shape[0] > 500:
+        warning('cell_seg has more than 500 cells, processing may take a long time.')
+
+    # --- filter cells ---
+    # Interim measures
+    # subsetGiottoLocs don't give wanted result, so we use a workaround here
+    if x_max is not None and x_min is not None and y_max is not None and y_min is not None:
+        area_shape = Polygon([(x_min, y_min), (x_max, y_min), (x_max, y_max), (x_min, y_max)])
+        legal_cells = cell_seg['geometry'].apply(lambda x: area_shape.contains(x))
+    else:
+        legal_cells = pd.Series([True] * cell_seg.shape[0])
+
+    # --- create AnnData ---
+    adata: AnnData = bt.io.prepare(molecules=trainscripts, cell_seg=cell_seg, other_seg={'nucleus': nucleus_seg})  # type: ignore
+    add_batch(adata, cell_shape)
+
+    # --- filter genes ---
+    # Interim measures
+    # subsetGiottoLocs don't give wanted result, so we use a workaround here
+    legal_genes = adata.var_names.isin(set(adata.uns['points']['gene'].values))
+    filtered_adata = adata[legal_cells,legal_genes] # type: ignore
+    filtered_adata.uns['points']['gene'] = filtered_adata.uns['points']['gene'].cat.remove_unused_categories()
+
+    # Interim measures
+    # subsetted adata don't have _X property, which will cause unexpect error for adata.__sizeof__
+    # when create object in R, reticulate will call sys.getsizeof to get the size of the object, which will call adata.__sizeof__
+    # https://github.com/rstudio/reticulate/issues/1332
+    # https://github.com/rstudio/rstudio/issues/13491
+    filtered_adata._X = filtered_adata.X 
+    return filtered_adata
+
+    # return adata
diff --git a/inst/python/log.py b/inst/python/log.py
new file mode 100644
index 000000000..e6c3f2d1c
--- /dev/null
+++ b/inst/python/log.py
@@ -0,0 +1,41 @@
+import sys
+import time
+
+
+def get_current_time() -> str:
+    return time.strftime('%H:%M:%S', time.localtime())
+
+
+def write_direct_message(message: str):
+    curr_time_str = get_current_time()
+    sys.stdout.write(f'{curr_time_str} --- {message}\n')
+    sys.stdout.flush()
+
+
+def debug(message: str):
+    write_direct_message(f'DEBUG: {message}')
+
+
+def info(message: str):
+    write_direct_message(f'INFO: {message}')
+
+
+def write_direct_message_err(message: str):
+    curr_time_str = get_current_time()
+    sys.stderr.write(f'{curr_time_str} --- {message}\n')
+    sys.stderr.flush()
+
+
+def warning(message: str):
+    write_direct_message_err(f'WARNING: {message}')
+
+
+def error(message: str):
+    write_direct_message_err(f'ERROR: {message}')
+
+
+def critical(message: str):
+    write_direct_message_err(f'CRITICAL: {message}')
+
+
+__all__ = ['debug', 'info', 'warning', 'error', 'critical']
diff --git a/inst/python/python_bento_analysis.py b/inst/python/python_bento_analysis.py
index 697af368a..45cf0e055 100644
--- a/inst/python/python_bento_analysis.py
+++ b/inst/python/python_bento_analysis.py
@@ -1,148 +1,143 @@
 from typing import List, Optional
-
-import geopandas as gpd
-import numpy as np
-import pandas as pd
+import bento as bt
 from anndata import AnnData
-from scipy.sparse import csc_matrix
-from shapely.geometry import MultiPolygon, Point, Polygon
+import emoji
+from bento._utils import track
+from bento.tools._colocation import _colocation_tensor
+from bento.tools import decompose
+from kneed import KneeLocator
+
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+from log import warning, info
+
+
+# ---------------------------------
+# modified bento and dependencies functions/classes
+# ---------------------------------
+@track
+def colocation(
+    data: AnnData,
+    ranks: List[int],
+    fname: str,
+    iterations: int = 3,
+    plot_error: bool = True,
+    copy: bool = False,
+):
+    """Decompose a tensor of pairwise colocalization quotients into signatures.
+
+    Parameters
+    ----------
+    adata : AnnData
+        Spatial formatted AnnData object.
+    ranks : list
+        List of ranks to decompose the tensor.
+    iterations : int
+        Number of iterations to run the decomposition.
+    plot_error : bool
+        Whether to plot the error of the decomposition.
+    copy : bool
+        Whether to return a copy of the AnnData object. Default False.
+    Returns
+    -------
+    adata : AnnData
+        .uns['factors']: Decomposed tensor factors.
+        .uns['factors_error']: Decomposition error.
+    """
+    adata = data.copy() if copy else data
 
+    print("Preparing tensor...")
+    _colocation_tensor(adata, copy=copy)
 
-def create_seg_df(vertices_df: pd.DataFrame, x: str = 'x', y: str = 'y', cell_id: str = 'cell_id') -> pd.DataFrame:
-    """
-    Create a dataframe with cell_id and geometry columns from a dataframe with vertices
-    :param vertices_df: a dataframe with columns: cell_id, x, y
-    :param x: the column name of x coordinates
-    :param y: the column name of y coordinates
-    :param cell_id: the column name of cell id
-    :param bounds: the bounds of the area (minx, miny, maxx, maxy)
-    :return: a dataframe with cell_id and geometry columns
-    """
-    # --- create polygons ---
-    polygons = vertices_df.groupby(cell_id).apply(lambda group: Polygon(zip(group[x], group[y])))  # type: ignore
-    seg_df = gpd.GeoDataFrame(polygons, columns=['geometry'])
-    # --- correct invalid polygons ---
-    corrected_seg_df = seg_df.copy(deep=True)
-    for i in range(seg_df.shape[0]):
-        if not seg_df.iloc[i, 0].is_valid:  # type: ignore
-            corrected_seg_df.iloc[i, 0] = seg_df.iloc[i, 0].buffer(0)  # type: ignore
-        if isinstance(corrected_seg_df.iloc[i, 0], MultiPolygon):
-            corrected_seg_df.iloc[i, 0] = max(corrected_seg_df.iloc[i, 0], key=lambda x: x.area)  # type: ignore
-    return corrected_seg_df
-
-
-def valid_transcript(row, adata: AnnData, legal_cells: List[str]) -> bool:
-    """
-    Check if the transcript is valid
-    :param adata: anndata object
-    :param row: a row of adata.uns['points']
-    :return: True if the transcript is valid
-    """
-    return str(row['cell']) in legal_cells and adata.obs.loc[str(row['cell']), 'cell_shape'].contains(  # type: ignore
-        Point(row['x'], row['y']))
+    tensor = adata.uns["tensor"]
 
+    print(emoji.emojize(":running: Decomposing tensor..."))
+    factors, errors = decompose(tensor, ranks, iterations=iterations)
 
-def adata_X_handle(adata: AnnData) -> None:
-    """
-    Convert adata.X from sparse matrix to dense matrix
-    :param adata: anndata object
-    :return: None
-    """
-    if isinstance(adata.X, csc_matrix):  # in case of duplicate handling
-        adata.X = np.array(adata.X.todense())
+    if plot_error and errors.shape[0] > 1:
+        kl = KneeLocator(errors["rank"], errors["rmse"], direction="decreasing", curve="convex")
+        if kl.knee is None:
+            warning('No knee found, please extend the ranks range.\nCurrent ranks range: [{ranks[0]},{ranks[-1]}]')
+        else:
+            info(f'Knee found at rank {kl.knee}')
+            sns.lineplot(data=errors, x="rank", y="rmse", ci=95, marker="o")  # type: ignore
+            plt.axvline(kl.knee, linestyle="--")
+            plt.savefig(fname)
+            info(f"Saved to {fname}")
 
+    adata.uns["factors"] = factors
+    adata.uns["factors_error"] = errors
 
-def adata_var_handle(adata: AnnData) -> None:
-    """
-    Set gene as index of adata.var
-    :param adata: anndata object
-    :return: None
-    """
-    if adata.var.shape[1] == 1:  # in case of duplicate handling
-        adata.var.set_index('gene', inplace=True)
+    print(emoji.emojize(":heavy_check_mark: Done."))
+    return adata if copy else None
 
 
-def adata_obs_handle(adata: AnnData,
-                     shape_names: Optional[List[str]] = None,
-                     bounds: Optional[List[float]] = None) -> None:
-    """
-    Handle adata.obs to make sure the data type is correct
-    :param adata: anndata object
-    :param shape_names: the names of shapes that need to handle
-    :param bounds: the bounds of the area (minx, miny, maxx, maxy)
-    :return: None
-    """
-    # --- shapes need to handle ---
-    if shape_names is None:
-        shape_names = ['cell_shape', 'nucleus_shape']  # we may have other shapes (mitochondria etc) in the future
-    # --- create GEODataFrame ---
-    for shape in shape_names:
-        if shape not in adata.obs.columns:
-            seg_df = create_seg_df(vertices_df=adata.uns[shape])
-            adata.obs[shape] = seg_df['geometry']
-            del adata.uns[shape]
-    # --- remove cells outside the bounds ---
-    if bounds is not None:
-        area_shape = Polygon([(bounds[0], bounds[1]), (bounds[0], bounds[3]), (bounds[2], bounds[3]),
-                              (bounds[2], bounds[1])])
-        adata.uns['legal_cells'] = adata.obs['cell_shape'].apply(lambda x: area_shape.contains(x))
-    else:
-        adata.uns['legal_cells'] = np.ones(adata.obs.shape[0], dtype=bool)
-    # --- add batch column ---
-    if 'batch' not in adata.obs.columns:
-        adata.obs['batch'] = 0
-
-
-def adata_uns_points_handle(adata: AnnData) -> None:
-    """
-    Handle adata.uns['points'] to make sure the data type is correct
-    :param adata: anndata object
-    :return: None
-    """
-    # --- correct the value of nucleus ---
-    if adata.uns['points']['nucleus'].min() == 0:  # in case of duplicate handling
-        points = adata.uns['points']
-        points['nucleus'] = points['cell'].values * points['nucleus'].values
-        points.loc[points['nucleus'] == 0, 'nucleus'] = -1
-    # --- remove unused transcripts ---
-    legal_cells = adata.obs.index[adata.uns['legal_cells']]
-    legal_transcripts = adata.uns['points'].apply(valid_transcript, axis=1, args=(adata, legal_cells))
-    adata.uns['points'] = adata.uns['points'][legal_transcripts]
-    # --- setup data type ---
-    dtypes = {'x': 'float', 'y': 'float', 'gene': 'str', 'cell': 'str', 'nucleus': 'str', 'batch': 'str'}
-    adata.uns['points'] = adata.uns['points'].astype(dtypes)
-    adata.uns['points']['gene'] = adata.uns['points']['gene'].astype('category')
-    adata.uns['points']['cell'] = adata.uns['points']['cell'].astype('category')
-    adata.uns['points']['nucleus'] = adata.uns['points']['nucleus'].astype('category')
-    adata.uns['points']['batch'] = adata.uns['points']['batch'].astype('category')
-    # --- add legal_genes in adata.uns ---
-    adata.uns['legal_genes'] = [x in adata.uns['points']['gene'].tolist() for x in adata.var.index]
-
-
-def adata_handle(adata: AnnData, bounds: Optional[List[float]] = None) -> AnnData:
-    """
-    Handle anndata object to make sure the data type is correct
-    Process:
-        1. processing adata.X: format
-        2. processing adata.var: format
-        3. processing adata.obs
-            - create GEODataFrame & delete adata.uns['cell_shape'] and adata.uns['nucleus_shape']
-            - remove cells outside the bounds & add legal_cells in adata.uns
-            - add batch column
-        4. processing adata.uns['points']
-            - correct the value of nucleus
-            - remove unused transcripts
-            - setup data type
-            - add legal_genes in adata.uns
-        5. subset adata
-    :param adata: AnnData object
-    :return: Filtered Anndata object
-    """
-    adata_X_handle(adata)
-    adata_var_handle(adata)
-    adata_obs_handle(adata, bounds=bounds)
-    adata_uns_points_handle(adata)
+# ---------------------------------
+# bento wrapper functions
+# ---------------------------------
+
+
+def analysis_shape_features(adata: AnnData, feature_names: Optional[List[str]] = None) -> None:
+    if feature_names is None:
+        feature_names = list(bt.tl.list_shape_features().keys())
+    bt.tl.obs_stats(adata, feature_names=feature_names)
+
+
+def plot_shape_features_analysis_results(adata: AnnData, fname: str):
+    bt.pl.shapes(adata, fname=fname)
+
+
+def analysis_points_features(adata: AnnData,
+                             shapes_names: Optional[List[str]] = None,
+                             feature_names: Optional[List[str]] = None) -> None:
+    if shapes_names is None:
+        shapes_names = ["cell_shape", "nucleus_shape"]
+    if feature_names is None:
+        feature_names = list(bt.tl.list_point_features().keys())
+    bt.tl.analyze_points(adata, shape_names=shapes_names, feature_names=feature_names, groupby='gene')
+
+
+def plot_points_features_analysis_results(adata: AnnData, fname: str) -> None:
+    bt.pl.points(adata, fname=fname)
+
+
+def analysis_rna_forest(adata: AnnData) -> None:
+    bt.tl.lp(adata)
+    bt.tl.lp_stats(adata)
+
+
+def plot_rna_forest_analysis_results(adata: AnnData, fname1: str, fname2: str) -> None:
+    bt.pl.lp_genes(adata, fname=fname1)
+    bt.pl.lp_dist(adata, fname=fname2)
+
+
+def analysis_colocalization(adata: AnnData, fname: str, ranks: Optional[List[int]] = None) -> None:
+    if ranks is None:
+        ranks = list(range(1, 6))
+
+    # Cytoplasm = cell - nucleus
+    adata.obs["cytoplasm_shape"] = bt.geo.get_shape(adata, "cell_shape") - bt.geo.get_shape(adata, "nucleus_shape")
+
+    # Create point index
+    adata.uns["points"]["cytoplasm"] = (adata.uns["points"]["nucleus"].astype(int) < 0).astype(int)
+
+    bt.tl.coloc_quotient(adata, shapes=["cytoplasm_shape", "nucleus_shape"])
+
+    colocation(adata, ranks=ranks, fname=fname)
+
 
-    return adata[adata.uns['legal_cells'], adata.uns['legal_genes']]
+def plot_colocalization_analysis_results(adata: AnnData, fname: str, rank: int) -> None:
+    bt.pl.colocation(adata, rank=rank, fname=fname)
 
 
+def chekc_genes_number(adata: AnnData) -> None:
+    print(f'adata shape: {adata.shape}')
+    print(f'adata points genes: {len(adata.uns["points"]["gene"].unique())}')
+    print(f'adata cell_gene_features genes: {len(adata.uns["cell_gene_features"]["gene"].unique())}')
+    diff_set = set(adata.uns["points"]["gene"]) - set(adata.uns["cell_gene_features"]["gene"])
+    print(diff_set)
+    for g in diff_set:
+        print(f'{g}')
+        print(adata.uns['points'][adata.uns['points']['gene'] == g])

From 5a94590cde406d97870029c358f8d617dfb59d0a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wen=20Wang=20=28=E7=8E=8B=E6=96=87=29?=
 <wwang.bio@gmail.com>
Date: Fri, 8 Sep 2023 12:27:15 -0400
Subject: [PATCH 03/10] Remove

---
 R/python_bento.R | 29 -----------------------------
 1 file changed, 29 deletions(-)
 delete mode 100644 R/python_bento.R

diff --git a/R/python_bento.R b/R/python_bento.R
deleted file mode 100644
index 8ef17c613..000000000
--- a/R/python_bento.R
+++ /dev/null
@@ -1,29 +0,0 @@
-#' @title Create bento adata object from gobject
-#' @name createBentoAdata
-#' @description Create bento adata object from gobject
-#' @param gobject Giotto object
-#' @return bento_adata bento adata object
-#' @export
-createBentoAdata <- function(gobject){
-  # Transcripts
-  transcripts_df <- as.data.frame(sf::st_as_sf(gobject@feat_info$rna@spatVector))
-  coordinates_df <- sf::st_coordinates(transcripts_df['geometry'])
-  t_df <- cbind(coordinates_df, transcripts_df[c("feat_ID")])
-  colnames(t_df) <- c('x','y','gene')
-  t_df <- t_df[t_df$cell != '-1',]  # Remove extracellular points, required by Bento
-
-  # Cell shapes
-  # TODO: Add batch information based on?
-  cell_poly <- spatVector_to_dt(gobject@spatial_info$cell@spatVector)
-  cell_poly <- data.frame(cell_id = cell_poly$poly_ID, x = cell_poly$x, y = cell_poly$y, batch = 0L)
-
-  # Nuclei shapes
-  # TODO: Add batch information based on?
-  nucleus_poly <- spatVector_to_dt(gobject@spatial_info$nucleus@spatVector)
-  nucleus_poly <- data.frame(cell_id = nucleus_poly$poly_ID, x = nucleus_poly$x, y = nucleus_poly$y, batch = 0L)
-
-  # Create AnnData object
-  bento_adata <- Giotto::create_AnnData(trainscripts = t_df, cell_shape = cell_poly, nucleus_shape = nucleus_poly)
-  
-  return(bento_adata)
-}

From 038bff5bdf292ac0d4d455aea271e67387876b35 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wen=20Wang=20=28=E7=8E=8B=E6=96=87=29?=
 <wwang.bio@gmail.com>
Date: Fri, 8 Sep 2023 14:15:20 -0400
Subject: [PATCH 04/10] Add: wrapup createBentoAdata

---
 NAMESPACE        |  1 +
 R/python_bento.R | 31 +++++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+)
 create mode 100644 R/python_bento.R

diff --git a/NAMESPACE b/NAMESPACE
index 93d1d4589..3c92cca1f 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -86,6 +86,7 @@ export(comparePolygonExpression)
 export(convertEnsemblToGeneSymbol)
 export(convertGiottoLargeImageToMG)
 export(createArchRProj)
+export(createBentoAdata)
 export(createCellMetaObj)
 export(createCrossSection)
 export(createDimObj)
diff --git a/R/python_bento.R b/R/python_bento.R
new file mode 100644
index 000000000..7ce06ff2c
--- /dev/null
+++ b/R/python_bento.R
@@ -0,0 +1,31 @@
+#' @title Create bento adata object from gobject
+#' @name createBentoAdata
+#' @description Create bento adata object from gobject
+#' @param gobject Giotto object
+#' @return bento_adata bento adata object
+#' @export
+createBentoAdata <- function(gobject){
+  # Transcripts
+  transcripts_df <- as.data.frame(sf::st_as_sf(gobject@feat_info$rna@spatVector))
+  coordinates_df <- sf::st_coordinates(transcripts_df['geometry'])
+  t_df <- cbind(coordinates_df, transcripts_df[c("feat_ID")])
+  colnames(t_df) <- c('x','y','gene')
+  t_df <- t_df[t_df$cell != '-1',]  # Remove extracellular points, required by Bento
+
+  # Cell shapes
+  # TODO: Add batch information based on?
+  cell_poly <- spatVector_to_dt(gobject@spatial_info$cell@spatVector)
+  cell_poly <- data.frame(cell_id = cell_poly$poly_ID, x = cell_poly$x, y = cell_poly$y, batch = 0L)
+
+  # Nuclei shapes
+  # TODO: Add batch information based on?
+  nucleus_poly <- spatVector_to_dt(gobject@spatial_info$nucleus@spatVector)
+  nucleus_poly <- data.frame(cell_id = nucleus_poly$poly_ID, x = nucleus_poly$x, y = nucleus_poly$y, batch = 0L)
+
+  # Create AnnData object
+  g2bento_path <- system.file("python","g2bento.py",package="Giotto")
+  reticulate::source_python(g2bento_path)
+  bento_adata <- create_AnnData(trainscripts=t_df, cell_shape=cell_poly, nucleus_shape=nucleus_poly)
+  
+  return(bento_adata)
+}

From 69347ad7a5305688849ace8cb923232a0b6c3c99 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wen=20Wang=20=28=E7=8E=8B=E6=96=87=29?=
 <wwang.bio@gmail.com>
Date: Fri, 8 Sep 2023 14:15:31 -0400
Subject: [PATCH 05/10] Add: install bento

---
 R/python_environment.R | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/R/python_environment.R b/R/python_environment.R
index 84a5a8af8..044a15066 100644
--- a/R/python_environment.R
+++ b/R/python_environment.R
@@ -194,6 +194,10 @@ install_giotto_environment_specific = function(packages_to_install = c('pandas',
                            python_version = python_version)
   }
 
+  # reticulate don't support installation from github yet
+  # using system call instead
+  config <- reticulate::py_discover_config(use_environment='giotto_env')
+  system2(config$python, c("-m", "pip", "install", "git+https://github.com/wwang-chcn/bento-tools.git@giotto_install"))
 
 }
 

From 89b84802c64a0713fd5035d199f0289c9ba6da73 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wen=20Wang=20=28=E7=8E=8B=E6=96=87=29?=
 <wwang.bio@gmail.com>
Date: Fri, 8 Sep 2023 15:17:43 -0400
Subject: [PATCH 06/10] Update: remove cell filter process

---
 inst/python/g2bento.py | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/inst/python/g2bento.py b/inst/python/g2bento.py
index 783b33cf0..f5f570f5b 100644
--- a/inst/python/g2bento.py
+++ b/inst/python/g2bento.py
@@ -49,7 +49,7 @@ def add_batch(adata: AnnData, cell_shape: pd.DataFrame):
     adata.uns['points']['batch'] = adata.uns['points']['batch'].astype('category')
 
 
-def create_AnnData(trainscripts, cell_shape, nucleus_shape, x_max = None, x_min = None, y_max = None, y_min = None) -> AnnData:
+def create_AnnData(trainscripts, cell_shape, nucleus_shape) -> AnnData:
     # --- processing input ---
     trainscripts = pd.DataFrame(trainscripts)
     cell_shape = pd.DataFrame(cell_shape)
@@ -66,13 +66,8 @@ def create_AnnData(trainscripts, cell_shape, nucleus_shape, x_max = None, x_min
         warning('cell_seg has more than 500 cells, processing may take a long time.')
 
     # --- filter cells ---
-    # Interim measures
-    # subsetGiottoLocs don't give wanted result, so we use a workaround here
-    if x_max is not None and x_min is not None and y_max is not None and y_min is not None:
-        area_shape = Polygon([(x_min, y_min), (x_max, y_min), (x_max, y_max), (x_min, y_max)])
-        legal_cells = cell_seg['geometry'].apply(lambda x: area_shape.contains(x))
-    else:
-        legal_cells = pd.Series([True] * cell_seg.shape[0])
+    # Let Giotto perform the filtering
+    legal_cells = pd.Series([True] * cell_seg.shape[0])
 
     # --- create AnnData ---
     adata: AnnData = bt.io.prepare(molecules=trainscripts, cell_seg=cell_seg, other_seg={'nucleus': nucleus_seg})  # type: ignore

From e43de1657caf9e3a34c21ca7593c2febc1324999 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wen=20Wang=20=28=E7=8E=8B=E6=96=87=29?=
 <wwang.bio@gmail.com>
Date: Fri, 8 Sep 2023 15:18:31 -0400
Subject: [PATCH 07/10] Update: comments

---
 inst/python/g2bento.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/inst/python/g2bento.py b/inst/python/g2bento.py
index f5f570f5b..27b36d5f8 100644
--- a/inst/python/g2bento.py
+++ b/inst/python/g2bento.py
@@ -81,10 +81,11 @@ def create_AnnData(trainscripts, cell_shape, nucleus_shape) -> AnnData:
     filtered_adata.uns['points']['gene'] = filtered_adata.uns['points']['gene'].cat.remove_unused_categories()
 
     # Interim measures
-    # subsetted adata don't have _X property, which will cause unexpect error for adata.__sizeof__
+    # subsetted adata (_is_view == True) don't have _X property, which will cause unexpect error for adata.__sizeof__
     # when create object in R, reticulate will call sys.getsizeof to get the size of the object, which will call adata.__sizeof__
     # https://github.com/rstudio/reticulate/issues/1332
     # https://github.com/rstudio/rstudio/issues/13491
+    # wait for AnnData to fix this issue
     filtered_adata._X = filtered_adata.X 
     return filtered_adata
 

From 8391c92d97aadd31772b8ea8430bd508431ba227 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wen=20Wang=20=28=E7=8E=8B=E6=96=87=29?=
 <wwang.bio@gmail.com>
Date: Fri, 8 Sep 2023 16:18:56 -0400
Subject: [PATCH 08/10] Update: add python_bento.R

---
 DESCRIPTION | 1 +
 1 file changed, 1 insertion(+)

diff --git a/DESCRIPTION b/DESCRIPTION
index 8fcd0beb4..83c8479a5 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -143,6 +143,7 @@ Collate:
     'interactivity.R'
     'interoperability.R'
     'poly_influence.R'
+    'python_bento.R'
     'python_environment.R'
     'python_hmrf.R'
     'python_scrublet.R'

From 0bd85f3503a281db557bb99aa14d41ba3fed1511 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wen=20Wang=20=28=E7=8E=8B=E6=96=87=29?=
 <wwang.bio@gmail.com>
Date: Thu, 14 Sep 2023 15:39:48 -0400
Subject: [PATCH 09/10] Fix: typo

---
 R/python_bento.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/python_bento.R b/R/python_bento.R
index 7ce06ff2c..702340d42 100644
--- a/R/python_bento.R
+++ b/R/python_bento.R
@@ -7,7 +7,7 @@
 createBentoAdata <- function(gobject){
   # Transcripts
   transcripts_df <- as.data.frame(sf::st_as_sf(gobject@feat_info$rna@spatVector))
-  coordinates_df <- sf::st_coordinates(transcripts_df['geometry'])
+  coordinates_df <- lapply(transcripts_df['geometry'], sf::st_coordinates)$geometry
   t_df <- cbind(coordinates_df, transcripts_df[c("feat_ID")])
   colnames(t_df) <- c('x','y','gene')
   t_df <- t_df[t_df$cell != '-1',]  # Remove extracellular points, required by Bento

From a178c63127611c3fcea6e89a9fc1d011e438ff1b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wen=20Wang=20=28=E7=8E=8B=E6=96=87=29?=
 <wwang.bio@gmail.com>
Date: Thu, 14 Sep 2023 16:06:18 -0400
Subject: [PATCH 10/10] Fix: createBentoAdata

---
 R/python_bento.R | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/R/python_bento.R b/R/python_bento.R
index 702340d42..773b39d91 100644
--- a/R/python_bento.R
+++ b/R/python_bento.R
@@ -8,9 +8,8 @@ createBentoAdata <- function(gobject){
   # Transcripts
   transcripts_df <- as.data.frame(sf::st_as_sf(gobject@feat_info$rna@spatVector))
   coordinates_df <- lapply(transcripts_df['geometry'], sf::st_coordinates)$geometry
-  t_df <- cbind(coordinates_df, transcripts_df[c("feat_ID")])
+  t_df <- as.data.frame(cbind(coordinates_df, transcripts_df[c("feat_ID")]))
   colnames(t_df) <- c('x','y','gene')
-  t_df <- t_df[t_df$cell != '-1',]  # Remove extracellular points, required by Bento
 
   # Cell shapes
   # TODO: Add batch information based on?