Preprocessing of single cell reference data #9

Ceyhun-Alar · 2024-02-19T08:48:45Z

Can you please share the code for processing reference data to obtain sc_references/sc_breast.csv, sc_breast_markers_pos.csv, and sc_breast_markers_neg.csv?

diala-ar · 2024-03-20T20:10:52Z

I second @Ceyhun-Alar request, as I was not able to reproduce the positive and negative markers for the Breast dataset. I really appreciate if you could share your code to produce those markers. Thanks

xhelenfu · 2024-03-21T04:43:46Z

Hi, please try this code to get the positive and negative marker .csv files from the reference data

import numpy as np
import argparse
import pandas as pd
import natsort


def main(config):

    ref_df = pd.read_csv(config.fp_ref, index_col=0)
    n_genes = ref_df.shape[1] - 3
    print("Ref data shape", ref_df.shape)

    cell_types = ref_df["cell_type"].tolist()
    cell_types = natsort.natsorted(list(set(cell_types)))
    print(cell_types)
    n_cell_types = len(cell_types)

    ref_expr = ref_df.iloc[:, :n_genes].to_numpy()
    gene_names = ref_df.columns[:n_genes]

    # Find genes with expressions in bottom 10% percentile for every ref cell type
    pct_10 = np.percentile(ref_expr, 10, axis=1, keepdims=True)
    pct_10 = np.tile(pct_10, (1, n_genes))
    low_expr_true = np.zeros(pct_10.shape)
    low_expr_true[ref_expr <= pct_10] = 1

    # Find overlap for different ref samples of the same cell type
    ct_idx = ref_df["ct_idx"].to_numpy()
    low_expr_true_agg = np.zeros((n_cell_types, n_genes))
    for ct in range(n_cell_types):
        rows = np.where(ct_idx == ct)[0]
        low_expr_true_ct = low_expr_true[rows]
        low_expr_true_agg[ct, :] = np.prod(low_expr_true_ct, axis=0)

    # Set overlaps to 0
    overlaps = np.sum(low_expr_true_agg, 0)
    too_many = np.where(overlaps > config.max_overlaps_neg)[0]
    low_expr_true_agg[:, too_many] = 0

    # print("num neg genes per cell type")
    # print(np.sum(low_expr_true_agg, 1))

    df_neg = pd.DataFrame(low_expr_true_agg, index=cell_types, columns=gene_names)

    # Find genes with expressions in top 90% percentile for every ref cell type
    pct_90 = np.percentile(ref_expr, 90, axis=1, keepdims=True)
    pct_90 = np.tile(pct_90, (1, n_genes))
    high_expr_true = np.zeros(pct_90.shape)
    high_expr_true[ref_expr >= pct_90] = 1

    # Find overlap for different ref samples of the same cell type
    ct_idx = ref_df["ct_idx"].to_numpy()
    high_expr_true_agg = np.zeros((n_cell_types, n_genes))
    for ct in range(n_cell_types):
        rows = np.where(ct_idx == ct)[0]
        high_expr_true_ct = high_expr_true[rows]
        high_expr_true_agg[ct, :] = np.prod(high_expr_true_ct, axis=0)

    # print("num pos genes per cell type")
    # print(np.sum(high_expr_true_agg, 1))

    # Set overlaps to 0
    overlaps = np.sum(high_expr_true_agg, 0)
    too_many = np.where(overlaps > config.max_overlaps_pos)[0]
    high_expr_true_agg[:, too_many] = 0

    df_pos = pd.DataFrame(high_expr_true_agg, index=cell_types, columns=gene_names)

    df_pos.to_csv(config.fp_pos)
    df_neg.to_csv(config.fp_neg)

    print("Done")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--fp_ref",
        default="data/sc_references/sc_breast.csv",
        type=str,
        help="ref data",
    )

    parser.add_argument(
        "--max_overlaps_pos",
        default=4,
        type=int,
        help="no more than n cell types can have the same markers",
    )

    parser.add_argument(
        "--max_overlaps_neg",
        default=15,
        type=int,
        help="no more than n cell types can have the same markers",
    )

    parser.add_argument(
        "--fp_pos", default="markers_pos.csv", type=str, help="positive markers"
    )
    parser.add_argument(
        "--fp_neg", default="markers_neg.csv", type=str, help="negative markers"
    )

    config = parser.parse_args()
    main(config)

diala-ar · 2024-03-21T20:39:37Z

Hi @xhelenfu,

Thanks so much for sharing the code, it worked!

However, I wrote an R code to generate the sc_breast.csv file from the averages gene expression BRCA_xxx_expression_Celltype_majorlineage.txt files downloaded from the TISCH2 website, but I was not able to regenerate it.

Briefly the code: 1) renames the CD8T and CD8Tex cell types to CD8T/CD8Tex; 2) renames CD4 and Treg cell types to CD4Tconv/Treg and 3) excludes the Tprolif and Oligodendrocyte cell types. The correlation between my generated numbers and yours are pretty high (>0.9). Below is my code:

in_dir = "resources/Xenium_data/output-sample1/"
features = read.delim(file.path(in_dir, 'cell_feature_matrix', 'features.tsv.gz'), header = F)
names(features) = c('Ensembl', 'gene_symbol', 'gene_category')
genes = features$gene_symbol[features$gene_category=='Gene Expression']

### 02. generate average expression of reference studies of BRCA cancer
tumor_subtype = 'BRCA'
in_dir = file.path('resources', 'from_TISCH2', 'avg_expr')
in_dirs = list.dirs(in_dir, recursive=F)
in_dirs = in_dirs[grep(tumor_subtype, in_dirs)]
all_avg_expr = NULL
for (i in 1:length(in_dirs)) {
  in_dir_i = in_dirs[i]
  
  atlas = sub(paste0(tumor_subtype, '_(.+)_Expression'), '\\1', basename(in_dir_i))
  
  file_names = list.files(in_dir_i)
  file_name = grep('Celltype_majorlineage', file_names, value=T)
  avg_expr = read.delim(file.path(in_dir_i, file_name))
  avg_expr = data.frame(t(avg_expr))
  
  found_genes   = intersect(genes, names(avg_expr))
  missing_genes = setdiff(genes, names(avg_expr))
  missing_genes_expr = data.frame(matrix(0, nrow=nrow(avg_expr), ncol=length(missing_genes), 
                                  dimnames=list(NULL, missing_genes)))
  avg_expr = cbind(avg_expr[, found_genes], missing_genes_expr)
  avg_expr = avg_expr[, order(names(avg_expr))]
  avg_expr$cell_type = row.names(avg_expr)
  
  avg_expr[, (ncol(avg_expr)-3):ncol(avg_expr)]
  
  CD8_celltypes = grep('CD8', avg_expr$cell_type)
  avg_expr$cell_type[CD8_celltypes] = 'CD8T/CD8Tex'
  
  CD4_or_Treg_celltypes = grep('CD4|Treg', avg_expr$cell_type)
  avg_expr$cell_type[CD4_or_Treg_celltypes] = 'CD4T/Treg'
  avg_expr$atlas = atlas
  
  avg_expr[, (ncol(avg_expr)-3):ncol(avg_expr)]
  all_avg_expr = rbind(all_avg_expr, avg_expr)
}

## some cleaning
# remove columns having an expression of 0 for all cell types
rm_col = which(apply(all_avg_expr[, -which(names(all_avg_expr) %in% c("cell_type", 'atlas'))], 2, sum)==0)
if ( length(rm_col) > 0 )
  all_avg_expr = all_avg_expr[, -rm_col]

# remove rows with T profil
all_avg_expr = all_avg_expr[!all_avg_expr$cell_type %in% c('Tprolif', 'TProlif', 'Oligodendrocyte'), ]
all_avg_expr[all_avg_expr$cell_type=='Mono.Macro', 'cell_type'] = 'Mono/Macro'
row.names(all_avg_expr) = NULL


## add the cell_type index column ct_idx
cell_types = sort(unique(all_avg_expr$cell_type))
ct_idx     = 0:(length(cell_types)-1)
names(ct_idx) = cell_types
all_avg_expr = data.frame(all_avg_expr[, 1:(ncol(all_avg_expr)-2)],
                          ct_idx = sapply(all_avg_expr$cell_type, function(x) ct_idx[x]),
                          cell_type = all_avg_expr$cell_type,
                          atlas = all_avg_expr$atlas)
all_avg_expr = all_avg_expr[order(all_avg_expr$atlas, all_avg_expr$cell_type), ]
all_avg_expr[, (ncol(all_avg_expr)-3):ncol(all_avg_expr)]
write.csv(all_avg_expr, file=file.path('results', 'bidcell', 'input', 'sc_BRCA.csv'))

I highly appreciate if you could share your code to generate sc_breast.csv.

Thanks.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Preprocessing of single cell reference data #9

Preprocessing of single cell reference data #9

Ceyhun-Alar commented Feb 19, 2024

diala-ar commented Mar 20, 2024

xhelenfu commented Mar 21, 2024

diala-ar commented Mar 21, 2024 •

edited

Loading

Preprocessing of single cell reference data #9

Preprocessing of single cell reference data #9

Comments

Ceyhun-Alar commented Feb 19, 2024

diala-ar commented Mar 20, 2024

xhelenfu commented Mar 21, 2024

diala-ar commented Mar 21, 2024 • edited Loading

diala-ar commented Mar 21, 2024 •

edited

Loading