Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Preprocessing of single cell reference data #9

Open
Ceyhun-Alar opened this issue Feb 19, 2024 · 3 comments
Open

Preprocessing of single cell reference data #9

Ceyhun-Alar opened this issue Feb 19, 2024 · 3 comments

Comments

@Ceyhun-Alar
Copy link

Can you please share the code for processing reference data to obtain sc_references/sc_breast.csv, sc_breast_markers_pos.csv, and sc_breast_markers_neg.csv?

@diala-ar
Copy link

I second @Ceyhun-Alar request, as I was not able to reproduce the positive and negative markers for the Breast dataset. I really appreciate if you could share your code to produce those markers. Thanks

@xhelenfu
Copy link
Collaborator

Hi, please try this code to get the positive and negative marker .csv files from the reference data

import numpy as np
import argparse
import pandas as pd
import natsort


def main(config):

    ref_df = pd.read_csv(config.fp_ref, index_col=0)
    n_genes = ref_df.shape[1] - 3
    print("Ref data shape", ref_df.shape)

    cell_types = ref_df["cell_type"].tolist()
    cell_types = natsort.natsorted(list(set(cell_types)))
    print(cell_types)
    n_cell_types = len(cell_types)

    ref_expr = ref_df.iloc[:, :n_genes].to_numpy()
    gene_names = ref_df.columns[:n_genes]

    # Find genes with expressions in bottom 10% percentile for every ref cell type
    pct_10 = np.percentile(ref_expr, 10, axis=1, keepdims=True)
    pct_10 = np.tile(pct_10, (1, n_genes))
    low_expr_true = np.zeros(pct_10.shape)
    low_expr_true[ref_expr <= pct_10] = 1

    # Find overlap for different ref samples of the same cell type
    ct_idx = ref_df["ct_idx"].to_numpy()
    low_expr_true_agg = np.zeros((n_cell_types, n_genes))
    for ct in range(n_cell_types):
        rows = np.where(ct_idx == ct)[0]
        low_expr_true_ct = low_expr_true[rows]
        low_expr_true_agg[ct, :] = np.prod(low_expr_true_ct, axis=0)

    # Set overlaps to 0
    overlaps = np.sum(low_expr_true_agg, 0)
    too_many = np.where(overlaps > config.max_overlaps_neg)[0]
    low_expr_true_agg[:, too_many] = 0

    # print("num neg genes per cell type")
    # print(np.sum(low_expr_true_agg, 1))

    df_neg = pd.DataFrame(low_expr_true_agg, index=cell_types, columns=gene_names)

    # Find genes with expressions in top 90% percentile for every ref cell type
    pct_90 = np.percentile(ref_expr, 90, axis=1, keepdims=True)
    pct_90 = np.tile(pct_90, (1, n_genes))
    high_expr_true = np.zeros(pct_90.shape)
    high_expr_true[ref_expr >= pct_90] = 1

    # Find overlap for different ref samples of the same cell type
    ct_idx = ref_df["ct_idx"].to_numpy()
    high_expr_true_agg = np.zeros((n_cell_types, n_genes))
    for ct in range(n_cell_types):
        rows = np.where(ct_idx == ct)[0]
        high_expr_true_ct = high_expr_true[rows]
        high_expr_true_agg[ct, :] = np.prod(high_expr_true_ct, axis=0)

    # print("num pos genes per cell type")
    # print(np.sum(high_expr_true_agg, 1))

    # Set overlaps to 0
    overlaps = np.sum(high_expr_true_agg, 0)
    too_many = np.where(overlaps > config.max_overlaps_pos)[0]
    high_expr_true_agg[:, too_many] = 0

    df_pos = pd.DataFrame(high_expr_true_agg, index=cell_types, columns=gene_names)

    df_pos.to_csv(config.fp_pos)
    df_neg.to_csv(config.fp_neg)

    print("Done")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--fp_ref",
        default="data/sc_references/sc_breast.csv",
        type=str,
        help="ref data",
    )

    parser.add_argument(
        "--max_overlaps_pos",
        default=4,
        type=int,
        help="no more than n cell types can have the same markers",
    )

    parser.add_argument(
        "--max_overlaps_neg",
        default=15,
        type=int,
        help="no more than n cell types can have the same markers",
    )

    parser.add_argument(
        "--fp_pos", default="markers_pos.csv", type=str, help="positive markers"
    )
    parser.add_argument(
        "--fp_neg", default="markers_neg.csv", type=str, help="negative markers"
    )

    config = parser.parse_args()
    main(config)



@diala-ar
Copy link

diala-ar commented Mar 21, 2024

Hi @xhelenfu,

Thanks so much for sharing the code, it worked!

However, I wrote an R code to generate the sc_breast.csv file from the averages gene expression BRCA_xxx_expression_Celltype_majorlineage.txt files downloaded from the TISCH2 website, but I was not able to regenerate it.

Briefly the code: 1) renames the CD8T and CD8Tex cell types to CD8T/CD8Tex; 2) renames CD4 and Treg cell types to CD4Tconv/Treg and 3) excludes the Tprolif and Oligodendrocyte cell types. The correlation between my generated numbers and yours are pretty high (>0.9). Below is my code:

in_dir = "resources/Xenium_data/output-sample1/"
features = read.delim(file.path(in_dir, 'cell_feature_matrix', 'features.tsv.gz'), header = F)
names(features) = c('Ensembl', 'gene_symbol', 'gene_category')
genes = features$gene_symbol[features$gene_category=='Gene Expression']

### 02. generate average expression of reference studies of BRCA cancer
tumor_subtype = 'BRCA'
in_dir = file.path('resources', 'from_TISCH2', 'avg_expr')
in_dirs = list.dirs(in_dir, recursive=F)
in_dirs = in_dirs[grep(tumor_subtype, in_dirs)]
all_avg_expr = NULL
for (i in 1:length(in_dirs)) {
  in_dir_i = in_dirs[i]
  
  atlas = sub(paste0(tumor_subtype, '_(.+)_Expression'), '\\1', basename(in_dir_i))
  
  file_names = list.files(in_dir_i)
  file_name = grep('Celltype_majorlineage', file_names, value=T)
  avg_expr = read.delim(file.path(in_dir_i, file_name))
  avg_expr = data.frame(t(avg_expr))
  
  found_genes   = intersect(genes, names(avg_expr))
  missing_genes = setdiff(genes, names(avg_expr))
  missing_genes_expr = data.frame(matrix(0, nrow=nrow(avg_expr), ncol=length(missing_genes), 
                                  dimnames=list(NULL, missing_genes)))
  avg_expr = cbind(avg_expr[, found_genes], missing_genes_expr)
  avg_expr = avg_expr[, order(names(avg_expr))]
  avg_expr$cell_type = row.names(avg_expr)
  
  avg_expr[, (ncol(avg_expr)-3):ncol(avg_expr)]
  
  CD8_celltypes = grep('CD8', avg_expr$cell_type)
  avg_expr$cell_type[CD8_celltypes] = 'CD8T/CD8Tex'
  
  CD4_or_Treg_celltypes = grep('CD4|Treg', avg_expr$cell_type)
  avg_expr$cell_type[CD4_or_Treg_celltypes] = 'CD4T/Treg'
  avg_expr$atlas = atlas
  
  avg_expr[, (ncol(avg_expr)-3):ncol(avg_expr)]
  all_avg_expr = rbind(all_avg_expr, avg_expr)
}

## some cleaning
# remove columns having an expression of 0 for all cell types
rm_col = which(apply(all_avg_expr[, -which(names(all_avg_expr) %in% c("cell_type", 'atlas'))], 2, sum)==0)
if ( length(rm_col) > 0 )
  all_avg_expr = all_avg_expr[, -rm_col]

# remove rows with T profil
all_avg_expr = all_avg_expr[!all_avg_expr$cell_type %in% c('Tprolif', 'TProlif', 'Oligodendrocyte'), ]
all_avg_expr[all_avg_expr$cell_type=='Mono.Macro', 'cell_type'] = 'Mono/Macro'
row.names(all_avg_expr) = NULL


## add the cell_type index column ct_idx
cell_types = sort(unique(all_avg_expr$cell_type))
ct_idx     = 0:(length(cell_types)-1)
names(ct_idx) = cell_types
all_avg_expr = data.frame(all_avg_expr[, 1:(ncol(all_avg_expr)-2)],
                          ct_idx = sapply(all_avg_expr$cell_type, function(x) ct_idx[x]),
                          cell_type = all_avg_expr$cell_type,
                          atlas = all_avg_expr$atlas)
all_avg_expr = all_avg_expr[order(all_avg_expr$atlas, all_avg_expr$cell_type), ]
all_avg_expr[, (ncol(all_avg_expr)-3):ncol(all_avg_expr)]
write.csv(all_avg_expr, file=file.path('results', 'bidcell', 'input', 'sc_BRCA.csv'))

I highly appreciate if you could share your code to generate sc_breast.csv.

Thanks.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

3 participants