diff --git a/Snakefile b/Snakefile index 878681d..1bdefad 100644 --- a/Snakefile +++ b/Snakefile @@ -12,13 +12,16 @@ __author__ = "Marine AGLAVE" #using: snakemake --profile /mnt/beegfs/pipelines/single-cell/profiles/slurm -s /mnt/beegfs/pipelines/single-cell/Snakefile --configfile /mnt/beegfs/userdata/m_aglave/pipeline/test_new_data/Params.yaml +sys.stderr.write("\n############################################################# \n") +sys.stderr.write("\n\n\t Single-cell RNA-seq pipeline \n\n") +sys.stderr.write("\n############################################################# \n\n") + ### parameters ################################################################################################################################### +sys.stderr.write("\n#################### Setting Parameters ####################\n\n") -#### Pipeline #### STEPS = config['Steps'] PIPELINE_FOLDER = workflow.snakefile PIPELINE_FOLDER = PIPELINE_FOLDER.replace("/Snakefile", "") -#CONDA_ENV_SING = PIPELINE_FOLDER + "/envs/conda/Singularity.yml" if "Alignment_countTable_GE" in STEPS: ### Sample/Project @@ -339,7 +342,7 @@ if "Norm_DimRed_Eval_GE" in STEPS: #alias NDRE_ POSSIBLE_RES = ["%.1f" % number for number in numpy.arange(NDRE_RES_MIN*10,NDRE_RES_MAX*10+1,NDRE_RES_STEPS*10)/10] #*10 then /10 because numpy.arange doesn't handle floats well ASSAY = "RNA" if NDRE_NORM_METHOD == "LogNormalize" else "SCT" -if "Clust_Markers_Annot_GE" in STEPS: +if "Clust_Markers_Annot_GE" in STEPS: #alias CMA ### Sample/Project if ('Clust_Markers_Annot_GE' in config) and ('sample.name.ge' in config['Clust_Markers_Annot_GE']) and ('input.rda.ge' in config['Clust_Markers_Annot_GE']) : CMA_SAMPLE_NAME_GE_RAW = config['Clust_Markers_Annot_GE']['sample.name.ge'] @@ -495,9 +498,6 @@ if "Adding_BCR" in STEPS: dic_ADD_BCR_INFO[ADD_BCR_OUTPUT[i]]['ADD_BCR_INPUT_RDA_GE'] = ADD_BCR_INPUT_RDA_GE[i] dic_ADD_BCR_INFO[ADD_BCR_OUTPUT[i]]['ADD_BCR_INPUT_CSV_BCR'] = ADD_BCR_INPUT_CSV_BCR[i] -if "Alignment_annotations_TCR_BCR" in STEPS or "Adding_TCR" in STEPS or "Adding_BCR" in STEPS: - SINGULARITY_ENV_TCR_BCR = PIPELINE_FOLDER + "/envs/singularity/single_cell_TCR_BCR.simg" - if "Int_Norm_DimRed_Eval_GE" in STEPS: ### Sample/Project if ('Int_Norm_DimRed_Eval_GE' in config) and ('name.int' in config['Int_Norm_DimRed_Eval_GE']) and ('input.list.rda' in config['Int_Norm_DimRed_Eval_GE']) : @@ -607,6 +607,96 @@ if "Int_Clust_Markers_Annot_GE" in STEPS: #Names INT_CMA_CLUST_FOLDER = "dims" + str(INT_CMA_KEEP_DIM) + "_res" + str(INT_CMA_KEEP_RES) +if "Int_Adding_ADT" in STEPS: + ### Sample/Project + if 'Int_Adding_ADT' in config and 'input.rda' in config['Int_Adding_ADT']: + INT_ADD_ADT_INPUT_RDA = config['Int_Adding_ADT']['input.rda'] + elif "Int_Clust_Markers_Annot_GE" in STEPS: + sys.stderr.write("Note: No input.rda find in Int_Adding_ADT section of configfile; input.rda will be determine from Int_Clust_Markers_Annot_GE step for Int_Adding_ADT step!\n") + INT_ADD_ADT_INPUT_RDA = [os.path.normpath(os.path.dirname(dic_INT_CMA_INFO[INT_CMA_NAME_INT[x]]['INT_CMA_INPUT_RDA']) + "/" + INT_CMA_CLUST_FOLDER + "/" + INT_CMA_NAME_INT[x] + INT_CMA_COMPLEMENT[x] + "_" + str(INT_CMA_KEEP_DIM) + "_" + str(INT_CMA_KEEP_RES) + ".rda") for x in range(len(INT_CMA_NAME_INT))] + else: + sys.exit("Error: No input.rda in configfile!\n") + if 'Int_Adding_ADT' in config and 'samples.name.adt' in config['Int_Adding_ADT'] and 'input.dirs.adt' in config['Int_Adding_ADT'] : + INT_ADD_ADT_INPUT_DIR_ADT = [ x.replace(", ", ",") for x in config['Int_Adding_ADT']['input.dirs.adt']] + INT_ADD_ADT_SAMPLE_NAME_ADT = [ x.replace(", ", ",") for x in config['Int_Adding_ADT']['samples.name.adt']] + else: + sys.exit("Error: No samples.name.adt or input.dirs.adt in configfile!\n") + ### Analysis Parameters + INT_ADD_ADT_AUTHOR_NAME = config['Int_Adding_ADT']['author.name'].replace(", ", ",").replace(" ", "_") if ('Int_Adding_ADT' in config and 'author.name' in config['Int_Adding_ADT'] and config['Int_Adding_ADT']['author.name'] != None) else "NULL" + INT_ADD_ADT_AUTHOR_MAIL = config['Int_Adding_ADT']['author.mail'].replace(", ", ",") if ('Int_Adding_ADT' in config and 'author.mail' in config['Int_Adding_ADT'] and config['Int_Adding_ADT']['author.mail'] != None) else "NULL" + INT_ADD_ADT_GENE_NAMES = config['Int_Adding_ADT']['gene.names'].replace(", ", ",") if ('Int_Adding_ADT' in config and 'gene.names' in config['Int_Adding_ADT'] and config['Int_Adding_ADT']['gene.names'] != None) else "NULL" + INT_ADD_ADT_MAX_CUTOFF = config['Int_Adding_ADT']['ADT.max.cutoff'].replace(", ", ",") if ('Int_Adding_ADT' in config and 'ADT.max.cutoff' in config['Int_Adding_ADT'] and config['Int_Adding_ADT']['ADT.max.cutoff'] != None) else "NULL" + INT_ADD_ADT_MIN_CUTOFF = config['Int_Adding_ADT']['ADT.min.cutoff'].replace(", ", ",") if ('Int_Adding_ADT' in config and 'ADT.min.cutoff' in config['Int_Adding_ADT'] and config['Int_Adding_ADT']['ADT.min.cutoff'] != None) else "NULL" + ### Snakefile parameters + #Correspondance input/output + INT_ADD_ADT_OUTPUT = [os.path.splitext(x)[0] for x in INT_ADD_ADT_INPUT_RDA] + dic_INT_ADD_ADT_INFO = {} + for i in range(0,len(INT_ADD_ADT_OUTPUT),1): + dic_INT_ADD_ADT_INFO[INT_ADD_ADT_OUTPUT[i]] = {} + dic_INT_ADD_ADT_INFO[INT_ADD_ADT_OUTPUT[i]]['INT_ADD_ADT_INPUT_RDA'] = INT_ADD_ADT_INPUT_RDA[i] + dic_INT_ADD_ADT_INFO[INT_ADD_ADT_OUTPUT[i]]['INT_ADD_ADT_INPUT_DIR_ADT'] = INT_ADD_ADT_INPUT_DIR_ADT[i] + dic_INT_ADD_ADT_INFO[INT_ADD_ADT_OUTPUT[i]]['INT_ADD_ADT_SAMPLE_NAME_ADT'] = INT_ADD_ADT_SAMPLE_NAME_ADT[i] + sys.stderr.write(INT_ADD_ADT_OUTPUT[i] + str("\n")) + +if "Int_Adding_TCR" in STEPS: + ### Sample/Project + if 'Int_Adding_TCR' in config and 'input.rda' in config['Int_Adding_TCR'] : + INT_ADD_TCR_INPUT_RDA = config['Int_Adding_TCR']['input.rda'] + elif "Int_Adding_ADT" in STEPS: + sys.stderr.write("Note: No input.rda find in Int_Adding_TCR section of configfile; input.rda will be determine from Int_Adding_ADT step for Int_Adding_TCR step!\n") + INT_ADD_TCR_INPUT_RDA = [ x + "_ADT.rda" for x in INT_ADD_ADT_OUTPUT] + elif "Int_Clust_Markers_Annot_GE" in STEPS: + sys.stderr.write("Note: No input.rda find in Int_Adding_TCR section of configfile; input.rda will be determine from Int_Clust_Markers_Annot_GE step for Int_Adding_TCR step!\n") + INT_ADD_TCR_INPUT_RDA = [os.path.normpath(os.path.dirname(dic_INT_CMA_INFO[INT_CMA_NAME_INT[x]]['INT_CMA_INPUT_RDA']) + "/" + INT_CMA_CLUST_FOLDER + "/" + INT_CMA_NAME_INT[x] + INT_CMA_COMPLEMENT[x] + "_" + str(INT_CMA_KEEP_DIM) + "_" + str(INT_CMA_KEEP_RES) + ".rda") for x in range(len(INT_CMA_NAME_INT))] + else: + sys.exit("Error: No input.rda in configfile!\n") + if 'Int_Adding_TCR' in config and 'vdj.input.files.tcr' in config['Int_Adding_TCR'] : + INT_ADD_TCR_INPUT_CSV_TCR = [ x.replace(", ", ",") for x in config['Int_Adding_TCR']['vdj.input.files.tcr']] + else: + sys.exit("Error: No vdj.input.files.tcr in configfile!\n") + ### Analysis Parameters + INT_ADD_TCR_AUTHOR_NAME = config['Int_Adding_TCR']['author.name'].replace(", ", ",").replace(" ", "_") if ('Int_Adding_TCR' in config and 'author.name' in config['Int_Adding_TCR'] and config['Int_Adding_TCR']['author.name'] != None) else "NULL" + INT_ADD_TCR_AUTHOR_MAIL = config['Int_Adding_TCR']['author.mail'].replace(", ", ",") if ('Int_Adding_TCR' in config and 'author.mail' in config['Int_Adding_TCR'] and config['Int_Adding_TCR']['author.mail'] != None) else "NULL" + ### Snakefile parameters + #Correspondance input/output + INT_ADD_TCR_OUTPUT = [os.path.splitext(x)[0] for x in INT_ADD_TCR_INPUT_RDA] + dic_INT_ADD_TCR_INFO = {} + for i in range(0,len(INT_ADD_TCR_OUTPUT),1): + dic_INT_ADD_TCR_INFO[INT_ADD_TCR_OUTPUT[i]] = {} + dic_INT_ADD_TCR_INFO[INT_ADD_TCR_OUTPUT[i]]['INT_ADD_TCR_INPUT_RDA'] = INT_ADD_TCR_INPUT_RDA[i] + dic_INT_ADD_TCR_INFO[INT_ADD_TCR_OUTPUT[i]]['INT_ADD_TCR_INPUT_CSV_TCR'] = INT_ADD_TCR_INPUT_CSV_TCR[i] + +if "Int_Adding_BCR" in STEPS: + ### Sample/Project + if 'Int_Adding_BCR' in config and 'input.rda' in config['Int_Adding_BCR'] : + INT_ADD_BCR_INPUT_RDA = config['Int_Adding_BCR']['input.rda'] + elif "Int_Adding_TCR" in STEPS: + sys.stderr.write("Note: No input.rda.ge find in Int_Adding_BCR section of configfile; input.rda.ge will be determine from Int_Adding_TCR step for Int_Adding_BCR step!\n") + INT_ADD_BCR_INPUT_RDA = [ x + "_TCR.rda" for x in INT_ADD_TCR_OUTPUT] + elif "Adding_ADT" in STEPS: + sys.stderr.write("Note: No input.rda find in Int_Adding_BCR section of configfile; input.rda will be determine from Int_Adding_ADT step for Int_Adding_BCR step!\n") + INT_ADD_BCR_INPUT_RDA = [ x + "_ADT.rda" for x in INT_ADD_ADT_OUTPUT] + elif "Int_Clust_Markers_Annot_GE" in STEPS: + sys.stderr.write("Note: No input.rda find in Int_Adding_BCR section of configfile; input.rda will be determine from Clust_Markers_Annot_GE step for Int_Adding_BCR step!\n") + INT_ADD_BCR_INPUT_RDA = [os.path.normpath(os.path.dirname(dic_INT_CMA_INFO[INT_CMA_NAME_INT[x]]['INT_CMA_INPUT_RDA']) + "/" + INT_CMA_CLUST_FOLDER + "/" + INT_CMA_NAME_INT[x] + INT_CMA_COMPLEMENT[x] + "_" + str(INT_CMA_KEEP_DIM) + "_" + str(INT_CMA_KEEP_RES) + ".rda") for x in range(len(INT_CMA_NAME_INT))] + else: + sys.exit("Error: No input.rda in configfile!\n") + if 'Int_Adding_BCR' in config and 'vdj.input.files.bcr' in config['Int_Adding_BCR'] : + INT_ADD_BCR_INPUT_CSV_BCR = [ x.replace(", ", ",") for x in config['Int_Adding_BCR']['vdj.input.files.bcr']] + else: + sys.exit("Error: No vdj.input.files.bcr in configfile!\n") + ### Analysis Parameters + INT_ADD_BCR_AUTHOR_NAME = config['Int_Adding_BCR']['author.name'].replace(", ", ",").replace(" ", "_") if ('Int_Adding_BCR' in config and 'author.name' in config['Int_Adding_BCR'] and config['Int_Adding_BCR']['author.name'] != None) else "NULL" + INT_ADD_BCR_AUTHOR_MAIL = config['Int_Adding_BCR']['author.mail'].replace(", ", ",") if ('Int_Adding_BCR' in config and 'author.mail' in config['Int_Adding_BCR'] and config['Int_Adding_BCR']['author.mail'] != None) else "NULL" + ### Snakefile parameters + #Correspondance input/output + INT_ADD_BCR_OUTPUT = [os.path.splitext(x)[0] for x in INT_ADD_BCR_INPUT_RDA] + dic_INT_ADD_BCR_INFO = {} + for i in range(0,len(INT_ADD_BCR_OUTPUT),1): + dic_INT_ADD_BCR_INFO[INT_ADD_BCR_OUTPUT[i]] = {} + dic_INT_ADD_BCR_INFO[INT_ADD_BCR_OUTPUT[i]]['INT_ADD_BCR_INPUT_RDA'] = INT_ADD_BCR_INPUT_RDA[i] + dic_INT_ADD_BCR_INFO[INT_ADD_BCR_OUTPUT[i]]['INT_ADD_BCR_INPUT_CSV_BCR'] = INT_ADD_BCR_INPUT_CSV_BCR[i] + if "Grp_Norm_DimRed_Eval_GE" in STEPS: ### Sample/Project if ('Grp_Norm_DimRed_Eval_GE' in config) and ('name.grp' in config['Grp_Norm_DimRed_Eval_GE']) and ('input.list.rda' in config['Grp_Norm_DimRed_Eval_GE']) : @@ -702,6 +792,96 @@ if "Grp_Clust_Markers_Annot_GE" in STEPS: #Names GRP_CMA_CLUST_FOLDER = "dims" + str(GRP_CMA_KEEP_DIM) + "_res" + str(GRP_CMA_KEEP_RES) +if "Grp_Adding_ADT" in STEPS: + ### Sample/Project + if 'Grp_Adding_ADT' in config and 'input.rda' in config['Grp_Adding_ADT']: + GRP_ADD_ADT_INPUT_RDA = config['Grp_Adding_ADT']['input.rda'] + elif "Grp_Clust_Markers_Annot_GE" in STEPS: + sys.stderr.write("Note: No input.rda find in Grp_Adding_ADT section of configfile; input.rda will be determine from Grp_Clust_Markers_Annot_GE step for Grp_Adding_ADT step!\n") + GRP_ADD_ADT_INPUT_RDA = [os.path.normpath(os.path.dirname(dic_GRP_CMA_INFO[GRP_CMA_NAME_GRP[x]]['GRP_CMA_INPUT_RDA']) + "/" + GRP_CMA_CLUST_FOLDER + "/" + GRP_CMA_NAME_GRP[x] + GRP_CMA_COMPLEMENT[x] + "_" + str(GRP_CMA_KEEP_DIM) + "_" + str(GRP_CMA_KEEP_RES) + ".rda") for x in range(len(GRP_CMA_NAME_GRP))] + else: + sys.exit("Error: No input.rda in configfile!\n") + if 'Grp_Adding_ADT' in config and 'samples.name.adt' in config['Grp_Adding_ADT'] and 'input.dirs.adt' in config['Grp_Adding_ADT'] : + GRP_ADD_ADT_INPUT_DIR_ADT = [ x.replace(", ", ",") for x in config['Grp_Adding_ADT']['input.dirs.adt']] + GRP_ADD_ADT_SAMPLE_NAME_ADT = [ x.replace(", ", ",") for x in config['Grp_Adding_ADT']['samples.name.adt']] + else: + sys.exit("Error: No samples.name.adt or input.dirs.adt in configfile!\n") + ### Analysis Parameters + GRP_ADD_ADT_AUTHOR_NAME = config['Grp_Adding_ADT']['author.name'].replace(", ", ",").replace(" ", "_") if ('Grp_Adding_ADT' in config and 'author.name' in config['Grp_Adding_ADT'] and config['Grp_Adding_ADT']['author.name'] != None) else "NULL" + GRP_ADD_ADT_AUTHOR_MAIL = config['Grp_Adding_ADT']['author.mail'].replace(", ", ",") if ('Grp_Adding_ADT' in config and 'author.mail' in config['Grp_Adding_ADT'] and config['Grp_Adding_ADT']['author.mail'] != None) else "NULL" + GRP_ADD_ADT_GENE_NAMES = config['Grp_Adding_ADT']['gene.names'].replace(", ", ",") if ('Grp_Adding_ADT' in config and 'gene.names' in config['Grp_Adding_ADT'] and config['Grp_Adding_ADT']['gene.names'] != None) else "NULL" + GRP_ADD_ADT_MAX_CUTOFF = config['Grp_Adding_ADT']['ADT.max.cutoff'].replace(", ", ",") if ('Grp_Adding_ADT' in config and 'ADT.max.cutoff' in config['Grp_Adding_ADT'] and config['Grp_Adding_ADT']['ADT.max.cutoff'] != None) else "NULL" + GRP_ADD_ADT_MIN_CUTOFF = config['Grp_Adding_ADT']['ADT.min.cutoff'].replace(", ", ",") if ('Grp_Adding_ADT' in config and 'ADT.min.cutoff' in config['Grp_Adding_ADT'] and config['Grp_Adding_ADT']['ADT.min.cutoff'] != None) else "NULL" + ### Snakefile parameters + #Correspondance input/output + GRP_ADD_ADT_OUTPUT = [os.path.splitext(x)[0] for x in GRP_ADD_ADT_INPUT_RDA] + dic_GRP_ADD_ADT_INFO = {} + for i in range(0,len(GRP_ADD_ADT_OUTPUT),1): + dic_GRP_ADD_ADT_INFO[GRP_ADD_ADT_OUTPUT[i]] = {} + dic_GRP_ADD_ADT_INFO[GRP_ADD_ADT_OUTPUT[i]]['GRP_ADD_ADT_INPUT_RDA'] = GRP_ADD_ADT_INPUT_RDA[i] + dic_GRP_ADD_ADT_INFO[GRP_ADD_ADT_OUTPUT[i]]['GRP_ADD_ADT_INPUT_DIR_ADT'] = GRP_ADD_ADT_INPUT_DIR_ADT[i] + dic_GRP_ADD_ADT_INFO[GRP_ADD_ADT_OUTPUT[i]]['GRP_ADD_ADT_SAMPLE_NAME_ADT'] = GRP_ADD_ADT_SAMPLE_NAME_ADT[i] + sys.stderr.write(GRP_ADD_ADT_OUTPUT[i] + str("\n")) + +if "Grp_Adding_TCR" in STEPS: + ### Sample/Project + if 'Grp_Adding_TCR' in config and 'input.rda' in config['Grp_Adding_TCR'] : + GRP_ADD_TCR_INPUT_RDA = config['Grp_Adding_TCR']['input.rda'] + elif "Grp_Adding_ADT" in STEPS: + sys.stderr.write("Note: No input.rda find in Grp_Adding_TCR section of configfile; input.rda will be determine from Grp_Adding_ADT step for Grp_Adding_TCR step!\n") + GRP_ADD_TCR_INPUT_RDA = [ x + "_ADT.rda" for x in GRP_ADD_ADT_OUTPUT] + elif "Grp_Clust_Markers_Annot_GE" in STEPS: + sys.stderr.write("Note: No input.rda find in Grp_Adding_TCR section of configfile; input.rda will be determine from Grp_Clust_Markers_Annot_GE step for Grp_Adding_TCR step!\n") + GRP_ADD_TCR_INPUT_RDA = [os.path.normpath(os.path.dirname(dic_GRP_CMA_INFO[GRP_CMA_NAME_GRP[x]]['GRP_CMA_INPUT_RDA']) + "/" + GRP_CMA_CLUST_FOLDER + "/" + GRP_CMA_NAME_GRP[x] + GRP_CMA_COMPLEMENT[x] + "_" + str(GRP_CMA_KEEP_DIM) + "_" + str(GRP_CMA_KEEP_RES) + ".rda") for x in range(len(GRP_CMA_NAME_GRP))] + else: + sys.exit("Error: No input.rda in configfile!\n") + if 'Grp_Adding_TCR' in config and 'vdj.input.files.tcr' in config['Grp_Adding_TCR'] : + GRP_ADD_TCR_INPUT_CSV_TCR = [ x.replace(", ", ",") for x in config['Grp_Adding_TCR']['vdj.input.files.tcr']] + else: + sys.exit("Error: No vdj.input.files.tcr in configfile!\n") + ### Analysis Parameters + GRP_ADD_TCR_AUTHOR_NAME = config['Grp_Adding_TCR']['author.name'].replace(", ", ",").replace(" ", "_") if ('Grp_Adding_TCR' in config and 'author.name' in config['Grp_Adding_TCR'] and config['Grp_Adding_TCR']['author.name'] != None) else "NULL" + GRP_ADD_TCR_AUTHOR_MAIL = config['Grp_Adding_TCR']['author.mail'].replace(", ", ",") if ('Grp_Adding_TCR' in config and 'author.mail' in config['Grp_Adding_TCR'] and config['Grp_Adding_TCR']['author.mail'] != None) else "NULL" + ### Snakefile parameters + #Correspondance input/output + GRP_ADD_TCR_OUTPUT = [os.path.splitext(x)[0] for x in GRP_ADD_TCR_INPUT_RDA] + dic_GRP_ADD_TCR_INFO = {} + for i in range(0,len(GRP_ADD_TCR_OUTPUT),1): + dic_GRP_ADD_TCR_INFO[GRP_ADD_TCR_OUTPUT[i]] = {} + dic_GRP_ADD_TCR_INFO[GRP_ADD_TCR_OUTPUT[i]]['GRP_ADD_TCR_INPUT_RDA'] = GRP_ADD_TCR_INPUT_RDA[i] + dic_GRP_ADD_TCR_INFO[GRP_ADD_TCR_OUTPUT[i]]['GRP_ADD_TCR_INPUT_CSV_TCR'] = GRP_ADD_TCR_INPUT_CSV_TCR[i] + +if "Grp_Adding_BCR" in STEPS: + ### Sample/Project + if 'Grp_Adding_BCR' in config and 'input.rda' in config['Grp_Adding_BCR'] : + GRP_ADD_BCR_INPUT_RDA = config['Grp_Adding_BCR']['input.rda'] + elif "Grp_Adding_TCR" in STEPS: + sys.stderr.write("Note: No input.rda.ge find in Grp_Adding_BCR section of configfile; input.rda.ge will be determine from Grp_Adding_TCR step for Grp_Adding_BCR step!\n") + GRP_ADD_BCR_INPUT_RDA = [ x + "_TCR.rda" for x in GRP_ADD_TCR_OUTPUT] + elif "Adding_ADT" in STEPS: + sys.stderr.write("Note: No input.rda find in Grp_Adding_BCR section of configfile; input.rda will be determine from Grp_Adding_ADT step for Grp_Adding_BCR step!\n") + GRP_ADD_BCR_INPUT_RDA = [ x + "_ADT.rda" for x in GRP_ADD_ADT_OUTPUT] + elif "Grp_Clust_Markers_Annot_GE" in STEPS: + sys.stderr.write("Note: No input.rda find in Grp_Adding_BCR section of configfile; input.rda will be determine from Clust_Markers_Annot_GE step for Grp_Adding_BCR step!\n") + GRP_ADD_BCR_INPUT_RDA = [os.path.normpath(os.path.dirname(dic_GRP_CMA_INFO[GRP_CMA_NAME_GRP[x]]['GRP_CMA_INPUT_RDA']) + "/" + GRP_CMA_CLUST_FOLDER + "/" + GRP_CMA_NAME_GRP[x] + GRP_CMA_COMPLEMENT[x] + "_" + str(GRP_CMA_KEEP_DIM) + "_" + str(GRP_CMA_KEEP_RES) + ".rda") for x in range(len(GRP_CMA_NAME_GRP))] + else: + sys.exit("Error: No input.rda in configfile!\n") + if 'Grp_Adding_BCR' in config and 'vdj.input.files.bcr' in config['Grp_Adding_BCR'] : + GRP_ADD_BCR_INPUT_CSV_BCR = [ x.replace(", ", ",") for x in config['Grp_Adding_BCR']['vdj.input.files.bcr']] + else: + sys.exit("Error: No vdj.input.files.bcr in configfile!\n") + ### Analysis Parameters + GRP_ADD_BCR_AUTHOR_NAME = config['Grp_Adding_BCR']['author.name'].replace(", ", ",").replace(" ", "_") if ('Grp_Adding_BCR' in config and 'author.name' in config['Grp_Adding_BCR'] and config['Grp_Adding_BCR']['author.name'] != None) else "NULL" + GRP_ADD_BCR_AUTHOR_MAIL = config['Grp_Adding_BCR']['author.mail'].replace(", ", ",") if ('Grp_Adding_BCR' in config and 'author.mail' in config['Grp_Adding_BCR'] and config['Grp_Adding_BCR']['author.mail'] != None) else "NULL" + ### Snakefile parameters + #Correspondance input/output + GRP_ADD_BCR_OUTPUT = [os.path.splitext(x)[0] for x in GRP_ADD_BCR_INPUT_RDA] + dic_GRP_ADD_BCR_INFO = {} + for i in range(0,len(GRP_ADD_BCR_OUTPUT),1): + dic_GRP_ADD_BCR_INFO[GRP_ADD_BCR_OUTPUT[i]] = {} + dic_GRP_ADD_BCR_INFO[GRP_ADD_BCR_OUTPUT[i]]['GRP_ADD_BCR_INPUT_RDA'] = GRP_ADD_BCR_INPUT_RDA[i] + dic_GRP_ADD_BCR_INFO[GRP_ADD_BCR_OUTPUT[i]]['GRP_ADD_BCR_INPUT_CSV_BCR'] = GRP_ADD_BCR_INPUT_CSV_BCR[i] + if "Cerebro" in STEPS: ### Sample/Project CEREBRO_INPUT_RDA = [] @@ -781,117 +961,17 @@ if "Cerebro" in STEPS: else: sys.exit("Error: Unknown version of cerebro in configfile!\n") -#singularity #!/usr/bin/env python +#singularity if "Droplets_QC_GE" in STEPS or "Filtering_GE" in STEPS or "Norm_DimRed_Eval_GE" in STEPS or "Clust_Markers_Annot_GE" in STEPS or "Adding_ADT" in STEPS or "Int_Clust_Markers_Annot_GE" in STEPS or "Grp_Norm_DimRed_Eval_GE" in STEPS or "Grp_Clust_Markers_Annot_GE" in STEPS: SINGULARITY_ENV = PIPELINE_FOLDER + "/envs/singularity/single_cell.simg" if "Int_Norm_DimRed_Eval_GE" in STEPS : INT_SINGULARITY_ENV = PIPELINE_FOLDER + "/envs/singularity/single_cell_integration.simg" - - -onstart: - sys.stderr.write("\n############################################################# \n") - sys.stderr.write("\n\n\t Single-cell RNA-seq pipeline \n\n") - sys.stderr.write("\n############################################################# \n\n") - sys.stderr.write("***************** PARAMETERS ******************\n") - if "Alignment_countTable_GE" in STEPS: - sys.stderr.write("\n" + "Alignment_countTable_GE:" + "\n") - sys.stderr.write("SAMPLE(S):\n") - if isinstance(ALIGN_SAMPLE_NAME_GE_RAW, list): - for sample in ALIGN_SAMPLE_NAME_GE_RAW: - sys.stderr.write("\t" + str(sample) + "\n") - else: - sys.stderr.write("\t" + str(ALIGN_SAMPLE_NAME_GE_RAW) + "\n") - - if "Alignment_countTable_ADT" in STEPS: - sys.stderr.write("\n" + "Alignment_countTable_ADT:" + "\n") - sys.stderr.write("SAMPLE(S):\n") - if isinstance(ALIGN_SAMPLE_NAME_ADT_RAW, list): - for sample in ALIGN_SAMPLE_NAME_ADT_RAW: - sys.stderr.write("\t" + str(sample) + "\n") - else: - sys.stderr.write("\t" + str(ALIGN_SAMPLE_NAME_ADT_RAW) + "\n") - - if "Alignment_annotations_TCR_BCR" in STEPS: - sys.stderr.write("\n" + "Alignment_annotations_TCR_BCR:" + "\n") - sys.stderr.write("SAMPLE(S):\n") - if isinstance(ALIGN_SAMPLE_NAME_TCR_BCR_RAW, list): - for sample in ALIGN_SAMPLE_NAME_TCR_BCR_RAW: - sys.stderr.write("\t" + str(sample) + "\n") - else: - sys.stderr.write("\t" + str(ALIGN_SAMPLE_NAME_TCR_BCR_RAW) + "\n") - - if "Droplets_QC_GE" in STEPS: - sys.stderr.write("\n" + "Droplets_QC_GE:" + "\n") - sys.stderr.write("SAMPLE(S):\n") - if isinstance(QC_SAMPLE_NAME_GE, list): - for sample in QC_SAMPLE_NAME_GE: - sys.stderr.write("\t" + str(sample) + "\n") - else: - sys.stderr.write("\t" + str(QC_SAMPLE_NAME_GE) + "\n") - - if "Filtering_GE" in STEPS: - sys.stderr.write("\n" + "Filtering_GE:" + "\n") - sys.stderr.write("SAMPLE(S):\n") - if isinstance(FILERING_SAMPLE_NAME_GE, list): - for sample in FILERING_SAMPLE_NAME_GE: - sys.stderr.write("\t" + str(sample) + "\n") - else: - sys.stderr.write("\t" + str(FILERING_SAMPLE_NAME_GE) + "\n") - - if "Norm_DimRed_Eval_GE" in STEPS: - sys.stderr.write("\n" + "Norm_DimRed_Eval_GE:" + "\n") - sys.stderr.write("SAMPLE(S):\n") - if isinstance(NDRE_SAMPLE_NAME_GE, list): - for sample in NDRE_SAMPLE_NAME_GE: - sys.stderr.write("\t" + str(sample) + "\n") - else: - sys.stderr.write("\t" + str(NDRE_SAMPLE_NAME_GE) + "\n") - if "Clust_Markers_Annot_GE" in STEPS: - sys.stderr.write("\n" + "Clust_Markers_Annot_GE:" + "\n") - sys.stderr.write("SAMPLE(S):\n") - if isinstance(CMA_SAMPLE_NAME_GE, list): - for sample in CMA_SAMPLE_NAME_GE: - sys.stderr.write("\t" + str(sample) + "\n") - else: - sys.stderr.write("\t" + str(CMA_SAMPLE_NAME_GE) + "\n") - if "Adding_ADT" in STEPS: - sys.stderr.write("\n" + "Adding_ADT:" + "\n") - sys.stderr.write("RDA FILE(S):\n") - if isinstance(ADD_ADT_INPUT_RDA_GE, list): - for sample in ADD_ADT_INPUT_RDA_GE: - sys.stderr.write("\t" + str(sample) + "\n") - else: - sys.stderr.write("\t" + str(ADD_ADT_INPUT_RDA_GE) + "\n") - if "Adding_TCR" in STEPS: - sys.stderr.write("\n" + "Adding_TCR:" + "\n") - sys.stderr.write("RDA FILE(S):\n") - if isinstance(ADD_TCR_INPUT_RDA_GE, list): - for sample in ADD_TCR_INPUT_RDA_GE: - sys.stderr.write("\t" + str(sample) + "\n") - else: - sys.stderr.write("\t" + str(ADD_TCR_INPUT_RDA_GE) + "\n") - if "Adding_BCR" in STEPS: - sys.stderr.write("\n" + "Adding_BCR:" + "\n") - sys.stderr.write("RDA FILE(S):\n") - if isinstance(ADD_BCR_INPUT_RDA_GE, list): - for sample in ADD_BCR_INPUT_RDA_GE: - sys.stderr.write("\t" + str(sample) + "\n") - else: - sys.stderr.write("\t" + str(ADD_BCR_INPUT_RDA_GE) + "\n") - if "Cerebro" in STEPS: - sys.stderr.write("\n" + "Cerebro:" + "\n") - sys.stderr.write("RDA FILE(S):\n") - if isinstance(CEREBRO_INPUT_RDA, list): - for sample in CEREBRO_INPUT_RDA: - sys.stderr.write("\t" + str(sample) + "\n") - else: - sys.stderr.write("\t" + str(CEREBRO_INPUT_RDA) + "\n") - - sys.stderr.write("\n***************** RUN ******************\n") - return [] - +if "Alignment_annotations_TCR_BCR" in STEPS or "Adding_TCR" in STEPS or "Adding_BCR" in STEPS or "Int_Adding_TCR" in STEPS or "Int_Adding_BCR" in STEPS or "Grp_Adding_TCR" in STEPS or "Grp_Adding_BCR" in STEPS: + SINGULARITY_ENV_TCR_BCR = PIPELINE_FOLDER + "/envs/singularity/single_cell_TCR_BCR.simg" ### rule all ################################################################################################################################### +sys.stderr.write("\n########################### Run ############################\n\n") + include: "rules/Rule_all.smk" rule all: input: @@ -940,8 +1020,26 @@ if "Int_Norm_DimRed_Eval_GE" in STEPS: if "Int_Clust_Markers_Annot_GE" in STEPS: include: "rules/Int_Clust_Markers_Annot_GE.smk" +if "Int_Adding_ADT" in STEPS: + include: "rules/Int_Adding_ADT.smk" + +if "Int_Adding_TCR" in STEPS: + include: "rules/Int_Adding_TCR.smk" + +if "Int_Adding_BCR" in STEPS: + include: "rules/Int_Adding_BCR.smk" + if "Grp_Norm_DimRed_Eval_GE" in STEPS: include: "rules/Grp_Norm_DimRed_Eval_GE.smk" if "Grp_Clust_Markers_Annot_GE" in STEPS: include: "rules/Grp_Clust_Markers_Annot_GE.smk" + +if "Grp_Adding_ADT" in STEPS: + include: "rules/Grp_Adding_ADT.smk" + +if "Grp_Adding_TCR" in STEPS: + include: "rules/Grp_Adding_TCR.smk" + +if "Grp_Adding_BCR" in STEPS: + include: "rules/Grp_Adding_BCR.smk" diff --git a/rules/Grp_Adding_ADT.smk b/rules/Grp_Adding_ADT.smk new file mode 100644 index 0000000..0b9bf25 --- /dev/null +++ b/rules/Grp_Adding_ADT.smk @@ -0,0 +1,86 @@ +""" +########################################################################## +This rule add adt information to expression gene analysis in grouped single-cell RNA-seq. +########################################################################## +""" + +wildcard_constraints: + grp_add_adt_output = "|".join(GRP_ADD_ADT_OUTPUT) + +""" +This function allows to determine the input .rda ge file and kallisto adt folder. +""" +def grp_add_adt_input(wildcards): + sys.stderr.write(str(wildcards.grp_add_adt_output)+"\n") + ge_rda_file = dic_GRP_ADD_ADT_INFO[wildcards.grp_add_adt_output]['GRP_ADD_ADT_INPUT_RDA'] + kallisto_folder = list(dict.fromkeys(dic_GRP_ADD_ADT_INFO[wildcards.grp_add_adt_output]['GRP_ADD_ADT_INPUT_DIR_ADT'].split(","))) + kallisto_folder.insert(0,ge_rda_file) + return kallisto_folder + +""" +This function allows to determine the singularity binding parameters. +""" +def grp_add_adt_params_sing(wildcards): + rda_folder = os.path.dirname(dic_GRP_ADD_ADT_INFO[wildcards.grp_add_adt_output]['GRP_ADD_ADT_INPUT_RDA']) # output_folder too + concat = " -B " + PIPELINE_FOLDER + ":" + os.path.normpath("/WORKDIR/" + PIPELINE_FOLDER) + " -B " + rda_folder + ":" + os.path.normpath("/WORKDIR/" + rda_folder) + for kallisto_folder in list(dict.fromkeys(dic_GRP_ADD_ADT_INFO[wildcards.grp_add_adt_output]['GRP_ADD_ADT_INPUT_DIR_ADT'].split(","))): + kallisto_folder = os.path.dirname(kallisto_folder) + concat = concat + " -B " + kallisto_folder + ":" + os.path.normpath("/WORKDIR/" + kallisto_folder) + return concat + +""" +This function allows to determine the input alignment folder for params section. +""" +def grp_add_adt_params_input_folder(wildcards): + return ",".join([ os.path.normpath("/WORKDIR/" + kallisto_folder + "/") for kallisto_folder in list(dict.fromkeys(dic_GRP_ADD_ADT_INFO[wildcards.grp_add_adt_output]['GRP_ADD_ADT_INPUT_DIR_ADT'].split(","))) ]) + +""" +This function allows to determine the output folder for params (os.path.dirname() not allowed in params slot). +""" +def grp_add_adt_params_output_folder(wildcards): + return os.path.normpath("/WORKDIR/" + os.path.dirname(wildcards.grp_add_adt_output)) + "/" + +""" +This function allows to determine the sample.name.adt for params. +""" +def grp_add_adt_params_sample_name_adt(wildcards): + return dic_GRP_ADD_ADT_INFO[wildcards.grp_add_adt_output]['GRP_ADD_ADT_SAMPLE_NAME_ADT'] + + +""" +This rule launches the R script to add adt information to expression gene analysis. +""" +rule grp_add_adt_ge: + input: + grp_add_adt_file = grp_add_adt_input + output: + grp_add_adt_rda_file = "{grp_add_adt_output}" + "_ADT.rda" + params: + sing_bind = grp_add_adt_params_sing, + pipeline_folder = os.path.normpath("/WORKDIR/" + PIPELINE_FOLDER), + input_rda = lambda wildcards, input: os.path.normpath("/WORKDIR/" + input[0]), + kallisto_folder = grp_add_adt_params_input_folder, + output_folder = grp_add_adt_params_output_folder, + sample_name_adt = grp_add_adt_params_sample_name_adt + threads: + 1 + resources: + mem_mb = lambda wildcards, attempt: min(5120 + attempt * 3072, 20480), + time_min = lambda wildcards, attempt: min(attempt * 120, 200) + shell: + """ + singularity exec --no-home {params.sing_bind} \ + {SINGULARITY_ENV} \ + Rscript {params.pipeline_folder}/scripts/Int_Grp_pipeline_ADT.R \ + --samples.name.adt {params.sample_name_adt} \ + --input.rda.ge {params.input_rda} \ + --output.dir {params.output_folder} \ + --input.dirs.adt {params.kallisto_folder} \ + --author.name {GRP_ADD_ADT_AUTHOR_NAME} \ + --author.mail {GRP_ADD_ADT_AUTHOR_MAIL} \ + --nthreads {threads} \ + --pipeline.path {params.pipeline_folder} \ + --gene.names {GRP_ADD_ADT_GENE_NAMES} \ + --ADT.min.cutoff {GRP_ADD_ADT_MIN_CUTOFF} \ + --ADT.max.cutoff {GRP_ADD_ADT_MAX_CUTOFF} + """ diff --git a/rules/Grp_Adding_BCR.smk b/rules/Grp_Adding_BCR.smk new file mode 100644 index 0000000..0d3506b --- /dev/null +++ b/rules/Grp_Adding_BCR.smk @@ -0,0 +1,71 @@ +""" +########################################################################## +This rule add bcr information to expression gene analysis in single-cell RNA-seq. +########################################################################## +""" +wildcard_constraints: + grp_add_bcr_output = "|".join(GRP_ADD_BCR_OUTPUT) + +""" +This function allows to determine the input .rda file and csv file from cellranger vdj. +""" +def grp_add_bcr_input(wildcards): + rda_file = dic_GRP_ADD_BCR_INFO[wildcards.grp_add_bcr_output]['GRP_ADD_BCR_INPUT_RDA'] + csv_file = list(dict.fromkeys(dic_GRP_ADD_BCR_INFO[wildcards.grp_add_bcr_output]['GRP_ADD_BCR_INPUT_CSV_BCR'].split(","))) + csv_file.insert(0, rda_file) + return csv_file + +""" +This function allows to determine the singularity binding parameters. +""" +def grp_add_bcr_params_sing(wildcards): + rda_folder = os.path.dirname(dic_GRP_ADD_BCR_INFO[wildcards.grp_add_bcr_output]['GRP_ADD_BCR_INPUT_RDA']) # output_folder too + concat = " -B " + PIPELINE_FOLDER + ":" + os.path.normpath("/WORKDIR/" + PIPELINE_FOLDER) + " -B " + rda_folder + ":" + os.path.normpath("/WORKDIR/" + rda_folder) + for bcrfile in list(dict.fromkeys(dic_GRP_ADD_BCR_INFO[wildcards.grp_add_bcr_output]['GRP_ADD_BCR_INPUT_CSV_BCR'].split(","))): + bcrfile = os.path.dirname(bcrfile) + concat = concat + " -B " + bcrfile + ":" + os.path.normpath("/WORKDIR/" + bcrfile) + return concat + +""" +This function allows to determine the bcr files folders for params. +""" +def grp_add_bcr_params_bcr_files(wildcards): + return ",".join([ os.path.normpath("/WORKDIR/" + bcrfile) for bcrfile in list(dict.fromkeys(dic_GRP_ADD_BCR_INFO[wildcards.grp_add_bcr_output]['GRP_ADD_BCR_INPUT_CSV_BCR'].split(","))) ]) + +""" +This function allows to determine the output folder for params (os.path.dirname() not allowed in params slot). +""" +def grp_add_bcr_params_output_folder(wildcards): + return os.path.normpath("/WORKDIR/" + os.path.dirname(wildcards.grp_add_bcr_output)) + "/" + +""" +This rule launches the R script to add adt information to expression gene analysis. +""" +rule grp_add_bcr_ge: + input: + grp_add_bcr_file = grp_add_bcr_input + output: + grp_add_bcr_rda_file = "{grp_add_bcr_output}" + "_BCR.rda" + params: + sing_bind = grp_add_bcr_params_sing, + pipeline_folder = os.path.normpath("/WORKDIR/" + PIPELINE_FOLDER), + input_rda = lambda wildcards, input: os.path.normpath("/WORKDIR/" + input[0]), + input_csv = grp_add_bcr_params_bcr_files, + output_folder = grp_add_bcr_params_output_folder + threads: + 1 + resources: + mem_mb = lambda wildcards, attempt: min(5120 + attempt * 3072, 20480), + time_min = lambda wildcards, attempt: min(attempt * 120, 200) + shell: + """ + singularity exec --no-home {params.sing_bind} \ + {SINGULARITY_ENV_TCR_BCR} \ + Rscript {params.pipeline_folder}/scripts/Int_Grp_pipeline_BCR.R \ + --input.rda {params.input_rda} \ + --output.dir {params.output_folder} \ + --vdj.input.files.bcr {params.input_csv} \ + --author.name {GRP_ADD_BCR_AUTHOR_NAME} \ + --author.mail {GRP_ADD_BCR_AUTHOR_MAIL} \ + --pipeline.path {params.pipeline_folder} + """ diff --git a/rules/Grp_Adding_TCR.smk b/rules/Grp_Adding_TCR.smk new file mode 100644 index 0000000..f7cf592 --- /dev/null +++ b/rules/Grp_Adding_TCR.smk @@ -0,0 +1,71 @@ +""" +########################################################################## +This rule add tcr information to expression gene analysis in single-cell RNA-seq. +########################################################################## +""" +wildcard_constraints: + grp_add_tcr_output = "|".join(GRP_ADD_TCR_OUTPUT) + +""" +This function allows to determine the input .rda file and csv file from cellranger vdj. +""" +def grp_add_tcr_input(wildcards): + rda_file = dic_GRP_ADD_TCR_INFO[wildcards.grp_add_tcr_output]['GRP_ADD_TCR_INPUT_RDA'] + csv_file = list(dict.fromkeys(dic_GRP_ADD_TCR_INFO[wildcards.grp_add_tcr_output]['GRP_ADD_TCR_INPUT_CSV_TCR'].split(","))) + csv_file.insert(0, rda_file) + return csv_file + +""" +This function allows to determine the singularity binding parameters. +""" +def grp_add_tcr_params_sing(wildcards): + rda_folder = os.path.dirname(dic_GRP_ADD_TCR_INFO[wildcards.grp_add_tcr_output]['GRP_ADD_TCR_INPUT_RDA']) # output_folder too + concat = " -B " + PIPELINE_FOLDER + ":" + os.path.normpath("/WORKDIR/" + PIPELINE_FOLDER) + " -B " + rda_folder + ":" + os.path.normpath("/WORKDIR/" + rda_folder) + for tcrfile in list(dict.fromkeys(dic_GRP_ADD_TCR_INFO[wildcards.grp_add_tcr_output]['GRP_ADD_TCR_INPUT_CSV_TCR'].split(","))): + tcrfile = os.path.dirname(tcrfile) + concat = concat + " -B " + tcrfile + ":" + os.path.normpath("/WORKDIR/" + tcrfile) + return concat + +""" +This function allows to determine the tcr files folders for params. +""" +def grp_add_tcr_params_tcr_files(wildcards): + return ",".join([ os.path.normpath("/WORKDIR/" + tcrfile) for tcrfile in list(dict.fromkeys(dic_GRP_ADD_TCR_INFO[wildcards.grp_add_tcr_output]['GRP_ADD_TCR_INPUT_CSV_TCR'].split(","))) ]) + +""" +This function allows to determine the output folder for params (os.path.dirname() not allowed in params slot). +""" +def grp_add_tcr_params_output_folder(wildcards): + return os.path.normpath("/WORKDIR/" + os.path.dirname(wildcards.grp_add_tcr_output)) + "/" + +""" +This rule launches the R script to add adt information to expression gene analysis. +""" +rule grp_add_tcr_ge: + input: + grp_add_tcr_file = grp_add_tcr_input + output: + grp_add_tcr_rda_file = "{grp_add_tcr_output}" + "_TCR.rda" + params: + sing_bind = grp_add_tcr_params_sing, + pipeline_folder = os.path.normpath("/WORKDIR/" + PIPELINE_FOLDER), + input_rda = lambda wildcards, input: os.path.normpath("/WORKDIR/" + input[0]), + input_csv = grp_add_tcr_params_tcr_files, + output_folder = grp_add_tcr_params_output_folder + threads: + 1 + resources: + mem_mb = lambda wildcards, attempt: min(5120 + attempt * 3072, 20480), + time_min = lambda wildcards, attempt: min(attempt * 120, 200) + shell: + """ + singularity exec --no-home {params.sing_bind} \ + {SINGULARITY_ENV_TCR_BCR} \ + Rscript {params.pipeline_folder}/scripts/Int_Grp_pipeline_TCR.R \ + --input.rda {params.input_rda} \ + --output.dir {params.output_folder} \ + --vdj.input.files.tcr {params.input_csv} \ + --author.name {GRP_ADD_TCR_AUTHOR_NAME} \ + --author.mail {GRP_ADD_TCR_AUTHOR_MAIL} \ + --pipeline.path {params.pipeline_folder} + """ diff --git a/rules/Int_Adding_ADT.smk b/rules/Int_Adding_ADT.smk new file mode 100644 index 0000000..59c7810 --- /dev/null +++ b/rules/Int_Adding_ADT.smk @@ -0,0 +1,84 @@ +""" +########################################################################## +This rule add adt information to expression gene analysis in integrated single-cell RNA-seq. +########################################################################## +""" +wildcard_constraints: + int_add_adt_output = "|".join(INT_ADD_ADT_OUTPUT) + +""" +This function allows to determine the input .rda ge file and kallisto adt folder. +""" +def int_add_adt_input(wildcards): + ge_rda_file = dic_INT_ADD_ADT_INFO[wildcards.int_add_adt_output]['INT_ADD_ADT_INPUT_RDA'] + kallisto_folder = list(dict.fromkeys(dic_INT_ADD_ADT_INFO[wildcards.int_add_adt_output]['INT_ADD_ADT_INPUT_DIR_ADT'].split(","))) + kallisto_folder.insert(0,ge_rda_file) + return kallisto_folder + +""" +This function allows to determine the singularity binding parameters. +""" +def int_add_adt_params_sing(wildcards): + rda_folder = os.path.dirname(dic_INT_ADD_ADT_INFO[wildcards.int_add_adt_output]['INT_ADD_ADT_INPUT_RDA']) # output_folder too + concat = " -B " + PIPELINE_FOLDER + ":" + os.path.normpath("/WORKDIR/" + PIPELINE_FOLDER) + " -B " + rda_folder + ":" + os.path.normpath("/WORKDIR/" + rda_folder) + for kallisto_folder in list(dict.fromkeys(dic_INT_ADD_ADT_INFO[wildcards.int_add_adt_output]['INT_ADD_ADT_INPUT_DIR_ADT'].split(","))): + kallisto_folder = os.path.dirname(kallisto_folder) + concat = concat + " -B " + kallisto_folder + ":" + os.path.normpath("/WORKDIR/" + kallisto_folder) + return concat + +""" +This function allows to determine the input alignment folder for params section. +""" +def int_add_adt_params_input_folder(wildcards): + return ",".join([ os.path.normpath("/WORKDIR/" + kallisto_folder + "/") for kallisto_folder in list(dict.fromkeys(dic_INT_ADD_ADT_INFO[wildcards.int_add_adt_output]['INT_ADD_ADT_INPUT_DIR_ADT'].split(","))) ]) + +""" +This function allows to determine the output folder for params (os.path.dirname() not allowed in params slot). +""" +def int_add_adt_params_output_folder(wildcards): + return os.path.normpath("/WORKDIR/" + os.path.dirname(wildcards.int_add_adt_output)) + "/" + +""" +This function allows to determine the sample.name.adt for params. +""" +def int_add_adt_params_sample_name_adt(wildcards): + return dic_INT_ADD_ADT_INFO[wildcards.int_add_adt_output]['INT_ADD_ADT_SAMPLE_NAME_ADT'] + + +""" +This rule launches the R script to add adt information to expression gene analysis. +""" +rule int_add_adt_ge: + input: + int_add_adt_file = int_add_adt_input + output: + int_add_adt_rda_file = "{int_add_adt_output}" + "_ADT.rda" + params: + sing_bind = int_add_adt_params_sing, + pipeline_folder = os.path.normpath("/WORKDIR/" + PIPELINE_FOLDER), + input_rda = lambda wildcards, input: os.path.normpath("/WORKDIR/" + input[0]), + kallisto_folder = int_add_adt_params_input_folder, + output_folder = int_add_adt_params_output_folder, + sample_name_adt = int_add_adt_params_sample_name_adt + threads: + 1 + resources: + mem_mb = lambda wildcards, attempt: min(5120 + attempt * 3072, 20480), + time_min = lambda wildcards, attempt: min(attempt * 120, 200) + shell: + """ + singularity exec --no-home {params.sing_bind} \ + {SINGULARITY_ENV} \ + Rscript {params.pipeline_folder}/scripts/Int_Grp_pipeline_ADT.R \ + --samples.name.adt {params.sample_name_adt} \ + --input.rda.ge {params.input_rda} \ + --output.dir {params.output_folder} \ + --input.dirs.adt {params.kallisto_folder} \ + --author.name {INT_ADD_ADT_AUTHOR_NAME} \ + --author.mail {INT_ADD_ADT_AUTHOR_MAIL} \ + --nthreads {threads} \ + --pipeline.path {params.pipeline_folder} \ + --gene.names {INT_ADD_ADT_GENE_NAMES} \ + --ADT.min.cutoff {INT_ADD_ADT_MIN_CUTOFF} \ + --ADT.max.cutoff {INT_ADD_ADT_MAX_CUTOFF} + """ diff --git a/rules/Int_Adding_BCR.smk b/rules/Int_Adding_BCR.smk new file mode 100644 index 0000000..c1680fd --- /dev/null +++ b/rules/Int_Adding_BCR.smk @@ -0,0 +1,71 @@ +""" +########################################################################## +This rule add bcr information to expression gene analysis in single-cell RNA-seq. +########################################################################## +""" +wildcard_constraints: + int_add_bcr_output = "|".join(INT_ADD_BCR_OUTPUT) + +""" +This function allows to determine the input .rda file and csv file from cellranger vdj. +""" +def int_add_bcr_input(wildcards): + rda_file = dic_INT_ADD_BCR_INFO[wildcards.int_add_bcr_output]['INT_ADD_BCR_INPUT_RDA'] + csv_file = list(dict.fromkeys(dic_INT_ADD_BCR_INFO[wildcards.int_add_bcr_output]['INT_ADD_BCR_INPUT_CSV_BCR'].split(","))) + csv_file.insert(0, rda_file) + return csv_file + +""" +This function allows to determine the singularity binding parameters. +""" +def int_add_bcr_params_sing(wildcards): + rda_folder = os.path.dirname(dic_INT_ADD_BCR_INFO[wildcards.int_add_bcr_output]['INT_ADD_BCR_INPUT_RDA']) # output_folder too + concat = " -B " + PIPELINE_FOLDER + ":" + os.path.normpath("/WORKDIR/" + PIPELINE_FOLDER) + " -B " + rda_folder + ":" + os.path.normpath("/WORKDIR/" + rda_folder) + for bcrfile in list(dict.fromkeys(dic_INT_ADD_BCR_INFO[wildcards.int_add_bcr_output]['INT_ADD_BCR_INPUT_CSV_BCR'].split(","))): + bcrfile = os.path.dirname(bcrfile) + concat = concat + " -B " + bcrfile + ":" + os.path.normpath("/WORKDIR/" + bcrfile) + return concat + +""" +This function allows to determine the bcr files folders for params. +""" +def int_add_bcr_params_bcr_files(wildcards): + return ",".join([ os.path.normpath("/WORKDIR/" + bcrfile) for bcrfile in list(dict.fromkeys(dic_INT_ADD_BCR_INFO[wildcards.int_add_bcr_output]['INT_ADD_BCR_INPUT_CSV_BCR'].split(","))) ]) + +""" +This function allows to determine the output folder for params (os.path.dirname() not allowed in params slot). +""" +def int_add_bcr_params_output_folder(wildcards): + return os.path.normpath("/WORKDIR/" + os.path.dirname(wildcards.int_add_bcr_output)) + "/" + +""" +This rule launches the R script to add adt information to expression gene analysis. +""" +rule int_add_bcr_ge: + input: + int_add_bcr_file = int_add_bcr_input + output: + int_add_bcr_rda_file = "{int_add_bcr_output}" + "_BCR.rda" + params: + sing_bind = int_add_bcr_params_sing, + pipeline_folder = os.path.normpath("/WORKDIR/" + PIPELINE_FOLDER), + input_rda = lambda wildcards, input: os.path.normpath("/WORKDIR/" + input[0]), + input_csv = int_add_bcr_params_bcr_files, + output_folder = int_add_bcr_params_output_folder + threads: + 1 + resources: + mem_mb = lambda wildcards, attempt: min(5120 + attempt * 3072, 20480), + time_min = lambda wildcards, attempt: min(attempt * 120, 200) + shell: + """ + singularity exec --no-home {params.sing_bind} \ + {SINGULARITY_ENV_TCR_BCR} \ + Rscript {params.pipeline_folder}/scripts/Int_Grp_pipeline_BCR.R \ + --input.rda {params.input_rda} \ + --output.dir {params.output_folder} \ + --vdj.input.files.bcr {params.input_csv} \ + --author.name {INT_ADD_BCR_AUTHOR_NAME} \ + --author.mail {INT_ADD_BCR_AUTHOR_MAIL} \ + --pipeline.path {params.pipeline_folder} + """ diff --git a/rules/Int_Adding_TCR.smk b/rules/Int_Adding_TCR.smk new file mode 100644 index 0000000..cfbab3a --- /dev/null +++ b/rules/Int_Adding_TCR.smk @@ -0,0 +1,71 @@ +""" +########################################################################## +This rule add tcr information to expression gene analysis in single-cell RNA-seq. +########################################################################## +""" +wildcard_constraints: + int_add_tcr_output = "|".join(INT_ADD_TCR_OUTPUT) + +""" +This function allows to determine the input .rda file and csv file from cellranger vdj. +""" +def int_add_tcr_input(wildcards): + rda_file = dic_INT_ADD_TCR_INFO[wildcards.int_add_tcr_output]['INT_ADD_TCR_INPUT_RDA'] + csv_file = list(dict.fromkeys(dic_INT_ADD_TCR_INFO[wildcards.int_add_tcr_output]['INT_ADD_TCR_INPUT_CSV_TCR'].split(","))) + csv_file.insert(0, rda_file) + return csv_file + +""" +This function allows to determine the singularity binding parameters. +""" +def int_add_tcr_params_sing(wildcards): + rda_folder = os.path.dirname(dic_INT_ADD_TCR_INFO[wildcards.int_add_tcr_output]['INT_ADD_TCR_INPUT_RDA']) # output_folder too + concat = " -B " + PIPELINE_FOLDER + ":" + os.path.normpath("/WORKDIR/" + PIPELINE_FOLDER) + " -B " + rda_folder + ":" + os.path.normpath("/WORKDIR/" + rda_folder) + for tcrfile in list(dict.fromkeys(dic_INT_ADD_TCR_INFO[wildcards.int_add_tcr_output]['INT_ADD_TCR_INPUT_CSV_TCR'].split(","))): + tcrfile = os.path.dirname(tcrfile) + concat = concat + " -B " + tcrfile + ":" + os.path.normpath("/WORKDIR/" + tcrfile) + return concat + +""" +This function allows to determine the tcr files folders for params. +""" +def int_add_tcr_params_tcr_files(wildcards): + return ",".join([ os.path.normpath("/WORKDIR/" + tcrfile) for tcrfile in list(dict.fromkeys(dic_INT_ADD_TCR_INFO[wildcards.int_add_tcr_output]['INT_ADD_TCR_INPUT_CSV_TCR'].split(","))) ]) + +""" +This function allows to determine the output folder for params (os.path.dirname() not allowed in params slot). +""" +def int_add_tcr_params_output_folder(wildcards): + return os.path.normpath("/WORKDIR/" + os.path.dirname(wildcards.int_add_tcr_output)) + "/" + +""" +This rule launches the R script to add adt information to expression gene analysis. +""" +rule int_add_tcr_ge: + input: + int_add_tcr_file = int_add_tcr_input + output: + int_add_tcr_rda_file = "{int_add_tcr_output}" + "_TCR.rda" + params: + sing_bind = int_add_tcr_params_sing, + pipeline_folder = os.path.normpath("/WORKDIR/" + PIPELINE_FOLDER), + input_rda = lambda wildcards, input: os.path.normpath("/WORKDIR/" + input[0]), + input_csv = int_add_tcr_params_tcr_files, + output_folder = int_add_tcr_params_output_folder + threads: + 1 + resources: + mem_mb = lambda wildcards, attempt: min(5120 + attempt * 3072, 20480), + time_min = lambda wildcards, attempt: min(attempt * 120, 200) + shell: + """ + singularity exec --no-home {params.sing_bind} \ + {SINGULARITY_ENV_TCR_BCR} \ + Rscript {params.pipeline_folder}/scripts/Int_Grp_pipeline_TCR.R \ + --input.rda {params.input_rda} \ + --output.dir {params.output_folder} \ + --vdj.input.files.tcr {params.input_csv} \ + --author.name {INT_ADD_TCR_AUTHOR_NAME} \ + --author.mail {INT_ADD_TCR_AUTHOR_MAIL} \ + --pipeline.path {params.pipeline_folder} + """ diff --git a/rules/Rule_all.smk b/rules/Rule_all.smk index e78859b..e6f8d31 100644 --- a/rules/Rule_all.smk +++ b/rules/Rule_all.smk @@ -78,7 +78,7 @@ def get_targets(STEPS): expand("{add_tcr_output}_TCR.rda", add_tcr_output = ADD_TCR_OUTPUT) ] if "Adding_BCR" in STEPS: - targets["Adding_BCT"]=[ + targets["Adding_BCR"]=[ expand("{add_bcr_output}_BCR.rda", add_bcr_output = ADD_BCR_OUTPUT) ] if "Cerebro" in STEPS: @@ -97,6 +97,18 @@ def get_targets(STEPS): targets["Int_Clust_Markers_Annot_GE"]=[ expand(os.path.normpath("{output_int_clust_markers_annot_dir_ge}" + "/" + INT_CMA_CLUST_FOLDER + "/" + "{name_int}" + "{int_complement}" + "_" + str(INT_CMA_KEEP_DIM) + "_" + str(INT_CMA_KEEP_RES) + ".rda"), zip, output_int_clust_markers_annot_dir_ge = INT_CMA_OUTPUT_DIR_GE, name_int = INT_CMA_NAME_INT, int_complement = INT_CMA_COMPLEMENT) ] + if "Int_Adding_ADT" in STEPS: + targets["Int_Adding_ADT"]=[ + expand("{int_add_adt_output}_ADT.rda", int_add_adt_output = INT_ADD_ADT_OUTPUT) + ] + if "Int_Adding_TCR" in STEPS: + targets["Int_Adding_TCR"]=[ + expand("{int_add_tcr_output}_TCR.rda", int_add_tcr_output = INT_ADD_TCR_OUTPUT) + ] + if "Int_Adding_BCR" in STEPS: + targets["Int_Adding_BCR"]=[ + expand("{int_add_bcr_output}_BCR.rda", int_add_bcr_output = INT_ADD_BCR_OUTPUT) + ] if "Grp_Norm_DimRed_Eval_GE" in STEPS: targets["Grp_Norm_DimRed_Eval_GE"]=[ expand(os.path.normpath("{output_grp_norm_dimred_dir_ge}" + "/GROUPED_ANALYSIS/NO_INTEGRATED/{name_grp}/" + GRP_NDRE_NORM_VTR + "/" + GRP_NDRE_DIMRED_VTR + "/" + "{name_grp}_" + GRP_NDRE_NORM_VTR + "_" + GRP_NDRE_DIMRED_VTR + ".rda"), zip, output_grp_norm_dimred_dir_ge=GRP_NDRE_OUTPUT_DIR_GE, name_grp=GRP_NDRE_NAME_GRP) @@ -105,4 +117,21 @@ def get_targets(STEPS): targets["Grp_Clust_Markers_Annot_GE"]=[ expand(os.path.normpath("{output_grp_clust_markers_annot_dir_ge}" + "/" + GRP_CMA_CLUST_FOLDER + "/" + "{name_grp}" + "{grp_complement}" + "_" + str(GRP_CMA_KEEP_DIM) + "_" + str(GRP_CMA_KEEP_RES) + ".rda"), zip, output_grp_clust_markers_annot_dir_ge = GRP_CMA_OUTPUT_DIR_GE, name_grp = GRP_CMA_NAME_GRP, grp_complement = GRP_CMA_COMPLEMENT) ] + if "Grp_Adding_ADT" in STEPS: + targets["Grp_Adding_ADT"]=[ + expand("{grp_add_adt_output}_ADT.rda", grp_add_adt_output = GRP_ADD_ADT_OUTPUT) + ] + if "Grp_Adding_TCR" in STEPS: + targets["Grp_Adding_TCR"]=[ + expand("{grp_add_tcr_output}_TCR.rda", grp_add_tcr_output = GRP_ADD_TCR_OUTPUT) + ] + if "Grp_Adding_BCR" in STEPS: + targets["Grp_Adding_BCR"]=[ + expand("{grp_add_bcr_output}_BCR.rda", grp_add_bcr_output = GRP_ADD_BCR_OUTPUT) + ] + for s in targets["Grp_Adding_ADT"]: + sys.stderr.write(str(s)+"\n") + for s in GRP_ADD_ADT_OUTPUT: + sys.stderr.write(str(s)+"\n") + return targets diff --git a/scripts/.Rhistory b/scripts/.Rhistory index 5ae370e..d74c4ef 100644 --- a/scripts/.Rhistory +++ b/scripts/.Rhistory @@ -12,3 +12,21 @@ x <- ceiling(sqrt(n)) y <- ceiling(n / x) print(paste0("X : ", x)) print(paste0("Y : ", y)) +samples.name.GE=c("A_GE","B_GE","C_GE") +samples.name <- sub("_GE","",samples.name.GE) +samples.name <- sub("_GE","",samples.name.GE) +samples.name +scmat <- BUSpaRse::read_count_output(dir = "~/home/m_aglave/Bureau/test_int_grp/0732M_ADT/KALLISTOBUS/" +, tcc = FALSE) +scmat <- BUSpaRse::read_count_output(dir = "~/home/m_aglave/Bureau/test_int_grp/0732M_ADT/KALLISTOBUS/"scmat <- BUSpaRse::read_count_output(dir = "~/home/m_aglave/Bureau/test_int_grp/0732M_ADT/KALLISTOBUS/" +scmat <- BUSpaRse::read_count_output(dir = "/home/m_aglave/Bureau/test_int_grp/0732M_ADT/KALLISTOBUS/", tcc = FALSE) +scmat <- BUSpaRse::read_count_output(dir = "/home/m_aglave/Bureau/test_int_grp/0732M_ADT/KALLISTOBUS/", name = "0732M_ADT", tcc = FALSE) +DropletUtils::write10xCounts(path="/home/m_aglave/Bureau/test_int_grp/761_JMG_ADT/KALLISTOBUS/",scmat) +DropletUtils::write10xCounts(path="/home/m_aglave/Bureau/test_int_grp/761_JMG_ADT/KALLISTOBUS/",scmat) +DropletUtils::write10xCounts(path="/home/m_aglave/Bureau/test_int_grp/761_JMG_ADT/KALLISTOBUS/",scmat) +DropletUtils::write10xCounts(path="/home/m_aglave/Bureau/test_int_grp/761_JMG_ADT/KALLISTOBUS/",scmat) +DropletUtils::write10xCounts(path="/home/m_aglave/Bureau/test_int_grp/761_JMG_ADT/",scmat) +DropletUtils::write10xCounts(path="/home/m_aglave/Bureau/test_int_grp/761JMG_ADT/",scmat) +DropletUtils::write10xCounts(path="/home/m_aglave/Bureau/test_int_grp/761JMG_ADT/KALLISTOBUS",scmat) +DropletUtils::write10xCounts(path="/home/m_aglave/Bureau/test_int_grp/761JMG_ADT/KALLISTOBUS",scmat) +dim(scmat) diff --git a/scripts/Grouped_analysis_part1.R b/scripts/Grouped_analysis_part1.R index 8e21663..50f6352 100644 --- a/scripts/Grouped_analysis_part1.R +++ b/scripts/Grouped_analysis_part1.R @@ -91,7 +91,7 @@ if (!is.null(args$options$yaml)){ rm(option_list,parser,args) #### Get path if snakemake/singularity/local #### -if(is.null(pipeline.path)) stop("--pipeline.path parameter must be set!") +if(is.null(pipeline.path)) stop("pipeline.path parameter must be set!") #### Check non-optional parameters #### if (is.null(input.list.rda[1])) stop("input.list.rda parameter can't be empty!") @@ -189,7 +189,7 @@ if(keep.norm) { #keep normalisation norm.method <- unique(n.meth) }else assay <- 'RNA' # redo normalisation -### Add prefix for colnames of sample clustering and clean TCR/BCR +### Add prefix for colnames of sample clustering and clean ADT/TCR/BCR for (i in names(sobj.list)){ # add prefix for colnames of sample clustering to_rename=grep("_res\\.",colnames(sobj.list[[i]]@meta.data), value = TRUE) @@ -197,8 +197,8 @@ for (i in names(sobj.list)){ sobj.list[[i]]@meta.data[[paste0(i,'_',j)]]=sobj.list[[i]]@meta.data[[j]] sobj.list[[i]]@meta.data[[j]]=NULL } - # cleaning sobj for TCR and BCR part - TCR_BCR_col=grep("^TCR|^BCR", colnames(sobj.list[[i]]@meta.data), value = TRUE) + # cleaning sobj for ADT, TCR and BCR part + TCR_BCR_col=grep("^ADT|^TCR|^BCR", colnames(sobj.list[[i]]@meta.data), value = TRUE) if(length(TCR_BCR_col) > 0) sobj.list[[i]]@meta.data[TCR_BCR_col] <- NULL } @@ -281,25 +281,25 @@ if(!is.null(vtr)){ MM_tmp2 <- if(norm.method == 'SCTransform') paste0(" and regress out bias factors (",paste0(vtr, collapse = ", "),")") else NULL MM_tmp3 <- if(dimred.method == 'scbfa') paste0("Per-cell bias factors (including ", paste0(vtr, collapse = ", "),") were regressed out during the scBFA dimension reduction.") else NULL -}else { +}else{ MM_tmp2 <- NULL MM_tmp3 <- NULL } -sobj@misc$parameters$Materials_and_Methods$Grouped_analysis_Norm_DimRed_Eval <- paste0("Seurat (version ",sobj@misc$technical_info$Seurat,") was applied for further data processing. ", +MM_tmp4 <- paste0("Seurat (version ",sobj@misc$technical_info$Seurat,") was applied for further data processing. ") if(keep.norm){ - paste0("Each dataset was normalized independently by ",norm.method,", as described in the Individual Analysis section, then data were merged using the merge() function from Seurat and a common dimension reduction was performed by ",MM_tmp,".") + MM_tmp4 <- paste0(MM_tmp4, paste0("Each dataset was normalized independently by ",norm.method,", as described in the Individual Analysis section, then data were merged using the merge() function from Seurat and a common dimension reduction was performed by ",MM_tmp,".")) }else{ - paste0("Datasets were merged using the merge() function from Seurat and a common normalization and dimension reduction were performed. ") - if(norm.method == 'SCTransform') paste0("The SCTransform normalization method (Hafemeister C, Satija R. Normalization and variance stabilization of single-cell RNA-seq data using regularized negative binomial regression. Genome Biol. 2019;20 10.1186/s13059-019-1874-1.) was used to normalize, scale, select ",features.n," Highly Variable Genes", MM_tmp2,".") - if(norm.method == 'LogNormalize') paste0(features.n," Highly Variable Genes (HVG) were identified using the FindVariableFeatures() method from Seurat applied on data transformed by its LogNormalize method.") - if(dimred.method == 'pca'){ - if(norm.method == 'SCTransform') paste0(" Person residuals from this regression were used for dimension reduction by Principal Component Analysis (PCA).") - if(norm.method == 'LogNormalize') paste0(" HVG were scaled and and centered, providing person residuals used for dimension reduction by Principal Component Analysis (PCA).") - } -}, -if(dimred.method == 'scbfa') paste0(" As the scBFA dimension reduction method (version ",sobj@misc$technical_info$scBFA,") is meant to be applied on a subset of the count matrix, we followed the authors recommendation and applied it on the HVG. ", MM_tmp3), -"The number of ",MM_tmp," dimensions to keep for further analysis was evaluated by assessing a range of reduced ",MM_tmp," spaces using ",dims.min," to ",dims.max," dimensions, with a step of ",dims.steps,". For each generated ",MM_tmp," space, Louvain clustering of cells was performed using a range of values for the resolution parameter from ",res.min," to ",res.max," with a step of ",res.steps,". The optimal space was manually evaluated as the one combination of kept dimensions and clustering resolution resolving the best structure (clusters homogeneity and compacity) in a Uniform Manifold Approximation and Projection space (UMAP). Additionaly, we used the clustree method (version ",sobj@misc$technical_info$clustree,") to assess if the selected optimal space corresponded to a relatively stable position in the clustering results tested for these dimensions / resolution combinations." -) + paste0("Datasets were merged using the merge() function from Seurat and a common normalization and dimension reduction were performed. ") + if(norm.method == 'SCTransform') MM_tmp4 <- paste0(MM_tmp4, paste0("The SCTransform normalization method (Hafemeister C, Satija R. Normalization and variance stabilization of single-cell RNA-seq data using regularized negative binomial regression. Genome Biol. 2019;20 10.1186/s13059-019-1874-1.) was used to normalize, scale, select ",features.n," Highly Variable Genes", MM_tmp2,".")) + if(norm.method == 'LogNormalize') MM_tmp4 <- paste0(MM_tmp4, paste0(features.n," Highly Variable Genes (HVG) were identified using the FindVariableFeatures() method from Seurat applied on data transformed by its LogNormalize method.")) + if(dimred.method == 'pca'){ + if(norm.method == 'SCTransform') MM_tmp4 <- paste0(MM_tmp4, paste0(" Person residuals from this regression were used for dimension reduction by Principal Component Analysis (PCA).")) + if(norm.method == 'LogNormalize') MM_tmp4 <- paste0(MM_tmp4, paste0(" HVG were scaled and and centered, providing person residuals used for dimension reduction by Principal Component Analysis (PCA).")) + } +} +if(dimred.method == 'scbfa') MM_tmp4 <- paste0(MM_tmp4, paste0(" As the scBFA dimension reduction method (version ",sobj@misc$technical_info$scBFA,") is meant to be applied on a subset of the count matrix, we followed the authors recommendation and applied it on the HVG. ", MM_tmp3)) +MM_tmp4 <- paste0(MM_tmp4, paste0("The number of ",MM_tmp," dimensions to keep for further analysis was evaluated by assessing a range of reduced ",MM_tmp," spaces using ",dims.min," to ",dims.max," dimensions, with a step of ",dims.steps,". For each generated ",MM_tmp," space, Louvain clustering of cells was performed using a range of values for the resolution parameter from ",res.min," to ",res.max," with a step of ",res.steps,". The optimal space was manually evaluated as the one combination of kept dimensions and clustering resolution resolving the best structure (clusters homogeneity and compacity) in a Uniform Manifold Approximation and Projection space (UMAP). Additionaly, we used the clustree method (version ",sobj@misc$technical_info$clustree,") to assess if the selected optimal space corresponded to a relatively stable position in the clustering results tested for these dimensions / resolution combinations.")) +sobj@misc$parameters$Materials_and_Methods$Grouped_analysis_Norm_DimRed_Eval <- MM_tmp4 sobj@misc$parameters$Materials_and_Methods$References_packages <- find_ref(MandM = sobj@misc$parameters$Materials_and_Methods, pipeline.path = pipeline.path) ### Saving reduced normalized object @@ -307,15 +307,16 @@ cat("\nSaving object...\n") sobj@misc$params$analysis_type <- paste0("Grouped analysis; Keep individual normalization: ", keep.norm) sobj@misc$params$sobj_creation$Rsession <- utils::capture.output(devtools::session_info()) sobj@misc$params$species <- species -sobj@misc$params$name.grp <- name.grp sobj@misc$params$group$keep.norm <- keep.norm +sobj@misc$params$name.grp <- name.grp +Seurat::Project(sobj) <- name.grp if (!is.null(author.name) && !tolower(author.name) %in% tolower(sobj@misc$params$author.name)) sobj@misc$params$author.name <- c(sobj@misc$params$author.name, author.name) if (!is.null(author.mail) && !tolower(author.mail) %in% tolower(sobj@misc$params$author.mail)) sobj@misc$params$author.mail <- c(sobj@misc$params$author.mail, author.mail) save(sobj, file = paste0(norm.dim.red.dir, '/', paste(c(name.grp, norm_vtr, dimred_vtr), collapse = '_'), '.rda'), compress = "bzip2") ### Correlating reduction dimensions with biases and markers expression cat("\nCorrelation of dimensions...\n") -dimensions.eval(sobj = sobj, reduction = paste0(assay, "_", dimred.method), eval.markers = eval.markers, meta.names = c('orig.ident','nCount_RNA', 'nFeature_RNA', 'percent_mt', 'MTscore', 'percent_rb', 'RBscore', 'percent_st', 'STscore', "Cyclone.S.Score", "Cyclone.G1.Score", "Cyclone.G2M.Score", "Cyclone.SmG2M.Score"), slot = 'data', out.dir = norm.dim.red.dir, nthreads = floor(nthreads/2)) +dimensions.eval(sobj = sobj, reduction = paste0(assay, "_", dimred.method), eval.markers = eval.markers, slot = 'data', out.dir = norm.dim.red.dir, nthreads = floor(nthreads/2)) gc() ### Testing multiple clustering parameters (nb dims kept + Louvain resolution) diff --git a/scripts/Int_Grp_pipeline_ADT.R b/scripts/Int_Grp_pipeline_ADT.R new file mode 100644 index 0000000..2098795 --- /dev/null +++ b/scripts/Int_Grp_pipeline_ADT.R @@ -0,0 +1,242 @@ +#### Read parameters #### +library(optparse) +option_list <- list( + ### Project + make_option("--samples.name.adt", help="Names of CITE-seq samples (ADT)"), + make_option("--input.rda.ge", help="Input seurat object (in .rda format)."), + make_option("--output.dir", help="Output path"), + make_option("--input.dirs.adt", help="Input paths to the KALLISTOBUS results."), + make_option("--author.name", help="Name of author of the analysis"), + make_option("--author.mail", help="Email of author of the analysis"), + ### Computational Parameters + make_option("--nthreads", help="Number of threads to use"), + make_option("--pipeline.path", help="Path to pipeline folder; it allows to change path if this script is used by snakemake and singularity, or singularity only or in local way. Example for singularity only: /WORKDIR/scRNAseq_10X_R4"), + ### Analysis Parameters + make_option("--gene.names", help="List of gene names wich correspond to the ADT proteins."), + make_option("--ADT.min.cutoff", help="List of quantile min to cutoff protein expression for plot."), + make_option("--ADT.max.cutoff", help="List of quantile max to cutoff protein expression for plot."), + ### Yaml parameters file to remplace all parameters before (usefull to use R script without snakemake) + make_option("--yaml", help="Patho to yaml file with all parameters") +) +parser <- OptionParser(usage="Rscript %prog [options]", description = " ", option_list = option_list) +args <- parse_args(parser, positional_arguments = 0) + +#### Formatting Parameters #### +#convert "NULL"/"FALSE"/"TRUE" (in character) into NULL/FALSE/TRUE +for (i in names(args$options)){ + if ((length(args$options[i]) == 0) || (length(args$options[i]) == 1 && toupper(args$options[i]) == "NULL")) { args$options[i] <- NULL + } else if ((length(args$options[i]) == 1) && (toupper(args$options[i]) == "FALSE")) { args$options[i] <- FALSE + } else if ((length(args$options[i]) == 1) && (toupper(args$options[i]) == "TRUE")) { args$options[i] <- TRUE + } +} + +#### Get Paramaters #### +### Project +samples.name.ADT <- unlist(stringr::str_split(args$options$samples.name.adt, ",")) +input.rda.ge <- args$options$input.rda.ge +output.dir <- args$options$output.dir +input.dirs.adt <- unlist(stringr::str_split(args$options$input.dirs.adt, ",")) +author.name <- args$options$author.name +author.mail <- args$options$author.mail +### Computational Parameters +nthreads <- as.numeric(args$options$nthreads) +pipeline.path <- args$options$pipeline.path +### Analysis Parameters +gene.names <- unlist(stringr::str_split(args$options$gene.names, ",")) #gene list correponding to the protein ADT +ADT.min.cutoff <- if (!is.null(args$options$ADT.min.cutoff)) unlist(stringr::str_split(args$options$ADT.min.cutoff, ",")) +ADT.max.cutoff <- if (!is.null(args$options$ADT.max.cutoff)) unlist(stringr::str_split(args$options$ADT.max.cutoff, ",")) +### Yaml parameters file to remplace all parameters before (usefull to use R script without snakemake) +if (!is.null(args$options$yaml)){ + yaml_options <- yaml::yaml.load_file(args$options$yaml) + for(i in names(yaml_options)) { + #convert "NULL"/"FALSE"/"TRUE" (in character) into NULL/FALSE/TRUE + if ((length(yaml_options[[i]]) == 0) || (length(yaml_options[[i]]) == 1 && toupper(yaml_options[[i]]) == "NULL")) { yaml_options[[i]] <- NULL + } else if ((length(yaml_options[[i]]) == 1) && (toupper(yaml_options[[i]]) == "FALSE")) { yaml_options[[i]] <- FALSE + } else if ((length(yaml_options[[i]]) == 1) && (toupper(yaml_options[[i]]) == "TRUE")) { yaml_options[[i]] <- TRUE + } + #assign values + if(i %in% c("nthreads")) assign(i, as.numeric(yaml_options[[i]])) else assign(i, yaml_options[[i]]) + } + rm(yaml_options, i) +} +### Clean +rm(args) + +#### Get path if snakemake/singularity/local #### +if(is.null(pipeline.path)) stop("--pipeline.path parameter must be set!") + +#### Check non-optional parameters #### +if (is.null(samples.name.ADT)) stop("samples.name.adt parameter can't be empty!") +if (is.null(input.rda.ge)) stop("input.rda.ge parameter can't be empty!") +if (is.null(output.dir)) stop("output.dir parameter can't be empty!") +if (is.null(input.dirs.adt)) stop("input.dirs.adt parameter can't be empty!") +if (is.null(gene.names)) stop("gene.names parameter can't be empty!") + +### Load data +load(input.rda.ge) + +### Save project parameters +if (!is.null(author.name) && !tolower(author.name) %in% tolower(sobj@misc$params$author.name)) sobj@misc$params$author.name <- c(sobj@misc$params$author.name, author.name) +if (!is.null(author.mail) && !tolower(author.mail) %in% tolower(sobj@misc$params$author.mail)) sobj@misc$params$author.mail <- c(sobj@misc$params$author.mail, author.mail) + +#### Get Missing Paramaters #### +### Computational Parameters +if (is.null(nthreads)) nthreads <- 4 +### Analysis Parameters +## Normalization and dimension reduction +dimred.method <- sobj@misc$params$reductions$method +norm.method_GE <- sobj@misc$params$normalization$normalization.method +assay <- if(norm.method_GE == "SCTransform") 'SCT' else 'RNA' +## Clustering +GE_file <- sub("\\.rda$", "", input.rda.ge) +INT_GRP.reduction <- sobj@misc$params$clustering$umap +## ADT +if (is.null(ADT.min.cutoff)) ADT.min.cutoff <- rep("q30", length(gene.names)) +if (is.null(ADT.max.cutoff)) ADT.max.cutoff <- rep("q95", length(gene.names)) +## Integration group +name.int_grp <- Seurat::Project(sobj) +#### Fixed parameters #### +output_path_ADT <- paste0(output.dir, "/ADT_results/") +cor.method <- 'spearman' +norm.method_ADT <- 'LogNormalize' #gave good VISUAL results. Avoid sctransform on such small dataset +slot <- 'data' #for correlation and umap + + +#### Sourcing functions #### +source(paste0(pipeline.path, "/scripts/bustools2seurat_preproc_functions.R")) + +######### +## MAIN +######### + +### printing parameters: +print("###########################################") +print(paste0("samples.name.adt : ",samples.name.ADT)) +print(paste0("input.rda.ge : ",input.rda.ge)) +print(paste0("output.dir : ",output.dir)) +print(paste0("input.dirs.adt : ",input.dirs.adt)) +print(paste0("gene.names : ",paste0(gene.names,collapse = ", "))) +print("###########################################") + +## Load libraries +require(patchwork) + +## Set the seed +set.seed(sobj@misc$params$seed) + +### Creating parallel instance +cl <- create.parallel.instance(nthreads = nthreads) + +### Loading raw count matrix +cat("\nLoading raw count matrix of ADT...\n") +dir.create(path = output_path_ADT, recursive = TRUE, showWarnings = FALSE) +sobjADT.list <- sapply(seq_along(input.dirs.adt), function(x) { + sobjADT <- load.sc.data(data.path = input.dirs.adt[x], sample.name = samples.name.ADT[x], assay = 'ADT', droplets.limit = NULL, emptydrops.fdr = NULL, BPPARAM = cl, my.seed = sobj@misc$params$seed, out.dir = output_path_ADT, draw_plots = FALSE) + return(sobjADT) +}) +names(sobjADT.list) <- sub("_ADT","_GE",vapply(sobjADT.list, Seurat::Project, 'a')) +sobjADT <- merge(x = sobjADT.list[[1]], y = sobjADT.list[-1], add.cell.ids = names(sobjADT.list), project = name.int_grp, merge.data = TRUE) +for (x in seq_along(sobjADT.list)) sobjADT@misc$pipeline_commands <- c(sobjADT@misc$pipeline_commands, sobjADT.list[[x]]@misc$pipeline_commands) +sobjADT@misc$params$ADT <- sobjADT.list[[1]]@misc$params$sobj_creation +sobjADT@misc$technical_info <- sobjADT.list[[1]]@misc$technical_info +sobjADT@misc$parameters$Materials_and_Methods$part0_Alignment <- sobjADT.list[[1]]@misc$parameters$Materials_and_Methods$part0_Alignment + +### Check number of protein names and gene.names and quantiles cutoff +if(length(rownames(sobjADT)) != length(gene.names)) stop(paste0("The number of gene.names is not the same as the proteins in the ADT count table: ", length(gene.names), " genes (", paste0(gene.names, collapse=","),") and ",length(rownames(sobjADT))," proteins (",paste0(rownames(sobjADT), collapse=","),").")) +if(length(rownames(sobjADT)) != length(ADT.min.cutoff)) stop(paste0("The number of ADT.min.cutoff is not the same as the proteins in the ADT count table: ", length(ADT.min.cutoff), " quantiles (", paste0(ADT.min.cutoff, collapse=","),") and ",length(rownames(sobjADT))," proteins (",paste0(rownames(sobjADT), collapse=","),").")) +if(length(rownames(sobjADT)) != length(ADT.max.cutoff)) stop(paste0("The number of ADT.max.cutoff is not the same as the proteins in the ADT count table: ", length(ADT.max.cutoff), " quantiles (", paste0(ADT.max.cutoff, collapse=","),") and ",length(rownames(sobjADT))," proteins (",paste0(rownames(sobjADT), collapse=","),").")) + +### Synching ADT to GE cells +cat("\nSynching ADT to GE cells...\n") +sobjADT <- sobjADT[, colnames(sobjADT) %in% colnames(sobj)] +#sobj <- sobj[, colnames(sobj) %in% colnames(sobjADT)] +if (!all(sort(colnames(sobj@assays$RNA@counts)) == sort(colnames(sobjADT@assays$ADT@counts)))){ + #identification of cells which are in sobj but not in ADT + colnames_ADT <- colnames(sobjADT) + cells_without_ADT <- colnames(sobj[, !(colnames(sobj) %in% colnames_ADT)]) + #addition of 0 in the expression of these cells in ADT to be able to CreateAssayObject + addition of a new meta.data + sobjADT@assays$ADT@counts <- cbind(sobjADT@assays$ADT@counts, rep(rep(0, dim(sobjADT@assays$ADT@counts)[1]),length(cells_without_ADT))) + colnames(sobjADT@assays$ADT@counts) <- c(colnames_ADT, cells_without_ADT) + meta.data_ADT <- data.frame(nCount_ADT = c(sobjADT@meta.data$nCount_ADT, rep(0,length(cells_without_ADT))), nFeature_ADT = c(sobjADT@meta.data$nFeature_ADT, rep(0,length(cells_without_ADT))), log_nCount_ADT = c(sobjADT@meta.data$log_nCount_ADT, rep(0,length(cells_without_ADT))), row.names=c(colnames_ADT, cells_without_ADT)) + sobj <- Seurat::AddMetaData(sobj, meta.data_ADT, col.name = c("nCount_ADT","nFeature_ADT","log_nCount_ADT")) +}else{ + sobj <- Seurat::AddMetaData(sobj, sobjADT@meta.data[,c("nCount_ADT","nFeature_ADT","log_nCount_ADT")], col.name = c("nCount_ADT","nFeature_ADT","log_nCount_ADT")) +} + +### Merging GE/ADT +cat("\nMerging ADT to GE...\n") +sobj[['ADT']] <- Seurat::CreateAssayObject(sobjADT@assays[['ADT']]@counts) +sobj@assays[['ADT']]@misc <- sobjADT@misc +sobj@misc$pipeline_commands <- c(sobj@misc$pipeline_commands, sobjADT@misc$pipeline_commands) +sobj@misc$params$ADT <- sobjADT@misc$params$sobj_creation +sobj@misc$technical_info$BUSpaRse <- sobjADT@misc$technical_info$BUSpaRse +sobj@misc$technical_info$DropletUtils <- sobjADT@misc$technical_info$DropletUtils +if(sobj@misc$technical_info$Seurat != sobjADT@misc$technical_info$Seurat) sobj@misc$technical_info$Seurat <- paste0(sobj@misc$technical_info$Seurat, ", ", sobjADT@misc$technical_info$Seurat) +sobj@misc$parameters$Materials_and_Methods$ADT <- sobjADT@misc$parameters$Materials_and_Methods$part0_Alignment +rm(sobjADT) + +### Normalization +cat("\nNormalization ADT expressions...\n") +sobj <- Seurat::NormalizeData(sobj, assay = 'ADT', normalization.method = norm.method_ADT) + +### Computing correlations +cat("\nComputing correlations...\n") +cor.df <- data.frame(RNA_feature = gene.names, ADT_feature = rownames(sobj@assays[['ADT']]@counts), stringsAsFactors = FALSE) +cor.unfiltered <- feature.cor(sobj = sobj, assay1 = assay, assay2 = 'ADT', assay1.features = gene.names, assay2.features = rownames(sobj@assays[['ADT']]@counts), slot = slot, cor.method = cor.method, zero.filter = FALSE, gene.names = gene.names, min.cutoff = NULL, max.cutoff = NULL) +cor.filtered <- feature.cor(sobj = sobj, assay1 = assay, assay2 = 'ADT', assay1.features = gene.names, assay2.features = rownames(sobj@assays[['ADT']]@counts), slot = slot, cor.method = cor.method, zero.filter = TRUE, gene.names = gene.names, min.cutoff = NULL, max.cutoff = NULL) +cor.quantile <- feature.cor(sobj = sobj, assay1 = assay, assay2 = 'ADT', assay1.features = gene.names, assay2.features = rownames(sobj@assays[['ADT']]@counts), slot = slot, cor.method = cor.method, zero.filter = FALSE, gene.names = gene.names, min.cutoff = ADT.min.cutoff, max.cutoff = ADT.max.cutoff) +cor.df <- cbind(cor.df, cor.unfiltered, cor.filtered,cor.quantile) +sobj@assays[['ADT']]@misc$cor <- cor.df +write.table(cor.df, file = paste0(output_path_ADT,'/ADT_correlations.csv'),sep = ";", row.names = FALSE, quote = FALSE) +rm(cor.df,cor.unfiltered,cor.filtered) + +### Co-plot gene expression and ADT protein level +cat("\nCo-plot gene expression and ADT protein level...\n") +#### withtout customized cutoff +RNA_data_plot <- feature_plots(sobj, assay = assay, features = gene.names, slot = slot, reduction = INT_GRP.reduction, min.cutoff = rep(0,length(gene.names)), max.cutoff = rep("q100",length(gene.names))) +ADT_data_plot <- feature_plots(sobj, assay = 'ADT', features = rownames(sobj@assays[['ADT']]@counts), slot = slot, reduction = INT_GRP.reduction, min.cutoff = rep(0,length(rownames(sobj@assays[['ADT']]@counts))), max.cutoff = rep("q100",length(rownames(sobj@assays[['ADT']]@counts)))) +png(paste0(output_path_ADT,'/ADT_dimplot.png'), width = 1200, height = 600 * length(gene.names)) +wrap_elements(RNA_data_plot) + wrap_elements(ADT_data_plot) +dev.off() +#### with customized cutoff +cat("\nCo-plot gene expression and ADT protein level with customized cutoff...\n") +RNA_data_plot <- feature_plots(sobj, assay = assay, features = gene.names, slot = slot, reduction = INT_GRP.reduction, min.cutoff = rep(0,length(gene.names)), max.cutoff = rep("q100",length(gene.names))) +ADT_data_plot <- feature_plots(sobj, assay = 'ADT', features = rownames(sobj@assays[['ADT']]@counts), slot = slot, reduction = INT_GRP.reduction, min.cutoff = ADT.min.cutoff, max.cutoff = ADT.max.cutoff) +png(paste0(output_path_ADT,'/ADT_dimplot_legend_cutoff.png'), width = 1200, height = 600 * length(gene.names)) +wrap_elements(RNA_data_plot) + wrap_elements(ADT_data_plot) +dev.off() +rm(RNA_data_plot,ADT_data_plot) + +### Adding ADT expression (from @data slot) as metadata +adt.values <- t(as.matrix(sobj@assays$ADT@data)) +colnames(adt.values) <- paste0("ADT_", colnames(adt.values)) +sobj <- Seurat::AddMetaData(sobj, adt.values, col.name = colnames(adt.values)) +rm(adt.values) + +### Save parameters +#gene.names: +sobj@assays[['ADT']]@misc$paramters$gene.names <- gene.names +sobj@misc$params$ADT$gene.names <- gene.names +#normalization: +sobj@assays[['ADT']]@misc$paramters$normalization$method <- norm.method_ADT +sobj@misc$params$ADT$normalization$method <- norm.method_ADT +#correlation: +sobj@assays[['ADT']]@misc$paramters$cor$method <- cor.method +sobj@assays[['ADT']]@misc$paramters$cor$slot <- slot +sobj@misc$params$ADT$cor$method <- cor.method +sobj@misc$params$ADT$cor$slot <- slot +#cutoff: +sobj@assays[['ADT']]@misc$paramters$cutoff_min = ADT.min.cutoff +sobj@assays[['ADT']]@misc$paramters$cutoff_max = ADT.max.cutoff +sobj@misc$params$ADT$cutoff_min = ADT.min.cutoff +sobj@misc$params$ADT$cutoff_max = ADT.max.cutoff + +### Materials and Methods +sobj@misc$parameters$Materials_and_Methods$ADT <- paste0(sobj@misc$parameters$Materials_and_Methods$ADT," Only cell barcodes corresponding to the cell barcodes of gene expression were kept. Counting table was log-normalize (NormalizeData() function from Seurat with normalization.method parameters setting to '", norm.method_ADT,"') and ", cor.method," correlation scores beetween protein levels and gene expression levels was computed and ploted on UMAP.") +sobj@misc$parameters$Materials_and_Methods$References_packages <- find_ref(MandM = sobj@misc$parameters$Materials_and_Methods, pipeline.path = pipeline.path) +write_MandM(sobj=sobj, output.dir=output.dir) + +### Saving GE_ADT object +cat("\nSaving object...\n") +GE_ADT_file <- paste0(output.dir, basename(GE_file), '_ADT') +save(sobj, file = paste0(GE_ADT_file, '.rda'), compress = "bzip2") diff --git a/scripts/Int_Grp_pipeline_BCR.R b/scripts/Int_Grp_pipeline_BCR.R new file mode 100644 index 0000000..7f8a64d --- /dev/null +++ b/scripts/Int_Grp_pipeline_BCR.R @@ -0,0 +1,378 @@ +#### Read parameters #### +library(optparse) +option_list <- list( + ### Project + make_option("--input.rda", help="Input seurat object (in .rda format)."), + make_option("--output.dir", help="Output path"), + make_option("--vdj.input.files.bcr", help="File filtered_contig_annotations.csv from CellRanger aligment pipeline."), + make_option("--author.name", help="Name of author of the analysis"), + make_option("--author.mail", help="Email of author of the analysis"), + ### Computational Parameters + make_option("--pipeline.path", help="Path to pipeline folder; it allows to change path if this script is used by snakemake and singularity, or singularity only or in local way. Example for singularity only: /WORKDIR/scRNAseq_10X_R4"), + ### Yaml parameters file to remplace all parameters before (usefull to use R script without snakemake) + make_option("--yaml", help="Patho to yaml file with all parameters") +) +parser <- OptionParser(usage="Rscript %prog [options]", description = " ", option_list = option_list) +args <- parse_args(parser, positional_arguments = 0) + +#### Formatting Parameters #### +#convert "NULL"/"FALSE"/"TRUE" (in character) into NULL/FALSE/TRUE +for (i in names(args$options)){ + if ((length(args$options[i]) == 0) || (length(args$options[i]) == 1 && toupper(args$options[i]) == "NULL")) { args$options[i] <- NULL + } else if ((length(args$options[i]) == 1) && (toupper(args$options[i]) == "FALSE")) { args$options[i] <- FALSE + } else if ((length(args$options[i]) == 1) && (toupper(args$options[i]) == "TRUE")) { args$options[i] <- TRUE + } +} + +#### Get Paramaters #### +### Project +input.rda <- args$options$input.rda +output.dir <- args$options$output.dir +vdj.input.files.bcr <- unlist(stringr::str_split(args$options$vdj.input.files.bcr, ",")) +author.name <- args$options$author.name +author.mail <- args$options$author.mail +### Computational Parameters +pipeline.path <- args$options$pipeline.path +### Yaml parameters file to remplace all parameters before (usefull to use R script without snakemake) +if (!is.null(args$options$yaml)){ + yaml_options <- yaml::yaml.load_file(args$options$yaml) + for(i in names(yaml_options)) { + #convert "NULL"/"FALSE"/"TRUE" (in character) into NULL/FALSE/TRUE + if ((length(yaml_options[[i]]) == 0) || (length(yaml_options[[i]]) == 1 && toupper(yaml_options[[i]]) == "NULL")) { yaml_options[[i]] <- NULL + } else if ((length(yaml_options[[i]]) == 1) && (toupper(yaml_options[[i]]) == "FALSE")) { yaml_options[[i]] <- FALSE + } else if ((length(yaml_options[[i]]) == 1) && (toupper(yaml_options[[i]]) == "TRUE")) { yaml_options[[i]] <- TRUE + } + #assign values + assign(i, yaml_options[[i]]) + } + rm(yaml_options, i) +} +### Clean +rm(args) + +#### Get path if snakemake/singularity/local #### +if(is.null(pipeline.path)) stop("--pipeline.path parameter must be set!") + +#### Check non-optional parameters #### +if (is.null(input.rda)) stop("input.rda parameter can't be empty!") +if (is.null(output.dir)) stop("output.dir parameter can't be empty!") +if (is.null(vdj.input.files.bcr)) stop("vdj.input.files.bcr parameter can't be empty!") + +### Load data +load(input.rda) + +### Save project parameters +if (!is.null(author.name) && !tolower(author.name) %in% tolower(sobj@misc$params$author.name)) sobj@misc$params$author.name <- c(sobj@misc$params$author.name, author.name) +if (!is.null(author.mail) && !tolower(author.mail) %in% tolower(sobj@misc$params$author.mail)) sobj@misc$params$author.mail <- c(sobj@misc$params$author.mail, author.mail) + +#### Get Missing Paramaters #### +### Analysis Parameters +## Normalization and dimension reduction +dimred.method <- sobj@misc$params$reductions$method +## Clustering +GE_file <- sub("\\.rda$", "", input.rda) +dimred.method <- sobj@misc$params$reductions$method +ident.name <- sobj@misc$params$clustering$ident +INT_GRP.reduction <- sobj@misc$params$clustering$umap +sample.name.INT_GRP <- Seurat::Project(sobj) +samples.name.GE <- unique(sort(sobj@meta.data$orig.ident)) +samples.name <- sub("_GE","",samples.name.GE) + +#### Fixed parameters #### +output_path_BCR <- paste0(output.dir, "/BCR_results/") +list_type_clT <- c("gene+nt", "gene", "nt", "aa") +list_type_contig <- c("nt","aa") +caption <- '“gene” - use the genes comprising the BCR +“nt” - use the nucleotide sequence of the CDR3 region +“aa” - use the amino acid sequence of the CDR3 region +“gene+nt” - use the genes comprising the BCR + the nucleotide sequence of the CDR3 region for T cells. This is the proper definition of clonotype.' + +#### Sourcing functions #### +source(paste0(pipeline.path, "/scripts/bustools2seurat_preproc_functions.R")) + +######### +## MAIN +######### + +### printing parameters: +print("###########################################") +print(paste0("input.rda : ",input.rda)) +print(paste0("ident.name : ",ident.name)) +print(paste0("vdj.input.files.bcr : ",vdj.input.files.bcr)) +print("###########################################") + +## Load libraries +require(patchwork) +suppressMessages(require(Seurat)) +library(dplyr) + +## Set the seed +set.seed(sobj@misc$params$seed) + +## GLOBAL ANALYSIS >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +cat("\nGlobal Analysis >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n") +global_output <- paste0(output_path_BCR, "Global_analysis") +dir.create(path = global_output, recursive = TRUE, showWarnings = FALSE) + +## Loading input data and Combining contigs +cat("\nLoading input data and Combining contigs...\n") +cr_res <- lapply(seq_along(vdj.input.files.bcr), load.sc.tcr.bcr, sobj=sobj, vdj.input.file=vdj.input.files.bcr, sample.name=samples.name.GE) +bcr.combined <- scRepertoire::combineBCR(df = cr_res, samples = samples.name, ID = rep("BCR", length(samples.name))) + +## Quantification of unique contig analysis +cat("\nQuantification analysis...\n") +Quantif.unique.g(combined = bcr.combined, list_type_clT = list_type_clT, out.dir = global_output, caption = caption, sample.name = sample.name.INT_GRP) + +## Abundance analysis +cat("\nAbundance analysis...\n") +### Plots +for(x in list_type_clT) assign(paste0("plot_abundanceContig_",sub("\\+","_",x)),patchwork::wrap_elements(scRepertoire::abundanceContig(bcr.combined, cloneCall = x, scale = F) + plot_annotation(title = x, theme = ggplot2::theme(plot.title = ggplot2::element_text(size=10, hjust=0.6, face="bold"))) + Seurat::NoLegend())) +### Save +png(paste0(global_output,'/abundanceContig.png'), width = 800, height = 300) +(plot_abundanceContig_gene_nt | plot_abundanceContig_gene | plot_abundanceContig_nt | plot_abundanceContig_aa ) + + plot_annotation(title = sample.name.INT_GRP, subtitle = paste0("(",dim(sobj)[2]," cells)"), caption = caption, theme = ggplot2::theme(plot.caption = ggplot2::element_text(hjust=0), plot.title = ggplot2::element_text(size=20, hjust=0, face="bold"))) +dev.off() + +## Contigs Length analysis +cat("\nContigs Length analysis...\n") +for(x in list_type_contig){ + ### This should give multimodal plot + assign(paste0("plot_lengthContig_",x,"_comb"), scRepertoire::lengthContig(bcr.combined, cloneCall=x, chains = "combined") + Seurat::NoLegend()) + ### Plots the A and B chains distribution separately + assign(paste0("plot_lengthContig_",x,"_sin"), scRepertoire::lengthContig(bcr.combined, cloneCall=x, chains = "single") + Seurat::NoLegend()) +} +### Save +png(paste0(global_output,'/lengthContig.png'), width = 800, height = 800) +(plot_lengthContig_nt_comb | plot_lengthContig_nt_sin) / (plot_lengthContig_aa_comb | plot_lengthContig_aa_sin ) + + plot_annotation(title = sample.name.INT_GRP, theme = ggplot2::theme(plot.title = ggplot2::element_text(size=20, hjust=0, face="bold"))) +dev.off() + +## Clonal Homeostasis analysis +cat("\nClonal Homeostasis analysis...\n") +Homeo.g(combined = bcr.combined, list_type_clT = list_type_clT, out.dir = global_output, caption = caption, sample.name = sample.name.INT_GRP) + +## Clonal Proportions analysis +cat("\nClonal Proportions analysis...\n") +Prop.g(combined = bcr.combined, list_type_clT = list_type_clT, out.dir = global_output, caption = caption, sample.name = sample.name.INT_GRP) + +## Diversity analysis +cat("\nDiversity analysis...\n") +Div.g(combined = bcr.combined, list_type_clT = list_type_clT, out.dir = global_output, caption = caption, sample.name = sample.name.INT_GRP) + +## Combine BCR data with seurat object +cat("\nCombine BCR data with seurat object...\n") + +### Corresponding barcode +for(i in 1:length(bcr.combined)) bcr.combined[[i]]$barcode <- gsub(pattern = "BCR", replacement = "GE", bcr.combined[[i]]$barcode) +### Combination +sobj <- scRepertoire::combineExpression(df = bcr.combined, sc = sobj, cloneCall="aa") +sobj@meta.data$Frequency_indiv=sobj@meta.data$Frequency +### Combination tout echantillon confondu +bcr.combined_unlist <- do.call("rbind", bcr.combined) +sobj <- scRepertoire::combineExpression(df = bcr.combined_unlist, sc = sobj, cloneCall="aa") +sobj@meta.data$Frequency_all=sobj@meta.data$Frequency +sobj@meta.data$Frequency=NULL +rm(bcr.combined,bcr.combined_unlist) +### Spliting CTnt and CTgenes (into separate columns for IG-V/L and corresponding sequences) and save as metadata +### and Adding length of IG sequence to meta.data +sobj <- split.bcr(sobj) +### for all samples +### Plots +for(x in c('IGHV','IGHJ','IGHD','Isotype','IGLV','IGLJ','IGLC')) assign(paste0("dimplot_",x), if (all(is.na(sobj@meta.data[[x]]))) patchwork::plot_spacer() + Seurat::DarkTheme() + ggplot2::ggtitle(x) else Seurat::DimPlot(sobj, group.by = x, reduction = INT_GRP.reduction) + Seurat::DarkTheme() + ggplot2::ggtitle(x)) +for(x in c("cdr3_nt_IGH_len", "cdr3_nt_IGL_len")) { + assign(paste0("featureplot_",x), + tryCatch({ Seurat::FeaturePlot(sobj, features = x, reduction = INT_GRP.reduction) }, + error=function(e) { return(patchwork::plot_spacer()) } + ) + Seurat::DarkTheme() + ggplot2::ggtitle(paste0("Nucleotidic length ", x))) +} +### Save plots +png(paste0(global_output,'/cloneType_',sample.name.INT_GRP,'.png'), width =700, height = 3000) +( wrap_elements(dimplot_IGHV / dimplot_IGHJ / dimplot_IGHD / dimplot_Isotype / dimplot_IGLV / dimplot_IGLJ / dimplot_IGLC / featureplot_cdr3_nt_IGH_len / featureplot_cdr3_nt_IGL_len) + + plot_annotation(title = 'BCR', theme = ggplot2::theme(plot.title = ggplot2::element_text(size=50, hjust=0.5, face="bold"))) ) +dev.off() +### by samples +for (i in seq(samples.name)){ + #### selection des data du sample + cells_sample=rownames(sobj@meta.data[sobj@meta.data$orig.ident == samples.name.GE[i],]) + sub_sobj=subset(sobj, cells = cells_sample) + #### Plots + for(x in c('IGHV','IGHJ','IGHD','Isotype','IGLV','IGLJ','IGLC')) assign(paste0("dimplot_",x), tryCatch( { print(Seurat::DimPlot(sub_sobj, group.by = x, reduction = INT_GRP.reduction) + Seurat::DarkTheme() + ggplot2::ggtitle(x)) }, error=function(err) { patchwork::plot_spacer() + Seurat::DarkTheme() + ggplot2::ggtitle(x) } )) + for(x in c("cdr3_nt_IGH_len", "cdr3_nt_IGL_len")) assign(paste0("featureplot_",x), tryCatch( { print(Seurat::FeaturePlot(sub_sobj, features = x, reduction = INT_GRP.reduction) + Seurat::DarkTheme() + ggplot2::ggtitle(paste0("Nucleotidic length ", x))) }, error=function(err) { patchwork::plot_spacer() + Seurat::DarkTheme() + ggplot2::ggtitle(paste0("Nucleotidic length ", x)) } )) + #### Save plots + png(paste0(global_output,'/cloneType_',samples.name[i],'.png'), width =1400, height = 3000) + print( ( wrap_elements(dimplot_IGHV / dimplot_IGHJ / dimplot_IGHD / dimplot_Isotype / dimplot_IGLV / dimplot_IGLJ / dimplot_IGLC / featureplot_cdr3_nt_IGH_len / featureplot_cdr3_nt_IGL_len) + + plot_annotation(title = 'BCR', theme = ggplot2::theme(plot.title = ggplot2::element_text(size=50, hjust=0.5, face="bold"))) )) + dev.off() +} + +## Frequency analysis +cat("\nFrequency analysis...\n") +sobj <- Freq.g(sobj=sobj, out.dir = global_output, sample.name = sample.name.INT_GRP, reduction = INT_GRP.reduction, freq_col = "Frequency_all") +for (i in seq(samples.name)){ + #### selection des data du sample + cells_sample=rownames(sobj@meta.data[sobj@meta.data$orig.ident == samples.name.GE[i],]) + sub_sobj=subset(sobj, cells = cells_sample) + #### Analysis + #Top 10 frequencies, and top 10 to top 20 frequencies + top20_freq = sub_sobj@meta.data %>% select(Frequency_indiv,CTaa,highlight_aa_all) %>% distinct() %>% arrange(desc(Frequency_indiv)) %>% na.omit() %>% top_n(n = 20, wt = Frequency_indiv) + top20_freq = top20_freq[1:20,] + rownames(top20_freq)=top20_freq$highlight_aa_all + sub_sobj$highlight_aa_top10_freq <- ifelse(sub_sobj$highlight_aa_all %in% top20_freq$highlight[1:10], sub_sobj$highlight_aa_all, NA) + sub_sobj$highlight_aa_top11to20_freq <- ifelse(sub_sobj$highlight_aa_all %in% top20_freq$highlight[11:length(top20_freq$highlight)], sub_sobj$highlight_aa_all, NA) + #UMAP of top 10 frequencies + png(paste0(global_output,'/Frequency_top_10_umap',samples.name[i],'.png'), width = 800, height = (400+350)) + print(patchwork::wrap_elements( (Seurat::DimPlot(sub_sobj, reduction = INT_GRP.reduction, group.by = "highlight_aa_top10_freq") + Seurat::DarkTheme()) / gridExtra::tableGrob(top20_freq[1:10,c("Frequency_indiv","CTaa")], theme = gridExtra::ttheme_default(base_size = 10)) + + plot_annotation(title = paste0(samples.name[i],": Top 10 Clonotypes (by frequencies)"), subtitle = paste0("(",dim(sobj)[2]," cells)"), theme = ggplot2::theme(plot.title = ggplot2::element_text(size=20, hjust=0.5, face="bold"))) + + plot_layout(heights = c(2, 1)))) + dev.off() + #UMAP of top 11 to 20 frequencies + png(paste0(global_output,'/Frequency_top11to20_umap',samples.name[i],'.png'), width = 800, height = (400+350)) + print(patchwork::wrap_elements( (Seurat::DimPlot(sub_sobj, reduction = INT_GRP.reduction, group.by = "highlight_aa_top11to20_freq") + Seurat::DarkTheme()) / gridExtra::tableGrob(top20_freq[11:length(top20_freq$CTaa),c("Frequency_indiv","CTaa")], theme = gridExtra::ttheme_default(base_size = 10)) + + plot_annotation(title = paste0(samples.name[i], ": Top 11 to 20 Clonotypes (by frequencies)"), subtitle = paste0("(",dim(sobj)[2]," cells)"), theme = ggplot2::theme(plot.title = ggplot2::element_text(size=20, hjust=0.5, face="bold"))) + + plot_layout(heights = c(2, 1)))) + dev.off() +} + +## Physicochemical properties of the CDR3 +cat("\nPhysicochemical properties of the CDR3 analysis...\n") +Physicochemical_properties.g(sobj=sobj, list_type_clT = list_type_clT, out.dir = global_output, sample.name=sample.name.INT_GRP, type='BCR') + + + + + +## CLUSTERS LEVEL ANALYSIS >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +cat("\nClusters Level Analysis >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n") +clusters_output <- paste0(output_path_BCR, "Clusters_analysis") +dir.create(path = clusters_output, recursive = TRUE, showWarnings = FALSE) + +## By sample +for (i in seq(samples.name)){ + #create directory + sample_output=paste0(clusters_output, "/", samples.name[i]) + dir.create(path = sample_output, recursive = TRUE, showWarnings = FALSE) + + #### selection des data du sample + cells_sample=rownames(sobj@meta.data[sobj@meta.data$orig.ident == samples.name.GE[i],]) + sub_sobj=subset(sobj, cells = cells_sample) + + ## Filter cells that have no value for the x concerned + conversion to a list by clusters + ## Need to filter, otherwise the functions count the 'NA' as a sequence. + for(x in list_type_clT){ + if(x=="gene+nt") y="strict" else y=x + filtred_sobj = sub_sobj[,!is.na(sub_sobj@meta.data[paste0("CT", y)])] + assign(paste0("filtred_metadata_", sub("\\+","_",x)), scRepertoire::expression2List(sc=filtred_sobj, group=ident.name)) + } + rm(filtred_sobj) + + ## Quantification of unique contig analysis ##UTILE??? NB: il n'y a qu'un BCR par cellule alors la quantif d'unique sera de 100% partout!!!!!!!!!!!!!!!!!! + cat("\nQuantification analysis...\n") + sub_sobj <- Quantif.unique.c(sobj = sub_sobj, ident.name = ident.name, list_type_clT = list_type_clT, out.dir = sample_output, caption = caption, sample.name = samples.name[i], filtred_metadata_aa = filtred_metadata_aa, filtred_metadata_nt = filtred_metadata_nt, filtred_metadata_gene = filtred_metadata_gene, filtred_metadata_gene_nt = filtred_metadata_gene_nt) + + ## Abundance analysis ##UTILE??? NB: il n'y a qu'un BCR par cellule alors l'abondance sera de 1 partout!!!!!!!!!!!!!!!!!!!!! + cat("\nAbundance analysis...\n") + ### Plots + for(x in list_type_clT) assign(paste0("plot_cluster_abundanceContig_",sub("\\+","_",x)),patchwork::wrap_elements(scRepertoire::abundanceContig(get(paste0("filtred_metadata_", sub("\\+","_",x))), cloneCall = x, scale = F) + plot_annotation(title = x, theme = ggplot2::theme(plot.title = ggplot2::element_text(size=10, hjust=0.6, face="bold"))) )) + ### Save + png(paste0(clusters_output,'/abundanceContig.png'), width = 2000, height = 600) + (plot_cluster_abundanceContig_gene_nt | plot_cluster_abundanceContig_gene | plot_cluster_abundanceContig_nt | plot_cluster_abundanceContig_aa ) + + plot_annotation(title = samples.name[i], caption = caption, theme = ggplot2::theme(plot.caption = ggplot2::element_text(hjust=0), plot.title = ggplot2::element_text(size=20, hjust=0, face="bold"))) + dev.off() + + ## Clonal Homeostasis analysis + cat("\nClonal Homeostasis analysis...\n") + sub_sobj <- Homeo.c(sobj = sub_sobj, ident.name = ident.name, list_type_clT = list_type_clT, out.dir = sample_output, caption = caption, sample.name = samples.name[i], filtred_metadata_aa = filtred_metadata_aa, filtred_metadata_nt = filtred_metadata_nt, filtred_metadata_gene = filtred_metadata_gene, filtred_metadata_gene_nt = filtred_metadata_gene_nt) + + ## Clonal Proportions analysis + cat("\nClonal Proportions analysis...\n") + sub_sobj <- Prop.c(sobj = sub_sobj, list_type_clT = list_type_clT, out.dir = sample_output, caption = caption, sample.name = samples.name[i], filtred_metadata_aa = filtred_metadata_aa, filtred_metadata_nt = filtred_metadata_nt, filtred_metadata_gene = filtred_metadata_gene, filtred_metadata_gene_nt = filtred_metadata_gene_nt) + + ## Diversity analysis + cat("\nDiversity analysis...\n") + sub_sobj <- Div.c(sobj = sub_sobj, list_type_clT = list_type_clT, out.dir = sample_output, caption = caption, sample.name = samples.name[i], filtred_metadata_aa = filtred_metadata_aa, filtred_metadata_nt = filtred_metadata_nt, filtred_metadata_gene = filtred_metadata_gene, filtred_metadata_gene_nt = filtred_metadata_gene_nt) + + ## Frequency analysis + cat("\nFrequency analysis...\n") + Freq.c(sobj = sub_sobj, list_type_clT = list_type_clT, out.dir = sample_output, caption = caption, sample.name = samples.name[i], ident.name = ident.name, reduction = INT_GRP.reduction, freq_col = "Frequency_indiv", filtred_metadata_aa = filtred_metadata_aa, filtred_metadata_nt = filtred_metadata_nt, filtred_metadata_gene = filtred_metadata_gene, filtred_metadata_gene_nt = filtred_metadata_gene_nt) + + ## Clonal Overlap analysis (si plus de 1) + cat("\nClonal Overlap analysis...\n") + if(length(levels(Seurat::Idents(sobj)))!=1 && length(unique(sobj@meta.data[!is.na(sobj@meta.data$CTstrict),ident.name]))!=1) Overlap.c(sobj = sub_sobj, list_type_clT = list_type_clT, out.dir = sample_output, caption = caption, sample.name = samples.name[i], filtred_metadata_aa = filtred_metadata_aa, filtred_metadata_nt = filtred_metadata_nt, filtred_metadata_gene = filtred_metadata_gene, filtred_metadata_gene_nt = filtred_metadata_gene_nt) + + ## Physico-chemical properties of the CDR3 + cat("\nPhysico-chemical properties of the CDR3 analysis...\n") + Physicochemical_properties.c(sobj = sub_sobj, list_type_clT = list_type_clT, out.dir = sample_output, caption = caption, sample.name = samples.name[i], ident.name = ident.name, filtred_metadata_aa = filtred_metadata_aa, filtred_metadata_nt = filtred_metadata_nt, filtred_metadata_gene = filtred_metadata_gene, filtred_metadata_gene_nt = filtred_metadata_gene_nt, type='BCR') +} + +#All samples +## Filter cells that have no value for the x concerned + conversion to a list by clusters +## Need to filter, otherwise the functions count the 'NA' as a sequence. +for(x in list_type_clT){ + if(x=="gene+nt") y="strict" else y=x + filtred_sobj = sobj[,!is.na(sobj@meta.data[paste0("CT", y)])] + assign(paste0("filtred_metadata_", sub("\\+","_",x)), scRepertoire::expression2List(sc=filtred_sobj, group=ident.name)) +} +rm(filtred_sobj) + +## Quantification of unique contig analysis ##UTILE??? NB: il n'y a qu'un BCR par cellule alors la quantif d'unique sera de 100% partout!!!!!!!!!!!!!!!!!! +cat("\nQuantification analysis...\n") +sobj <- Quantif.unique.c(sobj = sobj, ident.name=ident.name, list_type_clT = list_type_clT, out.dir = clusters_output, caption = caption, sample.name = sample.name.INT_GRP, filtred_metadata_aa = filtred_metadata_aa, filtred_metadata_nt = filtred_metadata_nt, filtred_metadata_gene = filtred_metadata_gene, filtred_metadata_gene_nt = filtred_metadata_gene_nt) + +## Abundance analysis ##UTILE??? NB: il n'y a qu'un BCR par cellule alors l'abondance sera de 1 partout!!!!!!!!!!!!!!!!!!!!! +cat("\nAbundance analysis...\n") +### Plots +for(x in list_type_clT) assign(paste0("plot_cluster_abundanceContig_",sub("\\+","_",x)),patchwork::wrap_elements(scRepertoire::abundanceContig(get(paste0("filtred_metadata_", sub("\\+","_",x))), cloneCall = x, scale = F) + plot_annotation(title = x, theme = ggplot2::theme(plot.title = ggplot2::element_text(size=10, hjust=0.6, face="bold"))) )) +### Save +png(paste0(clusters_output,'/abundanceContig.png'), width = 2000, height = 600) +(plot_cluster_abundanceContig_gene_nt | plot_cluster_abundanceContig_gene | plot_cluster_abundanceContig_nt | plot_cluster_abundanceContig_aa ) + + plot_annotation(title = sample.name.INT_GRP, caption = caption, theme = ggplot2::theme(plot.caption = ggplot2::element_text(hjust=0), plot.title = ggplot2::element_text(size=20, hjust=0, face="bold"))) +dev.off() + +## Clonal Homeostasis analysis +cat("\nClonal Homeostasis analysis...\n") +sobj <- Homeo.c(sobj = sobj, ident.name=ident.name, list_type_clT = list_type_clT, out.dir = clusters_output, caption=caption, sample.name=sample.name.INT_GRP, filtred_metadata_aa=filtred_metadata_aa, filtred_metadata_nt=filtred_metadata_nt, filtred_metadata_gene=filtred_metadata_gene, filtred_metadata_gene_nt=filtred_metadata_gene_nt) + +## Clonal Proportions analysis +cat("\nClonal Proportions analysis...\n") +sobj <- Prop.c(sobj = sobj, list_type_clT = list_type_clT, out.dir = clusters_output, caption=caption, sample.name=sample.name.INT_GRP, filtred_metadata_aa=filtred_metadata_aa, filtred_metadata_nt=filtred_metadata_nt, filtred_metadata_gene=filtred_metadata_gene, filtred_metadata_gene_nt=filtred_metadata_gene_nt) + +## Diversity analysis +cat("\nDiversity analysis...\n") +sobj <- Div.c(sobj = sobj, list_type_clT = list_type_clT, out.dir = clusters_output, caption=caption, sample.name=sample.name.INT_GRP, filtred_metadata_aa=filtred_metadata_aa, filtred_metadata_nt=filtred_metadata_nt, filtred_metadata_gene=filtred_metadata_gene, filtred_metadata_gene_nt=filtred_metadata_gene_nt) + +## Frequency analysis ##UTILE??? NB: il n'y a qu'un BCR par cellule alors la fréquence vaut 1 partout!!!!!!!!!!!!!!! +cat("\nFrequency analysis...\n") +Freq.c(sobj = sobj, list_type_clT = list_type_clT, out.dir = clusters_output, caption=caption, sample.name=sample.name.INT_GRP, ident.name=ident.name, reduction=INT_GRP.reduction, freq_col="Frequency", filtred_metadata_aa=filtred_metadata_aa, filtred_metadata_nt=filtred_metadata_nt, filtred_metadata_gene=filtred_metadata_gene, filtred_metadata_gene_nt=filtred_metadata_gene_nt) + +## Clonal Overlap analysis ##UTILE??? NB: il n'y a qu'un BCR par cellule alors pas d'overlap!!!!!!!!!!!!!!! +cat("\nClonal Overlap analysis...\n") +if(length(levels(Seurat::Idents(sobj)))!=1 && length(unique(sobj@meta.data[!is.na(sobj@meta.data$CTstrict),ident.name]))!=1) Overlap.c(sobj = sobj, list_type_clT = list_type_clT, out.dir = clusters_output, caption=caption, sample.name=sample.name.INT_GRP, filtred_metadata_aa=filtred_metadata_aa, filtred_metadata_nt=filtred_metadata_nt, filtred_metadata_gene=filtred_metadata_gene, filtred_metadata_gene_nt=filtred_metadata_gene_nt) + +## Physico-chemical properties of the CDR3 +cat("\nPhysico-chemical properties of the CDR3 analysis...\n") +Physicochemical_properties.c(sobj = sobj, list_type_clT = list_type_clT, out.dir = clusters_output, caption=caption, sample.name=sample.name.INT_GRP, ident.name=ident.name, filtred_metadata_aa=filtred_metadata_aa, filtred_metadata_nt=filtred_metadata_nt, filtred_metadata_gene=filtred_metadata_gene, filtred_metadata_gene_nt=filtred_metadata_gene_nt, type='BCR') + +#renamme BCR columns with 'BCR_' prefix +toMatch <- c("^CTgene","^CTnt","^CTaa","^CTstrict","^Frequency","^cloneType","^IGHV","^IGHJ","^IGHD","^Isotype","^IGLV","^IGLJ","^IGLC","^cdr3_nt_IGH", "^cdr3_nt_IGL","^cdr3_nt_IGH_len","^cdr3_nt_IGL_len","^highlight_aa") +matches <- grep(paste(toMatch,collapse="|"), colnames(sobj@meta.data)) +colnames(sobj@meta.data)[matches] <- paste0("BCR_", grep(paste(toMatch,collapse="|"), colnames(sobj@meta.data), value=TRUE)) + +## Save packages versions +sobj@misc$technical_info$scRepertoire <- utils::packageVersion('scRepertoire') +sobj@misc$technical_info$alakazam <- utils::packageVersion('alakazam') + +### Materials and Methods +if(file.exists(paste0(dirname(vdj.input.files.bcr), "/../../Materials_and_Methods.txt"))){ + tmp <- readr::read_tsv(paste0(dirname(vdj.input.files.bcr), "/../../Materials_and_Methods.txt"), col_names = FALSE)$X1 + tmp2 <- "" + for (i in 1:length(tmp)) tmp2=paste(tmp2,tmp[i], sep="") + sobj@misc$parameters$Materials_and_Methods$BCR <- tmp2 +} else sobj@misc$parameters$Materials_and_Methods$BCR <- NULL +sobj@misc$parameters$Materials_and_Methods$BCR <- paste0(sobj@misc$parameters$Materials_and_Methods$BCR, " The annotation was merged with corresponding cell barcode of 5’ gene expression. The scRepertoire package (version ",sobj@misc$technical_info$scRepertoire,") was used to process annotation to assign clonotype based on Ig chains. scRepertoire allows to study contig quantification, contig abundance, contig length, clonal space homeostasis, clonal proportion, clonal overlap beetween clusters and diversity. Physicochemical properties of the CDR3, based on amino-acid sequences, was determined by the alakazam R package (version ",sobj@misc$technical_info$alakazam,").") +sobj@misc$parameters$Materials_and_Methods$References_packages <- find_ref(MandM = sobj@misc$parameters$Materials_and_Methods, pipeline.path = pipeline.path) +write_MandM(sobj=sobj, output.dir=output.dir) + +### Saving GE_ADT_BCR object +cat("\nSaving object...\n") +GE_BCR_file <- paste0(output.dir, basename(GE_file), '_BCR') +save(sobj, file = paste0(GE_BCR_file, '.rda'), compress = "bzip2") diff --git a/scripts/Int_Grp_pipeline_TCR.R b/scripts/Int_Grp_pipeline_TCR.R new file mode 100644 index 0000000..a279c98 --- /dev/null +++ b/scripts/Int_Grp_pipeline_TCR.R @@ -0,0 +1,383 @@ +#### Read parameters #### +library(optparse) +option_list <- list( + ### Project + make_option("--input.rda", help="Input seurat object (in .rda format)."), + make_option("--output.dir", help="Output path"), + make_option("--vdj.input.files.tcr", help="Files filtered_contig_annotations.csv from CellRanger aligment pipeline (in the same order than seurat objects for integration or grouping)."), + make_option("--author.name", help="Name of author of the analysis"), + make_option("--author.mail", help="Email of author of the analysis"), + ### Computational Parameters + make_option("--pipeline.path", help="Path to pipeline folder; it allows to change path if this script is used by snakemake and singularity, or singularity only or in local way. Example for singularity only: /WORKDIR/scRNAseq_10X_R4"), + ### Yaml parameters file to remplace all parameters before (usefull to use R script without snakemake) + make_option("--yaml", help="Patho to yaml file with all parameters") +) +parser <- OptionParser(usage="Rscript %prog [options]", description = " ", option_list = option_list) +args <- parse_args(parser, positional_arguments = 0) + +#### Formatting Parameters #### +#convert "NULL"/"FALSE"/"TRUE" (in character) into NULL/FALSE/TRUE +for (i in names(args$options)){ + if ((length(args$options[i]) == 0) || (length(args$options[i]) == 1 && toupper(args$options[i]) == "NULL")) { args$options[i] <- NULL + } else if ((length(args$options[i]) == 1) && (toupper(args$options[i]) == "FALSE")) { args$options[i] <- FALSE + } else if ((length(args$options[i]) == 1) && (toupper(args$options[i]) == "TRUE")) { args$options[i] <- TRUE + } +} + +#### Get Paramaters #### +### Project +input.rda <- args$options$input.rda +output.dir <- args$options$output.dir +vdj.input.files.tcr <- unlist(stringr::str_split(args$options$vdj.input.files.tcr, ",")) +author.name <- args$options$author.name +author.mail <- args$options$author.mail +### Computational Parameters +pipeline.path <- args$options$pipeline.path +### Yaml parameters file to remplace all parameters before (usefull to use R script without snakemake) +if (!is.null(args$options$yaml)){ + yaml_options <- yaml::yaml.load_file(args$options$yaml) + for(i in names(yaml_options)) { + #convert "NULL"/"FALSE"/"TRUE" (in character) into NULL/FALSE/TRUE + if ((length(yaml_options[[i]]) == 0) || (length(yaml_options[[i]]) == 1 && toupper(yaml_options[[i]]) == "NULL")) { yaml_options[[i]] <- NULL + } else if ((length(yaml_options[[i]]) == 1) && (toupper(yaml_options[[i]]) == "FALSE")) { yaml_options[[i]] <- FALSE + } else if ((length(yaml_options[[i]]) == 1) && (toupper(yaml_options[[i]]) == "TRUE")) { yaml_options[[i]] <- TRUE + } + #assign values + assign(i, yaml_options[[i]]) + } + rm(yaml_options, i) +} +### Clean +rm(args) + +#### Get path if snakemake/singularity/local #### +if(is.null(pipeline.path)) stop("--pipeline.path parameter must be set!") + +#### Check non-optional parameters #### +if (is.null(input.rda)) stop("input.rda parameter can't be empty!") +if (is.null(output.dir)) stop("output.dir parameter can't be empty!") +if (is.null(vdj.input.files.tcr)) stop("vdj.input.files.tcr parameter can't be empty!") + +### Load data +load(input.rda) + +### Save project parameters +if (!is.null(author.name) && !tolower(author.name) %in% tolower(sobj@misc$params$author.name)) sobj@misc$params$author.name <- c(sobj@misc$params$author.name, author.name) +if (!is.null(author.mail) && !tolower(author.mail) %in% tolower(sobj@misc$params$author.mail)) sobj@misc$params$author.mail <- c(sobj@misc$params$author.mail, author.mail) + +#### Get Missing Paramaters #### +### Analysis Parameters +## Normalization and dimension reduction +dimred.method <- sobj@misc$params$reductions$method +## Clustering +GE_file <- sub("\\.rda$", "", input.rda) +dimred.method <- sobj@misc$params$reductions$method +ident.name <- sobj@misc$params$clustering$ident +INT_GRP.reduction <- sobj@misc$params$clustering$umap +sample.name.INT_GRP <- Seurat::Project(sobj) +samples.name.GE <- unique(sort(sobj@meta.data$orig.ident)) +samples.name <- sub("_GE","",samples.name.GE) + +#### Fixed parameters #### +output_path_TCR <- paste0(output.dir, "/TCR_results/") +list_type_clT <- c("gene+nt", "gene", "nt", "aa") +list_type_contig <- c("nt","aa") +caption <- '“gene” - use the genes comprising the TCR +“nt” - use the nucleotide sequence of the CDR3 region +“aa” - use the amino acid sequence of the CDR3 region +“gene+nt” - use the genes comprising the TCR + the nucleotide sequence of the CDR3 region for T cells. This is the proper definition of clonotype.' + +#### Sourcing functions #### +source(paste0(pipeline.path, "/scripts/bustools2seurat_preproc_functions.R")) + +######### +## MAIN +######### + +### printing parameters: +print("###########################################") +print(paste0("input.rda : ",input.rda)) +print(paste0("ident.name : ",ident.name)) +print(paste0("vdj.input.files.tcr : ",vdj.input.files.tcr)) +print("###########################################") + +## Load libraries +require(patchwork) +suppressMessages(require(Seurat)) +library(dplyr) + +## Set the seed +set.seed(sobj@misc$params$seed) + +## GLOBAL ANALYSIS >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +cat("\nGlobal Analysis >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n") +global_output <- paste0(output_path_TCR, "Global_analysis") +dir.create(path = global_output, recursive = TRUE, showWarnings = FALSE) + +## Loading input data and Combining contigs +cat("\nLoading input data and Combining contigs...\n") +cr_res <- lapply(seq_along(vdj.input.files.tcr), load.sc.tcr.bcr, sobj=sobj, vdj.input.file=vdj.input.files.tcr, sample.name=samples.name.GE) +tcr.combined <- scRepertoire::combineTCR(df = cr_res, samples = samples.name, ID = rep("TCR", length(samples.name)), cells = "T-AB") + +## Quantification of unique contig analysis +cat("\nQuantification analysis...\n") +Quantif.unique.g(combined = tcr.combined, list_type_clT = list_type_clT, out.dir = global_output, caption=caption, sample.name=sample.name.INT_GRP) + +## Abundance analysis +cat("\nAbundance analysis...\n") +### Plots +for(x in list_type_clT) assign(paste0("plot_abundanceContig_",sub("\\+","_",x)),patchwork::wrap_elements(scRepertoire::abundanceContig(tcr.combined, cloneCall = x, scale = F) + plot_annotation(title = x, theme = ggplot2::theme(plot.title = ggplot2::element_text(size=10, hjust=0.6, face="bold"))) + Seurat::NoLegend())) +### Save +png(paste0(global_output,'/abundanceContig.png'), width = 800, height = 300) +(plot_abundanceContig_gene_nt | plot_abundanceContig_gene | plot_abundanceContig_nt | plot_abundanceContig_aa ) + + plot_annotation(title = sample.name.INT_GRP, subtitle = paste0("(",dim(sobj)[2]," cells)"), caption = caption, theme = ggplot2::theme(plot.caption = ggplot2::element_text(hjust=0), plot.title = ggplot2::element_text(size=20, hjust=0, face="bold"))) +dev.off() + +## Contigs Length analysis +cat("\nContigs Length analysis...\n") +for(x in list_type_contig){ + ### This should give multimodal plot + assign(paste0("plot_lengthContig_",x,"_comb"), scRepertoire::lengthContig(tcr.combined, cloneCall=x, chains = "combined") + Seurat::NoLegend()) + ### Plots the A and B chains distribution separately + assign(paste0("plot_lengthContig_",x,"_sin"), scRepertoire::lengthContig(tcr.combined, cloneCall=x, chains = "single") + Seurat::NoLegend()) +} +### Save +png(paste0(global_output,'/lengthContig.png'), width = 800, height = 800) +(plot_lengthContig_nt_comb | plot_lengthContig_nt_sin) / (plot_lengthContig_aa_comb | plot_lengthContig_aa_sin ) + + plot_annotation(title = sample.name.INT_GRP, theme = ggplot2::theme(plot.title = ggplot2::element_text(size=20, hjust=0, face="bold"))) +dev.off() + +## Clonal Homeostasis analysis +cat("\nClonal Homeostasis analysis...\n") +Homeo.g(combined = tcr.combined, list_type_clT = list_type_clT, out.dir = global_output, caption=caption, sample.name=sample.name.INT_GRP) + +## Clonal Proportions analysis +cat("\nClonal Proportions analysis...\n") +Prop.g(combined = tcr.combined, list_type_clT = list_type_clT, out.dir = global_output, caption=caption, sample.name=sample.name.INT_GRP) + +## Diversity analysis +cat("\nDiversity analysis...\n") +Div.g(combined = tcr.combined, list_type_clT = list_type_clT, out.dir = global_output, caption=caption, sample.name=sample.name.INT_GRP) + +## Combine TCR data with seurat object +cat("\nCombine TCR data with seurat object...\n") +### Corresponding barcode +for(i in 1:length(tcr.combined)) tcr.combined[[i]]$barcode <- gsub(pattern = "TCR", replacement = "GE", tcr.combined[[i]]$barcode) +### Combination +sobj <- scRepertoire::combineExpression(df = tcr.combined, sc = sobj, cloneCall="aa") +sobj@meta.data$Frequency_indiv=sobj@meta.data$Frequency +### Combination tout echantillon confondu +tcr.combined_unlist <- do.call("rbind", tcr.combined) +sobj <- scRepertoire::combineExpression(df = tcr.combined_unlist, sc = sobj, cloneCall="aa") +sobj@meta.data$Frequency_all=sobj@meta.data$Frequency +sobj@meta.data$Frequency=NULL +rm(tcr.combined,tcr.combined_unlist) +### Spliting CTstrict (into separate columns for TRA-V/J/C, TRB-V/J/C and corresponding sequences, with 2 possible clonotypes) and save as metadata +### and Adding length of TR sequence to meta.data +sobj <- split.CTstrict.tcr(sobj) +### for all samples +### Plots +for(x in c("TRAV_1","TRAJ_1","TRAC_1","TRBV_1","TRBJ_1","TRBC_1","TRAV_2","TRAJ_2","TRAC_2","TRBV_2","TRBJ_2","TRBC_2")) assign(paste0("dimplot_",x), if (all(is.na(sobj@meta.data[[x]]))) patchwork::plot_spacer() + Seurat::DarkTheme() + ggplot2::ggtitle(x) else Seurat::DimPlot(sobj, group.by = x, reduction = INT_GRP.reduction) + Seurat::DarkTheme() + ggplot2::ggtitle(x)) +for(x in c("TRA_nt_1_len","TRA_nt_2_len","TRB_nt_1_len","TRB_nt_2_len")) { + assign(paste0("featureplot_",x), + tryCatch({ Seurat::FeaturePlot(sobj, features = x, reduction = INT_GRP.reduction) }, + error=function(e) { return(patchwork::plot_spacer()) } + ) + Seurat::DarkTheme() + ggplot2::ggtitle(paste0("Nucleotidic length ", x))) +} +### Save plots +png(paste0(global_output,'/cloneType_',sample.name.INT_GRP,'.png'), width =1400, height = 3000) +( wrap_elements( (dimplot_TRAV_1 / dimplot_TRAJ_1 / dimplot_TRAC_1 / dimplot_TRBV_1 / dimplot_TRBJ_1 / dimplot_TRBC_1 / featureplot_TRA_nt_1_len / featureplot_TRB_nt_1_len) + + plot_annotation(title = 'TR clonotype 1', theme = ggplot2::theme(plot.title = ggplot2::element_text(size=50, hjust=0.5, face="bold"))) ) | + wrap_elements( (dimplot_TRAV_2 / dimplot_TRAJ_2 /dimplot_TRAC_2 / dimplot_TRBV_2 / dimplot_TRBJ_2 / dimplot_TRBC_2 / featureplot_TRA_nt_2_len / featureplot_TRB_nt_2_len) + + plot_annotation(title = 'TR clonotype 2', theme = ggplot2::theme(plot.title = ggplot2::element_text(size=50, hjust=0.5, face="bold"))) ) + + plot_annotation(title = sample.name.INT_GRP, theme = ggplot2::theme(plot.title = ggplot2::element_text(size=50, hjust=0.5, face="bold"))) ) +dev.off() +### by samples +for (i in seq(samples.name)){ + #### selection des data du sample + cells_sample=rownames(sobj@meta.data[sobj@meta.data$orig.ident == samples.name.GE[i],]) + sub_sobj=subset(sobj, cells = cells_sample) + #### Plots + for(x in c("TRAV_1","TRAJ_1","TRAC_1","TRBV_1","TRBJ_1","TRBC_1","TRAV_2","TRAJ_2","TRAC_2","TRBV_2","TRBJ_2","TRBC_2")) assign(paste0("dimplot_",x), tryCatch( { print(Seurat::DimPlot(sub_sobj, group.by = x, reduction = INT_GRP.reduction) + Seurat::DarkTheme() + ggplot2::ggtitle(x)) }, error=function(err) { patchwork::plot_spacer() + Seurat::DarkTheme() + ggplot2::ggtitle(x) } )) + for(x in c("TRA_nt_1_len","TRA_nt_2_len","TRB_nt_1_len","TRB_nt_2_len")) assign(paste0("featureplot_",x), tryCatch( { print(Seurat::FeaturePlot(sub_sobj, features = x, reduction = INT_GRP.reduction) + Seurat::DarkTheme() + ggplot2::ggtitle(paste0("Nucleotidic length ", x))) }, error=function(err) { patchwork::plot_spacer() + Seurat::DarkTheme() + ggplot2::ggtitle(paste0("Nucleotidic length ", x)) } )) + #### Save plots + png(paste0(global_output,'/cloneType_',samples.name[i],'.png'), width =1400, height = 3000) + print( wrap_elements( (dimplot_TRAV_1 / dimplot_TRAJ_1 / dimplot_TRAC_1 / dimplot_TRBV_1 / dimplot_TRBJ_1 / dimplot_TRBC_1 / featureplot_TRA_nt_1_len / featureplot_TRB_nt_1_len) + + plot_annotation(title = 'TR clonotype 1', theme = ggplot2::theme(plot.title = ggplot2::element_text(size=50, hjust=0.5, face="bold"))) ) | + wrap_elements( (dimplot_TRAV_2 / dimplot_TRAJ_2 /dimplot_TRAC_2 / dimplot_TRBV_2 / dimplot_TRBJ_2 / dimplot_TRBC_2 / featureplot_TRA_nt_2_len / featureplot_TRB_nt_2_len) + + plot_annotation(title = 'TR clonotype 2', theme = ggplot2::theme(plot.title = ggplot2::element_text(size=50, hjust=0.5, face="bold"))) ) + + plot_annotation(title = samples.name[i], theme = ggplot2::theme(plot.title = ggplot2::element_text(size=50, hjust=0.5, face="bold"))) ) + dev.off() +} + +## Frequency analysis +cat("\nFrequency analysis...\n") +sobj <- Freq.g(sobj=sobj, out.dir = global_output, sample.name = sample.name.INT_GRP, reduction = INT_GRP.reduction, freq_col = "Frequency_all") +for (i in seq(samples.name)){ + #### selection des data du sample + cells_sample=rownames(sobj@meta.data[sobj@meta.data$orig.ident == samples.name.GE[i],]) + sub_sobj=subset(sobj, cells = cells_sample) + #### Analysis + #Top 10 frequencies, and top 10 to top 20 frequencies + top20_freq = sub_sobj@meta.data %>% select(Frequency_indiv,CTaa,highlight_aa_all) %>% distinct() %>% arrange(desc(Frequency_indiv)) %>% na.omit() %>% top_n(n = 20, wt = Frequency_indiv) + top20_freq = top20_freq[1:20,] + rownames(top20_freq)=top20_freq$highlight_aa_all + sub_sobj$highlight_aa_top10_freq <- ifelse(sub_sobj$highlight_aa_all %in% top20_freq$highlight[1:10], sub_sobj$highlight_aa_all, NA) + sub_sobj$highlight_aa_top11to20_freq <- ifelse(sub_sobj$highlight_aa_all %in% top20_freq$highlight[11:length(top20_freq$highlight)], sub_sobj$highlight_aa_all, NA) + #UMAP of top 10 frequencies + png(paste0(global_output,'/Frequency_top_10_umap',samples.name[i],'.png'), width = 800, height = (400+350)) + print(patchwork::wrap_elements( (Seurat::DimPlot(sub_sobj, reduction = INT_GRP.reduction, group.by = "highlight_aa_top10_freq") + Seurat::DarkTheme()) / gridExtra::tableGrob(top20_freq[1:10,c("Frequency_indiv","CTaa")], theme = gridExtra::ttheme_default(base_size = 10)) + + plot_annotation(title = paste0(samples.name[i],": Top 10 Clonotypes (by frequencies)"), subtitle = paste0("(",dim(sobj)[2]," cells)"), theme = ggplot2::theme(plot.title = ggplot2::element_text(size=20, hjust=0.5, face="bold"))) + + plot_layout(heights = c(2, 1)))) + dev.off() + #UMAP of top 11 to 20 frequencies + png(paste0(global_output,'/Frequency_top11to20_umap',samples.name[i],'.png'), width = 800, height = (400+350)) + print(patchwork::wrap_elements( (Seurat::DimPlot(sub_sobj, reduction = INT_GRP.reduction, group.by = "highlight_aa_top11to20_freq") + Seurat::DarkTheme()) / gridExtra::tableGrob(top20_freq[11:length(top20_freq$CTaa),c("Frequency_indiv","CTaa")], theme = gridExtra::ttheme_default(base_size = 10)) + + plot_annotation(title = paste0(samples.name[i], ": Top 11 to 20 Clonotypes (by frequencies)"), subtitle = paste0("(",dim(sobj)[2]," cells)"), theme = ggplot2::theme(plot.title = ggplot2::element_text(size=20, hjust=0.5, face="bold"))) + + plot_layout(heights = c(2, 1)))) + dev.off() +} + +## Physicochemical properties of the CDR3 +cat("\nPhysicochemical properties of the CDR3 analysis...\n") +Physicochemical_properties.g(sobj=sobj, list_type_clT = list_type_clT, out.dir = global_output, sample.name=sample.name.INT_GRP, type='TCR') + + + + +## CLUSTERS LEVEL ANALYSIS >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +cat("\nClusters Level Analysis >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n") +clusters_output <- paste0(output_path_TCR, "Clusters_analysis") +dir.create(path = clusters_output, recursive = TRUE, showWarnings = FALSE) + +## By sample +for (i in seq(samples.name)){ + #create directory + sample_output=paste0(clusters_output, "/", samples.name[i]) + dir.create(path = sample_output, recursive = TRUE, showWarnings = TRUE) + + #### selection des data du sample + cells_sample=rownames(sobj@meta.data[sobj@meta.data$orig.ident == samples.name.GE[i],]) + sub_sobj=subset(sobj, cells = cells_sample) + + ## Filter cells that have no value for the x concerned + conversion to a list by clusters + ## Need to filter, otherwise the functions count the 'NA' as a sequence. + for(x in list_type_clT){ + if(x=="gene+nt") y="strict" else y=x + filtred_sobj = sub_sobj[,!is.na(sub_sobj@meta.data[paste0("CT", y)])] + assign(paste0("filtred_metadata_", sub("\\+","_",x)), scRepertoire::expression2List(sc=filtred_sobj, group=ident.name)) + } + rm(filtred_sobj) + + ## Quantification of unique contig analysis + cat("\nQuantification analysis...\n") + sub_sobj <- Quantif.unique.c(sobj = sub_sobj, ident.name = ident.name, list_type_clT = list_type_clT, out.dir = sample_output, caption = caption, sample.name = samples.name[i], filtred_metadata_aa = filtred_metadata_aa, filtred_metadata_nt = filtred_metadata_nt, filtred_metadata_gene = filtred_metadata_gene, filtred_metadata_gene_nt = filtred_metadata_gene_nt) + + ## Abundance analysis + cat("\nAbundance analysis...\n") + ### Plots + for(x in list_type_clT) assign(paste0("plot_cluster_abundanceContig_",sub("\\+","_",x)),patchwork::wrap_elements(scRepertoire::abundanceContig(get(paste0("filtred_metadata_", sub("\\+","_",x))), cloneCall = x, scale = F) + plot_annotation(title = x, theme = ggplot2::theme(plot.title = ggplot2::element_text(size=10, hjust=0.6, face="bold"))) )) + ### Save + png(paste0(clusters_output,'/abundanceContig.png'), width = 2000, height = 600) + (plot_cluster_abundanceContig_gene_nt | plot_cluster_abundanceContig_gene | plot_cluster_abundanceContig_nt | plot_cluster_abundanceContig_aa ) + + plot_annotation(title = samples.name[i], caption = caption, theme = ggplot2::theme(plot.caption = ggplot2::element_text(hjust=0), plot.title = ggplot2::element_text(size=20, hjust=0, face="bold"))) + dev.off() + + ## Clonal Homeostasis analysis + cat("\nClonal Homeostasis analysis...\n") + sub_sobj <- Homeo.c(sobj = sub_sobj, ident.name = ident.name, list_type_clT = list_type_clT, out.dir = sample_output, caption = caption, sample.name = samples.name[i], filtred_metadata_aa = filtred_metadata_aa, filtred_metadata_nt = filtred_metadata_nt, filtred_metadata_gene = filtred_metadata_gene, filtred_metadata_gene_nt = filtred_metadata_gene_nt) + + ## Clonal Proportions analysis + cat("\nClonal Proportions analysis...\n") + sub_sobj <- Prop.c(sobj = sub_sobj, list_type_clT = list_type_clT, out.dir = sample_output, caption = caption, sample.name = samples.name[i], filtred_metadata_aa = filtred_metadata_aa, filtred_metadata_nt = filtred_metadata_nt, filtred_metadata_gene = filtred_metadata_gene, filtred_metadata_gene_nt = filtred_metadata_gene_nt) + + ## Diversity analysis + cat("\nDiversity analysis...\n") + sub_sobj <- Div.c(sobj = sub_sobj, list_type_clT = list_type_clT, out.dir = sample_output, caption = caption, sample.name = samples.name[i], filtred_metadata_aa = filtred_metadata_aa, filtred_metadata_nt = filtred_metadata_nt, filtred_metadata_gene = filtred_metadata_gene, filtred_metadata_gene_nt = filtred_metadata_gene_nt) + + ## Frequency analysis + cat("\nFrequency analysis...\n") + Freq.c(sobj = sub_sobj, list_type_clT = list_type_clT, out.dir = sample_output, caption = caption, sample.name = samples.name[i], ident.name = ident.name, reduction=INT_GRP.reduction, freq_col = "Frequency_indiv", filtred_metadata_aa = filtred_metadata_aa, filtred_metadata_nt = filtred_metadata_nt, filtred_metadata_gene = filtred_metadata_gene, filtred_metadata_gene_nt = filtred_metadata_gene_nt) + + ## Clonal Overlap analysis (si plus de 1) + cat("\nClonal Overlap analysis...\n") + if(length(levels(Seurat::Idents(sobj)))!=1 && length(unique(sobj@meta.data[!is.na(sobj@meta.data$CTstrict),ident.name]))!=1) Overlap.c(sobj = sub_sobj, list_type_clT = list_type_clT, out.dir = sample_output, caption = caption, sample.name = samples.name[i], filtred_metadata_aa = filtred_metadata_aa, filtred_metadata_nt = filtred_metadata_nt, filtred_metadata_gene = filtred_metadata_gene, filtred_metadata_gene_nt = filtred_metadata_gene_nt) + + ## Physico-chemical properties of the CDR3 + cat("\nPhysico-chemical properties of the CDR3 analysis...\n") + Physicochemical_properties.c(sobj = sub_sobj, list_type_clT = list_type_clT, out.dir = sample_output, caption = caption, sample.name = samples.name[i], ident.name = ident.name, filtred_metadata_aa = filtred_metadata_aa, filtred_metadata_nt = filtred_metadata_nt, filtred_metadata_gene = filtred_metadata_gene, filtred_metadata_gene_nt = filtred_metadata_gene_nt, type='TCR') +} + +#All samples +## Filter cells that have no value for the x concerned + conversion to a list by clusters +## Need to filter, otherwise the functions count the 'NA' as a sequence. +for(x in list_type_clT){ + if(x=="gene+nt") y="strict" else y=x + filtred_sobj = sobj[,!is.na(sobj@meta.data[paste0("CT", y)])] + assign(paste0("filtred_metadata_", sub("\\+","_",x)), scRepertoire::expression2List(sc=filtred_sobj, group=ident.name)) +} +rm(filtred_sobj) + +## Quantification of unique contig analysis +cat("\nQuantification analysis...\n") +sobj <- Quantif.unique.c(sobj = sobj, ident.name=ident.name, list_type_clT = list_type_clT, out.dir = clusters_output, caption=caption, sample.name=sample.name.INT_GRP, filtred_metadata_aa=filtred_metadata_aa, filtred_metadata_nt=filtred_metadata_nt, filtred_metadata_gene=filtred_metadata_gene, filtred_metadata_gene_nt=filtred_metadata_gene_nt) + +## Abundance analysis +cat("\nAbundance analysis...\n") +### Plots +for(x in list_type_clT) assign(paste0("plot_cluster_abundanceContig_",sub("\\+","_",x)),patchwork::wrap_elements(scRepertoire::abundanceContig(get(paste0("filtred_metadata_", sub("\\+","_",x))), cloneCall = x, scale = F) + plot_annotation(title = x, theme = ggplot2::theme(plot.title = ggplot2::element_text(size=10, hjust=0.6, face="bold"))) )) +### Save +png(paste0(clusters_output,'/abundanceContig.png'), width = 2000, height = 600) +(plot_cluster_abundanceContig_gene_nt | plot_cluster_abundanceContig_gene | plot_cluster_abundanceContig_nt | plot_cluster_abundanceContig_aa ) + + plot_annotation(title = sample.name.INT_GRP, caption = caption, theme = ggplot2::theme(plot.caption = ggplot2::element_text(hjust=0), plot.title = ggplot2::element_text(size=20, hjust=0, face="bold"))) +dev.off() + +## Clonal Homeostasis analysis +cat("\nClonal Homeostasis analysis...\n") +sobj <- Homeo.c(sobj = sobj, ident.name=ident.name, list_type_clT = list_type_clT, out.dir = clusters_output, caption=caption, sample.name=sample.name.INT_GRP, filtred_metadata_aa=filtred_metadata_aa, filtred_metadata_nt=filtred_metadata_nt, filtred_metadata_gene=filtred_metadata_gene, filtred_metadata_gene_nt=filtred_metadata_gene_nt) + +## Clonal Proportions analysis +cat("\nClonal Proportions analysis...\n") +sobj <- Prop.c(sobj = sobj, list_type_clT = list_type_clT, out.dir = clusters_output, caption=caption, sample.name=sample.name.INT_GRP, filtred_metadata_aa=filtred_metadata_aa, filtred_metadata_nt=filtred_metadata_nt, filtred_metadata_gene=filtred_metadata_gene, filtred_metadata_gene_nt=filtred_metadata_gene_nt) + +## Diversity analysis +cat("\nDiversity analysis...\n") +sobj <- Div.c(sobj = sobj, list_type_clT = list_type_clT, out.dir = clusters_output, caption=caption, sample.name=sample.name.INT_GRP, filtred_metadata_aa=filtred_metadata_aa, filtred_metadata_nt=filtred_metadata_nt, filtred_metadata_gene=filtred_metadata_gene, filtred_metadata_gene_nt=filtred_metadata_gene_nt) + +## Frequency analysis +cat("\nFrequency analysis...\n") +Freq.c(sobj = sobj, list_type_clT = list_type_clT, out.dir = clusters_output, caption=caption, sample.name=sample.name.INT_GRP, ident.name=ident.name, reduction=, freq_col="Frequency", filtred_metadata_aa=filtred_metadata_aa, filtred_metadata_nt=filtred_metadata_nt, filtred_metadata_gene=filtred_metadata_gene, filtred_metadata_gene_nt=filtred_metadata_gene_nt) + +## Clonal Overlap analysis (si plus de 1) +cat("\nClonal Overlap analysis...\n") +if(length(levels(Seurat::Idents(sobj)))!=1 && length(unique(sobj@meta.data[!is.na(sobj@meta.data$CTstrict),ident.name]))!=1) Overlap.c(sobj = sobj, list_type_clT = list_type_clT, out.dir = clusters_output, caption=caption, sample.name=sample.name.INT_GRP, filtred_metadata_aa=filtred_metadata_aa, filtred_metadata_nt=filtred_metadata_nt, filtred_metadata_gene=filtred_metadata_gene, filtred_metadata_gene_nt=filtred_metadata_gene_nt) + +## Physico-chemical properties of the CDR3 +cat("\nPhysico-chemical properties of the CDR3 analysis...\n") +Physicochemical_properties.c(sobj = sobj, list_type_clT = list_type_clT, out.dir = clusters_output, caption=caption, sample.name=sample.name.INT_GRP, ident.name=ident.name, filtred_metadata_aa=filtred_metadata_aa, filtred_metadata_nt=filtred_metadata_nt, filtred_metadata_gene=filtred_metadata_gene, filtred_metadata_gene_nt=filtred_metadata_gene_nt, type='TCR') + + +#renamme TCR columns with 'TCR_' prefix +toMatch <- c("^CTgene","^CTnt","^CTaa","^CTstrict","^Frequency","^cloneType","^TRAV_1","^TRAJ_1","^TRAC_1","^TRAV_2","^TRAJ_2","^TRAC_2","^TRA_nt_1","^TRA_nt_2","^TRBV_1","^TRBJ_1","^None_1","^TRBC_1","^TRBV_2","^TRBJ_2","^None_2","^TRBC_2","^TRB_nt_1","^TRB_nt_2","^TRA_nt_1_len","^TRA_nt_2_len","^TRB_nt_1_len","^TRB_nt_2_len","^highlight_aa") +matches <- grep(paste(toMatch,collapse="|"), colnames(sobj@meta.data)) +colnames(sobj@meta.data)[matches] <- paste0("TCR_", grep(paste(toMatch,collapse="|"), colnames(sobj@meta.data), value=TRUE)) + +## Save packages versions +sobj@misc$technical_info$scRepertoire <- utils::packageVersion('scRepertoire') +sobj@misc$technical_info$alakazam <- utils::packageVersion('alakazam') + +### Materials and Methods +if(file.exists(paste0(dirname(vdj.input.files.tcr), "/Materials_and_Methods.txt"))){ + tmp <- readr::read_tsv(paste0(dirname(vdj.input.files.tcr), "/Materials_and_Methods.txt"), col_names = FALSE)$X1 + tmp2 <- "" + for (i in 1:length(tmp)) tmp2=paste(tmp2,tmp[i], sep="") + sobj@misc$parameters$Materials_and_Methods$tcr <- tmp2 +} else sobj@misc$parameters$Materials_and_Methods$TCR <- NULL +sobj@misc$parameters$Materials_and_Methods$TCR <- paste0(sobj@misc$parameters$Materials_and_Methods$TCR, " The annotation was merged with corresponding cell barcode of 5’ gene expression. The scRepertoire package (version ",sobj@misc$technical_info$scRepertoire,") was used to process annotation to assign clonotype based on TCR chains. scRepertoire allows to study contig quantification, contig abundance, contig length, clonal space homeostasis, clonal proportion, clonal overlap beetween clusters and diversity. Physicochemical properties of the CDR3, based on amino-acid sequences, was determined by the alakazam R package (version ",sobj@misc$technical_info$alakazam,").") +sobj@misc$parameters$Materials_and_Methods$References_packages <- find_ref(MandM = sobj@misc$parameters$Materials_and_Methods, pipeline.path = pipeline.path) +write_MandM(sobj=sobj, output.dir=output.dir) + +### Saving GE_ADT_TCR object +cat("\nSaving object...\n") +GE_TCR_file <- paste0(output.dir, basename(GE_file), '_TCR') +save(sobj, file = paste0(GE_TCR_file, '.rda'), compress = "bzip2") diff --git a/scripts/Integration_part1.R b/scripts/Integration_part1.R index 29b75e8..43cf209 100644 --- a/scripts/Integration_part1.R +++ b/scripts/Integration_part1.R @@ -213,7 +213,7 @@ if(integration.method %in% c("Seurat","Liger")) { #keep normalisation norm.method <- unique(n.meth) } -### Add prefix for colnames of sample clustering and clean TCR/BCR +### Add prefix for colnames of sample clustering and clean ADT/TCR/BCR for (i in names(sobj.list)){ # add prefix for colnames of sample clustering to_rename=grep("_res\\.",colnames(sobj.list[[i]]@meta.data), value = TRUE) @@ -221,8 +221,8 @@ for (i in names(sobj.list)){ sobj.list[[i]]@meta.data[[paste0(i,'_',j)]]=sobj.list[[i]]@meta.data[[j]] sobj.list[[i]]@meta.data[[j]]=NULL } - # cleaning sobj for TCR and BCR part - TCR_BCR_col=grep("^TCR|^BCR", colnames(sobj.list[[i]]@meta.data), value = TRUE) + # cleaning sobj for ADT, TCR and BCR part + TCR_BCR_col=grep("^ADT|^TCR|^BCR", colnames(sobj.list[[i]]@meta.data), value = TRUE) if(length(TCR_BCR_col) > 0) sobj.list[[i]]@meta.data[TCR_BCR_col] <- NULL } @@ -255,17 +255,24 @@ if((integration.method == "Seurat") || (integration.method == 'Liger')){ ## Integration if(integration.method == "Seurat"){ message(paste0(integration.method," integration...")) + sobj.list <- sapply(seq_along(sobj.list), function(x) { #Rename cells + new_cells_name <- paste0(names(sobj.list)[x],"_",colnames(sobj.list[[x]])) + sobj.list[[x]] <- Seurat::RenameCells(sobj.list[[x]], new.names = new_cells_name) + return(sobj.list[[x]]) + }) if(assay == 'SCT') int.norm.method <- 'SCT' else int.norm.method <- 'LogNormalize' sobj.features <- Seurat::SelectIntegrationFeatures(object.list = sobj.list, nfeatures = features.n) #Sélection des marqueurs biologiques partagés sobj.list <- Seurat::PrepSCTIntegration(object.list = sobj.list, anchor.features = sobj.features) #Verifie que les résidus de Pearson ont bien été calculés sobj.anchors <- Seurat::FindIntegrationAnchors(object.list = sobj.list, normalization.method = int.norm.method, anchor.features = sobj.features) #CCA + L2normalisation; puis KNN; puis MNNs : identification des paires de cellules; filtrage des anchors; calcul des scores sobj <- Seurat::IntegrateData(anchorset = sobj.anchors, normalization.method = int.norm.method) #Calcul des poids; application des poids sur la matrice d'expression: intégration + # Params Seurat::Project(sobj) <- name.int sobj@misc$params$seed <- sobj.list[[1]]@misc$params$seed DefaultAssay(sobj) <- "integrated" sobj@assays[["integrated"]]@misc$scaling$vtr = NA sobj@assays[["integrated"]]@misc$params$normalization$normalization.method <- norm.method + sobj@misc$params$normalization$normalization.method <- norm.method sobj@misc$params$integration$method <- integration.method sobj@misc$params$integration$orig.assay <- assay sobj@misc$params$integration$out.assay <- "integrated" @@ -433,12 +440,13 @@ sobj@misc$params$analysis_type <- paste0("Integrated analysis; Method: ", integr sobj@misc$params$sobj_creation$Rsession <- utils::capture.output(devtools::session_info()) sobj@misc$params$species <- species sobj@misc$params$name.int <- name.int +Seurat::Project(sobj) <- name.int if (!is.null(author.name) && !tolower(author.name) %in% tolower(sobj@misc$params$author.name)) sobj@misc$params$author.name <- c(sobj@misc$params$author.name, author.name) if (!is.null(author.mail) && !tolower(author.mail) %in% tolower(sobj@misc$params$author.mail)) sobj@misc$params$author.mail <- c(sobj@misc$params$author.mail, author.mail) save(sobj, file = paste0(norm.dim.red.dir, '/', paste(c(name.int, norm_vtr, dimred_vtr), collapse = '_'), '.rda'), compress = "bzip2") ### Correlating reduction dimensions with biases and markers expression -dimensions.eval(sobj = sobj, reduction = red.name, eval.markers = eval.markers, meta.names = c('orig.ident','nCount_RNA', 'nFeature_RNA', 'percent_mt', 'MTscore', 'percent_rb', 'RBscore', 'percent_st', 'STscore', "Cyclone.S.Score", "Cyclone.G1.Score", "Cyclone.G2M.Score", "Cyclone.SmG2M.Score"), slot = 'data', out.dir = norm.dim.red.dir, nthreads = floor(nthreads/2)) +dimensions.eval(sobj = sobj, reduction = red.name, eval.markers = eval.markers, slot = 'data', out.dir = norm.dim.red.dir, nthreads = floor(nthreads/2)) gc() ### Testing multiple clustering parameters (nb dims kept + Louvain resolution) diff --git a/scripts/bustools2seurat_preproc_functions.R b/scripts/bustools2seurat_preproc_functions.R index f279841..2271098 100644 --- a/scripts/bustools2seurat_preproc_functions.R +++ b/scripts/bustools2seurat_preproc_functions.R @@ -2250,9 +2250,9 @@ feature.cor <- function(sobj = NULL, assay1 = 'RNA', assay2 = 'ADT', assay1.feat if (zero.filter) zerocells.get(sobj = sobj, assay1 = assay1, assay2 = assay2, assay1.feature = assay1.features[k], assay2.feature = assay2.features[k], slot = slot) else rep(TRUE, ncol(sobj@assays[[assay1]])) }, simplify = FALSE) #correlation - res_cor <- t(sapply(seq_along(gene.names), function(k) { + res_cor <- t(cbind(sapply(seq_along(gene.names), function(k) { cell.idx <- cell.idx.list[[k]] - if(length(which(cell.idx)) < 2) return(NA) # must to get at least 2 cells with RNA and protein expressions to compute correlation + if(length(which(cell.idx)) < 2) return(c(NA,NA,NA)) # must to get at least 2 cells with RNA and protein expressions to compute correlation tryCatch({ suppressMessages(library(dplyr)) data_ADT <- slot(sobj@assays[[assay2]], slot)[rownames(slot(sobj@assays[[assay2]], slot)) == assay2.features[k],cell.idx] @@ -2266,7 +2266,7 @@ feature.cor <- function(sobj = NULL, assay1 = 'RNA', assay2 = 'ADT', assay1.feat }, error = function(cond) { return(c(NA,NA,NA)) }) - })) + }))) #add the number of cells and colnames res_cor <- data.frame(vapply(cell.idx.list, function(x) { length(which(x)) }, 1L), res_cor, stringsAsFactors = FALSE) colnames(res_cor) <- c("nb_cells","cor","pval","significant_cor") @@ -2287,7 +2287,8 @@ feature_plots <- function(sobj, assay, features, slot, reduction, min.cutoff, ma Seurat::FeaturePlot(sobj, features = features[k], slot = slot, reduction = reduction, ncol = 1, pt.size = 2, order = TRUE, min.cutoff = min.cutoff[k], max.cutoff = max.cutoff[k]) }, warning=function(cond) { - if(grepl("All cells have the same value", as.character(cond)) ) tmp_plot = Seurat::FeaturePlot(sobj, features = features[k], slot = slot, reduction = reduction, ncol = 1, pt.size = 2, order = TRUE, min.cutoff = min.cutoff[k], max.cutoff = max.cutoff[k]) + if(grepl("NAs introduced by coercion", as.character(cond)) ) tmp_plot = suppressWarnings(Seurat::FeaturePlot(sobj, features = features[k], slot = slot, reduction = reduction, ncol = 1, pt.size = 2, order = TRUE, min.cutoff = min.cutoff[k], max.cutoff = max.cutoff[k])) + if(grepl("All cells have the same value", as.character(cond)) ) tmp_plot = suppressWarnings(Seurat::FeaturePlot(sobj, features = features[k], slot = slot, reduction = reduction, ncol = 1, pt.size = 2, order = TRUE, min.cutoff = min.cutoff[k], max.cutoff = max.cutoff[k])) if(grepl("The following requested variables were not found", as.character(cond)) ) tmp_plot = patchwork::wrap_elements(patchwork::plot_spacer() + patchwork::plot_annotation(title = features[k], theme = ggplot2::theme(plot.title = ggplot2::element_text(size=18, hjust=0.5, face="bold")))) if(grepl("Could not find", as.character(cond)) ) tmp_plot = patchwork::wrap_elements(patchwork::plot_spacer() + patchwork::plot_annotation(title = features[k], theme = ggplot2::theme(plot.title = ggplot2::element_text(size=18, hjust=0.5, face="bold")))) return(tmp_plot) diff --git a/scripts/pipeline_BCR.R b/scripts/pipeline_BCR.R index 6b36955..d7f95e8 100644 --- a/scripts/pipeline_BCR.R +++ b/scripts/pipeline_BCR.R @@ -110,7 +110,7 @@ set.seed(sobj@misc$params$seed) ## GLOBAL ANALYSIS >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> cat("\nGlobal Analysis >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n") global_output <- paste0(output_path_BCR, "Global_analysis") -dir.create(path = global_output, recursive = TRUE, showWarnings = TRUE) +dir.create(path = global_output, recursive = TRUE, showWarnings = FALSE) ## Loading input data and Combining contigs cat("\nLoading input data and Combining contigs...\n")