add ADT, TCR and BCR for Integrated or Grouped analysis

gustaveroussy · May 14, 2021 · cf77427 · cf77427
1 parent 5e0d97b
commit cf77427
Show file tree

Hide file tree

Showing 16 changed files with 1,753 additions and 141 deletions.
diff --git a/Snakefile b/Snakefile
diff --git a/rules/Grp_Adding_ADT.smk b/rules/Grp_Adding_ADT.smk
@@ -0,0 +1,86 @@
+"""
+##########################################################################
+This rule adds ADT information to the gene expression analysis in grouped single-cell RNA-seq.
+##########################################################################
+"""
+
+# Constrain the {grp_add_adt_output} wildcard to the entries of GRP_ADD_ADT_OUTPUT, joined with "|" into a single alternation.
+# NOTE(review): the entries are inserted unescaped - confirm none of them contains regex metacharacters.
+wildcard_constraints:
+    grp_add_adt_output = "|".join(GRP_ADD_ADT_OUTPUT)
+
+"""
+This function allows to determine the input .rda ge file and kallisto adt folder.
+"""
+def grp_add_adt_input(wildcards):
+    """
+    Return the rule inputs as a list: the gene expression .rda file first,
+    then the ADT kallisto entries (stored as one comma-separated string)
+    without repeated entries.
+    """
+    info = dic_GRP_ADD_ADT_INFO[wildcards.grp_add_adt_output]
+    inputs = [info['GRP_ADD_ADT_INPUT_RDA']]
+    # dict.fromkeys() keeps the first occurrence of each entry, in order;
+    # the .rda stays at index 0 because the rule reads it back as input[0].
+    inputs.extend(dict.fromkeys(info['GRP_ADD_ADT_INPUT_DIR_ADT'].split(",")))
+    return inputs
+
+"""
+This function allows to determine the singularity binding parameters.
+"""
+def grp_add_adt_params_sing(wildcards):
+    """
+    Build the singularity "-B" options of the rule: the pipeline folder, the
+    folder of the input .rda (output folder too) and the parent folder of
+    each ADT kallisto entry. Each host folder is bound to the normalized
+    path "/WORKDIR/" + host folder.
+    """
+    info = dic_GRP_ADD_ADT_INFO[wildcards.grp_add_adt_output]
+    host_folders = [PIPELINE_FOLDER, os.path.dirname(info['GRP_ADD_ADT_INPUT_RDA'])]
+    host_folders.extend(os.path.dirname(entry) for entry in info['GRP_ADD_ADT_INPUT_DIR_ADT'].split(","))
+    # Deduplicate the folders themselves (order kept): entries sharing a
+    # parent folder would otherwise repeat the same -B option.
+    return "".join(" -B " + folder + ":" + os.path.normpath("/WORKDIR/" + folder) for folder in dict.fromkeys(host_folders))
+
+"""
+This function allows to determine the input alignment folder for params section.
+"""
+def grp_add_adt_params_input_folder(wildcards):
+    """
+    Return the ADT kallisto entries as one comma-separated string of
+    normalized "/WORKDIR/"-prefixed paths, repeated entries being dropped.
+    """
+    raw_entries = dic_GRP_ADD_ADT_INFO[wildcards.grp_add_adt_output]['GRP_ADD_ADT_INPUT_DIR_ADT']
+    container_paths = []
+    for host_folder in dict.fromkeys(raw_entries.split(",")):
+        container_paths.append(os.path.normpath("/WORKDIR/" + host_folder + "/"))
+    return ",".join(container_paths)
+
+"""
+This function allows to determine the output folder for params (os.path.dirname() not allowed in params slot).
+"""
+def grp_add_adt_params_output_folder(wildcards):
+    """
+    Return the folder part of the output prefix, prefixed with "/WORKDIR/",
+    normalized and terminated by a single "/".
+    """
+    host_folder = os.path.dirname(wildcards.grp_add_adt_output)
+    container_folder = os.path.normpath("/WORKDIR/" + host_folder)
+    return container_folder + "/"
+
+"""
+This function allows to determine the sample.name.adt for params.
+"""
+def grp_add_adt_params_sample_name_adt(wildcards):
+    """
+    Return the GRP_ADD_ADT_SAMPLE_NAME_ADT value recorded for this output prefix.
+    """
+    info = dic_GRP_ADD_ADT_INFO[wildcards.grp_add_adt_output]
+    return info['GRP_ADD_ADT_SAMPLE_NAME_ADT']
+
+
+"""
+This rule launches the R script to add adt information to expression gene analysis.
+"""
+rule grp_add_adt_ge:
+    # Runs scripts/Int_Grp_pipeline_ADT.R through "singularity exec" for one {grp_add_adt_output} prefix
+    # and declares <prefix>_ADT.rda as its output.
+    input:
+        # list returned by grp_add_adt_input: the gene expression .rda first, then the ADT kallisto entries
+        grp_add_adt_file = grp_add_adt_input
+    output:
+        grp_add_adt_rda_file = "{grp_add_adt_output}" + "_ADT.rda"
+    params:
+        # every path handed to the script is prefixed with /WORKDIR/, like the bind targets built by grp_add_adt_params_sing
+        sing_bind = grp_add_adt_params_sing,
+        pipeline_folder = os.path.normpath("/WORKDIR/" + PIPELINE_FOLDER),
+        # input[0] is the .rda file (placed first by the input function)
+        input_rda = lambda wildcards, input: os.path.normpath("/WORKDIR/" + input[0]),
+        kallisto_folder = grp_add_adt_params_input_folder,
+        output_folder = grp_add_adt_params_output_folder,
+        sample_name_adt = grp_add_adt_params_sample_name_adt
+    threads:
+        1
+    resources:
+        # both values scale with the attempt number: 5120 MB + 3072 MB per attempt (capped at 20480 MB),
+        # and 120 min per attempt (capped at 200 min)
+        mem_mb = lambda wildcards, attempt: min(5120 + attempt * 3072, 20480),
+        time_min = lambda wildcards, attempt: min(attempt * 120, 200)
+    shell:
+        """
+        singularity exec --no-home {params.sing_bind} \
+        {SINGULARITY_ENV} \
+        Rscript {params.pipeline_folder}/scripts/Int_Grp_pipeline_ADT.R \
+        --samples.name.adt {params.sample_name_adt} \
+        --input.rda.ge {params.input_rda} \
+        --output.dir {params.output_folder} \
+        --input.dirs.adt {params.kallisto_folder} \
+        --author.name {GRP_ADD_ADT_AUTHOR_NAME} \
+        --author.mail {GRP_ADD_ADT_AUTHOR_MAIL} \
+        --nthreads {threads} \
+        --pipeline.path {params.pipeline_folder} \
+        --gene.names  {GRP_ADD_ADT_GENE_NAMES} \
+        --ADT.min.cutoff {GRP_ADD_ADT_MIN_CUTOFF} \
+        --ADT.max.cutoff {GRP_ADD_ADT_MAX_CUTOFF}
+        """
diff --git a/rules/Grp_Adding_BCR.smk b/rules/Grp_Adding_BCR.smk
@@ -0,0 +1,71 @@
+"""
+##########################################################################
+This rule adds BCR information to the gene expression analysis in grouped single-cell RNA-seq.
+##########################################################################
+"""
+# Constrain the {grp_add_bcr_output} wildcard to the entries of GRP_ADD_BCR_OUTPUT, joined with "|" into a single alternation.
+# NOTE(review): the entries are inserted unescaped - confirm none of them contains regex metacharacters.
+wildcard_constraints:
+    grp_add_bcr_output = "|".join(GRP_ADD_BCR_OUTPUT)
+
+"""
+This function allows to determine the input .rda file and csv file from cellranger vdj.
+"""
+def grp_add_bcr_input(wildcards):
+    """
+    Return the rule inputs as a list: the .rda file first, then the BCR csv
+    files (stored as one comma-separated string) without repeated entries.
+    """
+    info = dic_GRP_ADD_BCR_INFO[wildcards.grp_add_bcr_output]
+    inputs = [info['GRP_ADD_BCR_INPUT_RDA']]
+    # dict.fromkeys() keeps the first occurrence of each path, in order
+    inputs.extend(dict.fromkeys(info['GRP_ADD_BCR_INPUT_CSV_BCR'].split(",")))
+    return inputs
+
+"""
+This function allows to determine the singularity binding parameters.
+"""
+def grp_add_bcr_params_sing(wildcards):
+    """
+    Build the singularity "-B" options of the rule: the pipeline folder, the
+    folder of the input .rda (output folder too) and the folder of each BCR
+    csv file. Each host folder is bound to the normalized path
+    "/WORKDIR/" + host folder.
+    """
+    info = dic_GRP_ADD_BCR_INFO[wildcards.grp_add_bcr_output]
+    host_folders = [PIPELINE_FOLDER, os.path.dirname(info['GRP_ADD_BCR_INPUT_RDA'])]
+    host_folders.extend(os.path.dirname(csv_path) for csv_path in info['GRP_ADD_BCR_INPUT_CSV_BCR'].split(","))
+    # Deduplicate the folders themselves (order kept): csv files stored in
+    # the same folder would otherwise repeat the same -B option.
+    return "".join(" -B " + folder + ":" + os.path.normpath("/WORKDIR/" + folder) for folder in dict.fromkeys(host_folders))
+
+"""
+This function allows to determine the bcr files folders for params.
+"""
+def grp_add_bcr_params_bcr_files(wildcards):
+    """
+    Return the BCR csv files as one comma-separated string of normalized
+    "/WORKDIR/"-prefixed paths, repeated entries being dropped.
+    """
+    raw_entries = dic_GRP_ADD_BCR_INFO[wildcards.grp_add_bcr_output]['GRP_ADD_BCR_INPUT_CSV_BCR']
+    container_paths = []
+    for host_csv in dict.fromkeys(raw_entries.split(",")):
+        container_paths.append(os.path.normpath("/WORKDIR/" + host_csv))
+    return ",".join(container_paths)
+
+"""
+This function allows to determine the output folder for params (os.path.dirname() not allowed in params slot).
+"""
+def grp_add_bcr_params_output_folder(wildcards):
+    """
+    Return the folder part of the output prefix, prefixed with "/WORKDIR/",
+    normalized and terminated by a single "/".
+    """
+    host_folder = os.path.dirname(wildcards.grp_add_bcr_output)
+    container_folder = os.path.normpath("/WORKDIR/" + host_folder)
+    return container_folder + "/"
+
+"""
+This rule launches the R script to add BCR information to the gene expression analysis.
+"""
+rule grp_add_bcr_ge:
+    # Runs scripts/Int_Grp_pipeline_BCR.R through "singularity exec" for one {grp_add_bcr_output} prefix
+    # and declares <prefix>_BCR.rda as its output.
+    input:
+        # list returned by grp_add_bcr_input: the .rda first, then the BCR csv files
+        grp_add_bcr_file = grp_add_bcr_input
+    output:
+        grp_add_bcr_rda_file = "{grp_add_bcr_output}" + "_BCR.rda"
+    params:
+        # every path handed to the script is prefixed with /WORKDIR/, like the bind targets built by grp_add_bcr_params_sing
+        sing_bind = grp_add_bcr_params_sing,
+        pipeline_folder = os.path.normpath("/WORKDIR/" + PIPELINE_FOLDER),
+        # input[0] is the .rda file (placed first by the input function)
+        input_rda = lambda wildcards, input: os.path.normpath("/WORKDIR/" + input[0]),
+        input_csv = grp_add_bcr_params_bcr_files,
+        output_folder = grp_add_bcr_params_output_folder
+    threads:
+        1
+    resources:
+        # both values scale with the attempt number: 5120 MB + 3072 MB per attempt (capped at 20480 MB),
+        # and 120 min per attempt (capped at 200 min)
+        mem_mb = lambda wildcards, attempt: min(5120 + attempt * 3072, 20480),
+        time_min = lambda wildcards, attempt: min(attempt * 120, 200)
+    shell:
+        """
+        singularity exec --no-home {params.sing_bind} \
+        {SINGULARITY_ENV_TCR_BCR} \
+        Rscript {params.pipeline_folder}/scripts/Int_Grp_pipeline_BCR.R \
+        --input.rda {params.input_rda} \
+        --output.dir {params.output_folder} \
+        --vdj.input.files.bcr {params.input_csv} \
+        --author.name {GRP_ADD_BCR_AUTHOR_NAME} \
+        --author.mail {GRP_ADD_BCR_AUTHOR_MAIL} \
+        --pipeline.path {params.pipeline_folder}
+        """
diff --git a/rules/Grp_Adding_TCR.smk b/rules/Grp_Adding_TCR.smk
@@ -0,0 +1,71 @@
+"""
+##########################################################################
+This rule adds TCR information to the gene expression analysis in grouped single-cell RNA-seq.
+##########################################################################
+"""
+# Constrain the {grp_add_tcr_output} wildcard to the entries of GRP_ADD_TCR_OUTPUT, joined with "|" into a single alternation.
+# NOTE(review): the entries are inserted unescaped - confirm none of them contains regex metacharacters.
+wildcard_constraints:
+    grp_add_tcr_output = "|".join(GRP_ADD_TCR_OUTPUT)
+
+"""
+This function allows to determine the input .rda file and csv file from cellranger vdj.
+"""
+def grp_add_tcr_input(wildcards):
+    """
+    Return the rule inputs as a list: the .rda file first, then the TCR csv
+    files (stored as one comma-separated string) without repeated entries.
+    """
+    info = dic_GRP_ADD_TCR_INFO[wildcards.grp_add_tcr_output]
+    inputs = [info['GRP_ADD_TCR_INPUT_RDA']]
+    # dict.fromkeys() keeps the first occurrence of each path, in order
+    inputs.extend(dict.fromkeys(info['GRP_ADD_TCR_INPUT_CSV_TCR'].split(",")))
+    return inputs
+
+"""
+This function allows to determine the singularity binding parameters.
+"""
+def grp_add_tcr_params_sing(wildcards):
+    """
+    Build the singularity "-B" options of the rule: the pipeline folder, the
+    folder of the input .rda (output folder too) and the folder of each TCR
+    csv file. Each host folder is bound to the normalized path
+    "/WORKDIR/" + host folder.
+    """
+    info = dic_GRP_ADD_TCR_INFO[wildcards.grp_add_tcr_output]
+    host_folders = [PIPELINE_FOLDER, os.path.dirname(info['GRP_ADD_TCR_INPUT_RDA'])]
+    host_folders.extend(os.path.dirname(csv_path) for csv_path in info['GRP_ADD_TCR_INPUT_CSV_TCR'].split(","))
+    # Deduplicate the folders themselves (order kept): csv files stored in
+    # the same folder would otherwise repeat the same -B option.
+    return "".join(" -B " + folder + ":" + os.path.normpath("/WORKDIR/" + folder) for folder in dict.fromkeys(host_folders))
+
+"""
+This function allows to determine the tcr files folders for params.
+"""
+def grp_add_tcr_params_tcr_files(wildcards):
+    """
+    Return the TCR csv files as one comma-separated string of normalized
+    "/WORKDIR/"-prefixed paths, repeated entries being dropped.
+    """
+    raw_entries = dic_GRP_ADD_TCR_INFO[wildcards.grp_add_tcr_output]['GRP_ADD_TCR_INPUT_CSV_TCR']
+    container_paths = []
+    for host_csv in dict.fromkeys(raw_entries.split(",")):
+        container_paths.append(os.path.normpath("/WORKDIR/" + host_csv))
+    return ",".join(container_paths)
+
+"""
+This function allows to determine the output folder for params (os.path.dirname() not allowed in params slot).
+"""
+def grp_add_tcr_params_output_folder(wildcards):
+    """
+    Return the folder part of the output prefix, prefixed with "/WORKDIR/",
+    normalized and terminated by a single "/".
+    """
+    host_folder = os.path.dirname(wildcards.grp_add_tcr_output)
+    container_folder = os.path.normpath("/WORKDIR/" + host_folder)
+    return container_folder + "/"
+
+"""
+This rule launches the R script to add TCR information to the gene expression analysis.
+"""
+rule grp_add_tcr_ge:
+    # Runs scripts/Int_Grp_pipeline_TCR.R through "singularity exec" for one {grp_add_tcr_output} prefix
+    # and declares <prefix>_TCR.rda as its output.
+    input:
+        # list returned by grp_add_tcr_input: the .rda first, then the TCR csv files
+        grp_add_tcr_file = grp_add_tcr_input
+    output:
+        grp_add_tcr_rda_file = "{grp_add_tcr_output}" + "_TCR.rda"
+    params:
+        # every path handed to the script is prefixed with /WORKDIR/, like the bind targets built by grp_add_tcr_params_sing
+        sing_bind = grp_add_tcr_params_sing,
+        pipeline_folder = os.path.normpath("/WORKDIR/" + PIPELINE_FOLDER),
+        # input[0] is the .rda file (placed first by the input function)
+        input_rda = lambda wildcards, input: os.path.normpath("/WORKDIR/" + input[0]),
+        input_csv = grp_add_tcr_params_tcr_files,
+        output_folder = grp_add_tcr_params_output_folder
+    threads:
+        1
+    resources:
+        # both values scale with the attempt number: 5120 MB + 3072 MB per attempt (capped at 20480 MB),
+        # and 120 min per attempt (capped at 200 min)
+        mem_mb = lambda wildcards, attempt: min(5120 + attempt * 3072, 20480),
+        time_min = lambda wildcards, attempt: min(attempt * 120, 200)
+    shell:
+        """
+        singularity exec --no-home {params.sing_bind} \
+        {SINGULARITY_ENV_TCR_BCR} \
+        Rscript {params.pipeline_folder}/scripts/Int_Grp_pipeline_TCR.R \
+        --input.rda {params.input_rda} \
+        --output.dir {params.output_folder} \
+        --vdj.input.files.tcr {params.input_csv} \
+        --author.name {GRP_ADD_TCR_AUTHOR_NAME} \
+        --author.mail {GRP_ADD_TCR_AUTHOR_MAIL} \
+        --pipeline.path {params.pipeline_folder}
+        """
diff --git a/rules/Int_Adding_ADT.smk b/rules/Int_Adding_ADT.smk
@@ -0,0 +1,84 @@
+"""
+##########################################################################
+This rule adds ADT information to the gene expression analysis in integrated single-cell RNA-seq.
+##########################################################################
+"""
+# Constrain the {int_add_adt_output} wildcard to the entries of INT_ADD_ADT_OUTPUT, joined with "|" into a single alternation.
+# NOTE(review): the entries are inserted unescaped - confirm none of them contains regex metacharacters.
+wildcard_constraints:
+    int_add_adt_output = "|".join(INT_ADD_ADT_OUTPUT)
+
+"""
+This function allows to determine the input .rda ge file and kallisto adt folder.
+"""
+def int_add_adt_input(wildcards):
+    """
+    Return the rule inputs as a list: the gene expression .rda file first,
+    then the ADT kallisto entries (stored as one comma-separated string)
+    without repeated entries.
+    """
+    info = dic_INT_ADD_ADT_INFO[wildcards.int_add_adt_output]
+    inputs = [info['INT_ADD_ADT_INPUT_RDA']]
+    # dict.fromkeys() keeps the first occurrence of each entry, in order
+    inputs.extend(dict.fromkeys(info['INT_ADD_ADT_INPUT_DIR_ADT'].split(",")))
+    return inputs
+
+"""
+This function allows to determine the singularity binding parameters.
+"""
+def int_add_adt_params_sing(wildcards):
+    """
+    Build the singularity "-B" options of the rule: the pipeline folder, the
+    folder of the input .rda (output folder too) and the parent folder of
+    each ADT kallisto entry. Each host folder is bound to the normalized
+    path "/WORKDIR/" + host folder.
+    """
+    info = dic_INT_ADD_ADT_INFO[wildcards.int_add_adt_output]
+    host_folders = [PIPELINE_FOLDER, os.path.dirname(info['INT_ADD_ADT_INPUT_RDA'])]
+    host_folders.extend(os.path.dirname(entry) for entry in info['INT_ADD_ADT_INPUT_DIR_ADT'].split(","))
+    # Deduplicate the folders themselves (order kept): entries sharing a
+    # parent folder would otherwise repeat the same -B option.
+    return "".join(" -B " + folder + ":" + os.path.normpath("/WORKDIR/" + folder) for folder in dict.fromkeys(host_folders))
+
+"""
+This function allows to determine the input alignment folder for params section.
+"""
+def int_add_adt_params_input_folder(wildcards):
+    """
+    Return the ADT kallisto entries as one comma-separated string of
+    normalized "/WORKDIR/"-prefixed paths, repeated entries being dropped.
+    """
+    raw_entries = dic_INT_ADD_ADT_INFO[wildcards.int_add_adt_output]['INT_ADD_ADT_INPUT_DIR_ADT']
+    container_paths = []
+    for host_folder in dict.fromkeys(raw_entries.split(",")):
+        container_paths.append(os.path.normpath("/WORKDIR/" + host_folder + "/"))
+    return ",".join(container_paths)
+
+"""
+This function allows to determine the output folder for params (os.path.dirname() not allowed in params slot).
+"""
+def int_add_adt_params_output_folder(wildcards):
+    """
+    Return the folder part of the output prefix, prefixed with "/WORKDIR/",
+    normalized and terminated by a single "/".
+    """
+    host_folder = os.path.dirname(wildcards.int_add_adt_output)
+    container_folder = os.path.normpath("/WORKDIR/" + host_folder)
+    return container_folder + "/"
+
+"""
+This function allows to determine the sample.name.adt for params.
+"""
+def int_add_adt_params_sample_name_adt(wildcards):
+    """
+    Return the INT_ADD_ADT_SAMPLE_NAME_ADT value recorded for this output prefix.
+    """
+    info = dic_INT_ADD_ADT_INFO[wildcards.int_add_adt_output]
+    return info['INT_ADD_ADT_SAMPLE_NAME_ADT']
+
+
+"""
+This rule launches the R script to add adt information to expression gene analysis.
+"""
+rule int_add_adt_ge:
+    # Runs scripts/Int_Grp_pipeline_ADT.R through "singularity exec" for one {int_add_adt_output} prefix
+    # and declares <prefix>_ADT.rda as its output.
+    input:
+        # list returned by int_add_adt_input: the gene expression .rda first, then the ADT kallisto entries
+        int_add_adt_file = int_add_adt_input
+    output:
+        int_add_adt_rda_file = "{int_add_adt_output}" + "_ADT.rda"
+    params:
+        # every path handed to the script is prefixed with /WORKDIR/, like the bind targets built by int_add_adt_params_sing
+        sing_bind = int_add_adt_params_sing,
+        pipeline_folder = os.path.normpath("/WORKDIR/" + PIPELINE_FOLDER),
+        # input[0] is the .rda file (placed first by the input function)
+        input_rda = lambda wildcards, input: os.path.normpath("/WORKDIR/" + input[0]),
+        kallisto_folder = int_add_adt_params_input_folder,
+        output_folder = int_add_adt_params_output_folder,
+        sample_name_adt = int_add_adt_params_sample_name_adt
+    threads:
+        1
+    resources:
+        # both values scale with the attempt number: 5120 MB + 3072 MB per attempt (capped at 20480 MB),
+        # and 120 min per attempt (capped at 200 min)
+        mem_mb = lambda wildcards, attempt: min(5120 + attempt * 3072, 20480),
+        time_min = lambda wildcards, attempt: min(attempt * 120, 200)
+    shell:
+        """
+        singularity exec --no-home {params.sing_bind} \
+        {SINGULARITY_ENV} \
+        Rscript {params.pipeline_folder}/scripts/Int_Grp_pipeline_ADT.R \
+        --samples.name.adt {params.sample_name_adt} \
+        --input.rda.ge {params.input_rda} \
+        --output.dir {params.output_folder} \
+        --input.dirs.adt {params.kallisto_folder} \
+        --author.name {INT_ADD_ADT_AUTHOR_NAME} \
+        --author.mail {INT_ADD_ADT_AUTHOR_MAIL} \
+        --nthreads {threads} \
+        --pipeline.path {params.pipeline_folder} \
+        --gene.names  {INT_ADD_ADT_GENE_NAMES} \
+        --ADT.min.cutoff {INT_ADD_ADT_MIN_CUTOFF} \
+        --ADT.max.cutoff {INT_ADD_ADT_MAX_CUTOFF}
+        """
diff --git a/rules/Int_Adding_BCR.smk b/rules/Int_Adding_BCR.smk
@@ -0,0 +1,71 @@
+"""
+##########################################################################
+This rule adds BCR information to the gene expression analysis in integrated single-cell RNA-seq.
+##########################################################################
+"""
+# Constrain the {int_add_bcr_output} wildcard to the entries of INT_ADD_BCR_OUTPUT, joined with "|" into a single alternation.
+# NOTE(review): the entries are inserted unescaped - confirm none of them contains regex metacharacters.
+wildcard_constraints:
+    int_add_bcr_output = "|".join(INT_ADD_BCR_OUTPUT)
+
+"""
+This function allows to determine the input .rda file and csv file from cellranger vdj.
+"""
+def int_add_bcr_input(wildcards):
+    """
+    Return the rule inputs as a list: the .rda file first, then the BCR csv
+    files (stored as one comma-separated string) without repeated entries.
+    """
+    info = dic_INT_ADD_BCR_INFO[wildcards.int_add_bcr_output]
+    inputs = [info['INT_ADD_BCR_INPUT_RDA']]
+    # dict.fromkeys() keeps the first occurrence of each path, in order
+    inputs.extend(dict.fromkeys(info['INT_ADD_BCR_INPUT_CSV_BCR'].split(",")))
+    return inputs
+
+"""
+This function allows to determine the singularity binding parameters.
+"""
+def int_add_bcr_params_sing(wildcards):
+    """
+    Build the singularity "-B" options of the rule: the pipeline folder, the
+    folder of the input .rda (output folder too) and the folder of each BCR
+    csv file. Each host folder is bound to the normalized path
+    "/WORKDIR/" + host folder.
+    """
+    info = dic_INT_ADD_BCR_INFO[wildcards.int_add_bcr_output]
+    host_folders = [PIPELINE_FOLDER, os.path.dirname(info['INT_ADD_BCR_INPUT_RDA'])]
+    host_folders.extend(os.path.dirname(csv_path) for csv_path in info['INT_ADD_BCR_INPUT_CSV_BCR'].split(","))
+    # Deduplicate the folders themselves (order kept): csv files stored in
+    # the same folder would otherwise repeat the same -B option.
+    return "".join(" -B " + folder + ":" + os.path.normpath("/WORKDIR/" + folder) for folder in dict.fromkeys(host_folders))
+
+"""
+This function allows to determine the bcr files folders for params.
+"""
+def int_add_bcr_params_bcr_files(wildcards):
+    """
+    Return the BCR csv files as one comma-separated string of normalized
+    "/WORKDIR/"-prefixed paths, repeated entries being dropped.
+    """
+    raw_entries = dic_INT_ADD_BCR_INFO[wildcards.int_add_bcr_output]['INT_ADD_BCR_INPUT_CSV_BCR']
+    container_paths = []
+    for host_csv in dict.fromkeys(raw_entries.split(",")):
+        container_paths.append(os.path.normpath("/WORKDIR/" + host_csv))
+    return ",".join(container_paths)
+
+"""
+This function allows to determine the output folder for params (os.path.dirname() not allowed in params slot).
+"""
+def int_add_bcr_params_output_folder(wildcards):
+    """
+    Return the folder part of the output prefix, prefixed with "/WORKDIR/",
+    normalized and terminated by a single "/".
+    """
+    host_folder = os.path.dirname(wildcards.int_add_bcr_output)
+    container_folder = os.path.normpath("/WORKDIR/" + host_folder)
+    return container_folder + "/"
+
+"""
+This rule launches the R script to add BCR information to the gene expression analysis.
+"""
+rule int_add_bcr_ge:
+    # Runs scripts/Int_Grp_pipeline_BCR.R through "singularity exec" for one {int_add_bcr_output} prefix
+    # and declares <prefix>_BCR.rda as its output.
+    input:
+        # list returned by int_add_bcr_input: the .rda first, then the BCR csv files
+        int_add_bcr_file = int_add_bcr_input
+    output:
+        int_add_bcr_rda_file = "{int_add_bcr_output}" + "_BCR.rda"
+    params:
+        # every path handed to the script is prefixed with /WORKDIR/, like the bind targets built by int_add_bcr_params_sing
+        sing_bind = int_add_bcr_params_sing,
+        pipeline_folder = os.path.normpath("/WORKDIR/" + PIPELINE_FOLDER),
+        # input[0] is the .rda file (placed first by the input function)
+        input_rda = lambda wildcards, input: os.path.normpath("/WORKDIR/" + input[0]),
+        input_csv = int_add_bcr_params_bcr_files,
+        output_folder = int_add_bcr_params_output_folder
+    threads:
+        1
+    resources:
+        # both values scale with the attempt number: 5120 MB + 3072 MB per attempt (capped at 20480 MB),
+        # and 120 min per attempt (capped at 200 min)
+        mem_mb = lambda wildcards, attempt: min(5120 + attempt * 3072, 20480),
+        time_min = lambda wildcards, attempt: min(attempt * 120, 200)
+    shell:
+        """
+        singularity exec --no-home {params.sing_bind} \
+        {SINGULARITY_ENV_TCR_BCR} \
+        Rscript {params.pipeline_folder}/scripts/Int_Grp_pipeline_BCR.R \
+        --input.rda {params.input_rda} \
+        --output.dir {params.output_folder} \
+        --vdj.input.files.bcr {params.input_csv} \
+        --author.name {INT_ADD_BCR_AUTHOR_NAME} \
+        --author.mail {INT_ADD_BCR_AUTHOR_MAIL} \
+        --pipeline.path {params.pipeline_folder}
+        """