Merge pull request #166 from sangeetashukla/deseq_cavatica_publishing

Enable using module to publish cavatica app
d3b-center · Apr 20, 2022 · c0e5692 · c0e5692
2 parents 9a1f438 + 1b39fbf
commit c0e5692
Show file tree

Hide file tree

Showing 5 changed files with 157 additions and 28 deletions.
diff --git a/analyses/tumor-normal-differential-expression/README.md b/analyses/tumor-normal-differential-expression/README.md
@@ -72,4 +72,7 @@ docker run --volume $PWD:/analysis deseq2_cavatica bash -c "cd /analysis && Rscr
 Note: In the above command, `--HIST_i` and `--GTEX_i` are initialized to 1 because the process is CPU-heavy and is not recommended to run on non-HPC servers. Using such small index values is meant only for testing purposes.
 
 ## CAVATICA
-This module is also wrapped into a CAVATICA application that is made public. The application can be found [here](https://cavatica.sbgenomics.com/u/shuklas1/tumor-normal-differential-expression/apps/#shuklas1/tumor-normal-differential-expression/tumor-normal-differential-expression) . The data files required for running the application are also publicly available [here](https://cavatica.sbgenomics.com/u/cavatica/opentarget). 
+Running this module in full requires an HPC environment. To enable use without an HPC environment, the module is also wrapped into a [CAVATICA](https://d3b.center/our-research/cavatica/) application. To run on CAVATICA, the user must download the scripts and publish an application. The following command can be used to publish an application on CAVATICA:
+`sbpack cavatica user/projectname/workflowname workflows/run_deseq_analysis_wf.cwl`
+Refer to this link for instructions on setting up [sbpack](https://docs.cavatica.org/docs/maintaining-and-versioning-cwl-on-external-tool-repositories).
+The data files required for running the application are also publicly available [here](https://cavatica.sbgenomics.com/u/cavatica/opentarget). 
diff --git a/analyses/tumor-normal-differential-expression/run-generate-Hist-GTEx-indices-file.sh b/analyses/tumor-normal-differential-expression/run-generate-Hist-GTEx-indices-file.sh
@@ -17,7 +17,7 @@ cd "$script_directory" || exit
 #This script creates a subset of the histologies.tsv, to use for testing the deseq module.
 
 module load R/4.1.0
-Rscript --vanilla run-Generate_Hist_GTEx_indices_file.R \
+Rscript --vanilla run-generate_Hist_GTEx_indices_file.R \
         --hist_file ../../data/histologies.tsv \
         --counts_file ../../data/gene-counts-rsem-expected_count-collapsed.rds \
         --outdir Input_Data \

diff --git a/analyses/tumor-normal-differential-expression/workflows/cavatica_app_pub.py b/analyses/tumor-normal-differential-expression/workflows/cavatica_app_pub.py
@@ -0,0 +1,88 @@
+import argparse
+from ruamel import yaml
+from ruamel.yaml.scalarstring import PreservedScalarString as pss
+import sys
+import pdb
+
+
# Command-line options for the CAVATICA app publishing helper. Only the input
# CWL is mandatory; every other option is applied only when it is supplied.
parser = argparse.ArgumentParser()
parser.add_argument(
    "-i", "--input-cwl",
    dest="cwl",
    required=True,
    help="Input cwl file",
)
parser.add_argument(
    "-r", "--readme",
    dest="readme",
    help="Readme file to insert into workflow/tool doc, if applicable",
)
parser.add_argument(
    "-n", "--id-name",
    dest="id_name",
    help="Short app ID link name to use, i.e. kfdrc-align-wf",
)
parser.add_argument(
    "-l", "--label",
    dest="label",
    help="User-friendly label to add to tool/workflow cwl, if needed",
)
parser.add_argument(
    "-t", "--tags",
    dest="tags",
    help="Seven bridges tags file, as csv string, ex RNASEQ,FUSION",
)
parser.add_argument(
    "-f", "--files",
    dest="files",
    help="Cavatica-style tsv manifest with file ID, file name, associated cwl input key, and, optionally, secondaryFile names and IDs",
)
parser.add_argument(
    "-p", "--publisher",
    dest="pub",
    default="KFDRC",
    help="Publisher name",
)
# Parsed once at import time; the rest of the script reads options from `args`.
args = parser.parse_args()
+
+
def update_file_paths(manifest, yaml_obj):
    """Attach suggested input files from a manifest to a loaded CWL document.

    The manifest is a tab-separated file. Its first line is a header and is
    skipped; each following line holds up to five columns: file ID, file
    name, CWL input key, comma-separated secondaryFile IDs and
    comma-separated secondaryFile names. Missing trailing columns are
    treated as absent, and blank lines are ignored.

    For every input key found in the manifest,
    ``yaml_obj['inputs'][key]['sbg:suggestedValue']`` is set to a single
    File mapping when the key has one file, or to a list of File mappings
    when it has several.

    Args:
        manifest: Path of the manifest file to read.
        yaml_obj: Mapping for the loaded CWL; modified in place. It must
            already contain ``yaml_obj['inputs'][key]`` for every key named
            in the manifest.

    Returns:
        None.

    Raises:
        AssertionError: If a row lists a different number of secondaryFile
            IDs and secondaryFile names.
        KeyError: If a manifest key is not present in ``yaml_obj['inputs']``.
    """
    in_dict = {}
    with open(manifest) as mf:
        # Skip the header (fid, name, key, sfids, snames); an empty manifest
        # simply yields no entries instead of raising StopIteration.
        next(mf, None)
        for entry in mf:
            if not entry.strip():
                # Tolerate blank lines, e.g. a trailing newline at end of file.
                continue
            fields = entry.rstrip('\n').split('\t')
            # Pad short rows so the optional secondaryFile columns read as None.
            fields += [None] * (5 - len(fields))
            fid, name, key, sfids, snames = fields
            dat = {"class": "File", "path": fid, "name": name}
            if sfids not in (None, "None"):
                sfid_list = sfids.split(',')
                sname_list = [] if snames is None else snames.split(',')
                if len(sfid_list) != len(sname_list):
                    # Raised explicitly (not via `assert`) so the check also
                    # runs under `python -O`; the exception type is unchanged.
                    raise AssertionError(
                        "Lengths of secondaryFile IDs and names MUST BE equal"
                    )
                dat["secondaryFiles"] = [
                    {"class": "File", "path": sfid, "name": sname}
                    for sfid, sname in zip(sfid_list, sname_list)
                ]
            in_dict.setdefault(key, []).append(dat)
    # Decide File vs File[] once per key, after all rows have been collected.
    for key, files in in_dict.items():
        suggested = files[0] if len(files) == 1 else files
        yaml_obj['inputs'][key]["sbg:suggestedValue"] = suggested
+
# Round-trip loading preserves the order and formatting of keys and values.
# NOTE(review): yaml.load / yaml.dump with RoundTripLoader / RoundTripDumper
# is the legacy ruamel.yaml function API; confirm the installed ruamel.yaml
# version still provides it.
with open(args.cwl) as cwl_file:
    data = yaml.load(cwl_file, yaml.RoundTripLoader, preserve_quotes=True)
if args.files:
    update_file_paths(args.files, data)
# Fill in license and publisher only when the CWL does not already set them.
if 'sbg:license' not in data:
    data['sbg:license'] = "Apache License 2.0"
if 'sbg:publisher' not in data:
    data['sbg:publisher'] = args.pub
if args.tags:
    data['sbg:categories'] = sorted(args.tags.split(','))

# Apply the id before label/doc: both of those are positioned relative to the
# 'id' key, so a CWL that lacks 'id' must receive the supplied one first or
# the index('id') lookups below fail.
if args.id_name:
    if 'id' not in data:
        key_list = list(data.keys())
        data.insert(key_list.index('class') + 1, 'id', args.id_name)
    else:
        data['id'] = args.id_name

if args.label:
    key_list = list(data.keys())
    data.insert(key_list.index('id') + 1, 'label', args.label)
    # Kept from the original: assign as well, so the new value is certain to
    # be stored even when a 'label' key already existed before the insert.
    data['label'] = args.label

if args.readme:
    with open(args.readme) as readme_file:
        rm_str = readme_file.read()
    # Wrap the README in ruamel's PreservedScalarString so it is dumped as a
    # multi-line block rather than one long quoted string
    # (presumably a literal block scalar; verify against the ruamel.yaml docs).
    pss_rm_str = pss(rm_str)
    if 'doc' not in data:
        key_list = list(data.keys())
        # Prefer placing doc right after label; fall back to right after id.
        try:
            anchor = key_list.index('label')
        except ValueError as e:
            sys.stderr.write(str(e) + "\nFailed to add doc field after label field, trying after id\n")
            anchor = key_list.index('id')
        data.insert(anchor + 1, 'doc', pss_rm_str)
    else:
        data['doc'] = pss_rm_str

# Emit the updated CWL on stdout; callers redirect it to a file as needed.
yaml.dump(data, sys.stdout, Dumper=yaml.RoundTripDumper, default_flow_style=False)
diff --git a/analyses/tumor-normal-differential-expression/workflows/cavatica_app_readme.txt b/analyses/tumor-normal-differential-expression/workflows/cavatica_app_readme.txt
@@ -0,0 +1,20 @@
+#Introduction
+
+The DESeq application takes gene expression count data from pediatric cancer cohorts and normal tissue from GTEx to determine the differentially expressed genes between all pediatric tumors and normal tissue. This data will be used in the Open Targets platform to help investigators identify potential therapeutic targets in various childhood cancers.
+
+
+
+#Method
+
+This app uses R for data manipulation and differential expression analysis using packages DESeq2, jsonlite, among others. 
+The data set includes cancers with at least 3 cases, and DESeq analysis was performed on every combination of tumor and normal tissue. Data files for the analysis are available in the [OpenTarget Open Access](https://cavatica.sbgenomics.com/u/cavatica/opentarget "OpenTarget Open Access") (now Molecular Targets) project. Users can also choose to upload their own files in a similar format.
+
+
+
+
+#Result
+
+The app delivers analysis output for each cancer cohort and normal tissue pair, with the gene symbol, logfc, pval, EFO ID, MONDO ID, mean TPM values in jsonl, json, and tsv formats.
+
+#Contact
+For additional information or questions, contact [Alvin Farrel](mailto:farrela@chop.edu) or [Sangeeta Shukla](mailto:shuklas1@chop.edu).
diff --git a/analyses/tumor-normal-differential-expression/workflows/run_deseq_analysis_wf.cwl b/analyses/tumor-normal-differential-expression/workflows/run_deseq_analysis_wf.cwl
@@ -1,20 +1,38 @@
 cwlVersion: v1.2
 class: Workflow
-id: run_deseq2_analysis
-label: Run DESeq2 Analysis comparing samples in cancer histology groups to GTEX
-doc: |-
-  # Run DESeq2 Analysis comparing samples in cancer histology groups to GTEX
+id: PMTL DESeq
+label: Differential Expression Analysis
+doc: |
+  #Introduction
 
-requirements:
-  - class: ScatterFeatureRequirement
-  - class: MultipleInputFeatureRequirement
-  - class: StepInputExpressionRequirement
-  - class: InlineJavascriptRequirement
+  The DESeq application takes gene expression count data from pediatric cancer cohorts and normal tissue from GTEx to determine the differentially expressed genes between all pediatric tumors and normal tissue. This data will be used in the Open Targets platform to help investigators identify potential therapeutic targets in various childhood cancers.
+
+
+
+  #Method
+
+  This app uses R for data manipulation and differential expression analysis using packages DESeq2, jsonlite, among others. 
+  The data set includes cancers with at least 3 cases, and DESeq analysis was performed on every combination of tumor and normal tissue. Data files for the analysis are available in the [OpenTarget Open Access](https://cavatica.sbgenomics.com/u/cavatica/opentarget "OpenTarget Open Access") (now Molecular Targets) project. Users can also choose to upload their own files in a similar format.
+
+
+
+
+  #Result
+
+  The app delivers analysis output for each cancer cohort and normal tissue pair, with the gene symbol, logfc, pval, EFO ID, MONDO ID, mean TPM values in jsonl, json, and tsv formats.
 
+  #Contact
+  For additional information or questions, contact [Alvin Farrel](mailto:farrela@chop.edu) or [Sangeeta Shukla](mailto:shuklas1@chop.edu).
+requirements:
+- class: ScatterFeatureRequirement
+- class: MultipleInputFeatureRequirement
+- class: StepInputExpressionRequirement
+- class: InlineJavascriptRequirement
 inputs:
   output_basename: {type: string, doc: "Output basename for workflow output files"}
   gene_count_file: {type: File, doc: "RSEM gene counts rds file"}
-  histology_file: {type: File, doc: "Histology file, should be the base histology file"}
+  histology_file: {type: File, doc: "Histology file, should be the base histology\
+      \ file"}
   tpm_file: {type: File, doc: "TPM counts rds file"}
   hugo_file: {type: File, doc: "ENSG Hugo codes tsv file"}
   mondo_file: {type: File, doc: "MONDO and EFO codes tsv file"}
@@ -23,14 +41,15 @@ inputs:
   ind_eachcohort: {type: File, doc: "Independent specimens for each cohort file"}
   ram: {type: 'int?', default: 32, doc: "In GB"}
   cpus: {type: 'int?', default: 4, doc: "Number of CPUs to request"}
-  hist_max_index_test: {type: 'int?', doc: "Maximum number of histology groups to use for testing, this overrides the number of histology groups from the subsetting tool."}
-  gtex_max_index_test: {type: 'int?', doc: "Maximum number of gtex groups to use for testing, this overrides the number of gtex groups from the subsetting tool."}
-
+  hist_max_index_test: {type: 'int?', doc: "Maximum number of histology groups to\
+      \ use for testing, this overrides the number of histology groups from the subsetting\
+      \ tool."}
+  gtex_max_index_test: {type: 'int?', doc: "Maximum number of gtex groups to use for\
+      \ testing, this overrides the number of gtex groups from the subsetting tool."}
 outputs:
   output_tsv: {type: File, outputSource: combine_output_files/combined_tsv}
   output_jsonl: {type: File, outputSource: combine_output_files/combined_jsonl}
   output_rds: {type: File, outputSource: convert_tsv_to_rds/merged_rds}
-
 steps:
 
   subset_inputs:
@@ -41,21 +60,18 @@ steps:
       ind_allcohorts: ind_allcohorts
       ind_eachcohort: ind_eachcohort
     out: [subsetted_histology, subsetted_count, histology_length_file, gtex_length_file]
-
   build_hist_array:
     run: ../tools/build_index_array.cwl
     in:
       index_max_file: subset_inputs/histology_length_file
       test_maximum: hist_max_index_test
     out: [index_array]
-
   build_gtex_array:
     run: ../tools/build_index_array.cwl
     in:
       index_max_file: subset_inputs/gtex_length_file
       test_maximum: gtex_max_index_test
     out: [index_array]
-
   run_deseq2:
     run: ../tools/run_deseq.cwl
     scatter: [histology_index, gtex_index]
@@ -75,26 +91,28 @@ steps:
       ram: ram
       cpus: cpus
     out: [results_dir]
-
   combine_output_files:
     run: ../tools/combine_output_files.cwl
     in:
       results_dirs: run_deseq2/results_dir
       output_basename: output_basename
-    out:
-      [combined_tsv, combined_jsonl]
-
+    out: [combined_tsv, combined_jsonl]
   convert_tsv_to_rds:
     run: ../tools/convert_tsv_to_rds.cwl
     in:
       combined_tsv: combine_output_files/combined_tsv
       output_basename: output_basename
     out: [merged_rds]
-
 $namespaces:
   sbg: https://sevenbridges.com
 hints:
-  - class: 'sbg:maxNumberOfParallelInstances'
-    value: 80
-  - class: 'sbg:AWSInstanceType'
-    value: r5.24xlarge
+- class: 'sbg:maxNumberOfParallelInstances'
+  value: 80
+- class: 'sbg:AWSInstanceType'
+  value: r5.24xlarge
+sbg:license: Apache License 2.0
+sbg:publisher: PMT
+sbg:categories:
+- DESeq
+- Differential Expression
+- Molecular Targets