d3b-center · jharenza · Apr 30, 2023 · Mar 10, 2023 · Mar 10, 2023 · Mar 10, 2023
@@ -34,3 +34,77 @@ This script is not run via `run-collapse-rnaseq.sh`.
 * `01-summarize_matrices.R` - this script generates the collapsed matrices as described above.
 In addition, this script calculates the average Pearson correlation between the values of the gene symbol that is kept and those duplicates that are discarded.
 * `02-analyze-drops.Rmd` - this is used to display tables from `01-summarize_matrices.R`.
+
+## CWL Tools and scripts
+This contains a cwl workflow wrapper (to run using cwlrunner or on a cwl-enabled platform, like CAVATICA).
+The workflow runs a liftover step and a convert-to-red step.
+The wrapper `.sh` script runs the python script and gzips the outputs in the end.
+The python script does the liftover and collapse with no compression.
+Since it uses base python packages, one can run it in any environment with a standard python3 installation.
+The convert-to-rds step is for downstream compatibility with modules
+```
+cwl
+├── scripts
+│   ├── convert_to_rds.R
+│   ├── liftover_collapse_rnaseq.py
+│   └── liftover_collapse_rnaseq_wrapper.sh
+├── tools
+│   ├── convert_to_rds.cwl
+│   └── liftover_collapse_rnaseq.cwl
+└── workflows
+    └── liftover_collapse_exp_wf.cwl
+```
+
+### analyses/collapse-rnaseq/cwl/workflows/liftover_collapse_exp_wf.cwl
+This workflow wrapper runs all of the tools described below
+
+### analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py
+```
+usage: liftover_collapse_rnaseq.py [-h] [-t TABLE] [-o OUT] [-i GTF] [-g GENE_ID] [-n GENE_NAME] [-s SKIP] [-c]
+
+Script to use a GTF and expression matrix with ENSG to liftover gene symbols.
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -t TABLE, --table TABLE
+                        Input expression matrix. Can be gzipped or plain tsv
+  -o OUT, --output-basename OUT
+                        output basename of liftover and collapse if flag given
+  -i GTF, --input-gtf GTF
+                        GTF file to use as liftover reference. Can be gzipped or plain tsv
+  -g GENE_ID, --gene-id GENE_ID
+                        NAME of gene ID (ENSG values) column
+  -n GENE_NAME, --gene-name GENE_NAME
+                        NAME of gene name column, if present to keep if not found
+  -s SKIP, --skip SKIP  Number of lines to skip if needed
+  -c, --collapse        If set, will collapse on gene_symbol
+```
+Examples:
+`python3 liftover_collapse_rnaseq.py -t TCGA_gene_counts.tsv -o TCGA -i gencode.v39.primary_assembly.gtf -g Gene_id -n Gene_name --collapse 2> errs.log` will generate:
+```
+├── TCGA.collapsed.tsv
+├── TCGA.liftover.tsv
+```
+### analyses/collapse-rnaseq/cwl/tools/ubuntu_gzip.cwl
+gzips the output from liftover step
+
+### analyses/collapse-rnaseq/cwl/scripts/convert_to_rds.R
+```
+Usage: convert_to_rds.R [options]
+
+
+Options:
+	--input-tsv=INPUT-TSV
+		tsv RNA expression matrix
+
+	--gene-col=GENE-COL
+		Name of column with gene symbol
+
+	--output-filename=OUTPUT-FILENAME
+		Output file name, should fit pattern gene-[expression/counts]-rsem-[fpkm/tpm/expected_count]-collapsed.rds
+
+	-h, --help
+		Show this help message and exit
+```
+Example run with collapsed TCGA output:
+`Rscript convert_to_rds.R --gene-col gene_name --input-tsv TCGA_v36_TEST.collapsed.tsv.gz --output-filename fancy_long_filename.rds`
@@ -0,0 +1,32 @@
+if(!require(optparse)){install.packages('optparse')}
+library("optparse")
+
+#process inputs
+option_list <- list(
+  make_option(
+    opt_str = "--input-tsv",
+    type = "character",
+    dest = "input_tsv",
+    help = "tsv RNA expression matrix",
+  ),
+  make_option(
+    opt_str = "--gene-col",
+    type = "character",
+    dest = "gene_col",
+    help = "Name of column with gene symbol"
+  ),
+  make_option(
+    opt_str = "--output-filename",
+    type = "character",
+    dest = "out",
+    help = "Output file name, should fit pattern gene-[expression/counts]-rsem-[fpkm/tpm/expected_count]-collapsed.rds"
+  )
+)
+
+#parse options
+opts <- parse_args(OptionParser(option_list = option_list))
+message("Reading in tsv...")
+tsv_df = read.table(file = opts$input_tsv, sep = '\t', header = TRUE, check.names = FALSE, row.names=opts$gene_col)
+message("Saving as rds...")
+saveRDS(tsv_df, file=opts$out)
+message("Done!")
@@ -0,0 +1,177 @@
+"""
+Script to use a GTF and expression matrix with ENSG to liftover gene symbols
+"""
+
+import argparse
+import gzip
+import re
+import sys
+from statistics import mean
+
+
+def update_gtf_dict(gtf_dict, in_text, tx_type, tx_desc, id_find, name_find):
+    """
+    Update ensg - sym ID dictionary on gene entries only
+    """
+    data = in_text.strip('\n').split('\t')
+    if data[tx_type] == 'gene':
+        try:
+            gene_id = id_find.match(data[tx_desc]).group(1)
+            gene_name = name_find.match(data[tx_desc]).group(1)
+            gtf_dict[gene_id] = gene_name
+        except Exception as e:
+            print(str(e), file=sys.stderr)
+            print("Failed to parse gene ID and name from " + in_text, file=sys.stderr)
+
+
+def liftover(line_from_infile, gene_sym_ct_dict, g_idx, n_idx, gtf_dict, d_idx):
+    """
+    Update gene symbol if found in dict, leave alone if not
+    """
+    data = line_from_infile.rstrip('\n').split('\t')
+    cur_id = data[g_idx].split('.')[0]
+    if cur_id in gtf_dict:
+        data[n_idx] = gtf_dict[cur_id]
+    gene_sym = data[n_idx]
+    if gene_sym not in gene_sym_ct_dict:
+        gene_sym_ct_dict[gene_sym] = 1
+    else:
+        gene_sym_ct_dict[gene_sym] += 1
+    out_data = "{}\t{}\t{}\n".format(cur_id, gene_sym, '\t'.join(data[d_idx:]))
+    return out_data
+
+
+def collate_dups(in_data, dup_gene_sym_dict, ct, gene_sym_ct_dict, d_idx, n_idx):
+    """
+    Checks if entry is in dup dict (really symbol seen > 1 in gene_sym_ct_dict),
+    then add to dict ot process, or output  to open file handle as-is if not repeated
+    """
+    data = in_data.rstrip('\n').split('\t')
+    gene_sym = data[n_idx]
+    out_data = None
+    if gene_sym_ct_dict[gene_sym] > 1:
+        if gene_sym not in dup_gene_sym_dict:
+            dup_gene_sym_dict[gene_sym] = []
+            ct += 1
+        # cast as int for later
+        dup_gene_sym_dict[gene_sym].append(list(map(float, data[d_idx:])))
+    else:
+        out_data = "{}\t{}\n".format(gene_sym, '\t'.join(data[d_idx:]))
+    return ct, out_data
+
+
+def collapse_dups(dup_gene_sym_dict):
+    """
+    Get highest mean for repeated gene symbols and output that row
+    """
+    for gene_sym in dup_gene_sym_dict:
+        means = []
+        for data in dup_gene_sym_dict[gene_sym]:
+            means.append(mean(data))
+        top_idx = means.index(max(means))
+        out_data = "{}\t{}\n".format(gene_sym, '\t'.join(list(map(str, dup_gene_sym_dict[gene_sym][top_idx]))))
+        return out_data
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Script to use a GTF and expression matrix with ENSG to liftover gene symbols."
+    )
+    parser.add_argument(
+        "-t",
+        "--table",
+        action="store",
+        dest="table",
+        help="Input expression matrix. Can be gzipped or plain tsv",
+    )
+    parser.add_argument(
+        "-o", "--output-basename", action="store", dest="out", help="output basename of liftover and collapse if flag given"
+    )
+    parser.add_argument(
+        "-i", "--input-gtf", action="store", dest="gtf", help="GTF file to use as liftover reference. Can be gzipped or plain tsv"
+    )
+    parser.add_argument(
+        "-g", "--gene-id", action="store", dest="gene_id", help="NAME of gene ID (ENSG values) column"
+    )
+    parser.add_argument(
+        "-n", "--gene-name", action="store", dest="gene_name", help="NAME of gene name column, if present to keep if not found"
+    )
+    parser.add_argument(
+        "-s", "--skip", action="store", dest="skip", type=int, help="Number of lines to skip if needed"
+    )
+    parser.add_argument(
+        "-c", "--collapse", action="store_true", dest="collapse", help="If set, will collapse on gene_symbol"
+    )
+
+    args = parser.parse_args()
+
+    gtf_dict = {}
+    # hardcoded for now, could make opts
+    tx_type = 2
+    tx_desc = 8
+    # set up pattern matcher ahead of time for efficiency
+    id_find = re.compile(r"gene_id \"(ENSG\d+)\.\d+\";")
+    name_find = re.compile(r".*gene_name \"(\S+)\";")
+    print('Indexing gtf', file=sys.stderr)
+    with (gzip.open if args.gtf.endswith("gz") else open)(args.gtf, "rt", encoding="utf-8") as gtf:
+        for line in gtf:
+            if not line.startswith('##'):
+                update_gtf_dict(gtf_dict, line, tx_type, tx_desc, id_find, name_find)
+    # track sym occurrences if collapse flag given for later processing
+    gene_sym_ct_dict = {}
+    with (gzip.open if args.table.endswith("gz") else open)(args.table, "rt", encoding="utf-8") as table:
+        if args.skip:
+            print('Skipping ' + str(args.skip) + ' lines', file=sys.stderr)
+            for i in range(args.skip):
+                skip = next(table)
+        head = next(table)
+        header = head.rstrip('\n').split('\t')
+        # get positions of current gene id and name
+        g_idx = header.index(args.gene_id)
+        n_idx = header.index(args.gene_name)
+        # make assumption that data will start after ID and name
+        d_idx = max(g_idx, n_idx) + 1
+        print('Processing table', file=sys.stderr)
+        out_lift_fn = args.out + '.liftover.tsv'
+        out_lift = open(out_lift_fn, 'w')
+        # Some counters to track progress
+        n = 10000
+        l = 0
+        out_head = "{}\t{}\t{}\n".format('gene_id', 'gene_name', '\t'.join(header[d_idx:]))
+        out_lift.write(out_head)
+        for line in table:
+            if l % n == 0:
+                print("Processed " + str(l) + " lines", file=sys.stderr)
+            out_data = liftover(line, gene_sym_ct_dict, g_idx, n_idx, gtf_dict, d_idx)
+            out_lift.write(out_data)
+            l += 1
+        # dump out rest of leftover lines
+        out_lift.close()
+
+    if args.collapse:
+        print("Collapse flag given. Collating dups from liftover", file=sys.stderr)
+        dup_gene_sym_dict = {}
+        dup_ct = 0
+        out_collapse_fn = args.out + '.collapsed.tsv'
+        out_collapse = open(out_collapse_fn, 'w')
+        with (gzip.open if out_lift_fn.endswith("gz") else open)(out_lift_fn, "rt", encoding="utf-8") as lifted:
+            # already got header as array, so can skip splitting
+            skip = next(lifted)
+            out_head = "{}\t{}\n".format('gene_name', '\t'.join(header[d_idx:]))
+            out_collapse.write(out_head)
+            l = 0
+            for line in lifted:
+                dup_ct, out_data = collate_dups(line, dup_gene_sym_dict, dup_ct, gene_sym_ct_dict, d_idx, n_idx)
+                if l % n == 0:
+                    print("Processed " + str(l) + " lines", file=sys.stderr)
+                if out_data:
+                    out_collapse.write(out_data)
+                l += 1
+        print("Processing " + str(dup_ct) + " repeat gene symbols and choosing highest mean expression", file=sys.stderr)
+        out_data = collapse_dups(dup_gene_sym_dict)
+        out_collapse.write(out_data)
+        print("Done!", file=sys.stderr)
+
+
+if __name__ == '__main__':
+    main()
@@ -0,0 +1,35 @@
+cwlVersion: v1.2
+class: CommandLineTool
+id: convert-to-rds
+doc: "Converts tsv table to rds"
+
+requirements:
+  - class: ShellCommandRequirement
+  - class: DockerRequirement
+    dockerPull: 'rocker/tidyverse:3.5.3'
+  - class: InlineJavascriptRequirement
+  - class: InitialWorkDirRequirement
+    listing:
+      - entryname: convert_to_rds.R
+        entry:
+          $include: ../scripts/convert_to_rds.R
+  - class: ResourceRequirement
+    ramMin: $(inputs.ram * 1000)
+
+baseCommand: [Rscript convert_to_rds.R]
+
+inputs:
+  input_tsv: {type: File, doc: "Input expression matrix. Can be gzipped or plain tsv",
+    inputBinding: { prefix: "--input-tsv", position: 1} }
+  gene_col: {type: 'string?', doc: "Name of column with gene symbol", default: 'gene_name',
+    inputBinding: { prefix: "--gene-col", position: 1} }
+  output_filename: {type: string, doc: "Output file name, should fit pattern gene-[expression/counts]-rsem-[fpkm/tpm/expected_count]-collapsed.rds",
+    inputBinding: { prefix: "--output-filename", position: 1} }
+  ram: { type: 'int?', doc: "Set ram requirement. Unfortunately reading and writing rds files is beefy!", default: 32 }
+
+outputs:
+  pirate_output:
+    type: File
+    outputBinding:
+      glob: '*.rds'
+    doc: "rds file generated from input tsv"
@@ -0,0 +1,50 @@
+cwlVersion: v1.2
+class: CommandLineTool
+id: liftover-collapse-rnaseq
+doc: "Tool to use a GTF and expression matrix with ENSG to liftover gene symbols and collapse repeat symbols if needed"
+
+requirements:
+  - class: ShellCommandRequirement
+  - class: DockerRequirement
+    dockerPull: 'python:3.9'
+  - class: InlineJavascriptRequirement
+  - class: InitialWorkDirRequirement
+    listing:
+      - entryname: liftover_collapse_rnaseq.py
+        entry:
+          $include: ../scripts/liftover_collapse_rnaseq.py
+
+
+baseCommand: [python3 liftover_collapse_rnaseq.py]
+
+inputs:
+  table: {type: File, doc: "Input expression matrix. Can be gzipped or plain tsv",
+    inputBinding: { prefix: "--table", position: 1 } }
+  output_basename: {type: string, doc: "output basename of liftover and collapse if flag given",
+    inputBinding: { prefix: "--output-basename", position: 1 } }
+  input_gtf: {type: File, doc: "GTF file to use as liftover reference. Can be gzipped or plain tsv",
+    inputBinding: { prefix: "--input-gtf", position: 1 } }
+  gene_id: {type: string, doc: "NAME of gene ID (ENSG values) column",
+    inputBinding: { prefix: "--gene-id", position: 1 } }
+  gene_name: {type: string, doc: "NAME of gene name column, if present to keep if not found",
+    inputBinding: { prefix: "--gene-name", position: 1 } }
+  skip: {type: 'int?', doc: "Number of lines to skip if needed, i.e. file has extra unneeded header lines",
+    inputBinding: { prefix: "--skip", position: 1 } }
+  collapse: {type: 'boolean?', doc: "If set, will collapse on repeat gene symbols and choose highest mean expression", default: true,
+    inputBinding: { prefix: "--collapse", position: 1 } }
+
+
+
+
+outputs:
+  liftover:
+    type: File
+    outputBinding:
+      glob: '*.liftover.tsv'
+    doc: "Un-collapsed expression matrix with lifted-over gene symbols"
+  collapsed:
+    type: File
+    outputBinding:
+      glob: '*.collapsed.tsv'
+    doc: "Collapsed expression matrix with lifted-over gene symbols"
+