From bc33c83de8af56da327aff10e5023690193c7f41 Mon Sep 17 00:00:00 2001 From: migbro Date: Fri, 10 Mar 2023 11:05:04 -0500 Subject: [PATCH 01/13] :tada: created liftover scripts and wrapper cwl --- .../cwl/scripts/liftover_collapse_rnaseq.py | 180 ++++++++++++++++++ .../liftover_collapse_rnaseq_wrapper.sh | 2 + .../cwl/tools/liftover_collapse_rnaseq.cwl | 52 +++++ 3 files changed, 234 insertions(+) create mode 100644 analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py create mode 100755 analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq_wrapper.sh create mode 100644 analyses/collapse-rnaseq/cwl/tools/liftover_collapse_rnaseq.cwl diff --git a/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py b/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py new file mode 100644 index 0000000000..f04c09f225 --- /dev/null +++ b/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py @@ -0,0 +1,180 @@ +""" +Script to use a GTF and expression matrix with ENSG to liftover gene symbols +""" + +import argparse +import gzip +import re +import sys +from statistics import mean + + +def update_gtf_dict(in_dict, in_text): + """ + Update ensg - sym ID dictionary on gene entries only + """ + data = in_text.strip('\n').split('\t') + if data[tx_type] == 'gene': + try: + gene_id = id_find.match(data[tx_desc]).group(1) + gene_name = name_find.match(data[tx_desc]).group(1) + in_dict[gene_id] = gene_name + except Exception as e: + sys.stderr.write(str(e) + '\n') + sys.stderr.write("Failed to parse gene ID and name from " + in_text) + + +def liftover(in_data, out_lift, n, l): + """ + Update gene symbol if found in dict, leave alone if not + """ + if l % n == 0: + sys.stderr.write("Processed " + str(l) + " lines\n") + sys.stderr.flush() + data = in_data.rstrip('\n').split('\t') + cur_id = data[g_idx].split('.')[0] + if cur_id in gtf_dict: + data[n_idx] = gtf_dict[cur_id] + gene_sym = data[n_idx] + if gene_sym not in sym_ct: + sym_ct[gene_sym] = 1 + else: + sym_ct[gene_sym] += 1 + out_data = cur_id + '\t' + gene_sym + '\t' + '\t'.join(data[d_idx:]) + '\n' + out_lift.write(out_data) + l += 1 + return l + + +def collate_dups(in_data, in_dict, ct, out_file, l): + """ + Checks if entry is in dup dict (really smybol seen > 1 in sym_ct), + then add to dict ot process, or output to open file handle as-is if not repeated + """ + if l % n == 0: + sys.stderr.write("Processed " + str(l) + " lines\n") + sys.stderr.flush() + data = in_data.rstrip('\n').split('\t') + gene_sym = data[n_idx] + if sym_ct[gene_sym] > 1: + if gene_sym not in dup_dict: + dup_dict[gene_sym] = [] + ct += 1 + # cast as int for later + dup_dict[gene_sym].append(list(map(float, data[d_idx:]))) + else: + out_data = gene_sym + "\t" + "\t".join(data[d_idx:]) + '\n' + out_file.write(out_data) + l += 1 + return ct, l + + +def collapse_dups(dup_dict, out_file): + """ + Get highest mean for repeated gene symbols and output that row + """ + for gene_sym in dup_dict: + means = [] + for data in dup_dict[gene_sym]: + means.append(mean(data)) + top_idx = means.index(max(means)) + out_data = gene_sym + "\t" + "\t".join(list(map(str, dup_dict[gene_sym][top_idx]))) + out_file.write(out_data) + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser( + description="Script to use a GTF and expression matrix with ENSG to liftover gene symbols." + ) + parser.add_argument( + "-t", + "--table", + action="store", + dest="table", + help="Input expression matrix", + ) + parser.add_argument( + "-o", "--output-basename", action="store", dest="out", help="output basename of liftover and collapse if flag given" + ) + parser.add_argument( + "-i", "--input-gtf", action="store", dest="gtf", help="GTF file to use as liftover reference" + ) + parser.add_argument( + "-g", "--gene-id", action="store", dest="gene_id", help="NAME of gene ID (ENSG values) column" + ) + parser.add_argument( + "-n", "--gene-name", action="store", dest="gene_name", help="NAME of gene name column, if present to keep if not found" + ) + parser.add_argument( + "-s", "--skip", action="store", dest="skip", type=int, help="Number of lines to skip if needed" + ) + parser.add_argument( + "-c", "--collapse", action="store_true", dest="collapse", help="If set, will collapse on gene_symbol" + ) + + args = parser.parse_args() + + gtf_dict = {} + # hardcoded for now, could make opts + tx_type = 2 + tx_desc = 8 + # set up pattern matcher ahead of time for efficiency + id_find = re.compile(r"gene_id \"(ENSG\d+)\.\d+\";") + name_find = re.compile(r".*gene_name \"(\S+)\";") + sys.stderr.write('Indexing gtf\n') + with (gzip.open if args.gtf.endswith("gz") else open)(args.gtf, "rt", encoding="utf-8") as gtf: + for line in gtf: + if not line.startswith('##'): + update_gtf_dict(gtf_dict, line) + # track sym occurrences if collapse flag given for later processing + sym_ct = {} + with (gzip.open if args.table.endswith("gz") else open)(args.table, "rt", encoding="utf-8") as table: + if args.skip: + sys.stderr.write('Skipping ' + str(args.skip) + ' lines\n') + for i in range(args.skip): + skip = next(table) + head = next(table) + header = head.rstrip('\n').split('\t') + # get positions of current gene id and name + g_idx = header.index(args.gene_id) + n_idx = header.index(args.gene_name) + # make assumption that data will start after ID and name + d_idx = g_idx + 1 + if n_idx > g_idx: + d_idx = n_idx + 1 + sys.stderr.write('Processing table\n') + sys.stderr.flush() + out_lift_fn = args.out + '.liftover.tsv' + out_lift = open(out_lift_fn, 'w') + # Some counters to track progress + n = 1000 + l = 0 + out_head = ('gene_id\tgene_name\t' + '\t'.join(header[d_idx:])) + out_lift.write(out_head) + for line in table: + l = liftover(line, out_lift, n, l) + # dump out rest of leftover lines + out_lift.close() + + if args.collapse: + sys.stderr.write("Collapse flag given. Collating dups from liftover\n") + sys.stderr.flush() + dup_dict = {} + dup_ct = 0 + out_collapse_fn = args.out + '.collapsed.tsv' + out_collapse = open(out_collapse_fn, 'w') + with (gzip.open if out_lift_fn.endswith("gz") else open)(out_lift_fn, "rt", encoding="utf-8") as lifted: + # already got header as array, so can skip splitting + skip = next(lifted) + out_head = 'gene_name' + '\t' + '\t'.join(header[d_idx:]) + '\n' + out_collapse.write(out_head) + l = 0 + for line in lifted: + dup_ct, l = collate_dups(line, dup_dict, dup_ct, out_collapse, l) + sys.stderr.write("Processing " + str(dup_ct) + " repeat gene symbols and choosing highest mean expression\n") + sys.stderr.flush() + collapse_dups(dup_dict, out_collapse) + sys.stderr.write("Done!\n") + sys.stderr.flush() + diff --git a/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq_wrapper.sh b/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq_wrapper.sh new file mode 100755 index 0000000000..5dbc7fc961 --- /dev/null +++ b/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq_wrapper.sh @@ -0,0 +1,2 @@ +#!/bin/bash +python3 liftover_collapse_rnaseq.py $@ && gzip *.tsv \ No newline at end of file diff --git a/analyses/collapse-rnaseq/cwl/tools/liftover_collapse_rnaseq.cwl b/analyses/collapse-rnaseq/cwl/tools/liftover_collapse_rnaseq.cwl new file mode 100644 index 0000000000..ca84870f5d --- /dev/null +++ b/analyses/collapse-rnaseq/cwl/tools/liftover_collapse_rnaseq.cwl @@ -0,0 +1,52 @@ +cwlVersion: v1.0 +class: CommandLineTool +id: liftover-collapse-rnaseq +doc: "Tool to use a GTF and expression matrix with ENSG to liftover gene symbols and collapse repeat symbols if needed" + +requirements: + - class: ShellCommandRequirement + - class: DockerRequirement + dockerPull: 'pgc-images.sbgenomics.com/d3b-bixu/open-pedcan:latest' + - class: InlineJavascriptRequirement + - class: InitialWorkDirRequirement + listing: + - entryname: liftover_collapse_rnaseq_wrapper.sh + entry: + $include: ../scripts/liftover_collapse_rnaseq_wrapper.sh + - entryname: liftover_collapsed_rnaseq.py + entry: + $include: ../scripts/liftover_collapse_rnaseq.py + + +baseCommand: [./liftover_collapse_rnaseq_wrapper.sh] + +inputs: + table: {type: File, doc: "Input expression matrix", + inputBinding: { prefix: "--table", position: 1} } + output_basename: {type: string, doc: "output basename of liftover and collapse if flag given", + inputBinding: { prefix: "--output-basename", position: 1} } + input_gtf: {type: string, doc: "GTF file to use as liftover reference", + inputBinding: { prefix: "--input-gtf", position: 1} } + gene_id: {type: string, doc: "NAME of gene ID (ENSG values) column", + inputBinding: { prefix: "--gene-id", position: 1} } + gene_name: {type: string, doc: "NAME of gene name column, if present to keep if not found", + inputBinding: { prefix: "--gene-name", position: 1} } + skip: {type: 'int?', doc: "Number of lines to skip if needed, i.e. file has extra unneeded header lines", + inputBinding: { prefix: "skip", position: 1} } + collapse: {type: 'boolean?', doc: "If set, will collapse on repeat gene symbols and choose highest mean expression", default: true, + inputBinding: { prefix: "skip", position: 1} } + + + + +outputs: + liftover: + type: File + outputBinding: + glob: '*.liftover.tsv.gz' + doc: "Un-collapsed expression matrix with lifted-over gene symbols" + collapsed: + type: File + outputBinding: + glob: '*.collapsed.tsv.gz' + doc: "Collapsed expression matrix with lifted-over gene symbols" From fade157c462981157f415421d50b6a7d347ad715 Mon Sep 17 00:00:00 2001 From: migbro Date: Fri, 10 Mar 2023 12:15:54 -0500 Subject: [PATCH 02/13] :hammer: switched to python docker since it pull faster :pencil: updated docs :wrench: fixed bugs :racehorse: parrallel gzip of files --- analyses/collapse-rnaseq/README.md | 51 +++++++++++++++++++ .../cwl/scripts/liftover_collapse_rnaseq.py | 4 +- .../liftover_collapse_rnaseq_wrapper.sh | 2 +- .../cwl/tools/liftover_collapse_rnaseq.cwl | 14 ++--- 4 files changed, 61 insertions(+), 10 deletions(-) diff --git a/analyses/collapse-rnaseq/README.md b/analyses/collapse-rnaseq/README.md index 6516bebb41..154b14c27a 100644 --- a/analyses/collapse-rnaseq/README.md +++ b/analyses/collapse-rnaseq/README.md @@ -34,3 +34,54 @@ This script is not run via `run-collapse-rnaseq.sh`. * `01-summarize_matrices.R` - this script generates the collapsed matrices as described above. In addition, this script calculates the average Pearson correlation between the values of the gene symbol that is kept and those duplicates that are discarded. * `02-analyze-drops.Rmd` - this is used to display tables from `01-summarize_matrices.R`. + +## CWL Tools and scripts +This contains a cwl tool wrapper (to run using cwlrunner or on a cwl-enabled platform, like CAVATICA). +The wrapper `.sh` script runs the python script and gzips the outputs in the end. +The python script does the liftover and collapse with no compression. +Since it uses base python packages, one can run it in any environment with a standard python3 installation. +``` +cwl +├── scripts +│ ├── liftover_collapse_rnaseq.py +│ └── liftover_collapse_rnaseq_wrapper.sh +└── tools + └── liftover_collapse_rnaseq.cwl +``` +### analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py +``` +usage: liftover_collapse_rnaseq.py [-h] [-t TABLE] [-o OUT] [-i GTF] [-g GENE_ID] [-n GENE_NAME] [-s SKIP] [-c] + +Script to use a GTF and expression matrix with ENSG to liftover gene symbols. + +optional arguments: + -h, --help show this help message and exit + -t TABLE, --table TABLE + Input expression matrix. Can be gzipped or plain tsv + -o OUT, --output-basename OUT + output basename of liftover and collapse if flag given + -i GTF, --input-gtf GTF + GTF file to use as liftover reference. Can be gzipped or plain tsv + -g GENE_ID, --gene-id GENE_ID + NAME of gene ID (ENSG values) column + -n GENE_NAME, --gene-name GENE_NAME + NAME of gene name column, if present to keep if not found + -s SKIP, --skip SKIP Number of lines to skip if needed + -c, --collapse If set, will collapse on gene_symbol +``` +Examples: +`python3 liftover_collapse_rnaseq.py -t TCGA_gene_counts.tsv -o TCGA -i gencode.v39.primary_assembly.gtf -g Gene_id -n Gene_name --collapse 2> errs.log` will generate: +``` +├── TCGA.collapsed.tsv +├── TCGA.liftover.tsv +``` +### analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq_wrapper.sh +Just runs the python script with an added `gzip *.tsv` step. +Example with GTEx input: +`bash liftover_collapse_rnaseq_wrapper.sh --collapse --gene-id Name --gene-name Description --input-gtf gencode.v39.primary_assembly.annotation.gtf --output-basename GTEx --skip 2 --table GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz` +So output would be: +``` +├── TCGA.collapsed.tsv.gz +├── TCGA.liftover.tsv.gz +``` +from example above \ No newline at end of file diff --git a/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py b/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py index f04c09f225..8fa54869d9 100644 --- a/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py +++ b/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py @@ -92,13 +92,13 @@ def collapse_dups(dup_dict, out_file): "--table", action="store", dest="table", - help="Input expression matrix", + help="Input expression matrix. Can be gzipped or plain tsv", ) parser.add_argument( "-o", "--output-basename", action="store", dest="out", help="output basename of liftover and collapse if flag given" ) parser.add_argument( - "-i", "--input-gtf", action="store", dest="gtf", help="GTF file to use as liftover reference" + "-i", "--input-gtf", action="store", dest="gtf", help="GTF file to use as liftover reference. Can be gzipped or plain tsv" ) parser.add_argument( "-g", "--gene-id", action="store", dest="gene_id", help="NAME of gene ID (ENSG values) column" diff --git a/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq_wrapper.sh b/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq_wrapper.sh index 5dbc7fc961..ea89663652 100755 --- a/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq_wrapper.sh +++ b/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq_wrapper.sh @@ -1,2 +1,2 @@ #!/bin/bash -python3 liftover_collapse_rnaseq.py $@ && gzip *.tsv \ No newline at end of file +python3 liftover_collapse_rnaseq.py $@ && ls *.tsv | xargs -IFN -P 2 gzip FN \ No newline at end of file diff --git a/analyses/collapse-rnaseq/cwl/tools/liftover_collapse_rnaseq.cwl b/analyses/collapse-rnaseq/cwl/tools/liftover_collapse_rnaseq.cwl index ca84870f5d..6a814c4ee6 100644 --- a/analyses/collapse-rnaseq/cwl/tools/liftover_collapse_rnaseq.cwl +++ b/analyses/collapse-rnaseq/cwl/tools/liftover_collapse_rnaseq.cwl @@ -6,35 +6,35 @@ doc: "Tool to use a GTF and expression matrix with ENSG to liftover gene symbols requirements: - class: ShellCommandRequirement - class: DockerRequirement - dockerPull: 'pgc-images.sbgenomics.com/d3b-bixu/open-pedcan:latest' + dockerPull: 'python:3.9' - class: InlineJavascriptRequirement - class: InitialWorkDirRequirement listing: - entryname: liftover_collapse_rnaseq_wrapper.sh entry: $include: ../scripts/liftover_collapse_rnaseq_wrapper.sh - - entryname: liftover_collapsed_rnaseq.py + - entryname: liftover_collapse_rnaseq.py entry: $include: ../scripts/liftover_collapse_rnaseq.py -baseCommand: [./liftover_collapse_rnaseq_wrapper.sh] +baseCommand: [bash liftover_collapse_rnaseq_wrapper.sh] inputs: - table: {type: File, doc: "Input expression matrix", + table: {type: File, doc: "Input expression matrix. Can be gzipped or plain tsv", inputBinding: { prefix: "--table", position: 1} } output_basename: {type: string, doc: "output basename of liftover and collapse if flag given", inputBinding: { prefix: "--output-basename", position: 1} } - input_gtf: {type: string, doc: "GTF file to use as liftover reference", + input_gtf: {type: File, doc: "GTF file to use as liftover reference. Can be gzipped or plain tsv", inputBinding: { prefix: "--input-gtf", position: 1} } gene_id: {type: string, doc: "NAME of gene ID (ENSG values) column", inputBinding: { prefix: "--gene-id", position: 1} } gene_name: {type: string, doc: "NAME of gene name column, if present to keep if not found", inputBinding: { prefix: "--gene-name", position: 1} } skip: {type: 'int?', doc: "Number of lines to skip if needed, i.e. file has extra unneeded header lines", - inputBinding: { prefix: "skip", position: 1} } + inputBinding: { prefix: "--skip", position: 1} } collapse: {type: 'boolean?', doc: "If set, will collapse on repeat gene symbols and choose highest mean expression", default: true, - inputBinding: { prefix: "skip", position: 1} } + inputBinding: { prefix: "--collapse", position: 1} } From 840d0ae57e0d5871de197ae1f6bce454dfcc2e9f Mon Sep 17 00:00:00 2001 From: migbro Date: Fri, 10 Mar 2023 16:49:35 -0500 Subject: [PATCH 03/13] :tada: added convert to rds tool and workflow --- .../cwl/scripts/convert_to_rds.R | 32 +++++++++++ .../cwl/tools/convert_to_rds.cwl | 37 +++++++++++++ .../cwl/tools/liftover_collapse_rnaseq.cwl | 1 + .../workflows/liftover_collapse_exp_wf.cwl | 53 +++++++++++++++++++ 4 files changed, 123 insertions(+) create mode 100644 analyses/collapse-rnaseq/cwl/scripts/convert_to_rds.R create mode 100644 analyses/collapse-rnaseq/cwl/tools/convert_to_rds.cwl create mode 100644 analyses/collapse-rnaseq/cwl/workflows/liftover_collapse_exp_wf.cwl diff --git a/analyses/collapse-rnaseq/cwl/scripts/convert_to_rds.R b/analyses/collapse-rnaseq/cwl/scripts/convert_to_rds.R new file mode 100644 index 0000000000..b2f8b29d7c --- /dev/null +++ b/analyses/collapse-rnaseq/cwl/scripts/convert_to_rds.R @@ -0,0 +1,32 @@ +if(!require(optparse)){install.packages('optparse')} +library("optparse") + +#process inputs +option_list <- list( + make_option( + opt_str = "--input-tsv", + type = "character", + dest = "input_tsv", + help = "tsv RNA expression matrix", + ), + make_option( + opt_str = "--gene-col", + type = "character", + dest = "gene_col", + help = "Name of column with gene symbol" + ), + make_option( + opt_str = "--output-filename", + type = "character", + dest = "out", + help = "Output file name, should fit pattern gene-[expression/counts]-rsem-[fpkm/tpm/expected_count]-collapsed.rds" + ) +) + +#parse options +opts <- parse_args(OptionParser(option_list = option_list)) +message("Reading in tsv...") +tsv_df = read.table(file = opts$input_tsv, sep = '\t', header = TRUE, check.names = FALSE, row.names=opts$gene_col) +message("Saving as rds...") +saveRDS(tsv_df, file=opts$out) +message("Done!") diff --git a/analyses/collapse-rnaseq/cwl/tools/convert_to_rds.cwl b/analyses/collapse-rnaseq/cwl/tools/convert_to_rds.cwl new file mode 100644 index 0000000000..0cd1ed2d03 --- /dev/null +++ b/analyses/collapse-rnaseq/cwl/tools/convert_to_rds.cwl @@ -0,0 +1,37 @@ +cwlVersion: v1.0 +class: CommandLineTool +id: convert-to-rds +doc: "Converts tsv table to rds" + +requirements: + - class: ShellCommandRequirement + - class: DockerRequirement + dockerPull: 'rocker/tidyverse:3.5.3' + - class: InlineJavascriptRequirement + - class: InitialWorkDirRequirement + listing: + - entryname: convert_to_rds.R + entry: + $include: ../scripts/convert_to_rds.R + - class: ResourceRequirement + ramMin: $(inputs.ram * 1000) + + + +baseCommand: [Rscript convert_to_rds.R] + +inputs: + input_tsv: {type: File, doc: "Input expression matrix. Can be gzipped or plain tsv", + inputBinding: { prefix: "--input-tsv", position: 1} } + gene_col: {type: 'string?', doc: "Name of column with gene symbol", default: 'gene_name', + inputBinding: { prefix: "--gene-col", position: 1} } + output_filename: {type: string, doc: "Output file name, should fit pattern gene-[expression/counts]-rsem-[fpkm/tpm/expected_count]-collapsed.rds", + inputBinding: { prefix: "--output-filename", position: 1} } + ram: { type: 'int?', doc: "Set ram requirement. Unfortunately reading and writing rds files is beefy!", default: 32} + +outputs: + pirate_output: + type: File + outputBinding: + glob: '*.rds' + doc: "rds file generated from input tsv" diff --git a/analyses/collapse-rnaseq/cwl/tools/liftover_collapse_rnaseq.cwl b/analyses/collapse-rnaseq/cwl/tools/liftover_collapse_rnaseq.cwl index 6a814c4ee6..d3f50eeb35 100644 --- a/analyses/collapse-rnaseq/cwl/tools/liftover_collapse_rnaseq.cwl +++ b/analyses/collapse-rnaseq/cwl/tools/liftover_collapse_rnaseq.cwl @@ -50,3 +50,4 @@ outputs: outputBinding: glob: '*.collapsed.tsv.gz' doc: "Collapsed expression matrix with lifted-over gene symbols" + \ No newline at end of file diff --git a/analyses/collapse-rnaseq/cwl/workflows/liftover_collapse_exp_wf.cwl b/analyses/collapse-rnaseq/cwl/workflows/liftover_collapse_exp_wf.cwl new file mode 100644 index 0000000000..aeff88c5f1 --- /dev/null +++ b/analyses/collapse-rnaseq/cwl/workflows/liftover_collapse_exp_wf.cwl @@ -0,0 +1,53 @@ +cwlVersion: v1.2 +class: Workflow +id: liftover-collapse-rnaseq-wf +label: Liftover and Collapse RNAseq WF +doc: "Lifts over gene symbols from given gtf and output lifted and collapsed by gene symbol tsv, and rds of collapsed file" + + +inputs: + # lift and collapse + table: { type: File, doc: "Input expression matrix. Can be gzipped or plain tsv" } + output_basename: {type: string, doc: "output basename of liftover and collapse if flag given" } + input_gtf: { type: File, doc: "GTF file to use as liftover reference. Can be gzipped or plain tsv" } + gene_id: { type: string, doc: "NAME of gene ID (ENSG values) column" } + gene_name: { type: string, doc: "NAME of gene name column, if present to keep if not found" } + skip: { type: 'int?', doc: "Number of lines to skip if needed, i.e. file has extra unneeded header lines" } + collapse: { type: 'boolean?', doc: "If set, will collapse on repeat gene symbols and choose highest mean expression" } + # convert to rds + output_filename: { type: string, doc: "Output file name, should fit pattern gene-[expression/counts]-rsem-[fpkm/tpm/expected_count]-collapsed.rds" } + rds_ram: { type: 'int?', doc: "Set ram requirement. Unfortunately reading and writing rds files is beefy!", default: 32} + +outputs: + lifted: { type: File, outputSource: liftover_collapse_rnaseq/liftover } + collapsed: { type: File, outputSource: liftover_collapse_rnaseq/collapsed } + expression_rds: { type: 'File?', outputSource: convert_to_rds/pirate_output } + +steps: + liftover_collapse_rnaseq: + run: ../tools/liftover_collapse_rnaseq.cwl + hints: + - class: 'sbg:AWSInstanceType' + value: c5.2xlarge;ebs-gp2;400 + doc: "Chosen for speed and lower cost" + in: + table: table + output_basename: output_basename + input_gtf: input_gtf + gene_id: gene_id + gene_name: gene_name + skip: skip + collapse: collapse + out: [liftover, collapsed] + + convert_to_rds: + run: ../tools/convert_to_rds.cwl + when: $(inputs.input_tsv != null) + in: + input_tsv: liftover_collapse_rnaseq/collapsed + output_filename: output_filename + ram: rds_ram + out: [pirate_output] + +$namespaces: + sbg: https://sevenbridges.com From e6bce114efa5e6380ef2ca4c82d2d3867702a08d Mon Sep 17 00:00:00 2001 From: migbro Date: Fri, 10 Mar 2023 17:28:20 -0500 Subject: [PATCH 04/13] :wrench: fixed output bug in collapse script :wrench: fixed hint :pencil: Updated readme --- analyses/collapse-rnaseq/README.md | 39 +++++++++++++++++-- .../cwl/scripts/liftover_collapse_rnaseq.py | 4 +- .../cwl/tools/convert_to_rds.cwl | 4 +- .../workflows/liftover_collapse_exp_wf.cwl | 4 +- 4 files changed, 40 insertions(+), 11 deletions(-) diff --git a/analyses/collapse-rnaseq/README.md b/analyses/collapse-rnaseq/README.md index 154b14c27a..5719e4324e 100644 --- a/analyses/collapse-rnaseq/README.md +++ b/analyses/collapse-rnaseq/README.md @@ -36,18 +36,28 @@ In addition, this script calculates the average Pearson correlation between the * `02-analyze-drops.Rmd` - this is used to display tables from `01-summarize_matrices.R`. ## CWL Tools and scripts -This contains a cwl tool wrapper (to run using cwlrunner or on a cwl-enabled platform, like CAVATICA). +This contains a cwl workflow wrapper (to run using cwlrunner or on a cwl-enabled platform, like CAVATICA). +The workflow runs a liftover step and a convert-to-red step. The wrapper `.sh` script runs the python script and gzips the outputs in the end. The python script does the liftover and collapse with no compression. Since it uses base python packages, one can run it in any environment with a standard python3 installation. +The convert-to-rds step is for downstream compatibility with modules ``` cwl ├── scripts +│ ├── convert_to_rds.R │ ├── liftover_collapse_rnaseq.py │ └── liftover_collapse_rnaseq_wrapper.sh -└── tools - └── liftover_collapse_rnaseq.cwl +├── tools +│ ├── convert_to_rds.cwl +│ └── liftover_collapse_rnaseq.cwl +└── workflows + └── liftover_collapse_exp_wf.cwl ``` + +### analyses/collapse-rnaseq/cwl/workflows/liftover_collapse_exp_wf.cwl +This workflow wrapper runs all of the tools described below + ### analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py ``` usage: liftover_collapse_rnaseq.py [-h] [-t TABLE] [-o OUT] [-i GTF] [-g GENE_ID] [-n GENE_NAME] [-s SKIP] [-c] @@ -84,4 +94,25 @@ So output would be: ├── TCGA.collapsed.tsv.gz ├── TCGA.liftover.tsv.gz ``` -from example above \ No newline at end of file +from example above + +### analyses/collapse-rnaseq/cwl/scripts/convert_to_rds.R +``` +Usage: convert_to_rds.R [options] + + +Options: + --input-tsv=INPUT-TSV + tsv RNA expression matrix + + --gene-col=GENE-COL + Name of column with gene symbol + + --output-filename=OUTPUT-FILENAME + Output file name, should fit pattern gene-[expression/counts]-rsem-[fpkm/tpm/expected_count]-collapsed.rds + + -h, --help + Show this help message and exit +``` +Example run with collapsed TCGA output: +`Rscript convert_to_rds.R --gene-col gene_name --input-tsv TCGA_v36_TEST.collapsed.tsv.gz --output-filename fancy_long_filename.rds` \ No newline at end of file diff --git a/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py b/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py index 8fa54869d9..d22636a8fb 100644 --- a/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py +++ b/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py @@ -78,7 +78,7 @@ def collapse_dups(dup_dict, out_file): for data in dup_dict[gene_sym]: means.append(mean(data)) top_idx = means.index(max(means)) - out_data = gene_sym + "\t" + "\t".join(list(map(str, dup_dict[gene_sym][top_idx]))) + out_data = gene_sym + "\t" + "\t".join(list(map(str, dup_dict[gene_sym][top_idx]))) + '\n' out_file.write(out_data) @@ -148,7 +148,7 @@ def collapse_dups(dup_dict, out_file): out_lift_fn = args.out + '.liftover.tsv' out_lift = open(out_lift_fn, 'w') # Some counters to track progress - n = 1000 + n = 10000 l = 0 out_head = ('gene_id\tgene_name\t' + '\t'.join(header[d_idx:])) out_lift.write(out_head) diff --git a/analyses/collapse-rnaseq/cwl/tools/convert_to_rds.cwl b/analyses/collapse-rnaseq/cwl/tools/convert_to_rds.cwl index 0cd1ed2d03..4c23a8efff 100644 --- a/analyses/collapse-rnaseq/cwl/tools/convert_to_rds.cwl +++ b/analyses/collapse-rnaseq/cwl/tools/convert_to_rds.cwl @@ -16,8 +16,6 @@ requirements: - class: ResourceRequirement ramMin: $(inputs.ram * 1000) - - baseCommand: [Rscript convert_to_rds.R] inputs: @@ -27,7 +25,7 @@ inputs: inputBinding: { prefix: "--gene-col", position: 1} } output_filename: {type: string, doc: "Output file name, should fit pattern gene-[expression/counts]-rsem-[fpkm/tpm/expected_count]-collapsed.rds", inputBinding: { prefix: "--output-filename", position: 1} } - ram: { type: 'int?', doc: "Set ram requirement. Unfortunately reading and writing rds files is beefy!", default: 32} + ram: { type: 'int?', doc: "Set ram requirement. Unfortunately reading and writing rds files is beefy!", default: 32 } outputs: pirate_output: diff --git a/analyses/collapse-rnaseq/cwl/workflows/liftover_collapse_exp_wf.cwl b/analyses/collapse-rnaseq/cwl/workflows/liftover_collapse_exp_wf.cwl index aeff88c5f1..2d500b9af6 100644 --- a/analyses/collapse-rnaseq/cwl/workflows/liftover_collapse_exp_wf.cwl +++ b/analyses/collapse-rnaseq/cwl/workflows/liftover_collapse_exp_wf.cwl @@ -28,8 +28,8 @@ steps: run: ../tools/liftover_collapse_rnaseq.cwl hints: - class: 'sbg:AWSInstanceType' - value: c5.2xlarge;ebs-gp2;400 - doc: "Chosen for speed and lower cost" + value: c5.2xlarge;ebs-gp2;400 + doc: "Chosen for speed and lower cost" in: table: table output_basename: output_basename From 53aaec0c71742a7ceafac2b8c4473ea0f767fdcb Mon Sep 17 00:00:00 2001 From: migbro Date: Tue, 14 Mar 2023 11:12:12 -0400 Subject: [PATCH 05/13] :hammer: made gzip it's own step to run in parallel :pencil: edited readme to reflect changes :heavy_minus_sign: dropped superfluous shell script --- analyses/collapse-rnaseq/README.md | 12 ++----- .../liftover_collapse_rnaseq_wrapper.sh | 2 -- .../cwl/tools/liftover_collapse_rnaseq.cwl | 23 ++++++------- .../collapse-rnaseq/cwl/tools/ubuntu_gzip.cwl | 33 +++++++++++++++++++ .../workflows/liftover_collapse_exp_wf.cwl | 21 ++++++++++-- 5 files changed, 63 insertions(+), 28 deletions(-) delete mode 100755 analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq_wrapper.sh create mode 100644 analyses/collapse-rnaseq/cwl/tools/ubuntu_gzip.cwl diff --git a/analyses/collapse-rnaseq/README.md b/analyses/collapse-rnaseq/README.md index 5719e4324e..e2730707d0 100644 --- a/analyses/collapse-rnaseq/README.md +++ b/analyses/collapse-rnaseq/README.md @@ -85,16 +85,8 @@ Examples: ├── TCGA.collapsed.tsv ├── TCGA.liftover.tsv ``` -### analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq_wrapper.sh -Just runs the python script with an added `gzip *.tsv` step. -Example with GTEx input: -`bash liftover_collapse_rnaseq_wrapper.sh --collapse --gene-id Name --gene-name Description --input-gtf gencode.v39.primary_assembly.annotation.gtf --output-basename GTEx --skip 2 --table GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz` -So output would be: -``` -├── TCGA.collapsed.tsv.gz -├── TCGA.liftover.tsv.gz -``` -from example above +### analyses/collapse-rnaseq/cwl/tools/ubuntu_gzip.cwl +gzips the output from liftover step ### analyses/collapse-rnaseq/cwl/scripts/convert_to_rds.R ``` diff --git a/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq_wrapper.sh b/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq_wrapper.sh deleted file mode 100755 index ea89663652..0000000000 --- a/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq_wrapper.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -python3 liftover_collapse_rnaseq.py $@ && ls *.tsv | xargs -IFN -P 2 gzip FN \ No newline at end of file diff --git a/analyses/collapse-rnaseq/cwl/tools/liftover_collapse_rnaseq.cwl b/analyses/collapse-rnaseq/cwl/tools/liftover_collapse_rnaseq.cwl index d3f50eeb35..977022f6b2 100644 --- a/analyses/collapse-rnaseq/cwl/tools/liftover_collapse_rnaseq.cwl +++ b/analyses/collapse-rnaseq/cwl/tools/liftover_collapse_rnaseq.cwl @@ -10,31 +10,28 @@ requirements: - class: InlineJavascriptRequirement - class: InitialWorkDirRequirement listing: - - entryname: liftover_collapse_rnaseq_wrapper.sh - entry: - $include: ../scripts/liftover_collapse_rnaseq_wrapper.sh - entryname: liftover_collapse_rnaseq.py entry: $include: ../scripts/liftover_collapse_rnaseq.py -baseCommand: [bash liftover_collapse_rnaseq_wrapper.sh] +baseCommand: [python3 liftover_collapse_rnaseq.py] inputs: table: {type: File, doc: "Input expression matrix. Can be gzipped or plain tsv", - inputBinding: { prefix: "--table", position: 1} } + inputBinding: { prefix: "--table", position: 1 } } output_basename: {type: string, doc: "output basename of liftover and collapse if flag given", - inputBinding: { prefix: "--output-basename", position: 1} } + inputBinding: { prefix: "--output-basename", position: 1 } } input_gtf: {type: File, doc: "GTF file to use as liftover reference. Can be gzipped or plain tsv", - inputBinding: { prefix: "--input-gtf", position: 1} } + inputBinding: { prefix: "--input-gtf", position: 1 } } gene_id: {type: string, doc: "NAME of gene ID (ENSG values) column", - inputBinding: { prefix: "--gene-id", position: 1} } + inputBinding: { prefix: "--gene-id", position: 1 } } gene_name: {type: string, doc: "NAME of gene name column, if present to keep if not found", - inputBinding: { prefix: "--gene-name", position: 1} } + inputBinding: { prefix: "--gene-name", position: 1 } } skip: {type: 'int?', doc: "Number of lines to skip if needed, i.e. file has extra unneeded header lines", - inputBinding: { prefix: "--skip", position: 1} } + inputBinding: { prefix: "--skip", position: 1 } } collapse: {type: 'boolean?', doc: "If set, will collapse on repeat gene symbols and choose highest mean expression", default: true, - inputBinding: { prefix: "--collapse", position: 1} } + inputBinding: { prefix: "--collapse", position: 1 } } @@ -43,11 +40,11 @@ outputs: liftover: type: File outputBinding: - glob: '*.liftover.tsv.gz' + glob: '*.liftover.tsv' doc: "Un-collapsed expression matrix with lifted-over gene symbols" collapsed: type: File outputBinding: - glob: '*.collapsed.tsv.gz' + glob: '*.collapsed.tsv' doc: "Collapsed expression matrix with lifted-over gene symbols" \ No newline at end of file diff --git a/analyses/collapse-rnaseq/cwl/tools/ubuntu_gzip.cwl b/analyses/collapse-rnaseq/cwl/tools/ubuntu_gzip.cwl new file mode 100644 index 0000000000..29c1d0f64e --- /dev/null +++ b/analyses/collapse-rnaseq/cwl/tools/ubuntu_gzip.cwl @@ -0,0 +1,33 @@ +cwlVersion: v1.0 +class: CommandLineTool +id: ubuntu-gzip +doc: "Simple tool to gzip files" + +requirements: + - class: ShellCommandRequirement + - class: DockerRequirement + dockerPull: 'ubuntu:22.04' + - class: InlineJavascriptRequirement + - class: InitialWorkDirRequirement + listing: + - entryname: files.txt + entry: | + $(inputs.input_files.map(function(e){return e.path}).join('\n')) + +baseCommand: [cat] +arguments: + - position: 1 + shellQuote: false + valueFrom: >- + files.txt | xargs -IFN -P $(inputs.cores) gzip FN + +inputs: + input_files: { type: 'File[]', doc: "Files to gzip" } + cores: { type: 'int?', doc: "Num threads to use. Uses xargs to gzip files in parallel", default: 2 } + +outputs: + gzipped_files: + type: 'File[]' + outputBinding: + glob: '*.gz' + doc: "Gzipped inputs" diff --git a/analyses/collapse-rnaseq/cwl/workflows/liftover_collapse_exp_wf.cwl b/analyses/collapse-rnaseq/cwl/workflows/liftover_collapse_exp_wf.cwl index 2d500b9af6..a669b1537b 100644 --- a/analyses/collapse-rnaseq/cwl/workflows/liftover_collapse_exp_wf.cwl +++ b/analyses/collapse-rnaseq/cwl/workflows/liftover_collapse_exp_wf.cwl @@ -13,14 +13,13 @@ inputs: gene_id: { type: string, doc: "NAME of gene ID (ENSG values) column" } gene_name: { type: string, doc: "NAME of gene name column, if present to keep if not found" } skip: { type: 'int?', doc: "Number of lines to skip if needed, i.e. file has extra unneeded header lines" } - collapse: { type: 'boolean?', doc: "If set, will collapse on repeat gene symbols and choose highest mean expression" } + collapse: { type: 'boolean?', doc: "If set, will collapse on repeat gene symbols and choose highest mean expression", default: true } # convert to rds output_filename: { type: string, doc: "Output file name, should fit pattern gene-[expression/counts]-rsem-[fpkm/tpm/expected_count]-collapsed.rds" } rds_ram: { type: 'int?', doc: "Set ram requirement. Unfortunately reading and writing rds files is beefy!", default: 32} outputs: - lifted: { type: File, outputSource: liftover_collapse_rnaseq/liftover } - collapsed: { type: File, outputSource: liftover_collapse_rnaseq/collapsed } + gzipped_results: { type: 'File[]', outputSource: gzip_tsv/gzipped_files } expression_rds: { type: 'File?', outputSource: convert_to_rds/pirate_output } steps: @@ -40,6 +39,19 @@ steps: collapse: collapse out: [liftover, collapsed] + gzip_tsv: + run: ../tools/ubuntu_gzip.cwl + hints: + - class: 'sbg:AWSInstanceType' + value: c5.2xlarge;ebs-gp2;400 + doc: "Chosen for speed and lower cost" + in: + input_files: + source: [liftover_collapse_rnaseq/liftover, liftover_collapse_rnaseq/collapsed] + valueFrom: | + $([self]) + out: [gzipped_files] + convert_to_rds: run: ../tools/convert_to_rds.cwl when: $(inputs.input_tsv != null) @@ -51,3 +63,6 @@ steps: $namespaces: sbg: https://sevenbridges.com +hints: +- class: "sbg:maxNumberOfParallelInstances" + value: 2 From 2d396ba3a9b077476815dc49d8756bc8d47dd83d Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Tue, 14 Mar 2023 13:23:57 -0400 Subject: [PATCH 06/13] Apply suggestions from code review Co-authored-by: Dan Miller --- .../collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py b/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py index d22636a8fb..797a07a1d5 100644 --- a/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py +++ b/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py @@ -40,7 +40,7 @@ def liftover(in_data, out_lift, n, l): sym_ct[gene_sym] = 1 else: sym_ct[gene_sym] += 1 - out_data = cur_id + '\t' + gene_sym + '\t' + '\t'.join(data[d_idx:]) + '\n' + out_data = "{}\t{}\t{}\n".format(cur_id, gene_sym, '\t'.join(data[d_idx:]) out_lift.write(out_data) l += 1 return l @@ -140,9 +140,7 @@ def collapse_dups(dup_dict, out_file): g_idx = header.index(args.gene_id) n_idx = header.index(args.gene_name) # make assumption that data will start after ID and name - d_idx = g_idx + 1 - if n_idx > g_idx: - d_idx = n_idx + 1 + d_idx = max(g_idx, n_idx) + 1 sys.stderr.write('Processing table\n') sys.stderr.flush() out_lift_fn = args.out + '.liftover.tsv' From 29734ae195901e631817a1789d8f9c2e00985419 Mon Sep 17 00:00:00 2001 From: migbro Date: Tue, 14 Mar 2023 13:24:30 -0400 Subject: [PATCH 07/13] :pencil: cwl version updates :hammer: wf refactor --- .../collapse-rnaseq/cwl/tools/convert_to_rds.cwl | 2 +- .../cwl/tools/liftover_collapse_rnaseq.cwl | 2 +- .../collapse-rnaseq/cwl/tools/ubuntu_gzip.cwl | 15 +++++++++------ .../cwl/workflows/liftover_collapse_exp_wf.cwl | 2 +- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/analyses/collapse-rnaseq/cwl/tools/convert_to_rds.cwl b/analyses/collapse-rnaseq/cwl/tools/convert_to_rds.cwl index 4c23a8efff..af114244eb 100644 --- a/analyses/collapse-rnaseq/cwl/tools/convert_to_rds.cwl +++ b/analyses/collapse-rnaseq/cwl/tools/convert_to_rds.cwl @@ -1,4 +1,4 @@ -cwlVersion: v1.0 +cwlVersion: v1.2 class: CommandLineTool id: convert-to-rds doc: "Converts tsv table to rds" diff --git a/analyses/collapse-rnaseq/cwl/tools/liftover_collapse_rnaseq.cwl b/analyses/collapse-rnaseq/cwl/tools/liftover_collapse_rnaseq.cwl index 977022f6b2..f0d0b4e08a 100644 --- a/analyses/collapse-rnaseq/cwl/tools/liftover_collapse_rnaseq.cwl +++ b/analyses/collapse-rnaseq/cwl/tools/liftover_collapse_rnaseq.cwl @@ -1,4 +1,4 @@ -cwlVersion: v1.0 +cwlVersion: v1.2 class: CommandLineTool id: liftover-collapse-rnaseq doc: "Tool to use a GTF and expression matrix with ENSG to liftover gene symbols and collapse repeat symbols if needed" diff --git a/analyses/collapse-rnaseq/cwl/tools/ubuntu_gzip.cwl b/analyses/collapse-rnaseq/cwl/tools/ubuntu_gzip.cwl index 29c1d0f64e..2a1f8933f6 100644 --- a/analyses/collapse-rnaseq/cwl/tools/ubuntu_gzip.cwl +++ b/analyses/collapse-rnaseq/cwl/tools/ubuntu_gzip.cwl @@ -1,4 +1,4 @@ -cwlVersion: v1.0 +cwlVersion: v1.2 class: CommandLineTool id: ubuntu-gzip doc: "Simple tool to gzip files" @@ -10,16 +10,14 @@ requirements: - class: InlineJavascriptRequirement - class: InitialWorkDirRequirement listing: - - entryname: files.txt - entry: | - $(inputs.input_files.map(function(e){return e.path}).join('\n')) + - $(inputs.input_files) -baseCommand: [cat] arguments: - position: 1 shellQuote: false valueFrom: >- - files.txt | xargs -IFN -P $(inputs.cores) gzip FN + echo '${return inputs.input_files.map(function(e){return e.path}).join('\n')}' > files.txt + && cat files.txt | xargs -IFN -P $(inputs.cores) gzip FN inputs: input_files: { type: 'File[]', doc: "Files to gzip" } @@ -31,3 +29,8 @@ outputs: outputBinding: glob: '*.gz' doc: "Gzipped inputs" + file_list: + type: File + outputBinding: + glob: 'files.txt' + doc: "More of a debug file" diff --git a/analyses/collapse-rnaseq/cwl/workflows/liftover_collapse_exp_wf.cwl b/analyses/collapse-rnaseq/cwl/workflows/liftover_collapse_exp_wf.cwl index a669b1537b..725f596886 100644 --- a/analyses/collapse-rnaseq/cwl/workflows/liftover_collapse_exp_wf.cwl +++ b/analyses/collapse-rnaseq/cwl/workflows/liftover_collapse_exp_wf.cwl @@ -49,7 +49,7 @@ steps: input_files: source: [liftover_collapse_rnaseq/liftover, liftover_collapse_rnaseq/collapsed] valueFrom: | - $([self]) + $(self) out: [gzipped_files] convert_to_rds: From 2268199ba8ed61acb011660a7ce84c4db94277c2 Mon Sep 17 00:00:00 2001 From: migbro Date: Tue, 14 Mar 2023 14:40:41 -0400 Subject: [PATCH 08/13] :wrench: fix bug introduced by suggested commit :trophy: address PR concerns and suggestions --- .../cwl/scripts/liftover_collapse_rnaseq.py | 107 +++++++++--------- 1 file changed, 53 insertions(+), 54 deletions(-) diff --git a/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py b/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py index 797a07a1d5..a466377a5b 100644 --- a/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py +++ b/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py @@ -9,7 +9,7 @@ from statistics import mean -def update_gtf_dict(in_dict, in_text): +def update_gtf_dict(gtf_dict, in_text, tx_type, tx_desc, id_find, name_find): """ Update ensg - sym ID dictionary on gene entries only """ @@ -18,72 +18,62 @@ def update_gtf_dict(in_dict, in_text): try: gene_id = id_find.match(data[tx_desc]).group(1) gene_name = name_find.match(data[tx_desc]).group(1) - in_dict[gene_id] = gene_name + gtf_dict[gene_id] = gene_name except Exception as e: - sys.stderr.write(str(e) + '\n') - sys.stderr.write("Failed to parse gene ID and name from " + in_text) + print(str(e), file=sys.stderr) + print("Failed to parse gene ID and name from " + in_text, file=sys.stderr) -def liftover(in_data, out_lift, n, l): +def liftover(line_from_infile, gene_sym_ct_dict, g_idx, n_idx, gtf_dict, d_idx): """ Update gene symbol if found in dict, leave alone if not """ - if l % n == 0: - sys.stderr.write("Processed " + str(l) + " lines\n") - sys.stderr.flush() - data = in_data.rstrip('\n').split('\t') + data = line_from_infile.rstrip('\n').split('\t') cur_id = data[g_idx].split('.')[0] if cur_id in gtf_dict: data[n_idx] = gtf_dict[cur_id] gene_sym = data[n_idx] - if gene_sym not in sym_ct: - sym_ct[gene_sym] = 1 + if gene_sym not in gene_sym_ct_dict: + gene_sym_ct_dict[gene_sym] = 1 else: - sym_ct[gene_sym] += 1 - out_data = "{}\t{}\t{}\n".format(cur_id, gene_sym, '\t'.join(data[d_idx:]) - out_lift.write(out_data) - l += 1 - return l + gene_sym_ct_dict[gene_sym] += 1 + out_data = "{}\t{}\t{}\n".format(cur_id, gene_sym, '\t'.join(data[d_idx:])) + return out_data -def collate_dups(in_data, in_dict, ct, out_file, l): +def collate_dups(in_data, dup_gene_sym_dict, ct, gene_sym_ct_dict, d_idx, n_idx): """ - Checks if entry is in dup dict (really smybol seen > 1 in sym_ct), + Checks if entry is in dup dict (really symbol seen > 1 in gene_sym_ct_dict), then add to dict ot process, or output to open file handle as-is if not repeated """ - if l % n == 0: - sys.stderr.write("Processed " + str(l) + " lines\n") - sys.stderr.flush() data = in_data.rstrip('\n').split('\t') gene_sym = data[n_idx] - if sym_ct[gene_sym] > 1: - if gene_sym not in dup_dict: - dup_dict[gene_sym] = [] + out_data = None + if gene_sym_ct_dict[gene_sym] > 1: + if gene_sym not in dup_gene_sym_dict: + dup_gene_sym_dict[gene_sym] = [] ct += 1 # cast as int for later - dup_dict[gene_sym].append(list(map(float, data[d_idx:]))) + dup_gene_sym_dict[gene_sym].append(list(map(float, data[d_idx:]))) else: - out_data = gene_sym + "\t" + "\t".join(data[d_idx:]) + '\n' - out_file.write(out_data) - l += 1 - return ct, l + out_data = "{}\t{}\n".format(gene_sym, '\t'.join(data[d_idx:])) + return ct, out_data -def collapse_dups(dup_dict, out_file): +def collapse_dups(dup_gene_sym_dict): """ Get highest mean for repeated gene symbols and output that row """ - for gene_sym in dup_dict: + for gene_sym in dup_gene_sym_dict: means = [] - for data in dup_dict[gene_sym]: + for data in dup_gene_sym_dict[gene_sym]: means.append(mean(data)) top_idx = means.index(max(means)) - out_data = gene_sym + "\t" + "\t".join(list(map(str, dup_dict[gene_sym][top_idx]))) + '\n' - out_file.write(out_data) + out_data = "{}\t{}\n".format(gene_sym, '\t'.join(list(map(str, dup_gene_sym_dict[gene_sym][top_idx])))) + return out_data -if __name__ == '__main__': - +def main(): parser = argparse.ArgumentParser( description="Script to use a GTF and expression matrix with ENSG to liftover gene symbols." ) @@ -122,16 +112,16 @@ def collapse_dups(dup_dict, out_file): # set up pattern matcher ahead of time for efficiency id_find = re.compile(r"gene_id \"(ENSG\d+)\.\d+\";") name_find = re.compile(r".*gene_name \"(\S+)\";") - sys.stderr.write('Indexing gtf\n') + print('Indexing gtf', file=sys.stderr) with (gzip.open if args.gtf.endswith("gz") else open)(args.gtf, "rt", encoding="utf-8") as gtf: for line in gtf: if not line.startswith('##'): - update_gtf_dict(gtf_dict, line) + update_gtf_dict(gtf_dict, line, tx_type, tx_desc, id_find, name_find) # track sym occurrences if collapse flag given for later processing - sym_ct = {} + gene_sym_ct_dict = {} with (gzip.open if args.table.endswith("gz") else open)(args.table, "rt", encoding="utf-8") as table: if args.skip: - sys.stderr.write('Skipping ' + str(args.skip) + ' lines\n') + print('Skipping ' + str(args.skip) + ' lines', file=sys.stderr) for i in range(args.skip): skip = next(table) head = next(table) @@ -141,38 +131,47 @@ def collapse_dups(dup_dict, out_file): n_idx = header.index(args.gene_name) # make assumption that data will start after ID and name d_idx = max(g_idx, n_idx) + 1 - sys.stderr.write('Processing table\n') - sys.stderr.flush() + print('Processing table', file=sys.stderr) out_lift_fn = args.out + '.liftover.tsv' out_lift = open(out_lift_fn, 'w') # Some counters to track progress n = 10000 l = 0 - out_head = ('gene_id\tgene_name\t' + '\t'.join(header[d_idx:])) + out_head = "{}\t{}\t{}\n".format('gene_id', 'gene_name', '\t'.join(header[d_idx:])) out_lift.write(out_head) for line in table: - l = liftover(line, out_lift, n, l) + if l % n == 0: + print("Processed " + str(l) + " lines", file=sys.stderr) + out_data = liftover(line, gene_sym_ct_dict, g_idx, n_idx, gtf_dict, d_idx) + out_lift.write(out_data) + l += 1 # dump out rest of leftover lines out_lift.close() if args.collapse: - sys.stderr.write("Collapse flag given. Collating dups from liftover\n") - sys.stderr.flush() - dup_dict = {} + print("Collapse flag given. Collating dups from liftover", file=sys.stderr) + dup_gene_sym_dict = {} dup_ct = 0 out_collapse_fn = args.out + '.collapsed.tsv' out_collapse = open(out_collapse_fn, 'w') with (gzip.open if out_lift_fn.endswith("gz") else open)(out_lift_fn, "rt", encoding="utf-8") as lifted: # already got header as array, so can skip splitting skip = next(lifted) - out_head = 'gene_name' + '\t' + '\t'.join(header[d_idx:]) + '\n' + out_head = "{}\t{}\n".format('gene_name', '\t'.join(header[d_idx:])) out_collapse.write(out_head) l = 0 for line in lifted: - dup_ct, l = collate_dups(line, dup_dict, dup_ct, out_collapse, l) - sys.stderr.write("Processing " + str(dup_ct) + " repeat gene symbols and choosing highest mean expression\n") - sys.stderr.flush() - collapse_dups(dup_dict, out_collapse) - sys.stderr.write("Done!\n") - sys.stderr.flush() + dup_ct, out_data = collate_dups(line, dup_gene_sym_dict, dup_ct, gene_sym_ct_dict, d_idx, n_idx) + if l % n == 0: + print("Processed " + str(l) + " lines", file=sys.stderr) + if out_data: + out_collapse.write(out_data) + l += 1 + print("Processing " + str(dup_ct) + " repeat gene symbols and choosing highest mean expression", file=sys.stderr) + out_data = collapse_dups(dup_gene_sym_dict) + out_collapse.write(out_data) + print("Done!", file=sys.stderr) + +if __name__ == '__main__': + main() From dc2e30249517d0130395645b0df44ceba7e1600d Mon Sep 17 00:00:00 2001 From: migbro Date: Tue, 14 Mar 2023 15:26:37 -0400 Subject: [PATCH 09/13] :pencil: implemented format suggestions --- .../cwl/scripts/liftover_collapse_rnaseq.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py b/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py index a466377a5b..fb2bf20c9c 100644 --- a/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py +++ b/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py @@ -121,7 +121,7 @@ def main(): gene_sym_ct_dict = {} with (gzip.open if args.table.endswith("gz") else open)(args.table, "rt", encoding="utf-8") as table: if args.skip: - print('Skipping ' + str(args.skip) + ' lines', file=sys.stderr) + print("Skipping {} lines".format(args.skip), file=sys.stderr) for i in range(args.skip): skip = next(table) head = next(table) @@ -141,7 +141,7 @@ def main(): out_lift.write(out_head) for line in table: if l % n == 0: - print("Processed " + str(l) + " lines", file=sys.stderr) + print("Processed {} lines".format(l), file=sys.stderr) out_data = liftover(line, gene_sym_ct_dict, g_idx, n_idx, gtf_dict, d_idx) out_lift.write(out_data) l += 1 @@ -163,11 +163,11 @@ def main(): for line in lifted: dup_ct, out_data = collate_dups(line, dup_gene_sym_dict, dup_ct, gene_sym_ct_dict, d_idx, n_idx) if l % n == 0: - print("Processed " + str(l) + " lines", file=sys.stderr) + print("Processed {} lines".format(l), file=sys.stderr) if out_data: out_collapse.write(out_data) l += 1 - print("Processing " + str(dup_ct) + " repeat gene symbols and choosing highest mean expression", file=sys.stderr) + print("Processing {} repeat gene symbols and choosing highest mean expression".format(dup_ct), file=sys.stderr) out_data = collapse_dups(dup_gene_sym_dict) out_collapse.write(out_data) print("Done!", file=sys.stderr) From 64909a9d177e5998498f38ba7fa7ff4ee405062d Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Tue, 14 Mar 2023 15:27:42 -0400 Subject: [PATCH 10/13] Update analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py Co-authored-by: Dan Miller --- .../collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py b/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py index fb2bf20c9c..15c6d46eab 100644 --- a/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py +++ b/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py @@ -65,9 +65,7 @@ def collapse_dups(dup_gene_sym_dict): Get highest mean for repeated gene symbols and output that row """ for gene_sym in dup_gene_sym_dict: - means = [] - for data in dup_gene_sym_dict[gene_sym]: - means.append(mean(data)) + means = [mean(data) for data in dup_gene_sym_dict[gene_sym]] top_idx = means.index(max(means)) out_data = "{}\t{}\n".format(gene_sym, '\t'.join(list(map(str, dup_gene_sym_dict[gene_sym][top_idx])))) return out_data From b1588c549da55dade236d4c814943ec114b5d29f Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Tue, 14 Mar 2023 15:31:43 -0400 Subject: [PATCH 11/13] Update analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py Co-authored-by: Dan Miller --- .../cwl/scripts/liftover_collapse_rnaseq.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py b/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py index 15c6d46eab..86582832ca 100644 --- a/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py +++ b/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py @@ -112,9 +112,12 @@ def main(): name_find = re.compile(r".*gene_name \"(\S+)\";") print('Indexing gtf', file=sys.stderr) with (gzip.open if args.gtf.endswith("gz") else open)(args.gtf, "rt", encoding="utf-8") as gtf: - for line in gtf: - if not line.startswith('##'): - update_gtf_dict(gtf_dict, line, tx_type, tx_desc, id_find, name_find) + newline = gtf.readline() + while newline.startswith('##'): + newline = gtf.readline() + while newline != '': + update_gtf_dict(gtf_dict, newline, tx_type, tx_desc, id_find, name_find) + newline = gtf.readline() # track sym occurrences if collapse flag given for later processing gene_sym_ct_dict = {} with (gzip.open if args.table.endswith("gz") else open)(args.table, "rt", encoding="utf-8") as table: From b5fad686dcebd97097896f691058770ca9d98ea4 Mon Sep 17 00:00:00 2001 From: migbro Date: Tue, 14 Mar 2023 15:38:30 -0400 Subject: [PATCH 12/13] :heavy_minus_sign: dropped self --- .../collapse-rnaseq/cwl/workflows/liftover_collapse_exp_wf.cwl | 2 -- 1 file changed, 2 deletions(-) diff --git a/analyses/collapse-rnaseq/cwl/workflows/liftover_collapse_exp_wf.cwl b/analyses/collapse-rnaseq/cwl/workflows/liftover_collapse_exp_wf.cwl index 725f596886..3c8522f568 100644 --- a/analyses/collapse-rnaseq/cwl/workflows/liftover_collapse_exp_wf.cwl +++ b/analyses/collapse-rnaseq/cwl/workflows/liftover_collapse_exp_wf.cwl @@ -48,8 +48,6 @@ steps: in: input_files: source: [liftover_collapse_rnaseq/liftover, liftover_collapse_rnaseq/collapsed] - valueFrom: | - $(self) out: [gzipped_files] convert_to_rds: From 1b5a673ead65ab99380f5b335d170fee365bf9d4 Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Tue, 14 Mar 2023 15:42:08 -0400 Subject: [PATCH 13/13] Update analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py Co-authored-by: Dan Miller --- .../cwl/scripts/liftover_collapse_rnaseq.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py b/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py index 86582832ca..3293a13696 100644 --- a/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py +++ b/analyses/collapse-rnaseq/cwl/scripts/liftover_collapse_rnaseq.py @@ -112,12 +112,12 @@ def main(): name_find = re.compile(r".*gene_name \"(\S+)\";") print('Indexing gtf', file=sys.stderr) with (gzip.open if args.gtf.endswith("gz") else open)(args.gtf, "rt", encoding="utf-8") as gtf: + newline = gtf.readline() + while newline.startswith('##'): + newline = gtf.readline() + while newline != '': + update_gtf_dict(gtf_dict, newline, tx_type, tx_desc, id_find, name_find) newline = gtf.readline() - while newline.startswith('##'): - newline = gtf.readline() - while newline != '': - update_gtf_dict(gtf_dict, newline, tx_type, tx_desc, id_find, name_find) - newline = gtf.readline() # track sym occurrences if collapse flag given for later processing gene_sym_ct_dict = {} with (gzip.open if args.table.endswith("gz") else open)(args.table, "rt", encoding="utf-8") as table: