From e03155af2b39c8b65f600f5576160350fbad4928 Mon Sep 17 00:00:00 2001 From: Eric Wafula Date: Wed, 7 Jun 2023 20:23:34 -0400 Subject: [PATCH] add probe annotations liftover process --- .../01-calculate-tpm-medians.R | 2 +- .../02-calculate-methly-quantiles.R | 2 +- .../03-methyl-tpm-correlation.py | 2 +- .../04-tpm-transcript-representation.py | 2 +- .../05-create-methyl-summary-table.R | 2 +- analyses/methylation-summary/README.md | 15 +++++++++------ 6 files changed, 14 insertions(+), 11 deletions(-) diff --git a/analyses/methylation-summary/01-calculate-tpm-medians.R b/analyses/methylation-summary/01-calculate-tpm-medians.R index 9b559dfa50..5100e594e3 100755 --- a/analyses/methylation-summary/01-calculate-tpm-medians.R +++ b/analyses/methylation-summary/01-calculate-tpm-medians.R @@ -21,7 +21,7 @@ option_list <- list( help = "OpenPedCan rnaseq tpm gene or isoform matrix file", metavar = "character"), make_option(opt_str = "--methyl_probe_annot", type = "character", default = NULL, - help = "Methyl gencode array probe annotation results file", + help = "Methyl gencode array probe annotations", metavar = "character"), make_option(opt_str = "--methyl_independent_samples", type = "character", default = NULL, help = "OpenPedCan methyl independent biospecimen list file", diff --git a/analyses/methylation-summary/02-calculate-methly-quantiles.R b/analyses/methylation-summary/02-calculate-methly-quantiles.R index ff627e4cae..4914738b21 100755 --- a/analyses/methylation-summary/02-calculate-methly-quantiles.R +++ b/analyses/methylation-summary/02-calculate-methly-quantiles.R @@ -20,7 +20,7 @@ option_list <- list( help = "OPenPedCan methyl beta-values or m-values matrix file", metavar = "character"), make_option(opt_str = "--methyl_probe_annot", type = "character", default = NULL, - help = "Methyl gencode array probe annotation results file", + help = "Methyl gencode array probe annotations", metavar = "character"), make_option(opt_str = "--independent_samples", type = 
"character", default = NULL, help = "OpenPedCan methyl independent biospecimen list file", diff --git a/analyses/methylation-summary/03-methyl-tpm-correlation.py b/analyses/methylation-summary/03-methyl-tpm-correlation.py index 25f2d337f1..8749c48667 100755 --- a/analyses/methylation-summary/03-methyl-tpm-correlation.py +++ b/analyses/methylation-summary/03-methyl-tpm-correlation.py @@ -35,7 +35,7 @@ def read_parameters(): p.add_argument('METHYL_INDEPENDENT_SAMPLES', type=str, default=None, help="OPenPedCan methyl independent biospecimen list file\n\n") p.add_argument('METHLY_MATRIX', type=str, default=None, help="OpenPedCan methyl beta-values or m-values matrix file\n\n") p.add_argument('EXP_MATRIX', type=str, default=None, help="OPenPedCan expression matrix file\n\n") - p.add_argument('PROBE_ANNOT', type=str, default=None, help="Methylation aaray probe gencode annotation results file\n\n") + p.add_argument('PROBE_ANNOT', type=str, default=None, help="Methyl gencode array probe annotations\n\n") p.add_argument('-m', '--methyl_values', type=str, default='beta', choices=METHLY_VALUES, help="OpenPedCan methly matrix values: beta (default) and m\n\n") p.add_argument('-e', '--exp_values', type=str, default='gene', choices=EXP_TYPE, help="OpenPedCan expression matrix values: gene (default) and isoform\n\n") p.add_argument('-v', '--version', action='version', version="03-methyl-tpm-correlation.py version {} ({})".format(__version__, __date__), help="Print the current 03-methyl-tpm-correlation.py version and exit\n\n") diff --git a/analyses/methylation-summary/04-tpm-transcript-representation.py b/analyses/methylation-summary/04-tpm-transcript-representation.py index de514ca0af..39e3ac7b66 100755 --- a/analyses/methylation-summary/04-tpm-transcript-representation.py +++ b/analyses/methylation-summary/04-tpm-transcript-representation.py @@ -31,7 +31,7 @@ def read_parameters(): p.add_argument('METHYL_INDEPENDENT_SAMPLES', type=str, default=None, help="OPenPedCan methyl 
independent biospecimen list file\n\n") p.add_argument('GENE_EXP_MATRIX', type=str, default=None, help="OPenPedCan gene expression matrix file\n\n") p.add_argument('ISOFORM_EXP_MATRIX', type=str, default=None, help="OPenPedCan isoform expression matrix file\n\n") - p.add_argument('PROBE_ANNOT', type=str, default=None, help="Methylation array probe gencode annotation results file\n\n") + p.add_argument('PROBE_ANNOT', type=str, default=None, help="Methyl gencode array probe annotations\n\n") p.add_argument('-v', '--version', action='version', version="04-tpm-transcript-representation.py version {} ({})".format(__version__, __date__), help="Print the current 04-tpm-transcript-representation.py version and exit\n\n") return p.parse_args() diff --git a/analyses/methylation-summary/05-create-methyl-summary-table.R b/analyses/methylation-summary/05-create-methyl-summary-table.R index 27baf0c95c..5e6e59ee49 100755 --- a/analyses/methylation-summary/05-create-methyl-summary-table.R +++ b/analyses/methylation-summary/05-create-methyl-summary-table.R @@ -25,7 +25,7 @@ option_list <- list( help = "Methyl array probe beta/m-values quantiles results file", metavar = "character"), make_option(opt_str = "--methyl_probe_annot", type = "character", default = NULL, - help = "Methyl gencode array probe annotation results file", + help = "Methyl gencode array probe annotations", metavar = "character"), make_option(opt_str = "--rnaseq_tpm_medians", type = "character", default = NULL, help = "RNA-Seq gene-level or isoform-level tmp median expression results file", diff --git a/analyses/methylation-summary/README.md b/analyses/methylation-summary/README.md index ecc5fdc5b6..c7be6a8432 100755 --- a/analyses/methylation-summary/README.md +++ b/analyses/methylation-summary/README.md @@ -2,7 +2,10 @@ ## Purpose -Summarize preprocessed `Illumina Infinium HumanMethylation` array measurements produced by the [OpenPedCan methylation-preprocessing 
module](https://github.com/PediatricOpenTargets/OpenPedCan-analysis/tree/dev/analyses/methylation-preprocessing) and [Illumina infinium methylation array CpG probe coordinates](https://support.illumina.com/array/array_kits/infinium-methylationepic-beadchip-kit/downloads.html) lifted-over from GRCh37 to GRCh38 build and annotated with GENCODE v39 release that is currently utilized in the OpenPedCan data analyses. +Summarize preprocessed `Illumina Infinium Human Methylation` array measurements produced by the [OpenPedCan methylation-preprocessing module](https://github.com/PediatricOpenTargets/OpenPedCan-analysis/tree/dev/analyses/methylation-preprocessing) and [Illumina infinium methylation array CpG probe coordinates](https://support.illumina.com/array/array_kits/infinium-methylationepic-beadchip-kit/downloads.html) lifted over from `GRCh37` to `GRCh38` build and annotated with the [GENCODE v39 release](https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_39/) that is currently utilized in the OpenPedCan data analyses. + +## Methylation array CpG probe coordinates liftover +The 450K and EPIC Illumina Infinium methylation array CpG probe coordinates are based on the `Human Build 37 (GRCh37/hg19)` genome assembly. Probe coordinates were converted to `Human Build 38 (GRCh38/hg38)` using the [ENSEMBL Assembly Converter tool](https://useast.ensembl.org/Homo_sapiens/Tools/AssemblyConverter). The probe annotation file currently used in the module analyses, `infinium.gencode.v39.probe.annotations.tsv`, was created by annotating all the probes that were lifted over with their associated gene features (i.e., `promoter`, `5' UTR`, `exon`, `intron`, `3'UTR`, and `intergenic`) based on the `GENCODE v39` release. Intron coordinates, typically not included in the GFF3/GTF genome annotation formats, were added to the GENCODE annotations file using [GenomeTools](http://genometools.org/). 
Probe locations were then assigned with their intersecting gene annotation features with [bedtools](https://bedtools.readthedocs.io/en/latest/content/bedtools-suite.html). ## Analysis scripts @@ -19,7 +22,7 @@ Options: OpenPedCan rnaseq tpm gene or isoform matrix file --methyl_probe_annot=CHARACTER - Methyl gencode array probe annotation results file + Methyl gencode array probe annotations --methyl_independent_samples=CHARACTER OpenPedCan methyl independent biospecimen list file @@ -46,7 +49,7 @@ Options: OPenPedCan methyl beta-values or m-values matrix file --methyl_probe_annot=CHARACTER - Methyl gencode array probe annotation results file + Methyl gencode array probe annotations --independent_samples=CHARACTER OpenPedCan methyl independent biospecimen list file @@ -78,7 +81,7 @@ positional arguments: EXP_MATRIX OPenPedCan expression matrix file - PROBE_ANNOT Methylation aaray probe gencode annotation results file + PROBE_ANNOT Methyl gencode array probe annotations optional arguments: -h, --help show this help message and exit @@ -129,7 +132,7 @@ positional arguments: ISOFORM_EXP_MATRIX OPenPedCan isoform expression matrix file - PROBE_ANNOT Methylation aaray probe gencode annotation results file + PROBE_ANNOT Methyl gencode array probe annotations -v, --version Print the current 04-tpm-transcript-representation.py version and exit ``` @@ -168,7 +171,7 @@ Options: Methyl array probe beta/m-values quantiles results file --methyl_probe_annot=CHARACTER - Methyl gencode array probe annotation results file + Methyl gencode array probe annotations --rnaseq_tpm_medians=CHARACTER RNA-Seq gene-level or isoform-level tmp median expression results file