diff --git a/BALSAMIC/snakemake_rules/annotation/vcf2cytosure_convert.rule b/BALSAMIC/snakemake_rules/annotation/vcf2cytosure_convert.rule index 51d5ba88b..cc3110cdd 100644 --- a/BALSAMIC/snakemake_rules/annotation/vcf2cytosure_convert.rule +++ b/BALSAMIC/snakemake_rules/annotation/vcf2cytosure_convert.rule @@ -70,7 +70,7 @@ elif config["analysis"]["sequencing_type"] == "wgs" and config["analysis"]["anal message: "Converting CNVs from VCF to the CGH format using vcf2cytosure for {params.case_name}" shell: """ -grep -E "#|PASS" {input.ascat_vcf} | bgzip -l 9 -c > {output.ascat_vcf}; +zgrep -E "#|PASS" {input.ascat_vcf} | bgzip -l 9 -c > {output.ascat_vcf}; vcf2cytosure --vcf {output.ascat_vcf} --coverage {input.tiddit_cov_tumor} --out {output.cgh_tumor} --sex {params.gender} --bins 20 diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule index e054a1f4f..e737bcd13 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule @@ -22,7 +22,7 @@ rule manta_tumor_normal: tumor = get_sample_type(config["samples"], "tumor"), normal = get_sample_type(config["samples"], "normal"), case_name = config["analysis"]["case_id"], - manta_install_path = "/opt/conda/share/manta-1.6.0-1" + manta_install_path = "/opt/conda/share/manta-1.6.0-2" threads: get_threads(cluster_config, "manta_tumor_normal") message: diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule index db701f6c7..45914185c 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule @@ -19,7 +19,7 @@ rule manta_tumor_only: runmode = "local", tumor = get_sample_type(config["samples"], "tumor"), case_name = config["analysis"]["case_id"], 
- manta_install_path= "/opt/conda/share/manta-1.6.0-1" + manta_install_path= "/opt/conda/share/manta-1.6.0-2" threads: get_threads(cluster_config, "manta_tumor_only") message: diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f0b8c8a3b..c323d858e 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -37,6 +37,7 @@ Fixed: * `run_validate.sh` script https://github.com/Clinical-Genomics/BALSAMIC/pull/952 * Somatic SV tumor normal rules https://github.com/Clinical-Genomics/BALSAMIC/pull/959 * Missing `genderChr` flag for `ascat_tumor_normal` rule https://github.com/Clinical-Genomics/BALSAMIC/pull/963 +* Command in vcf2cytosure rule and updated ReadtheDocs https://github.com/Clinical-Genomics/BALSAMIC/pull/966 Removed ^^^^^^^ diff --git a/docs/FAQs.rst b/docs/FAQs.rst index e712acf41..f0ea33aa5 100644 --- a/docs/FAQs.rst +++ b/docs/FAQs.rst @@ -96,59 +96,3 @@ Make a pull request to master at this point. After pull request is approved and - Never force rebase commits into either `master` or `develop` branches. - When merging pull requests commits into `master` branch, use **Create a merge commit**, which helps to capture all the commit history. On contrary, when merging pull requests into `develop` branch, use **Squash and merge** button, which combines multiple commits messages into one commit. -**References** -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -**How to generate reference files for ascatNGS** - -Detailed information is available from `ascatNGS <https://github.com/cancerit/ascatNgs>`_ documentation - -Briefly, ascatNGS needs gender loci file if gender information for the input sample is not available. The second file is *SnpGcCorrections.tsv*, which is prepared from the 1000 genome SNP panel. - -1. **Gender loci file:** - -GRCh37d5_Y.loci contains the following contents: - -.. line-block:: - Y 4546684 - Y 2934912 - Y 4550107 - Y 4549638 - - -2. **GC correction file:** - -First step is to download the 1000 genome snp file and convert it from .vcf to .tsv. 
The detailed procedure to for this step is available from `ascatNGS-reference-files <https://github.com/cancerit/ascatNgs/wiki/Human-reference-files-from-1000-genomes-VCFs>`_ (Human reference files from 1000 genomes VCFs) - -.. code-block:: - - export TG_DATA=ftp://ftp.ensembl.org/pub/grch37/release-83/variation/vcf/homo_sapiens/1000GENOMES-phase_3.vcf.gz - - -Followed by: - -.. code-block:: - - curl -sSL $TG_DATA | zgrep -F 'E_Multiple_observations' | grep -F 'TSA=SNV' |\ - perl -ane 'next if($F[0] !~ m/^\d+$/ && $F[0] !~ m/^[XY]$/);\ - next if($F[0] eq $l_c && $F[1]-1000 < $l_p); $F[7]=~m/MAF=([^;]+)/;\ - next if($1 < 0.05); printf "%s\t%s\t%d\n", $F[2],$F[0],$F[1];\ - $l_c=$F[0]; $l_p=$F[1];' > SnpPositions_GRCh37_1000g.tsv - - ---or-- - -.. code-block:: - - curl -sSL $TG_DATA | zgrep -F 'E_Multiple_observations' | grep -F 'TSA=SNV' |\ - perl -ane 'next if($F[0] !~ m/^\d+$/ && $F[0] !~ m/^[XY]$/); $F[7]=~m/MAF=([^;]+)/;\ - next if($1 < 0.05); next if($F[0] eq $l_c && $F[1]-1000 < $l_p);\ - printf "%s\t%s\t%d\n", $F[2],$F[0],$F[1]; $l_c=$F[0]; $l_p=$F[1];'\ - > SnpPositions_GRCh37_1000g.tsv - -Second step is to use *SnpPositions.tsv* file and generate *SnpGcCorrections.tsv* file, more details see `ascatNGS-convert-snppositions <https://github.com/cancerit/ascatNgs/wiki/Convert-SnpPositions.tsv-to-SnpGcCorrections.tsv>`_ - -.. code-block:: - - ascatSnpPanelGcCorrections.pl genome.fa SnpPositions.tsv > SnpGcCorrections.tsv - diff --git a/docs/README.rst b/docs/README.rst index 2872ebd20..5a7737b74 100644 --- a/docs/README.rst +++ b/docs/README.rst @@ -1,5 +1,5 @@ ========= -Build Doc +Build doc ========= Following steps explains how to build documents locally. 
diff --git a/docs/balsamic_annotation.rst b/docs/balsamic_annotation.rst index 8c19e2326..0a5b932ed 100644 --- a/docs/balsamic_annotation.rst +++ b/docs/balsamic_annotation.rst @@ -1,5 +1,5 @@ *********************************** -Annotation Resources +Annotation resources *********************************** BALSAMIC annotates somatic single nucleotide variants (SNVs) using ``ensembl-vep`` and ``vcfanno``. Somatic structural variants (SVs), somatic copy-number variants (CNVs) and germline single nucleotide variants are annotated using only ``ensembl-vep``. All SVs and CNVs are merged using ``SVDB`` before annotating for `Target Genome Analysis (TGA)` or `Whole Genome Sequencing (WGS)` analyses. diff --git a/docs/balsamic_filters.rst b/docs/balsamic_filters.rst index 83b8125b7..b33780aa8 100644 --- a/docs/balsamic_filters.rst +++ b/docs/balsamic_filters.rst @@ -1,9 +1,52 @@ *********************************** -Calling and Filtering Variants +Calling and filtering variants *********************************** -In BALSAMIC, various bioinfo tools are integrated for reporting somatic and germline variants. Also, the choice of these tools differs between the type of analysis, -e.g.: `Target Genome Analysis (TGA)` or analysis of `Whole Genome Sequencing (WGS)`. Various filters (Pre-call and Post-call filtering) are applied at different levels to report high-confidence variant calls. +In BALSAMIC, various bioinfo tools are integrated for reporting somatic and germline variants summarized in the table below. The choice of these tools differs between the type of analysis, `Target Genome Analysis (TGA)` or analysis of `Whole Genome Sequencing (WGS)`. + + +.. 
list-table:: SNV and small-Indel callers + :widths: 22 27 25 20 20 + :header-rows: 1 + + * - Variant caller + - Sequencing type + - Analysis type + - Somatic/Germline + - Variant type + * - DNAscope + - WGS + - tumor-normal, tumor-only + - germline + - SNV, InDel + * - TNhaplotyper + - TGA, WES, WGS :superscript:`1` + - tumor-normal, tumor-only + - somatic + - SNV, InDel + * - TNscope :superscript:`2` + - WGS + - tumor-normal, tumor-only + - somatic + - SNV, InDel + * - TNScope_umi + - TGA, WGS + - tumor-normal, tumor-only + - somatic, germline + - SNV, InDel + * - VarDict + - TGA, WGS + - tumor-normal, tumor-only + - somatic + - SNV, InDel + +:superscript:`1` TNhaplotyper is only executed for tumor-only if a WGS case is being analysed + +:superscript:`2` TNscope output is being merged with TNhaplotyper calls for TO-WGS analysis + + + +Various filters (Pre-call and Post-call filtering) are applied at different levels to report high-confidence variant calls. **Pre-call filtering** is where the variant-calling tool decides not to add a variant to the VCF file if the default filters of the variant-caller did not pass the filter criteria. The set of default filters differs between the various variant-calling algorithms. 
diff --git a/docs/balsamic_methods.rst b/docs/balsamic_methods.rst index a51b18a3d..ee1bab85c 100644 --- a/docs/balsamic_methods.rst +++ b/docs/balsamic_methods.rst @@ -1,6 +1,6 @@ -======== -Methods -======== +=================== +Method description +=================== Target Genome Analysis ~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/balsamic_sv_cnv.rst b/docs/balsamic_sv_cnv.rst index 82529b7c4..146bccf79 100644 --- a/docs/balsamic_sv_cnv.rst +++ b/docs/balsamic_sv_cnv.rst @@ -1,5 +1,5 @@ ************************************ -Structural and Copy Number Variants +Structural and Copy Number variants ************************************ Depending on the sequencing type, BALSAMIC is currently running the following structural and copy number variant callers: @@ -42,6 +42,8 @@ Depending on the sequencing type, BALSAMIC is currently running the following st Further details about a specific caller can be found in the links for the repositories containing the documentation for SV and CNV callers along with the links for the articles are listed in `bioinfo softwares <https://github.com/Clinical-Genomics/BALSAMIC/blob/master/docs/bioinfo_softwares.rst>`_. +From BALSAMIC version >= 10.0.0, it is mandatory to provide the gender of the sample for CNV analysis. + The copy number variants, identified using ascatNgs and `dellycnv`, are converted to deletion and duplications before they are merged using `SVDB` with `--bnd_distance = 5000` (distance between end points for the variants from different callers) and `--overlap = 0.80` (percentage for overlapping bases for the variants from different callers). `SVDB` prioritizes the merging of variants from SV and CNV callers to fetch position and genotype information, in the following order: .. 
list-table:: SVDB merge caller priority order @@ -81,4 +83,52 @@ The following command can be used to fetch the variants identified by a specific :: - zgrep -E "#|<Caller>" <*.svdb.vcf.gz> \ No newline at end of file + zgrep -E "#|<Caller>" <*.svdb.vcf.gz> + + + +**Genome Reference Files** +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +**How to generate genome reference files for ascatNGS** + +Detailed information is available from `ascatNGS <https://github.com/cancerit/ascatNgs>`_ documentation + +The file *SnpGcCorrections.tsv* is prepared from the 1000 genome SNP panel. + +**GC correction file:** + +The first step is to download the 1000 genome SNP file and convert it from .vcf to .tsv. The detailed procedure for this step is available from `ascatNGS-reference-files <https://github.com/cancerit/ascatNgs/wiki/Human-reference-files-from-1000-genomes-VCFs>`_ (Human reference files from 1000 genomes VCFs) + +.. code-block:: + + export TG_DATA=ftp://ftp.ensembl.org/pub/grch37/release-83/variation/vcf/homo_sapiens/1000GENOMES-phase_3.vcf.gz + + +Followed by: + +.. code-block:: + + curl -sSL $TG_DATA | zgrep -F 'E_Multiple_observations' | grep -F 'TSA=SNV' |\ + perl -ane 'next if($F[0] !~ m/^\d+$/ && $F[0] !~ m/^[XY]$/);\ + next if($F[0] eq $l_c && $F[1]-1000 < $l_p); $F[7]=~m/MAF=([^;]+)/;\ + next if($1 < 0.05); printf "%s\t%s\t%d\n", $F[2],$F[0],$F[1];\ + $l_c=$F[0]; $l_p=$F[1];' > SnpPositions_GRCh37_1000g.tsv + + +--or-- + +.. 
code-block:: + + curl -sSL $TG_DATA | zgrep -F 'E_Multiple_observations' | grep -F 'TSA=SNV' |\ + perl -ane 'next if($F[0] !~ m/^\d+$/ && $F[0] !~ m/^[XY]$/); $F[7]=~m/MAF=([^;]+)/;\ + next if($1 < 0.05); next if($F[0] eq $l_c && $F[1]-1000 < $l_p);\ + printf "%s\t%s\t%d\n", $F[2],$F[0],$F[1]; $l_c=$F[0]; $l_p=$F[1];'\ + > SnpPositions_GRCh37_1000g.tsv + +The second step is to use the *SnpPositions.tsv* file to generate the *SnpGcCorrections.tsv* file; for more details see `ascatNGS-convert-snppositions <https://github.com/cancerit/ascatNgs/wiki/Convert-SnpPositions.tsv-to-SnpGcCorrections.tsv>`_ + +.. code-block:: + + ascatSnpPanelGcCorrections.pl genome.fa SnpPositions.tsv > SnpGcCorrections.tsv + diff --git a/docs/bioinfo_softwares.rst b/docs/bioinfo_softwares.rst index b172ec17d..8a806e55c 100644 --- a/docs/bioinfo_softwares.rst +++ b/docs/bioinfo_softwares.rst @@ -1,5 +1,5 @@ ================================= -List of bioinformatics software +Tools and software ================================= BALSAMIC ( **version** = 9.0.1 ) uses myriad of tools and softwares to analyze fastq files. This section covers why each diff --git a/docs/cli_package.rst b/docs/cli_package.rst index e446f96a7..2a6b58972 100644 --- a/docs/cli_package.rst +++ b/docs/cli_package.rst @@ -1,7 +1,7 @@ ============= -CLI reference +CLI usage ============= .. 
click:: BALSAMIC.commands.base:cli :prog: BALSAMIC - :show-nested: + :nested: full diff --git a/docs/conf.py b/docs/conf.py index 34a287b69..8b5020b6e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -13,7 +13,7 @@ import os import sys -sys.path.insert(0, os.path.abspath("../")) +sys.path.insert(0, os.path.abspath("..")) # -- Project information ----------------------------------------------------- @@ -31,7 +31,7 @@ "sphinx.ext.mathjax", "sphinx.ext.viewcode", "sphinxcontrib.napoleon", - "sphinx_click.ext", + "sphinx_click", "sphinxarg.ext", "recommonmark", ] diff --git a/docs/git_etiquette.rst b/docs/git_etiquette.rst index 4c285806f..d72ed3b62 100644 --- a/docs/git_etiquette.rst +++ b/docs/git_etiquette.rst @@ -1,5 +1,5 @@ ============= -Git Etiquette +Git etiquette ============= It is recommended to follow a system to standardize the commit messages loosely. Following up from commit messages discussed on https://github.com/Clinical-Genomics/development/pull/97 , the format below is recommended for commit messages: diff --git a/docs/history.rst b/docs/history.rst index 12ee6b6f1..4de03af31 100644 --- a/docs/history.rst +++ b/docs/history.rst @@ -1,4 +1,4 @@ -CHANGELOG +Changelog ========= .. include:: ../CHANGELOG.rst diff --git a/docs/index.rst b/docs/index.rst index b4aca47b4..e5355b71f 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -8,11 +8,12 @@ install user_guide + cli_package .. toctree:: - :caption: Resources - :name: resources + :caption: Detailed documentation + :name: detailed documentation :hidden: :maxdepth: 1 @@ -20,26 +21,10 @@ balsamic_sv_cnv balsamic_annotation balsamic_methods + history bioinfo_softwares -.. toctree:: - :caption: CLI reference - :name: api_cli_reference - :hidden: - :maxdepth: 1 - - cli_package - -.. toctree:: - :caption: Other Info - :name: other_info - :hidden: - :maxdepth: 1 - - history - resources - .. 
toctree:: :caption: Development guide :name: development_guide @@ -51,3 +36,4 @@ README semver FAQs + resources diff --git a/docs/requirements.txt b/docs/requirements.txt index 2fefe4578..77590bced 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -4,7 +4,7 @@ docutils>=0.14,<0.18 recommonmark==0.7.1 sphinx==4.2.0 sphinx-argparse==0.3.1 -sphinx-click==3.0.1 +sphinx-click==3.0.2 sphinx_rtd_theme==1.0.0 sphinxcontrib-napoleon==0.7 furo==2021.10.9 diff --git a/docs/resources.rst b/docs/resources.rst index b8839a9f9..7ebe1c640 100644 --- a/docs/resources.rst +++ b/docs/resources.rst @@ -1,11 +1,8 @@ -=============== +================ Other resources -=============== +================ -Resources ---------- - *Main resources including knowledge base and databases necessary for pipeline development* diff --git a/docs/snakemake_etiquette.rst b/docs/snakemake_etiquette.rst index 462a93ce9..27b064770 100644 --- a/docs/snakemake_etiquette.rst +++ b/docs/snakemake_etiquette.rst @@ -1,5 +1,5 @@ =================== -Snakemake Etiquette +Snakemake etiquette =================== The bioinformatics core analysis in BALSAMIC is defined by set of rules written as a Snakemake rules (``*.rule``) and Snakemake