diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 00000000..a4de0bff --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# Every target is forwarded to $(SPHINXBUILD) in "make mode" (-M). + +# You can set these variables from the command line, and also +# from the environment for the first two (they are assigned with ?=). +SPHINXOPTS ?= -W +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/fusion.md b/docs/fusion.md new file mode 100644 index 00000000..ad2395de --- /dev/null +++ b/docs/fusion.md @@ -0,0 +1,30 @@ +# fusion + +The `fusion` module uses [Arriba](https://github.com/suhrig/arriba) to call fusion events. + +## Tools +This module uses the bam file from [STAR](https://github.com/alexdobin/STAR) to +call fusion events. + +The fusion events are filtered based on the `blacklist` from Arriba itself. Also, only fusions where at least one of the involved genes is in `report_genes` will be included in the final output. + +For each fusion event that remains after filtering, we also generate a figure using the `draw_fusions.R` script provided by Arriba. + +## Input +The input for this module is a single bam file, generated by STAR per sample, specified in a PEP configuration file, as is shown [here](../test/pep/chrM-bam.csv). + +## Output +The output of this module are a JSON file with an overview of the most important results, as well as a number of other output files: +- The final Arriba output file, after filtering. 
+- One figure per fusion event + +## configuration +| Option | description | required | +| --------------------------- | --------------------------------------- | -------- | +| `genome_fasta` | Reference genome, in FASTA format | yes | +| `gtf` | GTF file with transcript information | yes | +| `blacklist` | File of blacklisted variants | yes | +| `known_fusions` | A file of known fusion events | yes | +| `report_genes` | A file of genes to report fusions for | yes | +| `cytobands` | A file with cytoband information | yes | +| `protein_domains` | A file with protein domains | yes | diff --git a/docs/itd.md b/docs/itd.md new file mode 100644 index 00000000..c0c4c631 --- /dev/null +++ b/docs/itd.md @@ -0,0 +1,28 @@ +# itd + +The `itd` module is responsible for finding Internal Tandem Duplications in select genes, specifically *FLT3* and *KMT2A*. + +## Tools +First, this module uses [bwa](https://github.com/lh3/bwa) to align the trimmed reads to a custom reference, which contains the transcript sequence of *FLT3* and *KMT2A*. Next, a custom tool, [rose-dt](https://git.lumc.nl/hem/rose-dt), +is used to detect and visualise Internal Tandem Duplications, using evidence from soft-clipped reads. + +## Input +The input for this module is a single pair of FastQ files per sample, specified in a PEP configuration file, as is shown [here](../test/pep/itd.csv). + +## Output +The output of this module is a JSON file with an overview of the most important results, as well as a number of other output files: +- For both *FLT3* and *KMT2A*, a .csv file with the detected tandem duplications. +- For both *FLT3* and *KMT2A*, a figure to visualise the detected tandem duplications. + +## configuration +The configuration for this module is tailored to the provided reference files; be very careful if you want to modify any of these settings. 
+ +| Option | description | required | +| --------------------------- | --------------------------------------------- | -------- | +| `fasta` | The fasta file, which contains FLT3 and KMT2A | yes | +| `flt3_name` | The name of the FLT3 sequence | yes | +| `flt3_start` | The start of the FLT3 region to investigate | yes | +| `flt3_end` | The end of the FLT3 region to investigate | yes | +| `kmt2a_name` | The name of the KMT2A sequence | yes | +| `kmt2a_start` | The start of the KMT2A region to investigate | yes | +| `kmt2a_end` | The end of the KMT2A region to investigate | yes | diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 00000000..6247f7e2 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,37 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + REM Undo the pushd above so the caller's working directory is restored + popd + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/qc-seq.md b/docs/qc-seq.md new file mode 100644 index 00000000..9e05bab6 --- /dev/null +++ b/docs/qc-seq.md @@ -0,0 +1,25 @@ +# qc-seq + +The `qc-seq` module is responsible for removing adapter sequences and low +quality reads, and generating read-level statistics. It also merges the FastQ +files per sample, so they can be used by the other modules. Every set of FastQ +files can be analysed in parallel. 
+ +## Tools +This module uses [cutadapt](https://cutadapt.readthedocs.io/en/stable/) to remove adapter sequences and low quality bases. +[FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) is used to generate detailed quality statistics. + +## Input +The input for this module is one or more pairs of FastQ files per sample, specified in a PEP configuration file, as is shown [here](../test/pep/chrM-trio-subsamples.csv). + +## Output +The output of this module is one set of merged FastQ files per sample, as well as a JSON file with statistics. + +## configuration +The only configurable options for this module are the adapter sequences for +[cutadapt](https://cutadapt.readthedocs.io/en/stable/) to remove. + +| Option | description | required | +| --------------- | ---------------------------- | -------- | +| `forward_adapter` | The forward adapter sequence | yes | +| `reverse_adapter` | The reverse adapter sequence | yes | diff --git a/docs/snv-indels.md b/docs/snv-indels.md new file mode 100644 index 00000000..3c2ecc93 --- /dev/null +++ b/docs/snv-indels.md @@ -0,0 +1,47 @@ +# snv-indels + +The `snv-indels` module is responsible for aligning the reads to the reference, and calling SNVs and insertions/deletions. + +## Tools +This module uses [STAR](https://github.com/alexdobin/STAR) to align the reads to the reference using twopass mode. [VarDict](https://github.com/AstraZeneca-NGS/VarDictJava) is used to call variants, which are annotated using [VEP](https://www.ensembl.org/info/docs/tools/vep/index.html). +For each variant, this module determines if it is located inside one of the defined `bed_variant_hotspots`. + +The variants annotated by VEP are then filtered based on a number of different criteria: +1. Variants that are present on the `blacklist` are excluded. +2. Only variants that are present on one of the specified transcripts in + `ref_id_mapping` are included. +3. Only variants that match one of the consequences defined in + `vep_include_consequence` are included. 
+4. Variants that have a population frequency of more than 1% in the `gnomADe` + population are excluded. + +Picard is used to generate various alignment statistics. + +## Input +The input for this module is a single pair of FastQ files per sample, specified in a PEP configuration file, as is shown [here](../test/pep/targetted.csv). + +## Output +The output of this module is a JSON file with an overview of the most important results, as well as a number of other output files: +- A .bam and .bai per sample, which contain the aligned reads. +- A VEP output file (`vep_high`), which contains the final set of filtered variants. +- A VEP output file (`vep_target`), which contains the variants on the transcripts of interest. These variants have not been filtered on `vep_include_consequence` terms. +- A VCF file that only contains those variants that fall in one of the `bed_variant_hotspots` regions. + +## configuration + +| Option | description | required | +| --------------------------- | --------------------------------------- | -------- | +| `forward_adapter` | The forward adapter sequence | yes | +| `reverse_adapter` | The reverse adapter sequence | yes | +| `genome_fasta` | Reference genome, in FASTA format | yes | +| `genome_fai` | .fai index for the reference | yes | +| `genome_dict` | .dict index for the reference | yes | +| `star_index` | STAR index database | yes | +| `ref_id_mapping` | File of transcripts of interest | yes | +| `rrna_refflat` | File of rRNA transcripts | yes | +| `bed_variant_hotspots` | BED file of hotspot regions | yes | +| `bed_variant_call_regions` | BED file of regions to call variants | yes | +| `gtf` | GTF file with transcripts, used by STAR | yes | +| `annotation_refflat` | File used to determine exon coverage | yes | +| `blacklist` | File of blacklisted variants | yes | +| `vep_include_consequence` | List of [VEP consequences](https://www.ensembl.org/info/genome/variation/prediction/predicted_data.html) to include | yes | diff --git 
a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 00000000..97b5bedf --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,35 @@ +# Configuration file for the Sphinx documentation builder. + +# -- Project information + +project = 'HAMLET' +copyright = '2018, LUMC' +author = 'Wibowo Arindrarto, Redmar van den Berg' + +release = '2.0' +version = '2.0.0' + +# -- General configuration + +extensions = [ + 'sphinx.ext.duration', + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.intersphinx', +] + +intersphinx_mapping = { + 'python': ('https://docs.python.org/3/', None), + 'sphinx': ('https://www.sphinx-doc.org/en/master/', None), +} +intersphinx_disabled_domains = ['std'] + +templates_path = ['_templates'] +master_doc = 'index' + +# -- Options for HTML output + +html_theme = 'sphinx_rtd_theme' + +# -- Options for EPUB output +epub_show_urls = 'footnote' diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 00000000..a200d8df --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,5 @@ +Welcome to the documentation for HAMLET +================================================ + +This is currently a placeholder. You can see the full documentation +on the `HAMLET github page`. diff --git a/test/test_docs.yml b/test/test_docs.yml new file mode 100644 index 00000000..d32b3bf3 --- /dev/null +++ b/test/test_docs.yml @@ -0,0 +1,8 @@ +# Test generating the report in html format +- name: test-docs + tags: + - hamlet + - docs + command: make -C docs/ html + files: + - path: docs/build/html/genindex.html