Skip to content
This repository has been archived by the owner on Jan 27, 2020. It is now read-only.

Remapping #717

Merged
merged 17 commits into from
Feb 20, 2019
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,16 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

## [Unreleased]

### `Added`

- [#717](https://github.com/SciLifeLab/Sarek/pull/717) - New `remapping` step to add the possibility to remap BAM files

### `Changed`

- [#710](https://github.com/SciLifeLab/Sarek/pull/710) - Improve release checklist and script
- [#711](https://github.com/SciLifeLab/Sarek/pull/711) - Improve configuration priorities
- [#717](https://github.com/SciLifeLab/Sarek/pull/717) - `fastqFiles` renamed to `inputFiles`
- [#717](https://github.com/SciLifeLab/Sarek/pull/717) - `MapReads` for remapping will now convert BAM to FASTQ and feed it to BWA on the fly

## [2.2.2] - 2018-12-19

Expand Down
94 changes: 69 additions & 25 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -80,20 +80,21 @@ if (!params.sample && !params.sampleDir) {
if (params.test || step != 'mapping') tsvPath = tsvPaths[step]
}

// Set up the fastqFiles and bamFiles channels. One of them remains empty
fastqFiles = Channel.empty()
// Set up the inputFiles and bamFiles channels. One of them will remain empty
inputFiles = Channel.empty()
bamFiles = Channel.empty()
if (tsvPath) {
tsvFile = file(tsvPath)
switch (step) {
case 'mapping': fastqFiles = extractFastq(tsvFile); break
case 'mapping': inputFiles = extractFastq(tsvFile); break
case 'remapping': inputFiles = extractInputBAM(tsvFile); break
case 'recalibrate': bamFiles = extractRecal(tsvFile); break
default: exit 1, "Unknown step ${step}"
}
} else if (params.sampleDir) {
if (step != 'mapping') exit 1, '--sampleDir does not support steps other than "mapping"'
fastqFiles = extractFastqFromDir(params.sampleDir)
(fastqFiles, fastqTmp) = fastqFiles.into(2)
inputFiles = extractFastqFromDir(params.sampleDir)
(inputFiles, fastqTmp) = inputFiles.into(2)
fastqTmp.toList().subscribe onNext: {
if (it.size() == 0) {
exit 1, "No FASTQ files found in --sampleDir directory '${params.sampleDir}'"
Expand All @@ -102,8 +103,8 @@ if (tsvPath) {
tsvFile = params.sampleDir // used in the reports
} else exit 1, 'No sample were defined, see --help'

if (step == 'mapping') (patientGenders, fastqFiles) = SarekUtils.extractGenders(fastqFiles)
else (patientGenders, bamFiles) = SarekUtils.extractGenders(bamFiles)
if (step == 'recalibrate') (patientGenders, bamFiles) = SarekUtils.extractGenders(bamFiles)
else (patientGenders, inputFiles) = SarekUtils.extractGenders(inputFiles)

/*
================================================================================
Expand All @@ -113,9 +114,9 @@ else (patientGenders, bamFiles) = SarekUtils.extractGenders(bamFiles)

startMessage()

(fastqFiles, fastqFilesforFastQC) = fastqFiles.into(2)
(inputFiles, inputFilesforFastQC) = inputFiles.into(2)

if (params.verbose) fastqFiles = fastqFiles.view {
if (params.verbose) inputFiles = inputFiles.view {
"FASTQs to preprocess:\n\
ID : ${it[0]}\tStatus: ${it[1]}\tSample: ${it[2]}\tRun : ${it[3]}\n\
Files : [${it[4].fileName}, ${it[5].fileName}]"
Expand All @@ -133,17 +134,22 @@ process RunFastQC {
publishDir "${directoryMap.fastQC}/${idRun}", mode: params.publishDirMode

input:
set idPatient, status, idSample, idRun, file(fastqFile1), file(fastqFile2) from fastqFilesforFastQC
set idPatient, status, idSample, idRun, file(inputFile1), file(inputFile2) from inputFilesforFastQC

output:
file "*_fastqc.{zip,html}" into fastQCreport

when: step == 'mapping' && !params.noReports
when: (step == 'mapping' || step == 'remapping') && !params.noReports

script:
"""
fastqc -t 2 -q ${fastqFile1} ${fastqFile2}
"""
if (step == 'mapping')
"""
fastqc -t 2 -q ${inputFile1} ${inputFile2}
"""
else if (step == 'remapping')
"""
fastqc -t 2 -q ${inputFile1}
"""
}

if (params.verbose) fastQCreport = fastQCreport.view {
Expand All @@ -155,24 +161,39 @@ process MapReads {
tag {idPatient + "-" + idRun}

input:
set idPatient, status, idSample, idRun, file(fastqFile1), file(fastqFile2) from fastqFiles
set idPatient, status, idSample, idRun, file(inputFile1), file(inputFile2) from inputFiles
set file(genomeFile), file(bwaIndex) from Channel.value([referenceMap.genomeFile, referenceMap.bwaIndex])

output:
set idPatient, status, idSample, idRun, file("${idRun}.bam") into (mappedBam, mappedBamForQC)

when: step == 'mapping' && !params.onlyQC
when: (step == 'mapping' || step == 'remapping') && !params.onlyQC

script:
CN = params.sequencing_center ? "CN:${params.sequencing_center}\\t" : ""
readGroup = "@RG\\tID:${idRun}\\t${CN}PU:${idRun}\\tSM:${idSample}\\tLB:${idSample}\\tPL:illumina"
// adjust mismatch penalty for tumor samples
extra = status == 1 ? "-B 3" : ""
"""
bwa mem -R \"${readGroup}\" ${extra} -t ${task.cpus} -M \
${genomeFile} ${fastqFile1} ${fastqFile2} | \
samtools sort --threads ${task.cpus} -m 2G - > ${idRun}.bam
"""
if (step == 'mapping')
"""
bwa mem -R \"${readGroup}\" ${extra} -t ${task.cpus} -M \
${genomeFile} ${inputFile1} ${inputFile2} | \
samtools sort --threads ${task.cpus} -m 2G - > ${idRun}.bam
"""
else if (step == 'remapping')
"""
gatk --java-options -Xmx${task.memory.toGiga()}g \
SamToFastq \
--INPUT=${inputFile1} \
--FASTQ=/dev/stdout \
--INTERLEAVE=true \
--NON_PF=true \
| \
bwa mem -K 1000000 -p -R \"${readGroup}\" ${extra} -t ${task.cpus} -M ${genomeFile} \
maxulysse marked this conversation as resolved.
Show resolved Hide resolved
/dev/stdin - 2> >(tee ${inputFile1}.bwa.stderr.log >&2) \
| \
samtools sort --threads ${task.cpus} -m 2G - > ${idRun}.bam
"""
}

if (params.verbose) mappedBam = mappedBam.view {
Expand Down Expand Up @@ -236,7 +257,7 @@ process MergeBams {
output:
set idPatient, status, idSample, file("${idSample}.bam") into mergedBam

when: step == 'mapping' && !params.onlyQC
when: (step == 'mapping' || step == 'remapping') && !params.onlyQC

script:
"""
Expand Down Expand Up @@ -281,7 +302,7 @@ process MarkDuplicates {
set idPatient, status, idSample, val("${idSample}_${status}.md.bam"), val("${idSample}_${status}.md.bai") into markDuplicatesTSV
file ("${idSample}.bam.metrics") into markDuplicatesReport

when: step == 'mapping' && !params.onlyQC
when: (step == 'mapping' || step == 'remapping') && !params.onlyQC

script:
"""
Expand Down Expand Up @@ -343,7 +364,7 @@ process CreateRecalibrationTable {
set idPatient, status, idSample, file("${idSample}.recal.table") into recalibrationTable
set idPatient, status, idSample, val("${idSample}_${status}.md.bam"), val("${idSample}_${status}.md.bai"), val("${idSample}.recal.table") into recalibrationTableTSV

when: ( step == 'mapping' ) && !params.onlyQC
when: (step == 'mapping' || step == 'remapping') && !params.onlyQC

script:
known = knownIndels.collect{ "--known-sites ${it}" }.join(' ')
Expand Down Expand Up @@ -536,7 +557,8 @@ def defineReferenceMap() {
def defineStepList() {
return [
'mapping',
'recalibrate'
'recalibrate',
'remapping'
]
}

Expand All @@ -562,6 +584,28 @@ def extractFastq(tsvFile) {
}
}

def extractInputBAM(tsvFile) {
// Channeling the TSV file containing BAM.
// Format is: "subject gender status sample lane bam"
Channel.from(tsvFile)
.splitCsv(sep: '\t')
.map { row ->
SarekUtils.checkNumberOfItem(row, 7)
def idPatient = row[0]
def gender = row[1]
def status = SarekUtils.returnStatus(row[2].toInteger())
def idSample = row[3]
def idRun = row[4]
def bamFile = SarekUtils.returnFile(row[5])
def baiFile = SarekUtils.returnFile(row[6])

SarekUtils.checkFileExtension(bamFile,".bam")
SarekUtils.checkFileExtension(baiFile,".bai")

[idPatient, gender, status, idSample, idRun, bamFile, baiFile]
}
}

def extractFastqFromDir(pattern) {
// create a channel of FASTQs from a directory pattern such as
// "my_samples/*/". All samples are considered 'normal'.
Expand Down