Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

⚡ Optimize samtools/depth step #74

Merged
merged 9 commits into from
Apr 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
"coverage",
"cpus",
"demultiplexing",
"depthfile",
"downsampling",
"fasta",
"fastq",
Expand Down
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,15 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## 0.6.1 - [2024-04-11]

- Calculate `samtools/depth` on each chromosomes ([#73](https://github.com/cnr-ibba/nf-resequencing-mem/issues/73))

### `Fixed`

- Passing args to `modules/local/freebayes_splitcram`
- Calculate `samtools/depth` without _0 coverage_ regions

## 0.6.0 - [2024-04-04]

- Replace `*.bam` file format with `*.cram` ([#9](https://github.com/cnr-ibba/nf-resequencing-mem/issues/9))
Expand Down
4 changes: 2 additions & 2 deletions assets/multiqc_config.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
report_comment: >
This report has been generated by the <a href="https://github.com/cnr-ibba/nf-resequencing-mem/releases/tag/v0.6.0" target="_blank">nf-resequencing-mem</a>
This report has been generated by the <a href="https://github.com/cnr-ibba/nf-resequencing-mem/releases/tag/v0.6.1" target="_blank">nf-resequencing-mem</a>
analysis pipeline. For information about how to interpret these results, please see the
<a href="https://github.com/cnr-ibba/nf-resequencing-mem/blob/v0.6.0/README.md" target="_blank">documentation</a>.
<a href="https://github.com/cnr-ibba/nf-resequencing-mem/blob/v0.6.1/README.md" target="_blank">documentation</a>.
report_section_order:
"nf-resequencing-mem-methods-description":
order: -1000
Expand Down
80 changes: 49 additions & 31 deletions bin/split_ref_by_depth.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,19 @@
# -*- coding: utf-8 -*-

"""

This script will write a list of region from a reference file relying on
coverage depth
coverage depth. Is supposed to work on a single chromosome at a time, it
requires chromosome length in order to work on sample depth with no 0 regions

Author: Paolo Cozzi
Date: 2024/03/21
Date: 2024/04/11

Usage:
python split_ref_by_depth.py --depth_file <depth_file>
python split_ref_by_depth.py --depth_file <depth_file> \
--chromosome <chromosome> --chromosome_length <chromosome_length> \
[--min_length <min_length>] [--max_coverage <max_coverage>] \
[--overlap_size <overlap_size>] [--verbose <verbose>] [--quiet <quiet>]

Arguments:
depth_file: The path of samtools depth output file
Expand Down Expand Up @@ -126,49 +131,40 @@ def append_or_extend_region(


def split_ref_by_coverage(
depthfile: str, max_coverage: int, min_length: int, overlap_size: int):
depthfile: str, chromosome: str, chromosome_length: int,
max_coverage: int, min_length: int, overlap_size: int):
with gzip.open(depthfile, "rt") as handle:
reader = csv.reader(handle, delimiter="\t", lineterminator="\n")
header = next(reader)

# header: ["#CHROM", "POS", "Sample_1.bam", "Sample_2.bam", ...]
logging.debug(f"Got header: {header}")

# Inizialize some variables
line = next(reader)
start_chrom = line[0]
start_pos = int(line[1])
old_pos = start_pos
cumulative_coverage = 0
# test if I have reads aligned in this file
try:
line = next(reader)

except StopIteration:
logging.error(
f"File '{depthfile}' has no coverage data"
)
return [[chromosome, 0, chromosome_length]]

# Initialize some variables
regions = []
start_chrom = chromosome
start_pos = 1
old_pos = start_pos
cumulative_coverage = 0

logging.debug(f"Starting from chrom: '{start_chrom}'")

for i, line in enumerate(reader):
chrom, pos = line[0], int(line[1])

if chrom != start_chrom:
logging.debug(f"{i}: Got a new chromosome '{chrom}'")
logging.debug(
f"{i}: Test for a new region with: "
f"{[start_chrom, start_pos, old_pos]}"
f" ({old_pos-start_pos+1} bp; "
f"{cumulative_coverage:.2e} cumulative coverage)"
)

# add and open a new region
regions = append_or_extend_region(
regions,
[start_chrom, start_pos, old_pos],
min_length,
overlap_size
)

# reset variables
start_chrom = chrom
start_pos = pos
cumulative_coverage = 0
logging.critical(f"{i}: Got a new chromosome '{chrom}'")
raise Exception("This script works on a single chromosome at a time")

# determine region size
length = pos - start_pos
Expand Down Expand Up @@ -219,6 +215,14 @@ def split_ref_by_coverage(
overlap_size
)

# check for last region end
last_region = regions[-1]
if last_region[2] < chromosome_length:
logging.debug(
f"Extending the last region: {last_region} to {chromosome_length}"
)
last_region[2] = chromosome_length

return regions


Expand All @@ -232,6 +236,14 @@ def split_ref_by_coverage(
'-d', '--depth_file', required=True,
help="The output of samtools depth file for all samples"
)
parser.add_argument(
'-c', '--chromosome', required=True, type=str,
help="The chromosome name"
)
parser.add_argument(
'-l', '--chromosome_length', required=True, type=int,
help="The length of the chromosome"
)
parser.add_argument(
"--min_length", default=100_000, type=int,
help="minimum fragment length in bp"
Expand Down Expand Up @@ -263,7 +275,13 @@ def split_ref_by_coverage(

# split reference by coverage depth
regions = split_ref_by_coverage(
args.depth_file, args.max_coverage, args.min_length, args.overlap_size)
args.depth_file,
args.chromosome,
args.chromosome_length,
args.max_coverage,
args.min_length,
args.overlap_size
)

logging.info(f"Number of regions: {len(regions)}")

Expand Down
6 changes: 3 additions & 3 deletions conf/base.config
Original file line number Diff line number Diff line change
Expand Up @@ -26,17 +26,17 @@ process {
withLabel:process_single {
cpus = { check_max( 1 , 'cpus' ) }
memory = { check_max( 4.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }
time = { check_max( 6.h * task.attempt, 'time' ) }
}
withLabel:process_low {
cpus = { check_max( 2 * task.attempt, 'cpus' ) }
memory = { check_max( 4.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }
time = { check_max( 6.h * task.attempt, 'time' ) }
}
withLabel:process_medium {
cpus = { check_max( 6 * task.attempt, 'cpus' ) }
memory = { check_max( 12.GB * task.attempt, 'memory' ) }
time = { check_max( 8.h * task.attempt, 'time' ) }
time = { check_max( 12.h * task.attempt, 'time' ) }
}
withLabel:process_high {
cpus = { check_max( 12 * task.attempt, 'cpus' ) }
Expand Down
6 changes: 5 additions & 1 deletion modules/local/freebayes_splitcram.nf
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,11 @@ process FREEBAYES_SPLITCRAM {
def prefix = task.ext.prefix ?: "${meta.id}"
"""
split_ref_by_depth.py \\
--depth_file ${depth} > ${prefix}.regions.txt
${args} \\
--depth_file ${depth} \\
--chromosome ${meta.id} \\
--chromosome_length ${meta.length} \\
> ${prefix}.regions.txt

cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
11 changes: 6 additions & 5 deletions modules/nf-core/samtools/depth/main.nf

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 15 additions & 7 deletions modules/nf-core/samtools/depth/samtools-depth.diff

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ manifest {
description = 'Nextflow Resequencing with BWA MEM'
mainScript = 'main.nf'
nextflowVersion = '!>=23.04.0'
version = '0.6.0'
version = '0.6.1'
}

// Load modules.config for DSL2 module specific options
Expand Down
22 changes: 19 additions & 3 deletions subworkflows/local/cram_freebayes_parallel/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,16 @@ workflow CRAM_FREEBAYES_PARALLEL {
main:
ch_versions = Channel.empty()

// calculate total coverage depth for all samples
SAMTOOLS_DEPTH( bam, [[], []] )
// read chromosome list from fasta index
chromosome_ch = fai.map{ it -> it[1] }
.splitCsv(header: false, sep: '\t')
.map{ row -> [[id: row[0], length: row[1]], row[0]] }
// .view()

// calculate total coverage depth for all samples by chromosome
// bam is a value channel; chromosome_ch is a queue channel
// for every chromosome
SAMTOOLS_DEPTH( bam, bai, chromosome_ch )
ch_versions = ch_versions.mix(SAMTOOLS_DEPTH.out.versions)

// split fasta in chunks relying BAM size
Expand All @@ -33,7 +41,15 @@ workflow CRAM_FREEBAYES_PARALLEL {
.map{ it -> [[id: it.trim()], it.trim()]}

// call freebayes on each region
FREEBAYES_CHUNK ( regions_ch, bam, bai, SAMTOOLS_DEPTH.out.bam_list, fasta, fai )
FREEBAYES_CHUNK (
regions_ch,
bam,
bai,
// converting to a value channel
SAMTOOLS_DEPTH.out.bam_list.first(),
fasta,
fai
)
ch_versions = ch_versions.mix(FREEBAYES_CHUNK.out.versions)

// merge freebayes chunks
Expand Down
Loading