Skip to content

Commit

Permalink
various optimizations: streamline library processing (RC guide sequen…
Browse files Browse the repository at this point in the history
…ce, padding), refine alignment, combine_counts and test data
  • Loading branch information
fandersch committed Oct 23, 2024
1 parent f91cb22 commit c88efe7
Show file tree
Hide file tree
Showing 6 changed files with 37 additions and 29 deletions.
5 changes: 2 additions & 3 deletions bin/combine_counts.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,17 @@ library <- readr::read_tsv(library_file) %>%
dplyr::select(id, group)

names(count_files) <- stringr::str_replace(basename(count_files), ".txt", "")
pattern <- paste(c(paste0(names(count_files), "_"), ".sam"), collapse = "|")
pattern <- paste(c(paste0(names(count_files), "#"), "\\.sam"), collapse = "|")

lapply(count_files, read_featurecounts) %>%
purrr::map(tidyr::gather, sample_name, count, -id) %>%
dplyr::bind_rows() %>%
dplyr::mutate(sample_name = stringr::str_replace_all(sample_name, pattern, "")) %>%
dplyr::mutate(sample_name = stringr::str_split(sample_name, "#") %>% lapply("[[", 2)) %>%
dplyr::group_by(id, sample_name) %>%
dplyr::summarize(count = sum(count)) %>%
dplyr::ungroup() %>%
tidyr::spread(sample_name, count) %>%
dplyr::inner_join(library, by = "id") %>%
dplyr::select(id, group, dplyr::everything()) %>%
readr::format_tsv() %>%
cat
cat
13 changes: 10 additions & 3 deletions bin/process_library.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
### command line parameters
args <- commandArgs(trailingOnly = TRUE)
input_file <- args[1]
padding_beginning <- toupper(args[2])
padding <- toupper(args[2])
library_name <- stringr::str_replace(basename(input_file), ".txt", "")

### functions
Expand All @@ -31,11 +31,18 @@ stopifnot(!any(duplicated(raw$id)))
stopifnot(!any(duplicated(raw$sequence)))

### generate fasta file for bowtie2 index

#max guide length + overhang of 1
seq_length <- max(nchar(raw$sequence))

paste0(padding_beginning, raw$sequence) %>%
#reverse complement sgRNAs sequence
raw$sequence_rc <- Biostrings::DNAStringSet(raw$sequence) %>%
Biostrings::reverseComplement() %>%
as.character()

paste0(raw$sequence_rc, padding) %>%
toupper %>%
stringr::str_sub(start=-23) %>%
stringr::str_sub(end=seq_length) %>%
purrr::set_names(raw$id) %>%
Biostrings::DNAStringSet() %>%
Biostrings::writeXStringSet(paste0(library_name, ".fasta"), format = "fasta")
Expand Down
4 changes: 2 additions & 2 deletions conf/resources.config
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,13 @@ process {
memory = { 15.GB * task.attempt }
}
withName: align {
cpus = { 8 * task.attempt }
cpus = { 16 * task.attempt }
}
withName: fastqc {
time = { 3.h * task.attempt }
}
withName: count {
cpus = { 8 * task.attempt }
cpus = { 16 * task.attempt }
}
}

Expand Down
22 changes: 12 additions & 10 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,14 @@ def helpMessage() {
--spacer_seq Nucleotide sequence in spacer sequence between
barcodes and sgRNA / shRNA sequence.
(default: TTCCAGCATAGCTCTTAAAC)
(default: TTCCAGCATAGCTCTTAAAC)
--guide_length Number of nucleotides in guide sequence. (default: 21)
--max_guide_length Number of nucleotides in guide sequence. (default: 21)
--padding_beginning Nucleotides used for 5' padding if sgRNA / shRNA are of
unequal length. Must be one of G, C, T, and A.
(default: ACC)
--padding Nucleotides used for padding if sgRNA / shRNA are of
unequal length. Corresponds to nucleotides downstream of
the sgRNA (post guide sequence). Must be one of G, C, T,
and A. (default: GTT)
--add_unknown_to_fastqc Add unknown sequences (no barcode match during demultiplexing)
to multiQC report. (default: false)
Expand Down Expand Up @@ -85,8 +86,8 @@ log.info " barcode file : ${params.barcodes}"
log.info " barcode demultiplex (nt) : ${params.barcode_length}"
log.info " spacer (nt) : ${params.spacer_seq}"
log.info " demultiplex mismatches : ${params.barcode_demux_mismatches}"
log.info " 5' guide padding base : ${params.padding_beginning}"
log.info " guide length : ${params.guide_length}"
log.info " guide padding bases : ${params.padding}"
log.info " max guide length : ${params.max_guide_length}"
log.info " add unknown to FastQC : ${params.add_unknown_to_fastqc}"
log.info " ======================"
log.info ""
Expand Down Expand Up @@ -230,7 +231,7 @@ process trim_barcode_and_spacer {
length_barcode_spacer=\${#barcode_spacer}
mv ${fastq} input.fastq.gz
cutadapt input.fastq.gz -j ${task.cpus} -u \${length_barcode_spacer} -o ${id}.fastq.gz -l ${params.guide_length}
cutadapt input.fastq.gz -j ${task.cpus} -u \${length_barcode_spacer} -o ${id}.fastq.gz -l ${params.max_guide_length}
fastqc -q ${id}.fastq.gz
"""
}
Expand All @@ -248,7 +249,7 @@ process process_library {

script:
"""
process_library.R ${library} ${params.padding_beginning}
process_library.R ${library} ${params.padding}
"""
}

Expand Down Expand Up @@ -286,10 +287,11 @@ process align {
bowtie2 \
--threads \$((${task.cpus})) \
-x ${index}/index \
-L ${params.guide_length} \
-L ${params.max_guide_length} \
--score-min 'C,0,-1' \
-N 0 \
--seed 42 \
--norc \
<(zcat ${fastq}) 2> ${id}.log > ${id}.sam
"""
}
Expand Down
4 changes: 2 additions & 2 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ params {
barcode_demux_mismatches = 0
barcode_length = 4
spacer_seq = 'TTCCAGCATAGCTCTTAAAC'
guide_length = 21
padding_beginning = 'ACC'
max_guide_length = 21
padding = 'GGT'
add_unknown_to_fastqc = false
}

Expand Down
18 changes: 9 additions & 9 deletions test_data/expected_output.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
id group testple_1 testple_2 testple_3 testple_4 testple_5 testple_6
id group test_sample_1 test_sample_2 test_sample_3 test_sample_4 test_sample_5 test_sample_6
ABHD16B_1 ABHD16B 100 100 100 100 100 100
ABHD16B_2 ABHD16B 100 100 100 100 100 100
ABHD16B_3 ABHD16B 100 100 100 100 100 100
Expand Down Expand Up @@ -199,17 +199,17 @@ LSS_1 LSS 100 100 100 100 100 100
LSS_2 LSS 100 100 100 100 100 100
LSS_3 LSS 100 100 100 100 100 100
LSS_4 LSS 100 100 100 100 100 100
LYPD5_1 LYPD5 97 97 97 97 97 97
LYPD5_1 LYPD5 100 100 100 100 100 100
LYPD5_2 LYPD5 100 100 100 100 100 100
LYPD5_3 LYPD5 103 103 103 103 103 103
LYPD5_3 LYPD5 100 100 100 100 100 100
LYPD5_4 LYPD5 100 100 100 100 100 100
LYRM7_1 LYRM7 100 100 100 100 100 100
LYRM7_2 LYRM7 100 100 100 100 100 100
LYRM7_3 LYRM7 44 44 44 44 44 44
LYRM7_4 LYRM7 156 156 156 156 156 156
M6PR_1 M6PR 146 146 146 146 146 146
LYRM7_3 LYRM7 100 100 100 100 100 100
LYRM7_4 LYRM7 100 100 100 100 100 100
M6PR_1 M6PR 100 100 100 100 100 100
M6PR_2 M6PR 100 100 100 100 100 100
M6PR_3 M6PR 54 54 54 54 54 54
M6PR_3 M6PR 100 100 100 100 100 100
M6PR_4 M6PR 100 100 100 100 100 100
MAU2_1 MAU2 100 100 100 100 100 100
MAU2_2 MAU2 100 100 100 100 100 100
Expand Down Expand Up @@ -347,8 +347,8 @@ TMEM211_1 TMEM211 100 100 100 100 100 100
TMEM211_2 TMEM211 100 100 100 100 100 100
TMEM211_3 TMEM211 100 100 100 100 100 100
TMEM211_4 TMEM211 100 100 100 100 100 100
TMEM256_1 TMEM256 1 1 1 1 1 1
TMEM256_2 TMEM256 199 199 199 199 199 199
TMEM256_1 TMEM256 100 100 100 100 100 100
TMEM256_2 TMEM256 100 100 100 100 100 100
TMEM256_3 TMEM256 100 100 100 100 100 100
TMEM256_4 TMEM256 100 100 100 100 100 100
TPP1_1 TPP1 100 100 100 100 100 100
Expand Down

0 comments on commit c88efe7

Please sign in to comment.