diff --git a/bin/combine_counts.R b/bin/combine_counts.R index c4b234b..50d6801 100755 --- a/bin/combine_counts.R +++ b/bin/combine_counts.R @@ -28,13 +28,12 @@ library <- readr::read_tsv(library_file) %>% dplyr::select(id, group) names(count_files) <- stringr::str_replace(basename(count_files), ".txt", "") -pattern <- paste(c(paste0(names(count_files), "_"), ".sam"), collapse = "|") +pattern <- paste(c(paste0(names(count_files), "#"), "\\.sam"), collapse = "|") lapply(count_files, read_featurecounts) %>% purrr::map(tidyr::gather, sample_name, count, -id) %>% dplyr::bind_rows() %>% dplyr::mutate(sample_name = stringr::str_replace_all(sample_name, pattern, "")) %>% - dplyr::mutate(sample_name = stringr::str_split(sample_name, "#") %>% lapply("[[", 2)) %>% dplyr::group_by(id, sample_name) %>% dplyr::summarize(count = sum(count)) %>% dplyr::ungroup() %>% @@ -42,4 +41,4 @@ lapply(count_files, read_featurecounts) %>% dplyr::inner_join(library, by = "id") %>% dplyr::select(id, group, dplyr::everything()) %>% readr::format_tsv() %>% - cat + cat \ No newline at end of file diff --git a/bin/process_library.R b/bin/process_library.R index 456431c..6f0e591 100755 --- a/bin/process_library.R +++ b/bin/process_library.R @@ -17,7 +17,7 @@ ### command line parameters args <- commandArgs(trailingOnly = TRUE) input_file <- args[1] -padding_beginning <- toupper(args[2]) +padding <- toupper(args[2]) library_name <- stringr::str_replace(basename(input_file), ".txt", "") ### functions @@ -31,11 +31,18 @@ stopifnot(!any(duplicated(raw$id))) stopifnot(!any(duplicated(raw$sequence))) ### generate fasta file for bowtie2 index + +#max guide length + overhang of 1 seq_length <- max(nchar(raw$sequence)) -paste0(padding_beginning, raw$sequence) %>% +#reverse complement sgRNAs sequence +raw$sequence_rc <- Biostrings::DNAStringSet(raw$sequence) %>% + Biostrings::reverseComplement() %>% + as.character() + +paste0(raw$sequence_rc, padding) %>% toupper %>% - stringr::str_sub(start=-23) %>% + stringr::str_sub(end=seq_length) %>% purrr::set_names(raw$id) %>% Biostrings::DNAStringSet() %>% Biostrings::writeXStringSet(paste0(library_name, ".fasta"), format = "fasta") diff --git a/conf/resources.config b/conf/resources.config index a0adc87..e6eb2ed 100644 --- a/conf/resources.config +++ b/conf/resources.config @@ -36,13 +36,13 @@ process { memory = { 15.GB * task.attempt } } withName: align { - cpus = { 8 * task.attempt } + cpus = { 16 * task.attempt } } withName: fastqc { time = { 3.h * task.attempt } } withName: count { - cpus = { 8 * task.attempt } + cpus = { 16 * task.attempt } } } diff --git a/main.nf b/main.nf index daac0fa..70e2fbe 100755 --- a/main.nf +++ b/main.nf @@ -43,13 +43,14 @@ def helpMessage() { --spacer_seq Nucleotide sequence in spacer sequence between barcodes and sgRNA / shRNA sequence. - (default: TTCCAGCATAGCTCTTAAAC) + (default: TTCCAGCATAGCTCTTAAAC) - --guide_length Number of nucleotides in guide sequence. (default: 21) + --max_guide_length Number of nucleotides in guide sequence. (default: 21) - --padding_beginning Nucleotides used for 5' padding if sgRNA / shRNA are of - unequal length. Must be one of G, C, T, and A. - (default: ACC) + --padding Nucleotides used for padding if sgRNA / shRNA are of + unequal length. Corresponds to nucleotides downstream of + the sgRNA (post guide sequence). Must be one of G, C, T, + and A. (default: GTT) --add_unknown_to_fastqc Add unknown sequences (no barcode match during demultiplexing) to multiQC report. (default: false) @@ -85,8 +86,8 @@ log.info " barcode file : ${params.barcodes}" log.info " barcode demultiplex (nt) : ${params.barcode_length}" log.info " spacer (nt) : ${params.spacer_seq}" log.info " demultiplex mismatches : ${params.barcode_demux_mismatches}" -log.info " 5' guide padding base : ${params.padding_beginning}" -log.info " guide length : ${params.guide_length}" +log.info " guide padding bases : ${params.padding}" +log.info " max guide length : ${params.max_guide_length}" log.info " add unknown to FastQC : ${params.add_unknown_to_fastqc}" log.info " ======================" log.info "" @@ -230,7 +231,7 @@ process trim_barcode_and_spacer { length_barcode_spacer=\${#barcode_spacer} mv ${fastq} input.fastq.gz - cutadapt input.fastq.gz -j ${task.cpus} -u \${length_barcode_spacer} -o ${id}.fastq.gz -l ${params.guide_length} + cutadapt input.fastq.gz -j ${task.cpus} -u \${length_barcode_spacer} -o ${id}.fastq.gz -l ${params.max_guide_length} fastqc -q ${id}.fastq.gz """ } @@ -248,7 +249,7 @@ process process_library { script: """ - process_library.R ${library} ${params.padding_beginning} + process_library.R ${library} ${params.padding} """ } @@ -286,10 +287,11 @@ process align { bowtie2 \ --threads \$((${task.cpus})) \ -x ${index}/index \ - -L ${params.guide_length} \ + -L ${params.max_guide_length} \ --score-min 'C,0,-1' \ -N 0 \ --seed 42 \ + --norc \ <(zcat ${fastq}) 2> ${id}.log > ${id}.sam """ } diff --git a/nextflow.config b/nextflow.config index ed1c64b..1563a36 100755 --- a/nextflow.config +++ b/nextflow.config @@ -10,8 +10,8 @@ params { barcode_demux_mismatches = 0 barcode_length = 4 spacer_seq = 'TTCCAGCATAGCTCTTAAAC' - guide_length = 21 - padding_beginning = 'ACC' + max_guide_length = 21 + padding = 'GGT' add_unknown_to_fastqc = false } diff --git a/test_data/expected_output.txt b/test_data/expected_output.txt index 2675239..040f65d 100755 --- a/test_data/expected_output.txt +++ b/test_data/expected_output.txt @@ -1,4 +1,4 @@ -id group testple_1 testple_2 testple_3 testple_4 testple_5 testple_6 +id group test_sample_1 test_sample_2 test_sample_3 test_sample_4 test_sample_5 test_sample_6 ABHD16B_1 ABHD16B 100 100 100 100 100 100 ABHD16B_2 ABHD16B 100 100 100 100 100 100 ABHD16B_3 ABHD16B 100 100 100 100 100 100 @@ -199,17 +199,17 @@ LSS_1 LSS 100 100 100 100 100 100 LSS_2 LSS 100 100 100 100 100 100 LSS_3 LSS 100 100 100 100 100 100 LSS_4 LSS 100 100 100 100 100 100 -LYPD5_1 LYPD5 97 97 97 97 97 97 +LYPD5_1 LYPD5 100 100 100 100 100 100 LYPD5_2 LYPD5 100 100 100 100 100 100 -LYPD5_3 LYPD5 103 103 103 103 103 103 +LYPD5_3 LYPD5 100 100 100 100 100 100 LYPD5_4 LYPD5 100 100 100 100 100 100 LYRM7_1 LYRM7 100 100 100 100 100 100 LYRM7_2 LYRM7 100 100 100 100 100 100 -LYRM7_3 LYRM7 44 44 44 44 44 44 -LYRM7_4 LYRM7 156 156 156 156 156 156 -M6PR_1 M6PR 146 146 146 146 146 146 +LYRM7_3 LYRM7 100 100 100 100 100 100 +LYRM7_4 LYRM7 100 100 100 100 100 100 +M6PR_1 M6PR 100 100 100 100 100 100 M6PR_2 M6PR 100 100 100 100 100 100 -M6PR_3 M6PR 54 54 54 54 54 54 +M6PR_3 M6PR 100 100 100 100 100 100 M6PR_4 M6PR 100 100 100 100 100 100 MAU2_1 MAU2 100 100 100 100 100 100 MAU2_2 MAU2 100 100 100 100 100 100 @@ -347,8 +347,8 @@ TMEM211_1 TMEM211 100 100 100 100 100 100 TMEM211_2 TMEM211 100 100 100 100 100 100 TMEM211_3 TMEM211 100 100 100 100 100 100 TMEM211_4 TMEM211 100 100 100 100 100 100 -TMEM256_1 TMEM256 1 1 1 1 1 1 -TMEM256_2 TMEM256 199 199 199 199 199 199 +TMEM256_1 TMEM256 100 100 100 100 100 100 +TMEM256_2 TMEM256 100 100 100 100 100 100 TMEM256_3 TMEM256 100 100 100 100 100 100 TMEM256_4 TMEM256 100 100 100 100 100 100 TPP1_1 TPP1 100 100 100 100 100 100