nf-core · ggabernet · Aug 27, 2021 · Jan 14, 2020 · Jul 16, 2021 · Jul 19, 2021
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -50,7 +50,7 @@ jobs:
       matrix:
         # Nextflow versions: check pipeline minimum and current latest
         nxf_ver: ["21.04.0", ""]
-        profile: ["test_tcr"]
+        profile: ["test_tcr", "test_no_umi"]
     steps:
       - name: Check out pipeline code
         uses: actions/checkout@v2

diff --git a/.nf-core.yml b/.nf-core.yml
@@ -1,7 +1,5 @@
-files_unchanged:
-  - lib/NfcoreSchema.groovy
-actions_awsfulltest: False
-params_used: False
-files_exist:
+lint:
+  files_unchanged:
+    - assets/multiqc_config.yaml
   - environment.yml
   - Dockerfile
diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml
@@ -2,6 +2,22 @@ report_comment: >
     This report has been generated by the <a href="https://github.com/nf-core/bcellmagic" target="_blank">nf-core/bcellmagic</a>
     analysis pipeline. For information about how to interpret these results, please see the
     <a href="https://github.com/nf-core/bcellmagic" target="_blank">documentation</a>.
+
+module_order:
+  - fastqc:
+      name: 'FastQC (raw)'
+      info: 'This section of the report shows FastQC results from the original reads'
+      path_filters:
+        - './*.zip'
+      path_filters_exclude:
+        - './*_ASSEMBLED_fastqc.zip'
+  - fastqc:
+      name: 'FastQC (post-assembly)'
+      info: 'This section of the report shows FastQC results after paired reads
+      are assembled and QC filtered but before collapsing duplicates.'
+      path_filters:
+        - './*_ASSEMBLED_fastqc.zip'
+
 report_section_order:
     software_versions:
         order: -1000

diff --git a/bin/TIgGER-shazam.R b/bin/TIgGER-shazam.R
@@ -24,21 +24,27 @@ fastas = args[4:length(args)]
 output_folder = dirname(inputtable)
 
 db <- read.table(inputtable, header=TRUE, sep="\t")
+# Label outputs by input table file name; basename() keeps the directory (already in output_folder) out of the label
+sourceLabel <- gsub(pattern = "\\.tsv$", "", basename(inputtable))
 
 if (loci == "ig"){
 
-    db_fasta <- readIgFasta(fastas, strip_down_name = TRUE)
+    db_fasta <- c()
+    for (fasta in fastas) {
+        dbf <- readIgFasta(fasta, strip_down_name = TRUE)
+        db_fasta <- c(db_fasta, dbf)
+    }
 
     gt <- inferGenotype(db, v_call = "v_call", find_unmutated = F)
 
     # Filter out Duplicate sequences as not supported by Tigger 1.0.0
-    gt_filt <- filter(gt, !grepl("D", gene))
+    gt_filt <- filter(gt, !grepl("D|d", gene))
 
     gtseq <- genotypeFasta(gt_filt, db_fasta)
-    writeFasta(gtseq, paste(output_folder,"v_genotype.fasta",sep="/"))
+    writeFasta(gtseq, paste(output_folder,paste0(sourceLabel, "_v_genotype.fasta"),sep="/"))
 
     # Plot genotype
-    ggsave(paste(output_folder,"genotype.pdf",sep="/"), plotGenotype(gt, silent=T))
+    ggsave(paste(output_folder,paste0(sourceLabel, "_genotype.pdf"),sep="/"), plotGenotype(gt, silent=T))
 
     # Modify allele calls and output TSV file
     db_reassigned <- reassignAlleles(db, gtseq)
@@ -52,7 +58,7 @@ if (loci == "ig"){
                                 normalize="len",
                                 nproc=1,
                                 first = FALSE)
-    writeChangeoDb(db_reassigned, paste(output_folder,"v_genotyped.tab",sep="/"))
+    writeChangeoDb(db_reassigned, paste(output_folder,paste0(sourceLabel, "_v_genotyped.tab"),sep="/"))
 
 } else if (loci == "tr") {
 
@@ -64,10 +70,10 @@ if (loci == "ig"){
     gt <- inferGenotype(db, v_call = "v_call", find_unmutated = FALSE)
 
     gtseq <- genotypeFasta(gt, c(db_fasta_TRAV,db_fasta_TRBV,db_fasta_TRDV))
-    writeFasta(gtseq, paste(output_folder,"TRxV_genotype.fasta",sep="/"))
+    writeFasta(gtseq, paste(output_folder,paste0(sourceLabel, "_TRxV_genotype.fasta"),sep="/"))
 
     # Plot genotype
-    ggsave(paste(output_folder,"genotype.pdf",sep="/"), plotGenotype(gt, silent=T))
+    ggsave(paste(output_folder,paste0(sourceLabel, "_genotype.pdf"),sep="/"), plotGenotype(gt, silent=T))
 
     # Modify allele calls and output TSV file
     db_reassigned <- reassignAlleles(db, gtseq)
@@ -82,25 +88,32 @@ if (loci == "ig"){
                                 nproc=1,
                                 first = FALSE)
 
-    writeChangeoDb(db, paste(output_folder,"v_tr_genotyped.tab",sep="/"))
+    writeChangeoDb(db_reassigned, paste(output_folder,paste0(sourceLabel, "_v_tr_genotyped.tab"),sep="/"))  # write the reassigned allele calls, matching the ig branch
 
 } else {
     stop("Loci specified is not available, please choose from: ig, tr.")
 }
 
-# Find threshold using chosen method
-
-if (threshold_method == "density") {
-    output <- findThreshold(dist_ham$dist_nearest, method="density")
-    threshold <- output@threshold
-} else if (threshold_method == "gmm") {
-    output <- findThreshold(dist_ham$dist_nearest, method="gmm")
-    threshold <- output@threshold
+num_dist <- length(unique(na.omit(dist_ham$dist_nearest)))
+if (num_dist > 3) {
+    # Find threshold using chosen method
+    if (threshold_method == "density") {
+        output <- findThreshold(dist_ham$dist_nearest, method="density")
+        threshold <- output@threshold
+    } else if (threshold_method == "gmm") {
+        output <- findThreshold(dist_ham$dist_nearest, method="gmm")
+        threshold <- output@threshold
+    } else {
+        stop("Threshold method is not available, please choose from: density, gmm")
+    }
+    # Plot distance histogram, density estimate and optimum threshold
+    ggsave(paste(output_folder,paste0(sourceLabel, "_Hamming_distance_threshold.pdf"),sep="/"), plot(output), device="pdf")
 } else {
-    stop("Threshold method is not available, please choose from: density, gmm")
+    # Workaround for sources with too few nearest distance values to determine an effective threshold.
+    # Set threshold to 0 and print a warning
+    threshold <- 0.0
+    warning(paste("Could not determine an effective Hamming distance threshold for source:", sourceLabel, ", which has", num_dist, "unique nearest distances. Threshold defaulting to 0.",  sep=" "))
+    ggsave(paste(output_folder,paste0(sourceLabel, "_Hamming_distance_threshold.pdf"),sep="/"), plot(dist_ham$dist_nearest, dist_ham$duplicate_count), device="pdf")
 }
 
-# Plot distance histogram, density estimate and optimum threshold
-ggsave(paste(output_folder,"Hamming_distance_threshold.pdf",sep="/"), plot(output), device="pdf")
-
-write.table(threshold, file= paste(output_folder,"threshold.txt",sep="/"), quote=FALSE, sep="", row.names = FALSE, col.names = FALSE)
+write.table(threshold, file= paste(output_folder,paste0(sourceLabel, "_threshold.txt"),sep="/"), quote=FALSE, sep="", row.names = FALSE, col.names = FALSE)
diff --git a/bin/log_parsing.py b/bin/log_parsing.py
@@ -41,7 +41,7 @@
             with open(logfile, "r") as f:
                 for line in f:
                     if " START>" in line:
-                        s_code.append(logfile.split("/")[1].split("_")[0])
+                        s_code.append(logfile.split("/")[1].split("_command_log")[0])
                         process_name.append(process)
                     elif "PAIRS>" in line:
                         pairs.append(line.strip().lstrip("PAIRS> "))
@@ -80,7 +80,7 @@
                 for line in f:
                     if " START>" in line:
                         if c < 1:
-                            s_code.append(logfile.split("/")[1].split("_")[0])
+                            s_code.append(logfile.split("/")[1].split("_command_log")[0])
                             process_name.append(process)
                     elif "SEQUENCES>" in line:
                         if c < 1:
@@ -127,7 +127,7 @@
                 # print(f.read())
                 for line in f:
                     if " START>" in line:
-                        s_code.append(logfile.split("/")[1].split("_")[0])
+                        s_code.append(logfile.split("/")[1].split("_command_log")[0])
                         process_name.append(process)
                     elif "SEQUENCES1>" in line:
                         seqs1.append(line.strip().lstrip("SEQUENCES1").lstrip("> "))
@@ -162,7 +162,7 @@
                 # print(f.read())
                 for line in f:
                     if " START>" in line:
-                        s_code.append(logfile.split("/")[1].split("_")[0])
+                        s_code.append(logfile.split("/")[1].split("_command_log")[0])
                         process_name.append(process)
                     elif "OUTPUT>" in line:
                         output_file.append(line.strip().lstrip("OUTPUT> "))
@@ -202,7 +202,7 @@
                 # print(f.read())
                 for line in f:
                     if " START>" in line:
-                        s_code.append(logfile.split("/")[1].split("_")[0])
+                        s_code.append(logfile.split("/")[1].split("_command_log")[0])
                         process_name.append(process)
                     elif "SEQUENCES>" in line:
                         seqs.append(line.strip().lstrip("SEQUENCES> "))
@@ -235,7 +235,7 @@
                 # print(f.read())
                 for line in f:
                     if "PASS>" in line:
-                        s_code.append(logfile.split("/")[1].split("_")[0])
+                        s_code.append(logfile.split("/")[1].split("_command_log")[0])
                         pass_blast.append(line.strip().lstrip("PASS> "))
                     elif "FAIL>" in line:
                         fail_blast.append(line.strip().lstrip("FAIL> "))
@@ -267,7 +267,7 @@
                 # print(f.read())
                 for line in f:
                     if " START>" in line:
-                        s_code.append(logfile.split("/")[1].split("_")[0])
+                        s_code.append(logfile.split("/")[1].split("_command_log")[0])
                         process_name.append(process)
                     elif "RECORDS>" in line:
                         seqs.append(line.strip().lstrip("RECORDS> "))
@@ -303,7 +303,7 @@
                 # print(f.read())
                 for line in f:
                     if " START>" in line:
-                        s_code.append(logfile.split("/")[1].split("_")[0])
+                        s_code.append(logfile.split("/")[1].split("_command_log")[0])
                         process_name.append(process)
                     elif "RECORDS>" in line:
                         seqs.append(line.strip().lstrip("RECORDS> "))