sagc-bioinformatics · ziadbkh · Mar 28, 2024 · Mar 28, 2024 · Mar 28, 2024
diff --git a/Cargo.toml b/Cargo.toml
@@ -9,15 +9,13 @@ keywords = ["fastq", "MGI", "demultiplexing"]
 
 [dependencies]
 flate2 = { version = "1.0.28", features = ["zlib-ng-compat"], default-features = false }
-#gzp = { version = "0.11.3", default-features = false, features = ["deflate_zlib_ng"] }
 libz-sys = { version = "1.1.8", default-features = false, features = ["libc"] }
 mimalloc = { version = "0.1.34", default-features = false }
 combinations = "*"
 itertools = "*"
 chrono = "0.4"
 termion = "2.0.1"
 clap = "4.3.21"
-noodles-bgzf = "0.26.0"
 memchr = "2.6.4"
 libdeflater = "1.12.0"
 niffler = { version = "2.5.0", default-features = false, features = ["gz"]}

diff --git a/bins/mgikit-V0.1.5.zip b/bins/mgikit-V0.1.5.zip
diff --git a/docs/pages/demultiplex.md b/docs/pages/demultiplex.md
@@ -141,7 +141,9 @@ the number of allowed mismatches is high.
 
 + **`--memory`**: The requested maximum memory to be used (in giga byte). Check the documentation for memory optimisation options. Default is 0 then the tool will use the available memory on the machine.
 
-
++ **`--not-mgi`**: This flag needs to be enabled if the input fastq files don't have MGI format.
+
+
 ### Understanding input files
 
 MGI sequencing machine output a directory for the run (flowcell_id) with a subdirectory for each lane (L01, L02 ..) depending on the machine.
@@ -471,9 +473,9 @@ The expected memory usage is influenced by three main factors,
 
 The expected allocated memory is 
 
-+ **Single-end input**: `number of smaples * (writing buffer size + 2 * compression buffer size)`.
++ **Single-end input**: `number of samples * (writing buffer size + 2 * compression buffer size)`.
 
-+ **Paired-end input**: `2 * number of smaples * (writing buffer size + 2 * compression buffer size)`.
++ **Paired-end input**: `2 * number of samples * (writing buffer size + 2 * compression buffer size)`.
 
 When using the default parameters:
 
@@ -484,6 +486,10 @@ When using the default parameters:
 Reducing the writing buffer size will reduce the required memory but also affect the performance time.
 
 
+### Testing datasets
+
+We have attached a simple python script to generate paired-end fastq files. The script is available under [`mgikit/testing_data/generate_fastq/`](https://github.com/sagc-bioinformatics/mgikit/tree/main/testing_data/generate_fastq). You can use this script to generate large fastq files with random content for testing as described in the readme file under the directory.
+
 ### Execution examples
 
 You can use the datasets at `testing_data` to perform these tests.

diff --git a/src/lib.rs b/src/lib.rs
@@ -333,8 +333,8 @@ fn create_output_file_name(sample_name: &String, lane: &String, sample_index: us
         }
     }else
     {
-        //return (format!("{}_{}_R1.fastq.gz", sample_name, lane), format!("{}_{}_R2.fastq.gz", sample_name, lane));
-        return (format!("{}_R1.fastq.gz", sample_name), format!("{}_R2.fastq.gz", sample_name));
+        return (format!("{}_{}_R1.fastq.gz", sample_name, lane), format!("{}_{}_R2.fastq.gz", sample_name, lane));
+        //return (format!("{}_R1.fastq.gz", sample_name), format!("{}_R2.fastq.gz", sample_name));
     }    
 }
 
@@ -348,7 +348,7 @@ fn get_reader(input_file:&PathBuf) -> Box< dyn Read>{
     reader
 }
 
-fn get_flowcell_info(mgi_header:&String) ->(String, usize){
+fn get_flowcell_lane_info(mgi_header:&String) ->(String, usize, String){
     let mut l_position = mgi_header.len() - 1;
     for header_chr in mgi_header.chars().rev() {
         if header_chr == 'L' {
@@ -362,9 +362,10 @@ fn get_flowcell_info(mgi_header:&String) ->(String, usize){
         panic!("Can not find the flowcell id in this header {}!", mgi_header);
     }
     let flowcell = mgi_header[1..l_position].to_string();
+    let lane = mgi_header[l_position + 1..l_position + 2].to_string();
 
     info!("Detected flowcell from the header of the first read is {}.", flowcell);
-    (flowcell, l_position)
+    (flowcell, l_position, lane)
 }
 
 fn parse_sb_file_name(file_name: &String) -> (String, String, String, String){
@@ -681,7 +682,8 @@ pub fn demultiplex(
     compression_buffer_size: usize,
     ignore_undetermined:bool,
     all_index_error:bool,
-    memory:f64
+    memory:f64,
+    not_mgi:bool
 ) -> Result<(), Box<dyn Error>> {
 
     // Validate and prepare input data and parameters.
@@ -690,10 +692,13 @@ pub fn demultiplex(
 
     let (paired_read_file_path_final, read_barcode_file_path_final, single_read_input) = 
     if input_folder_path.len() > 0{
+        info!("Input directory: {}", input_folder_path);
         get_read_files_from_input_dir(input_folder_path, read1_file_name_suf, read2_file_name_suf)
     }else{
         validate_and_assigne_input_reads(read1_file_path, read2_file_path)
     };
+
+    info!("MGI input fastq files:  {}", !not_mgi);
 
     let (mut instrument, mut run) = 
                 parse_info_file(
@@ -711,7 +716,7 @@ pub fn demultiplex(
         run = arg_run.clone();
     }
 
-    let lane: String = match arg_lane.len() > 0{
+    let mut lane: String = match arg_lane.len() > 0{
         true => arg_lane.clone(),
         false => get_lane_from_file(&read_barcode_file_path_final , 1, '_')
     };
@@ -723,16 +728,29 @@ pub fn demultiplex(
         panic!("Sample sheet file is invalid!");
     }
     check_file(sample_sheet_file_path);
-
-
+
+    // Printing analysis information
+
     info!("Output directory: {}", output_directory.display());
     info!("Reports directory: {}", report_directory.display());
-    info!("Instrumnet: {}", instrument);
-    info!("Run: {}", run);
-    info!("Lane: {}", lane);
-    info!("Comprehensive scan mood: {}", comprehensive_scan);
+
+    info!("Comprehensive scan mode: {}", comprehensive_scan);
     info!("Dynamic read determination: {}.", dynamic_demultiplexing);
-    info!("Compression level: {}. (0 no compression but fast, 12 best compression but slow.)", compression_level);
+    info!("Allowed mismatches when finding the index are: {}", allowed_mismatches);
+
+    if all_index_error{
+        info!(
+            "The allowed mismatches will be compared to the total mismatches for all indicies combined."
+        );
+    }else{
+        info!(
+            "The allowed mismatches will be considered per index."
+        );
+    }
+    if allowed_mismatches > 2{
+        warn!("It is not recommended to allow more than 2 mismatches per index! This might decrease the accuracy of the demultipexing!");
+    }
+
     info!("Trim Barcode: {}", trim_barcode);
     // writing_threshold: usize, read_merging_threshold
     let writing_buffer_size: usize;
@@ -752,39 +770,27 @@ pub fn demultiplex(
     if compression_buffer_size > writing_buffer_size{
         panic!("Compression buffer size '--compression-buffer-size' should be less than Writing buffer size ('--writing-buffer-size').");
     }
-    info!(
-        "Reads that match with multiple samples will be saved in ambiguous_read1/2.fastq file"
-    );
-    info!("The paired read files are assumed to contain the paired reads in the same order!");
-    info!(
-        "Allowed mismatches when finding the index are: {}",
-        allowed_mismatches
-    );
-    if all_index_error{
-        info!(
-            "The allowed mismatches will be compared to the total mismatches for all indicies combined."
-        );
-    }else{
-        info!(
-            "The allowed mismatches will be consdered per index."
-        );
-    }
+    info!("Compression level: {}. (0 no compression but fast, 12 best compression but slow.)", compression_level);
+
     //let per_index_mismatch = if all_index_error 
 
-    if allowed_mismatches > 2{
-        warn!("It is not recommended to allow more than 2 mismatches per index! This might decrease the accuracy of the demultipexing!");
-    }
-
+
     let trim_barcode = !keep_barcode;
     let illumina_format = !disable_illumina_format;
     if 0.0 < memory && memory <= 0.5{
         panic!("Requested memory should be greater than 0.5 GB!")
     }
-
 
+    info!(
+        "Reads that match with multiple samples will be saved in ambiguous_read1/2.fastq file"
+    );
+    info!("The paired read files are assumed to contain the paired reads in the same order!");
 
+    if illumina_format && not_mgi{
+        panic!("mgikit does not refomat output files in Illumina foramt unless the input fastq files are in MGI format! Disable `--not-mgi` or enable `--disable-illumina-format`");
+    }
+
     //  Get reads information 
-
     let mut whole_read_barcode_len: usize ;
     let mut whole_paired_read_len: usize = 0;
     let barcode_read_length: usize;
@@ -794,48 +800,48 @@ pub fn demultiplex(
     let header_length_r2:usize;
     let mut header_length_r1: usize = 0;   
     let flowcell:String;
+    let header_lane:String;
     let l_position: usize;
     let only_plus_r2: bool;
 
-    {
-        let mut reader_barcode_read_tmp = get_buf_reader(&read_barcode_file_path_final); 
-        let (header, seq, plus, quality) = get_read_parts(&mut reader_barcode_read_tmp);
-
-        whole_read_barcode_len = header.len() + seq.len() + plus.len() + quality.len();
-        header_length_r2 = header.len();
-        (flowcell, l_position) = get_flowcell_info(&header);    
+    let mut reader_barcode_read_tmp = get_buf_reader(&read_barcode_file_path_final); 
+    let (header, seq, plus, quality) = get_read_parts(&mut reader_barcode_read_tmp);
+
+    whole_read_barcode_len = header.len() + seq.len() + plus.len() + quality.len();
+    header_length_r2 = header.len();
+    if not_mgi{
+        flowcell = Local::now().format("%Y%m%dT%H%M%S").to_string();
+        l_position = 0;
+    }else{
+        (flowcell, l_position, header_lane) = get_flowcell_lane_info(&header);
         info!("Detected flowcell from the header of the first read is {}.", flowcell);
-
-        barcode_read_length = seq.chars().count() - 1;
-        only_plus_r2 = plus == "+\n";
-        if ! only_plus_r2 && ! dynamic_demultiplexing{
-            panic!("Expected read format is not satisified. You can try rerunning using --flexible parameter.");
+        info!("Detected lane from the header of the first read is {}.", header_lane);
+        if "1234".contains(&header_lane) {
+            warn!("The detected lane ( = {}) is not recognised! Expected 1, 2, 3 or 4!", header_lane);
         }
-
-
-        if !single_read_input {
-            let mut reader_paired_read_buff = get_buf_reader(&paired_read_file_path_final);
-            let (header, seq, plus, quality) = get_read_parts(&mut reader_paired_read_buff);
-            whole_paired_read_len = header.len() + seq.len() + plus.len() + quality.len();
-            header_length_r1 = header.len();
-            paired_read_length = seq.chars().count() - 1;
-            only_plus_r1 = plus == "+\n";
-            if ! only_plus_r1{
-                panic!("Expected read format is not satisified. You can try running --flexible command.");
+        if lane.len() == 0 {
+            lane = format!("L0{}", header_lane);
+        }else{
+            if lane != format!("L0{}", header_lane){
+                warn!("The lane in the read header (L0{}) does not match with the lane provided or extracted from the input file name {}!", header_lane, lane);
             }
         }
+    }
+
+    if lane.len() == 0{
 
     }
-    info!("The length of the read with barcode is: {}", barcode_read_length);
-    info!("The length of the paired read is: {}", paired_read_length);
-    //println!("ZZLENGTH {}  -  {}", whole_paired_read_len, whole_read_barcode_len);
 
+
 
     let mut illumina_header_prefix_str = String::new();
     let mut illumina_header_prefix = illumina_header_prefix_str.as_bytes();
 
     if illumina_format {
         info!("Read header and Output files: Illumina format.");
+        info!("Instrumnet: {}", instrument);
+        info!("Run: {}", run);
+        info!("Lane: {}", lane);
         if lane.len() == 0 || instrument.len() == 0 || run.len() == 0 {
             panic!("Instrument id and run number are required for QC reports and when Illumina format is requested!")
         }
@@ -844,11 +850,40 @@ pub fn demultiplex(
 
     }else {
         info!("Read header and Output files: MGI format.");
+        info!("Lane: {}", lane);
         if lane.len() == 0 {
             panic!("Lane number is required for QC reports!")
         }
     }
 
+
+
+    barcode_read_length = seq.chars().count() - 1;
+    only_plus_r2 = plus == "+\n";
+
+    if ! only_plus_r2 && ! dynamic_demultiplexing{
+        panic!("Expected read format is not satisified. You can try rerunning using --flexible parameter.");
+    }
+
+    if !single_read_input {
+        let mut reader_paired_read_buff = get_buf_reader(&paired_read_file_path_final);
+        let (header, seq, plus, quality) = get_read_parts(&mut reader_paired_read_buff);
+        whole_paired_read_len = header.len() + seq.len() + plus.len() + quality.len();
+        header_length_r1 = header.len();
+        paired_read_length = seq.chars().count() - 1;
+        only_plus_r1 = plus == "+\n";
+        if ! only_plus_r1{
+            panic!("Expected read format is not satisified. You can try running --flexible command.");
+        }
+    }
+
+    info!("The length of the read with barcode is: {}", barcode_read_length);
+    info!("The length of the paired read is: {}", paired_read_length);
+    //println!("ZZLENGTH {}  -  {}", whole_paired_read_len, whole_read_barcode_len);
+
+
+
+
 
     // parse sample/index file and get all mismatches
     let mut i7_rc = arg_i7_rc;
@@ -2516,7 +2551,7 @@ pub fn reformat(
     whole_read_barcode_len += tmp_str.len();
     header_length_r2 = tmp_str.len();
 
-    let (flowcell, l_position) = get_flowcell_info(&tmp_str);        
+    let (flowcell, l_position, _) = get_flowcell_lane_info(&tmp_str);        
     if sb_flowcell != flowcell{
         warn!("The flowcell extracted from the file name ({}) is not the same as the flowcell extracted from the rewad header ({})", sb_flowcell, flowcell);
     }

diff --git a/src/main.rs b/src/main.rs
@@ -283,7 +283,14 @@ fn main() {
                         .default_value("0")
                         .value_parser(clap::value_parser!(f64))
                         .help("The requested maximum memory to be used (in giga byte). Check the documentation for memory optimisation options. Default is 0 then the tool will use the available memory on the machine.")
-                )  
+                )
+                .arg(
+                    Arg::new("arg_not_mgi")
+                        .long("not-mgi")
+                        .action(ArgAction::SetTrue)
+                        .default_value("false")
+                        .help("This flag needs to be enabled if the input fastq files don't have MGI format.")
+                    ) 
         )
         .subcommand(
             Command::new("template")
@@ -519,9 +526,7 @@ fn main() {
                         .default_value("")
                         .help("The barcode of the specific sample to calulate the mismatches for the reports. If not provided, no mismtahces will be calculated.")
                 )  
-
-
-
+
         )
         .get_matches();
 
@@ -566,7 +571,7 @@ fn main() {
                 let arg_ignore_undetermined:  &bool = demultiplex_command.get_one::<bool>("arg_ignore_undetermined").unwrap();
                 let arg_all_index_error:  &bool = demultiplex_command.get_one::<bool>("arg_all_index_error").unwrap();
                 let arg_memory: &f64 = demultiplex_command.get_one::<f64>("arg_memory").unwrap();
-
+                let arg_not_mgi: &bool = demultiplex_command.get_one::<bool>("arg_not_mgi").unwrap();
                 match demultiplex(
                     arg_input_folder_path,
                     &mut arg_read1_file_path,
@@ -598,7 +603,8 @@ fn main() {
                     *arg_compression_buffer_size,
                     *arg_ignore_undetermined,
                     *arg_all_index_error,
-                    *arg_memory
+                    *arg_memory,
+                    *arg_not_mgi
                 ) {
                     Ok(_) => {},
                     Err(err) => eprintln!("Error: {}", err),

diff --git a/testing_data/expected/ds015/ds015-0/L01.mgikit.info b/testing_data/expected/ds015/ds015-0/L01.mgikit.info
@@ -0,0 +1,7 @@
+sample	0-mismatches
+Undetermined	8
+SAM-03	4
+SAM-05	3
+SAM-01	2
+SAM-02	2
+SAM-04	2
diff --git a/testing_data/expected/ds015/ds015-0/L01.mgikit.sample_stats b/testing_data/expected/ds015/ds015-0/L01.mgikit.sample_stats
@@ -0,0 +1,7 @@
+job_number	sample_id	r1_qc_30	r2_qc_30	r3_qc_30	r1_bases	r2_bases	r3_bases	r1_qc	r2_qc	r3_qc	all_reads	0-mismatches
+.	SAM-01	100	196	48	100	196	48	3500	6664	1632	2	2
+.	SAM-02	100	196	48	100	196	48	3500	6664	1632	2	2
+.	SAM-03	200	392	96	200	392	96	7000	13328	3264	4	4
+.	SAM-04	100	196	48	100	196	48	3500	6664	1632	2	2
+.	SAM-05	150	294	72	150	294	72	5250	9996	2448	3	3
+.	Undetermined	400	490	144	400	784	192	14000	19306	5208	8	8
diff --git a/testing_data/expected/ds015/ds015-0/L01.mgikit.undetermined_barcode b/testing_data/expected/ds015/ds015-0/L01.mgikit.undetermined_barcode
@@ -0,0 +1,8 @@
+TGAAGACGGGTGACATCTGGGGGG	1
+TCTCAGCAGAAGGGCAGAATCGTG	1
+TCACAGCAGAAGATCAGAATCTTG	1
+GCCTCGTGTCGTGTGAGGGTTTTT	1
+GAGTCGTGTCCTGTGACGATATAT	1
+CTGCAGTAACAAAAAGAAAATTCT	1
+CAATGTGGCCACTGTGCTGGATGA	1
+CAATGGGGCCAAAATGCTGGTGTA	1