Skip to content

Commit

Permalink
Merge pull request #8 from sagc-bioinformatics/V0.1.5
Browse files Browse the repository at this point in the history
V0.1.5
  • Loading branch information
ziadbkh authored Mar 28, 2024
2 parents b0c0b1b + da12975 commit a816ab5
Show file tree
Hide file tree
Showing 131 changed files with 469 additions and 86 deletions.
2 changes: 0 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,13 @@ keywords = ["fastq", "MGI", "demultiplexing"]

[dependencies]
flate2 = { version = "1.0.28", features = ["zlib-ng-compat"], default-features = false }
#gzp = { version = "0.11.3", default-features = false, features = ["deflate_zlib_ng"] }
libz-sys = { version = "1.1.8", default-features = false, features = ["libc"] }
mimalloc = { version = "0.1.34", default-features = false }
combinations = "*"
itertools = "*"
chrono = "0.4"
termion = "2.0.1"
clap = "4.3.21"
noodles-bgzf = "0.26.0"
memchr = "2.6.4"
libdeflater = "1.12.0"
niffler = { version = "2.5.0", default-features = false, features = ["gz"]}
Expand Down
Binary file modified bins/mgikit-V0.1.5.zip
Binary file not shown.
12 changes: 9 additions & 3 deletions docs/pages/demultiplex.md
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,9 @@ the number of allowed mismatches is high.

+ **`--memory`**: The requested maximum memory to be used (in giga byte). Check the documentation for memory optimisation options. Default is 0 then the tool will use the available memory on the machine.

+ **`--not-mgi`**: This flag needs to be enabled if the input fastq files don't have MGI format.


### Understanding input files

MGI sequencing machine output a directory for the run (flowcell_id) with a subdirectory for each lane (L01, L02 ..) depending on the machine.
Expand Down Expand Up @@ -471,9 +473,9 @@ The expected memory usage is influenced by three main factors,

The expected allocated memory is

+ **Single-end input**: `number of smaples * (writing buffer size + 2 * compression buffer size)`.
+ **Single-end input**: `number of samples * (writing buffer size + 2 * compression buffer size)`.

+ **Paired-end input**: `2 * number of smaples * (writing buffer size + 2 * compression buffer size)`.
+ **Paired-end input**: `2 * number of samples * (writing buffer size + 2 * compression buffer size)`.

When using the default parameters:

Expand All @@ -484,6 +486,10 @@ When using the default parameters:
Reducing the writing buffer size will reduce the required memory but also affect the performance time.


### Testing datasets

We have attached a simple python script to generate paired-end fastq files. The script is available under [`mgikit/testing_data/generate_fastq/`](https://github.com/sagc-bioinformatics/mgikit/tree/main/testing_data/generate_fastq). You can use this script to generate large fastq files with random content for testing as described in the readme file under the directory.

### Execution examples

You can use the datasets at `testing_data` to perform these tests.
Expand Down
161 changes: 98 additions & 63 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -333,8 +333,8 @@ fn create_output_file_name(sample_name: &String, lane: &String, sample_index: us
}
}else
{
//return (format!("{}_{}_R1.fastq.gz", sample_name, lane), format!("{}_{}_R2.fastq.gz", sample_name, lane));
return (format!("{}_R1.fastq.gz", sample_name), format!("{}_R2.fastq.gz", sample_name));
return (format!("{}_{}_R1.fastq.gz", sample_name, lane), format!("{}_{}_R2.fastq.gz", sample_name, lane));
//return (format!("{}_R1.fastq.gz", sample_name), format!("{}_R2.fastq.gz", sample_name));
}
}

Expand All @@ -348,7 +348,7 @@ fn get_reader(input_file:&PathBuf) -> Box< dyn Read>{
reader
}

fn get_flowcell_info(mgi_header:&String) ->(String, usize){
fn get_flowcell_lane_info(mgi_header:&String) ->(String, usize, String){
let mut l_position = mgi_header.len() - 1;
for header_chr in mgi_header.chars().rev() {
if header_chr == 'L' {
Expand All @@ -362,9 +362,10 @@ fn get_flowcell_info(mgi_header:&String) ->(String, usize){
panic!("Can not find the flowcell id in this header {}!", mgi_header);
}
let flowcell = mgi_header[1..l_position].to_string();
let lane = mgi_header[l_position + 1..l_position + 2].to_string();

info!("Detected flowcell from the header of the first read is {}.", flowcell);
(flowcell, l_position)
(flowcell, l_position, lane)
}

fn parse_sb_file_name(file_name: &String) -> (String, String, String, String){
Expand Down Expand Up @@ -681,7 +682,8 @@ pub fn demultiplex(
compression_buffer_size: usize,
ignore_undetermined:bool,
all_index_error:bool,
memory:f64
memory:f64,
not_mgi:bool
) -> Result<(), Box<dyn Error>> {

// Validate and prepare input data and parameters.
Expand All @@ -690,10 +692,13 @@ pub fn demultiplex(

let (paired_read_file_path_final, read_barcode_file_path_final, single_read_input) =
if input_folder_path.len() > 0{
info!("Input directory: {}", input_folder_path);
get_read_files_from_input_dir(input_folder_path, read1_file_name_suf, read2_file_name_suf)
}else{
validate_and_assigne_input_reads(read1_file_path, read2_file_path)
};

info!("MGI input fastq files: {}", !not_mgi);

let (mut instrument, mut run) =
parse_info_file(
Expand All @@ -711,7 +716,7 @@ pub fn demultiplex(
run = arg_run.clone();
}

let lane: String = match arg_lane.len() > 0{
let mut lane: String = match arg_lane.len() > 0{
true => arg_lane.clone(),
false => get_lane_from_file(&read_barcode_file_path_final , 1, '_')
};
Expand All @@ -723,16 +728,29 @@ pub fn demultiplex(
panic!("Sample sheet file is invalid!");
}
check_file(sample_sheet_file_path);



// Printing analysis information

info!("Output directory: {}", output_directory.display());
info!("Reports directory: {}", report_directory.display());
info!("Instrumnet: {}", instrument);
info!("Run: {}", run);
info!("Lane: {}", lane);
info!("Comprehensive scan mood: {}", comprehensive_scan);

info!("Comprehensive scan mode: {}", comprehensive_scan);
info!("Dynamic read determination: {}.", dynamic_demultiplexing);
info!("Compression level: {}. (0 no compression but fast, 12 best compression but slow.)", compression_level);
info!("Allowed mismatches when finding the index are: {}", allowed_mismatches);

if all_index_error{
info!(
"The allowed mismatches will be compared to the total mismatches for all indicies combined."
);
}else{
info!(
"The allowed mismatches will be considered per index."
);
}
if allowed_mismatches > 2{
warn!("It is not recommended to allow more than 2 mismatches per index! This might decrease the accuracy of the demultipexing!");
}

info!("Trim Barcode: {}", trim_barcode);
// writing_threshold: usize, read_merging_threshold
let writing_buffer_size: usize;
Expand All @@ -752,39 +770,27 @@ pub fn demultiplex(
if compression_buffer_size > writing_buffer_size{
panic!("Compression buffer size '--compression-buffer-size' should be less than Writing buffer size ('--writing-buffer-size').");
}
info!(
"Reads that match with multiple samples will be saved in ambiguous_read1/2.fastq file"
);
info!("The paired read files are assumed to contain the paired reads in the same order!");
info!(
"Allowed mismatches when finding the index are: {}",
allowed_mismatches
);
if all_index_error{
info!(
"The allowed mismatches will be compared to the total mismatches for all indicies combined."
);
}else{
info!(
"The allowed mismatches will be consdered per index."
);
}
info!("Compression level: {}. (0 no compression but fast, 12 best compression but slow.)", compression_level);

//let per_index_mismatch = if all_index_error

if allowed_mismatches > 2{
warn!("It is not recommended to allow more than 2 mismatches per index! This might decrease the accuracy of the demultipexing!");
}


let trim_barcode = !keep_barcode;
let illumina_format = !disable_illumina_format;
if 0.0 < memory && memory <= 0.5{
panic!("Requested memory should be greater than 0.5 GB!")
}


info!(
"Reads that match with multiple samples will be saved in ambiguous_read1/2.fastq file"
);
info!("The paired read files are assumed to contain the paired reads in the same order!");

if illumina_format && not_mgi{
panic!("mgikit does not refomat output files in Illumina foramt unless the input fastq files are in MGI format! Disable `--not-mgi` or enable `--disable-illumina-format`");
}

// Get reads information

let mut whole_read_barcode_len: usize ;
let mut whole_paired_read_len: usize = 0;
let barcode_read_length: usize;
Expand All @@ -794,48 +800,48 @@ pub fn demultiplex(
let header_length_r2:usize;
let mut header_length_r1: usize = 0;
let flowcell:String;
let header_lane:String;
let l_position: usize;
let only_plus_r2: bool;

{
let mut reader_barcode_read_tmp = get_buf_reader(&read_barcode_file_path_final);
let (header, seq, plus, quality) = get_read_parts(&mut reader_barcode_read_tmp);

whole_read_barcode_len = header.len() + seq.len() + plus.len() + quality.len();
header_length_r2 = header.len();
(flowcell, l_position) = get_flowcell_info(&header);
let mut reader_barcode_read_tmp = get_buf_reader(&read_barcode_file_path_final);
let (header, seq, plus, quality) = get_read_parts(&mut reader_barcode_read_tmp);

whole_read_barcode_len = header.len() + seq.len() + plus.len() + quality.len();
header_length_r2 = header.len();
if not_mgi{
flowcell = Local::now().format("%Y%m%dT%H%M%S").to_string();
l_position = 0;
}else{
(flowcell, l_position, header_lane) = get_flowcell_lane_info(&header);
info!("Detected flowcell from the header of the first read is {}.", flowcell);

barcode_read_length = seq.chars().count() - 1;
only_plus_r2 = plus == "+\n";
if ! only_plus_r2 && ! dynamic_demultiplexing{
panic!("Expected read format is not satisified. You can try rerunning using --flexible parameter.");
info!("Detected lane from the header of the first read is {}.", header_lane);
if "1234".contains(&header_lane) {
warn!("The detected lane ( = {}) is not recognised! Expected 1, 2, 3 or 4!", header_lane);
}


if !single_read_input {
let mut reader_paired_read_buff = get_buf_reader(&paired_read_file_path_final);
let (header, seq, plus, quality) = get_read_parts(&mut reader_paired_read_buff);
whole_paired_read_len = header.len() + seq.len() + plus.len() + quality.len();
header_length_r1 = header.len();
paired_read_length = seq.chars().count() - 1;
only_plus_r1 = plus == "+\n";
if ! only_plus_r1{
panic!("Expected read format is not satisified. You can try running --flexible command.");
if lane.len() == 0 {
lane = format!("L0{}", header_lane);
}else{
if lane != format!("L0{}", header_lane){
warn!("The lane in the read header (L0{}) does not match with the lane provided or extracted from the input file name {}!", header_lane, lane);
}
}
}

if lane.len() == 0{

}
info!("The length of the read with barcode is: {}", barcode_read_length);
info!("The length of the paired read is: {}", paired_read_length);
//println!("ZZLENGTH {} - {}", whole_paired_read_len, whole_read_barcode_len);



let mut illumina_header_prefix_str = String::new();
let mut illumina_header_prefix = illumina_header_prefix_str.as_bytes();

if illumina_format {
info!("Read header and Output files: Illumina format.");
info!("Instrumnet: {}", instrument);
info!("Run: {}", run);
info!("Lane: {}", lane);
if lane.len() == 0 || instrument.len() == 0 || run.len() == 0 {
panic!("Instrument id and run number are required for QC reports and when Illumina format is requested!")
}
Expand All @@ -844,11 +850,40 @@ pub fn demultiplex(

}else {
info!("Read header and Output files: MGI format.");
info!("Lane: {}", lane);
if lane.len() == 0 {
panic!("Lane number is required for QC reports!")
}
}



barcode_read_length = seq.chars().count() - 1;
only_plus_r2 = plus == "+\n";

if ! only_plus_r2 && ! dynamic_demultiplexing{
panic!("Expected read format is not satisified. You can try rerunning using --flexible parameter.");
}

if !single_read_input {
let mut reader_paired_read_buff = get_buf_reader(&paired_read_file_path_final);
let (header, seq, plus, quality) = get_read_parts(&mut reader_paired_read_buff);
whole_paired_read_len = header.len() + seq.len() + plus.len() + quality.len();
header_length_r1 = header.len();
paired_read_length = seq.chars().count() - 1;
only_plus_r1 = plus == "+\n";
if ! only_plus_r1{
panic!("Expected read format is not satisified. You can try running --flexible command.");
}
}

info!("The length of the read with barcode is: {}", barcode_read_length);
info!("The length of the paired read is: {}", paired_read_length);
//println!("ZZLENGTH {} - {}", whole_paired_read_len, whole_read_barcode_len);





// parse sample/index file and get all mismatches
let mut i7_rc = arg_i7_rc;
Expand Down Expand Up @@ -2516,7 +2551,7 @@ pub fn reformat(
whole_read_barcode_len += tmp_str.len();
header_length_r2 = tmp_str.len();

let (flowcell, l_position) = get_flowcell_info(&tmp_str);
let (flowcell, l_position, _) = get_flowcell_lane_info(&tmp_str);
if sb_flowcell != flowcell{
warn!("The flowcell extracted from the file name ({}) is not the same as the flowcell extracted from the rewad header ({})", sb_flowcell, flowcell);
}
Expand Down
18 changes: 12 additions & 6 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,14 @@ fn main() {
.default_value("0")
.value_parser(clap::value_parser!(f64))
.help("The requested maximum memory to be used (in giga byte). Check the documentation for memory optimisation options. Default is 0 then the tool will use the available memory on the machine.")
)
)
.arg(
Arg::new("arg_not_mgi")
.long("not-mgi")
.action(ArgAction::SetTrue)
.default_value("false")
.help("This flag needs to be enabled if the input fastq files don't have MGI format.")
)
)
.subcommand(
Command::new("template")
Expand Down Expand Up @@ -519,9 +526,7 @@ fn main() {
.default_value("")
.help("The barcode of the specific sample to calulate the mismatches for the reports. If not provided, no mismtahces will be calculated.")
)




)
.get_matches();

Expand Down Expand Up @@ -566,7 +571,7 @@ fn main() {
let arg_ignore_undetermined: &bool = demultiplex_command.get_one::<bool>("arg_ignore_undetermined").unwrap();
let arg_all_index_error: &bool = demultiplex_command.get_one::<bool>("arg_all_index_error").unwrap();
let arg_memory: &f64 = demultiplex_command.get_one::<f64>("arg_memory").unwrap();

let arg_not_mgi: &bool = demultiplex_command.get_one::<bool>("arg_not_mgi").unwrap();
match demultiplex(
arg_input_folder_path,
&mut arg_read1_file_path,
Expand Down Expand Up @@ -598,7 +603,8 @@ fn main() {
*arg_compression_buffer_size,
*arg_ignore_undetermined,
*arg_all_index_error,
*arg_memory
*arg_memory,
*arg_not_mgi
) {
Ok(_) => {},
Err(err) => eprintln!("Error: {}", err),
Expand Down
7 changes: 7 additions & 0 deletions testing_data/expected/ds015/ds015-0/L01.mgikit.info
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
sample 0-mismatches
Undetermined 8
SAM-03 4
SAM-05 3
SAM-01 2
SAM-02 2
SAM-04 2
7 changes: 7 additions & 0 deletions testing_data/expected/ds015/ds015-0/L01.mgikit.sample_stats
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
job_number sample_id r1_qc_30 r2_qc_30 r3_qc_30 r1_bases r2_bases r3_bases r1_qc r2_qc r3_qc all_reads 0-mismatches
. SAM-01 100 196 48 100 196 48 3500 6664 1632 2 2
. SAM-02 100 196 48 100 196 48 3500 6664 1632 2 2
. SAM-03 200 392 96 200 392 96 7000 13328 3264 4 4
. SAM-04 100 196 48 100 196 48 3500 6664 1632 2 2
. SAM-05 150 294 72 150 294 72 5250 9996 2448 3 3
. Undetermined 400 490 144 400 784 192 14000 19306 5208 8 8
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
TGAAGACGGGTGACATCTGGGGGG 1
TCTCAGCAGAAGGGCAGAATCGTG 1
TCACAGCAGAAGATCAGAATCTTG 1
GCCTCGTGTCGTGTGAGGGTTTTT 1
GAGTCGTGTCCTGTGACGATATAT 1
CTGCAGTAACAAAAAGAAAATTCT 1
CAATGTGGCCACTGTGCTGGATGA 1
CAATGGGGCCAAAATGCTGGTGTA 1
Loading

0 comments on commit a816ab5

Please sign in to comment.