From 110cf1e40c1943385cd7afe7ef2c95684b05f758 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Wed, 27 Mar 2024 13:57:09 -0500 Subject: [PATCH 1/2] support gzipped PAF files --- Cargo.toml | 1 + src/main.rs | 2 +- src/paf.rs | 26 ++++++++++++++++++++------ 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 650fef4..266fd01 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,3 +13,4 @@ itertools = "0.10.0" fnv = "1.0.7" rgb = "0.8" line_drawing = "0.8.0" +flate2 = "1.0.28" \ No newline at end of file diff --git a/src/main.rs b/src/main.rs index 7b9f150..fbbf568 100644 --- a/src/main.rs +++ b/src/main.rs @@ -8,7 +8,7 @@ use crate::paf::{PafFile, paf_query_end, paf_target, paf_target_length, paf_targ fn main() { let matches = App::new("paf2chain") - .version("0.1.0") + .version("0.1.1") .author("Andrea Guarracino") .about("Generate a CHAIN format file from a PAF format file") .arg( diff --git a/src/paf.rs b/src/paf.rs index 965a95e..4abdcef 100644 --- a/src/paf.rs +++ b/src/paf.rs @@ -1,7 +1,9 @@ use boomphf::Mphf; use itertools::Itertools; use std::fs::File; -use std::io::{BufRead, BufReader}; +use std::io::{BufRead, BufReader, Read}; +use std::path::Path; +use flate2::read::GzDecoder; #[derive(Debug, Clone)] struct AlignedSeq { @@ -81,7 +83,19 @@ pub fn paf_target_end(line: &str) -> usize { fn _for_each_line_in_file(paf_filename: &str, mut callback: impl FnMut(&str)) { let file = File::open(paf_filename).unwrap(); - let reader = BufReader::new(file); + // Determine if the file is gzipped based on its extension + let gzipped = Path::new(paf_filename) + .extension() + .map_or(false, |ext| ext == "gz"); + + // Create a dynamic reader based on the file type + let box_reader: Box<dyn Read> = if gzipped { + Box::new(GzDecoder::new(file)) + } else { + Box::new(file) + }; + + let reader = BufReader::new(box_reader); for line in reader.lines() { callback(&line.unwrap()); } @@ -109,7 +123,7 @@ impl PafFile { let query_id = 
query_mphf.hash(&query_name) as usize; if !seen_queries[query_id] { seen_queries[query_id] = true; - let mut query = &mut queries[query_id]; + let query = &mut queries[query_id]; query.name = query_name; query.length = paf_query_length(l); } @@ -117,7 +131,7 @@ impl PafFile { let target_id = target_mphf.hash(&target_name) as usize; if !seen_targets[target_id] { seen_targets[target_id] = true; - let mut target = &mut targets[target_id]; + let target = &mut targets[target_id]; target.name = target_name; target.length = paf_target_length(l); } @@ -129,7 +143,7 @@ impl PafFile { let mut target_offset: usize = 0; targets_sort.iter().for_each(|t| { let target_id = target_mphf.hash(&t.name) as usize; - let mut target = &mut targets[target_id]; + let target = &mut targets[target_id]; target.rank = target_idx; target_idx += 1; target.offset = target_offset; @@ -141,7 +155,7 @@ impl PafFile { let mut query_offset: usize = 0; queries_sort.iter().for_each(|q| { let query_id = query_mphf.hash(&q.name) as usize; - let mut query = &mut queries[query_id]; + let query = &mut queries[query_id]; query.rank = query_idx; query_idx += 1; query.offset = query_offset; From d2e77fb7b05fe20061f3b3c3f789b4d761647d09 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Wed, 27 Mar 2024 13:59:21 -0500 Subject: [PATCH 2/2] README --- Cargo.toml | 2 +- README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 266fd01..4995736 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,4 +13,4 @@ itertools = "0.10.0" fnv = "1.0.7" rgb = "0.8" line_drawing = "0.8.0" -flate2 = "1.0.28" \ No newline at end of file +flate2 = "1.0.28" diff --git a/README.md b/README.md index 10be568..287dd8b 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ cargo install --force --path . Generate alignments with the cigar string attached to the `cg:Z:` tag. These can be made by several aligners, including `minimap2 -c`, `wfmash`, or `lastz --format=paf:wfmash`. 
-With alignments in `aln.paf`, we would convert it into a CHAIN format file using this call: +With alignments in `aln.paf` (or `aln.paf.gz` if gzip-compressed), we would convert it into a CHAIN format file using this call: ``` paf2chain -i aln.paf > aln.chain