From bf8b4cec645331f4e66bd6c2f50fa401b11f8f6d Mon Sep 17 00:00:00 2001 From: Jiayu Huang Date: Tue, 12 Nov 2024 23:35:37 -0600 Subject: [PATCH] added reading pwm files --- .gitignore | 3 +- Cargo.toml | 4 +- readme.md | 60 ++++++++++++++++ src/main.rs | 8 +-- src/occupancy.rs | 141 ++++++++++++++++++++++++++++++++++++++ src/types.rs | 9 +++ tests/data/tdmMotifs.meme | 78 +++++++++++++++++++++ 7 files changed, 295 insertions(+), 8 deletions(-) create mode 100644 tests/data/tdmMotifs.meme diff --git a/.gitignore b/.gitignore index 3ff2236..0586a1d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ /target /Cargo.lock -*.py \ No newline at end of file +*.py +/.vscode \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index 209e483..ad50cd5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tf-binding-rs" -version = "0.1.1" +version = "0.1.2" edition = "2021" description = "Fast transcription factor binding site prediction and FASTA manipulation in Rust" license = "MIT" @@ -10,7 +10,7 @@ license = "MIT" [dependencies] ndarray = "0.16.1" polars = { version = "0.44.2", features = ["lazy"] } -serde = { version = "1.0", features = ["derive"] } +serde = { version = "1.0.215", features = ["derive"] } thiserror = "2.0.3" statrs = "0.17.1" phf = {version = "0.11.2", features = ["macros"]} diff --git a/readme.md b/readme.md index 582bbfb..782c59d 100644 --- a/readme.md +++ b/readme.md @@ -20,9 +20,69 @@ Built with performance in mind, this library offers a fast and memory-efficient - 📈 Occupancy landscape calculation - 🧮 Statistical and thermodynamic calculations +## Installation + +Add this to your `Cargo.toml`: + +```toml +[dependencies] +tf-binding-rs = "0.1.1" +``` + +Or install using cargo: + +```bash +cargo add tf-binding-rs +``` + +## Examples + +### Reading FASTA Files + +```rust +use tf_binding_rs::fasta; + +fn main() -> Result<(), Box> { + // Read sequences from a FASTA file + let sequences = fasta::read_fasta("path/to/sequences.fasta")?; + + // Print sequence information + println!("Number of sequences: {}", sequences.height()); + + // Calculate GC content + let gc_stats = fasta::gc_content(&sequences)?; + println!("GC content analysis: {:?}", gc_stats); + + Ok(()) +} +``` + +### Working with PWM Files + +```rust +use tf_binding_rs::occupancy; + +fn main() -> Result<(), Box> { + // Read PWM motifs from MEME format file + let pwm_collection = occupancy::read_pwm_files("path/to/motifs.meme")?; + + // Process each motif + for (motif_id, pwm) in pwm_collection { + println!("Processing motif: {}", motif_id); + println!("Matrix dimensions: {:?}", pwm.shape()); + } + + Ok(()) +} +``` + ## Use Cases - Genomic sequence analysis - TF binding site prediction - Regulatory sequence characterization - High-throughput DNA sequence processing + +## Documentation + +For detailed API documentation, visit [docs.rs/tf-binding-rs](https://docs.rs/tf-binding-rs) diff --git a/src/main.rs b/src/main.rs index 368b9eb..7e62262 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,9 +1,7 @@ use polars::prelude::*; -use tf_binding_rs::fasta; +use tf_binding_rs::occupancy; fn main() { - let df: DataFrame = fasta::read_fasta("tests/data/test1.fasta").unwrap(); - let restrictions = ["AGCTTTTTAATAGAGTCAGCAAAACTGAA", "TCACTGA"]; - let has_restriction_sites_df = fasta::has_restriction_sites(&df, &restrictions).unwrap(); - println!("{:?}", has_restriction_sites_df); + let df = occupancy::read_pwm_files("tests/data/tdmMotifs.meme").unwrap(); + println!("{:?}", df); } diff --git a/src/occupancy.rs b/src/occupancy.rs index e69de29..2373598 100644 --- a/src/occupancy.rs +++ b/src/occupancy.rs @@ -0,0 +1,141 @@ +use crate::error::MotifError; +use crate::types::*; +use polars::prelude::*; +use std::collections::HashMap; +use std::fs::File; +use std::io::{BufRead, BufReader}; +use std::iter::Peekable; + +/// Advances the iterator until a MOTIF line is found +fn skip_until_motif(lines: &mut Peekable) +where + I: Iterator>, +{ + while let Some(Ok(line)) = lines.peek() { + if line.starts_with("MOTIF") { + break; + } + lines.next(); + } +} + +/// Parses a single PWM from the iterator +fn parse_pwm(lines: &mut I) -> Result, MotifError> +where + I: Iterator>, +{ + // Get motif ID from MOTIF line + let motif_line = match lines.next() { + Some(Ok(line)) if line.starts_with("MOTIF") => line, + _ => return Ok(None), + }; + + let motif_id = motif_line + .split_whitespace() + .nth(1) + .ok_or_else(|| MotifError::InvalidFileFormat("Missing motif ID".into()))? + .to_string(); + + // Skip header lines + for _ in 0..2 { + lines.next(); + } + + // Read PWM rows until we hit a non-PWM line + let pwm_rows: Vec> = lines + .take_while(|line| { + line.as_ref() + .map(|l| l.starts_with(|c: char| c.is_whitespace() || c == '0' || c == '1')) + .unwrap_or(false) + }) + .map(|line| { + let line = line.map_err(|e| MotifError::Io(e))?; + let values: Vec = line + .split_whitespace() + .map(|s| s.parse::()) + .collect::, _>>() + .map_err(|e| MotifError::InvalidFileFormat(format!("Invalid PWM value: {}", e)))?; + + Ok(values) + }) + .collect::, MotifError>>()?; + + if pwm_rows.is_empty() { + return Err(MotifError::InvalidFileFormat("Empty PWM".into())); + } + + // Create PWM DataFrame + let pwm = DataFrame::new(vec![ + Column::new( + "A".into(), + pwm_rows.iter().map(|row| row[0]).collect::>(), + ), + Column::new( + "C".into(), + pwm_rows.iter().map(|row| row[1]).collect::>(), + ), + Column::new( + "G".into(), + pwm_rows.iter().map(|row| row[2]).collect::>(), + ), + Column::new( + "T".into(), + pwm_rows.iter().map(|row| row[3]).collect::>(), + ), + ]) + .map_err(|e| MotifError::DataError(e.to_string()))?; + + Ok(Some((motif_id, pwm))) +} + +/// Reads Position Weight Matrices (PWMs) from a MEME format file +/// +/// This function parses a MEME-formatted file containing one or more Position Weight Matrices, +/// each identified by a unique motif ID. The PWMs represent DNA binding motifs where each position +/// contains probabilities for the four nucleotides (A, C, G, T). +/// +/// # Arguments +/// * `filename` - Path to the MEME format file to read +/// +/// # Returns +/// * `Result` - A HashMap where keys are motif IDs and values are their corresponding PWMs +/// +/// # Errors +/// * `MotifError::Io` - If the file cannot be opened or read +/// * `MotifError::InvalidFileFormat` - If the file format is invalid or no PWMs are found +/// * `MotifError::DataError` - If there are issues creating the PWM DataFrame +/// +/// # Example +/// ```no_run +/// use your_crate::occupancy::read_pwm_files; +/// +/// let pwms = read_pwm_files("path/to/motifs.meme")?; +/// for (motif_id, pwm) in pwms { +/// println!("Found motif: {}", motif_id); +/// } +/// ``` +/// +/// # Format +/// The input file should be in MEME format, where each PWM is preceded by a "MOTIF" line +/// containing the motif ID, followed by the matrix values. +pub fn read_pwm_files(filename: &str) -> Result { + let file = File::open(filename)?; + let reader = BufReader::new(file); + let mut lines = reader.lines().peekable(); + let mut pwms = HashMap::new(); + + // Skip header until first MOTIF + skip_until_motif(&mut lines); + + // Parse all PWMs + while let Some((id, pwm)) = parse_pwm(&mut lines)? { + pwms.insert(id, pwm); + skip_until_motif(&mut lines); + } + + if pwms.is_empty() { + return Err(MotifError::InvalidFileFormat("No PWMs found".into())); + } + + Ok(pwms) +} diff --git a/src/types.rs b/src/types.rs index e69de29..2630056 100644 --- a/src/types.rs +++ b/src/types.rs @@ -0,0 +1,9 @@ +use polars::prelude::*; +use std::collections::HashMap; + +/// Represents a Position Weight Matrix (PWM) +/// Stored as a DataFrame with columns A, C, G, T +pub type PWM = DataFrame; + +/// Collection of PWMs indexed by motif ID +pub type PWMCollection = HashMap; diff --git a/tests/data/tdmMotifs.meme b/tests/data/tdmMotifs.meme new file mode 100644 index 0000000..50062c4 --- /dev/null +++ b/tests/data/tdmMotifs.meme @@ -0,0 +1,78 @@ +MEME version 5.0.4 + +ALPHABET "DNA" DNA-LIKE +A "Adenine" CC0000 ~ T "Thymine" 008000 +C "Cytosine" 0000CC ~ G "Guanine" FFB300 +N "Any base" = ACGT +X = ACGT +. = ACGT +V "Not T" = ACG +H "Not G" = ACT +D "Not C" = AGT +B "Not A" = CGT +M "Amino" = AC +R "Purine" = AG +W "Weak" = AT +S "Strong" = CG +Y "Pyrimidine" = CT +K "Keto" = GT +U = T +END ALPHABET + +strands: + - + +Background letter frequencies (from unknown source): + A 0.250 C 0.250 G 0.250 T 0.250 + + +MOTIF GFI1_MOUSE.H11MO.0.C +r +letter-probability matrix: alength= 4 w= 10 nsites= 150 E= 1.0e+000 + 0.291469 0.069515 0.587580 0.051437 + 0.077749 0.684556 0.139318 0.098377 + 0.405185 0.100381 0.075645 0.418790 + 0.052015 0.285514 0.563133 0.099338 + 0.239209 0.016978 0.079967 0.663846 + 0.000000 0.000000 1.000000 0.000000 + 1.000000 0.000000 0.000000 0.000000 + 0.000000 0.000000 0.000000 1.000000 + 0.000000 0.000000 0.000000 1.000000 + 0.103502 0.118062 0.190824 0.587611 + +URL http://hocomoco.autosome.ru/motif/GFI1_MOUSE.H11MO.0.C + + +MOTIF MAZ_MOUSE.H11MO.1.A + +letter-probability matrix: alength= 4 w= 11 nsites= 486 E= 1.0e+000 + 0.323045 0.137860 0.512346 0.026749 + 0.487654 0.045267 0.368313 0.098765 + 0.065844 0.006173 0.921811 0.006173 + 0.000000 0.000000 0.989712 0.010288 + 0.002058 0.006173 0.989712 0.002058 + 0.648148 0.302469 0.000000 0.049383 + 0.028807 0.002058 0.967078 0.002058 + 0.004115 0.000000 0.989712 0.006173 + 0.123457 0.016461 0.837449 0.022634 + 0.123457 0.006173 0.866255 0.004115 + 0.115226 0.341564 0.469136 0.074074 + +URL http://hocomoco.autosome.ru/motif/MAZ_MOUSE.H11MO.1.A + + +MOTIF NRL_HUMAN.MA0842.1 + +letter-probability matrix: alength= 4 w= 11 nsites= 5786 E= 0.0e+000 + 0.389043 0.133426 0.212236 0.265296 + 0.439088 0.105063 0.151201 0.304648 + 0.344971 0.153992 0.102834 0.398203 + 0.241576 0.238120 0.213582 0.306722 + 0.090677 0.199953 0.036736 0.672634 + 0.008534 0.003926 0.987541 0.000000 + 0.136911 0.839159 0.000000 0.023930 + 0.068134 0.041275 0.012595 0.877997 + 0.051302 0.013023 0.761115 0.174559 + 0.868638 0.048191 0.022219 0.060952 + 0.047010 0.571206 0.158140 0.223643 + +URL http://jaspar.genereg.net/matrix/MA0842.1