Skip to content

Commit

Permalink
added reading pwm files
Browse files Browse the repository at this point in the history
  • Loading branch information
peter6866 committed Nov 13, 2024
1 parent 8f3c445 commit bf8b4ce
Show file tree
Hide file tree
Showing 7 changed files with 295 additions and 8 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
/target
/Cargo.lock
*.py
*.py
/.vscode
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "tf-binding-rs"
version = "0.1.1"
version = "0.1.2"
edition = "2021"
description = "Fast transcription factor binding site prediction and FASTA manipulation in Rust"
license = "MIT"
Expand All @@ -10,7 +10,7 @@ license = "MIT"
[dependencies]
ndarray = "0.16.1"
polars = { version = "0.44.2", features = ["lazy"] }
serde = { version = "1.0", features = ["derive"] }
serde = { version = "1.0.215", features = ["derive"] }
thiserror = "2.0.3"
statrs = "0.17.1"
phf = {version = "0.11.2", features = ["macros"]}
60 changes: 60 additions & 0 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,69 @@ Built with performance in mind, this library offers a fast and memory-efficient
- 📈 Occupancy landscape calculation
- 🧮 Statistical and thermodynamic calculations

## Installation

Add this to your `Cargo.toml`:

```toml
[dependencies]
tf-binding-rs = "0.1.1"
```

Or install using cargo:

```bash
cargo add tf-binding-rs
```

## Examples

### Reading FASTA Files

```rust
use tf_binding_rs::fasta;

fn main() -> Result<(), Box<dyn std::error::Error>> {
// Read sequences from a FASTA file
let sequences = fasta::read_fasta("path/to/sequences.fasta")?;

// Print sequence information
println!("Number of sequences: {}", sequences.height());

// Calculate GC content
let gc_stats = fasta::gc_content(&sequences)?;
println!("GC content analysis: {:?}", gc_stats);

Ok(())
}
```

### Working with PWM Files

```rust
use tf_binding_rs::occupancy;

fn main() -> Result<(), Box<dyn std::error::Error>> {
// Read PWM motifs from MEME format file
let pwm_collection = occupancy::read_pwm_files("path/to/motifs.meme")?;

// Process each motif
for (motif_id, pwm) in pwm_collection {
println!("Processing motif: {}", motif_id);
println!("Matrix dimensions: {:?}", pwm.shape());
}

Ok(())
}
```

## Use Cases

- Genomic sequence analysis
- TF binding site prediction
- Regulatory sequence characterization
- High-throughput DNA sequence processing

## Documentation

For detailed API documentation, visit [docs.rs/tf-binding-rs](https://docs.rs/tf-binding-rs)
8 changes: 3 additions & 5 deletions src/main.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
use polars::prelude::*;
use tf_binding_rs::fasta;
use tf_binding_rs::occupancy;

fn main() {
let df: DataFrame = fasta::read_fasta("tests/data/test1.fasta").unwrap();
let restrictions = ["AGCTTTTTAATAGAGTCAGCAAAACTGAA", "TCACTGA"];
let has_restriction_sites_df = fasta::has_restriction_sites(&df, &restrictions).unwrap();
println!("{:?}", has_restriction_sites_df);
let df = occupancy::read_pwm_files("tests/data/tdmMotifs.meme").unwrap();
println!("{:?}", df);
}
141 changes: 141 additions & 0 deletions src/occupancy.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
use crate::error::MotifError;
use crate::types::*;
use polars::prelude::*;
use std::collections::HashMap;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::iter::Peekable;

/// Advances the iterator until a MOTIF line is found
fn skip_until_motif<I>(lines: &mut Peekable<I>)
where
I: Iterator<Item = Result<String, std::io::Error>>,
{
while let Some(Ok(line)) = lines.peek() {
if line.starts_with("MOTIF") {
break;
}
lines.next();
}
}

/// Parses a single PWM from the iterator
fn parse_pwm<I>(lines: &mut I) -> Result<Option<(String, PWM)>, MotifError>
where
I: Iterator<Item = Result<String, std::io::Error>>,
{
// Get motif ID from MOTIF line
let motif_line = match lines.next() {
Some(Ok(line)) if line.starts_with("MOTIF") => line,
_ => return Ok(None),
};

let motif_id = motif_line
.split_whitespace()
.nth(1)
.ok_or_else(|| MotifError::InvalidFileFormat("Missing motif ID".into()))?
.to_string();

// Skip header lines
for _ in 0..2 {
lines.next();
}

// Read PWM rows until we hit a non-PWM line
let pwm_rows: Vec<Vec<f64>> = lines
.take_while(|line| {
line.as_ref()
.map(|l| l.starts_with(|c: char| c.is_whitespace() || c == '0' || c == '1'))
.unwrap_or(false)
})
.map(|line| {
let line = line.map_err(|e| MotifError::Io(e))?;
let values: Vec<f64> = line
.split_whitespace()
.map(|s| s.parse::<f64>())
.collect::<Result<Vec<_>, _>>()
.map_err(|e| MotifError::InvalidFileFormat(format!("Invalid PWM value: {}", e)))?;

Ok(values)
})
.collect::<Result<Vec<_>, MotifError>>()?;

if pwm_rows.is_empty() {
return Err(MotifError::InvalidFileFormat("Empty PWM".into()));
}

// Create PWM DataFrame
let pwm = DataFrame::new(vec![
Column::new(
"A".into(),
pwm_rows.iter().map(|row| row[0]).collect::<Vec<f64>>(),
),
Column::new(
"C".into(),
pwm_rows.iter().map(|row| row[1]).collect::<Vec<f64>>(),
),
Column::new(
"G".into(),
pwm_rows.iter().map(|row| row[2]).collect::<Vec<f64>>(),
),
Column::new(
"T".into(),
pwm_rows.iter().map(|row| row[3]).collect::<Vec<f64>>(),
),
])
.map_err(|e| MotifError::DataError(e.to_string()))?;

Ok(Some((motif_id, pwm)))
}

/// Reads Position Weight Matrices (PWMs) from a MEME format file
///
/// This function parses a MEME-formatted file containing one or more Position Weight Matrices,
/// each identified by a unique motif ID. The PWMs represent DNA binding motifs where each position
/// contains probabilities for the four nucleotides (A, C, G, T).
///
/// # Arguments
/// * `filename` - Path to the MEME format file to read
///
/// # Returns
/// * `Result<PWMCollection, MotifError>` - A HashMap where keys are motif IDs and values are their corresponding PWMs
///
/// # Errors
/// * `MotifError::Io` - If the file cannot be opened or read
/// * `MotifError::InvalidFileFormat` - If the file format is invalid or no PWMs are found
/// * `MotifError::DataError` - If there are issues creating the PWM DataFrame
///
/// # Example
/// ```no_run
/// use your_crate::occupancy::read_pwm_files;
///
/// let pwms = read_pwm_files("path/to/motifs.meme")?;
/// for (motif_id, pwm) in pwms {
/// println!("Found motif: {}", motif_id);
/// }
/// ```
///
/// # Format
/// The input file should be in MEME format, where each PWM is preceded by a "MOTIF" line
/// containing the motif ID, followed by the matrix values.
pub fn read_pwm_files(filename: &str) -> Result<PWMCollection, MotifError> {
let file = File::open(filename)?;
let reader = BufReader::new(file);
let mut lines = reader.lines().peekable();
let mut pwms = HashMap::new();

// Skip header until first MOTIF
skip_until_motif(&mut lines);

// Parse all PWMs
while let Some((id, pwm)) = parse_pwm(&mut lines)? {
pwms.insert(id, pwm);
skip_until_motif(&mut lines);
}

if pwms.is_empty() {
return Err(MotifError::InvalidFileFormat("No PWMs found".into()));
}

Ok(pwms)
}
9 changes: 9 additions & 0 deletions src/types.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
use polars::prelude::*;
use std::collections::HashMap;

/// Represents a Position Weight Matrix (PWM)
/// Stored as a DataFrame with columns A, C, G, T
pub type PWM = DataFrame;

/// Collection of PWMs indexed by motif ID
pub type PWMCollection = HashMap<String, PWM>;
78 changes: 78 additions & 0 deletions tests/data/tdmMotifs.meme
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
MEME version 5.0.4

ALPHABET "DNA" DNA-LIKE
A "Adenine" CC0000 ~ T "Thymine" 008000
C "Cytosine" 0000CC ~ G "Guanine" FFB300
N "Any base" = ACGT
X = ACGT
. = ACGT
V "Not T" = ACG
H "Not G" = ACT
D "Not C" = AGT
B "Not A" = CGT
M "Amino" = AC
R "Purine" = AG
W "Weak" = AT
S "Strong" = CG
Y "Pyrimidine" = CT
K "Keto" = GT
U = T
END ALPHABET

strands: + -

Background letter frequencies (from unknown source):
A 0.250 C 0.250 G 0.250 T 0.250


MOTIF GFI1_MOUSE.H11MO.0.C
r
letter-probability matrix: alength= 4 w= 10 nsites= 150 E= 1.0e+000
0.291469 0.069515 0.587580 0.051437
0.077749 0.684556 0.139318 0.098377
0.405185 0.100381 0.075645 0.418790
0.052015 0.285514 0.563133 0.099338
0.239209 0.016978 0.079967 0.663846
0.000000 0.000000 1.000000 0.000000
1.000000 0.000000 0.000000 0.000000
0.000000 0.000000 0.000000 1.000000
0.000000 0.000000 0.000000 1.000000
0.103502 0.118062 0.190824 0.587611

URL http://hocomoco.autosome.ru/motif/GFI1_MOUSE.H11MO.0.C


MOTIF MAZ_MOUSE.H11MO.1.A

letter-probability matrix: alength= 4 w= 11 nsites= 486 E= 1.0e+000
0.323045 0.137860 0.512346 0.026749
0.487654 0.045267 0.368313 0.098765
0.065844 0.006173 0.921811 0.006173
0.000000 0.000000 0.989712 0.010288
0.002058 0.006173 0.989712 0.002058
0.648148 0.302469 0.000000 0.049383
0.028807 0.002058 0.967078 0.002058
0.004115 0.000000 0.989712 0.006173
0.123457 0.016461 0.837449 0.022634
0.123457 0.006173 0.866255 0.004115
0.115226 0.341564 0.469136 0.074074

URL http://hocomoco.autosome.ru/motif/MAZ_MOUSE.H11MO.1.A


MOTIF NRL_HUMAN.MA0842.1

letter-probability matrix: alength= 4 w= 11 nsites= 5786 E= 0.0e+000
0.389043 0.133426 0.212236 0.265296
0.439088 0.105063 0.151201 0.304648
0.344971 0.153992 0.102834 0.398203
0.241576 0.238120 0.213582 0.306722
0.090677 0.199953 0.036736 0.672634
0.008534 0.003926 0.987541 0.000000
0.136911 0.839159 0.000000 0.023930
0.068134 0.041275 0.012595 0.877997
0.051302 0.013023 0.761115 0.174559
0.868638 0.048191 0.022219 0.060952
0.047010 0.571206 0.158140 0.223643

URL http://jaspar.genereg.net/matrix/MA0842.1

0 comments on commit bf8b4ce

Please sign in to comment.