-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
7 changed files
with
295 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
/target | ||
/Cargo.lock | ||
*.py | ||
*.py | ||
/.vscode |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,7 @@ | ||
use polars::prelude::*; | ||
use tf_binding_rs::fasta; | ||
use tf_binding_rs::occupancy; | ||
|
||
fn main() { | ||
let df: DataFrame = fasta::read_fasta("tests/data/test1.fasta").unwrap(); | ||
let restrictions = ["AGCTTTTTAATAGAGTCAGCAAAACTGAA", "TCACTGA"]; | ||
let has_restriction_sites_df = fasta::has_restriction_sites(&df, &restrictions).unwrap(); | ||
println!("{:?}", has_restriction_sites_df); | ||
let df = occupancy::read_pwm_files("tests/data/tdmMotifs.meme").unwrap(); | ||
println!("{:?}", df); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,141 @@ | ||
use crate::error::MotifError; | ||
use crate::types::*; | ||
use polars::prelude::*; | ||
use std::collections::HashMap; | ||
use std::fs::File; | ||
use std::io::{BufRead, BufReader}; | ||
use std::iter::Peekable; | ||
|
||
/// Advances the iterator until a MOTIF line is found | ||
fn skip_until_motif<I>(lines: &mut Peekable<I>) | ||
where | ||
I: Iterator<Item = Result<String, std::io::Error>>, | ||
{ | ||
while let Some(Ok(line)) = lines.peek() { | ||
if line.starts_with("MOTIF") { | ||
break; | ||
} | ||
lines.next(); | ||
} | ||
} | ||
|
||
/// Parses a single PWM from the iterator | ||
fn parse_pwm<I>(lines: &mut I) -> Result<Option<(String, PWM)>, MotifError> | ||
where | ||
I: Iterator<Item = Result<String, std::io::Error>>, | ||
{ | ||
// Get motif ID from MOTIF line | ||
let motif_line = match lines.next() { | ||
Some(Ok(line)) if line.starts_with("MOTIF") => line, | ||
_ => return Ok(None), | ||
}; | ||
|
||
let motif_id = motif_line | ||
.split_whitespace() | ||
.nth(1) | ||
.ok_or_else(|| MotifError::InvalidFileFormat("Missing motif ID".into()))? | ||
.to_string(); | ||
|
||
// Skip header lines | ||
for _ in 0..2 { | ||
lines.next(); | ||
} | ||
|
||
// Read PWM rows until we hit a non-PWM line | ||
let pwm_rows: Vec<Vec<f64>> = lines | ||
.take_while(|line| { | ||
line.as_ref() | ||
.map(|l| l.starts_with(|c: char| c.is_whitespace() || c == '0' || c == '1')) | ||
.unwrap_or(false) | ||
}) | ||
.map(|line| { | ||
let line = line.map_err(|e| MotifError::Io(e))?; | ||
let values: Vec<f64> = line | ||
.split_whitespace() | ||
.map(|s| s.parse::<f64>()) | ||
.collect::<Result<Vec<_>, _>>() | ||
.map_err(|e| MotifError::InvalidFileFormat(format!("Invalid PWM value: {}", e)))?; | ||
|
||
Ok(values) | ||
}) | ||
.collect::<Result<Vec<_>, MotifError>>()?; | ||
|
||
if pwm_rows.is_empty() { | ||
return Err(MotifError::InvalidFileFormat("Empty PWM".into())); | ||
} | ||
|
||
// Create PWM DataFrame | ||
let pwm = DataFrame::new(vec![ | ||
Column::new( | ||
"A".into(), | ||
pwm_rows.iter().map(|row| row[0]).collect::<Vec<f64>>(), | ||
), | ||
Column::new( | ||
"C".into(), | ||
pwm_rows.iter().map(|row| row[1]).collect::<Vec<f64>>(), | ||
), | ||
Column::new( | ||
"G".into(), | ||
pwm_rows.iter().map(|row| row[2]).collect::<Vec<f64>>(), | ||
), | ||
Column::new( | ||
"T".into(), | ||
pwm_rows.iter().map(|row| row[3]).collect::<Vec<f64>>(), | ||
), | ||
]) | ||
.map_err(|e| MotifError::DataError(e.to_string()))?; | ||
|
||
Ok(Some((motif_id, pwm))) | ||
} | ||
|
||
/// Reads Position Weight Matrices (PWMs) from a MEME format file | ||
/// | ||
/// This function parses a MEME-formatted file containing one or more Position Weight Matrices, | ||
/// each identified by a unique motif ID. The PWMs represent DNA binding motifs where each position | ||
/// contains probabilities for the four nucleotides (A, C, G, T). | ||
/// | ||
/// # Arguments | ||
/// * `filename` - Path to the MEME format file to read | ||
/// | ||
/// # Returns | ||
/// * `Result<PWMCollection, MotifError>` - A HashMap where keys are motif IDs and values are their corresponding PWMs | ||
/// | ||
/// # Errors | ||
/// * `MotifError::Io` - If the file cannot be opened or read | ||
/// * `MotifError::InvalidFileFormat` - If the file format is invalid or no PWMs are found | ||
/// * `MotifError::DataError` - If there are issues creating the PWM DataFrame | ||
/// | ||
/// # Example | ||
/// ```no_run | ||
/// use your_crate::occupancy::read_pwm_files; | ||
/// | ||
/// let pwms = read_pwm_files("path/to/motifs.meme")?; | ||
/// for (motif_id, pwm) in pwms { | ||
/// println!("Found motif: {}", motif_id); | ||
/// } | ||
/// ``` | ||
/// | ||
/// # Format | ||
/// The input file should be in MEME format, where each PWM is preceded by a "MOTIF" line | ||
/// containing the motif ID, followed by the matrix values. | ||
pub fn read_pwm_files(filename: &str) -> Result<PWMCollection, MotifError> { | ||
let file = File::open(filename)?; | ||
let reader = BufReader::new(file); | ||
let mut lines = reader.lines().peekable(); | ||
let mut pwms = HashMap::new(); | ||
|
||
// Skip header until first MOTIF | ||
skip_until_motif(&mut lines); | ||
|
||
// Parse all PWMs | ||
while let Some((id, pwm)) = parse_pwm(&mut lines)? { | ||
pwms.insert(id, pwm); | ||
skip_until_motif(&mut lines); | ||
} | ||
|
||
if pwms.is_empty() { | ||
return Err(MotifError::InvalidFileFormat("No PWMs found".into())); | ||
} | ||
|
||
Ok(pwms) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
use polars::prelude::*; | ||
use std::collections::HashMap; | ||
|
||
/// Represents a Position Weight Matrix (PWM) | ||
/// Stored as a DataFrame with columns A, C, G, T | ||
pub type PWM = DataFrame; | ||
|
||
/// Collection of PWMs indexed by motif ID | ||
pub type PWMCollection = HashMap<String, PWM>; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
MEME version 5.0.4 | ||
|
||
ALPHABET "DNA" DNA-LIKE | ||
A "Adenine" CC0000 ~ T "Thymine" 008000 | ||
C "Cytosine" 0000CC ~ G "Guanine" FFB300 | ||
N "Any base" = ACGT | ||
X = ACGT | ||
. = ACGT | ||
V "Not T" = ACG | ||
H "Not G" = ACT | ||
D "Not C" = AGT | ||
B "Not A" = CGT | ||
M "Amino" = AC | ||
R "Purine" = AG | ||
W "Weak" = AT | ||
S "Strong" = CG | ||
Y "Pyrimidine" = CT | ||
K "Keto" = GT | ||
U = T | ||
END ALPHABET | ||
|
||
strands: + - | ||
|
||
Background letter frequencies (from unknown source): | ||
A 0.250 C 0.250 G 0.250 T 0.250 | ||
|
||
|
||
MOTIF GFI1_MOUSE.H11MO.0.C | ||
r | ||
letter-probability matrix: alength= 4 w= 10 nsites= 150 E= 1.0e+000 | ||
0.291469 0.069515 0.587580 0.051437 | ||
0.077749 0.684556 0.139318 0.098377 | ||
0.405185 0.100381 0.075645 0.418790 | ||
0.052015 0.285514 0.563133 0.099338 | ||
0.239209 0.016978 0.079967 0.663846 | ||
0.000000 0.000000 1.000000 0.000000 | ||
1.000000 0.000000 0.000000 0.000000 | ||
0.000000 0.000000 0.000000 1.000000 | ||
0.000000 0.000000 0.000000 1.000000 | ||
0.103502 0.118062 0.190824 0.587611 | ||
|
||
URL http://hocomoco.autosome.ru/motif/GFI1_MOUSE.H11MO.0.C | ||
|
||
|
||
MOTIF MAZ_MOUSE.H11MO.1.A | ||
|
||
letter-probability matrix: alength= 4 w= 11 nsites= 486 E= 1.0e+000 | ||
0.323045 0.137860 0.512346 0.026749 | ||
0.487654 0.045267 0.368313 0.098765 | ||
0.065844 0.006173 0.921811 0.006173 | ||
0.000000 0.000000 0.989712 0.010288 | ||
0.002058 0.006173 0.989712 0.002058 | ||
0.648148 0.302469 0.000000 0.049383 | ||
0.028807 0.002058 0.967078 0.002058 | ||
0.004115 0.000000 0.989712 0.006173 | ||
0.123457 0.016461 0.837449 0.022634 | ||
0.123457 0.006173 0.866255 0.004115 | ||
0.115226 0.341564 0.469136 0.074074 | ||
|
||
URL http://hocomoco.autosome.ru/motif/MAZ_MOUSE.H11MO.1.A | ||
|
||
|
||
MOTIF NRL_HUMAN.MA0842.1 | ||
|
||
letter-probability matrix: alength= 4 w= 11 nsites= 5786 E= 0.0e+000 | ||
0.389043 0.133426 0.212236 0.265296 | ||
0.439088 0.105063 0.151201 0.304648 | ||
0.344971 0.153992 0.102834 0.398203 | ||
0.241576 0.238120 0.213582 0.306722 | ||
0.090677 0.199953 0.036736 0.672634 | ||
0.008534 0.003926 0.987541 0.000000 | ||
0.136911 0.839159 0.000000 0.023930 | ||
0.068134 0.041275 0.012595 0.877997 | ||
0.051302 0.013023 0.761115 0.174559 | ||
0.868638 0.048191 0.022219 0.060952 | ||
0.047010 0.571206 0.158140 0.223643 | ||
|
||
URL http://jaspar.genereg.net/matrix/MA0842.1 |