Skip to content

Commit

Permalink
optimized fasta utils
Browse files Browse the repository at this point in the history
  • Loading branch information
peter6866 committed Nov 10, 2024
1 parent 3d7c3d9 commit 28cd57d
Show file tree
Hide file tree
Showing 5 changed files with 62 additions and 17 deletions.
3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,6 @@ license = "MIT"
ndarray = "0.16.1"
polars = { version = "0.44.2", features = ["lazy"] }
serde = { version = "1.0", features = ["derive"] }
thiserror = "1.0.68"
thiserror = "2.0.3"
statrs = "0.17.1"
phf = {version = "0.11.2", features = ["macros"]}
3 changes: 3 additions & 0 deletions readme.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# tf-binding-rs (In Development)

[<img alt="github" src="https://img.shields.io/badge/github-peter6866/tf--binding--rs-8da0cb?style=for-the-badge&labelColor=555555&logo=github" height="20">](https://github.com/peter6866/tf-binding-rs)
[<img alt="crates.io" src="https://img.shields.io/crates/v/tf-binding-rs.svg?style=for-the-badge&color=fc8d62&logo=rust" height="20">](https://crates.io/crates/tf-binding-rs)

A Rust library for predicting transcription factor (TF) binding site occupancy in DNA sequences. This toolkit provides efficient implementations for:

- FASTA file manipulation and sequence processing
Expand Down
7 changes: 5 additions & 2 deletions src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,13 @@ pub enum MotifError {
value: String,
message: String,
},

#[error("Invalid input: {0}")]
InvalidInput(String),
}

// Type alias for Result with MotifError
pub type Result<T> = std::result::Result<T, MotifError>;
// // Type alias for Result with MotifError
// pub type Result<T> = std::result::Result<T, MotifError>;

impl MotifError {
/// Create a new InvalidSequence error
Expand Down
47 changes: 33 additions & 14 deletions src/fasta.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use crate::error::{MotifError, Result};
use crate::error::MotifError;
use polars::prelude::*;
use std::collections::{HashMap, HashSet};
use std::collections::HashSet;
use std::fs::File;
use std::io::{BufRead, BufReader, Write};

Expand All @@ -18,7 +18,7 @@ use std::io::{BufRead, BufReader, Write};
/// * Returns `MotifError::InvalidFileFormat` if no sequences are found
/// * Returns `MotifError::DataError` if DataFrame creation fails
/// * Returns `std::io::Error` for file reading issues
pub fn read_fasta(filename: &str) -> Result<DataFrame> {
pub fn read_fasta(filename: &str) -> Result<DataFrame, MotifError> {
let mut sequences: Vec<(String, String)> = Vec::new();
let file = File::open(filename)?;
let reader = BufReader::new(file);
Expand Down Expand Up @@ -46,15 +46,17 @@ pub fn read_fasta(filename: &str) -> Result<DataFrame> {
}

if sequences.is_empty() {
return Err(MotifError::InvalidFileFormat("No sequences found".into()));
return Err(MotifError::InvalidFileFormat(
"No sequences found".to_string(),
));
}

let (labels, sequences): (Vec<String>, Vec<String>) = sequences.into_iter().unzip();
let df = DataFrame::new(vec![
Column::new("label".into(), labels),
Column::new("sequence".into(), sequences),
])
.map_err(|_| MotifError::DataError("Failed to create DataFrame".into()))?;
.map_err(|e| MotifError::DataError(e.to_string()))?;

Ok(df)
}
Expand All @@ -71,7 +73,7 @@ pub fn read_fasta(filename: &str) -> Result<DataFrame> {
/// # Errors
/// * Returns `MotifError::DataError` if required columns are missing
/// * Returns `MotifError::Io` for file writing issues
pub fn write_fasta(df: &DataFrame, filename: &str) -> Result<()> {
pub fn write_fasta(df: &DataFrame, filename: &str) -> Result<(), MotifError> {
let labels = df
.column("label")
.map_err(|e| MotifError::DataError(e.to_string()))?
Expand Down Expand Up @@ -102,15 +104,29 @@ pub fn write_fasta(df: &DataFrame, filename: &str) -> Result<()> {
/// * `sequence` - Input DNA sequence string
///
/// # Returns
/// * `String` - The reverse complement sequence where:
/// * `Result<String>` - The reverse complement sequence where:
/// - A ↔ T
/// - C ↔ G
///
/// # Panics
/// * Panics if the input sequence contains characters other than A, T, C, or G
pub fn rev_comp(sequence: &str) -> String {
let compliment = HashMap::from([('A', 'T'), ('T', 'A'), ('C', 'G'), ('G', 'C')]);
sequence.chars().rev().map(|c| compliment[&c]).collect()
/// # Errors
/// * Returns `MotifError::InvalidInput` if sequence contains invalid nucleotides
pub fn reverse_complement(sequence: &str) -> Result<String, MotifError> {
static COMPLEMENT: phf::Map<char, char> = phf::phf_map! {
'A' => 'T',
'T' => 'A',
'C' => 'G',
'G' => 'C',
};

sequence
.chars()
.rev()
.map(|c| {
COMPLEMENT
.get(&c)
.ok_or_else(|| MotifError::InvalidInput(format!("Invalid nucleotide: {}", c)))
})
.collect()
}

/// Calculates the GC content for each sequence in the input DataFrame.
Expand All @@ -125,7 +141,7 @@ pub fn rev_comp(sequence: &str) -> String {
///
/// # Errors
/// * Returns `MotifError::DataError` if required columns are missing or DataFrame creation fails
pub fn gc_content(df: &DataFrame) -> Result<DataFrame> {
pub fn gc_content(df: &DataFrame) -> Result<DataFrame, MotifError> {
let sequences = df
.column("sequence")
.map_err(|e| MotifError::DataError(e.to_string()))?
Expand Down Expand Up @@ -167,7 +183,10 @@ pub fn gc_content(df: &DataFrame) -> Result<DataFrame> {
///
/// # Errors
/// * Returns `MotifError::DataError` if required columns are missing or DataFrame creation fails
pub fn has_restriction_sites(df: &DataFrame, restrictions: &[&str]) -> Result<DataFrame> {
pub fn has_restriction_sites(
df: &DataFrame,
restrictions: &[&str],
) -> Result<DataFrame, MotifError> {
let restrictions_set: HashSet<String> = restrictions.iter().map(|r| r.to_string()).collect();

let sequences = df
Expand Down
19 changes: 19 additions & 0 deletions tests/fasta_utils_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,22 @@ fn test_write_fasta() {
// clean up
std::fs::remove_file(path).unwrap();
}

#[test]
fn test_reverse_complement() {
// Test basic reverse complement
let sequence = "ATCG";
assert_eq!(fasta::reverse_complement(sequence).unwrap(), "CGAT");

// Test longer sequence
let sequence = "AATTCCGG";
assert_eq!(fasta::reverse_complement(sequence).unwrap(), "CCGGAATT");

// Test palindromic sequence
let sequence = "GCGC";
assert_eq!(fasta::reverse_complement(sequence).unwrap(), "GCGC");

// Test error case with invalid nucleotide
let sequence = "ATCGX";
assert!(fasta::reverse_complement(sequence).is_err());
}

0 comments on commit 28cd57d

Please sign in to comment.