[FEATURE] Export matched fragment ions for rescoring & spectral library generation #101

Merged — 2 commits, Nov 29, 2023
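This PR adds an `annotate_matches` option (JSON config) together with a matching `--annotate-matches` CLI flag. When enabled, Sage writes the matched fragment ions for each PSM to `matched_fragments.sage.tsv` (or `matched_fragments.sage.parquet` when parquet output is selected), one row per fragment: `psm_id`, `fragment_type` (a/b/c/x/y/z), `fragment_ordinals`, `fragment_charge`, `fragment_mz_calculated`, `fragment_mz_experimental`, and `fragment_intensity`. To make the fragment rows joinable back to PSMs, a `psm_id` column is added to the main results TSV, and the Percolator PIN output now uses `feature.psm_id` instead of an enumeration index.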
9 changes: 9 additions & 0 deletions crates/sage-cli/src/input.rs
@@ -36,6 +36,9 @@ pub struct Search {

#[serde(skip_serializing)]
pub write_pin: bool,

#[serde(skip_serializing)]
pub annotate_matches: bool,
}

#[derive(Deserialize)]
@@ -59,6 +62,7 @@ pub struct Input {
output_directory: Option<String>,
mzml_paths: Option<Vec<String>>,

annotate_matches: Option<bool>,
write_pin: Option<bool>,
}

@@ -179,6 +183,10 @@ impl Input {
input.write_pin = Some(write_pin);
}

if let Some(annotate_matches) = matches.get_one::<bool>("annotate-matches").copied() {
input.annotate_matches = Some(annotate_matches);
}

// avoid a later panic if these parameters are not set (but doesn't check whether files exist)

ensure!(
@@ -288,6 +296,7 @@ impl Input {
min_peaks: self.min_peaks.unwrap_or(15),
min_matched_peaks: self.min_matched_peaks.unwrap_or(4),
max_fragment_charge: self.max_fragment_charge,
annotate_matches: self.annotate_matches.unwrap_or(false),
precursor_charge: self.precursor_charge.unwrap_or((2, 4)),
isotope_errors: self.isotope_errors.unwrap_or((0, 0)),
deisotope: self.deisotope.unwrap_or(true),
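The new option flows from the JSON config key `annotate_matches` (or the CLI flag) into the search parameters, defaulting to `false`. A minimal sketch of that intent — not code from this PR, and the function name is illustrative:

```rust
/// Illustrative only: a CLI-supplied value takes precedence over the
/// JSON config value, and the search parameter falls back to `false`
/// when neither is set.
fn resolve_annotate_matches(cli_flag: Option<bool>, config_value: Option<bool>) -> bool {
    cli_flag.or(config_value).unwrap_or(false)
}

fn main() {
    assert!(!resolve_annotate_matches(None, None)); // default: disabled
    assert!(resolve_annotate_matches(Some(true), None)); // --annotate-matches
    assert!(resolve_annotate_matches(None, Some(true))); // "annotate_matches": true
}
```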
22 changes: 22 additions & 0 deletions crates/sage-cli/src/main.rs
@@ -254,6 +254,7 @@ impl Runner {
chimera: self.parameters.chimera,
report_psms: self.parameters.report_psms,
wide_window: self.parameters.wide_window,
annotate_matches: self.parameters.annotate_matches,
};

//Collect all results into a single container
@@ -335,6 +336,14 @@ impl Runner {
path.write_bytes_sync(bytes)?;
self.parameters.output_paths.push(path.to_string());

if self.parameters.annotate_matches {
let bytes =
sage_cloudpath::parquet::serialize_matched_fragments(&outputs.features)?;
let path = self.make_path("matched_fragments.sage.parquet");
path.write_bytes_sync(bytes)?;
self.parameters.output_paths.push(path.to_string());
}

if let Some(areas) = &areas {
let bytes =
sage_cloudpath::parquet::serialize_lfq(areas, &filenames, &self.database)?;
@@ -347,6 +356,13 @@ impl Runner {
self.parameters
.output_paths
.push(self.write_features(&outputs.features, &filenames)?);

if self.parameters.annotate_matches {
self.parameters
.output_paths
.push(self.write_fragments(&outputs.features)?);
}

if !outputs.quant.is_empty() {
self.parameters
.output_paths
@@ -443,6 +459,12 @@ fn main() -> anyhow::Result<()> {
.action(clap::ArgAction::SetTrue)
.help("Write search output in parquet format instead of tsv"),
)
.arg(
Arg::new("annotate-matches")
.long("annotate-matches")
.action(clap::ArgAction::SetTrue)
.help("Write matched fragments output file."),
)
.arg(
Arg::new("write-pin")
.long("write-pin")
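For reference, `--annotate-matches` is a standard clap `SetTrue` switch: absent means `false`, present means `true`. A self-contained sketch of that behavior, assuming clap 4.x (not part of this diff):

```rust
use clap::{Arg, ArgAction, Command};

fn main() {
    // Mirror of the flag added above: a boolean switch that takes no value.
    let cmd = Command::new("sage").arg(
        Arg::new("annotate-matches")
            .long("annotate-matches")
            .action(ArgAction::SetTrue)
            .help("Write matched fragments output file."),
    );

    let matches = cmd.get_matches_from(["sage", "--annotate-matches"]);
    assert!(matches.get_flag("annotate-matches"));
}
```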
101 changes: 95 additions & 6 deletions crates/sage-cli/src/output.rs
@@ -1,6 +1,9 @@
use csv::ByteRecord;
use std::collections::HashMap;

use rayon::prelude::*;
use sage_core::ion_series::Kind;
use sage_core::scoring::Fragments;
use sage_core::{
lfq::{Peak, PrecursorId},
scoring::Feature,
@@ -12,6 +15,9 @@ use crate::Runner;
impl Runner {
pub fn serialize_feature(&self, feature: &Feature, filenames: &[String]) -> csv::ByteRecord {
let mut record = csv::ByteRecord::new();

record.push_field(itoa::Buffer::new().format(feature.psm_id).as_bytes());

let peptide = &self.database[feature.peptide_idx];
record.push_field(peptide.to_string().as_bytes());
record.push_field(
@@ -85,6 +91,54 @@ impl Runner {
record
}

pub fn serialize_fragments(
&self,
psm_id: usize,
fragments_: &Option<Fragments>,
) -> Vec<ByteRecord> {
let mut frag_records = vec![];

if let Some(fragments) = fragments_ {
for id in 0..fragments.fragment_ordinals.len() {
let mut record = ByteRecord::new();
record.push_field(itoa::Buffer::new().format(psm_id).as_bytes());
let ion_type = match fragments.kinds[id] {
Kind::A => "a",
Kind::B => "b",
Kind::C => "c",
Kind::X => "x",
Kind::Y => "y",
Kind::Z => "z",
};
record.push_field(ion_type.as_bytes());
record.push_field(
itoa::Buffer::new()
.format(fragments.fragment_ordinals[id])
.as_bytes(),
);
record.push_field(itoa::Buffer::new().format(fragments.charges[id]).as_bytes());
record.push_field(
ryu::Buffer::new()
.format(fragments.mz_calculated[id])
.as_bytes(),
);
record.push_field(
ryu::Buffer::new()
.format(fragments.mz_experimental[id])
.as_bytes(),
);
record.push_field(
ryu::Buffer::new()
.format(fragments.intensities[id])
.as_bytes(),
);
frag_records.push(record);
}
}

frag_records
}

pub fn write_features(
&self,
features: &[Feature],
@@ -96,7 +150,8 @@ impl Runner {
.delimiter(b'\t')
.from_writer(vec![]);

let headers = csv::ByteRecord::from(vec![
let csv_headers = vec![
"psm_id",
"peptide",
"proteins",
"num_proteins",
@@ -134,7 +189,9 @@ impl Runner {
"protein_q",
"ms1_intensity",
"ms2_intensity",
]);
];

let headers = csv::ByteRecord::from(csv_headers);

wtr.write_byte_record(&headers)?;
for record in features
@@ -151,10 +208,43 @@ impl Runner {
Ok(path.to_string())
}

pub fn write_fragments(&self, features: &[Feature]) -> anyhow::Result<String> {
let path = self.make_path("matched_fragments.sage.tsv");

let mut wtr = csv::WriterBuilder::new()
.delimiter(b'\t')
.from_writer(vec![]);

let headers = csv::ByteRecord::from(vec![
"psm_id",
"fragment_type",
"fragment_ordinals",
"fragment_charge",
"fragment_mz_calculated",
"fragment_mz_experimental",
"fragment_intensity",
]);

wtr.write_byte_record(&headers)?;

for record in features
.into_par_iter()
.map(|feat| self.serialize_fragments(feat.psm_id, &feat.fragments))
.flatten()
.collect::<Vec<_>>()
{
wtr.write_byte_record(&record)?;
}

wtr.flush()?;
let bytes = wtr.into_inner()?;
path.write_bytes_sync(bytes)?;
Ok(path.to_string())
}

fn serialize_pin(
&self,
re: &regex::Regex,
idx: usize,
feature: &Feature,
filenames: &[String],
) -> csv::ByteRecord {
Expand All @@ -166,7 +256,7 @@ impl Runner {

let mut record = csv::ByteRecord::new();
let peptide = &self.database[feature.peptide_idx];
record.push_field(itoa::Buffer::new().format(idx).as_bytes());
record.push_field(itoa::Buffer::new().format(feature.psm_id).as_bytes());
record.push_field(itoa::Buffer::new().format(feature.label).as_bytes());
record.push_field(scannr.as_bytes());
record.push_field(ryu::Buffer::new().format(feature.expmass).as_bytes());
@@ -332,8 +422,7 @@ impl Runner {
wtr.write_byte_record(&headers)?;
for record in features
.into_par_iter()
.enumerate()
.map(|(idx, feat)| self.serialize_pin(&re, idx, feat, filenames))
.map(|feat| self.serialize_pin(&re, feat, filenames))
.collect::<Vec<_>>()
{
wtr.write_byte_record(&record)?;
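A hypothetical downstream consumer of the TSV output — e.g. the first step of a rescoring or spectral-library pipeline — can join fragment rows back to PSMs on `psm_id`. The sketch below uses the `csv` and `serde` crates; the struct, field types, and file path are illustrative, while the column names match the headers written by `write_fragments` above:

```rust
use serde::Deserialize;

// Illustrative row type for matched_fragments.sage.tsv; field names
// correspond to the TSV headers, field types are assumptions.
#[derive(Debug, Deserialize)]
struct MatchedFragment {
    psm_id: usize,
    fragment_type: String,
    fragment_ordinals: i32,
    fragment_charge: i32,
    fragment_mz_calculated: f32,
    fragment_mz_experimental: f32,
    fragment_intensity: f32,
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let mut rdr = csv::ReaderBuilder::new()
        .delimiter(b'\t')
        .from_path("matched_fragments.sage.tsv")?;

    for row in rdr.deserialize() {
        let frag: MatchedFragment = row?;
        // Group by psm_id here to rebuild the annotated spectrum per PSM.
        println!(
            "{} {}{} -> {:.4} m/z",
            frag.psm_id, frag.fragment_type, frag.fragment_ordinals, frag.fragment_mz_experimental
        );
    }
    Ok(())
}
```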
1 change: 1 addition & 0 deletions crates/sage-cli/tests/integration.rs
@@ -34,6 +34,7 @@ fn integration() -> anyhow::Result<()> {
chimera: false,
report_psms: 1,
wide_window: false,
annotate_matches: false,
};

let psm = scorer.score(&processed);