Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

53 match i #57

Merged
merged 3 commits into from
Jun 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions alpharaw/match/mass_calibration.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,15 @@
import warnings

import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor

# Module-level deprecation notice: this call sits at the top level of the
# module, so the warning is issued when the module is imported.
warnings.warn(
"This module will be removed in the future as "
"mass calibration has already been implemented in alphaDIA.",
category=DeprecationWarning,
)


def get_fragment_median(start_end_idxes: tuple, frag_df: pd.DataFrame):
start_idx, end_idx = start_end_idxes
Expand Down
30 changes: 28 additions & 2 deletions alpharaw/match/match_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,33 @@ def match_batch_spec(
peak_stop_idxes: np.ndarray,
query_mzs: np.ndarray,
query_mz_tols: np.ndarray,
):
) -> Tuple[np.ndarray, np.ndarray]:
"""
Extract matched m/z values and intensities for query m/z values against the given batch spectra.

Parameters
----------
spec_idxes : np.ndarray
The batch spectra, given as spectrum indexes.
peak_mzs : np.ndarray
The peak m/z values in the whole raw data.
peak_intens : np.ndarray
The peak intensities in the whole raw data.
peak_start_idxes : np.ndarray
The batch spectra, given as the start indexes in peak m/z and intensities.
peak_stop_idxes : np.ndarray
The batch spectra, given as the stop indexes in peak m/z and intensities.
query_mzs : np.ndarray
The query m/z values, these can be from fragments of a precursor.
query_mz_tols : np.ndarray
The query tolerance values of query_mzs.

Returns
-------
Tuple[ndarray, ndarray]
ndarray with shape (spectrum num, query num): matched m/z values. 0.0 if not matched.
ndarray with shape (spectrum num, query num): matched intensity values. 0.0 if not matched.
"""
matched_mzs = np.zeros((len(spec_idxes), len(query_mzs)), dtype=peak_mzs.dtype)
matched_intens = np.zeros(
(len(spec_idxes), len(query_mzs)), dtype=peak_intens.dtype
Expand Down Expand Up @@ -63,7 +89,7 @@ def match_closest_peaks(
query_mzs: np.ndarray,
query_mz_tols: np.ndarray,
) -> np.ndarray:
"""Matching query mz values against sorted MS2/spec masses,
"""Matching query mz values against sorted MS2/spec m/z values,
only closest (minimal abs mass error) peaks are returned.

Parameters
Expand Down
139 changes: 134 additions & 5 deletions alpharaw/match/spec_finder.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import List

import numba
import numpy as np
import pandas as pd
Expand All @@ -9,8 +11,32 @@ def find_spec_idxes_by_rt(
query_stop_rt: float,
query_left_mz: float,
query_right_mz: float,
):
) -> np.ndarray:
"""
Find MS2 spectrum indices (int32) from the `spectrum_df`
jalew188 marked this conversation as resolved.
Show resolved Hide resolved
by given RT window and precursor m/z window.

Parameters
----------
spectrum_df : pd.DataFrame
Spectrum dataframe to find spectrum indices.
query_start_rt : float
Left RT of the query RT window.
query_stop_rt : float
Right RT of the query RT window.
query_left_mz : float
Left m/z of the query m/z window.
query_right_mz : float
Right m/z of the query m/z window.

Returns
-------
ndarray[int32]
Result spectrum indices. `int32` is sufficient here, as a raw file
will not contain more than 2 billion spectra.
"""
if "multinotch" in spectrum_df.columns:
# if multinotch, there are multiple isolation windows of MS2 spectra.
return find_multinotch_spec_idxes(
spec_rts=spectrum_df.rt.values,
spec_multinotch_wins=spectrum_df.multinotch.values,
Expand All @@ -21,6 +47,7 @@ def find_spec_idxes_by_rt(
query_right_mz=query_right_mz,
)
else:
# normal isolation windows (one window to one MS2 spectrum)
return find_spec_idxes(
spec_rts=spectrum_df.rt.values,
spec_isolation_lower_mzs=spectrum_df.isolation_lower_mz.values,
Expand All @@ -34,13 +61,40 @@ def find_spec_idxes_by_rt(

def find_multinotch_spec_idxes(
spec_rts: np.ndarray,
spec_multinotch_wins: list,
spec_multinotch_wins: List[List],
spec_ms_levels: np.ndarray,
query_start_rt: float,
query_stop_rt: float,
query_left_mz: float,
query_right_mz: float,
) -> np.ndarray:
"""
Find MS2 spectrum indices (int32) from the "multinotch" `spectrum_df`
by given RT window and precursor m/z window.
"multinotch" means that a single MS2 spectrum has multiple isolation windows.

Parameters
----------
spec_rts : np.ndarray
RT values of the spectra.
spec_multinotch_wins : List[List]
Outer list with one entry per spectrum; each entry is the list of that spectrum's isolation windows.
spec_ms_levels : np.ndarray
MS levels of the spectra.
query_start_rt : float
Left RT of the query RT window.
query_stop_rt : float
Right RT of the query RT window.
query_left_mz : float
Left m/z of the query m/z window.
query_right_mz : float
Right m/z of the query m/z window.

Returns
-------
np.ndarray[int32]
Result spectrum indices.
"""
start_idx = np.searchsorted(spec_rts, query_start_rt)
stop_idx = np.searchsorted(spec_rts, query_stop_rt) + 1
spec_idxes = []
Expand All @@ -59,7 +113,26 @@ def find_dia_spec_idxes_same_window(
spec_rt_values: np.ndarray,
query_rt_values: np.ndarray,
max_spec_per_query: int,
):
) -> np.ndarray:
"""
For a given array of query RT values, find spectrum indices
from the subset of spectra within the same normal DIA m/z window.
This function is numba accelerated.

Parameters
----------
spec_rt_values : np.ndarray
RT values of given DIA spectra.
query_rt_values : np.ndarray
Query RT values.
max_spec_per_query : int
Maximal number of spectrum indices (scan windows) to return for each query.

Returns
-------
ndarray[int32]
Result spectrum indices with shape (query num, max_spec_per_query).
"""
rt_idxes = np.searchsorted(spec_rt_values, query_rt_values)

spec_idxes = np.full((len(rt_idxes), max_spec_per_query), -1, dtype=np.int32)
Expand All @@ -84,7 +157,34 @@ def find_spec_idxes(
query_stop_rt: float,
query_left_mz: float,
query_right_mz: float,
):
) -> np.ndarray:
"""
Find MS2 spectrum indices (int32) from all the spectra
by given RT window and precursor m/z window.
This function is numba accelerated.

Parameters
----------
spec_rts : np.ndarray
RT values of the spectra.
spec_isolation_lower_mzs : np.ndarray
Left m/z values of the isolation windows.
spec_isolation_upper_mzs : np.ndarray
Right m/z values of the isolation windows.
query_start_rt : float
Left RT of the query RT window.
query_stop_rt : float
Right RT of the query RT window.
query_left_mz : float
Left m/z of the query m/z window.
query_right_mz : float
Right m/z of the query m/z window.

Returns
-------
np.ndarray[int32]
Result spectrum indices.
"""
rt_start_idx = np.searchsorted(spec_rts, query_start_rt)
rt_stop_idx = np.searchsorted(spec_rts, query_stop_rt) + 1

Expand All @@ -108,7 +208,36 @@ def find_batch_spec_idxes(
query_left_mzs: np.ndarray,
query_right_mzs: np.ndarray,
max_spec_per_query: int,
):
) -> np.ndarray:
"""
Find MS2 spectrum indices (int32) from all the spectra
by the given batch of RT windows and precursor m/z windows.
This function is numba accelerated.

Parameters
----------
spec_rts : np.ndarray
RT values of the spectra.
spec_isolation_lower_mzs : np.ndarray
Left m/z values of the isolation windows.
spec_isolation_upper_mzs : np.ndarray
Right m/z values of the isolation windows.
query_start_rts : np.ndarray
Left RT values of the query RT windows.
query_stop_rts : np.ndarray
Right RT values of the query RT windows.
query_left_mzs : np.ndarray
Left m/z values of the query m/z windows.
query_right_mzs : np.ndarray
Right m/z values of the query m/z windows.
max_spec_per_query : int
Maximal number of spectrum indices (scan windows) to return for each query.

Returns
-------
ndarray[int32]
Result spectrum indices with shape (query num, max_spec_per_query).
"""
rt_start_idxes = np.searchsorted(spec_rts, query_start_rts)
rt_stop_idxes = np.searchsorted(spec_rts, query_stop_rts) + 1

Expand Down
Loading