diff --git a/alpharaw/match/mass_calibration.py b/alpharaw/match/mass_calibration.py index 98f9e6a..5561f25 100644 --- a/alpharaw/match/mass_calibration.py +++ b/alpharaw/match/mass_calibration.py @@ -1,7 +1,15 @@ +import warnings + import numpy as np import pandas as pd from sklearn.neighbors import KNeighborsRegressor +warnings.warn( + "This module will be removed in the future as " + "mass calibration has already been implemented in alphaDIA.", + category=DeprecationWarning, +) + def get_fragment_median(start_end_idxes: tuple, frag_df: pd.DataFrame): start_idx, end_idx = start_end_idxes diff --git a/alpharaw/match/match_utils.py b/alpharaw/match/match_utils.py index 82fd7be..88f348d 100644 --- a/alpharaw/match/match_utils.py +++ b/alpharaw/match/match_utils.py @@ -13,7 +13,33 @@ def match_batch_spec( peak_stop_idxes: np.ndarray, query_mzs: np.ndarray, query_mz_tols: np.ndarray, -): +) -> Tuple[np.ndarray, np.ndarray]: + """ + Extract matched mzs and intensities for query m/z values against the given batch spectra. + + Parameters + ---------- + spec_idxes : np.ndarray + The batch spectra, given as spectrum indexes. + peak_mzs : np.ndarray + The peak m/z values in the whole raw data. + peak_intens : np.ndarray + The peak intensities in the whole raw data. + peak_start_idxes : np.ndarray + The batch spectra, given as the start indexes in peak m/z and intensities. + peak_stop_idxes : np.ndarray + The batch spectra, given as the stop indexes in peak m/z and intensities. + query_mzs : np.ndarray + The query m/z values, these can be from fragments of a precursor. + query_mz_tols : np.ndarray + The query tolerance values of query_mzs. + + Returns + ------- + Tuple[ndarray, ndarray] + ndarray with shape (spectrum num, query num): matched m/z values. 0.0 if not matched. + ndarray with shape (spectrum num, query num): matched intensity values. 0.0 if not matched. 
+ """ matched_mzs = np.zeros((len(spec_idxes), len(query_mzs)), dtype=peak_mzs.dtype) matched_intens = np.zeros( (len(spec_idxes), len(query_mzs)), dtype=peak_intens.dtype @@ -63,7 +89,7 @@ def match_closest_peaks( query_mzs: np.ndarray, query_mz_tols: np.ndarray, ) -> np.ndarray: - """Matching query mz values against sorted MS2/spec masses, + """Matching query mz values against sorted MS2/spec m/z values, only closest (minimal abs mass error) peaks are returned. Parameters diff --git a/alpharaw/match/spec_finder.py b/alpharaw/match/spec_finder.py index 604f3b9..c383724 100644 --- a/alpharaw/match/spec_finder.py +++ b/alpharaw/match/spec_finder.py @@ -1,3 +1,5 @@ +from typing import List + import numba import numpy as np import pandas as pd @@ -9,8 +11,32 @@ def find_spec_idxes_by_rt( query_stop_rt: float, query_left_mz: float, query_right_mz: float, -): +) -> np.ndarray: + """ + Find MS2 spectrum indices (int32) from the `spectrum_df` + by given RT window and precursor m/z window. + + Parameters + ---------- + spectrum_df : pd.DataFrame + Spectrum dataframe to find spectrum indices. + query_start_rt : float + Left RT of the query RT window. + query_stop_rt : float + Right RT of the query RT window. + query_left_mz : float + Left m/z of the query m/z window. + query_right_mz : float + Right m/z of the query m/z window. + + Returns + ------- + ndarray[int32] + Result spectrum indices. `int32` is used here as there will be + no more than 2 billion spectra in a raw file. + """ if "multinotch" in spectrum_df.columns: + # if multinotch, there are multiple isolation windows of MS2 spectra. 
return find_multinotch_spec_idxes( spec_rts=spectrum_df.rt.values, spec_multinotch_wins=spectrum_df.multinotch.values, @@ -21,6 +47,7 @@ def find_spec_idxes_by_rt( query_right_mz=query_right_mz, ) else: + # normal isolation windows (one window to one MS2 spectrum) return find_spec_idxes( spec_rts=spectrum_df.rt.values, spec_isolation_lower_mzs=spectrum_df.isolation_lower_mz.values, @@ -34,13 +61,40 @@ def find_spec_idxes_by_rt( def find_multinotch_spec_idxes( spec_rts: np.ndarray, - spec_multinotch_wins: list, + spec_multinotch_wins: List[List], spec_ms_levels: np.ndarray, query_start_rt: float, query_stop_rt: float, query_left_mz: float, query_right_mz: float, ) -> np.ndarray: + """ + Find MS2 spectrum indices (int32) from the given "multinotch" spectra + by given RT window and precursor m/z window. + "multinotch" means there are multiple isolation windows of MS2 spectra. + + Parameters + ---------- + spec_rts : np.ndarray + RT values of the spectra. + spec_multinotch_wins : List[List] + List (num of spectra) of list (multiple isolation windows). + spec_ms_levels : np.ndarray + MS levels of the spectra. + query_start_rt : float + Left RT of the query RT window. + query_stop_rt : float + Right RT of the query RT window. + query_left_mz : float + Left m/z of the query m/z window. + query_right_mz : float + Right m/z of the query m/z window. + + Returns + ------- + np.ndarray[int32] + Result spectrum indices. + """ start_idx = np.searchsorted(spec_rts, query_start_rt) stop_idx = np.searchsorted(spec_rts, query_stop_rt) + 1 spec_idxes = [] @@ -59,7 +113,26 @@ def find_dia_spec_idxes_same_window( spec_rt_values: np.ndarray, query_rt_values: np.ndarray, max_spec_per_query: int, -): +) -> np.ndarray: + """ + For given array of query RT values, find spectrum indices + from the subset of spectra within the same normal DIA m/z window. + This function is numba accelerated. + + Parameters + ---------- + spec_rt_values : np.ndarray + RT values of given DIA spectra. 
+ query_rt_values : np.ndarray + Query RT values. + max_spec_per_query : int + Maximum number of spectrum indices (scan windows) to return per query. + + Returns + ------- + ndarray[int32] + Result spectrum indices with shape (query num, max_spec_per_query). + """ rt_idxes = np.searchsorted(spec_rt_values, query_rt_values) spec_idxes = np.full((len(rt_idxes), max_spec_per_query), -1, dtype=np.int32) @@ -84,7 +157,34 @@ def find_spec_idxes( query_stop_rt: float, query_left_mz: float, query_right_mz: float, -): +) -> np.ndarray: + """ + Find MS2 spectrum indices (int32) from all the spectra + by given RT window and precursor m/z window. + This function is numba accelerated. + + Parameters + ---------- + spec_rts : np.ndarray + RT values of the spectra. + spec_isolation_lower_mzs : np.ndarray + Left m/z values of the isolation windows. + spec_isolation_upper_mzs : np.ndarray + Right m/z values of the isolation windows. + query_start_rt : float + Left RT of the query RT window. + query_stop_rt : float + Right RT of the query RT window. + query_left_mz : float + Left m/z of the query m/z window. + query_right_mz : float + Right m/z of the query m/z window. + + Returns + ------- + np.ndarray[int32] + Result spectrum indices. + """ rt_start_idx = np.searchsorted(spec_rts, query_start_rt) rt_stop_idx = np.searchsorted(spec_rts, query_stop_rt) + 1 @@ -108,7 +208,36 @@ def find_batch_spec_idxes( query_left_mzs: np.ndarray, query_right_mzs: np.ndarray, max_spec_per_query: int, -): +) -> np.ndarray: + """ + Find MS2 spectrum indices (int32) from all the spectra + by the given batch of RT windows and precursor m/z windows. + This function is numba accelerated. + + Parameters + ---------- + spec_rts : np.ndarray + RT values of the spectra. + spec_isolation_lower_mzs : np.ndarray + Left m/z values of the isolation windows. + spec_isolation_upper_mzs : np.ndarray + Right m/z values of the isolation windows. + query_start_rts : np.ndarray + Left RT values of the query RT windows. 
+ query_stop_rts : np.ndarray + Right RT values of the query RT windows. + query_left_mzs : np.ndarray + Left m/z values of the query m/z windows. + query_right_mzs : np.ndarray + Right m/z values of the query m/z windows. + max_spec_per_query : int + Maximum number of spectrum indices (scan windows) to return per query. + + Returns + ------- + ndarray[int32] + Result spectrum indices with shape (query num, max_spec_per_query). + """ rt_start_idxes = np.searchsorted(spec_rts, query_start_rts) rt_stop_idxes = np.searchsorted(spec_rts, query_stop_rts) + 1