From f153caf6c389f13a4b7948a468c3a6f4b7f69513 Mon Sep 17 00:00:00 2001 From: jalew188 Date: Thu, 13 Jun 2024 20:40:54 +0200 Subject: [PATCH 1/3] #53 ADD docs for match_utils.py and spec_finder.py --- alpharaw/match/mass_calibration.py | 2 + alpharaw/match/match_utils.py | 30 ++++++- alpharaw/match/spec_finder.py | 138 +++++++++++++++++++++++++++-- 3 files changed, 163 insertions(+), 7 deletions(-) diff --git a/alpharaw/match/mass_calibration.py b/alpharaw/match/mass_calibration.py index 98f9e6a..de1d937 100644 --- a/alpharaw/match/mass_calibration.py +++ b/alpharaw/match/mass_calibration.py @@ -1,3 +1,5 @@ +# TODO This module will be removed in the future as mass calibration has already been implemented in alphaDIA. + import numpy as np import pandas as pd from sklearn.neighbors import KNeighborsRegressor diff --git a/alpharaw/match/match_utils.py b/alpharaw/match/match_utils.py index 82fd7be..3032bfc 100644 --- a/alpharaw/match/match_utils.py +++ b/alpharaw/match/match_utils.py @@ -13,7 +13,33 @@ def match_batch_spec( peak_stop_idxes: np.ndarray, query_mzs: np.ndarray, query_mz_tols: np.ndarray, -): +)->Tuple[np.ndarray, np.ndarray]: + """ + Extract matched mzs and intensities for query m/z values against the given batch spectra. + + Parameters + ---------- + spec_idxes : np.ndarray + The batch spectra, given as spectrum indexes. + peak_mzs : np.ndarray + The peak m/z values in the whole raw data. + peak_intens : np.ndarray + The peak intensities in the whole raw data. + peak_start_idxes : np.ndarray + The batch spectra, given as the start indexes in peak m/z and intensities. + peak_stop_idxes : np.ndarray + The batch spectra, given as the stop indexes in peak m/z and intensities. + query_mzs : np.ndarray + The query m/z values, these can be from fragments of a precursor. + query_mz_tols : np.ndarray + The query tolerance values of query_mzs. + + Returns + ------- + Tuple[ndarray, ndarray] + ndarray with shape (spectrum num, query num): matched m/z values. 
0.0 if not matched. + ndarray with shape (spectrum num, query num): matched intensity values. 0.0 if not matched. + """ matched_mzs = np.zeros((len(spec_idxes), len(query_mzs)), dtype=peak_mzs.dtype) matched_intens = np.zeros( (len(spec_idxes), len(query_mzs)), dtype=peak_intens.dtype @@ -63,7 +89,7 @@ def match_closest_peaks( query_mzs: np.ndarray, query_mz_tols: np.ndarray, ) -> np.ndarray: - """Matching query mz values against sorted MS2/spec masses, + """Matching query mz values against sorted MS2/spec m/z values, only closest (minimal abs mass error) peaks are returned. Parameters diff --git a/alpharaw/match/spec_finder.py b/alpharaw/match/spec_finder.py index 604f3b9..1390116 100644 --- a/alpharaw/match/spec_finder.py +++ b/alpharaw/match/spec_finder.py @@ -1,3 +1,5 @@ +from typing import List + import numba import numpy as np import pandas as pd @@ -9,8 +11,31 @@ def find_spec_idxes_by_rt( query_stop_rt: float, query_left_mz: float, query_right_mz: float, -): +)->np.ndarray: + """ + Find MS2 spectrum indices (int32) from the `spectrum_df` + by given RT window and precursor m/z window. + + Parameters + ---------- + spectrum_df : pd.DataFrame + Spectrum dataframe to find spectrum indices. + query_start_rt : float + Left RT of the query RT window. + query_stop_rt : float + Right RT of the query RT window. + query_left_mz : float + Left m/z of the query m/z window. + query_right_mz : float + Right m/z of the query m/z window. + + Returns + ------- + ndarray[int32] + Result spectrum indices. + """ if "multinotch" in spectrum_df.columns: + # if multinotch, there are multiple isolation windows of MS2 spectra. 
return find_multinotch_spec_idxes( spec_rts=spectrum_df.rt.values, spec_multinotch_wins=spectrum_df.multinotch.values, @@ -21,6 +46,7 @@ def find_spec_idxes_by_rt( query_right_mz=query_right_mz, ) else: + # normal isolation windows (one window to one MS2 spectrum) return find_spec_idxes( spec_rts=spectrum_df.rt.values, spec_isolation_lower_mzs=spectrum_df.isolation_lower_mz.values, @@ -34,13 +60,40 @@ def find_spec_idxes_by_rt( def find_multinotch_spec_idxes( spec_rts: np.ndarray, - spec_multinotch_wins: list, + spec_multinotch_wins: List[List], spec_ms_levels: np.ndarray, query_start_rt: float, query_stop_rt: float, query_left_mz: float, query_right_mz: float, ) -> np.ndarray: + """ + Find MS2 spectrum indices (int32) from the "multinotch" `spectrum_df` + by given RT window and precursor m/z window. + "multinotch" means there are multiple isolation windows of MS2 spectra. + + Parameters + ---------- + spec_rts : np.ndarray + RT values of the spectra. + spec_multinotch_wins : List[List] + List (num of spectra) of list (multiple isolation windows). + spec_ms_levels : np.ndarray + MS levels of the spectra. + query_start_rt : float + Left RT of the query RT window. + query_stop_rt : float + Right RT of the query RT window. + query_left_mz : float + Left m/z of the query m/z window. + query_right_mz : float + Right m/z of the query m/z window. + + Returns + ------- + np.ndarray[int32] + Result spectrum indices. + """ start_idx = np.searchsorted(spec_rts, query_start_rt) stop_idx = np.searchsorted(spec_rts, query_stop_rt) + 1 spec_idxes = [] @@ -59,7 +112,26 @@ def find_dia_spec_idxes_same_window( spec_rt_values: np.ndarray, query_rt_values: np.ndarray, max_spec_per_query: int, -): +)->np.ndarray: + """ + For given array of query RT values, find spectrum indices + from the subset of spectra within the same normal DIA m/z window. + This function is numba accelerated. + + Parameters + ---------- + spec_rt_values : np.ndarray + RT values of given DIA spectra. 
+ query_rt_values : np.ndarray + Query RT values. + max_spec_per_query : int + Return maximal spectrum indices (scan windows) for the given query. + + Returns + ------- + ndarray[int32] + Result spectrum indices with shape (query num, max_spec_per_query). + """ rt_idxes = np.searchsorted(spec_rt_values, query_rt_values) spec_idxes = np.full((len(rt_idxes), max_spec_per_query), -1, dtype=np.int32) @@ -84,7 +156,34 @@ def find_spec_idxes( query_stop_rt: float, query_left_mz: float, query_right_mz: float, -): +)->np.ndarray: + """ + Find MS2 spectrum indices (int32) from the all spectra + by given RT window and precursor m/z window. + This function is numba accelerated. + + Parameters + ---------- + spec_rts : np.ndarray + RT values of the spectra. + spec_isolation_lower_mzs : np.ndarray + Left m/z values of the isolation windows. + spec_isolation_upper_mzs : np.ndarray + Right m/z values of the isolation windows. + query_start_rt : float + Left RT of the query RT window. + query_stop_rt : float + Right RT of the query RT window. + query_left_mz : float + Left m/z of the query m/z window. + query_right_mz : float + Right m/z of the query m/z window. + + Returns + ------- + np.ndarray[int32] + Result spectrum indices. + """ rt_start_idx = np.searchsorted(spec_rts, query_start_rt) rt_stop_idx = np.searchsorted(spec_rts, query_stop_rt) + 1 @@ -108,7 +207,36 @@ def find_batch_spec_idxes( query_left_mzs: np.ndarray, query_right_mzs: np.ndarray, max_spec_per_query: int, -): +)->np.ndarray: + """ + Find MS2 spectrum indices (int32) from the all spectra + by the given batch of RT windows and precursor m/z windows. + This function is numba accelerated. + + Parameters + ---------- + spec_rts : np.ndarray + RT values of the spectra. + spec_isolation_lower_mzs : np.ndarray + Left m/z values of the isolation windows. + spec_isolation_upper_mzs : np.ndarray + Right m/z values of the isolation windows. + query_start_rts : np.ndarray + Left RT values of the query RT windows. 
+ query_stop_rts : np.ndarray + Right RT values of the query RT windows. + query_left_mzs : np.ndarray + Left m/z values of the query m/z windows. + query_right_mzs : np.ndarray + Right m/z values of the query m/z windows. + max_spec_per_query : int + Return maximal spectrum indices (scan windows) for the given query. + + Returns + ------- + ndarray[int32] + Result spectrum indices with shape (query num, max_spec_per_query). + """ rt_start_idxes = np.searchsorted(spec_rts, query_start_rts) rt_stop_idxes = np.searchsorted(spec_rts, query_stop_rts) + 1 From 117d046e179288864cd1d68deee240084d90eae8 Mon Sep 17 00:00:00 2001 From: jalew188 Date: Thu, 13 Jun 2024 20:41:31 +0200 Subject: [PATCH 2/3] #53 FIX pre-commit run --all-files --- alpharaw/match/match_utils.py | 2 +- alpharaw/match/spec_finder.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/alpharaw/match/match_utils.py b/alpharaw/match/match_utils.py index 3032bfc..88f348d 100644 --- a/alpharaw/match/match_utils.py +++ b/alpharaw/match/match_utils.py @@ -13,7 +13,7 @@ def match_batch_spec( peak_stop_idxes: np.ndarray, query_mzs: np.ndarray, query_mz_tols: np.ndarray, -)->Tuple[np.ndarray, np.ndarray]: +) -> Tuple[np.ndarray, np.ndarray]: """ Extract matched mzs and intensities for query m/z values against the given batch spectra. diff --git a/alpharaw/match/spec_finder.py b/alpharaw/match/spec_finder.py index 1390116..5a071b0 100644 --- a/alpharaw/match/spec_finder.py +++ b/alpharaw/match/spec_finder.py @@ -11,7 +11,7 @@ def find_spec_idxes_by_rt( query_stop_rt: float, query_left_mz: float, query_right_mz: float, -)->np.ndarray: +) -> np.ndarray: """ Find MS2 spectrum indices (int32) from the `spectrum_df` by given RT window and precursor m/z window. @@ -69,7 +69,7 @@ def find_multinotch_spec_idxes( ) -> np.ndarray: """ Find MS2 spectrum indices (int32) from the "multinotch" `spectrum_df` - by given RT window and precursor m/z window. + by given RT window and precursor m/z window. 
"multinotch" means there are multiple isolation windows of MS2 spectra. Parameters @@ -112,7 +112,7 @@ def find_dia_spec_idxes_same_window( spec_rt_values: np.ndarray, query_rt_values: np.ndarray, max_spec_per_query: int, -)->np.ndarray: +) -> np.ndarray: """ For given array of query RT values, find spectrum indices from the subset of spectra within the same normal DIA m/z window. @@ -156,7 +156,7 @@ def find_spec_idxes( query_stop_rt: float, query_left_mz: float, query_right_mz: float, -)->np.ndarray: +) -> np.ndarray: """ Find MS2 spectrum indices (int32) from the all spectra by given RT window and precursor m/z window. @@ -207,7 +207,7 @@ def find_batch_spec_idxes( query_left_mzs: np.ndarray, query_right_mzs: np.ndarray, max_spec_per_query: int, -)->np.ndarray: +) -> np.ndarray: """ Find MS2 spectrum indices (int32) from the all spectra by the given batch of RT windows and precursor m/z windows. From e221cb98f43e3958edee3366a3886af97ee20f31 Mon Sep 17 00:00:00 2001 From: jalew188 Date: Mon, 17 Jun 2024 11:45:14 +0200 Subject: [PATCH 3/3] #53 FIX typos in docs --- alpharaw/match/mass_calibration.py | 8 +++++++- alpharaw/match/spec_finder.py | 7 ++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/alpharaw/match/mass_calibration.py b/alpharaw/match/mass_calibration.py index de1d937..5561f25 100644 --- a/alpharaw/match/mass_calibration.py +++ b/alpharaw/match/mass_calibration.py @@ -1,9 +1,15 @@ -# TODO This module will be removed in the future as mass calibration has already been implemented in alphaDIA. 
+import warnings import numpy as np import pandas as pd from sklearn.neighbors import KNeighborsRegressor +warnings.warn( + "This module will be removed in the future as " + "mass calibration has already been implemented in alphaDIA.", + category=DeprecationWarning, +) + def get_fragment_median(start_end_idxes: tuple, frag_df: pd.DataFrame): start_idx, end_idx = start_end_idxes diff --git a/alpharaw/match/spec_finder.py b/alpharaw/match/spec_finder.py index 5a071b0..c383724 100644 --- a/alpharaw/match/spec_finder.py +++ b/alpharaw/match/spec_finder.py @@ -32,7 +32,8 @@ def find_spec_idxes_by_rt( Returns ------- ndarray[int32] - Result spectrum indices. + Result spectrum indices. `int32` is used here as there will be + no more than 2 billion spectra in a raw file. """ if "multinotch" in spectrum_df.columns: # if multinotch, there are multiple isolation windows of MS2 spectra. @@ -158,7 +159,7 @@ def find_spec_idxes( query_right_mz: float, ) -> np.ndarray: """ - Find MS2 spectrum indices (int32) from the all spectra + Find MS2 spectrum indices (int32) from all the spectra by given RT window and precursor m/z window. This function is numba accelerated. @@ -209,7 +210,7 @@ def find_batch_spec_idxes( max_spec_per_query: int, ) -> np.ndarray: """ - Find MS2 spectrum indices (int32) from the all spectra + Find MS2 spectrum indices (int32) from all the spectra by the given batch of RT windows and precursor m/z windows. This function is numba accelerated.