Skip to content

Commit

Permalink
#53 ADD docs for psm_match.py
Browse files Browse the repository at this point in the history
  • Loading branch information
jalew188 committed Jun 13, 2024
1 parent 0cd21c4 commit 1e1b10b
Show file tree
Hide file tree
Showing 2 changed files with 134 additions and 68 deletions.
197 changes: 133 additions & 64 deletions alpharaw/match/psm_match.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,25 @@
class PepSpecMatch:
"""
Extract fragment ions from MS2 data.
Parameters
----------
charged_frag_types : list, optional
fragment types with charge states,
e.g. ['b_z1', 'y_z2', 'b_modloss_z1', 'y_H2O_z2'].
If None, it is `get_charged_frag_types(['b','y','b_modloss','y_modloss'], 2)`.
By default None.
match_closest : bool, optional
if True, match the closest peak for a m/z;
if False, match the highest peak for a m/z in the tolerance range.
By default True.
use_ppm : bool, optional
Whether to use ppm as the tolerance unit, by default True.
tol_value : float, optional
tolerance value, by default 20.0
"""

match_closest: bool = True
Expand All @@ -45,25 +64,6 @@ def __init__(
use_ppm: bool = True,
tol_value: float = 20.0,
):
"""
Parameters
----------
charged_frag_types : list, optional
fragment types with charge states,
e.g. ['b_z1', 'y_z2', 'b_modloss_z1', 'y_H2O_z2'].
By default `get_charged_frag_types(['b','y','b_modloss','y_modloss'], 2)`
match_closest : bool, optional
if True, match the closest peak for a m/z;
if False, matched the higest peak for a m/z in the tolerance range.
By default True
use_ppm : bool, optional
If use ppm, by default True
tol_value : float, optional
tolerance value, by default 20.0
"""
self.charged_frag_types = (
get_charged_frag_types(["b", "y", "b_modloss", "y_modloss"], 2)
if charged_frag_types is None
Expand All @@ -73,17 +73,41 @@ def __init__(
self.use_ppm = use_ppm
self.tolerance = tol_value

def _preprocess_psms(self, psm_df):
pass
def get_fragment_mz_df(self) -> pd.DataFrame:
"""
Call :func:`alphabase.peptide.fragment.create_fragment_mz_dataframe`
for :attr:`PepSpecMatch.psm_df` and :attr:`PepSpecMatch.charged_frag_types`.
def get_fragment_mz_df(self):
Returns
-------
DataFrame
The fragment m/z dataframe.
"""
return create_fragment_mz_dataframe(
self.psm_df,
self.charged_frag_types,
dtype=PEAK_MZ_DTYPE,
)

def _add_missing_columns_to_psm_df(self, psm_df: pd.DataFrame, raw_data=None):
def _add_missing_columns_to_psm_df(
self, psm_df: pd.DataFrame, raw_data: MSData_Base = None
):
"""
Add the "rt", "nce", "rt_norm", ("mobility") columns to `psm_df` if they are missing.
Parameters
----------
psm_df : pd.DataFrame
psm dataframe to be processed.
raw_data : MSData_Base, optional
The `MSData_Base` object. If None, `self.raw_data` is used. By default None.
Returns
-------
DataFrame
The input `psm_df`, modified in place.
"""
if raw_data is None:
raw_data = self.raw_data
add_spec_info_list = []
Expand Down Expand Up @@ -117,7 +141,19 @@ def _add_missing_columns_to_psm_df(self, psm_df: pd.DataFrame, raw_data=None):
# psm_df['rt_sec'] = psm_df.rt*60
return psm_df

def _prepare_matching_dfs(self):
def _prepare_matching_dfs(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
"""
Prepare dataframes to be matched.
Returns
-------
Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]
pd.DataFrame: fragment mz dataframe.
pd.DataFrame: intensity dataframe to match.
pd.DataFrame: mz error dataframe to match.
"""
fragment_mz_df = self.get_fragment_mz_df()

matched_intensity_df = pd.DataFrame(
Expand All @@ -138,17 +174,17 @@ def load_ms_data(
process_count: int = 8,
**kwargs,
):
"""Load MS files
"""Load MS file to set `self.raw_data`.
Parameters
----------
ms_file : str | MSData_Base
ms2 file path
ms2 file path.
ms_file_type : str, optional
ms2 file type, could be
["alpharaw_hdf","thermo","sciex","alphapept_hdf","mgf"].
Default to 'alpharaw_hdf'
Default to 'alpharaw_hdf'.
"""
self.raw_data = load_ms_data(ms_file, ms_file_type, process_count=process_count)

Expand All @@ -157,18 +193,39 @@ def get_peaks(self, spec_idx: int, **kwargs):

def _match_one_psm(
self,
spec_mzs: np.ndarray,
spec_intens: np.ndarray,
peak_mzs: np.ndarray,
peak_intens: np.ndarray,
fragment_mz_df: pd.DataFrame,
matched_intensity_df: pd.DataFrame,
matched_mz_err_df: pd.DataFrame,
frag_start_idx: int,
frag_stop_idx: int,
):
if len(spec_mzs) == 0:
"""
Match fragments of one precursor (located by `frag_start_idx` and `frag_stop_idx`)
against the corresponding `peak_mzs`.
Parameters
----------
peak_mzs : np.ndarray
Peak m/z values to be matched.
peak_intens : np.ndarray
Peak intensities to be matched.
fragment_mz_df : pd.DataFrame
fragment m/z dataframe to be matched.
matched_intensity_df : pd.DataFrame
The dataframe to store matched intensity values.
matched_mz_err_df : pd.DataFrame
The dataframe to store matched mz error values.
frag_start_idx : int
fragment start index of the given PSM.
frag_stop_idx : int
fragment stop index of the given PSM.
"""
if len(peak_mzs) == 0:
return

spec_mzs = spec_mzs.astype(PEAK_MZ_DTYPE)
peak_mzs = peak_mzs.astype(PEAK_MZ_DTYPE)

frag_mzs = fragment_mz_df.values[frag_start_idx:frag_stop_idx, :]

Expand All @@ -179,20 +236,20 @@ def _match_one_psm(

if self.match_closest:
matched_idxes = match_closest_peaks(
spec_mzs, spec_intens, frag_mzs, mz_tols
peak_mzs, peak_intens, frag_mzs, mz_tols
)
else:
matched_idxes = match_highest_peaks(
spec_mzs,
spec_intens,
peak_mzs,
peak_intens,
frag_mzs,
mz_tols,
)

matched_intens = spec_intens[matched_idxes]
matched_intens = peak_intens[matched_idxes]
matched_intens[matched_idxes == -1] = 0

matched_mz_errs = np.abs(spec_mzs[matched_idxes] - frag_mzs)
matched_mz_errs = np.abs(peak_mzs[matched_idxes] - frag_mzs)
matched_mz_errs[matched_idxes == -1] = np.inf

matched_intensity_df.values[frag_start_idx:frag_stop_idx, :] = matched_intens
Expand All @@ -203,7 +260,7 @@ def match_ms2_one_raw(
self,
psm_df_one_raw: pd.DataFrame,
verbose: bool = False,
) -> tuple:
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
"""
Matching psm_df_one_raw against self.raw_data
after `self.load_ms_data()`
Expand All @@ -216,18 +273,16 @@ def match_ms2_one_raw(
Returns
-------
tuple:
Tuple:
pd.DataFrame: psm dataframe with fragment index information.
pd.DataFrame: fragment mz dataframe.
pd.DataFrame: matched intensity dataframe.
pd.DataFrame: matched mass error dataframe.
np.inf if a fragment is not matched.
np.inf if a fragment is not matched.
"""
self._preprocess_psms(psm_df_one_raw)
self.psm_df = psm_df_one_raw

psm_df_one_raw = self._add_missing_columns_to_psm_df(
Expand Down Expand Up @@ -307,8 +362,9 @@ def match_ms2_multi_raw(
ms_files: Union[dict, list],
ms_file_type: str = "alpharaw_hdf",
process_num: int = 1,
):
"""Matching PSM dataframe against the ms2 files in ms_files
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
"""
Matching PSM dataframe against the ms2 files in ms_files
This method will store matched values as attributes:
- self.psm_df
- self.fragment_mz_df
Expand All @@ -330,18 +386,17 @@ def match_ms2_multi_raw(
Returns
-------
tuple:
Tuple:
pd.DataFrame: psm dataframe with fragment index information.
pd.DataFrame: fragment mz dataframe.
pd.DataFrame: matched intensity dataframe.
pd.DataFrame: matched mass error dataframe.
np.inf if a fragment is not matched.
np.inf if a fragment is not matched.
"""
self._preprocess_psms(psm_df)
self.psm_df = psm_df

(
Expand Down Expand Up @@ -425,7 +480,9 @@ def _prepare_matching_dfs(self):

return (fragment_mz_df, matched_intensity_df, matched_mz_err_df)

def _match_ms2_one_raw_numba(self, raw_name, psm_df_one_raw):
def _match_ms2_one_raw_numba(
self, raw_name: str, psm_df_one_raw: pd.DataFrame
) -> pd.DataFrame:
psm_df_one_raw = psm_df_one_raw.reset_index(drop=True)

if raw_name in self._ms_file_dict:
Expand Down Expand Up @@ -485,7 +542,7 @@ def match_ms2_multi_raw(
ms_files: Tuple[dict, list],
ms_file_type: str = "alpharaw_hdf",
process_num: int = 8,
):
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
if isinstance(ms_files, list):
ms_files = parse_ms_files_to_dict(ms_files)
psm_df = psm_df[psm_df.raw_name.isin(ms_files)].reset_index(drop=True)
Expand All @@ -501,18 +558,18 @@ def match_ms2_multi_raw(

@numba.jit(nogil=True)
def match_one_raw_with_numba(
spec_idxes,
frag_start_idxes,
frag_stop_idxes,
all_frag_mzs,
all_frag_mz_tols,
all_spec_mzs,
all_spec_intensities,
peak_start_idxes,
peak_stop_idxes,
matched_intensities,
matched_mz_errs,
match_closest=True,
spec_idxes: np.ndarray,
frag_start_idxes: np.ndarray,
frag_stop_idxes: np.ndarray,
all_frag_mzs: np.ndarray,
all_frag_mz_tols: np.ndarray,
all_spec_mzs: np.ndarray,
all_spec_intensities: np.ndarray,
peak_start_idxes: np.ndarray,
peak_stop_idxes: np.ndarray,
matched_intensities: np.ndarray,
matched_mz_errs: np.ndarray,
match_closest: bool = True,
):
"""
Internal function to match fragment mz values to spectrum mz values.
Expand Down Expand Up @@ -567,17 +624,23 @@ def load_ms_data(
ms_file_type: str = "alpharaw_hdf",
process_count: int = 8,
) -> MSData_Base:
"""Load MS files
"""
Load MS file.
Parameters
----------
ms_file : str | MSData_Base
ms2 file path
ms_file_type : str, optional
ms2 file type, could be
["alpharaw_hdf","thermo","sciex","alphapept_hdf","mgf"].
Default to 'alpharaw_hdf'
ms2 file type, can be
["alpharaw_hdf", "thermo", "sciex", "alphapept_hdf", "mgf"].
Default to 'alpharaw_hdf'.
Returns
-------
MSData_Base:
Instance of sub-class of `MSData_Base`.
"""
if isinstance(ms_file, MSData_Base):
return ms_file
Expand All @@ -600,6 +663,9 @@ def get_best_matched_intens(
frag_start_idxes: np.ndarray,
frag_stop_idxes: np.ndarray,
):
"""
TODO Deprecated
"""
ret_intens = np.zeros(
shape=matched_intensity_values.shape[1:], dtype=matched_intensity_values.dtype
)
Expand All @@ -624,6 +690,9 @@ def get_ion_count_scores(
frag_stop_idxes: np.ndarray,
min_mz: float = 200,
):
"""
TODO Deprecated
"""
scores = []
for i in range(len(frag_start_idxes)):
scores.append(
Expand Down
5 changes: 1 addition & 4 deletions alpharaw/match/psm_match_alphatims.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# TODO to be removed as already implemented in alphaDIA.
from typing import Tuple, Union

import numpy as np
Expand All @@ -6,9 +7,6 @@
from alphatims.bruker import TimsTOF

from alpharaw.ms_data_base import MSData_Base, ms_reader_provider
from alpharaw.wrappers.alphapept_wrapper import (
AlphaPept_HDF_MS2_Reader, # noqa: F401 # TODO remove import side effect
)
from alpharaw.wrappers.alphatims_wrapper import AlphaTimsWrapper

from .psm_match import PepSpecMatch
Expand Down Expand Up @@ -231,7 +229,6 @@ def match_ms2_one_raw(
np.inf if a fragment is not matched.
"""
self._preprocess_psms(psm_df_one_raw)
self.psm_df = psm_df_one_raw

psm_df_one_raw = self._add_missing_columns_to_psm_df(psm_df_one_raw)
Expand Down

0 comments on commit 1e1b10b

Please sign in to comment.