Skip to content

Commit

Permalink
Merge pull request #55 from MannLabs/53-docs-base-and-thermo
Browse files Browse the repository at this point in the history
53 docs base and thermo
  • Loading branch information
jalew188 authored Jun 14, 2024
2 parents 0cd21c4 + 6fbfb40 commit 34fb605
Show file tree
Hide file tree
Showing 2 changed files with 193 additions and 66 deletions.
190 changes: 159 additions & 31 deletions alpharaw/ms_data_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,14 @@

class MSData_Base:
"""
The base data structure for MS Data, other MSData loader inherit
The base data structure for MS RAW Data, other MSData loaders inherit this class.
Parameters
----------
centroided : bool, optional
If centroiding the peak data, by default True
save_as_hdf : bool, optional
If automatically save the data into HDF5 format, by default False
"""

column_dtypes = {
Expand All @@ -25,7 +32,7 @@ class MSData_Base:
"""
Spectrum dataframe containing the following columns:
- `rt` (float64): in minutes
- `rt` (float64): in minutes. `rt_sec` will be RT in seconds, which is not included by default.
- `precursor_mz` (float64): mono_mz (DDA) or isolation center mz
- `isolation_lower_mz` (float64): left of the isolation window
- `isolation_upper_mz` (float64): right of the isolation window
Expand Down Expand Up @@ -62,15 +69,13 @@ class MSData_Base:
"FT",
"TOF",
]
"""
These spectrum information items in str format can be one-to-one mapped into
unique token IDs (indices), for example "CID"=0, "HCD"=1, ...
Token IDs are better for storage in HDF5 format.
"""

def __init__(self, centroided: bool = True, save_as_hdf: bool = False, **kwargs):
"""
Parameters
----------
centroided : bool, optional
if peaks will be centroided after loading,
by default True
"""
# A spectrum contains peaks
self.spectrum_df: pd.DataFrame = pd.DataFrame()
# A peak contains mz, intensity, and ...
Expand All @@ -82,9 +87,19 @@ def __init__(self, centroided: bool = True, save_as_hdf: bool = False, **kwargs)
self.file_type = ""
self.instrument = "none"

def _get_term_id(self, terminology: str):
def _get_term_id(self, terminology: str) -> int:
"""
Get terminology id from :data:`self.vocab`, -1 if not exist.
Get terminology ID from :attr:`.MSData_Base.vocab`; returns -1 if it does not exist.
Parameters
----------
terminology : str
The terminology name from :attr:`.MSData_Base.vocab`, such as "CID", "HCD", ...
Returns
-------
int
Terminology ID, which is the index in :attr:`.MSData_Base.vocab`.
"""
try:
return self.vocab.index(terminology)
Expand All @@ -96,20 +111,36 @@ def raw_file_path(self) -> str:
return self._raw_file_path

@raw_file_path.setter
def raw_file_path(self, _path: str):
self._raw_file_path = _path
def raw_file_path(self, raw_file_path: str):
self._raw_file_path = raw_file_path

def import_raw(self, _path: str):
self.raw_file_path = _path
raw_data = self._import(_path)
self._set_dataframes(raw_data)
def import_raw(self, raw_file_path: str):
"""
Import a raw file. It involves three steps:
```
raw_data_dict = self._import(raw_file_path)
self._set_dataframes(raw_data_dict)
self._check_df()
```
Parameters
----------
raw_file_path : str
Absolute or relative path of the raw file.
"""
self.raw_file_path = raw_file_path
raw_data_dict = self._import(raw_file_path)
self._set_dataframes(raw_data_dict)
self._check_df()

if self._save_as_hdf:
self.save_hdf(_path + ".hdf")
self.save_hdf(raw_file_path + ".hdf")

def load_raw(self, _path: str):
self.import_raw(_path)
def load_raw(self, raw_file_path: str):
"""
Wrapper of :func:`.MSData_Base.import_raw`
"""
self.import_raw(raw_file_path)

def _save_meta_to_hdf(self, hdf: HDF_File):
hdf.ms_data.meta = {
Expand All @@ -127,15 +158,35 @@ def _load_meta_from_hdf(self, hdf: HDF_File):
self.centroided = hdf.ms_data.meta.centroided
self.instrument = hdf.ms_data.meta.instrument

def save_hdf(self, _path: str):
hdf = HDF_File(_path, read_only=False, truncate=True, delete_existing=True)
def save_hdf(self, hdf_file_path: str):
"""
Save data into HDF5 file
Parameters
----------
hdf_file_path : str
Absolute or relative path of HDF5 file.
"""
hdf = HDF_File(
hdf_file_path, read_only=False, truncate=True, delete_existing=True
)

hdf.ms_data = {"spectrum_df": self.spectrum_df, "peak_df": self.peak_df}

self._save_meta_to_hdf(hdf)

def load_hdf(self, _path: str):
hdf = HDF_File(_path, read_only=True, truncate=False, delete_existing=False)
def load_hdf(self, hdf_file_path: str):
"""
Load data from HDF5 file.
Parameters
----------
hdf_file_path : str
Absolute or relative path of HDF5 file.
"""
hdf = HDF_File(
hdf_file_path, read_only=True, truncate=False, delete_existing=False
)

self.spectrum_df = hdf.ms_data.spectrum_df.values
self.peak_df = hdf.ms_data.peak_df.values
Expand All @@ -144,10 +195,43 @@ def load_hdf(self, _path: str):
self._load_meta_from_hdf(hdf)

def reset_spec_idxes(self):
"""
Reset spec indexes to make sure spec_idx values are continuous ranging from 0 to N.
"""
self.spectrum_df.reset_index(drop=True, inplace=True)
self.spectrum_df["spec_idx"] = self.spectrum_df.index.values

def _import(self, _path):
def _import(self, _path: str) -> dict:
"""
Parameters
----------
_path : str
Path of raw file.
Returns
-------
dict
Example:
```
spec_dict = {
"_peak_indices": _peak_indices,
"peak_mz": np.concatenate(mz_values).copy(),
"peak_intensity": np.concatenate(intensity_values).copy(),
"rt": np.array(rt_values).copy(),
"precursor_mz": np.array(precursor_mz_values).copy(),
"precursor_charge": np.array(precursor_charges, dtype=np.int8).copy(),
"isolation_lower_mz": np.array(isolation_mz_lowers).copy(),
"isolation_upper_mz": np.array(isolation_mz_uppers).copy(),
"ms_level": np.array(ms_order_list, dtype=np.int8).copy(),
"nce": np.array(ce_list, dtype=np.float32).copy(),
}
```
Raises
------
NotImplementedError
Sub-class of `MSData_Base` must implement this method.
"""
raise NotImplementedError(f"{self.__class__} must implement `_import()`")

def _set_dataframes(self, raw_data: dict):
Expand Down Expand Up @@ -200,6 +284,14 @@ def create_spectrum_df(
self,
spectrum_num: int,
):
"""
Create an empty spectrum dataframe from the given number of spectra.
Parameters
----------
spectrum_num : int
The number of spectra.
"""
self.spectrum_df = pd.DataFrame(index=np.arange(spectrum_num, dtype=np.int64))
self.spectrum_df["spec_idx"] = self.spectrum_df.index.values

Expand Down Expand Up @@ -345,6 +437,12 @@ def index_ragged_list(ragged_list: list) -> np.ndarray:


class MSData_HDF(MSData_Base):
"""
Wrapper of reader for alpharaw's HDF5 spectrum file.
This class is registered as "alpharaw", "raw.hdf", "alpharaw_hdf", "hdf" and "hdf5"
in :data:`ms_reader_provider` instance.
"""

def import_raw(self, _path: str):
self.raw_file_path = _path
self.load_hdf(_path)
Expand All @@ -356,20 +454,50 @@ class MSReaderProvider:
def __init__(self):
self.ms_reader_dict = {}

def register_reader(self, ms2_type: str, reader_class):
self.ms_reader_dict[ms2_type.lower()] = reader_class
def register_reader(self, ms_file_type: str, reader_class: type):
"""
Register a new reader for `ms_file_type` format with `reader_class`.
Parameters
----------
ms_file_type : str
AlphaRaw supported MS file types.
reader_class : type
AlphaRaw supported MS class types.
"""
self.ms_reader_dict[ms_file_type.lower()] = reader_class

def get_reader(
self, file_type: str, *, centroided: bool = True, **kwargs
self, ms_file_type: str, *, centroided: bool = True, **kwargs
) -> MSData_Base:
file_type = file_type.lower()
if file_type not in self.ms_reader_dict:
"""
Get the MS reader for the given `ms_file_type`.
Parameters
----------
ms_file_type : str
AlphaRaw supported MS file types.
centroided : bool, optional
If centroiding the data, by default True.
Returns
-------
MSData_Base
Instance of corresponding sub-class of `MSData_Base`.
"""
ms_file_type = ms_file_type.lower()
if ms_file_type not in self.ms_reader_dict:
return None
else:
return self.ms_reader_dict[file_type](centroided=centroided, **kwargs)
return self.ms_reader_dict[ms_file_type](centroided=centroided, **kwargs)


ms_reader_provider = MSReaderProvider()
"""
MS data register (:class:`.MSReaderProvider`) performs as a factory to
produce different readers for different file formats.
"""

ms_reader_provider.register_reader("alpharaw", MSData_HDF)
ms_reader_provider.register_reader("raw.hdf", MSData_HDF)
ms_reader_provider.register_reader("alpharaw_hdf", MSData_HDF)
Expand Down
69 changes: 34 additions & 35 deletions alpharaw/thermo.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
ms_reader_provider,
)

#: These Thermo spectrum items can only be accessed via the trailer dict using RawFileReader APIs.
__trailer_extra_list__ = [
"injection_time",
"cv",
Expand All @@ -24,6 +25,8 @@
"funnel_rf_level",
"faims_cv",
]

#: The auxiliary items and types that can be accessed from thermo RawFileReader.
__auxiliary_item_dtypes__ = {
"injection_time": np.float32,
"cv": np.float32,
Expand All @@ -47,6 +50,23 @@
class ThermoRawData(MSData_Base):
"""
Loading Thermo Raw data as MSData_Base data structure.
This class is registered as "thermo" and "thermo_raw" in :data:`ms_reader_provider`.
Parameters
----------
centroided : bool, optional
If peaks will be centroided after loading. By defaults True.
process_count : int, optional
    Number of processes to use for loading, by default 10.
mp_batch_size : int, optional
    Number of spectra to load in each batch, by default 5000.
save_as_hdf : bool, optional
    If automatically saving the data into HDF5 format after loading, by default False.
dda : bool, optional
    If the data is DDA, by default False.
auxiliary_items : list, optional
    Additional spectrum items, candidates are in :data:`__auxiliary_item_dtypes__`.
    By default [].
"""

def __init__(
Expand All @@ -59,34 +79,6 @@ def __init__(
auxiliary_items: list = [],
**kwargs,
):
"""
Parameters
----------
centroided : bool, default = True
if peaks will be centroided after loading,
by default True
process_count : int, default = 8
number of processes to use for loading
mp_batch_size : int, default = 10000
number of spectra to load in each batch
save_as_hdf : bool, default = False
automatically save hdf after load raw data.
dda : bool, default = False
is DDA data
auxiliary_items : list, default = []
Candidates are:
"injection_time", "cv",
"max_ion_time", "agc_target", "energy_ev",
"injection_optics_settling_time",
"funnel_rf_level", "faims_cv",
"detector", "activation", "analyzer",
"detector_id", "activation_id", "analyzer_id",
"""
super().__init__(centroided, save_as_hdf=save_as_hdf, **kwargs)
self.file_type = "thermo"
self.process_count = process_count
Expand All @@ -99,6 +91,19 @@ def _import(
self,
raw_file_path: str,
) -> dict:
"""
Re-implementation of :func:`MSData_Base._import` to enable :func:`.MSData_Base.import_raw`.
Parameters
----------
raw_file_path : str
File path of the raw data.
Returns
-------
dict
Spectrum information in a temporary dict format.
"""
rawfile = pyrawfilereader.RawFileReader(raw_file_path)
self.creation_time = rawfile.GetCreationDate()

Expand Down Expand Up @@ -185,13 +190,7 @@ def _import_batch(
is dda data.
auxiliary_items : list
Candidates:
"injection_time", "cv",
"max_ion_time", "agc_target", "energy_ev",
"injection_optics_settling_time",
"funnel_rf_level", "faims_cv",
"activation", "analyzer",
"activation_id", "analyzer_id",
Candidates are in :data:`__auxiliary_item_dtypes__`.
Returns
-------
Expand Down

0 comments on commit 34fb605

Please sign in to comment.