Skip to content

Commit

Permalink
Merge pull request #55 from MannLabs/53-docs-base-and-thermo
Browse files Browse the repository at this point in the history
53 docs base and thermo
  • Loading branch information
jalew188 authored Jun 14, 2024
2 parents 0cd21c4 + 6fbfb40 commit 34fb605
Show file tree
Hide file tree
Showing 2 changed files with 193 additions and 66 deletions.
190 changes: 159 additions & 31 deletions alpharaw/ms_data_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,14 @@

class MSData_Base:
"""
The base data structure for MS Data, other MSData loader inherit
The base data structure for MS RAW Data, other MSData loaders inherit this class.
Parameters
----------
centroided : bool, optional
If centroiding the peak data, by default True
save_as_hdf : bool, optional
If automatically save the data into HDF5 format, by default False
"""

column_dtypes = {
Expand All @@ -25,7 +32,7 @@ class MSData_Base:
"""
Spectrum dataframe containing the following columns:
- `rt` (float64): in minutes
- `rt` (float64): in minutes. `rt_sec` will be RT in seconds, which is not included by default.
- `precursor_mz` (float64): mono_mz (DDA) or isolation center mz
- `isolation_lower_mz` (float64): left of the isolation window
- `isolation_upper_mz` (float64): right of the isolation window
Expand Down Expand Up @@ -62,15 +69,13 @@ class MSData_Base:
"FT",
"TOF",
]
"""
These spectrum information items in str format can be one-to-one mapped into
unique token IDs (indices), for example "CID"=0, "HCD"=1, ...
Token IDs are better for storage in HDF5 format.
"""

def __init__(self, centroided: bool = True, save_as_hdf: bool = False, **kwargs):
"""
Parameters
----------
centroided : bool, optional
if peaks will be centroided after loading,
by default True
"""
# A spectrum contains peaks
self.spectrum_df: pd.DataFrame = pd.DataFrame()
# A peak contains mz, intensity, and ...
Expand All @@ -82,9 +87,19 @@ def __init__(self, centroided: bool = True, save_as_hdf: bool = False, **kwargs)
self.file_type = ""
self.instrument = "none"

def _get_term_id(self, terminology: str):
def _get_term_id(self, terminology: str) -> int:
"""
Get terminology id from :data:`self.vocab`, -1 if not exist.
Get terminology ID from :attr:`.MSData_Base.vocab`; returns -1 if it does not exist.
Parameters
----------
terminology : str
The terminology name from :attr:`.MSData_Base.vocab`, such as "CID", "HCD", ...
Returns
-------
int
Terminology ID, which is the index in :attr:`.MSData_Base.vocab`.
"""
try:
return self.vocab.index(terminology)
Expand All @@ -96,20 +111,36 @@ def raw_file_path(self) -> str:
return self._raw_file_path

@raw_file_path.setter
def raw_file_path(self, _path: str):
self._raw_file_path = _path
def raw_file_path(self, raw_file_path: str):
self._raw_file_path = raw_file_path

def import_raw(self, _path: str):
self.raw_file_path = _path
raw_data = self._import(_path)
self._set_dataframes(raw_data)
def import_raw(self, raw_file_path: str):
"""
Import a raw file. It involves three steps:
```
raw_data_dict = self._import(raw_file_path)
self._set_dataframes(raw_data_dict)
self._check_df()
```
Parameters
----------
raw_file_path : str
Absolute or relative path of the raw file.
"""
self.raw_file_path = raw_file_path
raw_data_dict = self._import(raw_file_path)
self._set_dataframes(raw_data_dict)
self._check_df()

if self._save_as_hdf:
self.save_hdf(_path + ".hdf")
self.save_hdf(raw_file_path + ".hdf")

def load_raw(self, _path: str):
self.import_raw(_path)
def load_raw(self, raw_file_path: str):
"""
Wrapper of :func:`.MSData_Base.import_raw`
"""
self.import_raw(raw_file_path)

def _save_meta_to_hdf(self, hdf: HDF_File):
hdf.ms_data.meta = {
Expand All @@ -127,15 +158,35 @@ def _load_meta_from_hdf(self, hdf: HDF_File):
self.centroided = hdf.ms_data.meta.centroided
self.instrument = hdf.ms_data.meta.instrument

def save_hdf(self, _path: str):
hdf = HDF_File(_path, read_only=False, truncate=True, delete_existing=True)
def save_hdf(self, hdf_file_path: str):
"""
Save data into HDF5 file
Parameters
----------
hdf_file_path : str
Absolute or relative path of HDF5 file.
"""
hdf = HDF_File(
hdf_file_path, read_only=False, truncate=True, delete_existing=True
)

hdf.ms_data = {"spectrum_df": self.spectrum_df, "peak_df": self.peak_df}

self._save_meta_to_hdf(hdf)

def load_hdf(self, _path: str):
hdf = HDF_File(_path, read_only=True, truncate=False, delete_existing=False)
def load_hdf(self, hdf_file_path: str):
"""
Load data from HDF5 file.
Parameters
----------
hdf_file_path : str
Absolute or relative path of HDF5 file.
"""
hdf = HDF_File(
hdf_file_path, read_only=True, truncate=False, delete_existing=False
)

self.spectrum_df = hdf.ms_data.spectrum_df.values
self.peak_df = hdf.ms_data.peak_df.values
Expand All @@ -144,10 +195,43 @@ def load_hdf(self, _path: str):
self._load_meta_from_hdf(hdf)

def reset_spec_idxes(self):
"""
Reset spec indexes to make sure spec_idx values are continuous ranging from 0 to N.
"""
self.spectrum_df.reset_index(drop=True, inplace=True)
self.spectrum_df["spec_idx"] = self.spectrum_df.index.values

def _import(self, _path):
def _import(self, _path: str) -> dict:
"""
Parameters
----------
_path : str
Path of raw file.
Returns
-------
dict
Example:
```
spec_dict = {
"_peak_indices": _peak_indices,
"peak_mz": np.concatenate(mz_values).copy(),
"peak_intensity": np.concatenate(intensity_values).copy(),
"rt": np.array(rt_values).copy(),
"precursor_mz": np.array(precursor_mz_values).copy(),
"precursor_charge": np.array(precursor_charges, dtype=np.int8).copy(),
"isolation_lower_mz": np.array(isolation_mz_lowers).copy(),
"isolation_upper_mz": np.array(isolation_mz_uppers).copy(),
"ms_level": np.array(ms_order_list, dtype=np.int8).copy(),
"nce": np.array(ce_list, dtype=np.float32).copy(),
}
```
Raises
------
NotImplementedError
Sub-class of `MSData_Base` must implement this method.
"""
raise NotImplementedError(f"{self.__class__} must implement `_import()`")

def _set_dataframes(self, raw_data: dict):
Expand Down Expand Up @@ -200,6 +284,14 @@ def create_spectrum_df(
self,
spectrum_num: int,
):
"""
Create an empty spectrum dataframe from the given number of spectra.
Parameters
----------
spectrum_num : int
The number of spectra.
"""
self.spectrum_df = pd.DataFrame(index=np.arange(spectrum_num, dtype=np.int64))
self.spectrum_df["spec_idx"] = self.spectrum_df.index.values

Expand Down Expand Up @@ -345,6 +437,12 @@ def index_ragged_list(ragged_list: list) -> np.ndarray:


class MSData_HDF(MSData_Base):
"""
Wrapper of reader for alpharaw's HDF5 spectrum file.
This class is registered as "alpharaw", "raw.hdf", "alpharaw_hdf", "hdf" and "hdf5"
in :data:`ms_reader_provider` instance.
"""

def import_raw(self, _path: str):
self.raw_file_path = _path
self.load_hdf(_path)
Expand All @@ -356,20 +454,50 @@ class MSReaderProvider:
def __init__(self):
self.ms_reader_dict = {}

def register_reader(self, ms2_type: str, reader_class):
self.ms_reader_dict[ms2_type.lower()] = reader_class
def register_reader(self, ms_file_type: str, reader_class: type):
"""
Register a new reader for `ms_file_type` format with `reader_class`.
Parameters
----------
ms_file_type : str
AlphaRaw supported MS file types.
reader_class : type
AlphaRaw supported MS class types.
"""
self.ms_reader_dict[ms_file_type.lower()] = reader_class

def get_reader(
self, file_type: str, *, centroided: bool = True, **kwargs
self, ms_file_type: str, *, centroided: bool = True, **kwargs
) -> MSData_Base:
file_type = file_type.lower()
if file_type not in self.ms_reader_dict:
"""
Get the MS reader for the given `ms_file_type`.
Parameters
----------
ms_file_type : str
AlphaRaw supported MS file types.
centroided : bool, optional
If centroiding the data, by default True.
Returns
-------
MSData_Base
Instance of corresponding sub-class of `MSData_Base`.
"""
ms_file_type = ms_file_type.lower()
if ms_file_type not in self.ms_reader_dict:
return None
else:
return self.ms_reader_dict[file_type](centroided=centroided, **kwargs)
return self.ms_reader_dict[ms_file_type](centroided=centroided, **kwargs)


ms_reader_provider = MSReaderProvider()
"""
MS data register (:class:`.MSReaderProvider`) performs as a factory to
produce different readers for different file formats.
"""

ms_reader_provider.register_reader("alpharaw", MSData_HDF)
ms_reader_provider.register_reader("raw.hdf", MSData_HDF)
ms_reader_provider.register_reader("alpharaw_hdf", MSData_HDF)
Expand Down
69 changes: 34 additions & 35 deletions alpharaw/thermo.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
ms_reader_provider,
)

#: These Thermo spectrum items can only be accessed via the trailer dict using RawFileReader APIs.
__trailer_extra_list__ = [
"injection_time",
"cv",
Expand All @@ -24,6 +25,8 @@
"funnel_rf_level",
"faims_cv",
]

#: The auxiliary items and types that can be accessed from thermo RawFileReader.
__auxiliary_item_dtypes__ = {
"injection_time": np.float32,
"cv": np.float32,
Expand All @@ -47,6 +50,23 @@
class ThermoRawData(MSData_Base):
"""
Loading Thermo Raw data as MSData_Base data structure.
This class is registered as "thermo" and "thermo_raw" in :data:`ms_reader_provider`.
Parameters
----------
centroided : bool, optional
If peaks will be centroided after loading. By defaults True.
process_count : int, optional
    Number of processes to use for loading, by default 10.
mp_batch_size : int, optional
    Number of spectra to load in each batch, by default 5000.
save_as_hdf : bool, optional
    If automatically saving the data into HDF5 format after loading, by default False.
dda : bool, optional
    If the data is DDA, by default False.
auxiliary_items : list, optional
    Additional spectrum items, candidates are in :data:`__auxiliary_item_dtypes__`.
    By default [].
"""

def __init__(
Expand All @@ -59,34 +79,6 @@ def __init__(
auxiliary_items: list = [],
**kwargs,
):
"""
Parameters
----------
centroided : bool, default = True
if peaks will be centroided after loading,
by default True
process_count : int, default = 8
number of processes to use for loading
mp_batch_size : int, default = 10000
number of spectra to load in each batch
save_as_hdf : bool, default = False
automatically save hdf after load raw data.
dda : bool, default = False
is DDA data
auxiliary_items : list, default = []
Candidates are:
"injection_time", "cv",
"max_ion_time", "agc_target", "energy_ev",
"injection_optics_settling_time",
"funnel_rf_level", "faims_cv",
"detector", "activation", "analyzer",
"detector_id", "activation_id", "analyzer_id",
"""
super().__init__(centroided, save_as_hdf=save_as_hdf, **kwargs)
self.file_type = "thermo"
self.process_count = process_count
Expand All @@ -99,6 +91,19 @@ def _import(
self,
raw_file_path: str,
) -> dict:
"""
Re-implementation of :func:`MSData_Base._import` to enable :func:`.MSData_Base.import_raw`.
Parameters
----------
raw_file_path : str
File path of the raw data.
Returns
-------
dict
Spectrum information in a temporary dict format.
"""
rawfile = pyrawfilereader.RawFileReader(raw_file_path)
self.creation_time = rawfile.GetCreationDate()

Expand Down Expand Up @@ -185,13 +190,7 @@ def _import_batch(
is dda data.
auxiliary_items : list
Candidates:
"injection_time", "cv",
"max_ion_time", "agc_target", "energy_ev",
"injection_optics_settling_time",
"funnel_rf_level", "faims_cv",
"activation", "analyzer",
"activation_id", "analyzer_id",
Candidates are in :data:`__auxiliary_item_dtypes__`.
Returns
-------
Expand Down

0 comments on commit 34fb605

Please sign in to comment.