use same util to strip substructure acronym everywhere

AllenInstitute · Jun 6, 2022 · ad14c9f · ad14c9f
1 parent 5fd483a
commit ad14c9f
Show file tree

Hide file tree

Showing 7 changed files with 93 additions and 88 deletions.
diff --git a/allensdk/brain_observatory/ecephys/_channel.py b/allensdk/brain_observatory/ecephys/_channel.py
@@ -2,6 +2,8 @@
 import numpy as np
 
 from allensdk.core import DataObject
+from allensdk.brain_observatory.ecephys.utils import (
+    strip_substructure_acronym)
 
 
 class Channel(DataObject):
@@ -75,9 +77,8 @@ def probe_horizontal_position(self) -> int:
     @property
     def structure_acronym(self) -> str:
         acronym = self._structure_acronym
-        if type(self._structure_acronym) is str and \
-                self._strip_structure_subregion:
-            acronym = self._structure_acronym.split('-')[0]
+        if self._strip_structure_subregion:
+            acronym = strip_substructure_acronym(self._structure_acronym)
         return acronym
 
     @property

diff --git a/allensdk/brain_observatory/ecephys/utils.py b/allensdk/brain_observatory/ecephys/utils.py
@@ -1,5 +1,5 @@
 import logging
-
+from typing import Optional, Union
 import numpy as np
 import pandas as pd
 
@@ -65,3 +65,43 @@ def clobbering_merge(to_df, from_df, **kwargs):
 
     to_df = to_df.drop(columns=list(overlapping))
     return pd.merge(to_df, from_df, **kwargs)
+
+
+def strip_substructure_acronym(
+        acronym: Optional[Union[str, list]]
+) -> Optional[Union[str, list]]:
+    """
+    Sanitize a structure acronym or a list of structure acronyms
+    by removing the substructure (e.g. DG-mo becomes DG).
+
+    If acronym is a list, every element in the list will be sanitized
+    and a list of unique acronyms will be returned. **Element order will
+    not be preserved**.
+
+    If acronym is None, return None. If None occurs in a list of
+    structure acronyms, it will be omitted.
+    """
+
+    if isinstance(acronym, str):
+        return acronym.split('-')[0]
+    elif isinstance(acronym, list):
+        new_acronym = set()
+
+        for el in acronym:
+            if isinstance(el, str):
+                new_el = el.split('-')[0]
+                new_acronym.add(new_el)
+            elif el is not None:
+                raise RuntimeError(
+                    "Do not know how to parse structure acronym "
+                    f"{el} of type {type(el)}")
+
+        new_acronym = list(new_acronym)
+        new_acronym.sort()
+        return new_acronym
+    elif acronym is None:
+        return None
+    else:
+        raise RuntimeError(
+            "acronym must be a list or a str or None; you gave "
+            f"{acronym} which is a {type(acronym)}")
diff --git a/allensdk/brain_observatory/vbn_2022/metadata_writer/dataframe_manipulations.py b/allensdk/brain_observatory/vbn_2022/metadata_writer/dataframe_manipulations.py
@@ -2,7 +2,7 @@
 # the VBN 2022 metadata dataframes as they are directly queried
 # from LIMS.
 
-from typing import Optional, Dict, List, Union
+from typing import Optional, Dict, List
 import pandas as pd
 import numpy as np
 import json
@@ -22,6 +22,9 @@
 from allensdk.internal.api.queries.behavior_lims_queries import (
     stimulus_pickle_paths_from_behavior_session_ids)
 
+from allensdk.brain_observatory.ecephys.utils import (
+    strip_substructure_acronym)
+
 
 def _add_session_number(
         sessions_df: pd.DataFrame,
@@ -528,47 +531,7 @@ def _get_session_duration_from_behavior_session_ids(
     return durations
 
 
-def _sanitize_structure_acronym(
-        acronym: Optional[Union[str, list]]
-) -> Optional[Union[str, list]]:
-    """
-    Sanitize a structure acronym or a list of structure acronyms
-    by removing the substructure (e.g. DG-mo becomes DG).
-
-    If acronym is a list, every element in the list will be sanitized
-    and a list of unique acronyms will be returned. **Element order will
-    not be preserved**.
-
-    If acronym is None, return None. If None occurs in a list of
-    sturture acronyms, it will be omitted
-    """
-
-    if isinstance(acronym, str):
-        return acronym.split('-')[0]
-    elif isinstance(acronym, list):
-        new_acronym = set()
-
-        for el in acronym:
-            if isinstance(el, str):
-                new_el = el.split('-')[0]
-                new_acronym.add(new_el)
-            elif el is not None:
-                raise RuntimeError(
-                    "Do not know how to parse structure acronym "
-                    f"{el} of type {type(el)}")
-
-        new_acronym = list(new_acronym)
-        new_acronym.sort()
-        return new_acronym
-    elif acronym is None:
-        return None
-    else:
-        raise RuntimeError(
-            "acronym must be a list or a str or None; you gave "
-            f"{acronym} which is a {type(acronym)}")
-
-
-def sanitize_structure_acronyms(
+def strip_substructure_acronym_df(
         df: pd.DataFrame,
         col_name: str) -> pd.DataFrame:
     """
@@ -597,7 +560,7 @@ def sanitize_structure_acronyms(
     Alters df in place
     """
     new_col = [
-        _sanitize_structure_acronym(acronym)
+        strip_substructure_acronym(acronym)
         for acronym in df[col_name]]
     df[col_name] = new_col
     return df
diff --git a/allensdk/brain_observatory/vbn_2022/metadata_writer/metadata_writer.py b/allensdk/brain_observatory/vbn_2022/metadata_writer/metadata_writer.py
@@ -29,7 +29,7 @@
 
 from allensdk.brain_observatory.vbn_2022.\
     metadata_writer.dataframe_manipulations import (
-        sanitize_structure_acronyms)
+        strip_substructure_acronym_df)
 
 from allensdk.core.auth_config import (
     LIMS_DB_CREDENTIAL_MAP,
@@ -82,7 +82,7 @@ def run(self):
                     ecephys_session_id_list=session_id_list,
                     probe_ids_to_skip=probe_ids_to_skip)
 
-        units_table = sanitize_structure_acronyms(
+        units_table = strip_substructure_acronym_df(
                 df=units_table,
                 col_name='structure_acronym')
 
@@ -131,7 +131,7 @@ def run(self):
                     ecephys_session_id_list=session_id_list,
                     probe_ids_to_skip=probe_ids_to_skip)
 
-        probes_table = sanitize_structure_acronyms(
+        probes_table = strip_substructure_acronym_df(
                 df=probes_table,
                 col_name='structure_acronyms')
 
@@ -149,7 +149,7 @@ def run(self):
                     ecephys_session_id_list=session_id_list,
                     probe_ids_to_skip=probe_ids_to_skip)
 
-        channels_table = sanitize_structure_acronyms(
+        channels_table = strip_substructure_acronym_df(
                 df=channels_table,
                 col_name='structure_acronym')
 

diff --git a/allensdk/test/brain_observatory/ecephys/test_ecephys_utils.py b/allensdk/test/brain_observatory/ecephys/test_ecephys_utils.py
@@ -0,0 +1,32 @@
+import pytest
+from allensdk.brain_observatory.ecephys.utils import (
+    strip_substructure_acronym)
+
+
+def test_strip_substructure_acronym():
+    """
+    Test that strip_substructure_acronym behaves properly
+    """
+
+    assert strip_substructure_acronym('abcde-fg-hi') == 'abcde'
+    assert strip_substructure_acronym(None) is None
+
+    data = ['DG-mo', 'DG-pd', 'LS-ab', 'LT-x', 'AB-cd',
+            'WX-yz', 'AB-ef']
+    expected = ['AB', 'DG', 'LS', 'LT', 'WX']
+    assert strip_substructure_acronym(data) == expected
+
+    data = [None, 'DG-mo', 'DG-pd', 'LS-ab', 'LT-x', 'AB-cd',
+            'WX-yz', None, 'AB-ef']
+    expected = ['AB', 'DG', 'LS', 'LT', 'WX']
+    assert strip_substructure_acronym(data) == expected
+
+    assert strip_substructure_acronym([None]) == []
+
+    # pass in a tuple; check that it fails since that is not
+    # a str or a list
+    with pytest.raises(RuntimeError, match="list or a str"):
+        strip_substructure_acronym(('a', 'b', 'c'))
+
+    with pytest.raises(RuntimeError, match="Do not know how to parse"):
+        strip_substructure_acronym(['abc', 2.3])
diff --git a/allensdk/test/brain_observatory/ecephys/test_probes.py b/allensdk/test/brain_observatory/ecephys/test_probes.py
@@ -1,7 +1,6 @@
 import datetime
 import json
 
-import numpy as np
 import pytest
 from pynwb import NWBFile
 
@@ -77,7 +76,7 @@ def test_units_from_structure_with_acronym(self):
         assert expected_n_units == obtained_n_units
 
 
-@pytest.mark.parametrize('structure_acronym', ('LGd-sh', 'LGd', np.nan))
+@pytest.mark.parametrize('structure_acronym', ('LGd-sh', 'LGd', None))
 @pytest.mark.parametrize('strip_structure_subregion', (True, False))
 def test_probe_channels_strip_subregion(
         structure_acronym, strip_structure_subregion):
@@ -100,4 +99,4 @@ def test_probe_channels_strip_subregion(
                 else 'LGd'
         assert c.structure_acronym == expected
     else:
-        assert np.isnan(c.structure_acronym)
+        c.structure_acronym is None
diff --git a/allensdk/test/brain_observatory/vbn_2022/metadata_writer/test_dataframe_manipulations.py b/allensdk/test/brain_observatory/vbn_2022/metadata_writer/test_dataframe_manipulations.py
@@ -14,8 +14,7 @@
         remove_aborted_sessions,
         _get_session_duration_from_behavior_session_ids,
         remove_pretest_sessions,
-        _sanitize_structure_acronym,
-        sanitize_structure_acronyms)
+        strip_substructure_acronym_df)
 
 from allensdk.test.brain_observatory.behavior.data_objects.lims_util import \
     LimsTest
@@ -352,35 +351,6 @@ def test_remove_pretest_sessions():
     pd.testing.assert_frame_equal(expected, actual)
 
 
-def test_sanitize_single_acronym():
-    """
-    Test that _sanitize_structure_acronym behaves properly
-    """
-
-    assert _sanitize_structure_acronym('abcde-fg-hi') == 'abcde'
-    assert _sanitize_structure_acronym(None) is None
-
-    data = ['DG-mo', 'DG-pd', 'LS-ab', 'LT-x', 'AB-cd',
-            'WX-yz', 'AB-ef']
-    expected = ['AB', 'DG', 'LS', 'LT', 'WX']
-    assert _sanitize_structure_acronym(data) == expected
-
-    data = [None, 'DG-mo', 'DG-pd', 'LS-ab', 'LT-x', 'AB-cd',
-            'WX-yz', None, 'AB-ef']
-    expected = ['AB', 'DG', 'LS', 'LT', 'WX']
-    assert _sanitize_structure_acronym(data) == expected
-
-    assert _sanitize_structure_acronym([None]) == []
-
-    # pass in a tuple; check that it fails since that is not
-    # a str or a list
-    with pytest.raises(RuntimeError, match="list or a str"):
-        _sanitize_structure_acronym(('a', 'b', 'c'))
-
-    with pytest.raises(RuntimeError, match="Do not know how to parse"):
-        _sanitize_structure_acronym(['abc', 2.3])
-
-
 @pytest.mark.parametrize(
     "input_data, output_data, col_name",
     [([{'a': 1, 'b': 'DG-mo'},
@@ -397,19 +367,19 @@ def test_sanitize_single_acronym():
       [{'a': 1, 'b': ['AB', 'DG']},
        {'a': 2, 'b': 'DG'}],
       'b')])
-def test_sanitize_structure_acronyms(
+def test_strip_substructure_acronym_df(
         input_data,
         output_data,
         col_name):
     """
-    Test method that sanitizes the structure acronym
+    Test method that strips the substructure acronym
     columns in a dataframe
     """
 
     input_df = pd.DataFrame(data=input_data)
     expected_df = pd.DataFrame(data=output_data)
 
-    actual_df = sanitize_structure_acronyms(
+    actual_df = strip_substructure_acronym_df(
             df=input_df,
             col_name=col_name)