Skip to content

Commit

Permalink
use same util to strip substructure acronym everywhere
Browse files Browse the repository at this point in the history
  • Loading branch information
danielsf committed Jun 6, 2022
1 parent 5fd483a commit ad14c9f
Show file tree
Hide file tree
Showing 7 changed files with 93 additions and 88 deletions.
7 changes: 4 additions & 3 deletions allensdk/brain_observatory/ecephys/_channel.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import numpy as np

from allensdk.core import DataObject
from allensdk.brain_observatory.ecephys.utils import (
strip_substructure_acronym)


class Channel(DataObject):
Expand Down Expand Up @@ -75,9 +77,8 @@ def probe_horizontal_position(self) -> int:
@property
def structure_acronym(self) -> str:
    """
    Structure acronym recorded for this channel.

    When self._strip_structure_subregion is truthy, the stored value
    is passed through strip_substructure_acronym (the same utility
    used everywhere else) so that, e.g., 'LGd-sh' is reported as 'LGd'.
    Otherwise the stored value is returned untouched.

    Notes
    -----
    The stripping is delegated entirely to the shared helper; there is
    deliberately no local split('-') logic here, so this property cannot
    drift from the rest of the code base.
    NOTE(review): the helper is expected to return None for a None
    acronym and to raise RuntimeError for values that are not
    str / list / None -- confirm against its definition.
    """
    if not self._strip_structure_subregion:
        return self._structure_acronym
    return strip_substructure_acronym(self._structure_acronym)

@property
Expand Down
42 changes: 41 additions & 1 deletion allensdk/brain_observatory/ecephys/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import logging

from typing import Optional, Union
import numpy as np
import pandas as pd

Expand Down Expand Up @@ -65,3 +65,43 @@ def clobbering_merge(to_df, from_df, **kwargs):

to_df = to_df.drop(columns=list(overlapping))
return pd.merge(to_df, from_df, **kwargs)


def strip_substructure_acronym(
        acronym: Optional[Union[str, list]]
        ) -> Optional[Union[str, list]]:
    """
    Remove the substructure suffix from a structure acronym
    (e.g. 'DG-mo' becomes 'DG').

    Parameters
    ----------
    acronym:
        A single acronym, a list of acronyms, or None

    Returns
    -------
    If acronym is a str, the text preceding the first '-'.
    If acronym is a list, an alphabetically sorted list of the unique
    stripped acronyms; the input order is not preserved and any None
    elements are dropped.
    If acronym is None, None.

    Raises
    ------
    RuntimeError
        If acronym is not a str, a list or None, or if a list
        contains an element that is neither a str nor None
    """
    if acronym is None:
        return None

    if isinstance(acronym, str):
        return acronym.split('-')[0]

    if not isinstance(acronym, list):
        raise RuntimeError(
            "acronym must be a list or a str or None; you gave "
            f"{acronym} which is a {type(acronym)}")

    # a set collapses acronyms that differ only by substructure
    stripped = set()
    for element in acronym:
        if element is None:
            continue
        if not isinstance(element, str):
            raise RuntimeError(
                "Do not know how to parse structure acronym "
                f"{element} of type {type(element)}")
        stripped.add(element.split('-')[0])

    return sorted(stripped)
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# the VBN 2022 metadata dataframes as they are directly queried
# from LIMS.

from typing import Optional, Dict, List, Union
from typing import Optional, Dict, List
import pandas as pd
import numpy as np
import json
Expand All @@ -22,6 +22,9 @@
from allensdk.internal.api.queries.behavior_lims_queries import (
stimulus_pickle_paths_from_behavior_session_ids)

from allensdk.brain_observatory.ecephys.utils import (
strip_substructure_acronym)


def _add_session_number(
sessions_df: pd.DataFrame,
Expand Down Expand Up @@ -528,47 +531,7 @@ def _get_session_duration_from_behavior_session_ids(
return durations


def _sanitize_structure_acronym(
acronym: Optional[Union[str, list]]
) -> Optional[Union[str, list]]:
"""
Sanitize a structure acronym or a list of structure acronyms
by removing the substructure (e.g. DG-mo becomes DG).
If acronym is a list, every element in the list will be sanitized
and a list of unique acronyms will be returned. **Element order will
not be preserved**.
If acronym is None, return None. If None occurs in a list of
sturture acronyms, it will be omitted
"""

if isinstance(acronym, str):
return acronym.split('-')[0]
elif isinstance(acronym, list):
new_acronym = set()

for el in acronym:
if isinstance(el, str):
new_el = el.split('-')[0]
new_acronym.add(new_el)
elif el is not None:
raise RuntimeError(
"Do not know how to parse structure acronym "
f"{el} of type {type(el)}")

new_acronym = list(new_acronym)
new_acronym.sort()
return new_acronym
elif acronym is None:
return None
else:
raise RuntimeError(
"acronym must be a list or a str or None; you gave "
f"{acronym} which is a {type(acronym)}")


def sanitize_structure_acronyms(
def strip_substructure_acronym_df(
df: pd.DataFrame,
col_name: str) -> pd.DataFrame:
"""
Expand Down Expand Up @@ -597,7 +560,7 @@ def sanitize_structure_acronyms(
Alters df in place
"""
new_col = [
_sanitize_structure_acronym(acronym)
strip_substructure_acronym(acronym)
for acronym in df[col_name]]
df[col_name] = new_col
return df
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@

from allensdk.brain_observatory.vbn_2022.\
metadata_writer.dataframe_manipulations import (
sanitize_structure_acronyms)
strip_substructure_acronym_df)

from allensdk.core.auth_config import (
LIMS_DB_CREDENTIAL_MAP,
Expand Down Expand Up @@ -82,7 +82,7 @@ def run(self):
ecephys_session_id_list=session_id_list,
probe_ids_to_skip=probe_ids_to_skip)

units_table = sanitize_structure_acronyms(
units_table = strip_substructure_acronym_df(
df=units_table,
col_name='structure_acronym')

Expand Down Expand Up @@ -131,7 +131,7 @@ def run(self):
ecephys_session_id_list=session_id_list,
probe_ids_to_skip=probe_ids_to_skip)

probes_table = sanitize_structure_acronyms(
probes_table = strip_substructure_acronym_df(
df=probes_table,
col_name='structure_acronyms')

Expand All @@ -149,7 +149,7 @@ def run(self):
ecephys_session_id_list=session_id_list,
probe_ids_to_skip=probe_ids_to_skip)

channels_table = sanitize_structure_acronyms(
channels_table = strip_substructure_acronym_df(
df=channels_table,
col_name='structure_acronym')

Expand Down
32 changes: 32 additions & 0 deletions allensdk/test/brain_observatory/ecephys/test_ecephys_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import pytest
from allensdk.brain_observatory.ecephys.utils import (
strip_substructure_acronym)


def test_strip_substructure_acronym():
    """
    Exercise strip_substructure_acronym on a str, on None, on lists
    (with and without None elements) and on its two error paths
    """
    # both list inputs below collapse to the same sorted, unique result
    unique_sorted = ['AB', 'DG', 'LS', 'LT', 'WX']

    # scalar inputs
    assert strip_substructure_acronym('abcde-fg-hi') == 'abcde'
    assert strip_substructure_acronym(None) is None

    # list inputs: duplicates are merged and None entries dropped
    without_none = ['DG-mo', 'DG-pd', 'LS-ab', 'LT-x', 'AB-cd',
                    'WX-yz', 'AB-ef']
    assert strip_substructure_acronym(without_none) == unique_sorted

    with_none = [None, 'DG-mo', 'DG-pd', 'LS-ab', 'LT-x', 'AB-cd',
                 'WX-yz', None, 'AB-ef']
    assert strip_substructure_acronym(with_none) == unique_sorted

    assert strip_substructure_acronym([None]) == []

    # a tuple is neither a str nor a list, so it must be rejected
    with pytest.raises(RuntimeError, match="list or a str"):
        strip_substructure_acronym(('a', 'b', 'c'))

    # a non-str, non-None element inside a list must be rejected
    with pytest.raises(RuntimeError, match="Do not know how to parse"):
        strip_substructure_acronym(['abc', 2.3])
5 changes: 2 additions & 3 deletions allensdk/test/brain_observatory/ecephys/test_probes.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import datetime
import json

import numpy as np
import pytest
from pynwb import NWBFile

Expand Down Expand Up @@ -77,7 +76,7 @@ def test_units_from_structure_with_acronym(self):
assert expected_n_units == obtained_n_units


@pytest.mark.parametrize('structure_acronym', ('LGd-sh', 'LGd', np.nan))
@pytest.mark.parametrize('structure_acronym', ('LGd-sh', 'LGd', None))
@pytest.mark.parametrize('strip_structure_subregion', (True, False))
def test_probe_channels_strip_subregion(
structure_acronym, strip_structure_subregion):
Expand All @@ -100,4 +99,4 @@ def test_probe_channels_strip_subregion(
else 'LGd'
assert c.structure_acronym == expected
else:
assert np.isnan(c.structure_acronym)
c.structure_acronym is None
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@
remove_aborted_sessions,
_get_session_duration_from_behavior_session_ids,
remove_pretest_sessions,
_sanitize_structure_acronym,
sanitize_structure_acronyms)
strip_substructure_acronym_df)

from allensdk.test.brain_observatory.behavior.data_objects.lims_util import \
LimsTest
Expand Down Expand Up @@ -352,35 +351,6 @@ def test_remove_pretest_sessions():
pd.testing.assert_frame_equal(expected, actual)


def test_sanitize_single_acronym():
    """
    Check _sanitize_structure_acronym on single values, on lists and
    on inputs it is expected to reject
    """
    collapsed = ['AB', 'DG', 'LS', 'LT', 'WX']

    # single values
    assert _sanitize_structure_acronym(None) is None
    assert _sanitize_structure_acronym('abcde-fg-hi') == 'abcde'

    # lists are reduced to sorted unique acronyms; None is dropped
    acronyms = ['DG-mo', 'DG-pd', 'LS-ab', 'LT-x', 'AB-cd',
                'WX-yz', 'AB-ef']
    assert _sanitize_structure_acronym(acronyms) == collapsed

    acronyms = [None, 'DG-mo', 'DG-pd', 'LS-ab', 'LT-x', 'AB-cd',
                'WX-yz', None, 'AB-ef']
    assert _sanitize_structure_acronym(acronyms) == collapsed

    assert _sanitize_structure_acronym([None]) == []

    # tuples are not accepted: only str, list or None
    with pytest.raises(RuntimeError, match="list or a str"):
        _sanitize_structure_acronym(('a', 'b', 'c'))

    # list elements must be str or None
    with pytest.raises(RuntimeError, match="Do not know how to parse"):
        _sanitize_structure_acronym(['abc', 2.3])


@pytest.mark.parametrize(
"input_data, output_data, col_name",
[([{'a': 1, 'b': 'DG-mo'},
Expand All @@ -397,19 +367,19 @@ def test_sanitize_single_acronym():
[{'a': 1, 'b': ['AB', 'DG']},
{'a': 2, 'b': 'DG'}],
'b')])
def test_sanitize_structure_acronyms(
def test_strip_substructure_acronym_df(
input_data,
output_data,
col_name):
"""
Test method that sanitizes the structure acronym
Test method that strips the substructure acronym
columns in a dataframe
"""

input_df = pd.DataFrame(data=input_data)
expected_df = pd.DataFrame(data=output_data)

actual_df = sanitize_structure_acronyms(
actual_df = strip_substructure_acronym_df(
df=input_df,
col_name=col_name)

Expand Down

0 comments on commit ad14c9f

Please sign in to comment.