Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ticket/2450/supplemental/columns #2451

Merged
merged 5 commits into from
Jun 1, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@

import allensdk

from allensdk.core.dataframe_utils import (
patch_df_from_other)

from allensdk.brain_observatory.vbn_2022.metadata_writer.schemas import (
VBN2022MetadataWriterInputSchema,
DataReleaseToolsInputSchema)
Expand Down Expand Up @@ -143,7 +146,7 @@ def run(self):
df=channels_table,
output_path=self.args['channels_path'])

(session_table,
(ecephys_session_table,
behavior_session_table) = session_tables_from_ecephys_session_id_list(
lims_connection=lims_connection,
mtrain_connection=mtrain_connection,
Expand All @@ -154,16 +157,35 @@ def run(self):
ecephys_nwb_dir = pathlib.Path(
self.args['ecephys_nwb_dir'])

session_table = add_file_paths_to_metadata_table(
metadata_table=session_table,
ecephys_session_table = add_file_paths_to_metadata_table(
metadata_table=ecephys_session_table,
id_generator=file_id_generator,
file_dir=ecephys_nwb_dir,
file_prefix=self.args['ecephys_nwb_prefix'],
index_col='ecephys_session_id',
on_missing_file=self.args['on_missing_file'])

# add supplemental columns to the ecephys_sessions
# table
if self.args['supplemental_data'] is not None:
self.logger.info("Adding supplemental data")
supplemental_df = pd.DataFrame(
data=self.args['supplemental_data'])

columns_to_patch = []
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can't you just use pd.merge here instead of patch_df_from_other?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Even if we did just use pd.merge, I'd want to wrap it in a function that we could test to make sure that the columns we are adding get added the way we expect. patch_df_from_other is already tested. I'd rather keep this as it is.

for column_name in supplemental_df.columns:
if column_name == 'ecephys_session_id':
continue
columns_to_patch.append(column_name)

ecephys_session_table = patch_df_from_other(
target_df=ecephys_session_table,
source_df=supplemental_df,
columns_to_patch=columns_to_patch,
index_column='ecephys_session_id')

self.write_df(
df=session_table,
df=ecephys_session_table,
output_path=self.args['ecephys_sessions_path'])

self.write_df(
Expand Down
12 changes: 12 additions & 0 deletions allensdk/brain_observatory/vbn_2022/metadata_writer/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,18 @@ class VBN2022MetadataWriterInputSchema(argschema.ArgSchema):
"{ecephys_nwb_dir}/{ecephys_nwb_prefix}_{ecephys_session_id}.nwb")
)

supplemental_data = argschema.fields.List(
argschema.fields.Dict,
default=None,
allow_none=True,
description=(
"List of dicts defining any supplemental columns "
"that need to be added to the ecephys_sessions.csv "
"table. Each dict should represent a row in a dataframe "
"that will get merged on ecephys_session_id with "
"the ecephys_sessions table (row must therefore contain "
"ecephys_session_id)"))

on_missing_file = argschema.fields.Str(
default='error',
required=False,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ def smoketest_config_fixture():
config parameters for on-prem metadata writer smoketest
"""
config = {
"ecephys_session_id_list": [1115077618, 1081429294, 1123100019],
"probes_to_skip": [{"session": 1123100019, "probe": "probeC"}]
"ecephys_session_id_list": [1115077618, 1081429294],
"probes_to_skip": [{"session": 1115077618, "probe": "probeC"}]
}
return config

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,20 @@

@pytest.mark.requires_bamboo
@pytest.mark.parametrize(
'on_missing_file', ['skip', 'warn'])
'on_missing_file, with_supplement',
[('skip', False), ('warn', False), ('warn', True)])
def test_metadata_writer_smoketest(
smoketest_config_fixture,
tmp_path_factory,
helper_functions,
on_missing_file):
on_missing_file,
with_supplement):
"""
smoke test for VBN 2022 metadata writer. Requires LIMS
and mtrain connections.

If with_supplement is True, add supplemental data
to the ecephys_sessions.csv file and test for their existence
"""

output_names = ('units.csv', 'probes.csv', 'channels.csv',
Expand All @@ -34,6 +39,20 @@ def test_metadata_writer_smoketest(
suffix='.json')[1])
config['output_dir'] = str(output_dir.resolve().absolute())

if with_supplement:
# add supplemental columns to configuration
supplement = [{'ecephys_session_id': 1115077618,
'supplementA': 'cat',
'supplementB': 5},
{'ecephys_session_id': 1081429294,
'supplementA': 'frog',
'supplementB': 6},
{'ecephys_session_id': 11111,
'supplementA': None,
'supplementB': None}]

config['supplemental_data'] = supplement

expected_paths = []
for name in output_names:
file_path = output_dir / name
Expand Down Expand Up @@ -63,8 +82,25 @@ def test_metadata_writer_smoketest(
assert file_path.exists()
df = pd.read_csv(file_path)
expected_columns = set(column_lookup[file_path.name])
if with_supplement and file_path.name == 'ecephys_sessions.csv':
expected_columns.add('supplementA')
expected_columns.add('supplementB')

actual_columns = set(df.columns)
assert expected_columns == actual_columns

if with_supplement:
df = pd.read_csv(output_dir / 'ecephys_sessions.csv')
# make sure that no extra rows were added when adding
# the supplemental data
assert len(df) == 2

for expected in supplement[:2]:
this_row = df.loc[
df.ecephys_session_id == expected['ecephys_session_id']]
assert len(this_row) == 1
assert this_row.supplementA.values[0] == expected['supplementA']
assert this_row.supplementB.values[0] == expected['supplementB']

helper_functions.windows_safe_cleanup_dir(
dir_path=output_dir)
24 changes: 24 additions & 0 deletions allensdk/test/core/test_datafame_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,3 +174,27 @@ def test_patch_with_duplicates(
columns_to_patch=['b', 'd'])

pd.testing.assert_frame_equal(actual, expected_df)


def test_patch_new_column(
        target_df_fixture,
        source_df_fixture):
    """
    Test that patch_df_from_other can introduce a column ('e') that
    exists only in source_df, adding it to target_df joined on the
    index column 'a'.
    """
    # Rows expected after patching: original target columns plus the
    # new 'e' column populated from source_df (NaN where 'a' has no
    # match in source_df).
    expected_rows = [
        {'a': 1, 'b': 3.4, 'c': 'apple', 'd': None, 'e': 'frog'},
        {'a': 9, 'b': 4.5, 'c': 'banana', 'd': 4.6, 'e': None},
        {'a': 12, 'b': 7.8, 'c': 'pineapple', 'd': 'purple', 'e': 'dog'},
        {'a': 17, 'b': None, 'c': 'papaya', 'd': 11, 'e': 'cat'}
    ]
    expected_df = pd.DataFrame(data=expected_rows)

    # copy source so the fixture is not mutated by the call
    patched = patch_df_from_other(
        source_df=source_df_fixture.copy(deep=True),
        target_df=target_df_fixture,
        index_column='a',
        columns_to_patch=['e'])

    pd.testing.assert_frame_equal(patched, expected_df)