Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ticket/ecephys/33a/metadata/writer #2407

Merged
merged 35 commits into from
May 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
5c849a2
fix docstring error in timestamps_processing
danielsf Apr 28, 2022
353cf02
factor lims_query methods out of behavior_project_cache space
danielsf Apr 26, 2022
68c1a32
add session_type and date_of_acquisition properties to BehaviorStimulusFile
danielsf Apr 27, 2022
4ef4148
start implementing VBN metadata writer
danielsf Apr 19, 2022
22bbfe1
add method to lookup pickle paths from a list of behavior session IDs
danielsf Apr 27, 2022
dc80384
reorganize metadata utility methods
danielsf Apr 27, 2022
2862b41
add docstrings to lims_queries.py
danielsf Apr 29, 2022
0f6ab97
add method to get behavior_sessions table from ecephys_session_ids
danielsf Apr 28, 2022
9e76d7c
finish functional implementation of VBN_2022 metadata writer
danielsf Apr 28, 2022
6c67e6f
reorganize metadata writer utils
danielsf May 2, 2022
0f91f73
fix _add_prior_omissions_to_behavior
danielsf May 2, 2022
44474fb
finish adding dataframe_manipulations unit test
danielsf May 2, 2022
97b95be
add class to generate unique file IDs when metadata tables are written
danielsf May 2, 2022
62cf764
add method to add file_id and file_path to session_table
danielsf May 2, 2022
20acde6
connect add_file_path... CLI metadata writer
danielsf May 2, 2022
168434e
add on-prem smoke test for VBN metadata writer CLI
danielsf May 2, 2022
8f7b6a2
move _add_images_from_behavior to dataframe_manipulations
danielsf May 2, 2022
b227a30
pep8 change
danielsf May 2, 2022
751ad8b
fix test to pass in windows
danielsf May 3, 2022
065769a
simplify return statement in BehaviorStimulusFile.session_type
danielsf May 4, 2022
120fcf0
rename test package
danielsf May 4, 2022
8f4c1a5
reformat SQL queries
danielsf May 4, 2022
a705f50
remove parentheses from return statement
danielsf May 4, 2022
5484608
fix docstrings in lims_queries.py
danielsf May 4, 2022
673c81d
remove unused array
danielsf May 4, 2022
b54485a
add valid_data to channels and units table
danielsf May 4, 2022
183cf8e
use existing get_prior_exposures_to_omissions method
danielsf May 5, 2022
8230824
_add_prior_omissions uses code from VBO tables/utils
danielsf May 5, 2022
41f28d1
indent queries in allensdk/internal/api/queries/lims_queries.py
danielsf May 6, 2022
53eb5d2
factor out different flavors of lims queries
danielsf May 6, 2022
c2814c4
privatize methods in lims_queries.py
danielsf May 6, 2022
b02d213
move fileID generator infrastructure to data_release_utils
danielsf May 6, 2022
433de7c
fix docstrings
danielsf May 6, 2022
f08e511
move get_image_set
danielsf May 6, 2022
e8be731
pep8 changes
danielsf May 6, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,15 @@
from allensdk.internal.api import db_connection_creator
from allensdk.brain_observatory.ecephys.ecephys_project_api.http_engine \
import (HttpEngine)
from allensdk.core.typing import SupportsStr
from allensdk.core.authentication import DbCredentials
from allensdk.core.auth_config import (
MTRAIN_DB_CREDENTIAL_MAP, LIMS_DB_CREDENTIAL_MAP)
from allensdk.internal.api.queries.utils import (
build_in_list_selector_query)
from allensdk.internal.api.queries.behavior_lims_queries import (
foraging_id_map_from_behavior_session_id)
from allensdk.internal.api.queries.mtrain_queries import (
session_stage_from_foraging_id)


class BehaviorProjectLimsApi(BehaviorProjectBase):
Expand Down Expand Up @@ -114,35 +119,6 @@ def default(
return cls(lims_engine, mtrain_engine, app_engine,
data_release_date=data_release_date)

@staticmethod
def _build_in_list_selector_query(
        col,
        valid_list: Optional[SupportsStr] = None,
        operator: str = "WHERE") -> str:
    """
    Build a SQL clause restricting ``col`` to the values in
    ``valid_list``.

    If ``valid_list`` is empty or None, an empty string is returned
    (i.e. no filtering clause is produced).

    NOTE: if string ids are used, then the strings in `valid_list` must
    be enclosed in single quotes, or else the query will throw a column
    does not exist error. E.g. ["'mystringid1'", "'mystringid2'"...]

    :param col: name of column to compare if in a list
    :type col: str
    :param valid_list: iterable of values that can be mapped to str
        (e.g. string, int, float).
    :type valid_list: list
    :param operator: SQL operator to start the clause. Default="WHERE".
        Valid inputs: "AND", "OR", "WHERE" (not case-sensitive).
    :type operator: str
    """
    if not valid_list:
        return ""
    # deduplicate and sort so the emitted SQL is deterministic
    unique_values = sorted(set(str(value) for value in valid_list))
    return f"{operator} {col} IN ({','.join(unique_values)})"

def _build_experiment_from_session_query(self) -> str:
"""Aggregate sql sub-query to get all ophys_experiment_ids associated
with a single ophys_session_id."""
Expand Down Expand Up @@ -243,51 +219,6 @@ def _get_behavior_summary_table(self) -> pd.DataFrame:
self.logger.debug(f"get_behavior_session_table query: \n{query}")
return self.lims_engine.select(query)

def _get_foraging_ids_from_behavior_session(
        self, behavior_session_ids: List[int]) -> List[str]:
    """
    Query LIMS for the foraging_ids associated with a set of
    behavior sessions.

    Parameters
    ----------
    behavior_session_ids
        Behavior session ids to restrict the query to.

    Returns
    -------
    List of foraging_ids; only rows whose foraging_id is not NULL
    are returned, so the result may be shorter than the input list.
    """
    # "AND" operator because the clause is appended to an existing
    # WHERE (foraging_id IS NOT NULL) clause below
    behav_ids = self._build_in_list_selector_query("id",
                                                   behavior_session_ids,
                                                   operator="AND")
    forag_ids_query = f"""
        SELECT foraging_id
        FROM behavior_sessions
        WHERE foraging_id IS NOT NULL
        {behav_ids};
        """
    self.logger.debug("get_foraging_ids_from_behavior_session query: \n"
                      f"{forag_ids_query}")
    foraging_ids = self.lims_engine.fetchall(forag_ids_query)

    self.logger.debug(f"Retrieved {len(foraging_ids)} foraging ids for"
                      f" behavior stage query. Ids = {foraging_ids}")
    return foraging_ids

def _get_behavior_stage_table(
        self,
        behavior_session_ids: Optional[List[int]] = None):
    """
    Query mtrain for the session_type (stage name) of behavior
    sessions.

    Parameters
    ----------
    behavior_session_ids
        If provided, restrict the query to these sessions (their
        foraging ids are first looked up in LIMS); otherwise the
        full table is fetched from mtrain.

    Returns
    -------
    DataFrame with columns session_type and foraging_id.
    """
    # Select fewer rows if possible via behavior_session_id
    if behavior_session_ids:
        foraging_ids = self._get_foraging_ids_from_behavior_session(
            behavior_session_ids)
        # mtrain stores foraging ids as strings; wrap each id in
        # single quotes so the IN (...) clause compares string literals
        foraging_ids = [f"'{fid}'" for fid in foraging_ids]
    # Otherwise just get the full table from mtrain
    else:
        foraging_ids = None

    foraging_ids_query = self._build_in_list_selector_query(
        "bs.id", foraging_ids)

    query = f"""
        SELECT
            stages.name as session_type,
            bs.id AS foraging_id
        FROM behavior_sessions bs
        JOIN stages ON stages.id = bs.state_id
        {foraging_ids_query};
        """
    self.logger.debug(f"_get_behavior_stage_table query: \n {query}")
    return self.mtrain_engine.select(query)

def get_behavior_stage_parameters(self,
foraging_ids: List[str]) -> pd.Series:
"""Gets the stage parameters for each foraging id from mtrain
Expand All @@ -302,7 +233,7 @@ def get_behavior_stage_parameters(self,
---------
Series with index of foraging id and values stage parameters
"""
foraging_ids_query = self._build_in_list_selector_query(
foraging_ids_query = build_in_list_selector_query(
"bs.id", foraging_ids)

query = f"""
Expand Down Expand Up @@ -514,8 +445,19 @@ def get_behavior_session_table(self) -> pd.DataFrame:
:rtype: pd.DataFrame
"""
summary_tbl = self._get_behavior_summary_table()
stimulus_names = self._get_behavior_stage_table(
behavior_session_ids=summary_tbl.index.tolist())

foraging_id_map = foraging_id_map_from_behavior_session_id(
lims_engine=self.lims_engine,
behavior_session_ids=summary_tbl.behavior_session_id.tolist(),
logger=self.logger)

foraging_ids = list(foraging_id_map.foraging_id)

stimulus_names = session_stage_from_foraging_id(
mtrain_engine=self.mtrain_engine,
foraging_ids=foraging_ids,
logger=self.logger)

return (summary_tbl.merge(stimulus_names,
on=["foraging_id"], how="left")
.set_index("behavior_session_id"))
Expand Down Expand Up @@ -602,19 +544,19 @@ def _get_behavior_session_release_filter(self):
release_behavior_only_session_ids + \
release_behavior_with_ophys_session_ids

return self._build_in_list_selector_query(
return build_in_list_selector_query(
"bs.id", release_behavior_session_ids)

def _get_ophys_session_release_filter(self):
release_files = self.get_release_files(
file_type='BehaviorOphysNwb')
return self._build_in_list_selector_query(
return build_in_list_selector_query(
"bs.id", release_files['behavior_session_id'].tolist())

def _get_ophys_experiment_release_filter(self):
release_files = self.get_release_files(
file_type='BehaviorOphysNwb')
return self._build_in_list_selector_query(
return build_in_list_selector_query(
"oe.id", release_files.index.tolist())

def get_natural_movie_template(self, number: int) -> Iterable[bytes]:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from typing import Optional
import pandas as pd
import re


def get_image_set(df: pd.DataFrame) -> pd.Series:
    """Get image set

    The image set here is the letter part of the session type
    ie for session type OPHYS_1_images_B, it would be "B"

    Some session types don't have an image set name, such as
    gratings, which will be set to null

    Parameters
    ----------
    df
        The session df

    Returns
    --------
    Series with index same as df whose values are image_set
    """
    # compile once; the pattern captures the single character
    # following "images_" in the session_type string
    pattern = re.compile(r'.*images_(?P<image_set>\w)')

    def _extract_letter(stage_name: Optional[str]):
        hit = pattern.match(stage_name)
        if hit is None:
            return None
        return hit.group('image_set')

    # null session types are dropped, so the returned Series only
    # carries the rows where session_type is populated
    non_null = df['session_type'][df['session_type'].notnull()]
    return non_null.apply(_extract_letter)
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import re
from typing import Optional

import pandas as pd

from allensdk.brain_observatory.behavior.behavior_project_cache.project_apis.data_io import BehaviorProjectLimsApi # noqa: E501
from allensdk.brain_observatory.behavior.behavior_project_cache \
.tables.util.image_presentation_utils import (
get_image_set)


def get_prior_exposures_to_session_type(df: pd.DataFrame) -> pd.Series:
Expand Down Expand Up @@ -40,16 +40,7 @@ def get_prior_exposures_to_image_set(df: pd.DataFrame) -> pd.Series:
--------
Series with index same as df and values prior exposure counts to image set
"""

def __get_image_set_name(session_type: Optional[str]):
match = re.match(r'.*images_(?P<image_set>\w)', session_type)
if match is None:
return None
return match.group('image_set')

session_type = df['session_type'][
df['session_type'].notnull()]
image_set = session_type.apply(__get_image_set_name)
image_set = get_image_set(df=df)
return __get_prior_exposure_count(df=df, to=image_set)


Expand Down
58 changes: 58 additions & 0 deletions allensdk/brain_observatory/behavior/data_files/stimulus_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
from cachetools import cached, LRUCache
from cachetools.keys import hashkey

import datetime
import pandas as pd
import copy

from allensdk.internal.api import PostgresQueryMixin
from allensdk.internal.core.lims_utilities import safe_system_path
Expand Down Expand Up @@ -142,6 +144,62 @@ def num_frames(self) -> int:
self._validate_frame_data()
return len(self.data['items']['behavior']['intervalsms']) + 1

@property
def date_of_acquisition(self) -> datetime.datetime:
    """
    The datetime at which this session began.

    Read from self.data['start_time']. A deep copy is returned so
    that callers cannot mutate the cached pickle contents.

    Raises
    ------
    KeyError
        If the pickle data contains no 'start_time' entry.
    """
    assert isinstance(self.data, dict)
    if 'start_time' in self.data:
        return copy.deepcopy(self.data['start_time'])
    raise KeyError(
        "No 'start_time' listed in pickle file "
        f"{self.filepath}")

@property
def session_type(self) -> str:
    """
    The stage name recorded in the stimulus pickle.

    The stage can appear in either (or both) of

        data['items']['behavior']['params']['stage']
        data['items']['behavior']['cl_params']['stage']

    Returns
    -------
    The stage name.

    Raises
    ------
    RuntimeError
        If neither location contains a stage, or if both do and
        they disagree.
    """
    behavior = self.data['items']['behavior']

    # .get chains treat a missing key and a None value the same way,
    # matching the original nested 'in' checks for dict-valued params
    stage_from_params = behavior.get('params', {}).get('stage')
    stage_from_cl_params = behavior.get('cl_params', {}).get('stage')

    if stage_from_params is None and stage_from_cl_params is None:
        raise RuntimeError("Could not find stage in pickle file "
                           f"{self.filepath}")

    if (stage_from_params is not None
            and stage_from_cl_params is not None
            and stage_from_params != stage_from_cl_params):
        raise RuntimeError(
            "Conflicting session_types in pickle file "
            f"{self.filepath}\n"
            f"cl_params: {stage_from_cl_params}\n"
            f"params: {stage_from_params}\n")

    if stage_from_params is not None:
        return stage_from_params
    return stage_from_cl_params


class ReplayStimulusFile(_StimulusFile):

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def get_frame_indices(
each event occured on. Indexes will be chosen to be the
first index satisfying

frame_timestamps[event_indices] <= event_timestamps
frame_timestamps[event_indices] >= event_timestamps
event_timestamps < frame_timestamps[event_indices+1]

Parameters
Expand Down
1 change: 1 addition & 0 deletions allensdk/brain_observatory/data_release_utils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# empty
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# empty
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import pathlib


class FileIDGenerator(object):
    """
    Assign a unique integer ID to each file in a data release.

    IDs are handed out sequentially starting from zero; asking for the
    same (resolved, absolute) path more than once yields the same ID.
    """

    def __init__(self):
        # map from absolute path (str) -> assigned integer ID
        self._id_lookup = dict()
        # next ID to hand out
        self._next_id = 0
        # sentinel for files missing from the release
        self._dummy_value = -999

    @property
    def dummy_value(self) -> int:
        """
        Value reserved for files that are missing from the
        release
        """
        return self._dummy_value

    def id_from_path(self,
                     file_path: pathlib.Path) -> int:
        """
        Return the unique ID for ``file_path``, assigning a new one
        if this path has not been seen before.

        Raises
        ------
        ValueError
            If ``file_path`` is not a pathlib.Path, does not point to
            an existing regular file, or is a symlink.
        """
        if not isinstance(file_path, pathlib.Path):
            raise ValueError(
                "file_path must be a pathlib.Path (this is so "
                "we can resolve it into an absolute path). You passed "
                f"in a {type(file_path)}")

        if not file_path.is_file():
            raise ValueError(f"{file_path} is not a file")

        if file_path.is_symlink():
            raise ValueError(
                f"{file_path} is a symlink; must be an actual path")

        # key on the resolved absolute path so equivalent spellings of
        # the same file share one ID
        key = str(file_path.resolve().absolute())
        if key in self._id_lookup:
            return self._id_lookup[key]

        new_id = self._next_id
        self._id_lookup[key] = new_id
        self._next_id = new_id + 1
        # never hand out the dummy sentinel as a real ID
        while self._next_id == self._dummy_value:
            self._next_id += 1
        return new_id
Loading