From 3f34514eb2a677ce8879577ae3ea1259b9d7544d Mon Sep 17 00:00:00 2001 From: sammlapp Date: Thu, 6 Feb 2025 16:13:49 -0500 Subject: [PATCH] enable passing list of options for annotation_column users often have a headache when realizing the annotation column could be species or Species or Annotation, e.g. resolves allow multiple possible column names for BoxedAnnotations.from_raven_files #1098 adds tests --- opensoundscape/annotations.py | 42 ++++++++++++++++--- .../raven_with_Annotation_col.txt | 2 + tests/test_annotations.py | 36 ++++++++++++++++ 3 files changed, 74 insertions(+), 6 deletions(-) create mode 100644 tests/raven_annots/raven_with_Annotation_col.txt diff --git a/opensoundscape/annotations.py b/opensoundscape/annotations.py index c2c25f6e..488b4ef4 100644 --- a/opensoundscape/annotations.py +++ b/opensoundscape/annotations.py @@ -129,13 +129,16 @@ def from_raven_files( Args: raven_files: list or iterable of raven .txt file paths (as str or pathlib.Path), or a single file path (str or pathlib.Path). Eg ['path1.txt','path2.txt'] - annotation_column: string name or integer position of column containing annotations + annotation_column: column name(s) or integer position to use as the annotations - pass `None` to load the Raven file without explicitly assigning a column as the annotation column. The resulting object's `.df` will have an `annotation` column with nan values! - if a string is passed, the column with this name will be used as the annotations. - if an integer is passed, the column at that position will be used as the annotation column. - NOTE: column positions are ordered increasingly starting at 0. + NOTE: column positions are ordered increasingly starting at 0. 
+ - if a list/tuple is passed, find a column matching any value in the list + NOTE: if multiple columns match, an error will be raised + Example: ['annotation','label','Species'] will find a column with any of these names audio_files: (list) optionally specify audio files corresponding to each raven file (length should match raven_files) Eg ['path1.txt','path2.txt'] - if None (default), .clip_labels() will not be able to @@ -195,6 +198,10 @@ def from_raven_files( but their lengths did not match. """ + assert isinstance( + annotation_column, (str, int, type(None), list, tuple) + ), "Annotation column index has to be a string, integer, list, tuple, or None." + all_file_dfs = [] # mapping of Raven file columns to standard opensoundscape names @@ -214,10 +221,7 @@ def from_raven_files( warnings.warn(f"{raven_file} has zero rows.") continue - assert isinstance( - annotation_column, (str, int, type(None)) - ), "Annotation column index has to be a string, integer, or None." - + # handle various options for specifying the annotation column if isinstance(annotation_column, str): # annotation_column is a string that is present in the annotation file's header try: @@ -251,6 +255,32 @@ def from_raven_files( }, errors="raise", ) + elif isinstance(annotation_column, (list, tuple)): + annotation_column = list(annotation_column) + # make sure exactly one value from annotation_column is in the df.columns + matching_cols = [col for col in annotation_column if col in df.columns] + if len(matching_cols) == 0: + raise KeyError( + f"None of the specified annotation columns, {annotation_column}, " + f"match any of the column names in the annotation file: {list(df.columns)} " + f"when attempting to load {raven_file}. " + f"Please ensure all raven files contain one of the specified annotation_column values." + ) + elif len(matching_cols) > 1: + raise KeyError( + f"Multiple columns in the annotation file match the specified annotation columns: " + f"{matching_cols}. 
when attempting to load {raven_file}. " + "Please ensure only one column in each raven file matches a value listed in annotation_columns" + ) + else: + # rename the column to 'annotation' + df = df.rename( + columns={ + matching_cols[0]: "annotation", + }, + errors="raise", + ) + else: # None was passed to annotation_column # we'll create an empty `annotation` column diff --git a/tests/raven_annots/raven_with_Annotation_col.txt b/tests/raven_annots/raven_with_Annotation_col.txt new file mode 100644 index 00000000..d8f34fae --- /dev/null +++ b/tests/raven_annots/raven_with_Annotation_col.txt @@ -0,0 +1,2 @@ +Selection View Channel Begin Time (s) End Time (s) Low Freq (Hz) High Freq (Hz) Annotation +1 Spectrogram 1 1 1.897648165999982 4.110570810999974 1326.3 3266.5 CSWA \ No newline at end of file diff --git a/tests/test_annotations.py b/tests/test_annotations.py index c4967e1e..45457a98 100644 --- a/tests/test_annotations.py +++ b/tests/test_annotations.py @@ -15,6 +15,11 @@ def raven_file(): return "tests/raven_annots/MSD-0003_20180427_2minstart00.Table.1.selections.txt" +@pytest.fixture() +def raven_file_Annotation_col(): + return "tests/raven_annots/raven_with_Annotation_col.txt" + + @pytest.fixture() def audio_2min(): return "tests/audio/MSD-0003_20180427_2minstart00.wav" @@ -232,6 +237,37 @@ def test_load_raven_annotation_column_name(raven_file): a = BoxedAnnotations.from_raven_files([raven_file], annotation_column=-1) +def test_from_raven_files_list_of_annotation_column( + raven_file, raven_file_Annotation_col +): + ba = BoxedAnnotations.from_raven_files( + [raven_file, raven_file_Annotation_col], + annotation_column=["Species", "Annotation"], + ) + assert "CSWA" in ba.unique_labels() and "WOTH" in ba.unique_labels() + + # also allowed to be a tuple + ba = BoxedAnnotations.from_raven_files( + [raven_file, raven_file_Annotation_col], + annotation_column=("Species", "Annotation"), + ) + assert "CSWA" in ba.unique_labels() and "WOTH" in ba.unique_labels() + + # 
raises an exception if no matching column is found + with pytest.raises(KeyError): + ba = BoxedAnnotations.from_raven_files( + [raven_file, raven_file_Annotation_col], + annotation_column=["Species", "notacolumn"], + ) + + # raises an exception if multiple matching columns are found + with pytest.raises(KeyError): + ba = BoxedAnnotations.from_raven_files( + [raven_file, raven_file_Annotation_col], + annotation_column=["Species", "Selection"], + ) + + def test_load_raven_annotations_empty(raven_file_empty): a = BoxedAnnotations.from_raven_files([raven_file_empty], None) assert len(a.df) == 0