From 3f34514eb2a677ce8879577ae3ea1259b9d7544d Mon Sep 17 00:00:00 2001
From: sammlapp <sammlapp@gmail.com>
Date: Thu, 6 Feb 2025 16:13:49 -0500
Subject: [PATCH] enable passing list of options for annotation_column

users often have a headache when realizing the annotation column could be named, e.g., species, Species, or Annotation

resolves #1098: allow multiple possible column names for BoxedAnnotations.from_raven_files

adds tests
---
 opensoundscape/annotations.py                 | 42 ++++++++++++++++---
 .../raven_with_Annotation_col.txt             |  2 +
 tests/test_annotations.py                     | 36 ++++++++++++++++
 3 files changed, 74 insertions(+), 6 deletions(-)
 create mode 100644 tests/raven_annots/raven_with_Annotation_col.txt

diff --git a/opensoundscape/annotations.py b/opensoundscape/annotations.py
index c2c25f6e..488b4ef4 100644
--- a/opensoundscape/annotations.py
+++ b/opensoundscape/annotations.py
@@ -129,13 +129,16 @@ def from_raven_files(
         Args:
             raven_files: list or iterable of raven .txt file paths (as str or pathlib.Path),
                 or a single file path (str or pathlib.Path). Eg ['path1.txt','path2.txt']
-            annotation_column: string name or integer position of column containing annotations
+            annotation_column: column name(s) or integer position to use as the annotations
                 - pass `None` to load the Raven file without explicitly
                 assigning a column as the annotation column. The resulting
                 object's `.df` will have an `annotation` column with nan values!
                 - if a string is passed, the column with this name will be used as the annotations.
                 - if an integer is passed, the column at that position will be used as the annotation column.
-                NOTE: column positions are ordered increasingly starting at 0.
+                    NOTE: column positions are ordered increasingly starting at 0.
+                - if a list/tuple is passed, find a column matching any value in the list
+                    NOTE: if multiple columns match, an error will be raised
+                    Example: ['annotation','label','Species'] will find a column with any of these names
             audio_files: (list) optionally specify audio files corresponding to each
                 raven file (length should match raven_files) Eg ['path1.txt','path2.txt']
                 - if None (default), .clip_labels() will not be able to
@@ -195,6 +198,10 @@ def from_raven_files(
             but their lengths did not match.
             """
 
+        assert isinstance(
+            annotation_column, (str, int, type(None), list, tuple)
+        ), "Annotation column index has to be a string, integer, list, tuple, or None."
+
         all_file_dfs = []
 
         # mapping of Raven file columns to standard opensoundscape names
@@ -214,10 +221,7 @@ def from_raven_files(
                 warnings.warn(f"{raven_file} has zero rows.")
                 continue
 
-            assert isinstance(
-                annotation_column, (str, int, type(None))
-            ), "Annotation column index has to be a string, integer, or None."
-
+            # handle various options for specifying the annotation column
             if isinstance(annotation_column, str):
                 # annotation_column is a string that is present in the annotation file's header
                 try:
@@ -251,6 +255,32 @@ def from_raven_files(
                     },
                     errors="raise",
                 )
+            elif isinstance(annotation_column, (list, tuple)):
+                annotation_column = list(annotation_column)
+                # make sure exactly one value from annotation_column is in the df.columns
+                matching_cols = [col for col in annotation_column if col in df.columns]
+                if len(matching_cols) == 0:
+                    raise KeyError(
+                        f"None of the specified annotation columns, {annotation_column}, "
+                        f"match any of the column names in the annotation file: {list(df.columns)} "
+                        f"when attempting to load {raven_file}. "
+                        f"Please ensure all raven files contain one of the specified annotation_column values."
+                    )
+                elif len(matching_cols) > 1:
+                    raise KeyError(
+                        f"Multiple columns in the annotation file match the specified annotation columns: "
+                        f"{matching_cols}. when attempting to load {raven_file}. "
+                        "Please ensure only one column in each raven file matches a value listed in annotation_columns"
+                    )
+                else:
+                    # rename the column to 'annotation'
+                    df = df.rename(
+                        columns={
+                            matching_cols[0]: "annotation",
+                        },
+                        errors="raise",
+                    )
+
             else:
                 # None was passed to annotation_column
                 # we'll create an empty `annotation` column
diff --git a/tests/raven_annots/raven_with_Annotation_col.txt b/tests/raven_annots/raven_with_Annotation_col.txt
new file mode 100644
index 00000000..d8f34fae
--- /dev/null
+++ b/tests/raven_annots/raven_with_Annotation_col.txt
@@ -0,0 +1,2 @@
+Selection	View	Channel	Begin Time (s)	End Time (s)	Low Freq (Hz)	High Freq (Hz)	Annotation	
+1	Spectrogram 1	1	1.897648165999982	4.110570810999974	1326.3	3266.5	CSWA
\ No newline at end of file
diff --git a/tests/test_annotations.py b/tests/test_annotations.py
index c4967e1e..45457a98 100644
--- a/tests/test_annotations.py
+++ b/tests/test_annotations.py
@@ -15,6 +15,11 @@ def raven_file():
     return "tests/raven_annots/MSD-0003_20180427_2minstart00.Table.1.selections.txt"
 
 
+@pytest.fixture()
+def raven_file_Annotation_col():
+    return "tests/raven_annots/raven_with_Annotation_col.txt"
+
+
 @pytest.fixture()
 def audio_2min():
     return "tests/audio/MSD-0003_20180427_2minstart00.wav"
@@ -232,6 +237,37 @@ def test_load_raven_annotation_column_name(raven_file):
         a = BoxedAnnotations.from_raven_files([raven_file], annotation_column=-1)
 
 
+def test_from_raven_files_list_of_annotation_column(
+    raven_file, raven_file_Annotation_col
+):
+    ba = BoxedAnnotations.from_raven_files(
+        [raven_file, raven_file_Annotation_col],
+        annotation_column=["Species", "Annotation"],
+    )
+    assert "CSWA" in ba.unique_labels() and "WOTH" in ba.unique_labels()
+
+    # also allowed to be a tuple
+    ba = BoxedAnnotations.from_raven_files(
+        [raven_file, raven_file_Annotation_col],
+        annotation_column=("Species", "Annotation"),
+    )
+    assert "CSWA" in ba.unique_labels() and "WOTH" in ba.unique_labels()
+
+    # raises an exception if no matching column is found
+    with pytest.raises(KeyError):
+        ba = BoxedAnnotations.from_raven_files(
+            [raven_file, raven_file_Annotation_col],
+            annotation_column=["Species", "notacolumn"],
+        )
+
+    # raises an exception if multiple matching columns are found
+    with pytest.raises(KeyError):
+        ba = BoxedAnnotations.from_raven_files(
+            [raven_file, raven_file_Annotation_col],
+            annotation_column=["Species", "Selection"],
+        )
+
+
 def test_load_raven_annotations_empty(raven_file_empty):
     a = BoxedAnnotations.from_raven_files([raven_file_empty], None)
     assert len(a.df) == 0