Label chunker (#114)
* Added label chunker
* Added documentation for the annotation_chunker
Co-authored-by: shreyasar2202 <ars.shreyas@gmail.com>
Co-authored-by: Sean Perry <shperry@ucsd.edu>
Co-authored-by: Samantha Prestrelski <samantha.prestrelski@gmail.com>
JacobGlennAyers authored Jun 29, 2022
1 parent b27cfc6 commit 2c48a85
Showing 7 changed files with 605 additions and 1,098 deletions.
6 changes: 5 additions & 1 deletion .gitignore
@@ -9,5 +9,9 @@ mixed_bird_manual.csv
outputs/result.csv
*.wav
*.pyc
PyHa_Model_Comparison.ipynb
PyHa_Testing-Copy1.ipynb
PyHa_Testing.ipynb
outputs/*
*.ipynb
!PyHa_Tutorial.ipynb
79 changes: 79 additions & 0 deletions PyHa/annotation_post_processing.py
@@ -0,0 +1,79 @@
import pandas as pd
import numpy as np


def annotation_chunker(kaleidoscope_df, chunk_length):
    """
    Converts a Kaleidoscope-formatted DataFrame of annotations into
    uniform chunks of chunk_length seconds.

    Note: if all or part of an annotation covers the last
    < chunk_length seconds of a clip, that portion is ignored. If two
    annotations overlap within the same chunk_length-second chunk,
    both are represented by that single chunk.

    Args:
        kaleidoscope_df (DataFrame)
            - DataFrame of annotations in Kaleidoscope format
        chunk_length (int)
            - duration in seconds of each annotation chunk
    Returns:
        DataFrame of labels with chunk_length duration
        (elements in "OFFSET" are divisible by chunk_length).
    """

    # Initialize the list of clips to cycle through and the output DataFrame
    clips = kaleidoscope_df["IN FILE"].unique()
    df_columns = {'IN FILE': 'str', 'CLIP LENGTH': 'float64',
                  'CHANNEL': 'int64', 'OFFSET': 'float64',
                  'DURATION': 'float64', 'SAMPLE RATE': 'int64',
                  'MANUAL ID': 'str'}
    output_df = pd.DataFrame({c: pd.Series(dtype=t) for c, t in df_columns.items()})

    # going through each clip
    for clip in clips:
        clip_df = kaleidoscope_df[kaleidoscope_df["IN FILE"] == clip]
        birds = clip_df["MANUAL ID"].unique()
        sr = clip_df["SAMPLE RATE"].unique()[0]
        clip_len = clip_df["CLIP LENGTH"].unique()[0]

        # quick data sanitization: do not consider any clip
        # that is shorter than chunk_length
        if clip_len < chunk_length:
            continue
        potential_annotation_count = int(clip_len) // int(chunk_length)

        # going through each species that was ID'ed in the clip
        arr_len = int(clip_len * 1000)
        for bird in birds:
            species_df = clip_df[clip_df["MANUAL ID"] == bird]
            # millisecond-resolution presence/absence array for this species
            human_arr = np.zeros((arr_len))
            # looping through each annotation
            for annotation in species_df.index:
                # determining the start of a human label, in milliseconds
                minval = int(round(species_df["OFFSET"][annotation] * 1000, 0))
                # determining the end of a human label, in milliseconds
                maxval = int(
                    round(
                        (species_df["OFFSET"][annotation] +
                         species_df["DURATION"][annotation]) * 1000,
                        0))
                # placing the label relative to the clip
                human_arr[minval:maxval] = 1

            # performing the chunk isolation technique on the human array
            for index in range(potential_annotation_count):
                chunk_start = index * (chunk_length * 1000)
                chunk_end = min((index + 1) * chunk_length * 1000, arr_len)
                chunk = human_arr[int(chunk_start):int(chunk_end)]
                # any labeled millisecond in the window flags the whole chunk
                if max(chunk) >= 0.5:
                    # building a one-row DataFrame for this chunk
                    row = pd.DataFrame(index=[0])
                    annotation_start = chunk_start / 1000
                    row["IN FILE"] = clip
                    row["CLIP LENGTH"] = clip_len
                    row["OFFSET"] = annotation_start
                    row["DURATION"] = chunk_length
                    row["SAMPLE RATE"] = sr
                    row["MANUAL ID"] = bird
                    row["CHANNEL"] = 0
                    output_df = pd.concat([output_df, row], ignore_index=True)
    return output_df
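To see how the chunker behaves end to end, the following is a condensed, self-contained sketch of the same millisecond-rasterization logic on a toy annotation table. The helper name chunk_annotations, the file name clip1.wav, and all values in the toy DataFrame are illustrative, not from the commit; the commit's own entry point is annotation_chunker above.

```python
import numpy as np
import pandas as pd

def chunk_annotations(df, chunk_length):
    """Condensed sketch of the chunking logic above: rasterize each
    annotation onto a millisecond grid, then emit one row per
    chunk_length-second window that overlaps any annotation."""
    rows = []
    for (clip, bird), group in df.groupby(["IN FILE", "MANUAL ID"]):
        clip_len = group["CLIP LENGTH"].iloc[0]
        if clip_len < chunk_length:
            continue  # clips shorter than one chunk are dropped
        grid = np.zeros(int(clip_len * 1000))
        for _, ann in group.iterrows():
            start = int(round(ann["OFFSET"] * 1000))
            end = int(round((ann["OFFSET"] + ann["DURATION"]) * 1000))
            grid[start:end] = 1
        for i in range(int(clip_len) // int(chunk_length)):
            window = grid[i * chunk_length * 1000:(i + 1) * chunk_length * 1000]
            if window.max() > 0:  # any overlap flags the whole chunk
                rows.append({"IN FILE": clip, "CLIP LENGTH": clip_len,
                             "CHANNEL": 0, "OFFSET": float(i * chunk_length),
                             "DURATION": float(chunk_length),
                             "SAMPLE RATE": group["SAMPLE RATE"].iloc[0],
                             "MANUAL ID": bird})
    return pd.DataFrame(rows)

# Toy Kaleidoscope-style table: a 10 s clip with two annotations of
# the same species, at 0.5-1.5 s and 2.5-6.5 s.
annotations = pd.DataFrame({
    "IN FILE": ["clip1.wav", "clip1.wav"],
    "CLIP LENGTH": [10.0, 10.0],
    "CHANNEL": [0, 0],
    "OFFSET": [0.5, 2.5],
    "DURATION": [1.0, 4.0],
    "SAMPLE RATE": [44100, 44100],
    "MANUAL ID": ["bird_a", "bird_a"],
})
chunked = chunk_annotations(annotations, 3)
print(chunked[["OFFSET", "DURATION", "MANUAL ID"]])
# The 3 s windows starting at 0, 3, and 6 s each overlap an annotation,
# so three rows come back; the 9-10 s remainder is shorter than
# chunk_length and is dropped, as the docstring warns.
```

Note how the second annotation spans two windows and therefore produces two output rows, while both annotations touching the 0-3 s window yield only one row for that window.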
Binary file removed PyHa/birdnet_lite/__pycache__/analyze.cpython-37.pyc
Binary file removed PyHa/birdnet_lite/__pycache__/analyze.cpython-38.pyc
Binary file removed PyHa/birdnet_lite/__pycache__/analyze.cpython-39.pyc