UCSD-E4E · sprestrelski · Jul 18, 2022 · Jun 21, 2021 · Jun 21, 2021 · Jul 6, 2022
diff --git a/.gitignore b/.gitignore
@@ -9,9 +9,6 @@ mixed_bird_manual.csv
 outputs/result.csv
 *.wav
 *.pyc
-PyHa_Model_Comparison.ipynb
-PyHa_Testing-Copy1.ipynb
-PyHa_Testing.ipynb
 outputs/*
 *.ipynb
 !PyHa_Tutorial.ipynb
diff --git a/PyHa/statistics.py b/PyHa/statistics.py
@@ -1,7 +1,7 @@
 import pandas as pd
 from scipy import stats
 import numpy as np
-
+import time
 
 # Function that takes in a pandas dataframe of annotations and outputs a
 # dataframe of the mean, median, mode, quartiles, and standard deviation of
@@ -198,8 +198,14 @@ def automated_labeling_statistics(
     clips = list(dict.fromkeys(clips))
     # Initializing the returned dataframe
     statistics_df = pd.DataFrame()
+
+    num_errors = 0
+    num_processed = 0
+
+    start_time = time.time()
     # Looping through each audio clip
     for clip in clips:
+        num_processed += 1
         clip_automated_df = automated_df[automated_df["IN FILE"] == clip]
         clip_manual_df = manual_df[manual_df["IN FILE"] == clip]
         try:
@@ -218,11 +224,16 @@ def automated_labeling_statistics(
                     statistics_df = clip_stats_df
                 else:
                     statistics_df = statistics_df.append(clip_stats_df)
-
         except BaseException as e:
-            print("Something went wrong with: " + clip)
-            print(e)
+            num_errors += 1
+            #print("Something went wrong with: " + clip)
+            #print(e)
             continue
+        if num_processed % 50 == 0:
+            print("Processed", num_processed, "clips in", int((time.time() - start_time) * 10) / 10.0, 'seconds')
+            start_time = time.time()
+    if num_errors > 0:
+        print("Something went wrong with", num_errors, "clips out of", len(clips), "clips")
     statistics_df.reset_index(inplace=True, drop=True)
     return statistics_df
 
@@ -260,13 +271,6 @@ def global_dataset_statistics(statistics_df, manual_id = "bird"):
              'Global IoU': round(IoU, 6)}
     return pd.DataFrame.from_dict([entry])
 
-# TODO rework this function to implement some linear algebra, right now the
-# nested for loop won't handle larger loads well To make a global matrix, find
-# the clip with the most amount of automated labels and set that to the number
-# of columns I believe this is currently the largest bottleneck in terms of
-# temporal performance.
-
-
 def clip_IoU(automated_df, manual_df):
     """
     Function that takes in the manual and automated labels for a clip and
@@ -306,65 +310,42 @@ def clip_IoU(automated_df, manual_df):
 
     # Initializing arrays that will represent each of the human and automated
     # labels
-    bot_arr = np.zeros((int(duration * SAMPLE_RATE)))
-    human_arr = np.zeros((int(duration * SAMPLE_RATE)))
+    bot_arr = np.zeros((automated_row_count, int(duration * SAMPLE_RATE)))
+    human_arr = np.zeros((manual_row_count, int(duration * SAMPLE_RATE)))
+
+    # Fill array with automated labels 
+    for row in automated_df.index:
+        # Determine the beginning of an automated label
+        minval = int(round(automated_df["OFFSET"][row] * SAMPLE_RATE, 0))
+        # Determining the ending of an automated label
+        maxval = int(round((automated_df["OFFSET"][row] + automated_df["DURATION"][row]) * 
+                SAMPLE_RATE,0))
+        # Placing the label relative to the clip
+        bot_arr[row][minval:maxval] = 1
 
-    # Looping through each human label
+    # Fill array with human labels
     for row in manual_df.index:
-        # print(row)
         # Determining the beginning of a human label
         minval = int(round(manual_df["OFFSET"][row] * SAMPLE_RATE, 0))
         # Determining the end of a human label
-        maxval = int(
-            round(
-                (manual_df["OFFSET"][row] +
-                 manual_df["DURATION"][row]) *
-                SAMPLE_RATE,
-                0))
+        maxval = int(round((manual_df["OFFSET"][row] + manual_df["DURATION"][row]) *
+                SAMPLE_RATE,0))
         # Placing the label relative to the clip
-        human_arr[minval:maxval] = 1
-        # Looping through each automated label
-        for column in automated_df.index:
-            # Determining the beginning of an automated label
-            minval = int(
-                round(
-                    automated_df["OFFSET"][column] *
-                    SAMPLE_RATE,
-                    0))
-            # Determining the ending of an automated label
-            maxval = int(
-                round(
-                    (automated_df["OFFSET"][column] +
-                     automated_df["DURATION"][column]) *
-                    SAMPLE_RATE,
-                    0))
-            # Placing the label relative to the clip
-            bot_arr[minval:maxval] = 1
-            # Determining the overlap between the human label and the automated
-            # label
-            intersection = human_arr * bot_arr
-            # Determining the union between the human label and the automated
-            # label
-            union = human_arr + bot_arr
-            union[union == 2] = 1
-            # Determining how much of the human label and the automated label
-            # overlap with respect to time
-            intersection_count = np.count_nonzero(
-                intersection == 1) / SAMPLE_RATE
-            # Determining the span of the human label and the automated label
-            # with respect to time.
-            union_count = np.count_nonzero(union == 1) / SAMPLE_RATE
-            # Placing the Intersection over Union Percentage into it's
-            # respective position in the array.
-            IoU_Matrix[row, column] = round(
-                intersection_count / union_count, 4)
-            # Resetting the automated label to zero
-            bot_arr[bot_arr == 1] = 0
-        # Resetting the human label to zero
-        human_arr[human_arr == 1] = 0
-
-    return IoU_Matrix
+        human_arr[row][minval:maxval] = 1
 
+    # Multiply every row in human by every row in bot
+    IoU_Matrix = np.matmul(human_arr, bot_arr.transpose())
+
+    # Compare each human annotation to every automated annotation
+    for i in range(manual_row_count):
+        for j in range(automated_row_count):
+            # Skip comparision if there is no intersection, since IoU = 0 anyway 
+            if IoU_Matrix[i][j] == 0:
+                continue
+            # Sum logicial ORR of time bins shared by the human and bot annotations
+            IoU_Matrix[i][j] /= np.sum(np.logical_or(human_arr[i], bot_arr[j]))
+            IoU_Matrix[i][j] = round(IoU_Matrix[i][j], 4)
+    return np.nan_to_num(IoU_Matrix)
 
 def matrix_IoU_Scores(IoU_Matrix, manual_df, threshold):
     """
@@ -418,9 +399,7 @@ def matrix_IoU_Scores(IoU_Matrix, manual_df, threshold):
         precision = round(tp_count / (tp_count + fp_count), 4)
         f1 = round(2 * (recall * precision) / (recall + precision), 4)
     except ZeroDivisionError:
-        print(
-            "Division by zero setting precision, recall, and f1 to zero on " +
-            filename)
+#        print("Division by zero setting precision, recall, and f1 to zero on", filename)
         recall = 0
         precision = 0
         f1 = 0
@@ -781,4 +760,4 @@ def class_statistics(clip_statistics):
             temp_df = global_statistics(class_df, manual_id = class_)
             class_statistics = class_statistics.append(temp_df)
     class_statistics.reset_index(inplace=True,drop=True)
-    return class_statistics
+    return class_statistics