-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Added label chunker * Added documentation for the annotation_chunker Co-authored-by: shreyasar2202 <ars.shreyas@gmail.com> Co-authored-by: Sean Perry <shperry@ucsd.edu> Co-authored-by: Samantha Prestrelski <samantha.prestrelski@gmail.com>
- Loading branch information
1 parent
b27cfc6
commit 2c48a85
Showing
7 changed files
with
605 additions
and
1,098 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
import pandas as pd | ||
import numpy as np | ||
|
||
|
||
def annotation_chunker(kaleidoscope_df, chunk_length):
    """
    Function that converts a Kaleidoscope-formatted Dataframe containing
    annotations to uniform chunks of chunk_length.

    Note: if all or part of an annotation covers the last < chunk_length
    seconds of a clip it will be ignored. If two annotations with different
    "MANUAL ID" values overlap in the same chunk, both are represented in
    that chunk (one output row per "MANUAL ID" per chunk).

    Args:
        kaleidoscope_df (Dataframe)
            - Dataframe of annotations in kaleidoscope format. The columns
              "IN FILE", "CLIP LENGTH", "OFFSET", "DURATION", "SAMPLE RATE"
              and "MANUAL ID" are read.

        chunk_length (int or float)
            - duration in seconds to set all annotation chunks. Fractional
              values are supported with millisecond resolution.

    Returns:
        Dataframe of labels with chunk_length duration
        (elements in "OFFSET" are multiples of chunk_length). Rows are
        ordered by clip, then by "MANUAL ID", then by chunk start, each in
        order of first appearance. "CHANNEL" is always 0.

    Raises:
        ValueError: if chunk_length is not a positive duration of at least
        one millisecond.
    """
    # All bookkeeping is done in integer milliseconds so that fractional
    # chunk lengths neither truncate to zero nor drift through float math.
    chunk_ms = int(round(chunk_length * 1000))
    if chunk_ms <= 0:
        raise ValueError(
            "chunk_length must be a positive duration of at least 1 ms")

    # Output schema; also fixes the column order of the returned dataframe
    df_columns = {'IN FILE': 'str', 'CLIP LENGTH': 'float64',
                  'CHANNEL': 'int64', 'OFFSET': 'float64',
                  'DURATION': 'float64', 'SAMPLE RATE': 'int64',
                  'MANUAL ID': 'str'}

    # Rows are collected in a list and turned into a dataframe once at the
    # end; concatenating inside the loop copies the whole frame per row.
    rows = []

    # going through each clip
    for clip in kaleidoscope_df["IN FILE"].unique():
        clip_df = kaleidoscope_df[kaleidoscope_df["IN FILE"] == clip]
        # The first row's values are taken as representative of the clip
        sr = clip_df["SAMPLE RATE"].iloc[0]
        clip_len = clip_df["CLIP LENGTH"].iloc[0]

        # One array element per millisecond of the clip
        arr_len = int(clip_len * 1000)

        # Only whole chunks are considered; clips shorter than one chunk
        # (and any trailing partial chunk) are skipped.
        chunk_count = arr_len // chunk_ms
        if chunk_count <= 0:
            continue

        # going through each species that was ID'ed in the clip
        for bird in clip_df["MANUAL ID"].unique():
            species_df = clip_df[clip_df["MANUAL ID"] == bird]
            human_arr = np.zeros(arr_len, dtype=bool)

            # Marking every millisecond covered by an annotation
            offsets = species_df["OFFSET"].to_numpy()
            durations = species_df["DURATION"].to_numpy()
            for offset, duration in zip(offsets, durations):
                start = float(offset)
                # Clamp at 0: a negative slice bound would wrap around to
                # the end of the clip instead of marking its beginning.
                minval = max(int(round(start * 1000)), 0)
                maxval = max(int(round((start + float(duration)) * 1000)), 0)
                # Slicing already clips an end that runs past the clip
                human_arr[minval:maxval] = True

            # Emitting one row per chunk that contains any labelled time
            for index in range(chunk_count):
                chunk_start = index * chunk_ms
                if human_arr[chunk_start:chunk_start + chunk_ms].any():
                    rows.append({
                        "IN FILE": clip,
                        "CLIP LENGTH": clip_len,
                        "CHANNEL": 0,
                        "OFFSET": chunk_start / 1000,
                        "DURATION": chunk_length,
                        "SAMPLE RATE": sr,
                        "MANUAL ID": bird,
                    })

    if not rows:
        # Nothing to report: return an empty dataframe with the typed schema
        return pd.DataFrame(
            {c: pd.Series(dtype=t) for c, t in df_columns.items()})

    output_df = pd.DataFrame(rows, columns=list(df_columns))
    # Time columns and CHANNEL are normalised to the schema dtypes;
    # "SAMPLE RATE" and the text columns keep the dtype inferred from the
    # input values.
    return output_df.astype({"CLIP LENGTH": "float64", "CHANNEL": "int64",
                             "OFFSET": "float64", "DURATION": "float64"})
Binary file not shown.
Binary file not shown.
Binary file not shown.
Oops, something went wrong.