Skip to content

Commit

Permalink
Limit 'Others' behavior segments to match the maximum labeled-behavior count
Browse files Browse the repository at this point in the history
  • Loading branch information
healthonrails committed Nov 4, 2024
1 parent 745726f commit 6e29ed8
Showing 1 changed file with 106 additions and 11 deletions.
117 changes: 106 additions & 11 deletions annolid/behavior/data_loading/video_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
import random
import csv
import json
import itertools
from collections import defaultdict

# Constants for file paths and split ratio
BASE_FOLDER = "behavior_videos"
BASE_FOLDER = "/content/behaivor_videos"
OUTPUT_VIDEO_FOLDER = "behavior_video_clips"
TRAIN_JSONL_PATH = "train_video_annotations.jsonl"
TEST_JSONL_PATH = "test_video_annotations.jsonl"
Expand Down Expand Up @@ -85,21 +87,41 @@ def sample_segments_from_gaps(gaps, video_file_path, video_name, gap_duration, b
return entries


def sample_limited_segments_from_gaps(gaps, video_file_path, video_name, gap_duration, max_count, behavior_label="Others"):
    """Sample at most ``max_count`` fixed-length segments from unlabeled gaps.

    Each gap is tiled with back-to-back windows of ``gap_duration`` seconds;
    every window is cut out of the video and recorded as an annotation entry
    labeled ``behavior_label``. Sampling stops as soon as ``max_count``
    entries have been produced, so the "Others" class cannot dominate the
    labeled behaviors.

    Args:
        gaps: Iterable of ``(start, end)`` times (seconds) with no labeled behavior.
        video_file_path: Path of the source video to cut segments from.
        video_name: Sanitized video name used to build output clip filenames.
        gap_duration: Length in seconds of each sampled segment.
        max_count: Upper bound on the number of entries returned.
        behavior_label: Label attached to every sampled segment.

    Returns:
        List of annotation entries (one per extracted clip), length <= max_count.
    """
    entries = []
    for gap_start, gap_end in gaps:
        remaining = max_count - len(entries)
        if remaining <= 0:
            # Quota reached — no further gaps are sampled.
            break
        # Number of whole windows that fit in this gap, capped by the quota.
        fitting = int((gap_end - gap_start) // gap_duration)
        for idx in range(min(fitting, remaining)):
            seg_start = gap_start + idx * gap_duration
            seg_end = seg_start + gap_duration
            seg_path = f"{OUTPUT_VIDEO_FOLDER}/{video_name}_other_{seg_start}-{seg_end}.mp4"
            extract_video_segment(
                video_file_path, seg_start, seg_end, seg_path)
            entries.append(create_annotation_entry(
                behavior_label, seg_path))
    return entries


def process_video_file(csv_path, video_file_path, gap_duration=5):
"""Processes a video file by extracting labeled and unlabeled segments."""
"""Processes a video file by extracting labeled and limited 'Others' segments."""
from moviepy.editor import VideoFileClip
start_events, stop_events = parse_behavior_events(csv_path)

# Get a clean video name without spaces
video_name = os.path.splitext(os.path.basename(video_file_path))[
0].replace(" ", "_")

# Extract labeled segments
# Extract labeled segments and count occurrences of each behavior
labeled_entries = []
behavior_counts = defaultdict(int)
for start_event in start_events:
start_time = start_event["time"]
behavior = start_event["behavior"].replace(
" ", "_") # Replace spaces in behavior
behavior = start_event["behavior"]
matching_stop = next(
(stop for stop in stop_events if stop["behavior"] == behavior and stop["time"] > start_time), None)

Expand All @@ -111,13 +133,17 @@ def process_video_file(csv_path, video_file_path, gap_duration=5):
video_file_path, start_time, end_time, segment_path)
labeled_entries.append(
create_annotation_entry(behavior, segment_path))
behavior_counts[behavior] += 1

# Determine max count among labeled behaviors
max_behavior_count = max(behavior_counts.values(), default=0)

# Extract "Others" segments
# Extract "Others" segments, limited by max_behavior_count
with VideoFileClip(video_file_path) as video:
video_duration = video.duration
gaps = find_gaps(start_events, stop_events, video_duration, gap_duration)
other_entries = sample_segments_from_gaps(
gaps, video_file_path, video_name, gap_duration)
other_entries = sample_limited_segments_from_gaps(
gaps, video_file_path, video_name, gap_duration, max_behavior_count)

return labeled_entries + other_entries

Expand All @@ -137,8 +163,76 @@ def split_and_save_annotations(entries, train_path, test_path, train_ratio=0.97)
f.write(json.dumps(entry) + "\n")


def stratified_split_and_save_annotations(entries, train_path, test_path, train_ratio=0.97):
    """Splits annotations into stratified train and test sets by behavior and saves them to JSONL files."""
    # Bucket entries by behavior label; the 'response' field carries the label.
    grouped = defaultdict(list)
    for item in entries:
        grouped[item['response']].append(item)

    # Split each behavior group independently so every label keeps
    # (approximately) the requested train/test proportion.
    train_set, test_set = [], []
    for members in grouped.values():
        random.shuffle(members)
        cut = int(len(members) * train_ratio)
        train_set += members[:cut]
        test_set += members[cut:]

    # Persist both splits, one JSON object per line.
    for path, bucket in ((train_path, train_set), (test_path, test_set)):
        with open(path, 'w') as fh:
            fh.writelines(json.dumps(item) + "\n" for item in bucket)


def stratified_interleaved_split_and_save_annotations(entries, train_path, test_path, train_ratio=0.97):
    """Splits annotations into stratified, interleaved train and test sets by behavior and saves them to JSONL files."""
    # Bucket entries by behavior label; the 'response' field carries the label.
    by_behavior = defaultdict(list)
    for item in entries:
        by_behavior[item['response']].append(item)

    # Per-behavior split: shuffle each group, then cut it at train_ratio.
    train_buckets, test_buckets = [], []
    for members in by_behavior.values():
        random.shuffle(members)
        cut = int(len(members) * train_ratio)
        train_buckets.append(members[:cut])
        test_buckets.append(members[cut:])

    def round_robin(buckets):
        # Take element j from every bucket in turn, skipping exhausted
        # buckets — same order as chain(zip_longest(*buckets)) with the
        # None padding removed.
        longest = max((len(b) for b in buckets), default=0)
        return [b[j] for j in range(longest) for b in buckets if j < len(b)]

    # Persist both interleaved splits, one JSON object per line.
    for path, ordered in ((train_path, round_robin(train_buckets)),
                          (test_path, round_robin(test_buckets))):
        with open(path, 'w') as fh:
            for item in ordered:
                fh.write(json.dumps(item) + "\n")


def process_dataset():
"""Processes the entire dataset and creates train/test JSONL files."""
"""Processes the entire dataset and creates stratified interleaved train/test JSONL files."""
all_entries = []
for subdir in os.listdir(BASE_FOLDER):
subdir_path = os.path.join(BASE_FOLDER, subdir)
Expand All @@ -151,6 +245,7 @@ def process_dataset():
entries = process_video_file(csv_path, video_file_path)
all_entries.extend(entries)

split_and_save_annotations(
# Use the interleaved function to split and save annotations
stratified_interleaved_split_and_save_annotations(
all_entries, TRAIN_JSONL_PATH, TEST_JSONL_PATH, TRAIN_SPLIT_RATIO)
print("Conversion complete. Training and testing datasets created.")
print("Conversion complete. Stratified interleaved training and testing datasets created.")

0 comments on commit 6e29ed8

Please sign in to comment.