Skip to content

Commit

Permalink
Add a ValidVIAtracksCSV class (2/4) (#219)
Browse files Browse the repository at this point in the history
* Add skeleton for ValidVIAtracksCSV class

* Add skeleton for ValidVIAtracksCSV test

* Draft VIA file validator

* Change asserts to errors (WIP)

* Remove 1-based integer checks (for track ID and frames). Replace assert by errors

* Small edits

* Add tests for VIA file (pending fixtures)

* Add one fixture

* Add frame number as invalid file attribute

* Factor out valid header fixture

* Add test for frame number wrongly encoded in the filename

* Add unique frame numbers test. Check bbox shape.

* Add test for region attribute not defined

* Add test for track ID not castable as an integer

* Add test for unique track IDs per frame

* Small edits to comments and docstrings

* Apply suggestions from code review

Co-authored-by: Niko Sirmpilatze <niko.sirbiladze@gmail.com>

* Fix test duplicate from rebase

* Rename symbols

* csv to .csv

* Small edits to comments

---------

Co-authored-by: Niko Sirmpilatze <niko.sirbiladze@gmail.com>
  • Loading branch information
sfmig and niksirbi authored Jul 22, 2024
1 parent 4830fd5 commit 242f532
Show file tree
Hide file tree
Showing 3 changed files with 565 additions and 1 deletion.
211 changes: 211 additions & 0 deletions movement/validators/files.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
"""``attrs`` classes for validating file paths."""

import ast
import os
import re
from pathlib import Path
from typing import Literal

import h5py
import pandas as pd
from attrs import define, field, validators

from movement.utils.logging import log_error
Expand Down Expand Up @@ -198,3 +201,211 @@ def _csv_file_contains_expected_levels(self, attribute, value):
".csv header rows do not match the known format for "
"DeepLabCut pose estimation output files.",
)


@define
class ValidVIATracksCSV:
    """Class for validating VIA tracks .csv files.

    The validators attached to ``path`` check, in turn: the header row,
    the frame numbers, the bounding box definitions and the uniqueness
    of track IDs within each image file.

    Parameters
    ----------
    path : pathlib.Path
        Path to the VIA tracks .csv file. The ``instance_of`` validator
        only accepts ``pathlib.Path`` objects.

    Raises
    ------
    TypeError
        If ``path`` is not a ``pathlib.Path`` (raised by the ``attrs``
        ``instance_of`` validator).
    ValueError
        If the .csv file does not match the VIA tracks file requirements.

    """

    path: Path = field(validator=validators.instance_of(Path))

    @staticmethod
    def _parse_dict_cell(cell, column, filename, row_index):
        """Parse a .csv cell that should hold a dictionary literal.

        Parameters
        ----------
        cell : object
            Raw cell value as read by ``pandas`` (normally a string).
        column : str
            Name of the column the cell belongs to (for the error message).
        filename : object
            Value of the ``filename`` column for the same row.
        row_index : int
            Index of the row in the dataframe.

        Returns
        -------
        dict
            The parsed dictionary.

        Raises
        ------
        ValueError
            If the cell cannot be parsed as a Python literal, or if the
            parsed literal is not a dictionary.

        """
        error_msg = (
            f"{filename} (row {row_index}): "
            f"'{column}' could not be parsed as a dictionary. "
            "Please review the VIA tracks .csv file."
        )
        try:
            parsed = ast.literal_eval(cell)
        except (ValueError, TypeError, SyntaxError, RecursionError) as e:
            # Empty cells are read as NaN (a float) by pandas and also
            # end up here, since ``literal_eval`` rejects them.
            raise log_error(ValueError, error_msg) from e

        if not isinstance(parsed, dict):
            raise log_error(ValueError, error_msg)
        return parsed

    @path.validator
    def csv_file_contains_valid_header(self, attribute, value):
        """Ensure the VIA tracks .csv file contains the expected header.

        Raises
        ------
        ValueError
            If the first line of the file, split on commas, differs from
            the expected list of column names.

        """
        expected_header = [
            "filename",
            "file_size",
            "file_attributes",
            "region_count",
            "region_id",
            "region_shape_attributes",
            "region_attributes",
        ]

        with open(value) as f:
            header = f.readline().strip("\n").split(",")

        if header != expected_header:
            raise log_error(
                ValueError,
                ".csv header row does not match the known format for "
                "VIA tracks output files. "
                f"Expected {expected_header} but got {header}.",
            )

    @path.validator
    def csv_file_contains_valid_frame_numbers(self, attribute, value):
        """Ensure that the VIA tracks .csv file contains valid frame numbers.

        This involves:

        - Checking that frame numbers are included in `file_attributes` or
          encoded in the image file `filename`.
        - Checking the frame number can be cast as an integer.
        - Checking that there are as many unique frame numbers as unique
          image files.

        If the frame number is included as part of the image file name, it
        is expected as an integer led by at least one zero, between "_" and
        ".", followed by the file extension.

        Raises
        ------
        ValueError
            If any of the checks above fails, or if a `file_attributes`
            cell cannot be parsed as a dictionary.

        """
        df = pd.read_csv(value, sep=",", header=0)
        filenames = df["filename"].tolist()

        # Extract list of file attributes (dicts), one per row
        file_attributes_dicts = [
            self._parse_dict_cell(
                cell, "file_attributes", filenames[row_i], row_i
            )
            for row_i, cell in enumerate(df["file_attributes"])
        ]

        list_frame_numbers = []
        if all("frame" in d for d in file_attributes_dicts):
            # 'frame' is a file attribute for all rows: read it from there
            for row_i, file_attrs in enumerate(file_attributes_dicts):
                try:
                    list_frame_numbers.append(int(file_attrs["frame"]))
                except (TypeError, ValueError, OverflowError) as e:
                    raise log_error(
                        ValueError,
                        f"{filenames[row_i]} (row {row_i}): "
                        "'frame' file attribute cannot be cast as an "
                        "integer. "
                        f"Please review the file attributes: {file_attrs}.",
                    ) from e
        else:
            # Otherwise: extract the frame number from the filename
            pattern = re.compile(r"_(0\d*)\.\w+$")

            for row_i, filename in enumerate(filenames):
                # A non-string filename (e.g. an empty cell read as NaN)
                # cannot match and is reported like any other mismatch.
                regex_match = (
                    pattern.search(filename)
                    if isinstance(filename, str)
                    else None
                )
                if regex_match is None:
                    raise log_error(
                        ValueError,
                        f"{filename} (row {row_i}): "
                        "a frame number could not be extracted from the "
                        "filename. If included in the filename, the frame "
                        "number is expected as a zero-padded integer between "
                        "an underscore '_' and the file extension "
                        "(e.g. img_00234.png).",
                    )
                # the captured group only contains digits, so it is
                # always castable as an integer
                list_frame_numbers.append(int(regex_match.group(1)))

        # Check we have as many unique frame numbers as unique image files.
        # NOTE(review): this compares counts only; it does not verify that
        # every row of a given file carries the same frame number.
        if len(set(list_frame_numbers)) != len(df["filename"].unique()):
            raise log_error(
                ValueError,
                "The number of unique frame numbers does not match the number "
                "of unique image files. Please review the VIA tracks .csv "
                "file and ensure a unique frame number is defined for each "
                "file. ",
            )

    @path.validator
    def csv_file_contains_tracked_bboxes(self, attribute, value):
        """Ensure that the VIA tracks .csv contains tracked bounding boxes.

        This involves:

        - Checking that the bounding boxes are defined as rectangles.
        - Checking that the bounding boxes have all geometric parameters
          (["x", "y", "width", "height"]).
        - Checking that the bounding boxes have a track ID defined.
        - Checking that the track ID can be cast as an integer.

        Raises
        ------
        ValueError
            If any of the checks above fails for any row, or if a
            `region_shape_attributes` / `region_attributes` cell cannot be
            parsed as a dictionary.

        """
        df = pd.read_csv(value, sep=",", header=0)
        required_shape_keys = ("x", "y", "width", "height")

        for row in df.itertuples():
            row_region_shape_attrs = self._parse_dict_cell(
                row.region_shape_attributes,
                "region_shape_attributes",
                row.filename,
                row.Index,
            )
            row_region_attrs = self._parse_dict_cell(
                row.region_attributes,
                "region_attributes",
                row.filename,
                row.Index,
            )

            # check annotation is a rectangle; a missing 'name' key is
            # reported as shape 'None' rather than raising a KeyError
            shape_name = row_region_shape_attrs.get("name")
            if shape_name != "rect":
                raise log_error(
                    ValueError,
                    f"{row.filename} (row {row.Index}): "
                    "bounding box shape must be 'rect' (rectangular) "
                    "but instead got "
                    f"'{shape_name}'.",
                )

            # check all geometric parameters for the box are defined
            if not all(
                key in row_region_shape_attrs for key in required_shape_keys
            ):
                raise log_error(
                    ValueError,
                    f"{row.filename} (row {row.Index}): "
                    f"at least one bounding box shape parameter is missing. "
                    "Expected 'x', 'y', 'width', 'height' to exist as "
                    "'region_shape_attributes', but got "
                    f"'{list(row_region_shape_attrs.keys())}'.",
                )

            # check track ID is defined
            if "track" not in row_region_attrs:
                raise log_error(
                    ValueError,
                    f"{row.filename} (row {row.Index}): "
                    "bounding box does not have a 'track' attribute defined "
                    "under 'region_attributes'. "
                    "Please review the VIA tracks .csv file.",
                )

            # check track ID is castable as an integer
            try:
                int(row_region_attrs["track"])
            except (TypeError, ValueError, OverflowError) as e:
                raise log_error(
                    ValueError,
                    f"{row.filename} (row {row.Index}): "
                    "the track ID for the bounding box cannot be cast "
                    "as an integer. Please review the VIA tracks .csv file.",
                ) from e

    @path.validator
    def csv_file_contains_unique_track_IDs_per_filename(
        self, attribute, value
    ):
        """Ensure the VIA tracks .csv contains unique track IDs per filename.

        It checks that bounding boxes IDs are defined once per image file.

        Raises
        ------
        ValueError
            If two or more rows of the same image file share a track ID.

        """
        df = pd.read_csv(value, sep=",", header=0)

        # Group rows by image file in order of first appearance, so the
        # first offending file reported is deterministic.
        for file, df_one_filename in df.groupby("filename", sort=False):
            # This validator is defined after
            # ``csv_file_contains_tracked_bboxes``, which checks that
            # every row has an integer-castable 'track' attribute.
            list_track_IDs_one_filename = [
                int(ast.literal_eval(cell)["track"])
                for cell in df_one_filename["region_attributes"]
            ]

            if len(set(list_track_IDs_one_filename)) != len(
                list_track_IDs_one_filename
            ):
                raise log_error(
                    ValueError,
                    f"{file}: "
                    "multiple bounding boxes in this file "
                    "have the same track ID. "
                    "Please review the VIA tracks .csv file.",
                )
Loading

0 comments on commit 242f532

Please sign in to comment.