neuroinformatics-unit · sfmig · Jul 22, 2024 · Jun 13, 2024 · Jun 13, 2024 · Jun 13, 2024
diff --git a/movement/validators/files.py b/movement/validators/files.py
@@ -1,10 +1,13 @@
 """``attrs`` classes for validating file paths."""
 
+import ast
 import os
+import re
 from pathlib import Path
 from typing import Literal
 
 import h5py
+import pandas as pd
 from attrs import define, field, validators
 
 from movement.utils.logging import log_error
@@ -198,3 +201,247 @@ def _csv_file_contains_expected_levels(self, attribute, value):
                     ".csv header rows do not match the known format for "
                     "DeepLabCut pose estimation output files.",
                 )
+
+
+@define
+class ValidVIATracksCSV:
+    """Class for validating VIA tracks .csv files.
+
+    Parameters
+    ----------
+    path : pathlib.Path
+        Path to the VIA tracks .csv file. It must be a ``pathlib.Path``
+        object, as enforced by the ``instance_of`` validator.
+
+    Raises
+    ------
+    ValueError
+        If the .csv file does not match the VIA tracks file requirements.
+
+    """
+
+    path: Path = field(validator=validators.instance_of(Path))
+
+    @staticmethod
+    def _parse_dict_cell(cell, column, filename, row_idx):
+        """Parse a .csv cell that is expected to hold a dictionary literal.
+
+        A ValueError is raised if the cell cannot be parsed or if the
+        parsed value is not a dictionary.
+        """
+        try:
+            parsed = ast.literal_eval(cell)
+        except (SyntaxError, TypeError, ValueError):
+            parsed = None  # reported below as a malformed cell
+        if not isinstance(parsed, dict):
+            raise log_error(
+                ValueError,
+                f"{filename} (row {row_idx}): "
+                f"'{column}' could not be parsed as a dictionary. "
+                "Please review the VIA tracks .csv file.",
+            )
+        return parsed
+
+    @path.validator
+    def csv_file_contains_valid_header(self, attribute, value):
+        """Ensure the VIA tracks .csv file contains the expected header."""
+        expected_header = [
+            "filename",
+            "file_size",
+            "file_attributes",
+            "region_count",
+            "region_id",
+            "region_shape_attributes",
+            "region_attributes",
+        ]
+
+        # Only the first line is needed to check the column names
+        with open(value) as f:
+            header = f.readline().strip("\n").split(",")
+
+        if header != expected_header:
+            raise log_error(
+                ValueError,
+                ".csv header row does not match the known format for "
+                "VIA tracks output files. "
+                f"Expected {expected_header} but got {header}.",
+            )
+
+    @path.validator
+    def csv_file_contains_valid_frame_numbers(self, attribute, value):
+        """Ensure that the VIA tracks .csv file contains valid frame numbers.
+
+        This involves:
+        - Checking that frame numbers are included in `file_attributes` or
+          encoded in the image file `filename`.
+        - Checking the frame number can be cast as an integer.
+        - Checking that there are as many unique frame numbers as unique image
+          files, and that each image file maps to a single frame number.
+
+        If the frame number is included as part of the image file name, it is
+        expected as an integer led by at least one zero, between "_" and ".",
+        followed by the file extension.
+        """
+        df = pd.read_csv(value, sep=",", header=0)
+
+        # Extract list of file attributes (dicts)
+        file_attributes_dicts = [
+            self._parse_dict_cell(cell, "file_attributes", filename, row_idx)
+            for row_idx, (filename, cell) in enumerate(
+                zip(df.filename, df.file_attributes)
+            )
+        ]
+
+        # If 'frame' is a file_attribute for all files:
+        # extract frame number
+        list_frame_numbers = []
+        if all("frame" in d for d in file_attributes_dicts):
+            for k_i, k in enumerate(file_attributes_dicts):
+                try:
+                    list_frame_numbers.append(int(k["frame"]))
+                except (OverflowError, TypeError, ValueError) as e:
+                    raise log_error(
+                        ValueError,
+                        f"{df.filename.iloc[k_i]} (row {k_i}): "
+                        "'frame' file attribute cannot be cast as an integer. "
+                        f"Please review the file attributes: {k}.",
+                    ) from e
+
+        # else: extract frame number from filename.
+        else:
+            pattern = re.compile(r"_(0\d*)\.\w+$")
+
+            for f_i, f in enumerate(df["filename"]):
+                regex_match = pattern.search(f)
+                if regex_match is None:
+                    raise log_error(
+                        ValueError,
+                        f"{f} (row {f_i}): "
+                        "a frame number could not be extracted from the "
+                        "filename. If included in the filename, the frame "
+                        "number is expected as a zero-padded integer between "
+                        "an underscore '_' and the file extension "
+                        "(e.g. img_00234.png).",
+                    )
+                # the matched digits are always castable as an integer
+                list_frame_numbers.append(int(regex_match.group(1)))
+
+        # Check we have as many unique frame numbers as unique image files
+        n_unique_files = len(df.filename.unique())
+        if len(set(list_frame_numbers)) != n_unique_files:
+            raise log_error(
+                ValueError,
+                "The number of unique frame numbers does not match the number "
+                "of unique image files. Please review the VIA tracks .csv "
+                "file and ensure a unique frame number is defined for each "
+                "file. ",
+            )
+
+        # Equal counts do not guarantee a one-to-one mapping: only when
+        # there are as many (file, frame) pairs as files is it one-to-one
+        n_pairs = len(set(zip(df.filename, list_frame_numbers)))
+        if n_pairs != n_unique_files:
+            raise log_error(
+                ValueError,
+                "At least one image file is assigned more than one frame "
+                "number. Please review the VIA tracks .csv file and ensure "
+                "a unique frame number is defined for each file.",
+            )
+
+    @path.validator
+    def csv_file_contains_tracked_bboxes(self, attribute, value):
+        """Ensure that the VIA tracks .csv contains tracked bounding boxes.
+
+        This involves:
+        - Checking that the bounding boxes are defined as rectangles.
+        - Checking that the bounding boxes have all geometric parameters
+          (["x", "y", "width", "height"]).
+        - Checking that the bounding boxes have a track ID defined.
+        - Checking that the track ID can be cast as an integer.
+        """
+        df = pd.read_csv(value, sep=",", header=0)
+
+        for row in df.itertuples():
+            row_region_shape_attrs = self._parse_dict_cell(
+                row.region_shape_attributes,
+                "region_shape_attributes",
+                row.filename,
+                row.Index,
+            )
+            row_region_attrs = self._parse_dict_cell(
+                row.region_attributes,
+                "region_attributes",
+                row.filename,
+                row.Index,
+            )
+
+            # check annotation is a rectangle
+            # (a missing 'name' key is reported as shape 'None')
+            shape_name = row_region_shape_attrs.get("name")
+            if shape_name != "rect":
+                raise log_error(
+                    ValueError,
+                    f"{row.filename} (row {row.Index}): "
+                    "bounding box shape must be 'rect' (rectangular) "
+                    f"but instead got '{shape_name}'.",
+                )
+
+            # check all geometric parameters for the box are defined
+            if any(
+                key not in row_region_shape_attrs
+                for key in ("x", "y", "width", "height")
+            ):
+                raise log_error(
+                    ValueError,
+                    f"{row.filename} (row {row.Index}): "
+                    "at least one bounding box shape parameter is missing. "
+                    "Expected 'x', 'y', 'width', 'height' to exist as "
+                    "'region_shape_attributes', but got "
+                    f"'{list(row_region_shape_attrs.keys())}'.",
+                )
+
+            # check track ID is defined
+            if "track" not in row_region_attrs:
+                raise log_error(
+                    ValueError,
+                    f"{row.filename} (row {row.Index}): "
+                    "bounding box does not have a 'track' attribute defined "
+                    "under 'region_attributes'. "
+                    "Please review the VIA tracks .csv file.",
+                )
+
+            # check track ID is castable as an integer
+            try:
+                int(row_region_attrs["track"])
+            except (OverflowError, TypeError, ValueError) as e:
+                raise log_error(
+                    ValueError,
+                    f"{row.filename} (row {row.Index}): "
+                    "the track ID for the bounding box cannot be cast "
+                    "as an integer. Please review the VIA tracks .csv file.",
+                ) from e
+
+    @path.validator
+    def csv_file_contains_unique_track_IDs_per_filename(
+        self, attribute, value
+    ):
+        """Ensure the VIA tracks .csv contains unique track IDs per filename.
+
+        It checks that bounding boxes IDs are defined once per image file.
+        """
+        df = pd.read_csv(value, sep=",", header=0)
+
+        # single pass in row order, so the first duplicate found is reported
+        seen_file_track_pairs = set()
+        for row in df.itertuples():
+            track_id = int(ast.literal_eval(row.region_attributes)["track"])
+            file_track_pair = (row.filename, track_id)
+            if file_track_pair in seen_file_track_pairs:
+                raise log_error(
+                    ValueError,
+                    f"{row.filename}: "
+                    "multiple bounding boxes in this file "
+                    "have the same track ID. "
+                    "Please review the VIA tracks .csv file.",
+                )
+            seen_file_track_pairs.add(file_track_pair)