This repository has been archived by the owner on Oct 9, 2023. It is now read-only.

from_tensors support for VideoClassification #1389

Merged
merged 48 commits
Sep 1, 2022
Changes from 32 commits
Commits
48 commits
bef8e50
WIP: from_tensors support
krshrimali Jul 14, 2022
b29cdb2
remove unused func in tests
krshrimali Jul 14, 2022
6d1a0be
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 14, 2022
2f8bb7e
Remove doc, add LabeledVideoTensorDataset
krshrimali Jul 14, 2022
9490e31
Fix merge conflict
krshrimali Jul 14, 2022
ce882a1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 14, 2022
4031f5c
class for prediction
krshrimali Jul 14, 2022
22f049d
Fixes for predictions
krshrimali Jul 14, 2022
951ae93
Merge branch 'master' into video/feature/classification/from_tensors
krshrimali Jul 14, 2022
c36dbbe
minor... to fix the CI
krshrimali Jul 14, 2022
23002f2
Merge branch 'video/feature/classification/from_tensors' of github.co…
krshrimali Jul 14, 2022
d210bd8
remove make_tensor, use randint (compatible with older pytorch versions)
krshrimali Jul 15, 2022
242ca8b
Merge branch 'master' into video/feature/classification/from_tensors
krshrimali Jul 15, 2022
738a022
Separate tests for data loading for tensors
krshrimali Jul 15, 2022
465fb2f
Separate tests for data loading for tensors
krshrimali Jul 15, 2022
b235e7d
Merge branch 'video/feature/classification/from_tensors' of github.co…
krshrimali Jul 15, 2022
5da11a4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 15, 2022
80ec1eb
Skip doctest if not video installed
krshrimali Jul 15, 2022
6617097
Fix tests
krshrimali Jul 15, 2022
22edc4e
skip if pytorchvideo not installed
krshrimali Jul 15, 2022
df37dd1
correct format in the doctest
krshrimali Jul 15, 2022
f864a3b
Merge branch 'master' into video/feature/classification/from_tensors
krshrimali Jul 21, 2022
76410e0
Add labels to the call; prediction test
krshrimali Aug 29, 2022
6f6de3a
Pass labels, add prediction test
krshrimali Aug 29, 2022
6827164
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 29, 2022
1f61192
Fix doc
krshrimali Aug 29, 2022
abd7f22
Merge branch 'video/feature/classification/from_tensors' of github.co…
krshrimali Aug 29, 2022
92df464
Update tests/video/classification/test_model.py
krshrimali Aug 30, 2022
3e887f3
Update flash/video/classification/utils.py
krshrimali Aug 30, 2022
a006bb6
Address review
krshrimali Aug 30, 2022
2e65edc
Merge branch 'video/feature/classification/from_tensors' of github.co…
krshrimali Aug 30, 2022
3ef8fbd
pep8
krshrimali Aug 30, 2022
6fea612
Update flash/video/classification/utils.py
krshrimali Aug 30, 2022
d71ce41
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 30, 2022
1c9e3f8
Address review: allow stack of tensors, tensor, list of tensors, matc…
krshrimali Aug 31, 2022
b81803b
Remove breakpoints
krshrimali Aug 31, 2022
13df5af
Fix doctests
krshrimali Aug 31, 2022
da2fe56
Fix doctest
krshrimali Aug 31, 2022
2381f04
Revert pre-commit change
krshrimali Aug 31, 2022
f95ef7d
Merge branch 'master' into video/feature/classification/from_tensors
krshrimali Aug 31, 2022
ecc5528
Add license, improve tests - use parametrize, refactor
krshrimali Sep 1, 2022
73613f3
Merge branch 'video/feature/classification/from_tensors' of github.co…
krshrimali Sep 1, 2022
2d69b40
Fix error for video not available
krshrimali Sep 1, 2022
7009f86
unused import
krshrimali Sep 1, 2022
62e67f5
Add check for video available or not
krshrimali Sep 1, 2022
e5b2350
If not video available, return tensors from randint
krshrimali Sep 1, 2022
3e72e1a
mock_video_tensors is removed now
krshrimali Sep 1, 2022
1cf3d75
Use _is_list_like instead of isinstance for list/tuple
krshrimali Sep 1, 2022
130 changes: 129 additions & 1 deletion flash/video/classification/data.py
@@ -41,6 +41,8 @@
VideoClassificationFilesInput,
VideoClassificationFoldersInput,
VideoClassificationPathsPredictInput,
VideoClassificationTensorsInput,
VideoClassificationTensorsPredictInput,
)
from flash.video.classification.input_transform import VideoClassificationInputTransform

@@ -63,6 +65,7 @@
"VideoClassificationData.from_folders",
"VideoClassificationData.from_data_frame",
"VideoClassificationData.from_csv",
"VideoClassificationData.from_tensors",
]
if not _VIDEO_EXTRAS_TESTING:
__doctest_skip__ += ["VideoClassificationData.from_fiftyone"]
@@ -395,7 +398,6 @@ def from_data_frame(
predict_data_frame: Optional[pd.DataFrame] = None,
predict_videos_root: Optional[str] = None,
predict_resolver: Optional[Callable[[str, str], str]] = None,
target_formatter: Optional[TargetFormatter] = None,
clip_sampler: Union[str, "ClipSampler"] = "random",
clip_duration: float = 2,
clip_sampler_kwargs: Dict[str, Any] = None,
@@ -404,6 +406,7 @@
decoder: str = "pyav",
input_cls: Type[Input] = VideoClassificationDataFrameInput,
predict_input_cls: Type[Input] = VideoClassificationDataFramePredictInput,
target_formatter: Optional[TargetFormatter] = None,
transform: INPUT_TRANSFORM_TYPE = VideoClassificationInputTransform,
transform_kwargs: Optional[Dict] = None,
**data_module_kwargs: Any,
@@ -566,6 +569,131 @@ def from_data_frame(
**data_module_kwargs,
)

@classmethod
def from_tensors(
cls,
input_field: str,
target_field: Optional[Union[str, Sequence[str]]] = None,
train_data: Optional[Dict[str, Union[torch.Tensor, Any, List[Any]]]] = None,
val_data: Optional[Dict[str, Union[torch.Tensor, Any, List[Any]]]] = None,
test_data: Optional[Dict[str, Union[torch.Tensor, Any, List[Any]]]] = None,
predict_data: Optional[Dict[str, Union[torch.Tensor, Any, List[Any]]]] = None,
video_sampler: Type[Sampler] = torch.utils.data.SequentialSampler,
input_cls: Type[Input] = VideoClassificationTensorsInput,
predict_input_cls: Type[Input] = VideoClassificationTensorsPredictInput,
target_formatter: Optional[TargetFormatter] = None,
transform: INPUT_TRANSFORM_TYPE = VideoClassificationInputTransform,
transform_kwargs: Optional[Dict] = None,
**data_module_kwargs: Any,
) -> "VideoClassificationData":
"""Load the :class:`~flash.video.classification.data.VideoClassificationData` from a dictionary containing
PyTorch tensors representing input video frames and their corresponding targets.

Input tensor(s) will be extracted from the ``input_field`` in the ``dict``.
The targets will be extracted from the ``target_field`` in the ``dict`` and can be in any of our
:ref:`supported classification target formats <formatting_classification_targets>`.

To learn how to customize the transforms applied for each stage, read our
:ref:`customizing transforms guide <customizing_transforms>`.

Args:
input_field: The field (key name) in ``dict`` containing the video tensors.
target_field: The field (key name) in the ``dict`` containing the targets.
train_data: The ``dict`` containing tensors in ``input_field`` key and targets in
``target_field`` key to use when training.
val_data: The ``dict`` containing tensors in ``input_field`` key and targets in
``target_field`` key to use when validating.
test_data: The ``dict`` containing tensors in ``input_field`` key and targets in
``target_field`` key to use when testing.
predict_data: The ``dict`` containing tensors in ``input_field`` key to use when predicting.
target_formatter: Optionally provide a :class:`~flash.core.data.utilities.classification.TargetFormatter` to
control how targets are handled. See :ref:`formatting_classification_targets` for more details.
video_sampler: Sampler for the internal video container. This defines the order tensors are used and,
if necessary, the distributed split.
input_cls: The :class:`~flash.core.data.io.input.Input` type to use for loading the data.
predict_input_cls: The :class:`~flash.core.data.io.input.Input` type to use for loading the prediction data.
transform: The :class:`~flash.core.data.io.input_transform.InputTransform` type to use.
transform_kwargs: Dict of keyword arguments to be provided when instantiating the transforms.
data_module_kwargs: Additional keyword arguments to provide to the
:class:`~flash.core.data.data_module.DataModule` constructor.

Returns:
The constructed :class:`~flash.video.classification.data.VideoClassificationData`.

Examples
________

.. doctest::

>>> import torch
>>> from flash import Trainer
>>> from flash.video import VideoClassifier, VideoClassificationData
>>> input_video = torch.randint(low=0, high=255, size=(3, 10, 10, 10), dtype=torch.uint8, device="cpu")
>>> train_data = {
... "data": torch.stack(
... (
... input_video,
... input_video,
... )
... ), # 2 videos (each video: 10 frames)
... "targets": ["fruit", "vegetable"], # Labels corresponding to each video
... }
>>> predict_data = {
... "data": torch.stack((input_video,)),
... }
>>> datamodule = VideoClassificationData.from_tensors(
... input_field="data",
... target_field="targets",
... train_data=train_data,
... predict_data=predict_data,
... batch_size=1,
... )
Collaborator (inline review comment):

I think it would be better to make the API consistent with what we have for image classification. E.g. like this:

            >>> datamodule = VideoClassificationData.from_tensors(
            ...     train_data=[input_video, input_video, input_video],
            ...     train_targets=[1, 2, 3],
            ...     predict_data=predict_data,
            ...     batch_size=1,
            ... )

>>> datamodule.num_classes
2
>>> datamodule.labels
['fruit', 'vegetable']
>>> model = VideoClassifier(backbone="x3d_xs", num_classes=datamodule.num_classes)
>>> trainer = Trainer(fast_dev_run=True)
>>> trainer.fit(model, datamodule=datamodule) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
Training...
>>> trainer.predict(model, datamodule=datamodule) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
Predicting...

.. testcleanup::

>>> del input_video
>>> del train_data
>>> del predict_data
"""
train_tuple = (train_data, input_field, target_field)
val_tuple = (val_data, input_field, target_field)
test_tuple = (test_data, input_field, target_field)
predict_tuple = (predict_data, input_field)

train_input = input_cls(
RunningStage.TRAINING, *train_tuple, video_sampler=video_sampler, target_formatter=target_formatter
)
target_formatter = getattr(train_input, "target_formatter", None)

return cls(
train_input,
input_cls(
RunningStage.VALIDATING,
*val_tuple,
video_sampler=video_sampler,
target_formatter=target_formatter,
),
input_cls(
RunningStage.TESTING,
*test_tuple,
video_sampler=video_sampler,
target_formatter=target_formatter,
),
predict_input_cls(RunningStage.PREDICTING, *predict_tuple),
transform=transform,
transform_kwargs=transform_kwargs,
**data_module_kwargs,
)

@classmethod
def from_csv(
cls,
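A minimal sketch, assuming the same key names and tensor shapes as the doctest above, of how ``val_data`` and ``test_data`` dictionaries (parameters visible in the signature above) could be supplied alongside the training and prediction data; this is an illustration, not part of the diff.

    import torch

    from flash.video import VideoClassificationData

    video = torch.randint(low=0, high=255, size=(3, 10, 10, 10), dtype=torch.uint8)  # (C, T, H, W)

    datamodule = VideoClassificationData.from_tensors(
        input_field="data",
        target_field="targets",
        train_data={"data": torch.stack((video, video)), "targets": ["fruit", "vegetable"]},
        val_data={"data": torch.stack((video,)), "targets": ["fruit"]},
        test_data={"data": torch.stack((video,)), "targets": ["vegetable"]},
        predict_data={"data": torch.stack((video,))},
        batch_size=1,
    )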
78 changes: 77 additions & 1 deletion flash/video/classification/input.py
@@ -40,8 +40,17 @@
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.data.labeled_video_dataset import LabeledVideoDataset
from pytorchvideo.data.labeled_video_paths import LabeledVideoPaths

from flash.video.classification.utils import LabeledVideoTensorDataset

else:
ClipSampler, LabeledVideoDataset, EncodedVideo, ApplyTransformToKey = None, None, None, None
ClipSampler, LabeledVideoDataset, LabeledVideoTensorDataset, EncodedVideo, ApplyTransformToKey = (
None,
None,
None,
None,
None,
)


def _make_clip_sampler(
@@ -87,6 +96,29 @@ def load_sample(self, sample):
return sample


class VideoClassificationTensorsBaseInput(IterableInput, ClassificationInputMixin):
def load_data(
self,
inputs: torch.Tensor,
targets: Union[List[Any], Any],
video_sampler: Type[Sampler] = torch.utils.data.RandomSampler,
target_formatter: Optional[TargetFormatter] = None,
) -> "LabeledVideoTensorDataset":
# Note: ``zip`` truncates to the shorter of inputs and targets, so extra entries in either are dropped
dataset = LabeledVideoTensorDataset(list(zip(inputs, targets)), video_sampler=video_sampler)
if not self.predicting:
self.load_target_metadata(
[sample[1] for sample in dataset._labeled_videos], target_formatter=target_formatter
)
return dataset

def load_sample(self, sample):
sample["label"] = self.format_target(sample["label"])
sample[DataKeys.INPUT] = sample.pop("video")
sample[DataKeys.TARGET] = sample.pop("label")
return sample


class VideoClassificationFoldersInput(VideoClassificationInput):
def load_data(
self,
@@ -178,6 +210,35 @@ def load_data(
return result


class VideoClassificationTensorsInput(VideoClassificationTensorsBaseInput):
labels: list

def load_data(
self,
input_data: Dict[str, Union[torch.Tensor, Any, List[Any]]],
input_key: str,
target_keys: Union[str, List[str]],
video_sampler: Type[Sampler] = torch.utils.data.RandomSampler,
target_formatter: Optional[TargetFormatter] = None,
) -> "LabeledVideoTensorDataset":
result = super().load_data(
input_data[input_key],
input_data[target_keys], # TODO: @krshrimali: this does not support list of str as of now
video_sampler=video_sampler,
target_formatter=target_formatter,
)

# If we had binary multi-class targets then we also know the labels (column names)
if (
self.training
and isinstance(self.target_formatter, MultiBinaryTargetFormatter)
and isinstance(target_keys, List)
):
self.labels = target_keys

return result


class VideoClassificationCSVInput(VideoClassificationDataFrameInput):
def load_data(
self,
@@ -316,6 +377,21 @@ def predict_load_data(
)


class VideoClassificationTensorsPredictInput(Input):
def predict_load_data(
self,
data: Dict[str, Union[torch.Tensor, List[Any], Any]],
data_key: str,
):
return list(data[data_key])

def predict_load_sample(self, sample: torch.Tensor) -> Dict[str, Any]:
return {
DataKeys.INPUT: sample,
"video_index": 0,
}


class VideoClassificationCSVPredictInput(VideoClassificationDataFramePredictInput):
def predict_load_data(
self,
106 changes: 106 additions & 0 deletions flash/video/classification/utils.py
@@ -0,0 +1,106 @@
from typing import List, Optional, Tuple, Type

import torch

from flash.core.utilities.imports import _VIDEO_AVAILABLE

if _VIDEO_AVAILABLE:
from pytorchvideo.data.utils import MultiProcessSampler
else:
MultiProcessSampler = None


class LabeledVideoTensorDataset(torch.utils.data.IterableDataset):
"""LabeledVideoTensorDataset handles a direct tensor input data."""

def __init__(
self,
labeled_video_tensors: List[Tuple[str, Optional[dict]]],
video_sampler: Type[torch.utils.data.Sampler] = torch.utils.data.RandomSampler,
) -> None:
self._labeled_videos = labeled_video_tensors

# If a RandomSampler is used we need to pass in a custom random generator that
# ensures all PyTorch multiprocess workers have the same random seed.
self._video_random_generator = None
if video_sampler == torch.utils.data.RandomSampler:
self._video_random_generator = torch.Generator()
self._video_sampler = video_sampler(self._labeled_videos, generator=self._video_random_generator)
else:
self._video_sampler = video_sampler(self._labeled_videos)

self._video_sampler_iter = None # Initialized on first call to self.__next__()

# Depending on the clip sampler type, we may want to sample multiple clips
# from one video. In that case, we store the video, label, and previously sampled
# clip time in these variables.
self._loaded_video_label = None

@property
def video_sampler(self):
"""
Returns:
The video sampler that defines video sample order. Note that you'll need to
use this property to set the epoch for a torch.utils.data.DistributedSampler.
"""
return self._video_sampler

@property
def num_videos(self):
"""
Returns:
Number of videos in dataset.
"""
return len(self.video_sampler)

def __next__(self) -> dict:
"""Retrieves the next clip based on the clip sampling strategy and video sampler.

Returns:
A dictionary with the following format.

.. code-block:: text

{
    'video': <video_tensor>,
    'label': <index_label>,
    'video_label': <index_label>,
    'video_index': <video_index>,
    'video_name': <video_name>,
}
"""
if not self._video_sampler_iter:
# Setup MultiProcessSampler here - after PyTorch DataLoader workers are spawned.
self._video_sampler_iter = iter(MultiProcessSampler(self._video_sampler))

# Fetch the next video tensor and its label according to the video sampler.
video_index = next(self._video_sampler_iter)
video_tensor, info_dict = self._labeled_videos[video_index]
self._loaded_video_label = (video_tensor, info_dict, video_index)

sample_dict = {
"video": self._loaded_video_label[0],
"video_name": f"video{video_index}",
"video_index": video_index,
"label": info_dict,
"video_label": info_dict,
}

return sample_dict

def __iter__(self):
self._video_sampler_iter = None # Reset video sampler

# If we're in a PyTorch DataLoader multiprocessing context, we need to use the
# same seed for each worker's RandomSampler generator. The workers at each
# __iter__ call are created from the unique value: worker_info.seed - worker_info.id,
# which we can use for this seed.
worker_info = torch.utils.data.get_worker_info()
if self._video_random_generator is not None and worker_info is not None:
base_seed = worker_info.seed - worker_info.id
self._video_random_generator.manual_seed(base_seed)

return self

def size(self):
return len(self._labeled_videos)
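A minimal sketch of driving ``LabeledVideoTensorDataset`` directly, assuming the video extras (pytorchvideo) are installed so that ``MultiProcessSampler`` is importable; in the actual pipeline the ``Input`` classes above construct this dataset via ``load_data`` and rename the keys in ``load_sample``.

    import torch

    from flash.video.classification.utils import LabeledVideoTensorDataset

    # Two fake videos of shape (channels, frames, height, width) paired with labels.
    video_a = torch.randint(low=0, high=255, size=(3, 10, 10, 10), dtype=torch.uint8)
    video_b = torch.randint(low=0, high=255, size=(3, 10, 10, 10), dtype=torch.uint8)

    dataset = LabeledVideoTensorDataset(
        [(video_a, "fruit"), (video_b, "vegetable")],
        video_sampler=torch.utils.data.SequentialSampler,
    )

    for sample in dataset:
        # Each sample carries the raw tensor plus the bookkeeping keys produced by __next__.
        print(sample["video_index"], sample["video_name"], sample["video"].shape, sample["label"])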