This repository has been archived by the owner on Oct 9, 2023. It is now read-only.

from_tensors support for VideoClassification #1389

Merged: 48 commits, Sep 1, 2022

Commits
bef8e50  WIP: from_tensors support (krshrimali, Jul 14, 2022)
b29cdb2  remove unused func in tests (krshrimali, Jul 14, 2022)
6d1a0be  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Jul 14, 2022)
2f8bb7e  Remove doc, add LabeledVideoTensorDataset (krshrimali, Jul 14, 2022)
9490e31  Fix merge conflict (krshrimali, Jul 14, 2022)
ce882a1  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Jul 14, 2022)
4031f5c  class for prediction (krshrimali, Jul 14, 2022)
22f049d  Fixes for predictions (krshrimali, Jul 14, 2022)
951ae93  Merge branch 'master' into video/feature/classification/from_tensors (krshrimali, Jul 14, 2022)
c36dbbe  minor... to fix the CI (krshrimali, Jul 14, 2022)
23002f2  Merge branch 'video/feature/classification/from_tensors' of github.co… (krshrimali, Jul 14, 2022)
d210bd8  remove make_tensor, use randint (compatible with older pytorch versions) (krshrimali, Jul 15, 2022)
242ca8b  Merge branch 'master' into video/feature/classification/from_tensors (krshrimali, Jul 15, 2022)
738a022  Separate tests for data loading for tensors (krshrimali, Jul 15, 2022)
465fb2f  Separate tests for data loading for tensors (krshrimali, Jul 15, 2022)
b235e7d  Merge branch 'video/feature/classification/from_tensors' of github.co… (krshrimali, Jul 15, 2022)
5da11a4  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Jul 15, 2022)
80ec1eb  Skip doctest if not video installed (krshrimali, Jul 15, 2022)
6617097  Fix tests (krshrimali, Jul 15, 2022)
22edc4e  skip if pytorchvideo not installed (krshrimali, Jul 15, 2022)
df37dd1  correct format in the doctest (krshrimali, Jul 15, 2022)
f864a3b  Merge branch 'master' into video/feature/classification/from_tensors (krshrimali, Jul 21, 2022)
76410e0  Add labels to the call; prediction test (krshrimali, Aug 29, 2022)
6f6de3a  Pass labels, add prediction test (krshrimali, Aug 29, 2022)
6827164  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Aug 29, 2022)
1f61192  Fix doc (krshrimali, Aug 29, 2022)
abd7f22  Merge branch 'video/feature/classification/from_tensors' of github.co… (krshrimali, Aug 29, 2022)
92df464  Update tests/video/classification/test_model.py (krshrimali, Aug 30, 2022)
3e887f3  Update flash/video/classification/utils.py (krshrimali, Aug 30, 2022)
a006bb6  Address review (krshrimali, Aug 30, 2022)
2e65edc  Merge branch 'video/feature/classification/from_tensors' of github.co… (krshrimali, Aug 30, 2022)
3ef8fbd  pep8 (krshrimali, Aug 30, 2022)
6fea612  Update flash/video/classification/utils.py (krshrimali, Aug 30, 2022)
d71ce41  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Aug 30, 2022)
1c9e3f8  Address review: allow stack of tensors, tensor, list of tensors, matc… (krshrimali, Aug 31, 2022)
b81803b  Remove breakpoints (krshrimali, Aug 31, 2022)
13df5af  Fix doctests (krshrimali, Aug 31, 2022)
da2fe56  Fix doctest (krshrimali, Aug 31, 2022)
2381f04  Revert pre-commit change (krshrimali, Aug 31, 2022)
f95ef7d  Merge branch 'master' into video/feature/classification/from_tensors (krshrimali, Aug 31, 2022)
ecc5528  Add license, improve tests - use parametrize, refactor (krshrimali, Sep 1, 2022)
73613f3  Merge branch 'video/feature/classification/from_tensors' of github.co… (krshrimali, Sep 1, 2022)
2d69b40  Fix error for video not available (krshrimali, Sep 1, 2022)
7009f86  unused import (krshrimali, Sep 1, 2022)
62e67f5  Add check for video available or not (krshrimali, Sep 1, 2022)
e5b2350  If not video available, return tensors from randint (krshrimali, Sep 1, 2022)
3e72e1a  mock_video_tensors is removed now (krshrimali, Sep 1, 2022)
1cf3d75  Use _is_list_like instead of isinstance for list/tuple (krshrimali, Sep 1, 2022)

123 changes: 121 additions & 2 deletions flash/video/classification/data.py
@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Callable, Dict, List, Optional, Sequence, Type, Union
from typing import Any, Callable, Collection, Dict, List, Optional, Sequence, Type, Union

import pandas as pd
import torch
@@ -41,6 +41,8 @@
VideoClassificationFilesInput,
VideoClassificationFoldersInput,
VideoClassificationPathsPredictInput,
VideoClassificationTensorsInput,
VideoClassificationTensorsPredictInput,
)
from flash.video.classification.input_transform import VideoClassificationInputTransform

@@ -63,6 +65,7 @@
"VideoClassificationData.from_folders",
"VideoClassificationData.from_data_frame",
"VideoClassificationData.from_csv",
"VideoClassificationData.from_tensors",
]
if not _VIDEO_EXTRAS_TESTING:
__doctest_skip__ += ["VideoClassificationData.from_fiftyone"]
@@ -395,7 +398,6 @@ def from_data_frame(
predict_data_frame: Optional[pd.DataFrame] = None,
predict_videos_root: Optional[str] = None,
predict_resolver: Optional[Callable[[str, str], str]] = None,
target_formatter: Optional[TargetFormatter] = None,
clip_sampler: Union[str, "ClipSampler"] = "random",
clip_duration: float = 2,
clip_sampler_kwargs: Dict[str, Any] = None,
@@ -404,6 +406,7 @@
decoder: str = "pyav",
input_cls: Type[Input] = VideoClassificationDataFrameInput,
predict_input_cls: Type[Input] = VideoClassificationDataFramePredictInput,
target_formatter: Optional[TargetFormatter] = None,
transform: INPUT_TRANSFORM_TYPE = VideoClassificationInputTransform,
transform_kwargs: Optional[Dict] = None,
**data_module_kwargs: Any,
@@ -566,6 +569,122 @@ def from_data_frame(
**data_module_kwargs,
)

@classmethod
def from_tensors(
cls,
train_data: Optional[Union[Collection[torch.Tensor], torch.Tensor]] = None,
train_targets: Optional[Collection[Any]] = None,
val_data: Optional[Union[Collection[torch.Tensor], torch.Tensor]] = None,
val_targets: Optional[Sequence[Any]] = None,
test_data: Optional[Collection[torch.Tensor]] = None,
test_targets: Optional[Sequence[Any]] = None,
predict_data: Optional[Union[Collection[torch.Tensor], torch.Tensor]] = None,
target_formatter: Optional[TargetFormatter] = None,
video_sampler: Type[Sampler] = torch.utils.data.SequentialSampler,
input_cls: Type[Input] = VideoClassificationTensorsInput,
predict_input_cls: Type[Input] = VideoClassificationTensorsPredictInput,
transform: INPUT_TRANSFORM_TYPE = VideoClassificationInputTransform,
transform_kwargs: Optional[Dict] = None,
**data_module_kwargs: Any,
) -> "VideoClassificationData":
"""Load the :class:`~flash.video.classification.data.VideoClassificationData` from a dictionary containing
PyTorch tensors representing input video frames and their corresponding targets.

Input tensor(s) will be extracted from the ``input_field`` in the ``dict``.
The targets will be extracted from the ``target_fields`` in the ``dict`` and can be in any of our
:ref:`supported classification target formats <formatting_classification_targets>`.

To learn how to customize the transforms applied for each stage, read our
:ref:`customizing transforms guide <customizing_transforms>`.

Args:
train_data: The torch tensor or list of tensors to use when training.
train_targets: The list of targets to use when training.
val_data: The torch tensor or list of tensors to use when validating.
val_targets: The list of targets to use when validating.
test_data: The torch tensor or list of tensors to use when testing.
test_targets: The list of targets to use when testing.
predict_data: The torch tensor or list of tensors to use when predicting.
target_formatter: Optionally provide a :class:`~flash.core.data.utilities.classification.TargetFormatter` to
control how targets are handled. See :ref:`formatting_classification_targets` for more details.
video_sampler: Sampler for the internal video container. This defines the order tensors are used and,
if necessary, the distributed split.
input_cls: The :class:`~flash.core.data.io.input.Input` type to use for loading the data.
predict_input_cls: The :class:`~flash.core.data.io.input.Input` type to use for loading the prediction data.
transform: The :class:`~flash.core.data.io.input_transform.InputTransform` type to use.
transform_kwargs: Dict of keyword arguments to be provided when instantiating the transforms.
data_module_kwargs: Additional keyword arguments to provide to the
:class:`~flash.core.data.data_module.DataModule` constructor.

Returns:
The constructed :class:`~flash.video.classification.data.VideoClassificationData`.

Examples
________

.. doctest::

>>> import torch
>>> from flash import Trainer
>>> from flash.video import VideoClassifier, VideoClassificationData
>>> frame = torch.randint(low=0, high=255, size=(3, 5, 10, 10), dtype=torch.uint8, device="cpu")
>>> datamodule = VideoClassificationData.from_tensors(
... train_data=[frame, frame, frame],
... train_targets=["fruit", "vegetable", "fruit"],
... val_data=[frame, frame],
... val_targets=["vegetable", "fruit"],
... predict_data=[frame],
... batch_size=1,
... )
>>> datamodule.num_classes
2
>>> datamodule.labels
['fruit', 'vegetable']
>>> model = VideoClassifier(backbone="x3d_xs", num_classes=datamodule.num_classes)
>>> trainer = Trainer(fast_dev_run=True)
>>> trainer.fit(model, datamodule=datamodule) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
Training...
>>> trainer.predict(model, datamodule=datamodule) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
Predicting...

.. testcleanup::

>>> del frame
"""

train_input = input_cls(
RunningStage.TRAINING,
train_data,
train_targets,
video_sampler=video_sampler,
target_formatter=target_formatter,
)
target_formatter = getattr(train_input, "target_formatter", None)

return cls(
train_input,
input_cls(
RunningStage.VALIDATING,
val_data,
val_targets,
video_sampler=video_sampler,
target_formatter=target_formatter,
),
input_cls(
RunningStage.TESTING,
test_data,
test_targets,
video_sampler=video_sampler,
target_formatter=target_formatter,
),
predict_input_cls(RunningStage.PREDICTING, predict_data),
transform=transform,
transform_kwargs=transform_kwargs,
**data_module_kwargs,
)

@classmethod
def from_csv(
cls,
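
Commit 1c9e3f8 above extended ``from_tensors`` to also accept a single stacked tensor (or a single
clip) in place of a list of tensors. A minimal sketch of that call, assuming the video extras
(pytorchvideo) are installed:

    import torch

    from flash.video import VideoClassificationData

    # One 5-D stack in (N, C, T, H, W) layout; from_tensors normalizes it to the
    # same list-of-4-D-clips form used in the doctest above.
    clips = torch.randint(low=0, high=255, size=(3, 3, 5, 10, 10), dtype=torch.uint8)

    datamodule = VideoClassificationData.from_tensors(
        train_data=clips,  # stacked tensor instead of [clip, clip, clip]
        train_targets=["fruit", "vegetable", "fruit"],
        predict_data=[clips[0]],  # a list of 4-D clips, as in the doctest
        batch_size=1,
    )
    assert datamodule.num_classes == 2
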
104 changes: 101 additions & 3 deletions flash/video/classification/input.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from typing import Any, Callable, Dict, List, Optional, Type, Union
from typing import Any, Callable, Collection, Dict, List, Optional, Type, Union

import pandas as pd
import torch
@@ -21,7 +21,7 @@

from flash.core.data.io.classification_input import ClassificationInputMixin
from flash.core.data.io.input import DataKeys, Input, IterableInput
from flash.core.data.utilities.classification import MultiBinaryTargetFormatter, TargetFormatter
from flash.core.data.utilities.classification import _is_list_like, MultiBinaryTargetFormatter, TargetFormatter
from flash.core.data.utilities.data_frame import resolve_files, resolve_targets
from flash.core.data.utilities.loading import load_data_frame
from flash.core.data.utilities.paths import list_valid_files, make_dataset, PATH_TYPE
@@ -40,8 +40,17 @@
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.data.labeled_video_dataset import LabeledVideoDataset
from pytorchvideo.data.labeled_video_paths import LabeledVideoPaths

from flash.video.classification.utils import LabeledVideoTensorDataset

else:
ClipSampler, LabeledVideoDataset, EncodedVideo, ApplyTransformToKey = None, None, None, None
ClipSampler, LabeledVideoDataset, LabeledVideoTensorDataset, EncodedVideo, ApplyTransformToKey = (
None,
None,
None,
None,
None,
)


def _make_clip_sampler(
@@ -87,6 +96,43 @@ def load_sample(self, sample):
return sample


class VideoClassificationTensorsBaseInput(IterableInput, ClassificationInputMixin):
def load_data(
self,
inputs: Optional[Union[Collection[torch.Tensor], torch.Tensor]],
targets: Union[List[Any], Any],
video_sampler: Type[Sampler] = torch.utils.data.RandomSampler,
target_formatter: Optional[TargetFormatter] = None,
) -> "LabeledVideoTensorDataset":
if isinstance(inputs, torch.Tensor):
# In case of (number of videos x CTHW) format
if inputs.ndim == 5:
inputs = list(inputs)
elif inputs.ndim == 4:
inputs = [inputs]
else:
raise ValueError(
f"Got dimension of the input tensor: {inputs.ndim}"
" for stack of tensors - dimension should be 5 or for a single tensor, dimension should be 4.",
)
elif not _is_list_like(inputs):
raise TypeError(f"Expected either a list/tuple of torch.Tensor or torch.Tensor, but got: {type(inputs)}.")

# Note: We take whatever is the shortest out of inputs and targets
dataset = LabeledVideoTensorDataset(list(zip(inputs, targets)), video_sampler=video_sampler)
if not self.predicting:
self.load_target_metadata(
[sample[1] for sample in dataset._labeled_videos], target_formatter=target_formatter
)
return dataset

def load_sample(self, sample):
sample["label"] = self.format_target(sample["label"])
sample[DataKeys.INPUT] = sample.pop("video")
sample[DataKeys.TARGET] = sample.pop("label")
return sample
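
The branching above reduces to a small normalization rule: a 5-D ``(N, C, T, H, W)`` stack becomes a
list of ``N`` clips, a single 4-D ``(C, T, H, W)`` clip becomes a one-element list, and anything else
must already be list-like. A standalone sketch in plain PyTorch (the helper name is illustrative, not
from the PR):

    import torch

    def normalize_clips(inputs):
        # Mirrors VideoClassificationTensorsBaseInput.load_data: always return
        # a list of 4-D (C, T, H, W) tensors.
        if isinstance(inputs, torch.Tensor):
            if inputs.ndim == 5:  # (N, C, T, H, W) stack
                return list(inputs)
            if inputs.ndim == 4:  # single (C, T, H, W) clip
                return [inputs]
            raise ValueError(f"Expected a 4-D or 5-D tensor, got {inputs.ndim}-D.")
        return list(inputs)  # already a list/tuple of clips

    stack = torch.zeros(3, 3, 5, 10, 10)
    assert len(normalize_clips(stack)) == 3
    assert normalize_clips(stack[0])[0].shape == (3, 5, 10, 10)
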


class VideoClassificationFoldersInput(VideoClassificationInput):
def load_data(
self,
@@ -178,6 +224,34 @@ def load_data(
return result


class VideoClassificationTensorsInput(VideoClassificationTensorsBaseInput):
labels: list

def load_data(
self,
tensors: Any,
targets: Optional[List[Any]] = None,
video_sampler: Type[Sampler] = torch.utils.data.RandomSampler,
target_formatter: Optional[TargetFormatter] = None,
) -> "LabeledVideoTensorDataset":
result = super().load_data(
tensors,
targets,
video_sampler=video_sampler,
target_formatter=target_formatter,
)

# If we had binary multi-class targets then we also know the labels (column names)
if (
self.training
and isinstance(self.target_formatter, MultiBinaryTargetFormatter)
and isinstance(targets, List)
):
self.labels = targets

return result
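
The ``MultiBinaryTargetFormatter`` branch above records the raw training targets as ``labels`` when
each target is a binary indicator row. A hedged sketch of a call that is expected to hit that branch
(flash auto-detects the target format; assumes the video extras are installed):

    import torch

    from flash.video import VideoClassificationData

    clip = torch.randint(low=0, high=255, size=(3, 5, 10, 10), dtype=torch.uint8)

    # Multi-label training: each target is a binary indicator vector, which
    # flash should format with MultiBinaryTargetFormatter.
    datamodule = VideoClassificationData.from_tensors(
        train_data=[clip, clip],
        train_targets=[[1, 0], [0, 1]],
        batch_size=1,
    )
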


class VideoClassificationCSVInput(VideoClassificationDataFrameInput):
def load_data(
self,
@@ -316,6 +390,30 @@ def predict_load_data(
)


class VideoClassificationTensorsPredictInput(Input):
def predict_load_data(self, data: Union[torch.Tensor, List[Any], Any]):
if _is_list_like(data):
return data
else:
if not isinstance(data, torch.Tensor):
raise TypeError(f"Expected either a list/tuple of torch.Tensor or torch.Tensor, but got: {type(data)}.")
if data.ndim == 5:
return list(data)
elif data.ndim == 4:
return [data]
else:
raise ValueError(
f"Got dimension of the input tensor: {data.ndim},"
" for stack of tensors - dimension should be 5 or for a single tensor, dimension should be 4."
)

def predict_load_sample(self, sample: torch.Tensor) -> Dict[str, Any]:
return {
DataKeys.INPUT: sample,
"video_index": 0,
}
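
On the predict path, each element of ``predict_data`` becomes one sample wrapped as
``{DataKeys.INPUT: sample, "video_index": 0}``. A short sketch of a predict-only datamodule, again
assuming the video extras are installed:

    import torch

    from flash import Trainer
    from flash.video import VideoClassifier, VideoClassificationData

    clip = torch.randint(low=0, high=255, size=(3, 5, 10, 10), dtype=torch.uint8)

    # A list of 4-D clips: one prediction per clip.
    datamodule = VideoClassificationData.from_tensors(predict_data=[clip, clip], batch_size=1)

    model = VideoClassifier(backbone="x3d_xs", num_classes=2)
    predictions = Trainer(fast_dev_run=True).predict(model, datamodule=datamodule)
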


class VideoClassificationCSVPredictInput(VideoClassificationDataFramePredictInput):
def predict_load_data(
self,