Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add VideoModelZoo models #1130

Merged
merged 8 commits into from
Jul 26, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions test/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ def get_available_detection_models():
return [k for k, v in models.detection.__dict__.items() if callable(v) and k[0].lower() == k[0] and k[0] != "_"]


def get_available_video_models():
    # TODO add a registration mechanism to torchvision.models
    # Public video-model constructors are the callables whose name starts
    # with a lowercase character (classes and private helpers are excluded).
    available = []
    for name, obj in models.video.__dict__.items():
        if callable(obj) and name[0] == name[0].lower() and name[0] != "_":
            available.append(name)
    return available


class Tester(unittest.TestCase):
def _test_classification_model(self, name, input_shape):
# passing num_class equal to a number other than 1000 helps in making the test
Expand Down Expand Up @@ -53,6 +58,16 @@ def _test_detection_model(self, name):
self.assertTrue("scores" in out[0])
self.assertTrue("labels" in out[0])

def _test_video_model(self, name):
    """Smoke-test a video model: run one clip through and check the logits."""
    # Default input layout is batch x channels x clip_len x height x width.
    input_shape = (1, 3, 8, 112, 112)
    num_classes = 50
    # Exercises both BasicBlock- and Bottleneck-based architectures.
    model = models.video.__dict__[name](num_classes=num_classes)
    clip = torch.rand(input_shape)
    logits = model(clip)
    self.assertEqual(logits.shape[-1], num_classes)

def _make_sliced_model(self, model, stop_layer):
layers = OrderedDict()
for name, layer in model.named_children():
Expand Down Expand Up @@ -130,6 +145,12 @@ def do_test(self, model_name=model_name):

setattr(Tester, "test_" + model_name, do_test)

# Dynamically register one test method per available video model.
for model_name in get_available_video_models():

    # model_name is bound as a default argument so each generated test
    # captures the current loop value (avoids the late-binding closure pitfall).
    def do_test(self, model_name=model_name):
        self._test_video_model(model_name)

    setattr(Tester, "test_" + model_name, do_test)

if __name__ == '__main__':
unittest.main()
1 change: 1 addition & 0 deletions torchvision/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@
from .shufflenetv2 import *
from . import segmentation
from . import detection
from . import video
3 changes: 3 additions & 0 deletions torchvision/models/video/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .r3d import *
from .r2plus1d import *
from .mixed_conv import *
72 changes: 72 additions & 0 deletions torchvision/models/video/_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import torch.nn as nn


__all__ = ["Conv3DSimple", "Conv2Plus1D", "Conv3DNoTemporal"]


class Conv3DSimple(nn.Conv3d):
    """Plain full-3D convolution with a 3x3x3 kernel (no factorization)."""

    def __init__(self, in_planes, out_planes, midplanes=None, stride=1, padding=1):
        # midplanes is unused here; it is accepted so that every conv maker
        # shares the same constructor signature.
        super(Conv3DSimple, self).__init__(
            in_channels=in_planes,
            out_channels=out_planes,
            kernel_size=(3, 3, 3),
            stride=stride,
            padding=padding,
            bias=False)

    @staticmethod
    def get_downsample_stride(stride):
        # Downsample time and space by the same factor.
        return (stride,) * 3


class Conv2Plus1D(nn.Sequential):
    """Factorized (2+1)D convolution: a spatial 1x3x3 conv, BatchNorm and ReLU,
    followed by a temporal 3x1x1 conv, as in https://arxiv.org/abs/1711.11248.
    """

    def __init__(self, in_planes, out_planes, midplanes, stride=1, padding=1):
        # Spatial-only convolution into the intermediate width.
        spatial = nn.Conv3d(in_planes, midplanes, kernel_size=(1, 3, 3),
                            stride=(1, stride, stride),
                            padding=(0, padding, padding),
                            bias=False)
        # Temporal-only convolution up to the output width.
        temporal = nn.Conv3d(midplanes, out_planes, kernel_size=(3, 1, 1),
                             stride=(stride, 1, 1),
                             padding=(padding, 0, 0),
                             bias=False)
        super(Conv2Plus1D, self).__init__(
            spatial,
            nn.BatchNorm3d(midplanes),
            nn.ReLU(inplace=True),
            temporal)

    @staticmethod
    def get_downsample_stride(stride):
        # Downsample time and space by the same factor.
        return (stride,) * 3


class Conv3DNoTemporal(nn.Conv3d):
    """Spatial-only 3D convolution (1x3x3 kernel): the temporal dimension is
    never convolved or strided.
    """

    def __init__(self, in_planes, out_planes, midplanes=None, stride=1, padding=1):
        # midplanes is unused here; it is accepted so that every conv maker
        # shares the same constructor signature.
        super(Conv3DNoTemporal, self).__init__(
            in_channels=in_planes,
            out_channels=out_planes,
            kernel_size=(1, 3, 3),
            stride=(1, stride, stride),
            padding=(0, padding, padding),
            bias=False)

    @staticmethod
    def get_downsample_stride(stride):
        # Only the spatial dimensions are downsampled; time is untouched.
        return (1, stride, stride)
bjuncek marked this conversation as resolved.
Show resolved Hide resolved
78 changes: 78 additions & 0 deletions torchvision/models/video/mixed_conv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import torch.nn as nn

from ._utils import Conv3DSimple, Conv3DNoTemporal
from .video_stems import get_default_stem
from .video_trunk import VideoTrunkBuilder, BasicBlock, Bottleneck


__all__ = ["mc3_18"]


def _mcX(model_depth, X=3, use_pool1=False, **kwargs):
    """Generate mixed convolution network as in
    https://arxiv.org/abs/1711.11248

    Args:
        model_depth (int): trunk depth - supports most resnet depths
        X (int): Up to which layers are convolutions 3D
        use_pool1 (bool, optional): Add pooling layer to the stem. Defaults to False.

    Returns:
        nn.Module: mcX video trunk
    """
    assert X > 1 and X <= 5
    # The stem counts as layer 1, so layers 2..X of the trunk use full 3D
    # convolutions and the remaining ones are spatial-only. The trunk has
    # 4 residual stages, so exactly 4 conv makers are needed — consistent
    # with _r3d and _r2plus1d, which also pass 4. (The original
    # `while len(conv_makers) < 5` produced a fifth, extraneous maker.)
    conv_makers = [Conv3DSimple] * (X - 2)
    while len(conv_makers) < 4:
        conv_makers.append(Conv3DNoTemporal)

    # ResNet-18/34-style depths use BasicBlock; deeper trunks use Bottleneck.
    if model_depth < 50:
        block = BasicBlock
    else:
        block = Bottleneck

    model = VideoTrunkBuilder(block=block, conv_makers=conv_makers, model_depth=model_depth,
                              stem=get_default_stem(use_pool1=use_pool1), **kwargs)

    return model


def _rmcX(model_depth, X=3, use_pool1=False, **kwargs):
    """Generate reverse mixed convolution network as in
    https://arxiv.org/abs/1711.11248

    Args:
        model_depth (int): trunk depth - supports most resnet depths
        X (int): Up to which layers are convolutions 2D
        use_pool1 (bool, optional): Add pooling layer to the stem. Defaults to False.

    Returns:
        nn.Module: rmcX video trunk
    """
    assert X > 1 and X <= 5

    # Mirror image of _mcX: layers 2..X are spatial-only and the remaining
    # layers are full 3D. The trunk has 4 residual stages, so exactly 4
    # conv makers are needed — consistent with _r3d and _r2plus1d.
    # (The original `while len(conv_makers) < 5` produced a fifth,
    # extraneous maker.)
    conv_makers = [Conv3DNoTemporal] * (X - 2)
    while len(conv_makers) < 4:
        conv_makers.append(Conv3DSimple)

    # ResNet-18/34-style depths use BasicBlock; deeper trunks use Bottleneck.
    if model_depth < 50:
        block = BasicBlock
    else:
        block = Bottleneck

    model = VideoTrunkBuilder(block=block, conv_makers=conv_makers, model_depth=model_depth,
                              stem=get_default_stem(use_pool1=use_pool1), **kwargs)

    return model


def mc3_18(use_pool1=False, **kwargs):
    """Constructor for 18 layer Mixed Convolution network as in
    https://arxiv.org/abs/1711.11248

    Args:
        use_pool1 (bool, optional): Include pooling in the resnet stem. Defaults to False.

    Returns:
        nn.Module: MC3 Network definition
    """
    return _mcX(18, 3, use_pool1, **kwargs)
43 changes: 43 additions & 0 deletions torchvision/models/video/r2plus1d.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import torch.nn as nn

from ._utils import Conv2Plus1D
from .video_stems import get_r2plus1d_stem
from .video_trunk import VideoTrunkBuilder, BasicBlock, Bottleneck


__all__ = ["r2plus1d_18"]


def _r2plus1d(model_depth, use_pool1=False, **kwargs):
    """Constructor for R(2+1)D network as described in
    https://arxiv.org/abs/1711.11248

    Args:
        model_depth (int): Depth of the model - standard resnet depths apply
        use_pool1 (bool, optional): Should we use the pooling layer? Defaults to False
    Returns:
        nn.Module: An R(2+1)D video backbone
    """
    # Every residual stage uses the factorized (2+1)D convolution.
    conv_makers = [Conv2Plus1D for _ in range(4)]
    # ResNet-18/34-style depths use BasicBlock; deeper trunks use Bottleneck.
    block = BasicBlock if model_depth < 50 else Bottleneck

    return VideoTrunkBuilder(
        block=block, conv_makers=conv_makers, model_depth=model_depth,
        stem=get_r2plus1d_stem(use_pool1), **kwargs)


def r2plus1d_18(use_pool1=False, **kwargs):
    """Constructor for the 18 layer deep R(2+1)D network as in
    https://arxiv.org/abs/1711.11248

    Args:
        use_pool1 (bool, optional): Include pooling in the resnet stem. Defaults to False.

    Returns:
        nn.Module: R(2+1)D-18 network
    """
    return _r2plus1d(18, use_pool1, **kwargs)
43 changes: 43 additions & 0 deletions torchvision/models/video/r3d.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import torch.nn as nn

from ._utils import Conv3DSimple
from .video_stems import get_default_stem
from .video_trunk import VideoTrunkBuilder, BasicBlock, Bottleneck

__all__ = ["r3d_18"]


def _r3d(model_depth, use_pool1=False, **kwargs):
    """Constructor of a r3d network as in
    https://arxiv.org/abs/1711.11248

    Args:
        model_depth (int): resnet trunk depth
        use_pool1 (bool, optional): Add pooling layer to the stem. Defaults to False

    Returns:
        nn.Module: R3D network trunk
    """
    # Every residual stage uses a plain full-3D convolution.
    conv_makers = [Conv3DSimple for _ in range(4)]
    # ResNet-18/34-style depths use BasicBlock; deeper trunks use Bottleneck.
    block = BasicBlock if model_depth < 50 else Bottleneck

    return VideoTrunkBuilder(
        block=block, conv_makers=conv_makers, model_depth=model_depth,
        stem=get_default_stem(use_pool1=use_pool1), **kwargs)


def r3d_18(use_pool1=False, **kwargs):
    """Construct 18 layer Resnet3D model as in
    https://arxiv.org/abs/1711.11248

    Args:
        use_pool1 (bool, optional): Include pooling in resnet stem. Defaults to False.

    Returns:
        nn.Module: R3D-18 network
    """
    return _r3d(18, use_pool1, **kwargs)
48 changes: 48 additions & 0 deletions torchvision/models/video/video_stems.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import torch.nn as nn


def get_default_stem(use_pool1=False):
    """The default conv-batchnorm-relu(-maxpool) stem

    The 3x7x7 convolution strides only spatially (stride (1, 2, 2)); time is
    downsampled only by the optional max-pool.

    Args:
        use_pool1 (bool, optional): Should the stem include the default maxpool? Defaults to False.

    Returns:
        nn.Sequential: Conv1 stem of resnet based models.
    """
    layers = [
        nn.Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2),
                  padding=(1, 3, 3), bias=False),
        nn.BatchNorm3d(64),
        nn.ReLU(inplace=True),
    ]
    if use_pool1:
        layers.append(nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1))
    return nn.Sequential(*layers)


def get_r2plus1d_stem(use_pool1=False):
    """R(2+1)D stem is different than the default one as it uses separated 3D convolution

    A spatial 1x7x7 conv into 45 channels is followed by a temporal 3x1x1
    conv into 64 channels, each with its own BatchNorm and ReLU.

    Args:
        use_pool1 (bool, optional): Should the stem contain pool1 layer. Defaults to False.

    Returns:
        nn.Sequential: the stem of the conv-separated network.
    """
    layers = [
        nn.Conv3d(3, 45, kernel_size=(1, 7, 7),
                  stride=(1, 2, 2), padding=(0, 3, 3),
                  bias=False),
        nn.BatchNorm3d(45),
        nn.ReLU(inplace=True),
        nn.Conv3d(45, 64, kernel_size=(3, 1, 1),
                  stride=(1, 1, 1), padding=(1, 0, 0),
                  bias=False),
        nn.BatchNorm3d(64),
        nn.ReLU(inplace=True),
    ]
    if use_pool1:
        layers.append(nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1))
    return nn.Sequential(*layers)
Loading