From 2bc646ea3e88e9d3e9add396656c434a00ec60d8 Mon Sep 17 00:00:00 2001 From: Kaushik Bokka Date: Sun, 14 Feb 2021 23:18:04 +0530 Subject: [PATCH 01/17] add backbones for detection --- flash/vision/backbones.py | 22 ++++++++++++- flash/vision/detection/model.py | 56 +++++++++++++++++++++++---------- 2 files changed, 60 insertions(+), 18 deletions(-) diff --git a/flash/vision/backbones.py b/flash/vision/backbones.py index 8259af09c7d..fa9c3403092 100644 --- a/flash/vision/backbones.py +++ b/flash/vision/backbones.py @@ -11,12 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Tuple +from typing import Any, Optional, Tuple import torchvision from pytorch_lightning.utilities import _BOLTS_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException from torch import nn as nn +from torchvision.models.detection.backbone_utils import resnet_fpn_backbone if _BOLTS_AVAILABLE: from pl_bolts.models.self_supervised import SimCLR, SwAV @@ -109,3 +110,22 @@ def torchvision_backbone_and_num_features(model_name: str, pretrained: bool = Tr return backbone, num_features raise ValueError(f"{model_name} is not supported yet.") + + +def fetch_fasterrcnn_backbone_and_num_features( + backbone: str, + fpn: bool = True, + pretrained: Optional[str] = None, + trainable_backbone_layers: int = 3, + **kwargs: Any +) -> nn.Module: + if fpn: + if backbone in RESNET_MODELS: + backbone = resnet_fpn_backbone(backbone, pretrained, trainable_backbone_layers, **kwargs) + num_features = 512 if backbone in RESNET_MODELS[:2] else 2048 + return backbone, num_features + else: + raise MisconfigurationException(f"{backbone} is not supported with `fpn=True`") + else: + backbone, num_features = backbone_and_num_features(backbone, pretrained) + return backbone, num_features diff --git a/flash/vision/detection/model.py 
b/flash/vision/detection/model.py index dead9955152..8f5492a9179 100644 --- a/flash/vision/detection/model.py +++ b/flash/vision/detection/model.py @@ -11,20 +11,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Callable, Mapping, Sequence, Type, Union +from typing import Any, Callable, Mapping, Optional, Sequence, Type, Union import torch import torchvision from torch import nn from torch.optim import Optimizer +from torchvision.models.detection.faster_rcnn import FasterRCNN as torchvision_FasterRCNN +from torchvision.models.detection.faster_rcnn import fasterrcnn_resnet50_fpn, FastRCNNPredictor +from torchvision.models.detection.rpn import AnchorGenerator from torchvision.ops import box_iou from flash.core import Task +from flash.vision.backbones import fetch_fasterrcnn_backbone_and_num_features from flash.vision.detection.data import ObjectDetectionDataPipeline from flash.vision.detection.finetuning import ObjectDetectionFineTuning -_models = {"fasterrcnn_resnet50_fpn": torchvision.models.detection.fasterrcnn_resnet50_fpn} - def _evaluate_iou(target, pred): """ @@ -37,14 +39,17 @@ def _evaluate_iou(target, pred): class ObjectDetector(Task): - """Image detection task + """Object detection task Ref: Lightning Bolts https://github.com/PyTorchLightning/pytorch-lightning-bolts Args: num_classes: the number of classes for detection, including background - model: either a string of :attr`_models` or a custom nn.Module. - Defaults to 'fasterrcnn_resnet50_fpn'. + backbone: Pretained backbone CNN architecture. + fpn: If True, creates a Feature Pyramind Network on top of Resnet based CNNs. 
+ pretrained: if true, returns a model pre-trained on COCO train2017 + pretrained_backbone: if true, returns a model with backbone pre-trained on Imagenet + trainable_backbone_layers: number of trainable resnet layers starting from final block loss: the function(s) to update the model with. Has no effect for torchvision detection models. metrics: The provided metrics. All metrics here will be logged to progress bar and the respective logger. optimizer: The optimizer to use for training. Can either be the actual class or the class name. @@ -57,25 +62,42 @@ class ObjectDetector(Task): def __init__( self, num_classes: int, - model: Union[str, nn.Module] = "fasterrcnn_resnet50_fpn", + backbone: Optional[str] = None, + fpn: bool = True, + pretrained: bool = False, + pretrained_backbone: bool = True, + trainable_backbone_layers: int = 3, loss=None, metrics: Union[Callable, nn.Module, Mapping, Sequence, None] = None, optimizer: Type[Optimizer] = torch.optim.Adam, - pretrained: bool = True, - learning_rate=1e-3, - **kwargs, + learning_rate: float = 1e-3, + **kwargs: Any, ): self.save_hyperparameters() - if model in _models: - model = _models[model](pretrained=pretrained) - if isinstance(model, torchvision.models.detection.FasterRCNN): - in_features = model.roi_heads.box_predictor.cls_score.in_features - head = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(in_features, num_classes) - model.roi_heads.box_predictor = head + if backbone is None: + model = fasterrcnn_resnet50_fpn( + pretrained=pretrained, + pretrained_backbone=pretrained_backbone, + trainable_backbone_layers=trainable_backbone_layers, + ) + in_features = model.roi_heads.box_predictor.cls_score.in_features + head = FastRCNNPredictor(in_features, num_classes) + model.roi_heads.box_predictor = head else: - ValueError(f"{model} is not supported yet.") + backbone_model, num_features = fetch_fasterrcnn_backbone_and_num_features( + backbone, + fpn, + pretrained_backbone, + trainable_backbone_layers, + 
**kwargs, + ) + backbone_model.out_channels = num_features + anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512), ), aspect_ratios=((0.5, 1.0, 2.0), )) + model = torchvision_FasterRCNN( + backbone_model, num_classes=num_classes, rpn_anchor_generator=anchor_generator, **kwargs + ) super().__init__( model=model, From e38734fdd880605107c11eb0f94b19b64b723770 Mon Sep 17 00:00:00 2001 From: Kaushik Bokka Date: Sun, 14 Feb 2021 23:30:02 +0530 Subject: [PATCH 02/17] add warning for fpn --- flash/vision/backbones.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/flash/vision/backbones.py b/flash/vision/backbones.py index fa9c3403092..b1bdd7ce8b4 100644 --- a/flash/vision/backbones.py +++ b/flash/vision/backbones.py @@ -14,7 +14,7 @@ from typing import Any, Optional, Tuple import torchvision -from pytorch_lightning.utilities import _BOLTS_AVAILABLE +from pytorch_lightning.utilities import _BOLTS_AVAILABLE, rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException from torch import nn as nn from torchvision.models.detection.backbone_utils import resnet_fpn_backbone @@ -125,7 +125,6 @@ def fetch_fasterrcnn_backbone_and_num_features( num_features = 512 if backbone in RESNET_MODELS[:2] else 2048 return backbone, num_features else: - raise MisconfigurationException(f"{backbone} is not supported with `fpn=True`") - else: - backbone, num_features = backbone_and_num_features(backbone, pretrained) - return backbone, num_features + rank_zero_warn(f"{backbone} is not supported with `fpn=True`, `fpn` won't be added.") + backbone, num_features = backbone_and_num_features(backbone, pretrained) + return backbone, num_features From 336912104f7c6454f85a435434f1dba34d66ad38 Mon Sep 17 00:00:00 2001 From: Kaushik Bokka Date: Sun, 14 Feb 2021 23:42:57 +0530 Subject: [PATCH 03/17] update parameters for resnet fasterrcnn backbone --- flash/vision/backbones.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff 
--git a/flash/vision/backbones.py b/flash/vision/backbones.py index b1bdd7ce8b4..564eea0eeca 100644 --- a/flash/vision/backbones.py +++ b/flash/vision/backbones.py @@ -121,7 +121,9 @@ def fetch_fasterrcnn_backbone_and_num_features( ) -> nn.Module: if fpn: if backbone in RESNET_MODELS: - backbone = resnet_fpn_backbone(backbone, pretrained, trainable_backbone_layers, **kwargs) + backbone = resnet_fpn_backbone( + backbone, pretrained=pretrained, trainable_layers=trainable_backbone_layers, **kwargs + ) num_features = 512 if backbone in RESNET_MODELS[:2] else 2048 return backbone, num_features else: From 7d53f118e240c42c2cc4a97ff1675b864c4e9ff0 Mon Sep 17 00:00:00 2001 From: Kaushik Bokka Date: Mon, 15 Feb 2021 00:38:15 +0530 Subject: [PATCH 04/17] add model data integration tests & conditional anchor generator --- flash/vision/backbones.py | 8 ++++---- flash/vision/detection/model.py | 6 ++++-- tests/vision/detection/test_data_model_integration.py | 5 +++-- tests/vision/detection/test_model.py | 2 +- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/flash/vision/backbones.py b/flash/vision/backbones.py index 564eea0eeca..ad019f548fb 100644 --- a/flash/vision/backbones.py +++ b/flash/vision/backbones.py @@ -115,7 +115,7 @@ def torchvision_backbone_and_num_features(model_name: str, pretrained: bool = Tr def fetch_fasterrcnn_backbone_and_num_features( backbone: str, fpn: bool = True, - pretrained: Optional[str] = None, + pretrained: bool = True, trainable_backbone_layers: int = 3, **kwargs: Any ) -> nn.Module: @@ -124,9 +124,9 @@ def fetch_fasterrcnn_backbone_and_num_features( backbone = resnet_fpn_backbone( backbone, pretrained=pretrained, trainable_layers=trainable_backbone_layers, **kwargs ) - num_features = 512 if backbone in RESNET_MODELS[:2] else 2048 - return backbone, num_features + fpn_out_channels = 256 + return backbone, fpn_out_channels else: - rank_zero_warn(f"{backbone} is not supported with `fpn=True`, `fpn` won't be added.") + 
rank_zero_warn(f"{backbone} backbone is not supported with `fpn=True`, `fpn` won't be added.") backbone, num_features = backbone_and_num_features(backbone, pretrained) return backbone, num_features diff --git a/flash/vision/detection/model.py b/flash/vision/detection/model.py index 8f5492a9179..5a1ea49e3cd 100644 --- a/flash/vision/detection/model.py +++ b/flash/vision/detection/model.py @@ -64,7 +64,7 @@ def __init__( num_classes: int, backbone: Optional[str] = None, fpn: bool = True, - pretrained: bool = False, + pretrained: bool = True, pretrained_backbone: bool = True, trainable_backbone_layers: int = 3, loss=None, @@ -94,7 +94,9 @@ def __init__( **kwargs, ) backbone_model.out_channels = num_features - anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512), ), aspect_ratios=((0.5, 1.0, 2.0), )) + anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512), ), + aspect_ratios=((0.5, 1.0, + 2.0), )) if not hasattr(backbone_model, "fpn") else None model = torchvision_FasterRCNN( backbone_model, num_classes=num_classes, rpn_anchor_generator=anchor_generator, **kwargs ) diff --git a/tests/vision/detection/test_data_model_integration.py b/tests/vision/detection/test_data_model_integration.py index ac814c76168..ca86a608d76 100644 --- a/tests/vision/detection/test_data_model_integration.py +++ b/tests/vision/detection/test_data_model_integration.py @@ -26,12 +26,13 @@ @pytest.mark.skipif(not _COCO_AVAILABLE, reason="pycocotools is not installed for testing") -def test_detection(tmpdir): +@pytest.mark.parametrize("backbone", [None, "resnet34", "mobilenet_v2", "simclr-imagenet"]) +def test_detection(tmpdir, backbone): train_folder, coco_ann_path = _create_synth_coco_dataset(tmpdir) data = ObjectDetectionData.from_coco(train_folder=train_folder, train_ann_file=coco_ann_path, batch_size=1) - model = ObjectDetector(num_classes=data.num_classes) + model = ObjectDetector(backbone=backbone, num_classes=data.num_classes) trainer = 
flash.Trainer(fast_dev_run=True) diff --git a/tests/vision/detection/test_model.py b/tests/vision/detection/test_model.py index 93bc16375c9..3efb557b90f 100644 --- a/tests/vision/detection/test_model.py +++ b/tests/vision/detection/test_model.py @@ -63,7 +63,7 @@ def test_init(): def test_training(tmpdir): - model = ObjectDetector(num_classes=2, model="fasterrcnn_resnet50_fpn") + model = ObjectDetector(num_classes=2) ds = DummyDetectionDataset((3, 224, 224), 1, 2, 10) dl = DataLoader(ds, collate_fn=collate_fn) trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True) From 6adb89867702cef47869716d5787b40cc8260a6f Mon Sep 17 00:00:00 2001 From: Kaushik Bokka Date: Mon, 15 Feb 2021 00:51:09 +0530 Subject: [PATCH 05/17] add tests for backbones --- tests/vision/test_backbones.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 tests/vision/test_backbones.py diff --git a/tests/vision/test_backbones.py b/tests/vision/test_backbones.py new file mode 100644 index 00000000000..428e6d38a57 --- /dev/null +++ b/tests/vision/test_backbones.py @@ -0,0 +1,15 @@ +import pytest + +from flash.vision.backbones import fetch_fasterrcnn_backbone_and_num_features + + +@pytest.mark.parametrize(["backbone", "expected_num_features"], [("resnet34", 512), ("mobilenet_v2", 1280), + ("simclr-imagenet", 2048)]) +def test_fetch_fasterrcnn_backbone_and_num_features(backbone, expected_num_features): + + backbone_model, num_features = fetch_fasterrcnn_backbone_and_num_features( + backbone=backbone, pretrained=False, fpn=False + ) + + assert backbone_model + assert num_features == expected_num_features From 7fd9f2706f4cff89cd3628ffc52953eb5a48a936 Mon Sep 17 00:00:00 2001 From: Kaushik Bokka Date: Mon, 15 Feb 2021 03:51:20 +0530 Subject: [PATCH 06/17] add retinanet & tests --- flash/vision/backbones.py | 40 +++++---- flash/vision/classification/model.py | 2 +- flash/vision/detection/model.py | 81 +++++++++++++------ .../vision/embedding/image_embedder_model.py | 2 +- 
.../detection/test_data_model_integration.py | 7 +- tests/vision/detection/test_model.py | 6 +- tests/vision/test_backbones.py | 6 +- 7 files changed, 87 insertions(+), 57 deletions(-) diff --git a/flash/vision/backbones.py b/flash/vision/backbones.py index ad019f548fb..0d18541a55f 100644 --- a/flash/vision/backbones.py +++ b/flash/vision/backbones.py @@ -33,12 +33,28 @@ BOLTS_MODELS = ["simclr-imagenet", "swav-imagenet"] -def backbone_and_num_features(model_name: str, *args, **kwargs) -> Tuple[nn.Module, int]: +def backbone_and_num_features( + model_name: str, + fpn: bool = False, + pretrained: bool = True, + trainable_backbone_layers: int = 3, + **kwargs +) -> Tuple[nn.Module, int]: + if fpn: + if model_name in RESNET_MODELS: + backbone = resnet_fpn_backbone( + model_name, pretrained=pretrained, trainable_layers=trainable_backbone_layers, **kwargs + ) + fpn_out_channels = 256 + return backbone, fpn_out_channels + else: + rank_zero_warn(f"{model_name} backbone is not supported with `fpn=True`, `fpn` won't be added.") + if model_name in BOLTS_MODELS: return bolts_backbone_and_num_features(model_name) if model_name in TORCHVISION_MODELS: - return torchvision_backbone_and_num_features(model_name, *args, **kwargs) + return torchvision_backbone_and_num_features(model_name, pretrained) raise ValueError(f"{model_name} is not supported yet.") @@ -110,23 +126,3 @@ def torchvision_backbone_and_num_features(model_name: str, pretrained: bool = Tr return backbone, num_features raise ValueError(f"{model_name} is not supported yet.") - - -def fetch_fasterrcnn_backbone_and_num_features( - backbone: str, - fpn: bool = True, - pretrained: bool = True, - trainable_backbone_layers: int = 3, - **kwargs: Any -) -> nn.Module: - if fpn: - if backbone in RESNET_MODELS: - backbone = resnet_fpn_backbone( - backbone, pretrained=pretrained, trainable_layers=trainable_backbone_layers, **kwargs - ) - fpn_out_channels = 256 - return backbone, fpn_out_channels - else: - 
rank_zero_warn(f"{backbone} backbone is not supported with `fpn=True`, `fpn` won't be added.") - backbone, num_features = backbone_and_num_features(backbone, pretrained) - return backbone, num_features diff --git a/flash/vision/classification/model.py b/flash/vision/classification/model.py index 4c173d93b69..114175b90bb 100644 --- a/flash/vision/classification/model.py +++ b/flash/vision/classification/model.py @@ -57,7 +57,7 @@ def __init__( self.save_hyperparameters() - self.backbone, num_features = backbone_and_num_features(backbone, pretrained) + self.backbone, num_features = backbone_and_num_features(backbone, pretrained=pretrained) self.head = nn.Sequential( nn.AdaptiveAvgPool2d((1, 1)), diff --git a/flash/vision/detection/model.py b/flash/vision/detection/model.py index 5a1ea49e3cd..3b3503f0751 100644 --- a/flash/vision/detection/model.py +++ b/flash/vision/detection/model.py @@ -17,16 +17,21 @@ import torchvision from torch import nn from torch.optim import Optimizer -from torchvision.models.detection.faster_rcnn import FasterRCNN as torchvision_FasterRCNN -from torchvision.models.detection.faster_rcnn import fasterrcnn_resnet50_fpn, FastRCNNPredictor +from torchvision.models.detection.faster_rcnn import FasterRCNN, FastRCNNPredictor +from torchvision.models.detection.retinanet import RetinaNet, RetinaNetHead from torchvision.models.detection.rpn import AnchorGenerator from torchvision.ops import box_iou from flash.core import Task -from flash.vision.backbones import fetch_fasterrcnn_backbone_and_num_features +from flash.vision.backbones import backbone_and_num_features from flash.vision.detection.data import ObjectDetectionDataPipeline from flash.vision.detection.finetuning import ObjectDetectionFineTuning +_models = { + "fasterrcnn": torchvision.models.detection.fasterrcnn_resnet50_fpn, + "retinanet": torchvision.models.detection.retinanet_resnet50_fpn, +} + def _evaluate_iou(target, pred): """ @@ -45,11 +50,14 @@ class ObjectDetector(Task): Args: 
num_classes: the number of classes for detection, including background - backbone: Pretained backbone CNN architecture. + model: a string of :attr`_models`. Defaults to 'fasterrcnn'. + backbone: Pretained backbone CNN architecture. Constructs a model with a + ResNet-50-FPN backbone when no backbone is specified. fpn: If True, creates a Feature Pyramind Network on top of Resnet based CNNs. pretrained: if true, returns a model pre-trained on COCO train2017 pretrained_backbone: if true, returns a model with backbone pre-trained on Imagenet - trainable_backbone_layers: number of trainable resnet layers starting from final block + trainable_backbone_layers: number of trainable resnet layers starting from final block. + Only applicable for `fasterrcnn`. loss: the function(s) to update the model with. Has no effect for torchvision detection models. metrics: The provided metrics. All metrics here will be logged to progress bar and the respective logger. optimizer: The optimizer to use for training. Can either be the actual class or the class name. 
@@ -62,6 +70,7 @@ class ObjectDetector(Task): def __init__( self, num_classes: int, + model: str = "fasterrcnn", backbone: Optional[str] = None, fpn: bool = True, pretrained: bool = True, @@ -76,17 +85,46 @@ def __init__( self.save_hyperparameters() - if backbone is None: - model = fasterrcnn_resnet50_fpn( - pretrained=pretrained, - pretrained_backbone=pretrained_backbone, - trainable_backbone_layers=trainable_backbone_layers, + if model in _models: + model = ObjectDetector.get_model( + model, num_classes, backbone, fpn, pretrained, pretrained_backbone, trainable_backbone_layers, **kwargs ) - in_features = model.roi_heads.box_predictor.cls_score.in_features - head = FastRCNNPredictor(in_features, num_classes) - model.roi_heads.box_predictor = head else: - backbone_model, num_features = fetch_fasterrcnn_backbone_and_num_features( + ValueError(f"{model} is not supported yet.") + + super().__init__( + model=model, + loss_fn=loss, + metrics=metrics, + learning_rate=learning_rate, + optimizer=optimizer, + ) + + @staticmethod + def get_model( + model_name, num_classes, backbone, fpn, pretrained, pretrained_backbone, trainable_backbone_layers, **kwargs + ): + if backbone is None: + # Constructs a model with a ResNet-50-FPN backbone when no backbone is specified. 
+ if model_name == "fasterrcnn": + model = _models[model_name]( + pretrained=pretrained, + pretrained_backbone=pretrained_backbone, + trainable_backbone_layers=trainable_backbone_layers, + ) + in_features = model.roi_heads.box_predictor.cls_score.in_features + head = FastRCNNPredictor(in_features, num_classes) + model.roi_heads.box_predictor = head + else: + model = _models[model_name](pretrained=pretrained, pretrained_backbone=pretrained_backbone) + model.head = RetinaNetHead( + in_channels=model.backbone.out_channels, + num_anchors=model.head.classification_head.num_anchors, + num_classes=num_classes, + **kwargs + ) + else: + backbone_model, num_features = backbone_and_num_features( backbone, fpn, pretrained_backbone, @@ -97,17 +135,12 @@ def __init__( anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512), ), aspect_ratios=((0.5, 1.0, 2.0), )) if not hasattr(backbone_model, "fpn") else None - model = torchvision_FasterRCNN( - backbone_model, num_classes=num_classes, rpn_anchor_generator=anchor_generator, **kwargs - ) - super().__init__( - model=model, - loss_fn=loss, - metrics=metrics, - learning_rate=learning_rate, - optimizer=optimizer, - ) + if model_name == "fasterrcnn": + model = FasterRCNN(backbone_model, num_classes=num_classes, rpn_anchor_generator=anchor_generator) + else: + model = RetinaNet(backbone_model, num_classes=num_classes, anchor_generator=anchor_generator) + return model def training_step(self, batch, batch_idx) -> Any: """The training step. 
Overrides ``Task.training_step`` diff --git a/flash/vision/embedding/image_embedder_model.py b/flash/vision/embedding/image_embedder_model.py index e388cffd964..0e0884d5c80 100644 --- a/flash/vision/embedding/image_embedder_model.py +++ b/flash/vision/embedding/image_embedder_model.py @@ -112,7 +112,7 @@ def __init__( assert pooling_fn in [torch.mean, torch.max] self.pooling_fn = pooling_fn - self.backbone, num_features = backbone_and_num_features(backbone, pretrained) + self.backbone, num_features = backbone_and_num_features(backbone, pretrained=pretrained) if embedding_dim is None: self.head = nn.Identity() diff --git a/tests/vision/detection/test_data_model_integration.py b/tests/vision/detection/test_data_model_integration.py index ca86a608d76..e014086c940 100644 --- a/tests/vision/detection/test_data_model_integration.py +++ b/tests/vision/detection/test_data_model_integration.py @@ -26,13 +26,14 @@ @pytest.mark.skipif(not _COCO_AVAILABLE, reason="pycocotools is not installed for testing") -@pytest.mark.parametrize("backbone", [None, "resnet34", "mobilenet_v2", "simclr-imagenet"]) -def test_detection(tmpdir, backbone): +@pytest.mark.parametrize(["model", "backbone"], [("fasterrcnn", None), ("retinanet", "resnet34"), + ("fasterrcnn", "mobilenet_v2"), ("retinanet", "simclr-imagenet")]) +def test_detection(tmpdir, model, backbone): train_folder, coco_ann_path = _create_synth_coco_dataset(tmpdir) data = ObjectDetectionData.from_coco(train_folder=train_folder, train_ann_file=coco_ann_path, batch_size=1) - model = ObjectDetector(backbone=backbone, num_classes=data.num_classes) + model = ObjectDetector(model=model, backbone=backbone, num_classes=data.num_classes) trainer = flash.Trainer(fast_dev_run=True) diff --git a/tests/vision/detection/test_model.py b/tests/vision/detection/test_model.py index 3efb557b90f..70453e6e73f 100644 --- a/tests/vision/detection/test_model.py +++ b/tests/vision/detection/test_model.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS 
OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import pytest import torch from pytorch_lightning import Trainer from torch.utils.data import DataLoader, Dataset @@ -62,8 +63,9 @@ def test_init(): assert {"boxes", "labels", "scores"} <= out[0].keys() -def test_training(tmpdir): - model = ObjectDetector(num_classes=2) +@pytest.mark.parametrize("model", ["fasterrcnn", "retinanet"]) +def test_training(tmpdir, model): + model = ObjectDetector(num_classes=2, model=model, pretrained=False, pretrained_backbone=False) ds = DummyDetectionDataset((3, 224, 224), 1, 2, 10) dl = DataLoader(ds, collate_fn=collate_fn) trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True) diff --git a/tests/vision/test_backbones.py b/tests/vision/test_backbones.py index 428e6d38a57..72e5896fe1e 100644 --- a/tests/vision/test_backbones.py +++ b/tests/vision/test_backbones.py @@ -1,15 +1,13 @@ import pytest -from flash.vision.backbones import fetch_fasterrcnn_backbone_and_num_features +from flash.vision.backbones import backbone_and_num_features @pytest.mark.parametrize(["backbone", "expected_num_features"], [("resnet34", 512), ("mobilenet_v2", 1280), ("simclr-imagenet", 2048)]) def test_fetch_fasterrcnn_backbone_and_num_features(backbone, expected_num_features): - backbone_model, num_features = fetch_fasterrcnn_backbone_and_num_features( - backbone=backbone, pretrained=False, fpn=False - ) + backbone_model, num_features = backbone_and_num_features(model_name=backbone, pretrained=False, fpn=False) assert backbone_model assert num_features == expected_num_features From 0b1cc92b49c39f5fbf8466ce8dd1cd03bb6e51d1 Mon Sep 17 00:00:00 2001 From: Kaushik Bokka Date: Mon, 15 Feb 2021 03:55:58 +0530 Subject: [PATCH 07/17] add docstring for backbones --- flash/vision/backbones.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/flash/vision/backbones.py b/flash/vision/backbones.py index 
0d18541a55f..a192682467c 100644 --- a/flash/vision/backbones.py +++ b/flash/vision/backbones.py @@ -40,6 +40,14 @@ def backbone_and_num_features( trainable_backbone_layers: int = 3, **kwargs ) -> Tuple[nn.Module, int]: + """ + >>> backbone_and_num_features('mobilenet_v2') # doctest: +ELLIPSIS + (Sequential(...), 1280) + >>> backbone_and_num_features('resnet50', fpn=True) # doctest: +ELLIPSIS + (Sequential(...), 256) + >>> backbone_and_num_features('swav-imagenet') # doctest: +ELLIPSIS + (Sequential(...), 2048) + """ if fpn: if model_name in RESNET_MODELS: backbone = resnet_fpn_backbone( From e51cc6c7b1bfdfa36030f6b3faf26319a0890177 Mon Sep 17 00:00:00 2001 From: Kaushik Bokka Date: Mon, 15 Feb 2021 12:35:08 +0530 Subject: [PATCH 08/17] fix doctests --- flash/vision/backbones.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flash/vision/backbones.py b/flash/vision/backbones.py index a192682467c..814991f26e4 100644 --- a/flash/vision/backbones.py +++ b/flash/vision/backbones.py @@ -44,7 +44,7 @@ def backbone_and_num_features( >>> backbone_and_num_features('mobilenet_v2') # doctest: +ELLIPSIS (Sequential(...), 1280) >>> backbone_and_num_features('resnet50', fpn=True) # doctest: +ELLIPSIS - (Sequential(...), 256) + (BackboneWithFPN((...), 256) >>> backbone_and_num_features('swav-imagenet') # doctest: +ELLIPSIS (Sequential(...), 2048) """ From d85f6874edb601fa3ed9b95bc9e42af7c61910b8 Mon Sep 17 00:00:00 2001 From: Kaushik Bokka Date: Mon, 15 Feb 2021 12:45:49 +0530 Subject: [PATCH 09/17] fix doctests --- flash/vision/backbones.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flash/vision/backbones.py b/flash/vision/backbones.py index 814991f26e4..ba68eb8eaa3 100644 --- a/flash/vision/backbones.py +++ b/flash/vision/backbones.py @@ -44,7 +44,7 @@ def backbone_and_num_features( >>> backbone_and_num_features('mobilenet_v2') # doctest: +ELLIPSIS (Sequential(...), 1280) >>> backbone_and_num_features('resnet50', fpn=True) # doctest: 
+ELLIPSIS - (BackboneWithFPN((...), 256) + (BackboneWithFPN(...), 256) >>> backbone_and_num_features('swav-imagenet') # doctest: +ELLIPSIS (Sequential(...), 2048) """ From ec996d61422f6900ca8b4269d222e03688306cfe Mon Sep 17 00:00:00 2001 From: Kaushik Bokka Date: Mon, 15 Feb 2021 20:16:38 +0530 Subject: [PATCH 10/17] add docstring for backbones --- flash/vision/backbones.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/flash/vision/backbones.py b/flash/vision/backbones.py index ba68eb8eaa3..9269ad21037 100644 --- a/flash/vision/backbones.py +++ b/flash/vision/backbones.py @@ -41,6 +41,12 @@ def backbone_and_num_features( **kwargs ) -> Tuple[nn.Module, int]: """ + Args: + model_name: backbone supported by `torchvision` and `bolts` + fpn: If True, creates a Feature Pyramid Network on top of Resnet based CNNs. + pretrained: if true, returns a model with backbone pre-trained on Imagenet + trainable_backbone_layers: number of trainable resnet layers starting from final block. + >>> backbone_and_num_features('mobilenet_v2') # doctest: +ELLIPSIS (Sequential(...), 1280) >>> backbone_and_num_features('resnet50', fpn=True) # doctest: +ELLIPSIS (BackboneWithFPN(...), 256) >>> backbone_and_num_features('swav-imagenet') # doctest: +ELLIPSIS (Sequential(...), 2048) """ From c5719235b3a598f4a658cb6abe741e8ebb4aa2ad Mon Sep 17 00:00:00 2001 From: Kaushik Bokka Date: Mon, 15 Feb 2021 20:56:31 +0530 Subject: [PATCH 11/17] add docs for od --- docs/source/reference/object_detection.rst | 60 ++++++++++++++++++---- 1 file changed, 50 insertions(+), 10 deletions(-) diff --git a/docs/source/reference/object_detection.rst b/docs/source/reference/object_detection.rst index 6b9ae98d06f..3be16b3536b 100644 --- a/docs/source/reference/object_detection.rst +++ b/docs/source/reference/object_detection.rst @@ -22,18 +22,18 @@ The :class:`~flash.vision.ObjectDetector` is already pre-trained on `COCO train2 ..
code-block:: annotation{ - "id": int, - "image_id": int, - "category_id": int, - "segmentation": RLE or [polygon], - "area": float, - "bbox": [x,y,width,height], + "id": int, + "image_id": int, + "category_id": int, + "segmentation": RLE or [polygon], + "area": float, + "bbox": [x,y,width,height], "iscrowd": 0 or 1, } categories[{ - "id": int, - "name": str, + "id": int, + "name": str, "supercategory": str, }] @@ -88,7 +88,7 @@ To tailor the object detector to your dataset, you would need to have it in `COC ) # 3. Build the model - model = ObjectDetector(num_classes=datamodule.num_classes) + model = ObjectDetector(model="fasterrcnn", backbone="simclr-imagenet", num_classes=datamodule.num_classes) # 4. Create the trainer. Run thrice on data trainer = flash.Trainer(max_epochs=3) @@ -105,7 +105,47 @@ To tailor the object detector to your dataset, you would need to have it in `COC Model ***** -By default, we use the `Faster R-CNN `_ model with a ResNet-50 FPN backbone. The inputs could be images of different sizes. The model behaves differently for training and evaluation. For training, it expects both the input tensors as well as the targets. And during evaluation, it expects only the input tensors and returns predictions for each image. The predictions are a list of boxes, labels and scores. +By default, we use the `Faster R-CNN `_ model with a ResNet-50 FPN backbone. We have support for `RetinaNet `_ as well. The inputs could be images of different sizes. The model behaves differently for training and evaluation. For training, it expects both the input tensors as well as the targets. And during evaluation, it expects only the input tensors and returns predictions for each image. The predictions are a list of boxes, labels and scores. + +------ + +********************* +Changing the backbone +********************* +By default, we use a ResNet-50 FPN backbone. You can change the backbone for the model by passing in a different backbone. + + +.. 
code-block:: python + + # 1. Organize the data + datamodule = ObjectDetectionData.from_coco( + train_folder="data/coco128/images/train2017/", + train_ann_file="data/coco128/annotations/instances_train2017.json", + batch_size=2 + ) + + # 2. Build the Task + model = ObjectDetector(model="retinanet", backbone="resnet101", num_classes=datamodule.num_classes) + +Available backbones: + +* resnet18 +* resnet34 +* resnet50 +* resnet101 +* resnet152 +* resnext50_32x4d +* resnext101_32x8d +* mobilenet_v2 +* vgg11 +* vgg13 +* vgg16 +* vgg19 +* densenet121 +* densenet169 +* densenet161 +* swav-imagenet +* simclr-imagenet ------ From 3e012949ef800159f51f8b78ea5e11301e6aa9bc Mon Sep 17 00:00:00 2001 From: Kaushik Bokka Date: Mon, 15 Feb 2021 20:59:29 +0530 Subject: [PATCH 12/17] update docs for detection --- docs/source/reference/object_detection.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/reference/object_detection.rst b/docs/source/reference/object_detection.rst index 3be16b3536b..f2fab9aa4dd 100644 --- a/docs/source/reference/object_detection.rst +++ b/docs/source/reference/object_detection.rst @@ -105,7 +105,7 @@ To tailor the object detector to your dataset, you would need to have it in `COC Model ***** -By default, we use the `Faster R-CNN `_ model with a ResNet-50 FPN backbone. We have support for `RetinaNet `_ as well. The inputs could be images of different sizes. The model behaves differently for training and evaluation. For training, it expects both the input tensors as well as the targets. And during evaluation, it expects only the input tensors and returns predictions for each image. The predictions are a list of boxes, labels and scores. +By default, we use the `Faster R-CNN `_ model with a ResNet-50 FPN backbone. We also support `RetinaNet `_. The inputs could be images of different sizes. The model behaves differently for training and evaluation. For training, it expects both the input tensors as well as the targets. 
And during evaluation, it expects only the input tensors and returns predictions for each image. The predictions are a list of boxes, labels and scores. ------ From e01b45076fd79ecde1fbb6ae60ae792e1df4fe91 Mon Sep 17 00:00:00 2001 From: Kaushik Bokka Date: Mon, 15 Feb 2021 23:03:03 +0530 Subject: [PATCH 13/17] update changelog --- CHANGELOG.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 92ca4b67363..2552dd5b240 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,22 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Removed +## [Unreleased] - 2021-02-15 + +### Added + +- Added `RetinaNet` & `backbones` to `ObjectDetector` Task ([#121](https://github.com/PyTorchLightning/lightning-flash/pull/121)) + +### Changed + + + +### Fixed + + + +### Removed + ## [0.2.0] - 2021-02-12 From a869a28c283d2bf6de1bc0ce981083d54ca4b231 Mon Sep 17 00:00:00 2001 From: Kaushik Bokka Date: Tue, 16 Feb 2021 10:41:26 +0530 Subject: [PATCH 14/17] add anchor generator param --- docs/source/reference/object_detection.rst | 2 +- flash/vision/detection/model.py | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/docs/source/reference/object_detection.rst b/docs/source/reference/object_detection.rst index f2fab9aa4dd..0245b3b2be6 100644 --- a/docs/source/reference/object_detection.rst +++ b/docs/source/reference/object_detection.rst @@ -68,7 +68,7 @@ For more advanced inference options, see :ref:`predictions`. Finetuning ********** -To tailor the object detector to your dataset, you would need to have it in `COCO Format `_, and then finetune the model. +To tailor the object detector to your dataset, you would need to have it in `COCO Format `_, and then finetune the model. You could also pass `trainable_backbone_layers` to :class:`~flash.vision.ObjectDetector` and train the model. .. 
code-block:: python diff --git a/flash/vision/detection/model.py b/flash/vision/detection/model.py index 3b3503f0751..deef62a8f1a 100644 --- a/flash/vision/detection/model.py +++ b/flash/vision/detection/model.py @@ -76,6 +76,7 @@ def __init__( pretrained: bool = True, pretrained_backbone: bool = True, trainable_backbone_layers: int = 3, + achnor_generator: Optional[Type[AnchorGenerator]] = None, loss=None, metrics: Union[Callable, nn.Module, Mapping, Sequence, None] = None, optimizer: Type[Optimizer] = torch.optim.Adam, @@ -87,7 +88,8 @@ def __init__( if model in _models: model = ObjectDetector.get_model( - model, num_classes, backbone, fpn, pretrained, pretrained_backbone, trainable_backbone_layers, **kwargs + model, num_classes, backbone, fpn, pretrained, pretrained_backbone, trainable_backbone_layers, + anchor_generator, **kwargs ) else: ValueError(f"{model} is not supported yet.") @@ -102,7 +104,8 @@ def __init__( @staticmethod def get_model( - model_name, num_classes, backbone, fpn, pretrained, pretrained_backbone, trainable_backbone_layers, **kwargs + model_name, num_classes, backbone, fpn, pretrained, pretrained_backbone, trainable_backbone_layers, + anchor_generator, **kwargs ): if backbone is None: # Constructs a model with a ResNet-50-FPN backbone when no backbone is specified. 
@@ -132,9 +135,10 @@ def get_model( **kwargs, ) backbone_model.out_channels = num_features - anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512), ), - aspect_ratios=((0.5, 1.0, - 2.0), )) if not hasattr(backbone_model, "fpn") else None + if anchor_generator is None: + anchor_generator = AnchorGenerator( + sizes=((32, 64, 128, 256, 512), ), aspect_ratios=((0.5, 1.0, 2.0), ) + ) if not hasattr(backbone_model, "fpn") else None if model_name == "fasterrcnn": model = FasterRCNN(backbone_model, num_classes=num_classes, rpn_anchor_generator=anchor_generator) From bd8f2eb521a99cddbeed45c9935fefd80b5f7ded Mon Sep 17 00:00:00 2001 From: Kaushik Bokka Date: Tue, 16 Feb 2021 13:45:48 +0530 Subject: [PATCH 15/17] fix anchor typ --- flash/vision/detection/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flash/vision/detection/model.py b/flash/vision/detection/model.py index deef62a8f1a..e8759751dc1 100644 --- a/flash/vision/detection/model.py +++ b/flash/vision/detection/model.py @@ -76,7 +76,7 @@ def __init__( pretrained: bool = True, pretrained_backbone: bool = True, trainable_backbone_layers: int = 3, - achnor_generator: Optional[Type[AnchorGenerator]] = None, + anchor_generator: Optional[Type[AnchorGenerator]] = None, loss=None, metrics: Union[Callable, nn.Module, Mapping, Sequence, None] = None, optimizer: Type[Optimizer] = torch.optim.Adam, From fd8cc9bea07ca4242dadc2aebf2a177685145824 Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Tue, 16 Feb 2021 13:56:28 +0530 Subject: [PATCH 16/17] Update docs/source/reference/object_detection.rst Co-authored-by: Jirka Borovec --- docs/source/reference/object_detection.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/source/reference/object_detection.rst b/docs/source/reference/object_detection.rst index 0245b3b2be6..99a506a1670 100644 --- a/docs/source/reference/object_detection.rst +++ 
b/docs/source/reference/object_detection.rst @@ -68,7 +68,9 @@ For more advanced inference options, see :ref:`predictions`. Finetuning ********** -To tailor the object detector to your dataset, you would need to have it in `COCO Format `_, and then finetune the model. You could also pass `trainable_backbone_layers` to :class:`~flash.vision.ObjectDetector` and train the model. +To tailor the object detector to your dataset, you would need to have it in `COCO Format `_, and then finetune the model. + +.. tip:: You could also pass `trainable_backbone_layers` to :class:`~flash.vision.ObjectDetector` and train the model. .. code-block:: python From 76fa0b09e0d1d638336bafd20750118db66ea9c8 Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Tue, 16 Feb 2021 13:56:36 +0530 Subject: [PATCH 17/17] Update docs/source/reference/object_detection.rst Co-authored-by: Jirka Borovec --- docs/source/reference/object_detection.rst | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/source/reference/object_detection.rst b/docs/source/reference/object_detection.rst index 99a506a1670..2840923ca07 100644 --- a/docs/source/reference/object_detection.rst +++ b/docs/source/reference/object_detection.rst @@ -107,7 +107,12 @@ To tailor the object detector to your dataset, you would need to have it in `COC Model ***** -By default, we use the `Faster R-CNN `_ model with a ResNet-50 FPN backbone. We also support `RetinaNet `_. The inputs could be images of different sizes. The model behaves differently for training and evaluation. For training, it expects both the input tensors as well as the targets. And during evaluation, it expects only the input tensors and returns predictions for each image. The predictions are a list of boxes, labels and scores. +By default, we use the `Faster R-CNN `_ model with a ResNet-50 FPN backbone. +We also support `RetinaNet `_. +The inputs could be images of different sizes. 
+The model behaves differently for training and evaluation. +For training, it expects both the input tensors as well as the targets. And during evaluation, it expects only the input tensors and returns predictions for each image. +The predictions are a list of boxes, labels, and scores. ------