add clip-convnext backbone (#21)

vita-epfl · Mar 7, 2024 · dd32c38 · dd32c38
1 parent ec21ca1
commit dd32c38
Show file tree

Hide file tree

Showing 5 changed files with 84 additions and 1 deletion.
diff --git a/docs/LICENSE.CLIPCONVNEXT b/docs/LICENSE.CLIPCONVNEXT
@@ -0,0 +1,23 @@
+Copyright (c) 2012-2021 Gabriel Ilharco, Mitchell Wortsman, 
+Nicholas Carlini, Rohan Taori, Achal Dave, Vaishaal Shankar, 
+John Miller, Hongseok Namkoong, Hannaneh Hajishirzi, Ali Farhadi, 
+Ludwig Schmidt
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/setup.py b/setup.py
@@ -96,6 +96,7 @@ def add_cpp_extension():
             'mmcv>=2.0',
             'mmpose>=1.0',  # for HRFormer
             'mmpretrain',  # for ConvNeXt V2
+            'open_clip_torch',  # for CLIP-ConvNeXt
         ],
         'dev': [
             'flameprof',

diff --git a/src/openpifpaf/network/basenetworks.py b/src/openpifpaf/network/basenetworks.py
@@ -820,3 +820,28 @@ def cli(cls, parser: argparse.ArgumentParser):
     @classmethod
     def configure(cls, args: argparse.Namespace):
         cls.pretrained = args.convnextv2_pretrained
+
+
+class CLIPConvNeXt(BaseNetwork):
+    """Base network that extracts features with a CLIP ConvNeXt model.
+
+    The model is produced by the ``clipconvnext_net`` factory handed to the
+    constructor. Only ``backbone.visual.trunk.forward_features`` is used in
+    the forward pass.
+    """
+
+    # Whether the factory is asked for pretrained weights; overwritten by
+    # ``configure`` from the command line.
+    pretrained = True
+    # For DDP initialization. NOTE(review): presumably needed because the
+    # whole model is registered as ``backbone`` while ``forward`` only runs
+    # ``visual.trunk`` -- confirm.
+    unused_parameters = True
+
+    def __init__(self, name, clipconvnext_net):
+        """Build the backbone and register it on this module.
+
+        Args:
+            name: network name forwarded to the base class constructor.
+            clipconvnext_net: callable that takes the ``pretrained`` flag and
+                returns a ``(model, out_features)`` pair.
+        """
+        # The factory has to run before the base constructor because the
+        # base class needs the feature count it reports.
+        net, n_features = clipconvnext_net(self.pretrained)
+        super().__init__(name, stride=32, out_features=n_features)
+        self.backbone = net
+
+    def forward(self, x):
+        """Return the trunk's ``forward_features`` output for ``x``."""
+        trunk = self.backbone.visual.trunk
+        return trunk.forward_features(x)
+
+    @classmethod
+    def cli(cls, parser: argparse.ArgumentParser):
+        """Register the ``--clipconvnext-no-pretrain`` command line flag."""
+        group = parser.add_argument_group('CLIPConvNeXt')
+        # The flag below can only switch pretraining off, so the class
+        # default is expected to be on.
+        assert cls.pretrained
+        group.add_argument(
+            '--clipconvnext-no-pretrain',
+            action='store_false',
+            default=True,
+            dest='clipconvnext_pretrained',
+            help='use randomly initialized models',
+        )
+
+    @classmethod
+    def configure(cls, args: argparse.Namespace):
+        """Copy the parsed ``clipconvnext_pretrained`` value onto the class."""
+        cls.pretrained = args.clipconvnext_pretrained
diff --git a/src/openpifpaf/network/clipconvnext.py b/src/openpifpaf/network/clipconvnext.py
@@ -0,0 +1,29 @@
+try:
+    import open_clip
+except ImportError:
+    pass
+
+
+def adapt_clipconvnext(backbone):
+    """Adapt CLIPConvNeXt's downsampling to work with custom image size in OpenPifPaf.
+
+    Overwrites, in place, the ``padding`` attribute of the first stem layer
+    and of the second downsample layer in stages 1 to 3 of
+    ``backbone.visual.trunk``.
+
+    Args:
+        backbone: model object exposing ``visual.trunk.stem`` and
+            ``visual.trunk.stages``.
+
+    Returns:
+        The same ``backbone`` object, after modification.
+    """
+    trunk = backbone.visual.trunk
+    trunk.stem[0].padding = (2, 2)
+    # Stage 0 is left untouched; only the later stages get the new padding.
+    for stage_index in (1, 2, 3):
+        trunk.stages[stage_index].downsample[1].padding = (1, 1)
+    return backbone
+
+
+def clipconvnext(model_name=None, pretraining_dataset=None):
+    """Create an open_clip model and adapt its padding for OpenPifPaf.
+
+    Args:
+        model_name: architecture name, passed as the first argument to
+            ``open_clip.create_model_and_transforms``.
+        pretraining_dataset: passed on as the ``pretrained`` argument;
+            ``None`` requests no pretrained weights.
+
+    Returns:
+        The created model after ``adapt_clipconvnext`` has been applied. The
+        two transforms returned by open_clip are discarded.
+
+    Raises:
+        ImportError: if the optional ``open_clip`` package is not installed.
+    """
+    try:
+        create_model_and_transforms = open_clip.create_model_and_transforms
+    except NameError as exc:
+        # The import at the top of this module is allowed to fail silently,
+        # which leaves the name ``open_clip`` undefined. Turn the resulting
+        # NameError into an actionable message.
+        raise ImportError(
+            'open_clip is required for CLIP-ConvNeXt backbones; '
+            'install it with "pip install open_clip_torch"'
+        ) from exc
+
+    backbone, _, _ = create_model_and_transforms(model_name,
+                                                 pretrained=pretraining_dataset)
+    return adapt_clipconvnext(backbone)
+
+
+def clipconvnextbase(pretrained=True):
+    """Build the ``convnext_base_w_320`` CLIP backbone.
+
+    Args:
+        pretrained: when true, request the
+            ``laion_aesthetic_s13b_b82k_augreg`` weights; otherwise pass
+            ``None`` so that no pretrained weights are requested.
+
+    Returns:
+        A ``(backbone, out_features)`` tuple where ``out_features`` is 1024.
+    """
+    weights_tag = 'laion_aesthetic_s13b_b82k_augreg' if pretrained else None
+    net = clipconvnext(model_name='convnext_base_w_320',
+                       pretraining_dataset=weights_tag)
+    return net, 1024
diff --git a/src/openpifpaf/network/factory.py b/src/openpifpaf/network/factory.py
@@ -10,7 +10,7 @@
 from .. import headmeta
 from ..configurable import Configurable
 from . import basenetworks, heads, model_migration, nets, tracking_heads
-from . import convnextv2, hrformer, swin_transformer, xcit
+from . import clipconvnext, convnextv2, hrformer, swin_transformer, xcit
 from .tracking_base import TrackingBase
 
 
@@ -77,6 +77,7 @@
 CHECKPOINT_URLS = {}
 
 BASE_TYPES = set([
+    basenetworks.CLIPConvNeXt,
     basenetworks.ConvNeXtV2,
     basenetworks.HRFormer,
     basenetworks.MobileNetV2,
@@ -248,6 +249,10 @@
     # ConvNeXt V2 architecture
     'convnextv2base': lambda: basenetworks.ConvNeXtV2(
         'convnextv2base', convnextv2.convnextv2base),
+    # CLIPConvNeXt architecture
+    'clipconvnextbase': lambda: basenetworks.CLIPConvNeXt(
+        'clipconvnextbase', clipconvnext.clipconvnextbase),
+
 }
 # base factories that wrap other base factories:
 BASE_FACTORIES['tshufflenetv2k16'] = lambda: TrackingBase(BASE_FACTORIES['shufflenetv2k16']())