Cherry picking OpenCLIP for 0.23.2 release #3948

Merged: 8 commits, Dec 20, 2023
1 change: 1 addition & 0 deletions docs/scripts/make_model_zoo_docs.py
@@ -143,6 +143,7 @@
"{{ name }}",
text_prompt="A photo of a",
classes=["person", "dog", "cat", "bird", "car", "tree", "chair"],
cache=False,
)

dataset.apply_model(model, label_field="predictions")
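For reference, once the template above is rendered for the new OpenCLIP entry, the generated snippet calls the zoo loader roughly as follows (a sketch; the model name comes from the manifest entry added below, and cache is assumed to be the standard load_zoo_model caching flag):

import fiftyone.zoo as foz

# cache=False is the flag added in this diff; it is assumed to keep the docs
# build from retaining the model in the zoo's model cache between snippets
model = foz.load_zoo_model(
    "open-clip-torch",
    text_prompt="A photo of a",
    classes=["person", "dog", "cat", "bird", "car", "tree", "chair"],
    cache=False,
)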
118 changes: 118 additions & 0 deletions fiftyone/utils/open_clip.py
@@ -0,0 +1,118 @@
"""
OpenCLIP model wrapper for the FiftyOne Model Zoo.

| Copyright 2017-2023, Voxel51, Inc.
| `voxel51.com <https://voxel51.com/>`_
|
"""
import logging

import fiftyone.core.models as fom
import fiftyone.core.utils as fou
import fiftyone.utils.torch as fout
import fiftyone.zoo.models as fozm

fou.ensure_torch()
import torch

open_clip = fou.lazy_import(
"open_clip", callback=lambda: fou.ensure_package("open_clip_torch")
)

logger = logging.getLogger(__name__)


class TorchOpenClipModelConfig(fout.TorchImageModelConfig, fozm.HasZooModel):
"""Configuration for running a :class:`TorchOpenClipModel`.

See :class:`fiftyone.utils.torch.TorchImageModelConfig` for additional
arguments.

Args:
text_prompt: the text prompt to use, e.g., ``"A photo of"``
clip_model ("ViT-B-32"): the Open CLIP model to use
pretrained ("openai"): the pretrained version to use
classes (None): a list of custom classes for zero-shot prediction
"""

def __init__(self, d):
d = self.init(d)
super().__init__(d)

self.text_prompt = self.parse_string(d, "text_prompt")
self.clip_model = self.parse_string(
d, "clip_model", default="ViT-B-32"
)
self.pretrained = self.parse_string(d, "pretrained", default="openai")


class TorchOpenClipModel(fout.TorchImageModel, fom.PromptMixin):
"""Torch implementation of CLIP from
https://github.com/mlfoundations/open_clip.

Args:
config: a :class:`TorchOpenClipModelConfig`
"""

def __init__(self, config):
super().__init__(config)
self._text_features = None

def _load_model(self, config):
(
self._model,
_,
self.preprocess,
) = open_clip.create_model_and_transforms(
config.clip_model, pretrained=config.pretrained
)
self._tokenizer = open_clip.get_tokenizer(config.clip_model)
return self._model

def _get_text_features(self):
if self._text_features is None:
prompts = [
"%s %s" % (self.config.text_prompt, c) for c in self.classes
]
# Tokenize text
text = self._tokenizer(prompts)
self._text_features = self._model.encode_text(text)

return self._text_features

def _get_class_logits(self, text_features, image_features):
# source: https://github.com/openai/CLIP/blob/main/README.md
image_features = image_features / image_features.norm(
dim=1, keepdim=True
)
text_features = text_features / text_features.norm(dim=1, keepdim=True)
logit_scale = self._model.logit_scale.exp()
logits_per_image = logit_scale * image_features @ text_features.t()
logits_per_text = logits_per_image.t()
return logits_per_image, logits_per_text

def _predict_all(self, imgs):
if self._preprocess:
imgs = [self._preprocess(img).unsqueeze(0) for img in imgs]

if isinstance(imgs, (list, tuple)):
imgs = torch.stack(imgs)

height, width = imgs.size()[-2:]
frame_size = (width, height)

if self._using_gpu:
imgs = imgs.cuda()

with torch.no_grad(), torch.cuda.amp.autocast():
image_features = self._model.encode_image(imgs)
text_features = self._get_text_features()

output, _ = self._get_class_logits(text_features, image_features)

if self.has_logits:
self._output_processor.store_logits = self.store_logits

return self._output_processor(
output, frame_size, confidence_thresh=self.config.confidence_thresh
)
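For context, the logit computation in _get_class_logits above follows the standard zero-shot recipe from the OpenCLIP README; a minimal standalone sketch (independent of FiftyOne; the image path is a placeholder) is:

import torch
import open_clip
from PIL import Image

# Build the same model/transforms/tokenizer that the wrapper creates in _load_model
model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-B-32", pretrained="openai"
)
tokenizer = open_clip.get_tokenizer("ViT-B-32")

image = preprocess(Image.open("example.jpg")).unsqueeze(0)  # placeholder image path
text = tokenizer(["A photo of a dog", "A photo of a cat"])

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

    # Normalize, then scale the cosine similarities by the learned temperature,
    # exactly as _get_class_logits does
    image_features = image_features / image_features.norm(dim=1, keepdim=True)
    text_features = text_features / text_features.norm(dim=1, keepdim=True)
    logits_per_image = model.logit_scale.exp() * image_features @ text_features.t()

print(logits_per_image.softmax(dim=-1))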
45 changes: 45 additions & 0 deletions fiftyone/zoo/models/manifest-torch.json
@@ -1862,6 +1862,51 @@
],
"date_added": "2020-12-11 13:45:51"
},
{
"base_name": "open-clip-torch",
"base_filename": "",
"version": null,
"description": "OPEN CLIP text/image encoder from `Learning Transferable Visual Models From Natural Language Supervision <https://arxiv.org/abs/2103.00020>`_ trained on 400M text-image pairs",
"source": "https://github.com/mlfoundations/open_clip",
"size_bytes": 353976522,
"manager": {
"type": "fiftyone.core.models.ModelManager",
"config": {}
},
"default_deployment_config_dict": {
"type": "fiftyone.utils.open_clip.TorchOpenClipModel",
"config": {
"entrypoint_fcn": "",
"labels_path": "{{eta-resources}}/voc-labels.txt",
"output_processor_cls": "fiftyone.utils.torch.ClassifierOutputProcessor",
"image_size": [224, 224],
"image_mean": [0.48145466, 0.4578275, 0.40821073],
"image_std": [0.26862954, 0.26130258, 0.27577711],
"embeddings_layer": "visual",
"tokenizer_base_filename": "clip_bpe_simple_vocab_16e6.txt.gz",
"tokenizer_base_url": "https://github.com/openai/CLIP/raw/main/clip/bpe_simple_vocab_16e6.txt.gz",
"text_prompt": "A photo of"
}
},
"requirements": {
"packages": ["torch", "torchvision", "open_clip_torch"],
"cpu": {
"support": true
},
"gpu": {
"support": true
}
},
"tags": [
"classification",
"logits",
"embeddings",
"torch",
"clip",
"zero-shot"
],
"date_added": "2023-12-13 14:25:51"
},
{
"base_name": "clip-vit-base32-torch",
"base_filename": "CLIP-ViT-B-32.pt",
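With the manifest entry above in place, the model is loadable from the zoo by its base_name. An end-to-end check of the cherry-picked feature might look like the following sketch (it assumes the clip_model/pretrained keyword overrides are forwarded to TorchOpenClipModelConfig by the zoo loader, and uses the quickstart dataset purely for illustration):

import fiftyone.zoo as foz

dataset = foz.load_zoo_dataset("quickstart", max_samples=10)

# Override the defaults declared in TorchOpenClipModelConfig
model = foz.load_zoo_model(
    "open-clip-torch",
    clip_model="ViT-B-32",
    pretrained="laion2b_s34b_b79k",
    text_prompt="A photo of a",
    classes=["person", "dog", "cat"],
)

# Zero-shot classification, as exercised by the docs snippet above
dataset.apply_model(model, label_field="open_clip_predictions")

# The manifest tags the entry with "embeddings", so embedding workflows also apply
embeddings = dataset.compute_embeddings(model)
print(embeddings.shape)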