Cherry picking OpenCLIP for 0.23.2 release #3948

Merged: 8 commits, Dec 20, 2023
1 change: 1 addition & 0 deletions docs/scripts/make_model_zoo_docs.py
@@ -143,6 +143,7 @@
"{{ name }}",
text_prompt="A photo of a",
classes=["person", "dog", "cat", "bird", "car", "tree", "chair"],
cache=False,
)

dataset.apply_model(model, label_field="predictions")
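For reference, once the template above is rendered for the new OpenCLIP entry, the generated snippet calls the zoo loader roughly as follows (a sketch; the model name comes from the manifest entry added below, and cache is assumed to be the standard load_zoo_model caching flag):

import fiftyone.zoo as foz

# cache=False is the flag added in this diff; it is assumed to keep the docs
# build from retaining the model in the zoo's model cache between snippets
model = foz.load_zoo_model(
    "open-clip-torch",
    text_prompt="A photo of a",
    classes=["person", "dog", "cat", "bird", "car", "tree", "chair"],
    cache=False,
)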
118 changes: 118 additions & 0 deletions fiftyone/utils/open_clip.py
@@ -0,0 +1,118 @@
"""
OpenCLIP model wrapper for the FiftyOne Model Zoo.

| Copyright 2017-2023, Voxel51, Inc.
| `voxel51.com <https://voxel51.com/>`_
|
"""
import logging

import fiftyone.core.models as fom
import fiftyone.core.utils as fou
import fiftyone.utils.torch as fout
import fiftyone.zoo.models as fozm

fou.ensure_torch()
import torch

open_clip = fou.lazy_import(
"open_clip", callback=lambda: fou.ensure_package("open_clip_torch")
)

logger = logging.getLogger(__name__)


class TorchOpenClipModelConfig(fout.TorchImageModelConfig, fozm.HasZooModel):
"""Configuration for running a :class:`TorchOpenClipModel`.

See :class:`fiftyone.utils.torch.TorchImageModelConfig` for additional
arguments.

Args:
text_prompt: the text prompt to use, e.g., ``"A photo of"``
clip_model ("ViT-B-32"): the Open CLIP model to use
pretrained ("openai"): the pretrained version to use
classes (None): a list of custom classes for zero-shot prediction
"""

def __init__(self, d):
d = self.init(d)
super().__init__(d)

self.text_prompt = self.parse_string(d, "text_prompt")
self.clip_model = self.parse_string(
d, "clip_model", default="ViT-B-32"
)
self.pretrained = self.parse_string(d, "pretrained", default="openai")


class TorchOpenClipModel(fout.TorchImageModel, fom.PromptMixin):
"""Torch implementation of CLIP from
https://github.com/mlfoundations/open_clip.

Args:
config: a :class:`TorchOpenClipModelConfig`
"""

def __init__(self, config):
super().__init__(config)
self._text_features = None

def _load_model(self, config):
(
self._model,
_,
self.preprocess,
) = open_clip.create_model_and_transforms(
config.clip_model, pretrained=config.pretrained
)
self._tokenizer = open_clip.get_tokenizer(config.clip_model)
return self._model

def _get_text_features(self):
if self._text_features is None:
prompts = [
"%s %s" % (self.config.text_prompt, c) for c in self.classes
]
# Tokenize text
text = self._tokenizer(prompts)
self._text_features = self._model.encode_text(text)

return self._text_features

def _get_class_logits(self, text_features, image_features):
# source: https://github.com/openai/CLIP/blob/main/README.md
image_features = image_features / image_features.norm(
dim=1, keepdim=True
)
text_features = text_features / text_features.norm(dim=1, keepdim=True)
logit_scale = self._model.logit_scale.exp()
logits_per_image = logit_scale * image_features @ text_features.t()
logits_per_text = logits_per_image.t()
return logits_per_image, logits_per_text

def _predict_all(self, imgs):
if self._preprocess:
imgs = [self._preprocess(img).unsqueeze(0) for img in imgs]

if isinstance(imgs, (list, tuple)):
imgs = torch.stack(imgs)

height, width = imgs.size()[-2:]
frame_size = (width, height)

if self._using_gpu:
imgs = imgs.cuda()

with torch.no_grad(), torch.cuda.amp.autocast():
image_features = self._model.encode_image(imgs)
text_features = self._get_text_features()

output, _ = self._get_class_logits(text_features, image_features)

if self.has_logits:
self._output_processor.store_logits = self.store_logits

return self._output_processor(
output, frame_size, confidence_thresh=self.config.confidence_thresh
)
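For context, the logit computation in _get_class_logits above follows the standard zero-shot recipe from the OpenCLIP README; a minimal standalone sketch (independent of FiftyOne; the image path is a placeholder) is:

import torch
import open_clip
from PIL import Image

# Build the same model/transforms/tokenizer that the wrapper creates in _load_model
model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-B-32", pretrained="openai"
)
tokenizer = open_clip.get_tokenizer("ViT-B-32")

image = preprocess(Image.open("example.jpg")).unsqueeze(0)  # placeholder image path
text = tokenizer(["A photo of a dog", "A photo of a cat"])

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

    # Normalize, then scale the cosine similarities by the learned temperature,
    # exactly as _get_class_logits does
    image_features = image_features / image_features.norm(dim=1, keepdim=True)
    text_features = text_features / text_features.norm(dim=1, keepdim=True)
    logits_per_image = model.logit_scale.exp() * image_features @ text_features.t()

print(logits_per_image.softmax(dim=-1))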
45 changes: 45 additions & 0 deletions fiftyone/zoo/models/manifest-torch.json
@@ -1862,6 +1862,51 @@
],
"date_added": "2020-12-11 13:45:51"
},
{
"base_name": "open-clip-torch",
"base_filename": "",
"version": null,
"description": "OPEN CLIP text/image encoder from `Learning Transferable Visual Models From Natural Language Supervision <https://arxiv.org/abs/2103.00020>`_ trained on 400M text-image pairs",
"source": "https://github.com/mlfoundations/open_clip",
"size_bytes": 353976522,
"manager": {
"type": "fiftyone.core.models.ModelManager",
"config": {}
},
"default_deployment_config_dict": {
"type": "fiftyone.utils.open_clip.TorchOpenClipModel",
"config": {
"entrypoint_fcn": "",
"labels_path": "{{eta-resources}}/voc-labels.txt",
"output_processor_cls": "fiftyone.utils.torch.ClassifierOutputProcessor",
"image_size": [224, 224],
"image_mean": [0.48145466, 0.4578275, 0.40821073],
"image_std": [0.26862954, 0.26130258, 0.27577711],
"embeddings_layer": "visual",
"tokenizer_base_filename": "clip_bpe_simple_vocab_16e6.txt.gz",
"tokenizer_base_url": "https://github.com/openai/CLIP/raw/main/clip/bpe_simple_vocab_16e6.txt.gz",
"text_prompt": "A photo of"
}
},
"requirements": {
"packages": ["torch", "torchvision", "open_clip_torch"],
"cpu": {
"support": true
},
"gpu": {
"support": true
}
},
"tags": [
"classification",
"logits",
"embeddings",
"torch",
"clip",
"zero-shot"
],
"date_added": "2023-12-13 14:25:51"
},
{
"base_name": "clip-vit-base32-torch",
"base_filename": "CLIP-ViT-B-32.pt",
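With the manifest entry above in place, the model is loadable from the zoo by its base_name. An end-to-end check of the cherry-picked feature might look like the following sketch (it assumes the clip_model/pretrained keyword overrides are forwarded to TorchOpenClipModelConfig by the zoo loader, and uses the quickstart dataset purely for illustration):

import fiftyone.zoo as foz

dataset = foz.load_zoo_dataset("quickstart", max_samples=10)

# Override the defaults declared in TorchOpenClipModelConfig
model = foz.load_zoo_model(
    "open-clip-torch",
    clip_model="ViT-B-32",
    pretrained="laion2b_s34b_b79k",
    text_prompt="A photo of a",
    classes=["person", "dog", "cat"],
)

# Zero-shot classification, as exercised by the docs snippet above
dataset.apply_model(model, label_field="open_clip_predictions")

# The manifest tags the entry with "embeddings", so embedding workflows also apply
embeddings = dataset.compute_embeddings(model)
print(embeddings.shape)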