Commit 1799442 (1 parent: b761ca7)
Showing 7 changed files with 182 additions and 1,080 deletions.
AudioEmbeddingService:

@@ -1,39 +1,39 @@
-from transformers import Wav2Vec2Processor, Wav2Vec2Model
-import torch
-import librosa
-import numpy as np
-import io
+# from transformers import Wav2Vec2Processor, Wav2Vec2Model
+# import torch
+# import librosa
+# import numpy as np
+# import io
 
-class AudioEmbeddingService:
-    def __init__(self, model):
-        self.processor = Wav2Vec2Processor.from_pretrained(model)
-        self.model = Wav2Vec2Model.from_pretrained(model)
+# class AudioEmbeddingService:
+#     def __init__(self, model):
+#         self.processor = Wav2Vec2Processor.from_pretrained(model)
+#         self.model = Wav2Vec2Model.from_pretrained(model)
 
-    def encode(self, file_stream):
-        # Load the audio file
-        audio_input, sr = librosa.load(io.BytesIO(file_stream), sr=16000)
+#     def encode(self, file_stream):
+#         # Load the audio file
+#         audio_input, sr = librosa.load(io.BytesIO(file_stream), sr=16000)
 
-        # Process audio
-        inputs = self.processor(audio_input, return_tensors="pt", sampling_rate=sr)
+#         # Process audio
+#         inputs = self.processor(audio_input, return_tensors="pt", sampling_rate=sr)
 
-        # Move to CPU
-        inputs = {k: v.to("cpu") for k, v in inputs.items()}
+#         # Move to CPU
+#         inputs = {k: v.to("cpu") for k, v in inputs.items()}
 
-        # Get the audio embedding
-        with torch.no_grad():
-            audio_features = self.model(**inputs).last_hidden_state
+#         # Get the audio embedding
+#         with torch.no_grad():
+#             audio_features = self.model(**inputs).last_hidden_state
 
-        # Mean pooling the embeddings across the time dimension
-        embeddings = audio_features.mean(dim=1)
+#         # Mean pooling the embeddings across the time dimension
+#         embeddings = audio_features.mean(dim=1)
 
-        # Normalize the embeddings
-        normalized_embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
+#         # Normalize the embeddings
+#         normalized_embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
 
-        return normalized_embeddings
+#         return normalized_embeddings
 
-    def get_dimensions(self):
-        return self.model.config.hidden_size
+#     def get_dimensions(self):
+#         return self.model.config.hidden_size
 
-    def get_token_size(self):
-        # Similar to images, token size isn't directly applicable to audio
-        return None
+#     def get_token_size(self):
+#         # Similar to images, token size isn't directly applicable to audio
+#         return None
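This change comments out the whole Wav2Vec2 audio path rather than deleting it. For context, a minimal sketch of how the pre-commit AudioEmbeddingService could be exercised; the checkpoint and file names below are placeholders, since the commit does not show the service's call site:

# Hypothetical usage of the pre-commit AudioEmbeddingService.
# "facebook/wav2vec2-base-960h" and "sample.wav" are placeholder names;
# the commit does not show which checkpoint or inputs were actually used.
service = AudioEmbeddingService("facebook/wav2vec2-base-960h")

with open("sample.wav", "rb") as f:
    audio_bytes = f.read()  # encode() expects raw bytes, not a path

embedding = service.encode(audio_bytes)
print(embedding.shape)           # torch.Size([1, 768]) for the base model
print(service.get_dimensions())  # 768: Wav2Vec2's hidden_size

The pooled vector is L2-normalized, so downstream cosine-similarity search reduces to a dot product.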
ImageEmbeddingService:

@@ -1,38 +1,38 @@
-from transformers import CLIPProcessor, CLIPModel
-import torch
-import io
-from PIL import Image
+# from transformers import CLIPProcessor, CLIPModel
+# import torch
+# import io
+# from PIL import Image
 
 
-class ImageEmbeddingService:
-    def __init__(self, model):
-        self.processor = CLIPProcessor.from_pretrained(model)
-        self.model = CLIPModel.from_pretrained(model)
+# class ImageEmbeddingService:
+#     def __init__(self, model):
+#         self.processor = CLIPProcessor.from_pretrained(model)
+#         self.model = CLIPModel.from_pretrained(model)
 
-    def encode(self, file_stream):
-        # Load the image
-        image = Image.open(io.BytesIO(file_stream))
+#     def encode(self, file_stream):
+#         # Load the image
+#         image = Image.open(io.BytesIO(file_stream))
 
-        # Process image
-        inputs = self.processor(images=image, return_tensors="pt")
+#         # Process image
+#         inputs = self.processor(images=image, return_tensors="pt")
 
-        # Move to CPU
-        inputs = {k: v.to("cpu") for k, v in inputs.items()}
+#         # Move to CPU
+#         inputs = {k: v.to("cpu") for k, v in inputs.items()}
 
-        # Get the image embedding
-        with torch.no_grad():
-            image_features = self.model.get_image_features(**inputs)
+#         # Get the image embedding
+#         with torch.no_grad():
+#             image_features = self.model.get_image_features(**inputs)
 
-        # Normalize the embeddings
-        image_embeddings = torch.nn.functional.normalize(image_features, p=2, dim=1)
+#         # Normalize the embeddings
+#         image_embeddings = torch.nn.functional.normalize(image_features, p=2, dim=1)
 
-        return image_embeddings
+#         return image_embeddings
 
-    def get_dimensions(self):
-        # CLIP's image and text embeddings are of the same size
-        return self.model.config.text_config.hidden_size
+#     def get_dimensions(self):
+#         # CLIP's image and text embeddings are of the same size
+#         return self.model.config.text_config.hidden_size
 
-    def get_token_size(self):
-        # This method isn't directly applicable to images as it is to text
-        # Returning None or a default value could be more appropriate
-        return None
+#     def get_token_size(self):
+#         # This method isn't directly applicable to images as it is to text
+#         # Returning None or a default value could be more appropriate
+#         return None
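Since encode() L2-normalizes the CLIP image features, comparing two images reduces to a dot product. A hypothetical sketch along those lines; the checkpoint and file names are again placeholders:

# Hypothetical usage of the pre-commit ImageEmbeddingService.
# "openai/clip-vit-base-patch32", "a.jpg", and "b.jpg" are placeholders.
service = ImageEmbeddingService("openai/clip-vit-base-patch32")

with open("a.jpg", "rb") as f:
    emb_a = service.encode(f.read())
with open("b.jpg", "rb") as f:
    emb_b = service.encode(f.read())

# Unit-length embeddings: the dot product is the cosine similarity.
similarity = (emb_a @ emb_b.T).item()
print(similarity)  # in [-1, 1]; higher means more visually similar

One detail worth noting: get_image_features() returns vectors of size projection_dim, while get_dimensions() reports text_config.hidden_size; the two happen to agree on the standard CLIP checkpoints but are independent config fields.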
VideoEmbeddingService:

@@ -1,57 +1,57 @@
-import torch
-from PIL import Image
-from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
-from transformers import CLIPProcessor, CLIPModel
-import cv2
-import numpy as np
-
-
-class VideoEmbeddingService:
-    def __init__(self, model):
-        self.processor = CLIPProcessor.from_pretrained(model)
-        self.model = CLIPModel.from_pretrained(model)
-
-    def frame_embeddings(self, video_path):
-        # Initialize a video capture object
-        cap = cv2.VideoCapture(video_path)
-        frame_embeddings = []
-
-        while True:
-            ret, frame = cap.read()
-            if not ret:
-                break
-
-            # Convert the color space from BGR to RGB, then convert to PIL Image
-            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-            pil_image = Image.fromarray(frame)
-
-            # Preprocess the image
-            inputs = self.processor(images=pil_image, return_tensors="pt", padding=True)
-
-            # Move to CPU and get the image embedding
-            inputs = {k: v.to("cpu") for k, v in inputs.items()}
-            with torch.no_grad():
-                frame_features = self.model.get_image_features(**inputs)
-
-            frame_embeddings.append(frame_features)
-
-        cap.release()
-        return torch.stack(frame_embeddings)
-
-    def encode(self, file_stream):
-        # Assume file_stream is a path for simplicity; adapt as necessary for actual streams
-        embeddings = self.frame_embeddings(file_stream)
-        # Aggregate embeddings, e.g., by averaging
-        video_embedding = embeddings.mean(dim=0)
-        # Normalize the embeddings
-        normalized_embedding = torch.nn.functional.normalize(
-            video_embedding, p=2, dim=1
-        )
-        return normalized_embedding
-
-    def get_dimensions(self):
-        return self.model.config.visual_projection.out_features
-
-    def get_token_size(self):
-        # Not applicable for videos, similar to audio and images
-        return None
+# import torch
+# from PIL import Image
+# from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
+# from transformers import CLIPProcessor, CLIPModel
+# import cv2
+# import numpy as np
+
+
+# class VideoEmbeddingService:
+#     def __init__(self, model):
+#         self.processor = CLIPProcessor.from_pretrained(model)
+#         self.model = CLIPModel.from_pretrained(model)
+
+#     def frame_embeddings(self, video_path):
+#         # Initialize a video capture object
+#         cap = cv2.VideoCapture(video_path)
+#         frame_embeddings = []
+
+#         while True:
+#             ret, frame = cap.read()
+#             if not ret:
+#                 break
+
+#             # Convert the color space from BGR to RGB, then convert to PIL Image
+#             frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+#             pil_image = Image.fromarray(frame)
+
+#             # Preprocess the image
+#             inputs = self.processor(images=pil_image, return_tensors="pt", padding=True)
+
+#             # Move to CPU and get the image embedding
+#             inputs = {k: v.to("cpu") for k, v in inputs.items()}
+#             with torch.no_grad():
+#                 frame_features = self.model.get_image_features(**inputs)
+
+#             frame_embeddings.append(frame_features)
+
+#         cap.release()
+#         return torch.stack(frame_embeddings)
+
+#     def encode(self, file_stream):
+#         # Assume file_stream is a path for simplicity; adapt as necessary for actual streams
+#         embeddings = self.frame_embeddings(file_stream)
+#         # Aggregate embeddings, e.g., by averaging
+#         video_embedding = embeddings.mean(dim=0)
+#         # Normalize the embeddings
+#         normalized_embedding = torch.nn.functional.normalize(
+#             video_embedding, p=2, dim=1
+#         )
+#         return normalized_embedding
+
+#     def get_dimensions(self):
+#         return self.model.config.visual_projection.out_features
+
+#     def get_token_size(self):
+#         # Not applicable for videos, similar to audio and images
+#         return None
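A hypothetical sketch for the video service follows; note that its encode() takes a filesystem path rather than raw bytes, per the original comment. Separately, get_dimensions() reads self.model.config.visual_projection, but on a transformers CLIPModel the visual_projection layer lives on the model object and the config exposes projection_dim, so that line would likely raise AttributeError as written.

# Hypothetical usage of the pre-commit VideoEmbeddingService.
# "openai/clip-vit-base-patch32" and "clip.mp4" are placeholders.
service = VideoEmbeddingService("openai/clip-vit-base-patch32")

embedding = service.encode("clip.mp4")  # takes a path, not bytes
print(embedding.shape)  # torch.Size([1, 512]): one pooled vector per video

# What get_dimensions() appears to intend:
print(service.model.config.projection_dim)  # 512 for the base checkpoint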