model_arch.py
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from transformers import AutoModelForCausalLM
from PIL import Image


class CustomTransformer(nn.Module):
    def __init__(self, input_size, output_size, num_layers=2, hidden_size=256):
        super(CustomTransformer, self).__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=input_size, nhead=4, dim_feedforward=hidden_size)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.linear = nn.Linear(input_size, output_size)

    def forward(self, x):
        x = self.transformer_encoder(x)
        x = self.linear(x)
        return x
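
# Shape sketch (a minimal illustration, assuming the default batch_first=False of
# nn.TransformerEncoderLayer): a 2D input is treated as an unbatched (seq_len, d_model)
# sequence, so a single CLIP image embedding of shape (1, input_size) maps to
# (1, output_size):
#
#   encoder = CustomTransformer(input_size=512, output_size=512)
#   x = torch.randn(1, 512)   # one 512-dim embedding, seen as seq_len=1
#   y = encoder(x)            # -> torch.Size([1, 512])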


class LargeWorldModel:
    def __init__(self, clip_model, clip_processor, transformer_config):
        self.clip_model = clip_model
        self.clip_processor = clip_processor
        self.transformer = CustomTransformer(**transformer_config)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.clip_model.to(self.device)
        self.transformer.to(self.device)
        self.clip_model.eval()    # Set the CLIP model to evaluation mode
        self.transformer.eval()   # Set the custom transformer to evaluation mode

    def encode_image(self, image, text=[' ']):
        # CLIPModel expects text as well as images, so provide a dummy text input
        inputs = self.clip_processor(text=text, images=image, return_tensors="pt").to(self.device)
        with torch.no_grad():
            outputs = self.clip_model(**inputs)
        return outputs.image_embeds

    def build_concept_space(self, image_path):
        # Load the image
        image = self.preprocess_image(image_path)
        # Get embeddings for the image using the CLIP model
        embeddings = self.encode_image(image)
        # Save the CLIP embeddings to concept_space_1 (moved to CPU for storage)
        self.concept_space_1 = embeddings.cpu()
        # Pass the embeddings through the custom transformer architecture
        with torch.no_grad():
            concept_space_2 = self.transformer(embeddings)
        # Save the transformed embeddings to concept_space_2 (moved to CPU for storage)
        self.concept_space_2 = concept_space_2.cpu()

    def preprocess_image(self, image_path):
        # Read the image and ensure it is in RGB mode. Resizing, normalization and
        # tensor conversion are handled by the CLIP processor, so the manual
        # torchvision pipeline below is kept only for reference.
        image = Image.open(image_path).convert("RGB")
        # preprocess = transforms.Compose([
        #     transforms.Resize((224, 224)),  # Resize image to 224x224
        #     transforms.ToTensor(),          # Convert image to tensor
        # ])
        # image = preprocess(image).unsqueeze(0)  # Add batch dimension
        return image
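
# Concept-space shapes, as a rough sketch assuming CLIP ViT-B/32 (512-dim image
# embeddings) and the example transformer_config defined under __main__ below:
#
#   lwm = LargeWorldModel(clip_model, clip_processor, transformer_config)
#   lwm.build_concept_space("space_invaders_frames/frame_1.jpg")
#   lwm.concept_space_1.shape   # torch.Size([1, 512])          -- raw CLIP image embedding
#   lwm.concept_space_2.shape   # torch.Size([1, output_size])  -- after the custom transformer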


class LWM_Actions():
    def __init__(self, clip_model, clip_processor, transformer_config, decoder_model="meta-llama/Llama-2-7b-hf"):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Define the world model (CLIP encoder + concept-space transformer)
        self.LWM = LargeWorldModel(clip_model, clip_processor, transformer_config)
        # Decoder-only language model used to build the action space
        self.action_model = AutoModelForCausalLM.from_pretrained(decoder_model)
        self.action_model.to(self.device)
        self.action_model.eval()

    def build_action_space(self, image_path):
        self.LWM.build_concept_space(image_path)
        # Feed the concept embeddings to the decoder as input embeddings; the transformer's
        # output_size must match the decoder's hidden size (4096 for Llama-2-7b).
        inputs_embeds = self.LWM.concept_space_2.unsqueeze(0).to(self.device)
        with torch.no_grad():
            outputs = self.action_model(inputs_embeds=inputs_embeds, output_hidden_states=True)
        self.action_sequence = outputs.hidden_states[-1]


if __name__ == '__main__':
    # Imports are only here to avoid unnecessary overhead during deployment
    from transformers import CLIPProcessor, CLIPModel

    # Example usage:
    clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")  # Pre-trained CLIP model
    clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    transformer_config = {
        "input_size": 512,   # CLIP ViT-B/32 image embeddings are 512-dimensional
        "output_size": 512,  # Output size matches the CLIP embedding size
        "num_layers": 2,     # Number of transformer encoder layers
        "hidden_size": 256   # Feed-forward hidden size of the transformer
    }

    concept_space_builder = LargeWorldModel(clip_model, clip_processor, transformer_config)
    image_path = "space_invaders_frames/frame_1.jpg"
    concept_space_builder.build_concept_space(image_path)
    # concept_space_builder.concept_space_2 now holds the embeddings produced by the custom transformer.
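
    # Sketch of the action-space path (assumes access to the Llama-2 weights and a
    # transformer_config whose output_size matches the decoder's hidden size,
    # e.g. 4096 for meta-llama/Llama-2-7b-hf):
    #
    #   lwm_actions = LWM_Actions(clip_model, clip_processor,
    #                             {"input_size": 512, "output_size": 4096,
    #                              "num_layers": 2, "hidden_size": 256},
    #                             decoder_model="meta-llama/Llama-2-7b-hf")
    #   lwm_actions.build_action_space(image_path)
    #   lwm_actions.action_sequence.shape   # torch.Size([1, 1, 4096])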