gemini_mini_vision.py
import google.generativeai as genai
import PIL.Image as Image
import glob
from natsort import natsorted, ns
import os

os.environ['http_proxy'] = 'http://127.0.0.1:7890'  # local HTTP proxy; adjust or remove for your network

images_root = 'demo/composite'  # composite (cut-and-paste) images
masks_root = 'demo/mask'  # foreground masks, one per composite image
new_caption_path = 'demo/caption_multi2.txt'  # where to save the captions
prompt_num = 2  # how many different prompts to generate for one image
api_key = "Your KEY"  # your Google AI Studio API key
"""
Generate prompts with Google Genmini. Please refer to Section 3.1 `Imaging Condition Description Generation` in our
paper-v2 for more details.
Note that in case network connection issues, we here provide an offline generation script. That is, pre-generated multiple
prompts in advance. This will not hurt performance too much. You can also easily modify and incorporate this script
into the harmonization pipeline to achieve an online generation as presented in Section 3.3 in our paper-v2.
"""
def get_caption(image_path, mask_path):
    try:
        genai.configure(api_key=api_key)
        # Set up the model
        generation_config = {
            "temperature": 0.3,
            "top_p": 1,
            "top_k": 64,
            "max_output_tokens": 4096,
        }
        safety_settings = [
            {
                "category": "HARM_CATEGORY_HARASSMENT",
                "threshold": "BLOCK_MEDIUM_AND_ABOVE"
            },
            {
                "category": "HARM_CATEGORY_HATE_SPEECH",
                "threshold": "BLOCK_MEDIUM_AND_ABOVE"
            },
            {
                "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
                "threshold": "BLOCK_MEDIUM_AND_ABOVE"
            },
            {
                "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
                "threshold": "BLOCK_MEDIUM_AND_ABOVE"
            }
        ]
        model = genai.GenerativeModel(model_name="gemini-pro-vision",
                                      generation_config=generation_config,
                                      safety_settings=safety_settings)
        prompt_parts = [
            "I want to choose some words to describe the composite image, which is made by superimposing a cut-out object onto the background image."
        ]
        prompt_parts.append("This is the provided image:")
        img = Image.open(image_path).convert('RGB')
        # Resize to width 1024, keeping the aspect ratio. Image.ANTIALIAS was
        # removed in Pillow 10; Image.LANCZOS is the equivalent filter.
        img = img.resize((1024, int(1024 * img.height / img.width)), Image.LANCZOS)
        prompt_parts.append(img)
        prompt_parts.append("This is the mask of the foreground object:")
        mask_img = Image.open(mask_path).convert('L')
        # Resize the mask to match the resized image.
        mask_img = mask_img.resize(img.size, Image.LANCZOS)
        prompt_parts.append(mask_img)
        prompt_parts.append(
            "The foreground region is the mask region, while the rest constitutes the background.\n"
            "Here, I provide a set of descriptive words categorized in a dictionary as follows: \n"  # Imaging description:sunset,sunrise,night,daytime,day,dusk,dawn,evening,morning
            "{'brightness':[dazzling, bright, dim, dull, shaded, shadowed],"
            "'weather':[cloudy, sunny, rainy, snowy, foggy, windy, stormy, clear, misty],"
            "'temperature':[hot, warm, cool, cold, icy],"
            "'season':[spring, summer, autumn, winter],"
            "'time':[dawn, sunrise, daylight, twilight, sunset, dusk, dark, night],"
            "'color tone':[greyscale, neon, golden, white, blue, green, yellow, orange, red, earthy],"
            "'environment':[city, rural, lake, ocean, mountain, forest, desert, grassland, sky, space, indoor, street]}\n",
        )
        prompt_parts.append(
            "Now, I need to first give the name of the foreground object and then select appropriate words from the above dictionary to describe both the foreground object and background. Here are the specific instructions: \n "
            "1. Describe the name of the foreground object\n"
            "2. Choose one or more words from the entire dictionary that best describe the style of the foreground. (e.g. brightness, color tone...)\n"
            "3. Choose one or more words from the entire dictionary that best describe the background. (e.g. brightness, weather, temperature, season ...)\n"
            "Note: Choose only one word from each list and ensure that a word from the 'brightness' list is included in the selection.\n"
            "Note: The output format should be: (foreground object name) X X ... & (foreground) X X ... & (background) X X ..., where X represents a word. "
            "For example, dog & (foreground) bright summer & (background) winter dull greyscale. \n "
            # "Note: Please refer to the provided image for the actual chosen words. \n" (foreground) bright & (background) summer dim blue, or,
            "Ensure adherence to this format in your response; any other formats will not be accepted."
        )
        response = model.generate_content(prompt_parts, stream=True)
        response.resolve()
    except Exception as e:
        print(e)
        print("get caption failed, trying again...")
        return ''
    print(response.text)
    return response.text.strip(' ')
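
# Online-generation sketch (a minimal illustration of the note in the module
# docstring, not the authors' pipeline code): rather than pre-generating
# captions offline, get_caption() could be called per sample inside the
# harmonization loop. `harmonize_one_sample` is a hypothetical stand-in for
# the actual harmonization step.
#
# def harmonize_online(image_path, mask_path):
#     caption = ''
#     while len(caption.split('&')) != 3:  # same validity check as the loop below
#         caption = get_caption(image_path, mask_path)
#     return harmonize_one_sample(image_path, mask_path, caption)  # hypothetical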
# Collect composite images and masks; natural sort keeps the pairs aligned.
composite_images = []
mask_images = []
for i in glob.glob(os.path.join(images_root, "*")):
    composite_images.append(i)
for i in glob.glob(os.path.join(masks_root, "*")):
    mask_images.append(i)
composite_images = natsorted(composite_images, alg=ns.PATH)
mask_images = natsorted(mask_images, alg=ns.PATH)

for i in range(0, len(composite_images)):
    image_path = composite_images[i]
    mask_path = mask_images[i]
    caption = ''
    for t in range(prompt_num):
        # Retry until the response matches the expected
        # "name & (foreground) ... & (background) ..." format.
        while len(caption.split('&')) != 3 or "(foreground)" not in caption or "(background)" not in caption:
            caption = get_caption(image_path, mask_path)
        object_name = caption.split('&')[0].strip()
        new_caption = "{} {},{} {}".format(object_name, caption.split('&')[1].split(')')[1].strip(),
                                           object_name, caption.split('&')[2].split(')')[1].strip())
        print("new caption: ", new_caption)
        caption = ''
        with open(new_caption_path, 'a') as f:
            f.write(new_caption)
            if t != prompt_num - 1:  # separate the variants for one image with ';'
                f.write(';')
    # One line per image: prompt_num ';'-separated caption variants
    # (see the reader sketch at the end of the file).
    with open(new_caption_path, 'a') as f:
        f.write('\n')
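
# Reader sketch (an assumption about downstream use, not part of the original
# script): parse the saved caption file back into per-image variant lists.
# Each line holds the prompt_num caption variants for one image, separated by
# ';', in the same natural-sort order as composite_images above.
def load_captions(caption_path):
    captions = []
    with open(caption_path) as f:
        for line in f:
            variants = [c.strip() for c in line.strip().split(';') if c.strip()]
            captions.append(variants)
    return captions

# Example: load_captions(new_caption_path)[0] might look like
# ['dog bright summer,dog winter dull greyscale', 'dog dim autumn,dog sunny warm']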