Skip to content

Commit

Permalink
feat: run OpenAdapt inside Anthropic Computer Use (#934)
Browse files Browse the repository at this point in the history
* import sounddevice inside record_audio()

* utils.get_scaling_factor

* describe_actions.py with PIL

* show text in top left

* max_width = image.width

* dim_outside_window

* add module docstring

* add browser to visualize.py
  • Loading branch information
abrichr authored Jan 2, 2025
1 parent 11f51e1 commit 8a79c83
Show file tree
Hide file tree
Showing 6 changed files with 71 additions and 59 deletions.
81 changes: 36 additions & 45 deletions experiments/describe_action.py → experiments/describe_actions.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,40 @@
"""Generate action descriptions."""
"""Generate natural language descriptions from actions."""

from pprint import pformat

from loguru import logger
import cv2
from PIL import Image, ImageDraw
import numpy as np

from openadapt.db import crud
from openadapt.plotting import get_font
from openadapt.utils import get_scaling_factor

scaling_factor = get_scaling_factor()


def embed_description(
image: np.ndarray,
image: Image.Image,
description: str,
x: int = None,
y: int = None,
) -> np.ndarray:
x: int = 0,
y: int = 0,
) -> Image.Image:
"""Embed a description into an image at the specified location.
Args:
image (np.ndarray): The image to annotate.
image (Image.Image): The image to annotate.
description (str): The text to embed.
x (int, optional): The x-coordinate. Defaults to None (centered).
y (int, optional): The y-coordinate. Defaults to None (centered).
x (int, optional): The x-coordinate. Defaults to 0.
y (int, optional): The y-coordinate. Defaults to 0.
Returns:
np.ndarray: The annotated image.
Image.Image: The annotated image.
"""
font = cv2.FONT_HERSHEY_SIMPLEX
font_scale = 1
font_color = (255, 255, 255) # White
line_type = 1
draw = ImageDraw.Draw(image)
font_size = 30 # Set font size (2x the default size)
font = get_font("Arial.ttf", font_size)

# Split description into multiple lines
max_width = 60 # Maximum characters per line
max_width = image.width
words = description.split()
lines = []
current_line = []
Expand All @@ -45,36 +47,28 @@ def embed_description(
if current_line:
lines.append(" ".join(current_line))

# Default to center if coordinates are not provided
if x is None or y is None:
x = image.shape[1] // 2
y = image.shape[0] // 2
# Adjust coordinates for scaling factor
x = int(x * scaling_factor)
y = int(y * scaling_factor)

# Draw semi-transparent background and text
# Calculate text dimensions and draw semi-transparent background and text
for i, line in enumerate(lines):
text_size, _ = cv2.getTextSize(line, font, font_scale, line_type)
text_x = max(0, min(x - text_size[0] // 2, image.shape[1] - text_size[0]))
text_y = y + i * 20
bbox = draw.textbbox((0, 0), line, font=font)
text_width, text_height = bbox[2] - bbox[0], bbox[3] - bbox[1]
text_x = max(0, min(x - text_width // 2, image.width - text_width))
text_y = y + i * text_height

# Draw background
cv2.rectangle(
image,
(text_x - 15, text_y - 25),
(text_x + text_size[0] + 15, text_y + 15),
(0, 0, 0),
-1,
background_box = (
text_x - 15,
text_y - 5,
text_x + text_width + 15,
text_y + text_height + 5,
)
draw.rectangle(background_box, fill=(0, 0, 0, 128))

# Draw text
cv2.putText(
image,
line,
(text_x, text_y),
font,
font_scale,
font_color,
line_type,
)
draw.text((text_x, text_y), line, fill=(255, 255, 255), font=font)

return image

Expand All @@ -88,25 +82,22 @@ def main() -> None:
for action in action_events:
description, image = action.prompt_for_description(return_image=True)

# Convert image to numpy array for OpenCV compatibility
image = np.array(image)
# Convert image to PIL.Image for compatibility
image = Image.fromarray(np.array(image))

if action.mouse_x is not None and action.mouse_y is not None:
# Use the mouse coordinates for mouse events
annotated_image = embed_description(
image,
description,
x=int(action.mouse_x) * 2,
y=int(action.mouse_y) * 2,
)
else:
# Center the text for other events
annotated_image = embed_description(image, description)

logger.info(f"{action=}")
logger.info(f"{description=}")
cv2.imshow("Annotated Image", annotated_image)
cv2.waitKey(0)
annotated_image.show() # Opens the annotated image using the default viewer
descriptions.append(description)

logger.info(f"descriptions=\n{pformat(descriptions)}")
Expand Down
1 change: 1 addition & 0 deletions openadapt/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -573,6 +573,7 @@ def prompt_for_description(self, return_image: bool = False) -> str:
darken_outside=0.7,
display_text=False,
marker_fill_transparency=0,
dim_outside_window=False,
)

if self.text:
Expand Down
19 changes: 11 additions & 8 deletions openadapt/plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,7 @@ def display_event(
diff: bool = False,
darken_outside: float | None = None,
display_text: bool = True,
dim_outside_window: bool = True,
) -> Image.Image:
"""Display an action event on the image.
Expand All @@ -247,6 +248,7 @@ def display_event(
the ellipse for mouse events. Range 0-1, where 1 is completely black.
Defaults to None (no darkening).
display_text (bool): Whether to display action text. Defaults to True.
dim_outside_window (bool): Whether to dim outside the WindowEvent area.
Returns:
PIL.Image.Image: The image with the action event displayed on it.
Expand All @@ -267,14 +269,15 @@ def display_event(
width_ratio, height_ratio = utils.get_scale_ratios(action_event)

# dim area outside window event
if not window_event:
logger.error(f"{window_event=}")
else:
x0 = window_event.left * width_ratio
y0 = window_event.top * height_ratio
x1 = x0 + window_event.width * width_ratio
y1 = y0 + window_event.height * height_ratio
image = draw_rectangle(x0, y0, x1, y1, image, outline_width=5)
if dim_outside_window:
if not window_event:
logger.error(f"{window_event=}")
else:
x0 = window_event.left * width_ratio
y0 = window_event.top * height_ratio
x1 = x0 + window_event.width * width_ratio
y1 = y0 + window_event.height * height_ratio
image = draw_rectangle(x0, y0, x1, y1, image, outline_width=5)

# display diff bbox
if diff:
Expand Down
3 changes: 2 additions & 1 deletion openadapt/record.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@

import numpy as np
import psutil
import sounddevice
import soundfile
import websockets.sync.server
import whisper
Expand Down Expand Up @@ -1082,6 +1081,8 @@ def record_audio(

audio_frames = [] # to store audio frames

import sounddevice

def audio_callback(
indata: np.ndarray, frames: int, time: Any, status: sounddevice.CallbackFlags
) -> None:
Expand Down
12 changes: 12 additions & 0 deletions openadapt/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1087,6 +1087,18 @@ def get_html_prompt(html: str, convert_to_markdown: bool = False) -> str:
return str(soup)


def get_scaling_factor() -> int:
    """Return the display scaling factor as an integer.

    On macOS the factor is read from AppKit's ``NSScreen`` (e.g. 2 on
    Retina displays); on every other platform a factor of 1 is assumed.

    Returns:
        int: The backing scale factor of the main screen, or 1 when not
            running on macOS.
    """
    if sys.platform != "darwin":
        # No AppKit outside macOS; treat Windows/Linux as unscaled.
        return 1

    # Imported lazily so non-mac platforms never need pyobjc/AppKit.
    from AppKit import NSScreen

    backing_scale = NSScreen.mainScreen().backingScaleFactor()
    logger.info(f"Backing Scale Factor: {backing_scale}")
    return int(backing_scale)


class WrapStdout:
"""Class to be used a target for multiprocessing.Process."""

Expand Down
14 changes: 9 additions & 5 deletions openadapt/visualize.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ def main(
recording_id: int = None,
diff_video: bool = False,
cleanup: bool = True,
browser: str = None,
) -> bool:
"""Visualize a recording.
Expand All @@ -167,6 +168,7 @@ def main(
recording_id (int, optional): The ID of the recording to visualize.
diff_video (bool): Whether to diff Screenshots against video frames.
cleanup (bool): Whether to remove the HTML file after it is displayed.
browser (str, optional): Command to open the browser executable.
Returns:
bool: True if visualization was successful, None otherwise.
Expand Down Expand Up @@ -442,11 +444,13 @@ def main(
os.makedirs(RECORDING_DIR_PATH, exist_ok=True)
output_file(fname_out, title=title)

result = show( # noqa: F841
layout(
rows,
)
)
result = show(layout(rows)) # noqa: F841

if browser:
import subprocess

logger.info(f"Opening browser with command: {browser}")
subprocess.run([browser, f"file://{fname_out}"], check=True)

def _cleanup() -> None:
os.remove(fname_out)
Expand Down

0 comments on commit 8a79c83

Please sign in to comment.