Skip to content

Commit

Permalink
Merge branch 'release-v0.1.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
dandansamax committed May 12, 2024
2 parents 06a0cc3 + 77fa14e commit 2739f5d
Show file tree
Hide file tree
Showing 7 changed files with 74 additions and 29 deletions.
20 changes: 11 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,17 @@ Crab is a framework for building LLM agent benchmark environments in a Python-ce

#### Key Features

* 🌐 Cross-platform
* Build agent environments in memory, hosted through a docker environment, installed in a virtual machine, or as a distributed physical machine, as long as they can be accessed by Python functions.
* Let the agent access all the environments at the same time through a unified interface.
* ⚙️ Easy-to-use Configuration
* Add a new action by simply adding a `@action` decorator on a Python function.
* Define the environment by integrating several actions together.
* 📐 Novel Benchmarking Suite
* Define tasks and the corresponding evaluators in an intuitive Python-native way.
* Introduce a novel graph evaluator method providing fine-grained metrics.
🌐 Cross-platform
* Build agent environments that support various deployment options including in-memory, Docker-hosted, virtual machines, or distributed physical machines, provided they are accessible via Python functions.
* Let the agent access all the environments at the same time through a unified interface.

⚙️ Easy-to-use Configuration
* Add a new action by simply adding a `@action` decorator on a Python function.
* Define the environment by integrating several actions together.

📐 Novel Benchmarking Suite
* Define tasks and the corresponding evaluators in an intuitive Python-native way.
* Introduce a novel graph evaluator method providing fine-grained metrics.

## Installation

Expand Down
47 changes: 31 additions & 16 deletions crab/actions/desktop_actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,14 @@
DURATION = 0.8
DELAY = 0.5

@action
def set_screen_size(env) -> None:
    """Record the current screen resolution on the environment.

    Queries `pyautogui.size()` and stores the two returned values in the
    `width` and `height` attributes of `env`.

    Args:
        env: The environment object that receives `width` and `height`.
    """
    screen_width, screen_height = pyautogui.size()
    env.width = screen_width
    env.height = screen_height

@action
def click_position(x: int, y: int) -> None:
"""
click on the current desktop screen.
"""Click on the current desktop screen.
Args:
x: The X coordinate, as a floating-point number in the range [0.0, 1.0].
Expand All @@ -41,8 +44,7 @@ def click_position(x: int, y: int) -> None:

@action(local=True)
def click(element: int, env) -> None:
"""
Click an UI element shown on the desktop screen. A simple use case can be
"""Click an UI element shown on the desktop screen. A simple use case can be
click(5), which clicks the UI element labeled with the number 5.
Args:
Expand All @@ -55,8 +57,7 @@ def click(element: int, env) -> None:

@action
def mouse_scroll(click: int = 1) -> None:
"""
Performs a scroll of the mouse scroll wheel.
"""Perform a scroll of the mouse scroll wheel.
Args:
click(int): The amount of scrolling. Default to 1.
Expand Down Expand Up @@ -165,8 +166,7 @@ class KeyEnum(str, Enum):

@action
def key_press(key: KeyEnum) -> None:
"""
Performs a keyboard key press down, followed by a release.
"""Press and release a single keyboard key.
Args:
key (str): The key to be pressed.
Expand All @@ -177,13 +177,27 @@ def key_press(key: KeyEnum) -> None:
pyautogui.press(key)
time.sleep(DELAY)

@action
def hotkey_press(keys: list[KeyEnum]) -> None:
    """Press and release multiple keyboard keys at the same time.

    For example, if you want to use the Ctrl-C hotkey to copy the selected text, you
    can call hotkey_press(keys=["ctrl", "c"]).

    Args:
        keys: The key list to be pressed together.
    """
    # An empty list used to fail with a bare IndexError on `keys[0]`; report
    # the actual problem instead.
    if not keys:
        raise ValueError("hotkey_press requires at least one key")
    # Normalize each entry on its own: callers may pass KeyEnum members, plain
    # strings, or a mix of both, and only raw key names are forwarded.
    key_names = [key.value if isinstance(key, KeyEnum) else key for key in keys]
    pyautogui.hotkey(*key_names)
    # Pause after the key combination, as the other keyboard actions do.
    time.sleep(DELAY)

@action
def write_text(text: str) -> None:
"""
Typing the specified text. Note: This function does not move the mouse cursor.
Ensure the cursor focuses in the correct text input field before calling this
function.
"""Type the specified text.
Note: This function does not move the mouse cursor. Ensure the cursor
focuses in the correct text input field before calling this function.
Args:
text (str): The text to be typed.
Expand All @@ -194,10 +208,11 @@ def write_text(text: str) -> None:

@action
def search_application(name: str) -> None:
"""
Search an application name. For exmaple, if you want to open an application named
"slack", you can call search_application(name="slack"). You MUST use this action to
search for applications.
"""Search an application name.
For exmaple, if you want to open an application named "slack", you can call
search_application(name="slack"). You MUST use this action to search for
applications.
Args:
name: the application name.
Expand Down
12 changes: 10 additions & 2 deletions crab/actions/visual_prompt_actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,15 @@ def groundingdino_easyocr(
filtered_boxes = filter_boxes_by_overlap(filtered_boxes)
result_boxes = [box[0] for box in filtered_boxes]
draw_boxes(image, result_boxes, font_size)
env.element_position_map = result_boxes
env.element_position_map = [
(
box[0] / image.width,
box[1] / image.height,
box[2] / image.width,
box[3] / image.height,
)
for box in result_boxes
]
env.ocr_results = "".join([box[1] for box in ocr_boxes])
return image_to_base64(image), filtered_boxes

Expand All @@ -298,4 +306,4 @@ def get_element_position(element_id, env):
box = env.element_position_map[element_id]
x = (box[0] + box[2]) / 2
y = (box[1] + box[3]) / 2
return round(x), round(y)
return round(x * env.width), round(y * env.height)
2 changes: 2 additions & 0 deletions crab/environments/linux.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
key_press,
screenshot,
search_application,
set_screen_size,
write_text,
)
from crab.core import EnvironmentConfig
Expand All @@ -25,4 +26,5 @@
action_space=[click, key_press, write_text, search_application],
observation_space=[screenshot],
description="A Ubuntu 22.04 desktop environment with a single display.",
reset=set_screen_size,
)
12 changes: 10 additions & 2 deletions examples/desktop_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,14 @@
create_benchmark,
evaluator,
)
from crab.actions.desktop_actions import click, key_press, screenshot, write_text
from crab.actions.desktop_actions import (
click,
hotkey_press,
key_press,
screenshot,
set_screen_size,
write_text,
)
from crab.actions.visual_prompt_actions import (
get_elements_prompt,
groundingdino_easyocr,
Expand Down Expand Up @@ -63,9 +70,10 @@ def start_benchmark(benchmark: Benchmark, agent: OpenAIAgent):

ENV_CONFIG = EnvironmentConfig(
name="desktop",
action_space=[click, key_press, write_text],
action_space=[click, key_press, write_text, hotkey_press],
observation_space=[screenshot],
description="A desktop environment with a single display.",
reset=set_screen_size,
)

BENCHMARK_CONFIG = BenchmarkConfig(
Expand Down
5 changes: 5 additions & 0 deletions examples/multi_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,5 +59,10 @@ def start_benchmark(benchmark, agent):
multienv=True,
model="gpt-4-turbo-preview",
)
print(
"\033[92m"
f"Start performing task: \"{task.description}\""
"\033[0m"
)
start_benchmark(benchmark, agent)
benchmark.reset()
5 changes: 5 additions & 0 deletions examples/single_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,5 +55,10 @@ def start_benchmark(benchmark, agent):
action_space,
model="gpt-4-turbo-preview",
)
print(
"\033[92m"
f"Start performing task: \"{task.description}\""
"\033[0m"
)
start_benchmark(benchmark, agent)
benchmark.reset()

0 comments on commit 2739f5d

Please sign in to comment.