feat: Add execution time to both task and testing feature (#1031)
* feat: Add execution time to both task and testing feature

* feat: Remove unused functions

* feat: change test_crew to evaluate_crew to avoid issues with testing libs

* feat: fix tests
pythonbyte authored Jul 30, 2024
1 parent de6b597 commit d824db8
Showing 7 changed files with 62 additions and 33 deletions.
4 changes: 2 additions & 2 deletions src/crewai/cli/cli.py
@@ -6,9 +6,9 @@
)

from .create_crew import create_crew
from .evaluate_crew import evaluate_crew
from .replay_from_task import replay_task_command
from .reset_memories_command import reset_memories_command
from .test_crew import test_crew
from .train_crew import train_crew


@@ -144,7 +144,7 @@ def reset_memories(long, short, entities, kickoff_outputs, all):
def test(n_iterations: int, model: str):
"""Test the crew and evaluate the results."""
click.echo(f"Testing the crew for {n_iterations} iterations with model {model}")
test_crew(n_iterations, model)
evaluate_crew(n_iterations, model)


if __name__ == "__main__":
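Note: the diff above only shows the import swap and the call-site change. As a rough sketch (not the committed file), the `test` command now wires through to `evaluate_crew` roughly as below; the click decorator details are assumptions inferred from the CLI tests further down in this commit (default 3 iterations, default model "gpt-4o-mini", flags -n/--n_iterations and --model).

```python
# Illustrative sketch of the test command after this change; decorator
# options are inferred from tests/cli/cli_test.py, not copied from the diff.
import click

from .evaluate_crew import evaluate_crew


@click.command()
@click.option("-n", "--n_iterations", type=int, default=3, help="Number of test iterations.")
@click.option("--model", type=str, default="gpt-4o-mini", help="Model used to evaluate the crew.")
def test(n_iterations: int, model: str):
    """Test the crew and evaluate the results."""
    click.echo(f"Testing the crew for {n_iterations} iterations with model {model}")
    evaluate_crew(n_iterations, model)
```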
src/crewai/cli/evaluate_crew.py (renamed from src/crewai/cli/test_crew.py)
@@ -1,13 +1,11 @@
import subprocess
import click
import pytest

pytest.skip(allow_module_level=True)
import click


def test_crew(n_iterations: int, model: str) -> None:
def evaluate_crew(n_iterations: int, model: str) -> None:
"""
Test the crew by running a command in the Poetry environment.
Test and Evaluate the crew by running a command in the Poetry environment.
Args:
n_iterations (int): The number of iterations to test the crew.
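Note: the rename drops the module-level `pytest.skip(allow_module_level=True)` (the "issues with testing libs" the commit message mentions) and exposes the helper under its new name. Below is a hedged reconstruction of the resulting module: the command list and the "positive integer" message come from tests/cli/test_crew_test.py further down, while the subprocess keyword arguments and the CalledProcessError messages are assumptions, since those assertions are truncated in this capture.

```python
# Hedged reconstruction of src/crewai/cli/evaluate_crew.py; details marked as
# assumed are not visible in the diff above.
import subprocess

import click


def evaluate_crew(n_iterations: int, model: str) -> None:
    """Test and Evaluate the crew by running a command in the Poetry environment."""
    try:
        if n_iterations <= 0:
            raise ValueError("The number of iterations must be a positive integer.")

        subprocess.run(
            ["poetry", "run", "test", str(n_iterations), model],
            capture_output=False,  # assumed kwargs; the test assertion is cut off
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        # Assumed wording for the failure path.
        click.echo(f"An error occurred while testing the crew: {e}", err=True)
        click.echo(e.output, err=True)
    except Exception as e:
        click.echo(f"An unexpected error occurred: {e}", err=True)
```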
14 changes: 13 additions & 1 deletion src/crewai/task.py
@@ -1,3 +1,4 @@
import datetime
import json
import os
import threading
@@ -107,6 +108,7 @@ class Config:
_original_description: str | None = None
_original_expected_output: str | None = None
_thread: threading.Thread | None = None
_execution_time: float | None = None

def __init__(__pydantic_self__, **data):
config = data.pop("config", {})
@@ -120,6 +122,12 @@ def _deny_user_set_id(cls, v: Optional[UUID4]) -> None:
"may_not_set_field", "This field is not to be set by the user.", {}
)

def _set_start_execution_time(self) -> float:
return datetime.datetime.now().timestamp()

def _set_end_execution_time(self, start_time: float) -> None:
self._execution_time = datetime.datetime.now().timestamp() - start_time

@field_validator("output_file")
@classmethod
def output_file_validation(cls, value: str) -> str:
@@ -216,6 +224,7 @@ def _execute_core(
f"The task '{self.description}' has no agent assigned, therefore it can't be executed directly and should be executed in a Crew using a specific process that support that, like hierarchical."
)

start_time = self._set_start_execution_time()
self._execution_span = self._telemetry.task_started(crew=agent.crew, task=self)

self.prompt_context = context
@@ -239,6 +248,7 @@
)
self.output = task_output

self._set_end_execution_time(start_time)
if self.callback:
self.callback(self.output)

@@ -250,7 +260,9 @@ def _execute_core(
content = (
json_output
if json_output
else pydantic_output.model_dump_json() if pydantic_output else result
else pydantic_output.model_dump_json()
if pydantic_output
else result
)
self._save_file(content)

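Note: the task-side change is just two timestamp helpers bracketing the work done in `_execute_core`. Condensed, the flow looks like the sketch below; it is a simplified, non-Pydantic illustration with the non-timing lines elided, and the `_execute_core` signature abbreviated.

```python
# Condensed illustration of the timing added to Task; only the timing-related
# lines mirror the diff, everything else is elided.
import datetime


class Task:
    _execution_time: float | None = None

    def _set_start_execution_time(self) -> float:
        return datetime.datetime.now().timestamp()

    def _set_end_execution_time(self, start_time: float) -> None:
        self._execution_time = datetime.datetime.now().timestamp() - start_time

    def _execute_core(self, agent, context, tools):  # simplified signature
        start_time = self._set_start_execution_time()
        # ... agent executes the task and task_output is built (unchanged) ...
        self._set_end_execution_time(start_time)
        # the task callback fires after this point, so an evaluator reading
        # _execution_time sees it already populated
```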
20 changes: 17 additions & 3 deletions src/crewai/utilities/evaluators/crew_evaluator_handler.py
@@ -28,6 +28,7 @@ class CrewEvaluator:
"""

tasks_scores: defaultdict = defaultdict(list)
run_execution_times: defaultdict = defaultdict(list)
iteration: int = 0

def __init__(self, crew, openai_model_name: str):
@@ -40,9 +41,6 @@ def _setup_for_evaluating(self) -> None:
for task in self.crew.tasks:
task.callback = self.evaluate

def set_iteration(self, iteration: int) -> None:
self.iteration = iteration

def _evaluator_agent(self):
return Agent(
role="Task Execution Evaluator",
@@ -71,6 +69,9 @@ def _evaluation_task(
output_pydantic=TaskEvaluationPydanticOutput,
)

def set_iteration(self, iteration: int) -> None:
self.iteration = iteration

def print_crew_evaluation_result(self) -> None:
"""
Prints the evaluation result of the crew in a table.
@@ -119,6 +120,16 @@ def print_crew_evaluation_result(self) -> None:
]
table.add_row("Crew", *map(str, crew_scores), f"{crew_average:.1f}")

run_exec_times = [
int(sum(tasks_exec_times))
for _, tasks_exec_times in self.run_execution_times.items()
]
execution_time_avg = int(sum(run_exec_times) / len(run_exec_times))
table.add_row(
"Execution Time (s)",
*map(str, run_exec_times),
f"{execution_time_avg}",
)
# Display the table in the terminal
console = Console()
console.print(table)
@@ -145,5 +156,8 @@ def evaluate(self, task_output: TaskOutput):

if isinstance(evaluation_result.pydantic, TaskEvaluationPydanticOutput):
self.tasks_scores[self.iteration].append(evaluation_result.pydantic.quality)
self.run_execution_times[self.iteration].append(
current_task._execution_time
)
else:
raise ValueError("Evaluation result is not in the expected format")
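Note: for orientation, here is a hypothetical driver loop showing how these pieces are meant to be used together. It is not part of this diff; the real orchestration lives in the crew's test flow, and it assumes the evaluator (via its constructor or `_setup_for_evaluating`) registers `evaluator.evaluate` as each task's callback.

```python
# Hypothetical usage sketch; names outside CrewEvaluator's shown API are assumed.
from crewai.utilities.evaluators.crew_evaluator_handler import CrewEvaluator


def run_crew_evaluation(crew, n_iterations: int, model: str = "gpt-4o-mini") -> None:
    evaluator = CrewEvaluator(crew, openai_model_name=model)
    for i in range(1, n_iterations + 1):
        evaluator.set_iteration(i)
        crew.kickoff()  # each finished task calls evaluator.evaluate(task_output),
        # which records its quality score and its _execution_time for iteration i
    evaluator.print_crew_evaluation_result()  # includes the new "Execution Time (s)" row
```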
18 changes: 9 additions & 9 deletions tests/cli/cli_test.py
@@ -135,29 +135,29 @@ def test_version_command_with_tools(runner):
)


@mock.patch("crewai.cli.cli.test_crew")
def test_test_default_iterations(test_crew, runner):
@mock.patch("crewai.cli.cli.evaluate_crew")
def test_test_default_iterations(evaluate_crew, runner):
result = runner.invoke(test)

test_crew.assert_called_once_with(3, "gpt-4o-mini")
evaluate_crew.assert_called_once_with(3, "gpt-4o-mini")
assert result.exit_code == 0
assert "Testing the crew for 3 iterations with model gpt-4o-mini" in result.output


@mock.patch("crewai.cli.cli.test_crew")
def test_test_custom_iterations(test_crew, runner):
@mock.patch("crewai.cli.cli.evaluate_crew")
def test_test_custom_iterations(evaluate_crew, runner):
result = runner.invoke(test, ["--n_iterations", "5", "--model", "gpt-4o"])

test_crew.assert_called_once_with(5, "gpt-4o")
evaluate_crew.assert_called_once_with(5, "gpt-4o")
assert result.exit_code == 0
assert "Testing the crew for 5 iterations with model gpt-4o" in result.output


@mock.patch("crewai.cli.cli.test_crew")
def test_test_invalid_string_iterations(test_crew, runner):
@mock.patch("crewai.cli.cli.evaluate_crew")
def test_test_invalid_string_iterations(evaluate_crew, runner):
result = runner.invoke(test, ["--n_iterations", "invalid"])

test_crew.assert_not_called()
evaluate_crew.assert_not_called()
assert result.exit_code == 2
assert (
"Usage: test [OPTIONS]\nTry 'test --help' for help.\n\nError: Invalid value for '-n' / '--n_iterations': 'invalid' is not a valid integer.\n"
26 changes: 13 additions & 13 deletions tests/cli/test_crew_test.py
@@ -3,7 +3,7 @@

import pytest

from crewai.cli import test_crew
from crewai.cli import evaluate_crew


@pytest.mark.parametrize(
@@ -14,13 +14,13 @@
(10, "gpt-4"),
],
)
@mock.patch("crewai.cli.test_crew.subprocess.run")
@mock.patch("crewai.cli.evaluate_crew.subprocess.run")
def test_crew_success(mock_subprocess_run, n_iterations, model):
"""Test the crew function for successful execution."""
mock_subprocess_run.return_value = subprocess.CompletedProcess(
args=f"poetry run test {n_iterations} {model}", returncode=0
)
result = test_crew.test_crew(n_iterations, model)
result = evaluate_crew.evaluate_crew(n_iterations, model)

mock_subprocess_run.assert_called_once_with(
["poetry", "run", "test", str(n_iterations), model],
@@ -31,26 +31,26 @@ def test_crew_success(mock_subprocess_run, n_iterations, model):
assert result is None


@mock.patch("crewai.cli.test_crew.click")
@mock.patch("crewai.cli.evaluate_crew.click")
def test_test_crew_zero_iterations(click):
test_crew.test_crew(0, "gpt-4o")
evaluate_crew.evaluate_crew(0, "gpt-4o")
click.echo.assert_called_once_with(
"An unexpected error occurred: The number of iterations must be a positive integer.",
err=True,
)


@mock.patch("crewai.cli.test_crew.click")
@mock.patch("crewai.cli.evaluate_crew.click")
def test_test_crew_negative_iterations(click):
test_crew.test_crew(-2, "gpt-4o")
evaluate_crew.evaluate_crew(-2, "gpt-4o")
click.echo.assert_called_once_with(
"An unexpected error occurred: The number of iterations must be a positive integer.",
err=True,
)


@mock.patch("crewai.cli.test_crew.click")
@mock.patch("crewai.cli.test_crew.subprocess.run")
@mock.patch("crewai.cli.evaluate_crew.click")
@mock.patch("crewai.cli.evaluate_crew.subprocess.run")
def test_test_crew_called_process_error(mock_subprocess_run, click):
n_iterations = 5
mock_subprocess_run.side_effect = subprocess.CalledProcessError(
@@ -59,7 +59,7 @@ def test_test_crew_called_process_error(mock_subprocess_run, click):
output="Error",
stderr="Some error occurred",
)
test_crew.test_crew(n_iterations, "gpt-4o")
evaluate_crew.evaluate_crew(n_iterations, "gpt-4o")

mock_subprocess_run.assert_called_once_with(
["poetry", "run", "test", "5", "gpt-4o"],
@@ -78,13 +78,13 @@ def test_test_crew_called_process_error(mock_subprocess_run, click):
)


@mock.patch("crewai.cli.test_crew.click")
@mock.patch("crewai.cli.test_crew.subprocess.run")
@mock.patch("crewai.cli.evaluate_crew.click")
@mock.patch("crewai.cli.evaluate_crew.subprocess.run")
def test_test_crew_unexpected_exception(mock_subprocess_run, click):
# Arrange
n_iterations = 5
mock_subprocess_run.side_effect = Exception("Unexpected error")
test_crew.test_crew(n_iterations, "gpt-4o")
evaluate_crew.evaluate_crew(n_iterations, "gpt-4o")

mock_subprocess_run.assert_called_once_with(
["poetry", "run", "test", "5", "gpt-4o"],
5 changes: 5 additions & 0 deletions tests/utilities/evaluators/test_crew_evaluator_handler.py
@@ -84,6 +84,10 @@ def test_print_crew_evaluation_result(self, table, console, crew_planner):
1: [10, 9, 8],
2: [9, 8, 7],
}
crew_planner.run_execution_times = {
1: [24, 45, 66],
2: [55, 33, 67],
}

crew_planner.print_crew_evaluation_result()

@@ -98,6 +102,7 @@ def test_print_crew_evaluation_result(self, table, console, crew_planner):
mock.call().add_row("Task 2", "9", "8", "8.5"),
mock.call().add_row("Task 3", "8", "7", "7.5"),
mock.call().add_row("Crew", "9.0", "8.0", "8.5"),
mock.call().add_row("Execution Time (s)", "135", "155", "145"),
]
)
console.assert_has_calls([mock.call(), mock.call().print(table())])
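Note: the expected "Execution Time (s)" row in the mocked table follows directly from the aggregation added to `print_crew_evaluation_result()`: each iteration's task times are summed, and the per-run totals are averaged. A standalone sanity check of that arithmetic using the test data above:

```python
# Mirrors the aggregation logic added to print_crew_evaluation_result().
run_execution_times = {1: [24, 45, 66], 2: [55, 33, 67]}

run_exec_times = [int(sum(times)) for times in run_execution_times.values()]
execution_time_avg = int(sum(run_exec_times) / len(run_exec_times))

assert run_exec_times == [135, 155]
assert execution_time_avg == 145
```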
