diff --git a/docs/source/notebooks/tool_usage/benchmark_all_tasks.ipynb b/docs/source/notebooks/tool_usage/benchmark_all_tasks.ipynb
index af5764e..94611ad 100644
--- a/docs/source/notebooks/tool_usage/benchmark_all_tasks.ipynb
+++ b/docs/source/notebooks/tool_usage/benchmark_all_tasks.ipynb
@@ -14,7 +14,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "id": "13a7483b-d08f-49fa-83da-619863171e5b",
    "metadata": {
     "tags": []
@@ -36,6 +36,7 @@
     "    AnthropicToolUserFactory,\n",
     "    CustomAgentFactory,\n",
     "    OpenAIAgentFactory,\n",
+    "    OpenAIAssistantFactory,\n",
     ")"
    ]
   },
@@ -50,7 +51,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "id": "e0f2bb0c-c741-4fb4-96bc-54b3ee88bf5b",
    "metadata": {
     "tags": []
@@ -62,28 +63,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "id": "adfbcaa9-349c-4223-89be-4abff9cf76ff",
    "metadata": {
     "tags": []
    },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'input': '(2 + 5) and then to the power of 0.5',\n",
-       " 'output': 'The answer is 192.54605765894036.',\n",
-       " 'intermediate_steps': [(OpenAIToolAgentAction(tool='add', tool_input={'a': 2, 'b': 5}, log=\"\\nInvoking: `add` with `{'a': 2, 'b': 5}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_8149a232b2db4b0c92d8c80576f0047a', 'function': {'arguments': '{\"a\": 2, \"b\": 5}', 'name': 'add'}, 'type': 'function'}]})], tool_call_id='call_8149a232b2db4b0c92d8c80576f0047a'),\n",
-       "   8.2),\n",
-       "  (OpenAIToolAgentAction(tool='power', tool_input={'a': 8.2, 'b': 0.5}, log=\"\\nInvoking: `power` with `{'a': 8.2, 'b': 0.5}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_f6a87225d4bd45348e60cbbe622495cf', 'function': {'arguments': '{\"a\": 8.2, \"b\": 0.5}', 'name': 'power'}, 'type': 'function'}]})], tool_call_id='call_f6a87225d4bd45348e60cbbe622495cf'),\n",
-       "   192.54605765894036)]}"
-      ]
-     },
-     "execution_count": 3,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "agent_factory = OpenAIAgentFactory(\n",
     "    task, model=\"mistral-7b-instruct-v0.1\"\n",
@@ -102,7 +87,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "id": "066d7695-416c-4faf-8c33-c40e5f136672",
    "metadata": {
     "tags": []
@@ -122,7 +107,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "id": "26d390b6-9ade-424c-aabb-d450f52ed121",
    "metadata": {
     "tags": []
@@ -137,6 +122,8 @@
     "    (\"openai_functions\", \"gpt-3.5-turbo-0613\"),\n",
     "    (\"openai_functions\", \"gpt-4-1106-preview\"),\n",
     "    (\"openai_functions\", \"gpt-4-0613\"),\n",
+    "    (\"openai_assistant\", \"gpt-4-1106-preview\"),\n",
+    "    (\"openai_assistant\", \"gpt-3.5-turbo-1106\"),\n",
     "]"
    ]
   },
@@ -150,7 +137,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "id": "92a1894b-6232-421c-a243-567617cba083",
    "metadata": {
     "tags": []
@@ -162,650 +149,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "id": "e6fbc3ef-7a3f-430f-8b79-45af5861b3ee",
    "metadata": {
     "tags": []
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Dataset Tool Usage - Typewriter (1 tool) already exists. Skipping.\n",
-      "You can access the dataset at https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3.\n",
-      "\n",
-      "Benchmarking Tool Usage - Typewriter (1 tool) with model: claude-2.1 and arch: anthropic_tool_user\n",
-      "View the evaluation results for project 'claude-2.1-Tool Usage - Typewriter (1 tool)-2023-12-15-woof' at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3/compare?selectedSessions=2d749062-a6a5-474d-99ce-60f6c847375c\n",
-      "\n",
-      "View all tests for Dataset Tool Usage - Typewriter (1 tool) at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3\n",
-      "[------------------------------------------------->] 20/20\n",
-      "Benchmarking Tool Usage - Typewriter (1 tool) with model: mistral-7b-instruct-v0.1 and arch: openai_functions\n",
-      "View the evaluation results for project 'mistral-7b-instruct-v0.1-Tool Usage - Typewriter (1 tool)-2023-12-15-woof' at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3/compare?selectedSessions=3a3d0b38-33ab-405c-aac3-5c00192a89e7\n",
-      "\n",
-      "View all tests for Dataset Tool Usage - Typewriter (1 tool) at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3\n",
-      "[------------------------------------------------->] 20/20\n",
-      "Benchmarking Tool Usage - Typewriter (1 tool) with model: gpt-3.5-turbo-1106 and arch: openai_functions\n",
-      "View the evaluation results for project 'gpt-3.5-turbo-1106-Tool Usage - Typewriter (1 tool)-2023-12-15-woof' at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3/compare?selectedSessions=a507bd3e-fa6e-465d-a8e7-f343d32ddfc2\n",
-      "\n",
-      "View all tests for Dataset Tool Usage - Typewriter (1 tool) at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3\n",
-      "[------------------------------------------------->] 20/20\n",
-      "Benchmarking Tool Usage - Typewriter (1 tool) with model: gpt-3.5-turbo-0613 and arch: openai_functions\n",
-      "View the evaluation results for project 'gpt-3.5-turbo-0613-Tool Usage - Typewriter (1 tool)-2023-12-15-woof' at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3/compare?selectedSessions=31bd977f-418e-489a-bd98-87142720b153\n",
-      "\n",
-      "View all tests for Dataset Tool Usage - Typewriter (1 tool) at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3\n",
-      "[--------->                                        ] 4/20"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Chain failed for example 8a2b5450-dd16-4213-8b70-cb2583d6c7eb with inputs {'question': 'student'}\n",
-      "Error Type: BadRequestError, Message: Error code: 400 - {'error': {'message': \"An assistant message with 'tool_calls' must be followed by tool messages responding to each 'tool_call_id'. The following tool_call_ids did not have response messages: call_EXTKZSoSqyJufBYv64TAm018\", 'type': 'invalid_request_error', 'param': 'messages', 'code': None}}\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[------------------------------------------------->] 20/20\n",
-      "Benchmarking Tool Usage - Typewriter (1 tool) with model: gpt-4-1106-preview and arch: openai_functions\n",
-      "View the evaluation results for project 'gpt-4-1106-preview-Tool Usage - Typewriter (1 tool)-2023-12-15-woof' at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3/compare?selectedSessions=81e16b5e-ab5b-4886-86ec-c08c3d6b92fb\n",
-      "\n",
-      "View all tests for Dataset Tool Usage - Typewriter (1 tool) at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3\n",
-      "[------->                                          ] 3/20"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Chain failed for example c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2a with inputs {'question': 'communication'}\n",
-      "Error Type: BadRequestError, Message: Error code: 400 - {'error': {'message': \"An assistant message with 'tool_calls' must be followed by tool messages responding to each 'tool_call_id'. The following tool_call_ids did not have response messages: call_YaB1u6onJCRe3TBblCZaVxkA\", 'type': 'invalid_request_error', 'param': 'messages', 'code': None}}\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[------------------------------------------------->] 20/20\n",
-      "Benchmarking Tool Usage - Typewriter (1 tool) with model: gpt-4-0613 and arch: openai_functions\n",
-      "View the evaluation results for project 'gpt-4-0613-Tool Usage - Typewriter (1 tool)-2023-12-15-woof' at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3/compare?selectedSessions=5ca2a98a-8b30-4f90-97dd-9b2eefa9a591\n",
-      "\n",
-      "View all tests for Dataset Tool Usage - Typewriter (1 tool) at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3\n",
-      "[>                                                 ] 0/20"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Chain failed for example 08a17b26-0b33-4d4f-b7f6-ee44d51f4a4a with inputs {'question': 'university'}\n",
-      "Error Type: BadRequestError, Message: Error code: 400 - {'error': {'message': \"An assistant message with 'tool_calls' must be followed by tool messages responding to each 'tool_call_id'. The following tool_call_ids did not have response messages: call_X0Z2LtrSXeCpqF0qEdWPWMtH\", 'type': 'invalid_request_error', 'param': 'messages', 'code': None}}\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[->                                                ] 1/20"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Chain failed for example 9017ddcc-d3bd-45a8-88dd-70906964586b with inputs {'question': 'dictionary'}\n",
-      "Error Type: BadRequestError, Message: Error code: 400 - {'error': {'message': \"An assistant message with 'tool_calls' must be followed by tool messages responding to each 'tool_call_id'. The following tool_call_ids did not have response messages: call_vd5tg67PGyjHRdFZdZ5hgoF9\", 'type': 'invalid_request_error', 'param': 'messages', 'code': None}}\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[----------->                                      ] 5/20"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Chain failed for example 8a2b5450-dd16-4213-8b70-cb2583d6c7eb with inputs {'question': 'student'}\n",
-      "Error Type: BadRequestError, Message: Error code: 400 - {'error': {'message': \"An assistant message with 'tool_calls' must be followed by tool messages responding to each 'tool_call_id'. The following tool_call_ids did not have response messages: call_uc1oozJfGTvYEfIzzcsfXfOl\", 'type': 'invalid_request_error', 'param': 'messages', 'code': None}}\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[--------------------->                            ] 9/20"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Chain failed for example 5daad87c-a008-49ab-841c-76916b150f4d with inputs {'question': 'house'}\n",
-      "Error Type: BadRequestError, Message: Error code: 400 - {'error': {'message': \"An assistant message with 'tool_calls' must be followed by tool messages responding to each 'tool_call_id'. The following tool_call_ids did not have response messages: call_lZDuTxM5Thzwc1bfJVKsN7Pm\", 'type': 'invalid_request_error', 'param': 'messages', 'code': None}}\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[------------------------------------------------->] 20/20Dataset Tool Usage - Typewriter (26 tools) already exists. Skipping.\n",
-      "You can access the dataset at https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478.\n",
-      "\n",
-      "Benchmarking Tool Usage - Typewriter (26 tools) with model: claude-2.1 and arch: anthropic_tool_user\n",
-      "View the evaluation results for project 'claude-2.1-Tool Usage - Typewriter (26 tools)-2023-12-15-woof' at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478/compare?selectedSessions=59ef70f6-a0d2-4d3f-a1d9-265b9d736254\n",
-      "\n",
-      "View all tests for Dataset Tool Usage - Typewriter (26 tools) at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478\n",
-      "[------------------------------------------------->] 20/20\n",
-      "Benchmarking Tool Usage - Typewriter (26 tools) with model: mistral-7b-instruct-v0.1 and arch: openai_functions\n",
-      "View the evaluation results for project 'mistral-7b-instruct-v0.1-Tool Usage - Typewriter (26 tools)-2023-12-15-woof' at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478/compare?selectedSessions=2182247a-6973-472d-a25b-83444eddd5f4\n",
-      "\n",
-      "View all tests for Dataset Tool Usage - Typewriter (26 tools) at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478\n",
-      "[----------------------------------------->        ] 17/20"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Chain failed for example 23649150-3c39-4beb-ba5d-c50ff1c66c63 with inputs {'question': 'church'}\n",
-      "Error Type: InternalServerError, Message: Error code: 504 - {'generated_text': None, 'tool_calls': None, 'embedding_outputs': None, 'num_input_tokens': None, 'num_input_tokens_batch': None, 'num_generated_tokens': None, 'num_generated_tokens_batch': None, 'preprocessing_time': None, 'generation_time': None, 'timestamp': 1702701535.5182757, 'finish_reason': None, 'error': {'message': 'Your request has exceeded the timeout of 3 minutes. This may be caused by excessive traffic against Anyscale EndpointsPlease either use streaming to hold a longer connection, or update your prompt to shorten the response time. (Request ID: 8J-AVRZ2lIjAvoI83OBRYSd5VnsmsdPjU76WfZyRSOo)', 'internal_message': 'rayllm.backend.server.openai_compat.openai_exception.OpenAIHTTPException: 504 (Request ID: 8J-AVRZ2lIjAvoI83OBRYSd5VnsmsdPjU76WfZyRSOo)', 'code': 504, 'type': 'OpenAIHTTPException', 'param': {}}, 'num_total_tokens': 0, 'num_total_tokens_batch': 0, 'total_time': None}\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[-------------------------------------------->     ] 18/20"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Chain failed for example 818869de-fc7c-45e6-9d24-2871eda9c081 with inputs {'question': 'hand'}\n",
-      "Error Type: InternalServerError, Message: Error code: 504 - {'generated_text': None, 'tool_calls': None, 'embedding_outputs': None, 'num_input_tokens': None, 'num_input_tokens_batch': None, 'num_generated_tokens': None, 'num_generated_tokens_batch': None, 'preprocessing_time': None, 'generation_time': None, 'timestamp': 1702701543.895307, 'finish_reason': None, 'error': {'message': 'Your request has exceeded the timeout of 3 minutes. This may be caused by excessive traffic against Anyscale EndpointsPlease either use streaming to hold a longer connection, or update your prompt to shorten the response time. (Request ID: FuqWl9qSiivXLxvy_ZzLnl1js_I7ZOUZzIGno_iqPmU)', 'internal_message': 'rayllm.backend.server.openai_compat.openai_exception.OpenAIHTTPException: 504 (Request ID: FuqWl9qSiivXLxvy_ZzLnl1js_I7ZOUZzIGno_iqPmU)', 'code': 504, 'type': 'OpenAIHTTPException', 'param': {}}, 'num_total_tokens': 0, 'num_total_tokens_batch': 0, 'total_time': None}\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[----------------------------------------------->  ] 19/20"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Chain failed for example 607d5f26-c165-4034-b5f9-0f592913cb71 with inputs {'question': 'dog'}\n",
-      "Error Type: InternalServerError, Message: Error code: 504 - {'generated_text': None, 'tool_calls': None, 'embedding_outputs': None, 'num_input_tokens': None, 'num_input_tokens_batch': None, 'num_generated_tokens': None, 'num_generated_tokens_batch': None, 'preprocessing_time': None, 'generation_time': None, 'timestamp': 1702701547.4154704, 'finish_reason': None, 'error': {'message': 'Your request has exceeded the timeout of 3 minutes. This may be caused by excessive traffic against Anyscale EndpointsPlease either use streaming to hold a longer connection, or update your prompt to shorten the response time. (Request ID: 3D-YtEj-K6-A3VswxzH0TgUdRboWP52pXwXCTDqnxCU)', 'internal_message': 'rayllm.backend.server.openai_compat.openai_exception.OpenAIHTTPException: 504 (Request ID: 3D-YtEj-K6-A3VswxzH0TgUdRboWP52pXwXCTDqnxCU)', 'code': 504, 'type': 'OpenAIHTTPException', 'param': {}}, 'num_total_tokens': 0, 'num_total_tokens_batch': 0, 'total_time': None}\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[------------------------------------------------->] 20/20\n",
-      "Benchmarking Tool Usage - Typewriter (26 tools) with model: gpt-3.5-turbo-1106 and arch: openai_functions\n",
-      "View the evaluation results for project 'gpt-3.5-turbo-1106-Tool Usage - Typewriter (26 tools)-2023-12-15-woof' at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478/compare?selectedSessions=a7a8b8e7-cefa-4147-ba34-929ac5bc12dd\n",
-      "\n",
-      "View all tests for Dataset Tool Usage - Typewriter (26 tools) at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478\n",
-      "[>                                                 ] 0/20"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Chain failed for example 54e4c8e2-d85b-4652-99e8-f91496bc6c4e with inputs {'question': 'university'}\n",
-      "Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 0f3c71115a5a990643ef7d0be5eb0b7c in your email.)', 'type': 'server_error', 'param': None, 'code': None}}\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[---->                                             ] 2/20"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Chain failed for example e5bc36cc-e077-4a6f-80a0-9ce1794a77e5 with inputs {'question': 'communication'}\n",
-      "Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 086c0205400563e5fe66c893367c58db in your email.)', 'type': 'server_error', 'param': None, 'code': None}}\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[--------->                                        ] 4/20"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Chain failed for example ff31e6be-4d37-4c29-b869-9b3a2075ff25 with inputs {'question': 'computer'}\n",
-      "Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 86a3452f2c554778caa5fcc4b7c3a5bc in your email.)', 'type': 'server_error', 'param': None, 'code': None}}\n",
-      "Chain failed for example 80264d75-4aa4-484a-bbdc-4f82f930a69d with inputs {'question': 'student'}\n",
-      "Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 4b43112cf53a05987bd46a908574349c in your email.)', 'type': 'server_error', 'param': None, 'code': None}}\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[-------------->                                   ] 6/20"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Chain failed for example f78f3a19-9f51-474c-9632-c5c43ff8da2e with inputs {'question': 'teacher'}\n",
-      "Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 95ab8156ff646e7b4048b5d3a784e7d9 in your email.)', 'type': 'server_error', 'param': None, 'code': None}}\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[----------------->                                ] 7/20"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Chain failed for example 23649150-3c39-4beb-ba5d-c50ff1c66c63 with inputs {'question': 'church'}\n",
-      "Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID c2f5b270d0e638f0a3c059540d6aefe6 in your email.)', 'type': 'server_error', 'param': None, 'code': None}}\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[------------------->                              ] 8/20"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Chain failed for example 0e6b2b64-57b1-4e8a-8611-4edab1cea326 with inputs {'question': 'school'}\n",
-      "Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 968fcd16776b0c7364df1b214a1c6b1a in your email.)', 'type': 'server_error', 'param': None, 'code': None}}\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[--------------------->                            ] 9/20"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Chain failed for example 8af5bd36-fc11-4b23-9019-f642cfaf8a01 with inputs {'question': 'horse'}\n",
-      "Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID fcd45b1b69a26065316de72fd0bec5ee in your email.)', 'type': 'server_error', 'param': None, 'code': None}}\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[------------------------>                         ] 10/20"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Chain failed for example c1a0336d-ae2f-4cf3-a204-58eec17330e7 with inputs {'question': 'house'}\n",
-      "Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 18a0312e0c33e0e24381676b67751b60 in your email.)', 'type': 'server_error', 'param': None, 'code': None}}\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[--------------------------->                      ] 11/20"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Chain failed for example 7083dccd-2397-47ab-b2c4-216e6177f5eb with inputs {'question': 'head'}\n",
-      "Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 74e03a7a3d7e58300261063f424c120d in your email.)', 'type': 'server_error', 'param': None, 'code': None}}\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[----------------------------->                    ] 12/20"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Chain failed for example 818869de-fc7c-45e6-9d24-2871eda9c081 with inputs {'question': 'hand'}\n",
-      "Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 93a3d69e89477c416c13e3b68fcc6cf2 in your email.)', 'type': 'server_error', 'param': None, 'code': None}}\n",
-      "Chain failed for example de38ad8a-ca82-44d6-a4ba-ff3bd5a6640e with inputs {'question': 'cat'}\n",
-      "Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 5b1f5f98c1aba8c9e4b1730dd679a0ba in your email.)', 'type': 'server_error', 'param': None, 'code': None}}\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[---------------------------------->               ] 14/20"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Chain failed for example 607d5f26-c165-4034-b5f9-0f592913cb71 with inputs {'question': 'dog'}\n",
-      "Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID dafebdfdd71fba8868825a6daefa530a in your email.)', 'type': 'server_error', 'param': None, 'code': None}}\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[----------------------------------------------->  ] 19/20"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Chain failed for example 45a3aa01-9158-4adc-807b-b79c5e11e7db with inputs {'question': 'dictionary'}\n",
-      "Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID b0949a00e6975d982ce3b593e6c52808 in your email.)', 'type': 'server_error', 'param': None, 'code': None}}\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[------------------------------------------------->] 20/20\n",
-      "Benchmarking Tool Usage - Typewriter (26 tools) with model: gpt-3.5-turbo-0613 and arch: openai_functions\n",
-      "View the evaluation results for project 'gpt-3.5-turbo-0613-Tool Usage - Typewriter (26 tools)-2023-12-15-woof' at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478/compare?selectedSessions=d13d0ec3-151b-463a-85dd-ea6564f8c8c9\n",
-      "\n",
-      "View all tests for Dataset Tool Usage - Typewriter (26 tools) at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478\n",
-      "[>                                                 ] 0/20"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Chain failed for example 2d4e99fc-8495-468e-8429-6c25a2d176f3 with inputs {'question': 'keyboard'}\n",
-      "Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 3143d6b7e43251f800acb20f1c5b15cf in your email.)', 'type': 'server_error', 'param': None, 'code': None}}\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[-------------->                                   ] 6/20"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Chain failed for example 8af5bd36-fc11-4b23-9019-f642cfaf8a01 with inputs {'question': 'horse'}\n",
-      "Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 4706b2d842a40fcadb5a5e1a38bdec55 in your email.)', 'type': 'server_error', 'param': None, 'code': None}}\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[------------------------------------------------->] 20/20\n",
-      "Benchmarking Tool Usage - Typewriter (26 tools) with model: gpt-4-1106-preview and arch: openai_functions\n",
-      "View the evaluation results for project 'gpt-4-1106-preview-Tool Usage - Typewriter (26 tools)-2023-12-15-woof' at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478/compare?selectedSessions=1c885b09-212d-4608-bd3b-6e1b27964a88\n",
-      "\n",
-      "View all tests for Dataset Tool Usage - Typewriter (26 tools) at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478\n",
-      "[------------------------------------------------->] 20/20\n",
-      "Benchmarking Tool Usage - Typewriter (26 tools) with model: gpt-4-0613 and arch: openai_functions\n",
-      "View the evaluation results for project 'gpt-4-0613-Tool Usage - Typewriter (26 tools)-2023-12-15-woof' at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478/compare?selectedSessions=c7b1bbec-a395-4915-afd0-0c4a8935ff95\n",
-      "\n",
-      "View all tests for Dataset Tool Usage - Typewriter (26 tools) at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478\n",
-      "[------->                                          ] 3/20"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Chain failed for example c4af9bd6-84c5-4b25-ac0a-04c307fc7441 with inputs {'question': 'information'}\n",
-      "Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID fa4be24c98131a3426aebab3538f9a92 in your email.)', 'type': 'server_error', 'param': None, 'code': None}}\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[-------------->                                   ] 6/20"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Chain failed for example e5bc36cc-e077-4a6f-80a0-9ce1794a77e5 with inputs {'question': 'communication'}\n",
-      "Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID a1bc46fafb2620ed793db3a29345e369 in your email.)', 'type': 'server_error', 'param': None, 'code': None}}\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[------------------------>                         ] 10/20"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Chain failed for example f78f3a19-9f51-474c-9632-c5c43ff8da2e with inputs {'question': 'teacher'}\n",
-      "Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 8a3c32e3a21f70d19927fe4a69f30bfc in your email.)', 'type': 'server_error', 'param': None, 'code': None}}\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[------------------------------------------------->] 20/20Dataset Tool Usage - Relational Data already exists. Skipping.\n",
-      "You can access the dataset at https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826.\n",
-      "\n",
-      "Benchmarking Tool Usage - Relational Data with model: claude-2.1 and arch: anthropic_tool_user\n",
-      "View the evaluation results for project 'claude-2.1-Tool Usage - Relational Data-2023-12-15-woof' at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826/compare?selectedSessions=1aabaef6-fc88-4771-8211-d67a16810d1d\n",
-      "\n",
-      "View all tests for Dataset Tool Usage - Relational Data at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826\n",
-      "[-------------------------------->                 ] 14/21"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Chain failed for example f8657d6a-1cd9-4f9e-84e1-0f4a0703e494 with inputs {'question': 'Is it likely that Donna is awake right now?'}\n",
-      "Error Type: ValueError, Message: invalid literal for int() with base 10: 'find_users_by_name(\"Donna\")[0].id'\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[----------------------------------->              ] 15/21"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Chain failed for example 33002acd-a844-4a3f-8a4e-99fdf3af1cb6 with inputs {'question': 'Is it likely that Donna is outside with an umbrella at this time?'}\n",
-      "Error Type: ValueError, Message: invalid literal for int() with base 10: 'find_users_by_name(\"Donna\")[0].id'\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[------------------------------------------------->] 21/21\n",
-      "Benchmarking Tool Usage - Relational Data with model: mistral-7b-instruct-v0.1 and arch: openai_functions\n",
-      "View the evaluation results for project 'mistral-7b-instruct-v0.1-Tool Usage - Relational Data-2023-12-15-woof' at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826/compare?selectedSessions=a60f355a-94c6-47e9-8a14-9a26a465de28\n",
-      "\n",
-      "View all tests for Dataset Tool Usage - Relational Data at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826\n",
-      "[------------------------------------------------->] 21/21\n",
-      "Benchmarking Tool Usage - Relational Data with model: gpt-3.5-turbo-1106 and arch: openai_functions\n",
-      "View the evaluation results for project 'gpt-3.5-turbo-1106-Tool Usage - Relational Data-2023-12-15-woof' at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826/compare?selectedSessions=92494613-1ca6-449a-8dc9-cd7b7e69daa9\n",
-      "\n",
-      "View all tests for Dataset Tool Usage - Relational Data at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826\n",
-      "[------------------------------------------------->] 21/21\n",
-      "Benchmarking Tool Usage - Relational Data with model: gpt-3.5-turbo-0613 and arch: openai_functions\n",
-      "View the evaluation results for project 'gpt-3.5-turbo-0613-Tool Usage - Relational Data-2023-12-15-woof' at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826/compare?selectedSessions=4f3e45a6-5d64-4a6f-814f-e28d9efd88f1\n",
-      "\n",
-      "View all tests for Dataset Tool Usage - Relational Data at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826\n",
-      "[------------------------------------------------->] 21/21\n",
-      "Benchmarking Tool Usage - Relational Data with model: gpt-4-1106-preview and arch: openai_functions\n",
-      "View the evaluation results for project 'gpt-4-1106-preview-Tool Usage - Relational Data-2023-12-15-woof' at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826/compare?selectedSessions=6997e3a9-ac1c-4c7c-851e-e921f62ce286\n",
-      "\n",
-      "View all tests for Dataset Tool Usage - Relational Data at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826\n",
-      "[------------------------------------------------->] 21/21\n",
-      "Benchmarking Tool Usage - Relational Data with model: gpt-4-0613 and arch: openai_functions\n",
-      "View the evaluation results for project 'gpt-4-0613-Tool Usage - Relational Data-2023-12-15-woof' at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826/compare?selectedSessions=62b1bd0c-0119-4b3e-8f03-069227b84cb0\n",
-      "\n",
-      "View all tests for Dataset Tool Usage - Relational Data at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826\n",
-      "[------------------------------------------------->] 21/21Dataset Multiverse Math already exists. Skipping.\n",
-      "You can access the dataset at https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/108bdc68-1808-4b60-92ef-fbd9bd7e1ad0.\n",
-      "\n",
-      "Benchmarking Multiverse Math with model: claude-2.1 and arch: anthropic_tool_user\n",
-      "View the evaluation results for project 'claude-2.1-Multiverse Math-2023-12-15-woof' at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/108bdc68-1808-4b60-92ef-fbd9bd7e1ad0/compare?selectedSessions=aa4ee3ef-4b4e-44ed-97bf-bd904ef700d9\n",
-      "\n",
-      "View all tests for Dataset Multiverse Math at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/108bdc68-1808-4b60-92ef-fbd9bd7e1ad0\n",
-      "[------------------------------------------------->] 10/10\n",
-      "Benchmarking Multiverse Math with model: mistral-7b-instruct-v0.1 and arch: openai_functions\n",
-      "View the evaluation results for project 'mistral-7b-instruct-v0.1-Multiverse Math-2023-12-15-woof' at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/108bdc68-1808-4b60-92ef-fbd9bd7e1ad0/compare?selectedSessions=114f4155-40ca-440a-9b17-adef2a3c2590\n",
-      "\n",
-      "View all tests for Dataset Multiverse Math at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/108bdc68-1808-4b60-92ef-fbd9bd7e1ad0\n",
-      "[--------->                                        ] 2/10"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Chain failed for example 2a20a13d-050e-4a16-84ff-22d9582f1449 with inputs {'question': 'after calculating the sin of 1.5 radians, divide the result by cos of 1.5 radians'}\n",
-      "Error Type: ZeroDivisionError, Message: float division by zero\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[------------------------------------------------->] 10/10\n",
-      "Benchmarking Multiverse Math with model: gpt-3.5-turbo-1106 and arch: openai_functions\n",
-      "View the evaluation results for project 'gpt-3.5-turbo-1106-Multiverse Math-2023-12-15-woof' at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/108bdc68-1808-4b60-92ef-fbd9bd7e1ad0/compare?selectedSessions=91df1a9b-d6d3-44aa-99c6-6e98401f3ac7\n",
-      "\n",
-      "View all tests for Dataset Multiverse Math at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/108bdc68-1808-4b60-92ef-fbd9bd7e1ad0\n",
-      "[------------------------------------------------->] 10/10\n",
-      "Benchmarking Multiverse Math with model: gpt-3.5-turbo-0613 and arch: openai_functions\n",
-      "View the evaluation results for project 'gpt-3.5-turbo-0613-Multiverse Math-2023-12-15-woof' at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/108bdc68-1808-4b60-92ef-fbd9bd7e1ad0/compare?selectedSessions=b934ef76-8ad1-4b71-bae0-b95d4a1ef7d5\n",
-      "\n",
-      "View all tests for Dataset Multiverse Math at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/108bdc68-1808-4b60-92ef-fbd9bd7e1ad0\n",
-      "[------------------------------------------------->] 10/10\n",
-      "Benchmarking Multiverse Math with model: gpt-4-1106-preview and arch: openai_functions\n",
-      "View the evaluation results for project 'gpt-4-1106-preview-Multiverse Math-2023-12-15-woof' at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/108bdc68-1808-4b60-92ef-fbd9bd7e1ad0/compare?selectedSessions=df46ee08-01da-40d2-a992-933df0e0c2d7\n",
-      "\n",
-      "View all tests for Dataset Multiverse Math at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/108bdc68-1808-4b60-92ef-fbd9bd7e1ad0\n",
-      "[-------------->                                   ] 3/10"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Chain failed for example 67867526-791a-452f-b534-ef2c1f5efd20 with inputs {'question': 'ecoli divides every 20 minutes. How many cells will be there after 2 hours if we start with 5 cells?'}\n",
-      "Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 28a9d136aed7faacc59b14d4351430dd in your email.)', 'type': 'server_error', 'param': None, 'code': None}}\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[------------------------------------------------->] 10/10\n",
-      "Benchmarking Multiverse Math with model: gpt-4-0613 and arch: openai_functions\n",
-      "View the evaluation results for project 'gpt-4-0613-Multiverse Math-2023-12-15-woof' at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/108bdc68-1808-4b60-92ef-fbd9bd7e1ad0/compare?selectedSessions=ea96a024-5987-47d6-9650-832bbbd4aa67\n",
-      "\n",
-      "View all tests for Dataset Multiverse Math at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/108bdc68-1808-4b60-92ef-fbd9bd7e1ad0\n",
-      "[------------------------------------------------->] 10/10"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "client = Client()  # Launch langsmith client for cloning datasets\n",
     "today = datetime.date.today().isoformat()\n",
@@ -833,6 +182,8 @@
     "            )\n",
     "        elif arch == \"anthropic_tool_user\":\n",
     "            agent_factory = AnthropicToolUserFactory(task)\n",
+    "        elif arch == \"openai_assistant\":\n",
+    "            agent_factory = OpenAIAssistantFactory(task, model=model)\n",
     "        else:\n",
     "            raise ValueError()\n",
     "\n",
@@ -1070,7 +421,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.4"
+   "version": "3.11.2"
   }
  },
  "nbformat": 4,