camel-ai · Wendong-Fan · Oct 29, 2024 · Oct 22, 2024 · Oct 26, 2024 · Oct 26, 2024
diff --git a/camel/loaders/__init__.py b/camel/loaders/__init__.py
@@ -12,6 +12,7 @@
 # limitations under the License.
 # =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
 
+from .apify_reader import Apify
 from .base_io import File
 from .chunkr_reader import ChunkrReader
 from .firecrawl_reader import Firecrawl
@@ -23,5 +24,6 @@
     'UnstructuredIO',
     'JinaURLReader',
     'Firecrawl',
+    'Apify',
     'ChunkrReader',
 ]
diff --git a/camel/loaders/apify_reader.py b/camel/loaders/apify_reader.py
@@ -0,0 +1,223 @@
+# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
+import os
+from typing import TYPE_CHECKING, List, Optional
+
+if TYPE_CHECKING:
+    from apify_client.clients import DatasetClient
+
+from camel.utils import api_keys_required
+
+
+class Apify:
+    r"""Client for Apify, a platform for automating web workflows.
+
+    Args:
+        api_key (Optional[str]): Apify API key; env APIFY_API_KEY if omitted.
+    """
+
+    @api_keys_required("APIFY_API_KEY")
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+    ) -> None:
+        from apify_client import ApifyClient  # deferred until instantiation
+
+        self._api_key = api_key or os.environ.get("APIFY_API_KEY")
+        self.client = ApifyClient(token=self._api_key)
+
+    def run_actor(
+        self,
+        actor_id: str,
+        run_input: Optional[dict] = None,
+        content_type: Optional[str] = None,
+        build: Optional[str] = None,
+        max_items: Optional[int] = None,
+        memory_mbytes: Optional[int] = None,
+        timeout_secs: Optional[int] = None,
+        webhooks: Optional[list] = None,
+        wait_secs: Optional[int] = None,
+    ) -> Optional[dict]:
+        r"""Run an actor on the Apify platform and return the run details.
+
+        Args:
+            actor_id (str): The ID of the actor to run.
+            run_input (dict, optional): The input data for the actor. Defaults
+                to `None`.
+            content_type (str, optional): The content type of the input.
+            build (str, optional): Specifies the Actor build to run. It can be
+                either a build tag or build number. By default, the run uses
+                the build specified in the default run configuration for the
+                Actor (typically latest).
+            max_items (int, optional): Maximum number of results that will be
+                returned by this run. If the Actor is charged per result, you
+                will not be charged for more results than the given limit.
+            memory_mbytes (int, optional): Memory limit for the run, in
+                megabytes. By default, the run uses a memory limit specified in
+                the default run configuration for the Actor.
+            timeout_secs (int, optional): Optional timeout for the run, in
+                seconds. By default, the run uses timeout specified in the
+                default run configuration for the Actor.
+            webhooks (list, optional): Optional webhooks
+                (https://docs.apify.com/webhooks) associated with the Actor
+                run, which can be used to receive a notification, e.g. when the
+                Actor finished or failed. If you already have a webhook set up
+                for the Actor, you do not have to add it again here.
+            wait_secs (int, optional): The maximum number of seconds to wait
+                for the run to finish. If not provided, waits indefinitely.
+
+        Returns:
+            Optional[dict]: The actor run details returned by the client.
+                Use its 'defaultDatasetId' entry to look up the dataset.
+
+        Raises:
+            RuntimeError: If the Apify client call raises any exception.
+        """
+        try:
+            return self.client.actor(actor_id).call(
+                run_input=run_input,
+                content_type=content_type,
+                build=build,
+                max_items=max_items,
+                memory_mbytes=memory_mbytes,
+                timeout_secs=timeout_secs,
+                webhooks=webhooks,
+                wait_secs=wait_secs,
+            )
+        except Exception as e:
+            raise RuntimeError(f"Failed to run actor {actor_id}: {e}") from e
+
+    def get_dataset_client(
+        self,
+        dataset_id: str,
+    ) -> "DatasetClient":
+        r"""Get a client object for one dataset on the Apify platform.
+
+        Args:
+            dataset_id (str): The ID of the dataset to get the client for.
+
+        Returns:
+            DatasetClient: Client bound to the given dataset ID.
+
+        Raises:
+            RuntimeError: If creating the dataset client raises an exception.
+        """
+        try:
+            return self.client.dataset(dataset_id)
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to get dataset {dataset_id}: {e}"
+            ) from e
+
+    def get_dataset(
+        self,
+        dataset_id: str,
+    ) -> Optional[dict]:
+        r"""Fetch a dataset from the Apify platform via its dataset client.
+
+        Args:
+            dataset_id (str): The ID of the dataset to get.
+
+        Returns:
+            Optional[dict]: The dataset as returned by the dataset client.
+
+        Raises:
+            RuntimeError: If retrieving the dataset raises any exception.
+        """
+        try:
+            return self.get_dataset_client(dataset_id).get()
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to get dataset {dataset_id}: {e}"
+            ) from e
+
+    def update_dataset(
+        self,
+        dataset_id: str,
+        name: str,
+    ) -> dict:
+        r"""Rename a dataset on the Apify platform.
+
+        Args:
+            dataset_id (str): The ID of the dataset to update.
+            name (str): The new name for the dataset.
+
+        Returns:
+            dict: The updated dataset.
+
+        Raises:
+            RuntimeError: If updating the dataset raises any exception.
+        """
+        try:
+            return self.get_dataset_client(dataset_id).update(name=name)
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to update dataset {dataset_id}: {e}"
+            ) from e
+
+    def get_dataset_items(
+        self,
+        dataset_id: str,
+    ) -> List:
+        r"""Get items from a dataset on the Apify platform.
+
+        Args:
+            dataset_id (str): The ID of the dataset to get items from.
+
+        Returns:
+            List: The `items` of the dataset client's `list_items()` result.
+
+        Raises:
+            RuntimeError: If retrieving the items raises any exception.
+        """
+        try:
+            items = self.get_dataset_client(dataset_id).list_items().items
+            return items
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to get dataset items {dataset_id}: {e}"
+            ) from e
+
+    def get_datasets(
+        self,
+        unnamed: Optional[bool] = None,
+        limit: Optional[int] = None,
+        offset: Optional[int] = None,
+        desc: Optional[bool] = None,
+    ) -> List[dict]:
+        r"""List datasets from the Apify platform.
+
+        Args:
+            unnamed (bool, optional): Whether to include unnamed datasets
+                in the list
+            limit (int, optional): How many datasets to retrieve
+            offset (int, optional): What dataset to include as first
+                when retrieving the list
+            desc (bool, optional): Whether to sort the datasets in
+                descending order based on their modification date
+
+        Returns:
+            List[dict]: The datasets.
+
+        Raises:
+            RuntimeError: If listing the datasets raises any exception.
+        """
+        try:
+            return (
+                self.client.datasets()
+                .list(unnamed=unnamed, limit=limit, offset=offset, desc=desc)
+                .items
+            )
+        except Exception as e:
+            raise RuntimeError(f"Failed to get datasets: {e}") from e
diff --git a/examples/loaders/apify_example.py b/examples/loaders/apify_example.py
@@ -0,0 +1,82 @@
+# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
+
+from camel.loaders import Apify
+
+apify = Apify()  # no api_key given, so APIFY_API_KEY is read from the env
+
+run_input = {  # actor input: one start URL, crawl depth 0, at most 1 page
+    "startUrls": [{"url": "https://www.camel-ai.org/"}],
+    "maxCrawlDepth": 0,
+    "maxCrawlPages": 1,
+}
+actor_result = apify.run_actor(
+    actor_id="apify/website-content-crawler", run_input=run_input
+)
+
+print(actor_result["status"])  # NOTE: run_actor is typed Optional[dict]
+'''
+===============================================================================
+SUCCEEDED
+===============================================================================
+'''
+
+print(actor_result["defaultDatasetId"])
+'''
+===============================================================================
+HoKb31FJzHm0ni51k
+===============================================================================
+'''
+
+dataset_result = apify.get_dataset_items(  # items of the run's dataset
+    dataset_id=actor_result["defaultDatasetId"]
+)
+
+print(dataset_result)
+'''
+===============================================================================
+[{'url': 'https://www.camel-ai.org/', 'crawl': {'loadedUrl': 'https://www.camel
+-ai.org/', 'loadedTime': '2024-10-27T04:51:16.651Z', 'referrerUrl': 'https://ww
+w.camel-ai.org/', 'depth': 0, 'httpStatusCode': 200}, 'metadata': {'canonicalUr
+l': 'https://www.camel-ai.org/', 'title': 'CAMEL-AI', 'description': 'CAMEL-AI.
+org is the 1st LLM multi-agent framework and an open-source community dedicated
+ to finding the scaling law of agents.', 'author': None, 'keywords': None, 'lan
+ guageCode': 'en', 'openGraph': [{'property': 'og:title', 'content': 'CAMEL-AI'
+ }, {'property': 'og:description', 'content': 'CAMEL-AI.org is the 1st LLM mult
+ i-agent framework and an open-source community dedicated to finding the scalin
+ g law of agents.'}, {'property': 'twitter:title', 'content': 'CAMEL-AI'}, {'pr
+ operty': 'twitter:description', 'content': 'CAMEL-AI.org is the 1st LLM multi-
+ agent framework and an open-source community dedicated to finding the scaling 
+ law of agents.'}, {'property': 'og:type', 'content': 'website'}], 'jsonLd': No
+ ne, 'headers': {'date': 'Sun, 27 Oct 2024 04:50:18 GMT', 'content-type': 'text
+ /html', 'cf-ray': '8d901082dae7efbe-PDX', 'cf-cache-status': 'HIT', 'age': '10
+ 81', 'content-encoding': 'gzip', 'last-modified': 'Sat, 26 Oct 2024 11:51:32 G
+ MT', 'strict-transport-security': 'max-age=31536000', 'surrogate-control': 'ma
+ x-age=432000', 'surrogate-key': 'www.camel-ai.org 6659a154491a54a40551bc78 pag
+ eId:6686a2bcb7ece5fb40457491 668181a0a818ade34e653b24 6659a155491a54a40551bd7e
+ ', 'x-lambda-id': 'd6c4424b-ac67-4c54-b52a-cb2a22ca09f0', 'vary': 'Accept-Enco
+ ding', 'set-cookie': '__cf_bm=oX5EmZ2SNJDOBQXI8dL_reCYlCpp1FMzu40qCNxiopU-1730
+ 004618-1.0.1.1-3teEeqUoemzHWAeGCtlPJVqdmAbiFkyu3JxopKfQFFndSCi_Z56RR.UDjLGZiq.
+ L_4LvAZYmNKxQ.k6VRhbA7g; path=/; expires=Sun, 27-Oct-24 05:20:18 GMT; domain=.
+ cdn.webflow.com; HttpOnly; Secure; SameSite=None\n_cfuvid=om_8lj9jNMIh.HEIxEAq
+ gszhEWaKlyS2kdXKwqGedSM-1730004618924-0.0.1.1-604800000; path=/; domain=.cdn.w
+ ebflow.com; HttpOnly; Secure; SameSite=None', 'alt-svc': 'h3=":443"; ma=86400'
+ , 'x-cluster-name': 'us-west-2-prod-hosting-red', 'x-firefox-spdy': 'h2'}}, 's
+ creenshotUrl': None, 'text': 'Build Multi-Agent Systems for _\nFEATURES & Inte
+ grations\nSeamless integrations with\npopular platforms \nScroll to explore ou
+ r features & integrations.', 'markdown': '# Build Multi-Agent Systems for \\_
+ \n\nFEATURES & Integrations\n\n## Seamless integrations with  \npopular platfo
+ rms\n\nScroll to explore our features & integrations.'}]
+===============================================================================
+'''