feat: Improve project bootstrapping #538

Merged · 19 commits · Dec 2, 2024

Changes from 14 commits
158 changes: 125 additions & 33 deletions src/crawlee/_cli.py
@@ -1,21 +1,27 @@
 # ruff: noqa: TRY301, FBT002, UP007
 from __future__ import annotations
 
-import os
+import importlib.resources
+import json
+from pathlib import Path
 from typing import Annotated, Optional, cast
 
 import httpx
 import inquirer  # type: ignore[import-untyped]
 import typer
 from cookiecutter.main import cookiecutter  # type: ignore[import-untyped]
 from inquirer.render.console import ConsoleRender  # type: ignore[import-untyped]
 from rich.progress import Progress, SpinnerColumn, TextColumn
 
-TEMPLATE_LIST_URL = 'https://api.github.com/repos/apify/crawlee-python/contents/templates'
-
 cli = typer.Typer(no_args_is_help=True)
 
+template_directory = importlib.resources.files('crawlee') / 'project_template'
+cookiecutter_json = json.load((template_directory / 'cookiecutter.json').open())
+
+crawler_choices = cookiecutter_json['crawler_type']
+http_client_choices = cookiecutter_json['http_client']
+package_manager_choices = cookiecutter_json['package_manager']
+default_start_url = cookiecutter_json['start_url']
 
 
 @cli.callback(invoke_without_command=True)
 def callback(
@@ -64,25 +70,42 @@ def _prompt_for_project_name(initial_project_name: str | None) -> str:
     return project_name
 
 
-def _prompt_for_template() -> str:
-    """Prompt the user to select a template from a list."""
-    # Fetch available templates
-    response = httpx.get(
-        TEMPLATE_LIST_URL,
-        timeout=httpx.Timeout(10),
-        headers=[('Authorization', f'Bearer {os.environ["GH_TOKEN"]}')] if 'GH_TOKEN' in os.environ else [],
-    )
-    response.raise_for_status()
-    template_choices = [item['name'] for item in response.json() if item['type'] == 'dir']
-
-    # Prompt for template choice
+def _prompt_text(message: str, default: str) -> str:
+    return cast(
+        str,
+        ConsoleRender().render(
+            inquirer.Text(
+                name='text',
+                message=message,
+                default=default,
+                validate=lambda _, value: bool(value.strip()),
+            ),
+        ),
+    )
+
+
+def _prompt_choice(message: str, choices: list[str]) -> str:
+    """Prompt the user to pick one from a list of choices."""
     return cast(
         str,
         ConsoleRender().render(
             inquirer.List(
-                name='template',
-                message='Please select the template for your new Crawlee project',
-                choices=[(choice[0].upper() + choice[1:], choice) for choice in template_choices],
+                name='choice',
+                message=message,
+                choices=[(choice[0].upper() + choice[1:], choice) for choice in choices],
             ),
         ),
     )
+
+
+def _prompt_bool(message: str, *, default: bool) -> bool:
+    return cast(
+        bool,
+        ConsoleRender().render(
+            inquirer.Confirm(
+                name='confirm',
+                message=message,
+                default=default,
+            ),
+        ),
+    )
@@ -92,26 +115,77 @@ def _prompt_for_template() -> str:
 def create(
     project_name: Optional[str] = typer.Argument(
         default=None,
+        show_default=False,
         help='The name of the project and the directory that will be created to contain it. '
         'If none is given, you will be prompted.',
     ),
+    crawler_type: Optional[str] = typer.Option(
+        None,
+        '--crawler-type',
+        '--template',
+        show_default=False,
+        help='The library that will be used for crawling in your crawler. If none is given, you will be prompted.',
+    ),
+    http_client: Optional[str] = typer.Option(
+        None,
+        show_default=False,
+        help='The library that will be used to make HTTP requests in your crawler. '
+        'If none is given, you will be prompted.',
+    ),
+    package_manager: Optional[str] = typer.Option(
+        default=None,
+        show_default=False,
+        help='Package manager to be used in the new project. If none is given, you will be prompted.',
+    ),
-    template: Optional[str] = typer.Option(
+    start_url: Optional[str] = typer.Option(
         default=None,
-        help='The template to be used to create the project. If none is given, you will be prompted.',
+        show_default=False,
+        help='The URL where crawling should start. If none is given, you will be prompted.',
     ),
+    enable_apify_integration: Optional[bool] = typer.Option(
+        None,
+        '--apify/--no-apify',
+        show_default=False,
+        help='Should Apify integration be set up for you? If not given, you will be prompted.',
+    ),
 ) -> None:
     """Bootstrap a new Crawlee project."""
     try:
         # Prompt for project name if not provided.
         project_name = _prompt_for_project_name(project_name)
 
-        # Prompt for template choice if not provided.
-        if template is None:
-            template = _prompt_for_template()
+        # Prompt for crawler_type if not provided.
+        if crawler_type is None:
+            crawler_type = _prompt_choice('Please select the Crawler type', crawler_choices)
+
+        # Prompt for http_client if not provided.
+        if http_client is None:
+            http_client = _prompt_choice('Please select the HTTP client', http_client_choices)
+
+        # Prompt for package manager if not provided.
+        if package_manager is None:
+            package_manager = _prompt_choice('Please select the package manager', package_manager_choices)
+
+        # Prompt for start URL
+        if start_url is None:
+            start_url = _prompt_text('Please specify the start URL', default=default_start_url)
+
+        # Ask about Apify integration if not explicitly configured
+        if enable_apify_integration is None:
+            enable_apify_integration = _prompt_bool('Should Apify integration be set up for you?', default=False)
+
+        if all(
+            [
+                project_name,
+                crawler_type,
+                http_client,
+                package_manager,
+                start_url,
+                enable_apify_integration is not None,
+            ]
+        ):
+            package_name = project_name.replace('-', '_')
 
-        if project_name and template:
             # Start the bootstrap process.
             with Progress(
                 SpinnerColumn(),
@@ -120,21 +194,39 @@ def create(
             ) as progress:
                 progress.add_task(description='Bootstrapping...', total=None)
                 cookiecutter(
-                    template='gh:apify/crawlee-python',
-                    directory=f'templates/{template}',
+                    template=str(template_directory),
                     no_input=True,
-                    extra_context={'project_name': project_name},
+                    extra_context={
+                        'project_name': project_name,
+                        'package_manager': package_manager,
+                        'crawler_type': crawler_type,
+                        'http_client': http_client,
+                        'enable_apify_integration': enable_apify_integration,
+                        'start_url': start_url,
+                    },
                 )
 
             typer.echo(f'Your project "{project_name}" was created.')
-            typer.echo(
-                f'To run it, navigate to the directory: "cd {project_name}", '
-                'install dependencies with "poetry install", '
-                f'and run it using "poetry run python -m {project_name}".'
-            )
+
+            if package_manager == 'manual':
+                typer.echo(
+                    f'To run it, navigate to the directory: "cd {project_name}", '
+                    f'install the dependencies listed in "requirements.txt" '
+                    f'and run it using "python -m {package_name}".'
+                )
+            elif package_manager == 'pip':
+                typer.echo(
+                    f'To run it, navigate to the directory: "cd {project_name}", '
+                    f'activate the virtual environment in ".venv" ("source .venv/bin/activate") '
+                    f'and run your project using "python -m {package_name}".'
+                )
+            elif package_manager == 'poetry':
+                typer.echo(
+                    f'To run it, navigate to the directory: "cd {project_name}", '
+                    f'and run it using "poetry run python -m {package_name}".'
+                )
+
             typer.echo(f'See the "{project_name}/README.md" for more information.')
 
     except httpx.HTTPStatusError as exc:
         typer.echo(f'Failed to fetch templates: {exc}.', err=True)
     except KeyboardInterrupt:
         typer.echo('Operation cancelled by user.')
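Taken together, the new options make the command fully scriptable: every value that would otherwise be prompted for can be supplied up front. A minimal sketch of a non-interactive invocation (assumptions: the crawlee console script is installed and on PATH, and option names follow the typer declarations above):

import subprocess

# Bootstrap a project without any interactive prompts.
subprocess.check_call([
    'crawlee', 'create', 'my-crawler',
    '--crawler-type', 'beautifulsoup',    # one of: beautifulsoup, parsel, playwright
    '--http-client', 'httpx',             # one of: httpx, curl-impersonate
    '--package-manager', 'pip',           # one of: poetry, pip, manual
    '--start-url', 'https://crawlee.dev',
    '--no-apify',                         # skip the Apify integration prompt
])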
13 changes: 13 additions & 0 deletions src/crawlee/project_template/cookiecutter.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"project_name": "crawlee-python-project",
"__package_name": "{{ cookiecutter.project_name|lower|replace('-', '_') }}",
"crawler_type": ["beautifulsoup", "parsel", "playwright"],
"http_client": ["httpx", "curl-impersonate"],
"package_manager": ["poetry", "pip", "manual"],
"enable_apify_integration": false,
"start_url": "https://crawlee.dev",
"_jinja2_env_vars": {
"line_statement_prefix": "# %"
},
"_extensions": ["jinja2.ext.do"]
}
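A note on the _jinja2_env_vars entry: setting line_statement_prefix to "# %" is what lets the template files below embed Jinja2 statements as ordinary Python comments. A standalone sketch of the mechanism (plain jinja2, outside of cookiecutter; the variable name is illustrative):

from jinja2 import Environment

# With line_statement_prefix set, lines starting with '# %' are parsed as
# Jinja2 statements, equivalent to '{% ... %}' tags.
env = Environment(line_statement_prefix='# %')
template = env.from_string(
    '# % if enable_apify_integration\n'
    'from apify import Actor\n'
    '# % endif\n'
    'from crawlee.router import Router\n'
)

# The guarded import is emitted only when the flag is true.
print(template.render(enable_apify_integration=True))
print(template.render(enable_apify_integration=False))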
32 changes: 32 additions & 0 deletions src/crawlee/project_template/hooks/post_gen_project.py
@@ -0,0 +1,32 @@
import platform
import subprocess
from pathlib import Path

# % if cookiecutter.package_manager == 'poetry'
Path('requirements.txt').unlink()

subprocess.check_call(['poetry', 'install'])
# % if cookiecutter.crawler_type == 'playwright'
subprocess.check_call(['poetry', 'run', 'playwright', 'install'])
# % endif
# % elif cookiecutter.package_manager == 'pip'
import venv # noqa: E402

# Create a virtual environment
venv_root = Path('.venv')
venv.main([str(venv_root)])

if platform.system() == 'Windows':  # noqa: SIM108
    path = venv_root / 'Scripts'
else:
    path = venv_root / 'bin'

# Install requirements and generate requirements.txt as an impromptu lockfile
subprocess.check_call([str(path / 'pip'), 'install', '-r', 'requirements.txt'])
with open('requirements.txt', 'w') as requirements_txt:
    subprocess.check_call([str(path / 'pip'), 'freeze'], stdout=requirements_txt)

# % if cookiecutter.crawler_type == 'playwright'
subprocess.check_call([str(path / 'playwright'), 'install'])
# % endif
# % endif
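Because the "# %" lines above are Jinja2 line statements, they are resolved at generation time, so the hook written into a new project keeps only the branch for the chosen package manager. As a sketch, for package_manager = 'pip' with crawler_type = 'playwright', the rendered hook should come out roughly as:

import platform
import subprocess
from pathlib import Path

import venv  # noqa: E402

# Create a virtual environment
venv_root = Path('.venv')
venv.main([str(venv_root)])

if platform.system() == 'Windows':  # noqa: SIM108
    path = venv_root / 'Scripts'
else:
    path = venv_root / 'bin'

# Install requirements and generate requirements.txt as an impromptu lockfile
subprocess.check_call([str(path / 'pip'), 'install', '-r', 'requirements.txt'])
with open('requirements.txt', 'w') as requirements_txt:
    subprocess.check_call([str(path / 'pip'), 'freeze'], stdout=requirements_txt)

subprocess.check_call([str(path / 'playwright'), 'install'])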
8 changes: 8 additions & 0 deletions src/crawlee/project_template/hooks/pre_gen_project.py
@@ -0,0 +1,8 @@
# % if cookiecutter.package_manager == 'poetry'
import subprocess

try:
    subprocess.check_call(['poetry', '--version'])
except OSError as exc:
    raise RuntimeError('You chose to use the Poetry package manager, but it does not seem to be installed') from exc
# % endif
46 changes: 46 additions & 0 deletions src/crawlee/project_template/templates/main.py
@@ -0,0 +1,46 @@
# % if cookiecutter.enable_apify_integration
from apify import Actor
# % endif
# % block import required
# % endblock
# % if cookiecutter.http_client == 'curl-impersonate'
from crawlee.http_clients.curl_impersonate import CurlImpersonateHttpClient
# % elif cookiecutter.http_client == 'httpx'
from crawlee.http_clients._httpx import HttpxHttpClient
# % endif

from .routes import router

# % filter truncate(0, end='')
# % block http_client_instantiation
# % if cookiecutter.http_client == 'curl-impersonate'
http_client=CurlImpersonateHttpClient(),
# % elif cookiecutter.http_client == 'httpx'
http_client=HttpxHttpClient(),
# % endif
# % endblock
# % endfilter

async def main() -> None:
    """The crawler entry point."""
    # % filter truncate(0, end='')
    # % block instantiation required
    # % endblock
    # % endfilter

    # % if cookiecutter.enable_apify_integration
    async with Actor:
        # % filter indent(width=8, first=False)
        {{ self.instantiation() }}
        # % endfilter
    # % else
    # % filter indent(width=4, first=False)
    {{ self.instantiation() }}
    # % endfilter
    # % endif

    await crawler.run(
        [
            '{{ cookiecutter.start_url }}',
        ]
    )
13 changes: 13 additions & 0 deletions src/crawlee/project_template/templates/main_beautifulsoup.py
@@ -0,0 +1,13 @@
# % extends 'main.py'

# % block import
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler
# % endblock

# % block instantiation
crawler = BeautifulSoupCrawler(
    request_handler=router,
    max_requests_per_crawl=50,
    {{ self.http_client_instantiation() }}
)
# % endblock
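For orientation, rendering this template (which extends templates/main.py above) with http_client = 'httpx', Apify integration disabled, and the default start URL should produce roughly the following module; exact whitespace depends on the truncate/indent filters in the base template:

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler
from crawlee.http_clients._httpx import HttpxHttpClient

from .routes import router


async def main() -> None:
    """The crawler entry point."""
    crawler = BeautifulSoupCrawler(
        request_handler=router,
        max_requests_per_crawl=50,
        http_client=HttpxHttpClient(),
    )

    await crawler.run(
        [
            'https://crawlee.dev',
        ]
    )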
13 changes: 13 additions & 0 deletions src/crawlee/project_template/templates/main_parsel.py
@@ -0,0 +1,13 @@
# % extends 'main.py'

# % block import
from crawlee.parsel_crawler import ParselCrawler
# % endblock

# % block instantiation
crawler = ParselCrawler(
    request_handler=router,
    max_requests_per_crawl=50,
    {{ self.http_client_instantiation() }}
)
# % endblock
14 changes: 14 additions & 0 deletions src/crawlee/project_template/templates/main_playwright.py
@@ -0,0 +1,14 @@
# % extends 'main.py'

# % block import
from crawlee.playwright_crawler import PlaywrightCrawler
# % endblock

# % block instantiation
crawler = PlaywrightCrawler(
    request_handler=router,
    headless=True,
    max_requests_per_crawl=50,
    {{ self.http_client_instantiation() }}
)
# % endblock
19 changes: 19 additions & 0 deletions src/crawlee/project_template/templates/routes_beautifulsoup.py
@@ -0,0 +1,19 @@
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawlingContext
from crawlee.router import Router

router = Router[BeautifulSoupCrawlingContext]()


@router.default_handler
async def default_handler(context: BeautifulSoupCrawlingContext) -> None:
"""Default request handler."""
context.log.info(f'Processing {context.request.url} ...')
title = context.soup.find('title')
await context.push_data(
{
'url': context.request.loaded_url,
'title': title.text if title else None,
}
)

await context.enqueue_links()