diff --git a/pyproject.toml b/pyproject.toml index 963704767a..94a322e123 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -102,6 +102,7 @@ crawlee = "crawlee._cli:cli" [tool.ruff] line-length = 120 +extend-exclude = ["project_template"] [tool.ruff.lint] select = ["ALL"] @@ -189,6 +190,7 @@ timeout = 1200 [tool.mypy] python_version = "3.9" plugins = ["pydantic.mypy"] +exclude = ["project_template"] files = ["src", "tests"] check_untyped_defs = true disallow_incomplete_defs = true diff --git a/src/crawlee/_cli.py b/src/crawlee/_cli.py index 288d61ba07..f7bccd8bef 100644 --- a/src/crawlee/_cli.py +++ b/src/crawlee/_cli.py @@ -1,21 +1,27 @@ # ruff: noqa: TRY301, FBT002, UP007 from __future__ import annotations -import os +import importlib.resources +import json from pathlib import Path from typing import Annotated, Optional, cast -import httpx import inquirer # type: ignore[import-untyped] import typer from cookiecutter.main import cookiecutter # type: ignore[import-untyped] from inquirer.render.console import ConsoleRender # type: ignore[import-untyped] from rich.progress import Progress, SpinnerColumn, TextColumn -TEMPLATE_LIST_URL = 'https://api.github.com/repos/apify/crawlee-python/contents/templates' - cli = typer.Typer(no_args_is_help=True) +template_directory = importlib.resources.files('crawlee') / 'project_template' +cookiecutter_json = json.loads((template_directory / 'cookiecutter.json').read_text(encoding='utf-8')) + +crawler_choices = cookiecutter_json['crawler_type'] +http_client_choices = cookiecutter_json['http_client'] +package_manager_choices = cookiecutter_json['package_manager'] +default_start_url = cookiecutter_json['start_url'] + @cli.callback(invoke_without_command=True) def callback( @@ -64,25 +70,42 @@ def _prompt_for_project_name(initial_project_name: str | None) -> str: return project_name -def _prompt_for_template() -> str: - """Prompt the user to select a template from a list.""" - # Fetch available templates - response = httpx.get( - TEMPLATE_LIST_URL, - 
timeout=httpx.Timeout(10), - headers=[('Authorization', f'Bearer {os.environ["GH_TOKEN"]}')] if 'GH_TOKEN' in os.environ else [], +def _prompt_text(message: str, default: str) -> str: + return cast( + str, + ConsoleRender().render( + inquirer.Text( + name='text', + message=message, + default=default, + validate=lambda _, value: bool(value.strip()), + ), + ), ) - response.raise_for_status() - template_choices = [item['name'] for item in response.json() if item['type'] == 'dir'] - # Prompt for template choice + +def _prompt_choice(message: str, choices: list[str]) -> str: + """Prompt the user to pick one from a list of choices.""" return cast( str, ConsoleRender().render( inquirer.List( - name='template', - message='Please select the template for your new Crawlee project', - choices=[(choice[0].upper() + choice[1:], choice) for choice in template_choices], + name='choice', + message=message, + choices=[(choice[0].upper() + choice[1:], choice) for choice in choices], + ), + ), + ) + + +def _prompt_bool(message: str, *, default: bool) -> bool: + return cast( + bool, + ConsoleRender().render( + inquirer.Confirm( + name='confirm', + message=message, + default=default, ), ), ) @@ -92,14 +115,38 @@ def _prompt_for_template() -> str: def create( project_name: Optional[str] = typer.Argument( default=None, + show_default=False, help='The name of the project and the directory that will be created to contain it. ' 'If none is given, you will be prompted.', + ), + crawler_type: Optional[str] = typer.Option( + None, + '--crawler-type', + '--template', + show_default=False, + help='The library that will be used for crawling in your crawler. If none is given, you will be prompted.', + ), + http_client: Optional[str] = typer.Option( + None, + show_default=False, + help='The library that will be used to make HTTP requests in your crawler. 
' + 'If none is given, you will be prompted.', + ), + package_manager: Optional[str] = typer.Option( + default=None, show_default=False, + help='Package manager to be used in the new project. If none is given, you will be prompted.', ), - template: Optional[str] = typer.Option( + start_url: Optional[str] = typer.Option( default=None, - help='The template to be used to create the project. If none is given, you will be prompted.', show_default=False, + help='The URL where crawling should start. If none is given, you will be prompted.', + ), + enable_apify_integration: Optional[bool] = typer.Option( + None, + '--apify/--no-apify', + show_default=False, + help='Should Apify integration be set up for you? If not given, you will be prompted.', ), ) -> None: """Bootstrap a new Crawlee project.""" @@ -107,11 +154,38 @@ def create( # Prompt for project name if not provided. project_name = _prompt_for_project_name(project_name) - # Prompt for template choice if not provided. - if template is None: - template = _prompt_for_template() + # Prompt for crawler_type if not provided. + if crawler_type is None: + crawler_type = _prompt_choice('Please select the Crawler type', crawler_choices) + + # Prompt for http_client if not provided. + if http_client is None: + http_client = _prompt_choice('Please select the HTTP client', http_client_choices) + + # Prompt for package manager if not provided. 
+ if package_manager is None: + package_manager = _prompt_choice('Please select the package manager', package_manager_choices) + + # Prompt for start URL + if start_url is None: + start_url = _prompt_text('Please specify the start URL', default=default_start_url) + + # Ask about Apify integration if not explicitly configured + if enable_apify_integration is None: + enable_apify_integration = _prompt_bool('Should Apify integration be set up for you?', default=False) + + if all( + [ + project_name, + crawler_type, + http_client, + package_manager, + start_url, + enable_apify_integration is not None, + ] + ): + package_name = project_name.lower().replace('-', '_') - if project_name and template: # Start the bootstrap process. with Progress( SpinnerColumn(), @@ -120,21 +194,39 @@ def create( ) as progress: progress.add_task(description='Bootstrapping...', total=None) cookiecutter( - template='gh:apify/crawlee-python', - directory=f'templates/{template}', + template=str(template_directory), no_input=True, - extra_context={'project_name': project_name}, + extra_context={ + 'project_name': project_name, + 'package_manager': package_manager, + 'crawler_type': crawler_type, + 'http_client': http_client, + 'enable_apify_integration': enable_apify_integration, + 'start_url': start_url, + }, ) typer.echo(f'Your project "{project_name}" was created.') - typer.echo( - f'To run it, navigate to the directory: "cd {project_name}", ' - 'install dependencies with "poetry install", ' - f'and run it using "poetry run python -m {project_name}".' - ) + + if package_manager == 'manual': + typer.echo( + f'To run it, navigate to the directory: "cd {project_name}", ' + f'install the dependencies listed in "requirements.txt" ' + f'and run it using "python -m {package_name}".' 
+ ) + elif package_manager == 'pip': + typer.echo( + f'To run it, navigate to the directory: "cd {project_name}", ' + f'activate the virtual environment in ".venv" ("source .venv/bin/activate") ' + f'and run your project using "python -m {package_name}".' + ) + elif package_manager == 'poetry': + typer.echo( + f'To run it, navigate to the directory: "cd {project_name}", ' + f'and run it using "poetry run python -m {package_name}".' + ) + typer.echo(f'See the "{project_name}/README.md" for more information.') - except httpx.HTTPStatusError as exc: - typer.echo(f'Failed to fetch templates: {exc}.', err=True) except KeyboardInterrupt: typer.echo('Operation cancelled by user.') diff --git a/src/crawlee/project_template/cookiecutter.json b/src/crawlee/project_template/cookiecutter.json new file mode 100644 index 0000000000..90b8d1c4a9 --- /dev/null +++ b/src/crawlee/project_template/cookiecutter.json @@ -0,0 +1,13 @@ +{ + "project_name": "crawlee-python-project", + "__package_name": "{{ cookiecutter.project_name|lower|replace('-', '_') }}", + "crawler_type": ["beautifulsoup", "parsel", "playwright"], + "http_client": ["httpx", "curl-impersonate"], + "package_manager": ["poetry", "pip", "manual"], + "enable_apify_integration": false, + "start_url": "https://crawlee.dev", + "_jinja2_env_vars": { + "line_statement_prefix": "# %" + }, + "_extensions": ["jinja2.ext.do"] +} diff --git a/src/crawlee/project_template/hooks/post_gen_project.py b/src/crawlee/project_template/hooks/post_gen_project.py new file mode 100644 index 0000000000..a62f99031b --- /dev/null +++ b/src/crawlee/project_template/hooks/post_gen_project.py @@ -0,0 +1,34 @@ +import platform +import subprocess +from pathlib import Path + +Path('_pyproject.toml').rename('pyproject.toml') + +# % if cookiecutter.package_manager == 'poetry' +Path('requirements.txt').unlink() + +subprocess.check_call(['poetry', 'install']) +# % if cookiecutter.crawler_type == 'playwright' +subprocess.check_call(['poetry', 'run', 
'playwright', 'install']) +# % endif +# % elif cookiecutter.package_manager == 'pip' +import venv # noqa: E402 + +# Create a virtual environment +venv_root = Path('.venv') +venv.main([str(venv_root)]) + +if platform.system() == 'Windows': # noqa: SIM108 + path = venv_root / 'Scripts' +else: + path = venv_root / 'bin' + +# Install requirements and generate requirements.txt as an impromptu lockfile +subprocess.check_call([str(path / 'pip'), 'install', '-r', 'requirements.txt']) +with open('requirements.txt', 'w') as requirements_txt: + subprocess.check_call([str(path / 'pip'), 'freeze'], stdout=requirements_txt) + +# % if cookiecutter.crawler_type == 'playwright' +subprocess.check_call([str(path / 'playwright'), 'install']) +# % endif +# % endif diff --git a/src/crawlee/project_template/hooks/pre_gen_project.py b/src/crawlee/project_template/hooks/pre_gen_project.py new file mode 100644 index 0000000000..5e1d1f8db0 --- /dev/null +++ b/src/crawlee/project_template/hooks/pre_gen_project.py @@ -0,0 +1,12 @@ +# % if cookiecutter.package_manager == 'poetry' +import subprocess +import re + +try: + version = subprocess.check_output(['poetry', '--version']).decode().strip() +except OSError as exc: + raise RuntimeError('You chose to use the Poetry package manager, but it does not seem to be installed') from exc + +if not re.match(r'Poetry \(version 1\..*\)', version): + raise RuntimeError(f'Poetry 1.x is required, but "{version}" is installed') +# % endif diff --git a/src/crawlee/project_template/templates/main.py b/src/crawlee/project_template/templates/main.py new file mode 100644 index 0000000000..dd9add9285 --- /dev/null +++ b/src/crawlee/project_template/templates/main.py @@ -0,0 +1,46 @@ +# % if cookiecutter.enable_apify_integration +from apify import Actor +# % endif +# % block import required +# % endblock +# % if cookiecutter.http_client == 'curl-impersonate' +from crawlee.http_clients.curl_impersonate import CurlImpersonateHttpClient +# % elif 
cookiecutter.http_client == 'httpx' +from crawlee.http_clients._httpx import HttpxHttpClient +# % endif + +from .routes import router + +# % filter truncate(0, end='') +# % block http_client_instantiation +# % if cookiecutter.http_client == 'curl-impersonate' +http_client=CurlImpersonateHttpClient(), +# % elif cookiecutter.http_client == 'httpx' +http_client=HttpxHttpClient(), +# % endif +# % endblock +# % endfilter + +async def main() -> None: + """The crawler entry point.""" + # % filter truncate(0, end='') + # % block instantiation required + # % endblock + # % endfilter + + # % if cookiecutter.enable_apify_integration + async with Actor: + # % filter indent(width=8, first=False) + {{ self.instantiation() }} + # % endfilter + # % else + # % filter indent(width=4, first=False) + {{ self.instantiation() }} + # % endfilter + # % endif + + await crawler.run( + [ + '{{ cookiecutter.start_url }}', + ] + ) diff --git a/src/crawlee/project_template/templates/main_beautifulsoup.py b/src/crawlee/project_template/templates/main_beautifulsoup.py new file mode 100644 index 0000000000..c021f2a1a2 --- /dev/null +++ b/src/crawlee/project_template/templates/main_beautifulsoup.py @@ -0,0 +1,12 @@ +# % extends 'main.py' + +# % block import +from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler +# % endblock + +# % block instantiation +crawler = BeautifulSoupCrawler( + request_handler=router, + max_requests_per_crawl=50, + {{ self.http_client_instantiation() }}) +# % endblock diff --git a/src/crawlee/project_template/templates/main_parsel.py b/src/crawlee/project_template/templates/main_parsel.py new file mode 100644 index 0000000000..e3d703fdaf --- /dev/null +++ b/src/crawlee/project_template/templates/main_parsel.py @@ -0,0 +1,12 @@ +# % extends 'main.py' + +# % block import +from crawlee.parsel_crawler import ParselCrawler +# % endblock + +# % block instantiation +crawler = ParselCrawler( + request_handler=router, + max_requests_per_crawl=50, + {{ 
self.http_client_instantiation() }}) +# % endblock diff --git a/src/crawlee/project_template/templates/main_playwright.py b/src/crawlee/project_template/templates/main_playwright.py new file mode 100644 index 0000000000..252336d60a --- /dev/null +++ b/src/crawlee/project_template/templates/main_playwright.py @@ -0,0 +1,13 @@ +# % extends 'main.py' + +# % block import +from crawlee.playwright_crawler import PlaywrightCrawler +# % endblock + +# % block instantiation +crawler = PlaywrightCrawler( + request_handler=router, + headless=True, + max_requests_per_crawl=50, + {{ self.http_client_instantiation() }}) +# % endblock diff --git a/src/crawlee/project_template/templates/routes_beautifulsoup.py b/src/crawlee/project_template/templates/routes_beautifulsoup.py new file mode 100644 index 0000000000..4b8715a35c --- /dev/null +++ b/src/crawlee/project_template/templates/routes_beautifulsoup.py @@ -0,0 +1,19 @@ +from crawlee.beautifulsoup_crawler import BeautifulSoupCrawlingContext +from crawlee.router import Router + +router = Router[BeautifulSoupCrawlingContext]() + + +@router.default_handler +async def default_handler(context: BeautifulSoupCrawlingContext) -> None: + """Default request handler.""" + context.log.info(f'Processing {context.request.url} ...') + title = context.soup.find('title') + await context.push_data( + { + 'url': context.request.loaded_url, + 'title': title.text if title else None, + } + ) + + await context.enqueue_links() diff --git a/src/crawlee/project_template/templates/routes_parsel.py b/src/crawlee/project_template/templates/routes_parsel.py new file mode 100644 index 0000000000..b5c3e81183 --- /dev/null +++ b/src/crawlee/project_template/templates/routes_parsel.py @@ -0,0 +1,19 @@ +from crawlee.parsel_crawler import ParselCrawlingContext +from crawlee.router import Router + +router = Router[ParselCrawlingContext]() + + +@router.default_handler +async def default_handler(context: ParselCrawlingContext) -> None: + """Default request handler.""" 
+ context.log.info(f'Processing {context.request.url} ...') + title = context.selector.xpath('//title/text()').get() + await context.push_data( + { + 'url': context.request.loaded_url, + 'title': title, + } + ) + + await context.enqueue_links() diff --git a/src/crawlee/project_template/templates/routes_playwright.py b/src/crawlee/project_template/templates/routes_playwright.py new file mode 100644 index 0000000000..47aa207cc8 --- /dev/null +++ b/src/crawlee/project_template/templates/routes_playwright.py @@ -0,0 +1,19 @@ +from crawlee.playwright_crawler import PlaywrightCrawlingContext +from crawlee.router import Router + +router = Router[PlaywrightCrawlingContext]() + + +@router.default_handler +async def default_handler(context: PlaywrightCrawlingContext) -> None: + """Default request handler.""" + context.log.info(f'Processing {context.request.url} ...') + title = await context.page.query_selector('title') + await context.push_data( + { + 'url': context.request.loaded_url, + 'title': await title.inner_text() if title else None, + } + ) + + await context.enqueue_links() diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/.dockerignore b/src/crawlee/project_template/{{cookiecutter.project_name}}/.dockerignore new file mode 100644 index 0000000000..1d17dae13b --- /dev/null +++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/.dockerignore @@ -0,0 +1 @@ +.venv diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile b/src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile new file mode 100644 index 0000000000..74b1d926c2 --- /dev/null +++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile @@ -0,0 +1,68 @@ +# First, specify the base Docker image. +# You can see the Docker images from Apify at https://hub.docker.com/r/apify/. +# You can also use any other image from Docker Hub. 
+# % if cookiecutter.crawler_type == 'playwright' +FROM apify/actor-python-playwright:3.13 +# % else +FROM apify/actor-python:3.13 +# % endif + +RUN apt install -yq git && rm -rf /var/lib/apt/lists/* + +# % if cookiecutter.package_manager == 'poetry' +RUN pip install -U pip setuptools \ + && pip install 'poetry<2' \ + && poetry self add poetry-plugin-export + +# Second, copy just poetry.lock and pyproject.toml into the Actor image, +# since those should be the only files that affects the dependency install in the next step, +# in order to speed up the build +COPY pyproject.toml ./ +COPY poetry.lock ./ + +# Install the dependencies +RUN echo "Python version:" \ + && python --version \ + && echo "Installing dependencies:" \ + # Export packages from poetry.lock + && poetry export -f requirements.txt --without-hashes | \ + # Replace playwright version so that it matches whatever is pre-installed in the image + sed "s/^playwright==.*/playwright==$(playwright --version | cut -d ' ' -f 2)/" | \ + # Install everything using pip (ignore dependency checks - the lockfile is correct, period) + pip install -r /dev/stdin --no-dependencies \ + && echo "All installed Python packages:" \ + && pip freeze +# % elif cookiecutter.package_manager == 'pip' +RUN pip install -U pip setuptools + +# Second, copy just requirements.txt into the Actor image, +# since it should be the only file that affects the dependency install in the next step, +# in order to speed up the build +COPY requirements.txt ./ + +# Install the dependencies +RUN echo "Python version:" \ + && python --version \ + && echo "Installing dependencies:" \ + # Install everything using pip, set playwright version so that it matches whatever is pre-installed in the image + && cat requirements.txt | \ + # Replace playwright version so that it matches whatever is pre-installed in the image + sed "s/^playwright==.*/playwright==$(playwright --version | cut -d ' ' -f 2)/" | \ + # Install everything using pip (ignore dependency checks - 
the lockfile is correct, period) + pip install -r /dev/stdin --no-dependencies \ + && echo "All installed Python packages:" \ + && pip freeze +# % elif cookiecutter.package_manager == 'manual' +# TODO install dependencies +# % endif + +# Next, copy the remaining files and directories with the source code. +# Since we do this after installing the dependencies, quick build will be really fast +# for most source file changes. +COPY . ./ + +# Use compileall to ensure the runnability of the Actor Python code. +RUN python -m compileall -q . + +# Specify how to launch the source code of your Actor. +CMD ["python", "-m", "{{ cookiecutter.__package_name }}"] diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/README.md b/src/crawlee/project_template/{{cookiecutter.project_name}}/README.md new file mode 100644 index 0000000000..92fa18444d --- /dev/null +++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/README.md @@ -0,0 +1,39 @@ +# {{cookiecutter.project_name}} + +Project skeleton generated by Crawlee ({{ cookiecutter.crawler_type | capitalize }} template). + +## Usage + +{% if cookiecutter.package_manager == 'poetry' -%} +To get started, ensure you have [Poetry](https://python-poetry.org/), a package and dependency management system, installed on your machine. We recommend installing it with the following command: + +```sh +pipx install poetry +``` + +Next, install the project dependencies: + +```sh +poetry install +``` + +Finally, launch the crawler with: + +```sh +poetry run python -m {{cookiecutter.__package_name}} +``` +{% elif cookiecutter.package_manager == 'pip' -%} +To install dependencies, you can run the following command: + +```sh +pip install . +``` + +When the dependencies are installed, you may launch the crawler with: + +```sh +python -m {{cookiecutter.__package_name}} +``` +{% elif cookiecutter.package_manager == 'manual' -%} +You selected the manual dependency installation method, so you're on your own. 
There is a simple `requirements.txt` file to get you started. +{% endif %} diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/_pyproject.toml b/src/crawlee/project_template/{{cookiecutter.project_name}}/_pyproject.toml new file mode 100644 index 0000000000..e1814cd1e2 --- /dev/null +++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/_pyproject.toml @@ -0,0 +1,33 @@ +# % set extras = [cookiecutter.crawler_type] +# % if cookiecutter.enable_apify_integration +# % do extras.append('apify') +# % endif +# % if cookiecutter.http_client == 'curl-impersonate' +# % do extras.append('curl-impersonate') +# % endif + +# % if cookiecutter.package_manager == 'poetry' +[tool.poetry] +name = "{{cookiecutter.project_name}}" +version = "0.0.1" +description = "" +authors = ["Your Name <you@example.com>"] +readme = "README.md" +package-mode = false + +[tool.poetry.dependencies] +python = "^3.9" +crawlee = {version = "*", extras = {{ extras | tojson }}} + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" +# % else +[project] +name = "{{cookiecutter.project_name}}" +version = "0.0.1" +description = "" +authors = [{name = "Your Name", email = "you@example.com"}] +readme = "README.md" +requires-python = ">=3.9" +# % endif diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt b/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt new file mode 100644 index 0000000000..a77db502e6 --- /dev/null +++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt @@ -0,0 +1,8 @@ +# % set extras = [cookiecutter.crawler_type] +# % if cookiecutter.enable_apify_integration +# % do extras.append('apify') +# % endif +# % if cookiecutter.http_client == 'curl-impersonate' +# % do extras.append('curl-impersonate') +# % endif +crawlee[{{ extras | join(',') }}] diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/__init__.py 
b/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/__main__.py b/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/__main__.py new file mode 100644 index 0000000000..9f3ec5aff7 --- /dev/null +++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/__main__.py @@ -0,0 +1,12 @@ +import asyncio +import platform + +from .main import main + + +if __name__ == '__main__': + if platform.system() == 'Windows': + # This mitigates a warning raised by curl-cffi. If you do not need to use curl-impersonate, you may remove this. + asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) + + asyncio.run(main()) diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/main.py b/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/main.py new file mode 100644 index 0000000000..d6591ab12d --- /dev/null +++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/main.py @@ -0,0 +1 @@ +# % include 'main_%s.py' % cookiecutter.crawler_type diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/routes.py b/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/routes.py new file mode 100644 index 0000000000..dfab2f1bb1 --- /dev/null +++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/routes.py @@ -0,0 +1 @@ +# % include 'routes_%s.py' % cookiecutter.crawler_type diff --git a/templates/beautifulsoup/Dockerfile b/templates/beautifulsoup/Dockerfile index 7d97d6ba87..b170601b05 100644 --- a/templates/beautifulsoup/Dockerfile +++ 
b/templates/beautifulsoup/Dockerfile @@ -6,7 +6,7 @@ FROM apify/actor-python:3.13 RUN apt install -yq git && rm -rf /var/lib/apt/lists/* RUN pip install -U pip setuptools \ - && pip install poetry \ + && pip install 'poetry<2' \ && poetry self add poetry-plugin-export # Second, copy just poetry.lock and pyproject.toml into the Actor image, diff --git a/templates/playwright/Dockerfile b/templates/playwright/Dockerfile index 8a10ce79ac..aec01ae316 100644 --- a/templates/playwright/Dockerfile +++ b/templates/playwright/Dockerfile @@ -6,7 +6,7 @@ FROM apify/actor-python-playwright:3.13 RUN apt install -yq git && rm -rf /var/lib/apt/lists/* RUN pip install -U pip setuptools \ - && pip install poetry \ + && pip install 'poetry<2' \ && poetry self add poetry-plugin-export # Second, copy just poetry.lock and pyproject.toml into the Actor image, diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py index 787feb28a9..a452950adc 100644 --- a/tests/unit/test_cli.py +++ b/tests/unit/test_cli.py @@ -1,7 +1,7 @@ from __future__ import annotations import os -from unittest.mock import Mock +from unittest.mock import ANY, Mock import pytest import readchar @@ -26,6 +26,10 @@ def test_create_interactive(mock_cookiecutter: Mock, monkeypatch: pytest.MonkeyP *'my_project', readchar.key.ENTER, readchar.key.ENTER, + readchar.key.ENTER, + readchar.key.ENTER, + readchar.key.ENTER, + readchar.key.ENTER, ] ) monkeypatch.setattr(target=readchar, name='readkey', value=lambda: next(mock_input)) @@ -34,10 +38,16 @@ def test_create_interactive(mock_cookiecutter: Mock, monkeypatch: pytest.MonkeyP assert 'Your project "my_project" was created.' 
in result.output mock_cookiecutter.assert_called_with( - template='gh:apify/crawlee-python', - directory='templates/beautifulsoup', + template=ANY, no_input=True, - extra_context={'project_name': 'my_project'}, + extra_context={ + 'project_name': 'my_project', + 'package_manager': 'poetry', + 'crawler_type': 'beautifulsoup', + 'http_client': 'httpx', + 'enable_apify_integration': False, + 'start_url': 'https://crawlee.dev', + }, ) @@ -48,6 +58,10 @@ def test_create_interactive_non_default_template(mock_cookiecutter: Mock, monkey readchar.key.ENTER, readchar.key.DOWN, readchar.key.ENTER, + readchar.key.ENTER, + readchar.key.ENTER, + readchar.key.ENTER, + readchar.key.ENTER, ] ) monkeypatch.setattr(target=readchar, name='readkey', value=lambda: next(mock_input)) @@ -56,21 +70,48 @@ def test_create_interactive_non_default_template(mock_cookiecutter: Mock, monkey assert 'Your project "my_project" was created.' in result.output mock_cookiecutter.assert_called_with( - template='gh:apify/crawlee-python', - directory='templates/playwright', + template=ANY, no_input=True, - extra_context={'project_name': 'my_project'}, + extra_context={ + 'project_name': 'my_project', + 'package_manager': 'poetry', + 'crawler_type': 'parsel', + 'http_client': 'httpx', + 'enable_apify_integration': False, + 'start_url': 'https://crawlee.dev', + }, ) def test_create_non_interactive(mock_cookiecutter: Mock) -> None: - runner.invoke(crawlee._cli.cli, ['create', 'my_project', '--template', 'playwright']) + runner.invoke( + crawlee._cli.cli, + [ + 'create', + 'my_project', + '--crawler-type', + 'playwright', + '--http-client', + 'curl-impersonate', + '--package-manager', + 'pip', + '--start-url', + 'https://yr.no', + '--no-apify', + ], + ) mock_cookiecutter.assert_called_with( - template='gh:apify/crawlee-python', - directory='templates/playwright', + template=ANY, no_input=True, - extra_context={'project_name': 'my_project'}, + extra_context={ + 'project_name': 'my_project', + 'package_manager': 
'pip', + 'crawler_type': 'playwright', + 'http_client': 'curl-impersonate', + 'start_url': 'https://yr.no', + 'enable_apify_integration': False, + }, ) @@ -89,14 +130,35 @@ def test_create_existing_folder( os.chdir(tmp) (tmp / 'existing_project').mkdir() - result = runner.invoke(crawlee._cli.cli, ['create', 'existing_project', '--template', 'playwright']) + result = runner.invoke( + crawlee._cli.cli, + [ + 'create', + 'existing_project', + '--crawler-type', + 'playwright', + '--http-client', + 'curl-impersonate', + '--package-manager', + 'pip', + '--start-url', + 'https://yr.no', + '--no-apify', + ], + ) assert 'existing_project already exists' in result.output mock_cookiecutter.assert_called_with( - template='gh:apify/crawlee-python', - directory='templates/playwright', + template=ANY, no_input=True, - extra_context={'project_name': 'my_project'}, + extra_context={ + 'project_name': 'my_project', + 'package_manager': 'pip', + 'crawler_type': 'playwright', + 'http_client': 'curl-impersonate', + 'start_url': 'https://yr.no', + 'enable_apify_integration': False, + }, ) @@ -109,6 +171,10 @@ def test_create_existing_folder_interactive( readchar.key.ENTER, *'my_project', readchar.key.ENTER, + readchar.key.ENTER, + readchar.key.ENTER, + readchar.key.ENTER, + readchar.key.ENTER, ] ) monkeypatch.setattr(target=readchar, name='readkey', value=lambda: next(mock_input)) @@ -121,10 +187,16 @@ def test_create_existing_folder_interactive( assert 'existing_project already exists' in result.output mock_cookiecutter.assert_called_with( - template='gh:apify/crawlee-python', - directory='templates/playwright', + template=ANY, no_input=True, - extra_context={'project_name': 'my_project'}, + extra_context={ + 'project_name': 'my_project', + 'package_manager': 'poetry', + 'crawler_type': 'playwright', + 'http_client': 'httpx', + 'start_url': 'https://crawlee.dev', + 'enable_apify_integration': False, + }, ) @@ -139,6 +211,10 @@ def 
test_create_existing_folder_interactive_multiple_attempts( readchar.key.ENTER, *'my_project', readchar.key.ENTER, + readchar.key.ENTER, + readchar.key.ENTER, + readchar.key.ENTER, + readchar.key.ENTER, ] ) monkeypatch.setattr(target=readchar, name='readkey', value=lambda: next(mock_input)) @@ -148,12 +224,18 @@ def test_create_existing_folder_interactive_multiple_attempts( (tmp / 'existing_project').mkdir() (tmp / 'existing_project_2').mkdir() - result = runner.invoke(crawlee._cli.cli, ['create', '--template', 'playwright']) + result = runner.invoke(crawlee._cli.cli, ['create', '--crawler-type', 'playwright']) assert 'existing_project already exists' in result.output mock_cookiecutter.assert_called_with( - template='gh:apify/crawlee-python', - directory='templates/playwright', + template=ANY, no_input=True, - extra_context={'project_name': 'my_project'}, + extra_context={ + 'project_name': 'my_project', + 'package_manager': 'poetry', + 'crawler_type': 'playwright', + 'http_client': 'httpx', + 'start_url': 'https://crawlee.dev', + 'enable_apify_integration': False, + }, )