From a2172581938befcf335e71acf19c5b60cfc90ecd Mon Sep 17 00:00:00 2001
From: Jan Buchar
Date: Mon, 2 Dec 2024 15:21:59 +0000
Subject: [PATCH] feat: Improve project bootstrapping (#538)

This adds a unified `crawlee/project_template` template. The original
`playwright` and `beautifulsoup` templates are kept for compatibility with
older versions of the CLI. The user is now prompted for the package manager
type (pip, poetry), crawler type, start URL and whether or not Apify
integration should be set up.

- closes #317
- closes #414 (http client selection is not implemented)
- closes #511
- closes #495

### TODO

- [x] http client selection
- [x] disable poetry option if it isn't installed
- [x] rectify the pip-based setup
    1. **manual dependency installation** - no automatic installation, just dump requirements.txt and tell the user to handle it any way they want
    2. **pip+venv** - dump requirements.txt, make a virtualenv (.venv) using the current python interpreter, install the requirements and tell the user to activate it
        - ~should be disabled if the `venv` module is not present~ it's stdlib
- [x] test the whole thing on Windows (mainly the various package manager configurations)
- [x] fix how cookiecutter.json is read (it is not present when installing via pip)
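For illustration, the resulting CLI can be driven either interactively or entirely via flags. The flag names below are the ones added in this patch (see `_cli.py` and the tests); the project name and start URL are placeholders:

```sh
# Fully non-interactive bootstrap (every prompt answered via flags):
crawlee create my-project \
    --crawler-type playwright \
    --http-client httpx \
    --package-manager pip \
    --start-url https://crawlee.dev \
    --no-apify

# Or just run `crawlee create` and answer the prompts.
```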
---
 pyproject.toml                                   |   2 +
 src/crawlee/_cli.py                              | 158 ++++++++++++++----
 .../project_template/cookiecutter.json           |  13 ++
 .../hooks/post_gen_project.py                    |  34 ++++
 .../project_template/hooks/pre_gen_project.py    |  12 ++
 .../project_template/templates/main.py           |  46 +++++
 .../templates/main_beautifulsoup.py              |  12 ++
 .../project_template/templates/main_parsel.py    |  12 ++
 .../templates/main_playwright.py                 |  13 ++
 .../templates/routes_beautifulsoup.py            |  19 +++
 .../templates/routes_parsel.py                   |  19 +++
 .../templates/routes_playwright.py               |  19 +++
 .../.dockerignore                                |   1 +
 .../{{cookiecutter.project_name}}/Dockerfile     |  68 ++++++++
 .../{{cookiecutter.project_name}}/README.md      |  39 +++++
 .../_pyproject.toml                              |  33 ++++
 .../requirements.txt                             |   8 +
 .../__init__.py                                  |   0
 .../__main__.py                                  |  12 ++
 .../{{cookiecutter.__package_name}}/main.py      |   1 +
 .../{{cookiecutter.__package_name}}/routes.py    |   1 +
 templates/beautifulsoup/Dockerfile               |   2 +-
 templates/playwright/Dockerfile                  |   2 +-
 tests/unit/test_cli.py                           | 126 +++++++++++---
 24 files changed, 595 insertions(+), 57 deletions(-)
 create mode 100644 src/crawlee/project_template/cookiecutter.json
 create mode 100644 src/crawlee/project_template/hooks/post_gen_project.py
 create mode 100644 src/crawlee/project_template/hooks/pre_gen_project.py
 create mode 100644 src/crawlee/project_template/templates/main.py
 create mode 100644 src/crawlee/project_template/templates/main_beautifulsoup.py
 create mode 100644 src/crawlee/project_template/templates/main_parsel.py
 create mode 100644 src/crawlee/project_template/templates/main_playwright.py
 create mode 100644 src/crawlee/project_template/templates/routes_beautifulsoup.py
 create mode 100644 src/crawlee/project_template/templates/routes_parsel.py
 create mode 100644 src/crawlee/project_template/templates/routes_playwright.py
 create mode 100644 src/crawlee/project_template/{{cookiecutter.project_name}}/.dockerignore
 create mode 100644 src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile
 create mode 100644 src/crawlee/project_template/{{cookiecutter.project_name}}/README.md
 create mode 100644 src/crawlee/project_template/{{cookiecutter.project_name}}/_pyproject.toml
 create mode 100644 src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt
 create mode 100644 src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/__init__.py
 create mode 100644 src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/__main__.py
 create mode 100644 src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/main.py
 create mode 100644 src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/routes.py

diff --git a/pyproject.toml b/pyproject.toml
index 963704767..94a322e12 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -102,6 +102,7 @@ crawlee = "crawlee._cli:cli"
 
 [tool.ruff]
 line-length = 120
+extend-exclude = ["project_template"]
 
 [tool.ruff.lint]
 select = ["ALL"]
@@ -189,6 +190,7 @@ timeout = 1200
 [tool.mypy]
 python_version = "3.9"
 plugins = ["pydantic.mypy"]
+exclude = ["project_template"]
 files = ["src", "tests"]
 check_untyped_defs = true
 disallow_incomplete_defs = true
diff --git a/src/crawlee/_cli.py b/src/crawlee/_cli.py
index 288d61ba0..f7bccd8be 100644
--- a/src/crawlee/_cli.py
+++ b/src/crawlee/_cli.py
@@ -1,21 +1,27 @@
 # ruff: noqa: TRY301, FBT002, UP007
 from __future__ import annotations
 
-import os
+import importlib.resources
+import json
 from pathlib import Path
 from typing import Annotated, Optional, cast
 
-import httpx
 import inquirer  # type: ignore[import-untyped]
 import typer
 from cookiecutter.main import cookiecutter  # type: ignore[import-untyped]
 from inquirer.render.console import ConsoleRender  # type: ignore[import-untyped]
 from rich.progress import Progress, SpinnerColumn, TextColumn
 
-TEMPLATE_LIST_URL = 'https://api.github.com/repos/apify/crawlee-python/contents/templates'
-
 cli = typer.Typer(no_args_is_help=True)
 
+template_directory = importlib.resources.files('crawlee') / 'project_template'
+cookiecutter_json = json.load((template_directory / 'cookiecutter.json').open())
+
+crawler_choices = cookiecutter_json['crawler_type']
+http_client_choices = cookiecutter_json['http_client']
+package_manager_choices = cookiecutter_json['package_manager']
+default_start_url = cookiecutter_json['start_url']
+
 
 @cli.callback(invoke_without_command=True)
 def callback(
@@ -64,25 +70,42 @@ def _prompt_for_project_name(initial_project_name: str | None) -> str:
     return project_name
 
 
-def _prompt_for_template() -> str:
-    """Prompt the user to select a template from a list."""
-    # Fetch available templates
-    response = httpx.get(
-        TEMPLATE_LIST_URL,
-        timeout=httpx.Timeout(10),
-        headers=[('Authorization', f'Bearer {os.environ["GH_TOKEN"]}')] if 'GH_TOKEN' in os.environ else [],
+def _prompt_text(message: str, default: str) -> str:
+    return cast(
+        str,
+        ConsoleRender().render(
+            inquirer.Text(
+                name='text',
+                message=message,
+                default=default,
+                validate=lambda _, value: bool(value.strip()),
+            ),
+        ),
     )
-    response.raise_for_status()
-    template_choices = [item['name'] for item in response.json() if item['type'] == 'dir']
 
-    # Prompt for template choice
+
+def _prompt_choice(message: str, choices: list[str]) -> str:
+    """Prompt the user to pick one from a list of choices."""
     return cast(
         str,
         ConsoleRender().render(
             inquirer.List(
-                name='template',
-                message='Please select the template for your new Crawlee project',
-                choices=[(choice[0].upper() + choice[1:], choice) for choice in template_choices],
+                name='choice',
+                message=message,
+                choices=[(choice[0].upper() + choice[1:], choice) for choice in choices],
+            ),
+        ),
+    )
+
+
+def _prompt_bool(message: str, *, default: bool) -> bool:
+    return cast(
+        bool,
+        ConsoleRender().render(
+            inquirer.Confirm(
+                name='confirm',
+                message=message,
+                default=default,
             ),
         ),
     )
@@ -92,14 +115,38 @@ def _prompt_for_template() -> str:
 def create(
     project_name: Optional[str] = typer.Argument(
         default=None,
+        show_default=False,
         help='The name of the project and the directory that will be created to contain it. '
         'If none is given, you will be prompted.',
+    ),
+    crawler_type: Optional[str] = typer.Option(
+        None,
+        '--crawler-type',
+        '--template',
+        show_default=False,
+        help='The library that will be used for crawling in your crawler. If none is given, you will be prompted.',
+    ),
+    http_client: Optional[str] = typer.Option(
+        None,
+        show_default=False,
+        help='The library that will be used to make HTTP requests in your crawler. '
+        'If none is given, you will be prompted.',
+    ),
+    package_manager: Optional[str] = typer.Option(
+        default=None,
         show_default=False,
+        help='Package manager to be used in the new project. If none is given, you will be prompted.',
     ),
-    template: Optional[str] = typer.Option(
+    start_url: Optional[str] = typer.Option(
         default=None,
-        help='The template to be used to create the project. If none is given, you will be prompted.',
         show_default=False,
+        help='The URL where crawling should start. If none is given, you will be prompted.',
+    ),
+    enable_apify_integration: Optional[bool] = typer.Option(
+        None,
+        '--apify/--no-apify',
+        show_default=False,
+        help='Should Apify integration be set up for you? If not given, you will be prompted.',
     ),
 ) -> None:
     """Bootstrap a new Crawlee project."""
@@ -107,11 +154,38 @@ def create(
         # Prompt for project name if not provided.
         project_name = _prompt_for_project_name(project_name)
 
-        # Prompt for template choice if not provided.
-        if template is None:
-            template = _prompt_for_template()
+        # Prompt for crawler_type if not provided.
+        if crawler_type is None:
+            crawler_type = _prompt_choice('Please select the Crawler type', crawler_choices)
+
+        # Prompt for http_client if not provided.
+        if http_client is None:
+            http_client = _prompt_choice('Please select the HTTP client', http_client_choices)
+
+        # Prompt for package manager if not provided.
+        if package_manager is None:
+            package_manager = _prompt_choice('Please select the package manager', package_manager_choices)
+
+        # Prompt for start URL
+        if start_url is None:
+            start_url = _prompt_text('Please specify the start URL', default=default_start_url)
+
+        # Ask about Apify integration if not explicitly configured
+        if enable_apify_integration is None:
+            enable_apify_integration = _prompt_bool('Should Apify integration be set up for you?', default=False)
+
+        if all(
+            [
+                project_name,
+                crawler_type,
+                http_client,
+                package_manager,
+                start_url,
+                enable_apify_integration is not None,
+            ]
+        ):
+            package_name = project_name.replace('-', '_')
 
-        if project_name and template:
             # Start the bootstrap process.
             with Progress(
                 SpinnerColumn(),
@@ -120,21 +194,39 @@ def create(
             ) as progress:
                 progress.add_task(description='Bootstrapping...', total=None)
                 cookiecutter(
-                    template='gh:apify/crawlee-python',
-                    directory=f'templates/{template}',
+                    template=str(template_directory),
                     no_input=True,
-                    extra_context={'project_name': project_name},
+                    extra_context={
+                        'project_name': project_name,
+                        'package_manager': package_manager,
+                        'crawler_type': crawler_type,
+                        'http_client': http_client,
+                        'enable_apify_integration': enable_apify_integration,
+                        'start_url': start_url,
+                    },
                 )
 
             typer.echo(f'Your project "{project_name}" was created.')
-            typer.echo(
-                f'To run it, navigate to the directory: "cd {project_name}", '
-                'install dependencies with "poetry install", '
-                f'and run it using "poetry run python -m {project_name}".'
-            )
+
+            if package_manager == 'manual':
+                typer.echo(
+                    f'To run it, navigate to the directory: "cd {project_name}", '
+                    f'install the dependencies listed in "requirements.txt" '
+                    f'and run it using "python -m {package_name}".'
+                )
+            elif package_manager == 'pip':
+                typer.echo(
+                    f'To run it, navigate to the directory: "cd {project_name}", '
+                    f'activate the virtual environment in ".venv" ("source .venv/bin/activate") '
+                    f'and run your project using "python -m {package_name}".'
+                )
+            elif package_manager == 'poetry':
+                typer.echo(
+                    f'To run it, navigate to the directory: "cd {project_name}", '
+                    f'and run it using "poetry run python -m {package_name}".'
+                )
+
             typer.echo(f'See the "{project_name}/README.md" for more information.')
-    except httpx.HTTPStatusError as exc:
-        typer.echo(f'Failed to fetch templates: {exc}.', err=True)
     except KeyboardInterrupt:
         typer.echo('Operation cancelled by user.')
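For context, a bootstrap session driven by the new prompts might look roughly like this. The choice and confirmation messages come straight from the code above; the project-name prompt wording is assumed, since `_prompt_for_project_name` is not shown in this hunk, and the exact rendering is up to inquirer:

```
$ crawlee create
? Name of the new project: my-project
? Please select the Crawler type: Beautifulsoup
? Please select the HTTP client: Httpx
? Please select the package manager: Poetry
? Please specify the start URL: https://crawlee.dev
? Should Apify integration be set up for you? (y/N): N
```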
diff --git a/src/crawlee/project_template/cookiecutter.json b/src/crawlee/project_template/cookiecutter.json
new file mode 100644
index 000000000..90b8d1c4a
--- /dev/null
+++ b/src/crawlee/project_template/cookiecutter.json
@@ -0,0 +1,13 @@
+{
+    "project_name": "crawlee-python-project",
+    "__package_name": "{{ cookiecutter.project_name|lower|replace('-', '_') }}",
+    "crawler_type": ["beautifulsoup", "parsel", "playwright"],
+    "http_client": ["httpx", "curl-impersonate"],
+    "package_manager": ["poetry", "pip", "manual"],
+    "enable_apify_integration": false,
+    "start_url": "https://crawlee.dev",
+    "_jinja2_env_vars": {
+        "line_statement_prefix": "# %"
+    },
+    "_extensions": ["jinja2.ext.do"]
+}
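The `line_statement_prefix` setting is what lets the template files below disguise Jinja statements as Python comments, so they stay syntactically valid Python before rendering. A minimal sketch of how such a line statement renders, assuming `crawler_type` is `playwright`:

```python
# Template source: lines starting with '# %' are Jinja line statements.
# % if cookiecutter.crawler_type == 'playwright'
from crawlee.playwright_crawler import PlaywrightCrawler
# % endif

# Rendered output for crawler_type == 'playwright' keeps only the import:
# from crawlee.playwright_crawler import PlaywrightCrawler
```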
diff --git a/src/crawlee/project_template/hooks/post_gen_project.py b/src/crawlee/project_template/hooks/post_gen_project.py
new file mode 100644
index 000000000..a62f99031
--- /dev/null
+++ b/src/crawlee/project_template/hooks/post_gen_project.py
@@ -0,0 +1,34 @@
+import platform
+import subprocess
+from pathlib import Path
+
+Path('_pyproject.toml').rename('pyproject.toml')
+
+# % if cookiecutter.package_manager == 'poetry'
+Path('requirements.txt').unlink()
+
+subprocess.check_call(['poetry', 'install'])
+# % if cookiecutter.crawler_type == 'playwright'
+subprocess.check_call(['poetry', 'run', 'playwright', 'install'])
+# % endif
+# % elif cookiecutter.package_manager == 'pip'
+import venv  # noqa: E402
+
+# Create a virtual environment
+venv_root = Path('.venv')
+venv.main([str(venv_root)])
+
+if platform.system() == 'Windows':  # noqa: SIM108
+    path = venv_root / 'Scripts'
+else:
+    path = venv_root / 'bin'
+
+# Install requirements and generate requirements.txt as an impromptu lockfile
+subprocess.check_call([str(path / 'pip'), 'install', '-r', 'requirements.txt'])
+with open('requirements.txt', 'w') as requirements_txt:
+    subprocess.check_call([str(path / 'pip'), 'freeze'], stdout=requirements_txt)
+
+# % if cookiecutter.crawler_type == 'playwright'
+subprocess.check_call([str(path / 'playwright'), 'install'])
+# % endif
+# % endif
diff --git a/src/crawlee/project_template/hooks/pre_gen_project.py b/src/crawlee/project_template/hooks/pre_gen_project.py
new file mode 100644
index 000000000..5e1d1f8db
--- /dev/null
+++ b/src/crawlee/project_template/hooks/pre_gen_project.py
@@ -0,0 +1,12 @@
+# % if cookiecutter.package_manager == 'poetry'
+import subprocess
+import re
+
+try:
+    version = subprocess.check_output(['poetry', '--version']).decode().strip()
+except OSError as exc:
+    raise RuntimeError('You chose to use the Poetry package manager, but it does not seem to be installed') from exc
+
+if not re.match(r'Poetry \(version 1\..*\)', version):
+    raise RuntimeError(f'Poetry 1.x is required, but "{version}" is installed')
+# % endif
diff --git a/src/crawlee/project_template/templates/main.py b/src/crawlee/project_template/templates/main.py
new file mode 100644
index 000000000..dd9add928
--- /dev/null
+++ b/src/crawlee/project_template/templates/main.py
@@ -0,0 +1,46 @@
+# % if cookiecutter.enable_apify_integration
+from apify import Actor
+# % endif
+# % block import required
+# % endblock
+# % if cookiecutter.http_client == 'curl-impersonate'
+from crawlee.http_clients.curl_impersonate import CurlImpersonateHttpClient
+# % elif cookiecutter.http_client == 'httpx'
+from crawlee.http_clients._httpx import HttpxHttpClient
+# % endif
+
+from .routes import router
+
+# % filter truncate(0, end='')
+# % block http_client_instantiation
+# % if cookiecutter.http_client == 'curl-impersonate'
+http_client=CurlImpersonateHttpClient(),
+# % elif cookiecutter.http_client == 'httpx'
+http_client=HttpxHttpClient(),
+# % endif
+# % endblock
+# % endfilter
+
+async def main() -> None:
+    """The crawler entry point."""
+    # % filter truncate(0, end='')
+    # % block instantiation required
+    # % endblock
+    # % endfilter
+
+    # % if cookiecutter.enable_apify_integration
+    async with Actor:
+        # % filter indent(width=8, first=False)
+        {{ self.instantiation() }}
+        # % endfilter
+    # % else
+    # % filter indent(width=4, first=False)
+    {{ self.instantiation() }}
+    # % endfilter
+    # % endif
+
+    await crawler.run(
+        [
+            '{{ cookiecutter.start_url }}',
+        ]
+    )
diff --git a/src/crawlee/project_template/templates/main_beautifulsoup.py b/src/crawlee/project_template/templates/main_beautifulsoup.py
new file mode 100644
index 000000000..c021f2a1a
--- /dev/null
+++ b/src/crawlee/project_template/templates/main_beautifulsoup.py
@@ -0,0 +1,12 @@
+# % extends 'main.py'
+
+# % block import
+from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler
+# % endblock
+
+# % block instantiation
+crawler = BeautifulSoupCrawler(
+    request_handler=router,
+    max_requests_per_crawl=50,
+    {{ self.http_client_instantiation() }})
+# % endblock
diff --git a/src/crawlee/project_template/templates/main_parsel.py b/src/crawlee/project_template/templates/main_parsel.py
new file mode 100644
index 000000000..e3d703fda
--- /dev/null
+++ b/src/crawlee/project_template/templates/main_parsel.py
@@ -0,0 +1,12 @@
+# % extends 'main.py'
+
+# % block import
+from crawlee.parsel_crawler import ParselCrawler
+# % endblock
+
+# % block instantiation
+crawler = ParselCrawler(
+    request_handler=router,
+    max_requests_per_crawl=50,
+    {{ self.http_client_instantiation() }})
+# % endblock
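To make the `extends`/`block` mechanics concrete, here is approximately what the rendered `main.py` looks like for `crawler_type=parsel`, `http_client=httpx`, with Apify integration disabled (whitespace and import order may differ slightly in real output):

```python
from crawlee.parsel_crawler import ParselCrawler
from crawlee.http_clients._httpx import HttpxHttpClient

from .routes import router


async def main() -> None:
    """The crawler entry point."""
    crawler = ParselCrawler(
        request_handler=router,
        max_requests_per_crawl=50,
        http_client=HttpxHttpClient(),
    )

    await crawler.run(
        [
            'https://crawlee.dev',
        ]
    )
```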
diff --git a/src/crawlee/project_template/templates/main_playwright.py b/src/crawlee/project_template/templates/main_playwright.py
new file mode 100644
index 000000000..252336d60
--- /dev/null
+++ b/src/crawlee/project_template/templates/main_playwright.py
@@ -0,0 +1,13 @@
+# % extends 'main.py'
+
+# % block import
+from crawlee.playwright_crawler import PlaywrightCrawler
+# % endblock
+
+# % block instantiation
+crawler = PlaywrightCrawler(
+    request_handler=router,
+    headless=True,
+    max_requests_per_crawl=50,
+    {{ self.http_client_instantiation() }})
+# % endblock
diff --git a/src/crawlee/project_template/templates/routes_beautifulsoup.py b/src/crawlee/project_template/templates/routes_beautifulsoup.py
new file mode 100644
index 000000000..4b8715a35
--- /dev/null
+++ b/src/crawlee/project_template/templates/routes_beautifulsoup.py
@@ -0,0 +1,19 @@
+from crawlee.beautifulsoup_crawler import BeautifulSoupCrawlingContext
+from crawlee.router import Router
+
+router = Router[BeautifulSoupCrawlingContext]()
+
+
+@router.default_handler
+async def default_handler(context: BeautifulSoupCrawlingContext) -> None:
+    """Default request handler."""
+    context.log.info(f'Processing {context.request.url} ...')
+    title = context.soup.find('title')
+    await context.push_data(
+        {
+            'url': context.request.loaded_url,
+            'title': title.text if title else None,
+        }
+    )
+
+    await context.enqueue_links()
diff --git a/src/crawlee/project_template/templates/routes_parsel.py b/src/crawlee/project_template/templates/routes_parsel.py
new file mode 100644
index 000000000..b5c3e8118
--- /dev/null
+++ b/src/crawlee/project_template/templates/routes_parsel.py
@@ -0,0 +1,19 @@
+from crawlee.parsel_crawler import ParselCrawlingContext
+from crawlee.router import Router
+
+router = Router[ParselCrawlingContext]()
+
+
+@router.default_handler
+async def default_handler(context: ParselCrawlingContext) -> None:
+    """Default request handler."""
+    context.log.info(f'Processing {context.request.url} ...')
+    title = context.selector.xpath('//title/text()').get()
+    await context.push_data(
+        {
+            'url': context.request.loaded_url,
+            'title': title,
+        }
+    )
+
+    await context.enqueue_links()
diff --git a/src/crawlee/project_template/templates/routes_playwright.py b/src/crawlee/project_template/templates/routes_playwright.py
new file mode 100644
index 000000000..47aa207cc
--- /dev/null
+++ b/src/crawlee/project_template/templates/routes_playwright.py
@@ -0,0 +1,19 @@
+from crawlee.playwright_crawler import PlaywrightCrawlingContext
+from crawlee.router import Router
+
+router = Router[PlaywrightCrawlingContext]()
+
+
+@router.default_handler
+async def default_handler(context: PlaywrightCrawlingContext) -> None:
+    """Default request handler."""
+    context.log.info(f'Processing {context.request.url} ...')
+    title = await context.page.query_selector('title')
+    await context.push_data(
+        {
+            'url': context.request.loaded_url,
+            'title': await title.inner_text() if title else None,
+        }
+    )
+
+    await context.enqueue_links()
diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/.dockerignore b/src/crawlee/project_template/{{cookiecutter.project_name}}/.dockerignore
new file mode 100644
index 000000000..1d17dae13
--- /dev/null
+++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/.dockerignore
@@ -0,0 +1 @@
+.venv
diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile b/src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile
new file mode 100644
index 000000000..74b1d926c
--- /dev/null
+++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile
@@ -0,0 +1,68 @@
+# First, specify the base Docker image.
+# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
+# You can also use any other image from Docker Hub.
+# % if cookiecutter.crawler_type == 'playwright'
+FROM apify/actor-python-playwright:3.13
+# % else
+FROM apify/actor-python:3.13
+# % endif
+
+RUN apt install -yq git && rm -rf /var/lib/apt/lists/*
+
+# % if cookiecutter.package_manager == 'poetry'
+RUN pip install -U pip setuptools \
+    && pip install 'poetry<2' \
+    && poetry self add poetry-plugin-export
+
+# Second, copy just poetry.lock and pyproject.toml into the Actor image,
+# since those should be the only files that affect the dependency install in the next step,
+# in order to speed up the build
+COPY pyproject.toml ./
+COPY poetry.lock ./
+
+# Install the dependencies
+RUN echo "Python version:" \
+    && python --version \
+    && echo "Installing dependencies:" \
+    # Export packages from poetry.lock
+    && poetry export -f requirements.txt --without-hashes | \
+    # Replace playwright version so that it matches whatever is pre-installed in the image
+    sed "s/^playwright==.*/playwright==$(playwright --version | cut -d ' ' -f 2)/" | \
+    # Install everything using pip (ignore dependency checks - the lockfile is correct, period)
+    pip install -r /dev/stdin --no-dependencies \
+    && echo "All installed Python packages:" \
+    && pip freeze
+# % elif cookiecutter.package_manager == 'pip'
+RUN pip install -U pip setuptools
+
+# Second, copy just requirements.txt into the Actor image,
+# since it should be the only file that affects the dependency install in the next step,
+# in order to speed up the build
+COPY requirements.txt ./
+
+# Install the dependencies
+RUN echo "Python version:" \
+    && python --version \
+    && echo "Installing dependencies:" \
+    # Read the lockfile
+    && cat requirements.txt | \
+    # Replace playwright version so that it matches whatever is pre-installed in the image
+    sed "s/^playwright==.*/playwright==$(playwright --version | cut -d ' ' -f 2)/" | \
+    # Install everything using pip (ignore dependency checks - the lockfile is correct, period)
+    pip install -r /dev/stdin --no-dependencies \
+    && echo "All installed Python packages:" \
+    && pip freeze
+# % elif cookiecutter.package_manager == 'manual'
+# TODO install dependencies
+# % endif
+
+# Next, copy the remaining files and directories with the source code.
+# Since we do this after installing the dependencies, quick build will be really fast
+# for most source file changes.
+COPY . ./
+
+# Use compileall to ensure the runnability of the Actor Python code.
+RUN python -m compileall -q .
+
+# Specify how to launch the source code of your Actor.
+CMD ["python", "-m", "{{ cookiecutter.__package_name }}"]
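The version-pinning pipe in both package-manager branches deserves a second look: the base image already ships a Playwright build, so the lockfile's pin is rewritten to match it before installation. A rough standalone illustration, with made-up version numbers:

```sh
# Suppose the lockfile pins playwright==1.47.2 but the base image ships 1.48.0.
# The command substitution asks the pre-installed CLI for its version...
playwright --version | cut -d ' ' -f 2   # -> 1.48.0 (hypothetical)

# ...and sed rewrites the pin on the fly before the list is piped into pip:
echo 'playwright==1.47.2' | sed "s/^playwright==.*/playwright==1.48.0/"
# -> playwright==1.48.0
```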
diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/README.md b/src/crawlee/project_template/{{cookiecutter.project_name}}/README.md
new file mode 100644
index 000000000..92fa18444
--- /dev/null
+++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/README.md
@@ -0,0 +1,39 @@
+# {{cookiecutter.project_name}}
+
+Project skeleton generated by Crawlee ({{ cookiecutter.crawler_type | capitalize }} template).
+
+## Usage
+
+{% if cookiecutter.package_manager == 'poetry' -%}
+To get started, ensure you have [Poetry](https://python-poetry.org/), a package and dependency management system, installed on your machine. We recommend installing it with the following command:
+
+```sh
+pipx install poetry
+```
+
+Next, install the project dependencies:
+
+```sh
+poetry install
+```
+
+Finally, launch the crawler with:
+
+```sh
+poetry run python -m {{cookiecutter.__package_name}}
+```
+{% elif cookiecutter.package_manager == 'pip' -%}
+To install dependencies, you can run the following command:
+
+```sh
+pip install .
+```
+
+When the dependencies are installed, you may launch the crawler with:
+
+```sh
+python -m {{cookiecutter.__package_name}}
+```
+{% elif cookiecutter.package_manager == 'manual' -%}
+You selected the manual dependency installation method, so you're on your own. There is a simple `requirements.txt` file to get you started.
+{% endif %}
diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/_pyproject.toml b/src/crawlee/project_template/{{cookiecutter.project_name}}/_pyproject.toml
new file mode 100644
index 000000000..e1814cd1e
--- /dev/null
+++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/_pyproject.toml
@@ -0,0 +1,33 @@
+# % set extras = [cookiecutter.crawler_type]
+# % if cookiecutter.enable_apify_integration
+# % do extras.append('apify')
+# % endif
+# % if cookiecutter.http_client == 'curl-impersonate'
+# % do extras.append('curl-impersonate')
+# % endif
+
+# % if cookiecutter.package_manager == 'poetry'
+[tool.poetry]
+name = "{{cookiecutter.project_name}}"
+version = "0.0.1"
+description = ""
+authors = ["Your Name <you@example.com>"]
+readme = "README.md"
+package-mode = false
+
+[tool.poetry.dependencies]
+python = "^3.9"
+crawlee = {version = "*", extras = {{ extras | tojson }}}
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
+# % else
+[project]
+name = "{{cookiecutter.project_name}}"
+version = "0.0.1"
+description = ""
+authors = ["Your Name <you@example.com>"]
+readme = "README.md"
+requires-python = ">=3.9"
+# % endif
diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt b/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt
new file mode 100644
index 000000000..a77db502e
--- /dev/null
+++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt
@@ -0,0 +1,8 @@
+# % set extras = [cookiecutter.crawler_type]
+# % if cookiecutter.enable_apify_integration
+# % do extras.append('apify')
+# % endif
+# % if cookiecutter.http_client == 'curl-impersonate'
+# % do extras.append('curl-impersonate')
+# % endif
+crawlee[{{ extras | join(',') }}]
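For instance, choosing the Playwright crawler together with curl-impersonate and Apify integration should render the single requirement line roughly as:

```
crawlee[playwright,apify,curl-impersonate]
```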
diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/__init__.py b/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/__main__.py b/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/__main__.py
new file mode 100644
index 000000000..9f3ec5aff
--- /dev/null
+++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/__main__.py
@@ -0,0 +1,12 @@
+import asyncio
+import platform
+
+from .main import main
+
+
+if __name__ == '__main__':
+    if platform.system() == 'Windows':
+        # This mitigates a warning raised by curl-cffi. If you do not need to use curl-impersonate, you may remove this.
+        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
+
+    asyncio.run(main())
diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/main.py b/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/main.py
new file mode 100644
index 000000000..d6591ab12
--- /dev/null
+++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/main.py
@@ -0,0 +1 @@
+# % include 'main_%s.py' % cookiecutter.crawler_type
diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/routes.py b/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/routes.py
new file mode 100644
index 000000000..dfab2f1bb
--- /dev/null
+++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/routes.py
@@ -0,0 +1 @@
+# % include 'routes_%s.py' % cookiecutter.crawler_type
diff --git a/templates/beautifulsoup/Dockerfile b/templates/beautifulsoup/Dockerfile
index 7d97d6ba8..b170601b0 100644
--- a/templates/beautifulsoup/Dockerfile
+++ b/templates/beautifulsoup/Dockerfile
@@ -6,7 +6,7 @@ FROM apify/actor-python:3.13
 RUN apt install -yq git && rm -rf /var/lib/apt/lists/*
 
 RUN pip install -U pip setuptools \
-    && pip install poetry \
+    && pip install 'poetry<2' \
     && poetry self add poetry-plugin-export
 
 # Second, copy just poetry.lock and pyproject.toml into the Actor image,
diff --git a/templates/playwright/Dockerfile b/templates/playwright/Dockerfile
index 8a10ce79a..aec01ae31 100644
--- a/templates/playwright/Dockerfile
+++ b/templates/playwright/Dockerfile
@@ -6,7 +6,7 @@ FROM apify/actor-python-playwright:3.13
 RUN apt install -yq git && rm -rf /var/lib/apt/lists/*
 
 RUN pip install -U pip setuptools \
-    && pip install poetry \
+    && pip install 'poetry<2' \
     && poetry self add poetry-plugin-export
 
 # Second, copy just poetry.lock and pyproject.toml into the Actor image,
diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py
index 787feb28a..a452950ad 100644
--- a/tests/unit/test_cli.py
+++ b/tests/unit/test_cli.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import os
-from unittest.mock import Mock
+from unittest.mock import ANY, Mock
 
 import pytest
 import readchar
@@ -26,6 +26,10 @@ def test_create_interactive(mock_cookiecutter: Mock, monkeypatch: pytest.MonkeyP
             *'my_project',
             readchar.key.ENTER,
             readchar.key.ENTER,
+            readchar.key.ENTER,
+            readchar.key.ENTER,
+            readchar.key.ENTER,
+            readchar.key.ENTER,
         ]
     )
    monkeypatch.setattr(target=readchar, name='readkey', value=lambda: next(mock_input))
@@ -34,10 +38,16 @@ def test_create_interactive(mock_cookiecutter: Mock, monkeypatch: pytest.MonkeyP
 
     assert 'Your project "my_project" was created.' in result.output
 
     mock_cookiecutter.assert_called_with(
-        template='gh:apify/crawlee-python',
-        directory='templates/beautifulsoup',
+        template=ANY,
         no_input=True,
-        extra_context={'project_name': 'my_project'},
+        extra_context={
+            'project_name': 'my_project',
+            'package_manager': 'poetry',
+            'crawler_type': 'beautifulsoup',
+            'http_client': 'httpx',
+            'enable_apify_integration': False,
+            'start_url': 'https://crawlee.dev',
+        },
     )
 
 
@@ -48,6 +58,10 @@ def test_create_interactive_non_default_template(mock_cookiecutter: Mock, monkey
             readchar.key.ENTER,
             readchar.key.DOWN,
             readchar.key.ENTER,
+            readchar.key.ENTER,
+            readchar.key.ENTER,
+            readchar.key.ENTER,
+            readchar.key.ENTER,
         ]
     )
     monkeypatch.setattr(target=readchar, name='readkey', value=lambda: next(mock_input))
@@ -56,21 +70,48 @@ def test_create_interactive_non_default_template(mock_cookiecutter: Mock, monkey
 
     assert 'Your project "my_project" was created.' in result.output
 
     mock_cookiecutter.assert_called_with(
-        template='gh:apify/crawlee-python',
-        directory='templates/playwright',
+        template=ANY,
         no_input=True,
-        extra_context={'project_name': 'my_project'},
+        extra_context={
+            'project_name': 'my_project',
+            'package_manager': 'poetry',
+            'crawler_type': 'parsel',
+            'http_client': 'httpx',
+            'enable_apify_integration': False,
+            'start_url': 'https://crawlee.dev',
+        },
     )
 
 
 def test_create_non_interactive(mock_cookiecutter: Mock) -> None:
-    runner.invoke(crawlee._cli.cli, ['create', 'my_project', '--template', 'playwright'])
+    runner.invoke(
+        crawlee._cli.cli,
+        [
+            'create',
+            'my_project',
+            '--crawler-type',
+            'playwright',
+            '--http-client',
+            'curl-impersonate',
+            '--package-manager',
+            'pip',
+            '--start-url',
+            'https://yr.no',
+            '--no-apify',
+        ],
+    )
 
     mock_cookiecutter.assert_called_with(
-        template='gh:apify/crawlee-python',
-        directory='templates/playwright',
+        template=ANY,
         no_input=True,
-        extra_context={'project_name': 'my_project'},
+        extra_context={
+            'project_name': 'my_project',
+            'package_manager': 'pip',
+            'crawler_type': 'playwright',
+            'http_client': 'curl-impersonate',
+            'start_url': 'https://yr.no',
+            'enable_apify_integration': False,
+        },
     )
 
 
@@ -89,14 +130,35 @@ def test_create_existing_folder(
     os.chdir(tmp)
     (tmp / 'existing_project').mkdir()
 
-    result = runner.invoke(crawlee._cli.cli, ['create', 'existing_project', '--template', 'playwright'])
+    result = runner.invoke(
+        crawlee._cli.cli,
+        [
+            'create',
+            'existing_project',
+            '--crawler-type',
+            'playwright',
+            '--http-client',
+            'curl-impersonate',
+            '--package-manager',
+            'pip',
+            '--start-url',
+            'https://yr.no',
+            '--no-apify',
+        ],
+    )
 
     assert 'existing_project already exists' in result.output
     mock_cookiecutter.assert_called_with(
-        template='gh:apify/crawlee-python',
-        directory='templates/playwright',
+        template=ANY,
         no_input=True,
-        extra_context={'project_name': 'my_project'},
+        extra_context={
+            'project_name': 'my_project',
+            'package_manager': 'pip',
+            'crawler_type': 'playwright',
+            'http_client': 'curl-impersonate',
+            'start_url': 'https://yr.no',
+            'enable_apify_integration': False,
+        },
    )
 
 
@@ -109,6 +171,10 @@ def test_create_existing_folder_interactive(
             readchar.key.ENTER,
             *'my_project',
             readchar.key.ENTER,
+            readchar.key.ENTER,
+            readchar.key.ENTER,
+            readchar.key.ENTER,
+            readchar.key.ENTER,
         ]
     )
     monkeypatch.setattr(target=readchar, name='readkey', value=lambda: next(mock_input))
@@ -121,10 +187,16 @@ def test_create_existing_folder_interactive(
 
     assert 'existing_project already exists' in result.output
     mock_cookiecutter.assert_called_with(
-        template='gh:apify/crawlee-python',
-        directory='templates/playwright',
+        template=ANY,
         no_input=True,
-        extra_context={'project_name': 'my_project'},
+        extra_context={
+            'project_name': 'my_project',
+            'package_manager': 'poetry',
+            'crawler_type': 'playwright',
+            'http_client': 'httpx',
+            'start_url': 'https://crawlee.dev',
+            'enable_apify_integration': False,
+        },
     )
 
 
@@ -139,6 +211,10 @@ def test_create_existing_folder_interactive_multiple_attempts(
             readchar.key.ENTER,
             *'my_project',
             readchar.key.ENTER,
+            readchar.key.ENTER,
+            readchar.key.ENTER,
+            readchar.key.ENTER,
+            readchar.key.ENTER,
         ]
     )
     monkeypatch.setattr(target=readchar, name='readkey', value=lambda: next(mock_input))
@@ -148,12 +224,18 @@ def test_create_existing_folder_interactive_multiple_attempts(
     os.chdir(tmp)
     (tmp / 'existing_project').mkdir()
     (tmp / 'existing_project_2').mkdir()
 
-    result = runner.invoke(crawlee._cli.cli, ['create', '--template', 'playwright'])
+    result = runner.invoke(crawlee._cli.cli, ['create', '--crawler-type', 'playwright'])
 
     assert 'existing_project already exists' in result.output
     mock_cookiecutter.assert_called_with(
-        template='gh:apify/crawlee-python',
-        directory='templates/playwright',
+        template=ANY,
         no_input=True,
-        extra_context={'project_name': 'my_project'},
+        extra_context={
+            'project_name': 'my_project',
+            'package_manager': 'poetry',
+            'crawler_type': 'playwright',
+            'http_client': 'httpx',
+            'start_url': 'https://crawlee.dev',
+            'enable_apify_integration': False,
+        },
     )
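These tests lean on a `mock_cookiecutter` fixture that is referenced but not shown in these hunks. A minimal sketch of what such a fixture presumably looks like (the actual definition lives earlier in `tests/unit/test_cli.py`):

```python
from unittest.mock import Mock

import pytest

import crawlee._cli


@pytest.fixture
def mock_cookiecutter(monkeypatch: pytest.MonkeyPatch) -> Mock:
    # Replace the real cookiecutter call so no project is actually generated,
    # letting the tests assert on the arguments it would have received.
    mock = Mock()
    monkeypatch.setattr(target=crawlee._cli, name='cookiecutter', value=mock)
    return mock
```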