feat: Improve project bootstrapping (apify#538)
This adds a unified `crawlee/project_template` template. The original
`playwright` and `beautifulsoup` templates are kept for compatibility
with older versions of the CLI.

The user is now prompted for the package manager type (pip, poetry), crawler
type, start URL, and whether Apify integration should be set up.

- closes apify#317
- closes apify#414 (http client selection is not implemented)
- closes apify#511
- closes apify#495

### TODO

- [x] http client selection
- [x] disable poetry option if it isn't installed
- [x] rectify the pip-based setup
  1. **manual dependency installation** - no automatic installation; just dump requirements.txt and tell the user to handle it any way they want
  2. **pip+venv** - dump requirements.txt, create a virtual environment (.venv) using the current Python interpreter, install the requirements, and tell the user to activate it
     - ~~should be disabled if the `venv` module is not present~~ it's in the stdlib
- [x] test the whole thing on Windows (mainly the various package
manager configurations)
- [x] fix how cookiecutter.json is read (it is not present when
installing via pip) - see the sketch after this list
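
For the last item, a minimal sketch of the fix (the names mirror the `_cli.py` diff below): because the template now ships inside the `crawlee` package, its `cookiecutter.json` is resolved through `importlib.resources` rather than a path in the source checkout, which does not exist in a pip install.

```python
# Sketch only: resolve cookiecutter.json from the installed `crawlee` package
# (requires crawlee to be importable); mirrors the approach in _cli.py below.
import importlib.resources
import json

template_directory = importlib.resources.files('crawlee') / 'project_template'
with (template_directory / 'cookiecutter.json').open() as f:
    cookiecutter_json = json.load(f)

# The CLI prompt choices are driven by this file:
print(cookiecutter_json['crawler_type'])  # ['beautifulsoup', 'parsel', 'playwright']
```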
janbuchar authored and Mantisus committed Dec 10, 2024
1 parent 2543a66 commit a217258
Showing 24 changed files with 595 additions and 57 deletions.
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -102,6 +102,7 @@ crawlee = "crawlee._cli:cli"

[tool.ruff]
line-length = 120
extend-exclude = ["project_template"]

[tool.ruff.lint]
select = ["ALL"]
@@ -189,6 +190,7 @@ timeout = 1200
[tool.mypy]
python_version = "3.9"
plugins = ["pydantic.mypy"]
exclude = ["project_template"]
files = ["src", "tests"]
check_untyped_defs = true
disallow_incomplete_defs = true
158 changes: 125 additions & 33 deletions src/crawlee/_cli.py
@@ -1,21 +1,27 @@
# ruff: noqa: TRY301, FBT002, UP007
from __future__ import annotations

import os
import importlib.resources
import json
from pathlib import Path
from typing import Annotated, Optional, cast

import httpx
import inquirer # type: ignore[import-untyped]
import typer
from cookiecutter.main import cookiecutter # type: ignore[import-untyped]
from inquirer.render.console import ConsoleRender # type: ignore[import-untyped]
from rich.progress import Progress, SpinnerColumn, TextColumn

TEMPLATE_LIST_URL = 'https://api.github.com/repos/apify/crawlee-python/contents/templates'

cli = typer.Typer(no_args_is_help=True)

template_directory = importlib.resources.files('crawlee') / 'project_template'
cookiecutter_json = json.load((template_directory / 'cookiecutter.json').open())

crawler_choices = cookiecutter_json['crawler_type']
http_client_choices = cookiecutter_json['http_client']
package_manager_choices = cookiecutter_json['package_manager']
default_start_url = cookiecutter_json['start_url']


@cli.callback(invoke_without_command=True)
def callback(
@@ -64,25 +70,42 @@ def _prompt_for_project_name(initial_project_name: str | None) -> str:
    return project_name


def _prompt_for_template() -> str:
    """Prompt the user to select a template from a list."""
    # Fetch available templates
    response = httpx.get(
        TEMPLATE_LIST_URL,
        timeout=httpx.Timeout(10),
        headers=[('Authorization', f'Bearer {os.environ["GH_TOKEN"]}')] if 'GH_TOKEN' in os.environ else [],
def _prompt_text(message: str, default: str) -> str:
    return cast(
        str,
        ConsoleRender().render(
            inquirer.Text(
                name='text',
                message=message,
                default=default,
                validate=lambda _, value: bool(value.strip()),
            ),
        ),
    )
    response.raise_for_status()
    template_choices = [item['name'] for item in response.json() if item['type'] == 'dir']

    # Prompt for template choice

def _prompt_choice(message: str, choices: list[str]) -> str:
    """Prompt the user to pick one from a list of choices."""
    return cast(
        str,
        ConsoleRender().render(
            inquirer.List(
                name='template',
                message='Please select the template for your new Crawlee project',
                choices=[(choice[0].upper() + choice[1:], choice) for choice in template_choices],
                name='choice',
                message=message,
                choices=[(choice[0].upper() + choice[1:], choice) for choice in choices],
            ),
        ),
    )


def _prompt_bool(message: str, *, default: bool) -> bool:
    return cast(
        bool,
        ConsoleRender().render(
            inquirer.Confirm(
                name='confirm',
                message=message,
                default=default,
            ),
        ),
    )
@@ -92,26 +115,77 @@ def _prompt_for_template() -> str:
def create(
    project_name: Optional[str] = typer.Argument(
        default=None,
        show_default=False,
        help='The name of the project and the directory that will be created to contain it. '
        'If none is given, you will be prompted.',
    ),
    crawler_type: Optional[str] = typer.Option(
        None,
        '--crawler-type',
        '--template',
        show_default=False,
        help='The library that will be used for crawling in your crawler. If none is given, you will be prompted.',
    ),
    http_client: Optional[str] = typer.Option(
        None,
        show_default=False,
        help='The library that will be used to make HTTP requests in your crawler. '
        'If none is given, you will be prompted.',
    ),
    package_manager: Optional[str] = typer.Option(
        default=None,
        show_default=False,
        help='Package manager to be used in the new project. If none is given, you will be prompted.',
    ),
    template: Optional[str] = typer.Option(
    start_url: Optional[str] = typer.Option(
        default=None,
        help='The template to be used to create the project. If none is given, you will be prompted.',
        show_default=False,
        help='The URL where crawling should start. If none is given, you will be prompted.',
    ),
    enable_apify_integration: Optional[bool] = typer.Option(
        None,
        '--apify/--no-apify',
        show_default=False,
        help='Should Apify integration be set up for you? If not given, you will be prompted.',
    ),
) -> None:
    """Bootstrap a new Crawlee project."""
    try:
        # Prompt for project name if not provided.
        project_name = _prompt_for_project_name(project_name)

        # Prompt for template choice if not provided.
        if template is None:
            template = _prompt_for_template()
        # Prompt for crawler_type if not provided.
        if crawler_type is None:
            crawler_type = _prompt_choice('Please select the Crawler type', crawler_choices)

        # Prompt for http_client if not provided.
        if http_client is None:
            http_client = _prompt_choice('Please select the HTTP client', http_client_choices)

        # Prompt for package manager if not provided.
        if package_manager is None:
            package_manager = _prompt_choice('Please select the package manager', package_manager_choices)

        # Prompt for start URL
        if start_url is None:
            start_url = _prompt_text('Please specify the start URL', default=default_start_url)

        # Ask about Apify integration if not explicitly configured
        if enable_apify_integration is None:
            enable_apify_integration = _prompt_bool('Should Apify integration be set up for you?', default=False)

        if all(
            [
                project_name,
                crawler_type,
                http_client,
                package_manager,
                start_url,
                enable_apify_integration is not None,
            ]
        ):
            package_name = project_name.replace('-', '_')

        if project_name and template:
            # Start the bootstrap process.
            with Progress(
                SpinnerColumn(),
@@ -120,21 +194,39 @@ def create(
            ) as progress:
                progress.add_task(description='Bootstrapping...', total=None)
                cookiecutter(
                    template='gh:apify/crawlee-python',
                    directory=f'templates/{template}',
                    template=str(template_directory),
                    no_input=True,
                    extra_context={'project_name': project_name},
                    extra_context={
                        'project_name': project_name,
                        'package_manager': package_manager,
                        'crawler_type': crawler_type,
                        'http_client': http_client,
                        'enable_apify_integration': enable_apify_integration,
                        'start_url': start_url,
                    },
                )

            typer.echo(f'Your project "{project_name}" was created.')
            typer.echo(
                f'To run it, navigate to the directory: "cd {project_name}", '
                'install dependencies with "poetry install", '
                f'and run it using "poetry run python -m {project_name}".'
            )

            if package_manager == 'manual':
                typer.echo(
                    f'To run it, navigate to the directory: "cd {project_name}", '
                    f'install the dependencies listed in "requirements.txt" '
                    f'and run it using "python -m {package_name}".'
                )
            elif package_manager == 'pip':
                typer.echo(
                    f'To run it, navigate to the directory: "cd {project_name}", '
                    f'activate the virtual environment in ".venv" ("source .venv/bin/activate") '
                    f'and run your project using "python -m {package_name}".'
                )
            elif package_manager == 'poetry':
                typer.echo(
                    f'To run it, navigate to the directory: "cd {project_name}", '
                    f'and run it using "poetry run python -m {package_name}".'
                )

            typer.echo(f'See the "{project_name}/README.md" for more information.')

    except httpx.HTTPStatusError as exc:
        typer.echo(f'Failed to fetch templates: {exc}.', err=True)
    except KeyboardInterrupt:
        typer.echo('Operation cancelled by user.')
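
A hedged usage sketch, not part of the commit: with all options supplied, `create` runs without prompting. The flag spellings below are assumed from Typer's default parameter-name conversion; only `--crawler-type`/`--template` and `--apify/--no-apify` are declared explicitly above.

```python
# Sketch: drive the new `crawlee create` command non-interactively via
# Typer's test runner. Note this really bootstraps a project in the CWD.
from typer.testing import CliRunner

from crawlee._cli import cli

runner = CliRunner()
result = runner.invoke(
    cli,
    [
        'create',
        'my-crawler',
        '--crawler-type', 'beautifulsoup',
        '--http-client', 'httpx',
        '--package-manager', 'pip',
        '--start-url', 'https://crawlee.dev',
        '--no-apify',
    ],
)
print(result.output)
```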
13 changes: 13 additions & 0 deletions src/crawlee/project_template/cookiecutter.json
@@ -0,0 +1,13 @@
{
    "project_name": "crawlee-python-project",
    "__package_name": "{{ cookiecutter.project_name|lower|replace('-', '_') }}",
    "crawler_type": ["beautifulsoup", "parsel", "playwright"],
    "http_client": ["httpx", "curl-impersonate"],
    "package_manager": ["poetry", "pip", "manual"],
    "enable_apify_integration": false,
    "start_url": "https://crawlee.dev",
    "_jinja2_env_vars": {
        "line_statement_prefix": "# %"
    },
    "_extensions": ["jinja2.ext.do"]
}
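
The `_jinja2_env_vars` entry is what lets the template files below read as plain Python: with a line statement prefix of `# %`, Jinja2 parses any line starting with that prefix as a template statement. A minimal, self-contained illustration (not from the commit):

```python
# Sketch: Jinja2 line statements hidden behind Python comments, as configured
# in cookiecutter.json above via line_statement_prefix="# %".
from jinja2 import Environment

env = Environment(line_statement_prefix='# %')
template = env.from_string(
    '# % if enable_apify_integration\n'
    'from apify import Actor\n'
    '# % endif\n'
)
print(template.render(enable_apify_integration=True))  # prints: from apify import Actor
```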
34 changes: 34 additions & 0 deletions src/crawlee/project_template/hooks/post_gen_project.py
@@ -0,0 +1,34 @@
import platform
import subprocess
from pathlib import Path

Path('_pyproject.toml').rename('pyproject.toml')

# % if cookiecutter.package_manager == 'poetry'
Path('requirements.txt').unlink()

subprocess.check_call(['poetry', 'install'])
# % if cookiecutter.crawler_type == 'playwright'
subprocess.check_call(['poetry', 'run', 'playwright', 'install'])
# % endif
# % elif cookiecutter.package_manager == 'pip'
import venv # noqa: E402

# Create a virtual environment
venv_root = Path('.venv')
venv.main([str(venv_root)])

if platform.system() == 'Windows':  # noqa: SIM108
    path = venv_root / 'Scripts'
else:
    path = venv_root / 'bin'

# Install requirements and generate requirements.txt as an impromptu lockfile
subprocess.check_call([str(path / 'pip'), 'install', '-r', 'requirements.txt'])
with open('requirements.txt', 'w') as requirements_txt:
    subprocess.check_call([str(path / 'pip'), 'freeze'], stdout=requirements_txt)

# % if cookiecutter.crawler_type == 'playwright'
subprocess.check_call([str(path / 'playwright'), 'install'])
# % endif
# % endif
12 changes: 12 additions & 0 deletions src/crawlee/project_template/hooks/pre_gen_project.py
@@ -0,0 +1,12 @@
# % if cookiecutter.package_manager == 'poetry'
import subprocess
import re

try:
    version = subprocess.check_output(['poetry', '--version']).decode().strip()
except OSError as exc:
    raise RuntimeError('You chose to use the Poetry package manager, but it does not seem to be installed') from exc

if not re.match(r'Poetry \(version 1\..*\)', version):
    raise RuntimeError(f'Poetry 1.x is required, but "{version}" is installed')
# % endif
46 changes: 46 additions & 0 deletions src/crawlee/project_template/templates/main.py
@@ -0,0 +1,46 @@
# % if cookiecutter.enable_apify_integration
from apify import Actor
# % endif
# % block import required
# % endblock
# % if cookiecutter.http_client == 'curl-impersonate'
from crawlee.http_clients.curl_impersonate import CurlImpersonateHttpClient
# % elif cookiecutter.http_client == 'httpx'
from crawlee.http_clients._httpx import HttpxHttpClient
# % endif

from .routes import router

# % filter truncate(0, end='')
# % block http_client_instantiation
# % if cookiecutter.http_client == 'curl-impersonate'
http_client=CurlImpersonateHttpClient(),
# % elif cookiecutter.http_client == 'httpx'
http_client=HttpxHttpClient(),
# % endif
# % endblock
# % endfilter

async def main() -> None:
    """The crawler entry point."""
    # % filter truncate(0, end='')
    # % block instantiation required
    # % endblock
    # % endfilter

    # % if cookiecutter.enable_apify_integration
    async with Actor:
        # % filter indent(width=8, first=False)
        {{ self.instantiation() }}
        # % endfilter
    # % else
    # % filter indent(width=4, first=False)
    {{ self.instantiation() }}
    # % endfilter
    # % endif

    await crawler.run(
        [
            '{{ cookiecutter.start_url }}',
        ]
    )
12 changes: 12 additions & 0 deletions src/crawlee/project_template/templates/main_beautifulsoup.py
@@ -0,0 +1,12 @@
# % extends 'main.py'

# % block import
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler
# % endblock

# % block instantiation
crawler = BeautifulSoupCrawler(
    request_handler=router,
    max_requests_per_crawl=50,
    {{ self.http_client_instantiation() }})
# % endblock
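
For orientation, rendering the base `main.py` template together with this child template, assuming `http_client` is `httpx` and Apify integration is disabled, should yield approximately the following module (an approximation, not the commit's literal output):

```python
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler
from crawlee.http_clients._httpx import HttpxHttpClient

from .routes import router


async def main() -> None:
    """The crawler entry point."""
    crawler = BeautifulSoupCrawler(
        request_handler=router,
        max_requests_per_crawl=50,
        http_client=HttpxHttpClient(),
    )

    await crawler.run(
        [
            'https://crawlee.dev',
        ]
    )
```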
12 changes: 12 additions & 0 deletions src/crawlee/project_template/templates/main_parsel.py
@@ -0,0 +1,12 @@
# % extends 'main.py'

# % block import
from crawlee.parsel_crawler import ParselCrawler
# % endblock

# % block instantiation
crawler = ParselCrawler(
    request_handler=router,
    max_requests_per_crawl=50,
    {{ self.http_client_instantiation() }})
# % endblock
13 changes: 13 additions & 0 deletions src/crawlee/project_template/templates/main_playwright.py
@@ -0,0 +1,13 @@
# % extends 'main.py'

# % block import
from crawlee.playwright_crawler import PlaywrightCrawler
# % endblock

# % block instantiation
crawler = PlaywrightCrawler(
    request_handler=router,
    headless=True,
    max_requests_per_crawl=50,
    {{ self.http_client_instantiation() }})
# % endblock