Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Improve project bootstrapping #538

Merged
merged 19 commits into from
Dec 2, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 93 additions & 24 deletions src/crawlee/_cli.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# ruff: noqa: TRY301, FBT002, UP007
from __future__ import annotations

import os
import json
from pathlib import Path
from typing import Annotated, Optional, cast

Expand All @@ -16,6 +16,11 @@

cli = typer.Typer(no_args_is_help=True)

# Load the cookiecutter definition of the crawler template. The path is anchored at this module
# (src/crawlee/_cli.py, so three levels up is the repository root that holds `templates/`).
# A bare `Path()` is '.', and the parent of '.' is still '.', so a path built from it would be
# resolved against the current working directory and only work when run from the repository root.
_cookiecutter_json_path = Path(__file__).parent.parent.parent / 'templates' / 'crawler' / 'cookiecutter.json'

# The context manager closes the file handle as soon as the JSON has been parsed.
with _cookiecutter_json_path.open(encoding='utf-8') as _cookiecutter_json_file:
    cookiecutter_json = json.load(_cookiecutter_json_file)

# Values offered by the interactive prompts of the `create` command, taken from the template definition.
crawler_choices = cookiecutter_json['crawler_type']
package_manager_choices = cookiecutter_json['package_manager']
default_start_url = cookiecutter_json['start_url']


@cli.callback(invoke_without_command=True)
def callback(
Expand Down Expand Up @@ -64,25 +69,42 @@ def _prompt_for_project_name(initial_project_name: str | None) -> str:
return project_name


def _prompt_for_template() -> str:
"""Prompt the user to select a template from a list."""
# Fetch available templates
response = httpx.get(
TEMPLATE_LIST_URL,
timeout=httpx.Timeout(10),
headers=[('Authorization', f'Bearer {os.environ["GH_TOKEN"]}')] if 'GH_TOKEN' in os.environ else [],
def _prompt_text(message: str, default: str) -> str:
    """Prompt the user for a line of text, pre-filled with `default`.

    Answers that are empty or consist only of whitespace fail validation.
    """

    def _has_content(_answers: object, current: str) -> bool:
        # The first argument supplied by the prompt library is not needed here.
        return bool(current.strip())

    renderer = ConsoleRender()
    question = inquirer.Text(
        name='text',
        message=message,
        default=default,
        validate=_has_content,
    )
    answer = renderer.render(question)
    return cast(str, answer)
response.raise_for_status()
template_choices = [item['name'] for item in response.json() if item['type'] == 'dir']

# Prompt for template choice

def _prompt_choice(message: str, choices: list[str]) -> str:
    """Prompt the user to pick one from a list of choices.

    Args:
        message: The question displayed above the list.
        choices: The raw values to pick from. Each one is displayed with its first character upper-cased.

    Returns:
        The value of the entry the user selected.
    """
    # Each entry is a (label, value) pair. Slicing with `[:1]` instead of indexing with `[0]`
    # keeps an empty string from raising IndexError.
    labelled_choices = [(choice[:1].upper() + choice[1:], choice) for choice in choices]

    return cast(
        str,
        ConsoleRender().render(
            inquirer.List(
                name='choice',
                message=message,
                choices=labelled_choices,
            ),
        ),
    )


def _prompt_bool(message: str, *, default: bool) -> bool:
    """Ask the user to confirm or decline `message` and return the answer.

    `default` is passed to the confirmation prompt as its default value.
    """
    renderer = ConsoleRender()
    question = inquirer.Confirm(name='confirm', message=message, default=default)
    return cast(bool, renderer.render(question))
Expand All @@ -92,26 +114,66 @@ def _prompt_for_template() -> str:
def create(
project_name: Optional[str] = typer.Argument(
default=None,
show_default=False,
help='The name of the project and the directory that will be created to contain it. '
'If none is given, you will be prompted.',
),
crawler_type: Optional[str] = typer.Option(
None,
'--crawler-type',
'--template',
show_default=False,
help='The library that will be used for crawling in your crawler. If none is given, you will be prompted.',
),
template: Optional[str] = typer.Option(
package_manager: Optional[str] = typer.Option(
default=None,
help='The template to be used to create the project. If none is given, you will be prompted.',
show_default=False,
help='Package manager to be used in the new project. If none is given, you will be prompted.',
),
start_url: Optional[str] = typer.Option(
janbuchar marked this conversation as resolved.
Show resolved Hide resolved
default=None,
show_default=False,
help='The URL where crawling should start. If none is given, you will be prompted.',
),
enable_apify_integration: Optional[bool] = typer.Option(
None,
'--apify/--no-apify',
show_default=False,
help='Should Apify integration be set up for you? If not given, you will be prompted.',
),
) -> None:
"""Bootstrap a new Crawlee project."""
try:
# Prompt for project name if not provided.
project_name = _prompt_for_project_name(project_name)

# Prompt for template choice if not provided.
if template is None:
template = _prompt_for_template()
# Prompt for crawler_type if not provided.
if crawler_type is None:
crawler_type = _prompt_choice('Please select the Crawler type', crawler_choices)

# Prompt for package manager if not provided.
if package_manager is None:
package_manager = _prompt_choice('Please select the package manager', package_manager_choices)

# Prompt for start URL
if start_url is None:
start_url = _prompt_text('Please specify the start URL', default=default_start_url)

# Ask about Apify integration if not explicitly configured
if enable_apify_integration is None:
enable_apify_integration = _prompt_bool('Should Apify integration be set up for you?', default=False)

if all(
[
project_name,
crawler_type,
package_manager,
start_url,
enable_apify_integration is not None,
]
):
package_name = project_name.replace('-', '_')

if project_name and template:
# Start the bootstrap process.
with Progress(
SpinnerColumn(),
Expand All @@ -121,16 +183,23 @@ def create(
progress.add_task(description='Bootstrapping...', total=None)
cookiecutter(
template='gh:apify/crawlee-python',
directory=f'templates/{template}',
directory='templates/crawler',
no_input=True,
extra_context={'project_name': project_name},
extra_context={
'project_name': project_name,
'package_manager': package_manager,
'crawler_type': crawler_type,
'enable_apify_integration': enable_apify_integration,
'start_url': start_url,
},
)

typer.echo(f'Your project "{project_name}" was created.')
typer.echo(
f'To run it, navigate to the directory: "cd {project_name}", '
'install dependencies with "poetry install", '
f'and run it using "poetry run python -m {project_name}".'
f'and run it using "poetry run python -m {package_name}".'
if package_manager == 'poetry'
else f'and run it using "python -m {package_name}".'
)
typer.echo(f'See the "{project_name}/README.md" for more information.')

Expand Down
12 changes: 12 additions & 0 deletions templates/crawler/cookiecutter.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"project_name": "crawlee-python-beautifulsoup-project",
janbuchar marked this conversation as resolved.
Show resolved Hide resolved
"__package_name": "{{ cookiecutter.project_name|lower|replace('-', '_') }}",
"crawler_type": ["beautifulsoup", "parsel", "playwright"],
"package_manager": ["poetry", "pip"],
"enable_apify_integration": false,
"start_url": "https://crawlee.dev",
"_jinja2_env_vars": {
"line_statement_prefix": "# %"
},
"_extensions": ["jinja2.ext.do"]
}
13 changes: 13 additions & 0 deletions templates/crawler/hooks/post_gen_project.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Cookiecutter hook (hooks/post_gen_project.py) that installs the dependencies of the generated project
# with the package manager chosen in the template context. The commented `if`/`elif`/`endif` lines are
# Jinja line statements (their prefix is configured through `_jinja2_env_vars` in cookiecutter.json), so
# only the commands of the selected branch are present in the rendered script.
import subprocess

# % if cookiecutter.package_manager == 'poetry'
# Poetry branch: install the project, then run Playwright's own installer through `poetry run`.
subprocess.check_call(['poetry', 'install'])
# % if cookiecutter.crawler_type == 'playwright'
subprocess.check_call(['poetry', 'run', 'playwright', 'install'])
# % endif
# % elif cookiecutter.package_manager == 'pip'
# pip branch: install the project from the current directory.
# NOTE(review): `pip` and `playwright` are resolved from PATH here; confirm they belong to the intended environment.
subprocess.check_call(['pip', 'install', '.'])
# % if cookiecutter.crawler_type == 'playwright'
subprocess.check_call(['playwright', 'install'])
# % endif
# % endif
32 changes: 32 additions & 0 deletions templates/crawler/templates/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# % if cookiecutter.enable_apify_integration
from apify import Actor
# % endif
# % block import required
# % endblock

from .routes import router


async def main() -> None:
"""The crawler entry point."""
# % filter truncate(0, end='')
# NOTE(template): the `instantiation` block is declared inside a zero-length `truncate` filter, so its
# content (and presumably this note) is not emitted at this position. It is rendered through
# `self.instantiation()` in the branches below, with the indentation each branch needs.
# % block instantiation required
# % endblock
# % endfilter

# % if cookiecutter.enable_apify_integration
async with Actor:
# % filter indent(width=8, first=False)
{{ self.instantiation() }}
# % endfilter
# % else
# % filter indent(width=4, first=False)
{{ self.instantiation() }}
# % endfilter
# % endif

await crawler.run(
[
'{{ cookiecutter.start_url }}',
]
)
12 changes: 12 additions & 0 deletions templates/crawler/templates/main_beautifulsoup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# % extends 'main.py'
# BeautifulSoup variant of the entry point: fills the `import` and `instantiation` blocks declared in main.py.

# % block import
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler
# % endblock

# % block instantiation
crawler = BeautifulSoupCrawler(
request_handler=router,
max_requests_per_crawl=50,
)
# % endblock
12 changes: 12 additions & 0 deletions templates/crawler/templates/main_parsel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# % extends 'main.py'
# Parsel variant of the entry point: fills the `import` and `instantiation` blocks declared in main.py.

# % block import
from crawlee.parsel_crawler import ParselCrawler
# % endblock

# % block instantiation
crawler = ParselCrawler(
request_handler=router,
max_requests_per_crawl=50,
)
# % endblock
13 changes: 13 additions & 0 deletions templates/crawler/templates/main_playwright.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# % extends 'main.py'
# Playwright variant of the entry point: fills the `import` and `instantiation` blocks declared in main.py.

# % block import
from crawlee.playwright_crawler import PlaywrightCrawler
# % endblock

# % block instantiation
crawler = PlaywrightCrawler(
request_handler=router,
headless=True,
max_requests_per_crawl=50,
)
# % endblock
# % endblock
19 changes: 19 additions & 0 deletions templates/crawler/templates/routes_beautifulsoup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawlingContext
from crawlee.router import Router

# Router on which the request handlers below are registered through its decorators.
router = Router[BeautifulSoupCrawlingContext]()


@router.default_handler
async def default_handler(context: BeautifulSoupCrawlingContext) -> None:
    """Handle a request: log its URL, push the loaded URL and page title, then enqueue links."""
    context.log.info(f'Processing {context.request.url} ...')

    title_tag = context.soup.find('title')
    page_url = context.request.loaded_url
    # The title is stored as None when no title tag was found.
    page_title = title_tag.text if title_tag else None
    await context.push_data({'url': page_url, 'title': page_title})

    await context.enqueue_links()
19 changes: 19 additions & 0 deletions templates/crawler/templates/routes_parsel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from crawlee.parsel_crawler import ParselCrawlingContext
from crawlee.router import Router

# Router on which the request handlers below are registered through its decorators.
router = Router[ParselCrawlingContext]()


@router.default_handler
async def default_handler(context: ParselCrawlingContext) -> None:
    """Handle a request: log its URL, push the loaded URL and page title, then enqueue links."""
    context.log.info(f'Processing {context.request.url} ...')

    # Take the first match of the title text query.
    title_matches = context.selector.xpath('//title/text()')
    page_title = title_matches.get()
    page_url = context.request.loaded_url
    await context.push_data({'url': page_url, 'title': page_title})

    await context.enqueue_links()
19 changes: 19 additions & 0 deletions templates/crawler/templates/routes_playwright.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from crawlee.playwright_crawler import PlaywrightCrawlingContext
from crawlee.router import Router

# Router on which the request handlers below are registered through its decorators.
router = Router[PlaywrightCrawlingContext]()


@router.default_handler
async def default_handler(context: PlaywrightCrawlingContext) -> None:
    """Handle a request: log its URL, push the loaded URL and page title, then enqueue links."""
    context.log.info(f'Processing {context.request.url} ...')

    title_element = await context.page.query_selector('title')
    page_url = context.request.loaded_url
    # The title is stored as None when the selector matched no element.
    page_title = await title_element.inner_text() if title_element else None
    await context.push_data({'url': page_url, 'title': page_title})

    await context.enqueue_links()
Loading
Loading