forked from apify/crawlee-python
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Improve project bootstrapping (apify#538)
This adds a unified `crawlee/project_template` template. The original `playwright` and `beautifulsoup` templates are kept for compatibility with older versions of the CLI. The user is now prompted for package manager type (pip, poetry), crawler type, start URL and whether or not Apify integration should be set up. - closes apify#317 - closes apify#414 (http client selection is not implemented) - closes apify#511 - closes apify#495 ### TODO - [x] http client selection - [x] disable poetry option if it isn't installed - [x] rectify the pip-based setup 1. **manual dependency installation** - no automatic installation, just dump requirements.txt and tell the user to handle it any way they want 2. **pip+venv** - dump requirements.txt, make a virtualenv (.venv) using the current python interpreter, install requirements and tell user to activate it - ~should be disabled if `venv` module is not present~ it's stdlib - [x] test the whole thing on Windows (mainly the various package manager configurations) - [x] fix how cookiecutter.json is read (it is not present when installing via pip)
- Loading branch information
Showing
24 changed files
with
595 additions
and
57 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
{
    "project_name": "crawlee-python-project",
    "__package_name": "{{ cookiecutter.project_name|lower|replace('-', '_') }}",
    "crawler_type": ["beautifulsoup", "parsel", "playwright"],
    "http_client": ["httpx", "curl-impersonate"],
    "package_manager": ["poetry", "pip", "manual"],
    "enable_apify_integration": false,
    "start_url": "https://crawlee.dev",
    "_jinja2_env_vars": {
        "line_statement_prefix": "# %"
    },
    "_extensions": ["jinja2.ext.do"]
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# Post-generation cookiecutter hook: finalizes the generated project and
# installs dependencies according to the package manager the user selected.
#
# NOTE: lines beginning with '# %' are Jinja2 *line statements* (see
# "line_statement_prefix": "# %" in cookiecutter.json). They are evaluated
# while cookiecutter renders this hook, before the resulting script runs.
import platform
import subprocess
from pathlib import Path

# Put the real pyproject.toml in place.
# NOTE(review): presumably shipped as '_pyproject.toml' so that tooling does
# not treat the template directory itself as a Python project — confirm.
Path('_pyproject.toml').rename('pyproject.toml')

# % if cookiecutter.package_manager == 'poetry'
# Poetry resolves dependencies from pyproject.toml, so the pip-oriented
# requirements.txt is redundant and removed.
Path('requirements.txt').unlink()

subprocess.check_call(['poetry', 'install'])
# % if cookiecutter.crawler_type == 'playwright'
# Playwright additionally needs its browser binaries downloaded; run the
# CLI inside the poetry-managed environment.
subprocess.check_call(['poetry', 'run', 'playwright', 'install'])
# % endif
# % elif cookiecutter.package_manager == 'pip'
# Imported here (not at the top) because it is only needed on this branch;
# E402 is suppressed for the non-top-of-file import.
import venv # noqa: E402

# Create a virtual environment
venv_root = Path('.venv')
venv.main([str(venv_root)])

# Executables live in 'Scripts' on Windows and 'bin' on POSIX systems.
if platform.system() == 'Windows': # noqa: SIM108
    path = venv_root / 'Scripts'
else:
    path = venv_root / 'bin'

# Install requirements and generate requirements.txt as an impromptu lockfile
subprocess.check_call([str(path / 'pip'), 'install', '-r', 'requirements.txt'])
with open('requirements.txt', 'w') as requirements_txt:
    # `pip freeze` output is streamed straight into the file via stdout.
    subprocess.check_call([str(path / 'pip'), 'freeze'], stdout=requirements_txt)

# % if cookiecutter.crawler_type == 'playwright'
# Install Playwright browser binaries using the venv's own entry point.
subprocess.check_call([str(path / 'playwright'), 'install'])
# % endif
# % endif
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# Pre-generation cookiecutter hook: fail fast if the user selected Poetry
# but it is missing or has an unsupported major version.
#
# NOTE: lines beginning with '# %' are Jinja2 line statements evaluated when
# cookiecutter renders this hook; the whole body is emitted only for the
# 'poetry' package manager choice.
# % if cookiecutter.package_manager == 'poetry'
import subprocess
import re

try:
    # `poetry --version` prints e.g. 'Poetry (version 1.8.3)'.
    version = subprocess.check_output(['poetry', '--version']).decode().strip()
except OSError as exc:
    # OSError (e.g. FileNotFoundError) means the executable is not on PATH.
    raise RuntimeError('You chose to use the Poetry package manager, but it does not seem to be installed') from exc

# Only Poetry 1.x is supported; reject anything whose banner does not match.
if not re.match(r'Poetry \(version 1\..*\)', version):
    raise RuntimeError(f'Poetry 1.x is required, but "{version}" is installed')
# % endif
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
# Jinja2 base template for the generated project's main module. Lines
# beginning with '# %' are Jinja2 line statements; '{{ ... }}' are Jinja2
# expressions. Child templates (main_beautifulsoup.py, main_parsel.py,
# main_playwright.py) extend this file and override the 'import' and
# 'instantiation' blocks. This file is NOT valid Python until rendered.
# % if cookiecutter.enable_apify_integration
from apify import Actor
# % endif
# % block import required
# % endblock
# % if cookiecutter.http_client == 'curl-impersonate'
from crawlee.http_clients.curl_impersonate import CurlImpersonateHttpClient
# % elif cookiecutter.http_client == 'httpx'
from crawlee.http_clients._httpx import HttpxHttpClient
# % endif

from .routes import router

# The truncate(0) filter discards this block's direct output here; children
# inject it where needed by calling self.http_client_instantiation().
# % filter truncate(0, end='')
# % block http_client_instantiation
# % if cookiecutter.http_client == 'curl-impersonate'
http_client=CurlImpersonateHttpClient(),
# % elif cookiecutter.http_client == 'httpx'
http_client=HttpxHttpClient(),
# % endif
# % endblock
# % endfilter


async def main() -> None:
    """The crawler entry point."""
    # Define (but do not emit) the child's instantiation block; it is
    # re-emitted below with the correct indentation for each branch.
    # % filter truncate(0, end='')
    # % block instantiation required
    # % endblock
    # % endfilter

    # % if cookiecutter.enable_apify_integration
    async with Actor:
        # indent(first=False) re-indents the block's continuation lines to
        # sit inside the 'async with' body.
        # % filter indent(width=8, first=False)
        {{ self.instantiation() }}
        # % endfilter
    # % else
    # % filter indent(width=4, first=False)
    {{ self.instantiation() }}
    # % endfilter
    # % endif

    await crawler.run(
        [
            '{{ cookiecutter.start_url }}',
        ]
    )
12 changes: 12 additions & 0 deletions
12
src/crawlee/project_template/templates/main_beautifulsoup.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# % extends 'main.py'
# Jinja2 child template: BeautifulSoup flavor of the generated main module.
# Overrides the 'import' and 'instantiation' blocks of main.py.

# % block import
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler
# % endblock

# % block instantiation
crawler = BeautifulSoupCrawler(
    request_handler=router,
    max_requests_per_crawl=50,
    {{ self.http_client_instantiation() }})
# % endblock
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# % extends 'main.py'
# Jinja2 child template: Parsel flavor of the generated main module.
# Overrides the 'import' and 'instantiation' blocks of main.py.

# % block import
from crawlee.parsel_crawler import ParselCrawler
# % endblock

# % block instantiation
crawler = ParselCrawler(
    request_handler=router,
    max_requests_per_crawl=50,
    {{ self.http_client_instantiation() }})
# % endblock
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# % extends 'main.py'
# Jinja2 child template: Playwright flavor of the generated main module.
# Overrides the 'import' and 'instantiation' blocks of main.py.

# % block import
from crawlee.playwright_crawler import PlaywrightCrawler
# % endblock

# % block instantiation
crawler = PlaywrightCrawler(
    request_handler=router,
    headless=True,
    max_requests_per_crawl=50,
    {{ self.http_client_instantiation() }})
# % endblock
Oops, something went wrong.