Add apply_apify_settings to Scrapy subpackage #178

Merged · 4 commits · Jan 23, 2024
CHANGELOG.md (4 changes: 3 additions & 1 deletion)

@@ -2,7 +2,9 @@

## [1.5.3](../../releases/tag/v1.5.3) - Unreleased

...
### Added

- Add `apply_apify_settings` function to Scrapy subpackage

## [1.5.2](../../releases/tag/v1.5.2) - 2024-01-19

src/apify/scrapy/utils.py (41 changes: 41 additions & 0 deletions)

@@ -8,6 +8,8 @@

try:
    from scrapy import Request, Spider
    from scrapy.settings import Settings  # noqa: TCH002
    from scrapy.utils.project import get_project_settings
    from scrapy.utils.python import to_bytes
    from scrapy.utils.request import request_from_dict
except ImportError as exc:
@@ -153,6 +155,45 @@ def to_scrapy_request(apify_request: dict, spider: Spider) -> Request:
    return scrapy_request


def apply_apify_settings(*, settings: Settings | None = None, proxy_config: dict | None = None) -> Settings:
"""Integrates Apify configuration into a Scrapy project settings.

Note: The function directly modifies the passed `settings` object and also returns it.

Args:
settings: Scrapy project settings to be modified.
proxy_config: Proxy configuration to be stored in the settings.

Returns:
Scrapy project settings with custom configurations.
"""
    if settings is None:
        settings = get_project_settings()

    # Use ApifyScheduler as the scheduler
    settings['SCHEDULER'] = 'apify.scrapy.scheduler.ApifyScheduler'

    # Add ActorDatasetPushPipeline to the item pipelines with priority 1000 (the top of
    # Scrapy's conventional 0-1000 range), ensuring it runs as the final step of the sequence
    settings['ITEM_PIPELINES']['apify.scrapy.pipelines.ActorDatasetPushPipeline'] = 1000

    # Disable the default RobotsTxtMiddleware; Apify's custom scheduler already handles robots.txt
    settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware'] = None
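    # (in Scrapy, setting a component's order value to None disables that component)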

    # Disable the default HttpProxyMiddleware and add ApifyHttpProxyMiddleware
    settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware'] = None
    settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyHttpProxyMiddleware'] = 950

    # Disable the default RetryMiddleware and add ApifyRetryMiddleware with priority 1000
    settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.retry.RetryMiddleware'] = None
    settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyRetryMiddleware'] = 1000

    # Store the proxy configuration
    settings['APIFY_PROXY_SETTINGS'] = proxy_config

    return settings


async def open_queue_with_custom_client() -> RequestQueue:
    """Open a Request Queue with custom Apify Client.

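For orientation, here is a sketch of how the new helper might be wired into a standalone crawl. `TitleSpider` and the crawl wiring are hypothetical illustrations, not part of this PR, and the Apify scheduler and pipelines it enables are designed to run inside an Apify Actor; only `apply_apify_settings` itself is the new API.

from scrapy import Spider
from scrapy.crawler import CrawlerProcess

from apify.scrapy.utils import apply_apify_settings


class TitleSpider(Spider):
    # Hypothetical spider, used only to illustrate the call site.
    name = 'title_spider'
    start_urls = ['https://example.com']

    def parse(self, response):
        yield {'url': response.url, 'title': response.css('title::text').get()}


# With no `settings` argument, the helper loads the project settings via
# get_project_settings() and overlays the Apify components on top.
settings = apply_apify_settings(proxy_config={'useApifyProxy': True})

process = CrawlerProcess(settings, install_root_handler=False)
process.crawl(TitleSpider)
process.start()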
tests/unit/scrapy/utils/test_apply_apify_settings.py (62 changes: 62 additions & 0 deletions)
@@ -0,0 +1,62 @@
from __future__ import annotations

from scrapy.settings import Settings

from apify.scrapy.utils import apply_apify_settings


def test__apply_apify_settings__overrides_scheduler() -> None:
    settings = Settings()
    new_settings = apply_apify_settings(settings=settings)

    assert new_settings.get('SCHEDULER') == 'apify.scrapy.scheduler.ApifyScheduler'


def test__apply_apify_settings__update_item_pipelines() -> None:
    settings = Settings(
        {
            'ITEM_PIPELINES': {
                'scrapy.pipelines.files.FilesPipeline': 1,
            }
        }
    )
    new_settings = apply_apify_settings(settings=settings)

    assert new_settings.get('ITEM_PIPELINES') == {
        'scrapy.pipelines.files.FilesPipeline': 1,
        'apify.scrapy.pipelines.ActorDatasetPushPipeline': 1000,
    }


def test__apply_apify_settings__update_downloader_middlewares() -> None:
    settings = Settings(
        {
            'DOWNLOADER_MIDDLEWARES': {
                'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 123,
                'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 234,
                'scrapy.downloadermiddlewares.retry.RetryMiddleware': 345,
                'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 543,
            },
        }
    )
    new_settings = apply_apify_settings(settings=settings)

    assert new_settings.get('DOWNLOADER_MIDDLEWARES') == {
        'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': None,
        'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
        'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
        'apify.scrapy.middlewares.ApifyHttpProxyMiddleware': 950,
        'apify.scrapy.middlewares.ApifyRetryMiddleware': 1000,
        'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 543,
    }


def test__apply_apify_settings__add_proxy_config() -> None:
    settings = Settings()
    new_settings = apply_apify_settings(settings=settings)
    assert new_settings.get('APIFY_PROXY_SETTINGS') is None

    settings = Settings()
    proxy_config = {'useApifyProxy': True, 'apifyProxyGroups': []}
    new_settings = apply_apify_settings(settings=settings, proxy_config=proxy_config)
    assert new_settings.get('APIFY_PROXY_SETTINGS') == {'useApifyProxy': True, 'apifyProxyGroups': []}
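One documented behavior the suite above does not pin down: per the docstring, the passed `settings` object is modified directly and then returned. A hypothetical extra test for the same module (reusing its imports), not part of this PR, could assert that identity:

def test__apply_apify_settings__returns_same_object() -> None:
    # Checks the in-place mutation noted in the docstring, i.e. the returned
    # object is the very Settings instance that was passed in.
    settings = Settings()
    new_settings = apply_apify_settings(settings=settings)

    assert new_settings is settings
    assert settings.get('SCHEDULER') == 'apify.scrapy.scheduler.ApifyScheduler'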