diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5f96e96f..a2f65278 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,7 +2,9 @@
 
 ## [1.5.3](../../releases/tag/v1.5.3) - Unreleased
 
-...
+### Added
+
+- Add `apply_apify_settings` function to Scrapy subpackage
 
 ## [1.5.2](../../releases/tag/v1.5.2) - 2024-01-19
 
diff --git a/src/apify/scrapy/utils.py b/src/apify/scrapy/utils.py
index 272288ef..dc67c198 100644
--- a/src/apify/scrapy/utils.py
+++ b/src/apify/scrapy/utils.py
@@ -8,6 +8,8 @@
 
 try:
     from scrapy import Request, Spider
+    from scrapy.settings import Settings  # noqa: TCH002
+    from scrapy.utils.project import get_project_settings
     from scrapy.utils.python import to_bytes
     from scrapy.utils.request import request_from_dict
 except ImportError as exc:
@@ -153,6 +155,45 @@ def to_scrapy_request(apify_request: dict, spider: Spider) -> Request:
     return scrapy_request
 
 
+def apply_apify_settings(*, settings: Settings | None = None, proxy_config: dict | None = None) -> Settings:
+    """Integrates Apify configuration into a Scrapy project settings.
+
+    Note: The function directly modifies the passed `settings` object and also returns it.
+
+    Args:
+        settings: Scrapy project settings to be modified.
+        proxy_config: Proxy configuration to be stored in the settings.
+
+    Returns:
+        Scrapy project settings with custom configurations.
+    """
+    if settings is None:
+        settings = get_project_settings()
+
+    # Use ApifyScheduler as the scheduler
+    settings['SCHEDULER'] = 'apify.scrapy.scheduler.ApifyScheduler'
+
+    # Add the ActorDatasetPushPipeline into the item pipelines, assigning it the highest integer (1000),
+    # ensuring it is executed as the final step in the pipeline sequence
+    settings['ITEM_PIPELINES']['apify.scrapy.pipelines.ActorDatasetPushPipeline'] = 1000
+
+    # Disable the default RobotsTxtMiddleware, Apify's custom scheduler already handles robots.txt
+    settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware'] = None
+
+    # Disable the default HttpProxyMiddleware and add ApifyHttpProxyMiddleware
+    settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware'] = None
+    settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyHttpProxyMiddleware'] = 950
+
+    # Disable the default RetryMiddleware and add ApifyRetryMiddleware with the highest integer (1000)
+    settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.retry.RetryMiddleware'] = None
+    settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyRetryMiddleware'] = 1000
+
+    # Store the proxy configuration
+    settings['APIFY_PROXY_SETTINGS'] = proxy_config
+
+    return settings
+
+
 async def open_queue_with_custom_client() -> RequestQueue:
     """Open a Request Queue with custom Apify Client.
 
diff --git a/tests/unit/scrapy/utils/test_apply_apify_settings.py b/tests/unit/scrapy/utils/test_apply_apify_settings.py
new file mode 100644
index 00000000..9de69379
--- /dev/null
+++ b/tests/unit/scrapy/utils/test_apply_apify_settings.py
@@ -0,0 +1,62 @@
+from __future__ import annotations
+
+from scrapy.settings import Settings
+
+from apify.scrapy.utils import apply_apify_settings
+
+
+def test__apply_apify_settings__overrides_scheduler() -> None:
+    settings = Settings()
+    new_settings = apply_apify_settings(settings=settings)
+
+    assert new_settings.get('SCHEDULER') == 'apify.scrapy.scheduler.ApifyScheduler'
+
+
+def test__apply_apify_settings__update_item_pipelines() -> None:
+    settings = Settings(
+        {
+            'ITEM_PIPELINES': {
+                'scrapy.pipelines.files.FilesPipeline': 1,
+            }
+        }
+    )
+    new_settings = apply_apify_settings(settings=settings)
+
+    assert new_settings.get('ITEM_PIPELINES') == {
+        'scrapy.pipelines.files.FilesPipeline': 1,
+        'apify.scrapy.pipelines.ActorDatasetPushPipeline': 1000,
+    }
+
+
+def test__apply_apify_settings__update_downloader_middlewares() -> None:
+    settings = Settings(
+        {
+            'DOWNLOADER_MIDDLEWARES': {
+                'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 123,
+                'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 234,
+                'scrapy.downloadermiddlewares.retry.RetryMiddleware': 345,
+                'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 543,
+            },
+        }
+    )
+    new_settings = apply_apify_settings(settings=settings)
+
+    assert new_settings.get('DOWNLOADER_MIDDLEWARES') == {
+        'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': None,
+        'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
+        'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
+        'apify.scrapy.middlewares.ApifyHttpProxyMiddleware': 950,
+        'apify.scrapy.middlewares.ApifyRetryMiddleware': 1000,
+        'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 543,
+    }
+
+
+def test__apply_apify_settings__add_proxy_config() -> None:
+    settings = Settings()
+    new_settings = apply_apify_settings(settings=settings)
+    assert new_settings.get('APIFY_PROXY_SETTINGS') is None
+
+    settings = Settings()
+    proxy_config = {'useApifyProxy': True, 'apifyProxyGroups': []}
+    new_settings = apply_apify_settings(settings=settings, proxy_config=proxy_config)
+    assert new_settings.get('APIFY_PROXY_SETTINGS') == {'useApifyProxy': True, 'apifyProxyGroups': []}