feat: update Python Scrapy template to use new SDK (#311)
Relates: #303
vdusek authored Feb 19, 2025
1 parent a061cc1 commit d3587e0
Showing 9 changed files with 75 additions and 141 deletions.
15 changes: 11 additions & 4 deletions templates/python-scrapy/.actor/input_schema.json
@@ -7,10 +7,17 @@
"title": "Start URLs",
"type": "array",
"description": "URLs to start with",
"prefill": [
{ "url": "https://apify.com" }
],
"editor": "requestListSources"
"editor": "requestListSources",
"prefill": [{ "url": "https://apify.com/" }],
"default": [{ "url": "https://apify.com/" }]
},
"allowedDomains": {
"title": "Allowed domains",
"type": "array",
"description": "Domains that the scraper is allowed to crawl.",
"editor": "json",
"prefill": ["apify.com"],
"default": ["apify.com"]
},
"proxyConfiguration": {
"sectionCaption": "Proxy and HTTP configuration",
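
For orientation, the sketch below shows roughly what `Actor.get_input()` returns when the prefill/default values above are used. The `startUrls` and `allowedDomains` keys come from this schema; the `proxyConfiguration` shape is an assumption based on the standard Apify proxy input and is not part of this hunk.

# A sketch of the resolved Actor input for the defaults above.
actor_input = {
    'startUrls': [{'url': 'https://apify.com/'}],
    'allowedDomains': ['apify.com'],
    'proxyConfiguration': {'useApifyProxy': True},  # assumed shape, not shown in this hunk
}
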
3 changes: 1 addition & 2 deletions templates/python-scrapy/requirements.txt
@@ -2,5 +2,4 @@
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify[scrapy] < 3.0
nest-asyncio ~= 1.6
scrapy ~= 2.12
scrapy < 3.0
108 changes: 12 additions & 96 deletions templates/python-scrapy/src/__main__.py
@@ -1,4 +1,3 @@
# ruff: noqa: E402
"""Apify Actor integration for Scrapy projects.
This module transforms a Scrapy project into an Apify Actor, handling the configuration of logging, patching Scrapy's
@@ -11,109 +10,26 @@
We recommend you do not modify this file unless you really know what you are doing.
"""

# We need to configure the logging first before we import anything else, so that nothing else imports
# `scrapy.utils.log` before we patch it.
# ruff: noqa: E402
from __future__ import annotations

from logging import StreamHandler, getLogger
from typing import Any

from apify.log import ActorLogFormatter
from scrapy.utils import log as scrapy_logging
from scrapy.utils.project import get_project_settings

# Define names of the loggers.
MAIN_LOGGER_NAMES = ['apify', 'apify_client', 'scrapy']
OTHER_LOGGER_NAMES = ['filelock', 'hpack', 'httpcore', 'httpx', 'protego', 'twisted']
ALL_LOGGER_NAMES = MAIN_LOGGER_NAMES + OTHER_LOGGER_NAMES

# To change the logging level, modify the `LOG_LEVEL` field in `settings.py`. If the field is not present in the file,
# Scrapy will default to `DEBUG`. This setting applies to all loggers. If you wish to change the logging level for
# a specific logger, do it in this file.
settings = get_project_settings()
LOGGING_LEVEL = settings['LOG_LEVEL']

# Define a logging handler which will be used for the loggers.
apify_handler = StreamHandler()
apify_handler.setFormatter(ActorLogFormatter(include_logger_name=True))


def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamHandler) -> None:
"""Configure a logger with the specified settings.
Args:
logger_name: The name of the logger to be configured.
log_level: The desired logging level ('DEBUG', 'INFO', 'WARNING', 'ERROR', ...).
handlers: Optional list of logging handlers.
"""
logger = getLogger(logger_name)
logger.setLevel(log_level)
logger.handlers = []

for handler in handlers:
logger.addHandler(handler)

from twisted.internet import asyncioreactor

# Apify loggers have to be set up here and in the `new_configure_logging` as well to be able to use them both from
# the `main.py` and Scrapy components.
for logger_name in MAIN_LOGGER_NAMES:
configure_logger(logger_name, LOGGING_LEVEL, apify_handler)
# Install Twisted's asyncio reactor before importing any other Twisted or
# Scrapy components.
asyncioreactor.install() # type: ignore[no-untyped-call]

# We can't attach our log handler to the loggers normally, because Scrapy would remove them in the `configure_logging`
# call here: https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L113 (even though
# `disable_existing_loggers` is set to False :facepalm:). We need to monkeypatch Scrapy's `configure_logging` method
# like this, so that our handler is attached right after Scrapy calls the `configure_logging` method, because
# otherwise we would lose some log messages.
old_configure_logging = scrapy_logging.configure_logging


def new_configure_logging(*args: Any, **kwargs: Any) -> None:
"""Configure logging for Scrapy and root loggers to ensure consistent logging behavior.
We need to manually configure both the root logger and all Scrapy-associated loggers. Configuring only the root
logger is not sufficient, as Scrapy will override it with its own settings. Scrapy uses these four primary
loggers - https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L60:L77. Therefore, we configure here
these four loggers and the root logger.
"""
old_configure_logging(*args, **kwargs)

# We modify the root (None) logger to ensure proper display of logs from spiders when using the `self.logger`
# property within spiders. See details in the Spider logger property:
# https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/spiders/__init__.py#L43:L46.
configure_logger(None, LOGGING_LEVEL, apify_handler)

# We modify other loggers only by setting up their log level. A custom log handler is added
# only to the root logger to avoid duplicate log messages.
for logger_name in ALL_LOGGER_NAMES:
configure_logger(logger_name, LOGGING_LEVEL)

# Set the HTTPX logger explicitly to the WARNING level, because it is too verbose and spams the logs with useless
# messages, especially when running on the platform.
configure_logger('httpx', 'WARNING')


scrapy_logging.configure_logging = new_configure_logging

# Now we can do the rest of the setup.
import asyncio
import os

import nest_asyncio
from scrapy.utils.reactor import install_reactor
from apify.scrapy import initialize_logging, run_scrapy_actor

# Import your main Actor coroutine here.
from .main import main

# For compatibility between Twisted (used by Scrapy) and AsyncIO (used by Apify) asynchronous libraries, it is
# necessary to set the Twisted reactor to `AsyncioSelectorReactor`. This setup allows the two asynchronous libraries
# to work together.
#
# Note: The reactor must be installed before applying `nest_asyncio.apply()`, otherwise, it will not work correctly
# on Windows.
install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
nest_asyncio.apply()

# Specify the path to the Scrapy project settings module.
# Ensure the location to the Scrapy settings module is defined.
os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings'

# Run the Apify main coroutine in the event loop.
asyncio.run(main())

if __name__ == '__main__':
initialize_logging()
run_scrapy_actor(main())
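
Pieced together, the new entrypoint is roughly the following (a sketch assembled from the hunks above; the docstring is shortened here and exact comment placement may differ slightly from the committed file):

"""Apify Actor integration for Scrapy projects (entrypoint sketch)."""

# ruff: noqa: E402
from __future__ import annotations

from twisted.internet import asyncioreactor

# Install Twisted's asyncio reactor before importing any other Twisted or
# Scrapy components.
asyncioreactor.install()  # type: ignore[no-untyped-call]

import os

from apify.scrapy import initialize_logging, run_scrapy_actor

# Import your main Actor coroutine here.
from .main import main

# Ensure the location of the Scrapy settings module is defined.
os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings'

if __name__ == '__main__':
    initialize_logging()
    run_scrapy_actor(main())
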
37 changes: 15 additions & 22 deletions templates/python-scrapy/src/main.py
@@ -22,38 +22,31 @@
from __future__ import annotations

from apify import Actor
from apify.scrapy.utils import apply_apify_settings
from scrapy.crawler import CrawlerProcess
from apify.scrapy import apply_apify_settings
from scrapy.crawler import CrawlerRunner
from scrapy.utils.defer import deferred_to_future

# Import your Scrapy spider here.
from .spiders.title import TitleSpider as Spider

# Default input values for local execution using `apify run`.
LOCAL_DEFAULT_START_URLS = [{'url': 'https://apify.com'}]
from .spiders import TitleSpider as Spider


async def main() -> None:
"""Apify Actor main coroutine for executing the Scrapy spider."""
async with Actor:
Actor.log.info('Actor is being executed...')

# Retrieve and process Actor input.
actor_input = await Actor.get_input() or {}
start_urls = actor_input.get('startUrls', LOCAL_DEFAULT_START_URLS)
start_urls = [url['url'] for url in actor_input.get('startUrls', [])]
allowed_domains = actor_input.get('allowedDomains')
proxy_config = actor_input.get('proxyConfiguration')

# Open the default request queue for handling URLs to be processed.
request_queue = await Actor.open_request_queue()

# Enqueue the start URLs.
for start_url in start_urls:
url = start_url.get('url')
await request_queue.add_request(url)

# Apply Apify settings, it will override the Scrapy project settings.
# Apply Apify settings, which will override the Scrapy project settings.
settings = apply_apify_settings(proxy_config=proxy_config)

# Execute the spider using Scrapy `CrawlerProcess`.
process = CrawlerProcess(settings, install_root_handler=False)
process.crawl(Spider)
process.start()
# Create CrawlerRunner and execute the Scrapy spider.
crawler_runner = CrawlerRunner(settings)
crawl_deferred = crawler_runner.crawl(
Spider,
start_urls=start_urls,
allowed_domains=allowed_domains,
)
await deferred_to_future(crawl_deferred)
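
Assembled from this hunk, the updated `main()` reads roughly as follows (a sketch; comments condensed):

from __future__ import annotations

from apify import Actor
from apify.scrapy import apply_apify_settings
from scrapy.crawler import CrawlerRunner
from scrapy.utils.defer import deferred_to_future

# Import your Scrapy spider here.
from .spiders import TitleSpider as Spider


async def main() -> None:
    """Apify Actor main coroutine for executing the Scrapy spider."""
    async with Actor:
        Actor.log.info('Actor is being executed...')

        # Retrieve and process Actor input.
        actor_input = await Actor.get_input() or {}
        start_urls = [url['url'] for url in actor_input.get('startUrls', [])]
        allowed_domains = actor_input.get('allowedDomains')
        proxy_config = actor_input.get('proxyConfiguration')

        # Apply Apify settings, which will override the Scrapy project settings.
        settings = apply_apify_settings(proxy_config=proxy_config)

        # Create a CrawlerRunner, start the crawl, and wait for it to finish by
        # converting the Twisted Deferred into an awaitable future.
        crawler_runner = CrawlerRunner(settings)
        crawl_deferred = crawler_runner.crawl(
            Spider,
            start_urls=start_urls,
            allowed_domains=allowed_domains,
        )
        await deferred_to_future(crawl_deferred)
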
2 changes: 1 addition & 1 deletion templates/python-scrapy/src/middlewares.py
@@ -1,4 +1,3 @@
# ruff: noqa: D101, D102, ARG002, UP028
"""Scrapy middlewares module.
This module defines Scrapy middlewares. Middlewares are processing components that handle requests and
@@ -10,6 +9,7 @@
https://docs.scrapy.org/en/latest/topics/spider-middleware.html
"""

# ruff: noqa: D101, D102, ARG002, UP028
from __future__ import annotations

from typing import TYPE_CHECKING
2 changes: 1 addition & 1 deletion templates/python-scrapy/src/pipelines.py
@@ -1,4 +1,3 @@
# ruff: noqa: ARG002, D102
"""Scrapy item pipelines module.
This module defines Scrapy item pipelines for scraped data. Item pipelines are processing components
@@ -8,6 +7,7 @@
http://doc.scrapy.org/en/latest/topics/item-pipeline.html
"""

# ruff: noqa: ARG002, D102
from scrapy import Spider

from .items import TitleItem
5 changes: 3 additions & 2 deletions templates/python-scrapy/src/settings.py
@@ -6,14 +6,15 @@
http://doc.scrapy.org/en/latest/topics/settings.html
"""

# You can update these options and add new ones
BOT_NAME = 'titlebot'
DEPTH_LIMIT = 1
LOG_LEVEL = 'INFO'
NEWSPIDER_MODULE = 'src.spiders'
REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
ROBOTSTXT_OBEY = True
SPIDER_MODULES = ['src.spiders']
TELNETCONSOLE_ENABLED = False
# Do not change the Twisted reactor unless you really know what you are doing.
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
ITEM_PIPELINES = {
'src.pipelines.TitleItemPipeline': 123,
}
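
As a quick sanity check of how these values interact with the Apify overrides applied in src/main.py, a hedged local sketch (it assumes `apply_apify_settings()` builds on the project settings referenced by `SCRAPY_SETTINGS_MODULE` and returns a Scrapy Settings-like object; run it from the template's project root):

import os

os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings'

from apify.scrapy import apply_apify_settings

# Build the merged settings the same way src/main.py does.
settings = apply_apify_settings(proxy_config=None)
print(settings.get('LOG_LEVEL'))        # 'INFO' from settings.py, unless the SDK overrides it
print(settings.get('TWISTED_REACTOR'))  # expected: the asyncio selector reactor
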
4 changes: 4 additions & 0 deletions templates/python-scrapy/src/spiders/__init__.py
@@ -6,3 +6,7 @@
For detailed information on creating and utilizing spiders, refer to the official documentation:
https://docs.scrapy.org/en/latest/topics/spiders.html
"""

from .title import TitleSpider

__all__ = ['TitleSpider']
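
The re-export is what enables the shorter `from .spiders import TitleSpider` used in src/main.py; a small illustration, assuming it is run from the template's project root with the dependencies installed:

# Both imports refer to the same class; the package-level one is what the
# updated src/main.py relies on.
from src.spiders import TitleSpider
from src.spiders.title import TitleSpider as TitleSpiderDirect

assert TitleSpider is TitleSpiderDirect
assert TitleSpider.name == 'title_spider'
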
40 changes: 27 additions & 13 deletions templates/python-scrapy/src/spiders/title.py
@@ -1,7 +1,7 @@
# ruff: noqa: RUF012, TID252
# ruff: noqa: TID252
from __future__ import annotations

from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Any
from urllib.parse import urljoin

from scrapy import Request, Spider
@@ -15,19 +15,32 @@


class TitleSpider(Spider):
"""Scrapes title pages and enqueues all links found on the page."""
"""A spider that scrapes web pages to extract titles and discover new links.
name = 'title_spider'
This spider retrieves the content of the <title> element from each page and queues
any valid hyperlinks for further crawling.
"""

# The `start_urls` specified in this class will be merged with the `start_urls` value from your Actor input
# when the project is executed using Apify.
start_urls = ['https://apify.com/']
name = 'title_spider'

# Scrape only the pages within the Apify domain.
allowed_domains = ['apify.com']
def __init__(
self,
start_urls: list[str],
allowed_domains: list[str],
*args: Any,
**kwargs: Any,
) -> None:
"""A default costructor.
# Limit the number of pages to scrape.
custom_settings = {'CLOSESPIDER_PAGECOUNT': 10}
Args:
start_urls: URLs to start the scraping from.
allowed_domains: Domains that the scraper is allowed to crawl.
*args: Additional positional arguments.
**kwargs: Additional keyword arguments.
"""
super().__init__(*args, **kwargs)
self.start_urls = start_urls
self.allowed_domains = allowed_domains

def parse(self, response: Response) -> Generator[TitleItem | Request, None, None]:
"""Parse the web page response.
@@ -36,7 +49,7 @@ def parse(self, response: Response) -> Generator[TitleItem | Request, None, None]:
response: The web page response.
Yields:
Yields scraped TitleItem and Requests for links.
Yields scraped `TitleItem` and new `Request` objects for links.
"""
self.logger.info('TitleSpider is parsing %s...', response)

@@ -45,7 +58,8 @@ def parse(self, response: Response) -> Generator[TitleItem | Request, None, None]:
title = response.css('title::text').extract_first()
yield TitleItem(url=url, title=title)

# Extract all links from the page, create Requests out of them, and yield them
# Extract all links from the page, create `Request` objects out of them,
# and yield them.
for link_href in response.css('a::attr("href")'):
link_url = urljoin(response.url, link_href.get())
if link_url.startswith(('http://', 'https://')):
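
Since the spider now takes `start_urls` and `allowed_domains` through its constructor, Scrapy forwards any keyword arguments given to `crawl()` into `__init__`; a minimal local sketch outside the Apify Actor (assumptions: run from the template's project root, dependencies installed, network access available):

import os

os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings'

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from src.spiders import TitleSpider

# Keyword arguments passed to `crawl()` are forwarded by Scrapy to the
# spider's constructor, which is how src/main.py injects the Actor input.
process = CrawlerProcess(get_project_settings(), install_root_handler=False)
process.crawl(
    TitleSpider,
    start_urls=['https://apify.com/'],
    allowed_domains=['apify.com'],
)
process.start()
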
