feat: update Python Scrapy template to use new SDK (#311)
Relates: #303
vdusek authored Feb 19, 2025
1 parent a061cc1 commit d3587e0
Showing 9 changed files with 75 additions and 141 deletions.
15 changes: 11 additions & 4 deletions templates/python-scrapy/.actor/input_schema.json
@@ -7,10 +7,17 @@
"title": "Start URLs",
"type": "array",
"description": "URLs to start with",
"prefill": [
{ "url": "https://apify.com" }
],
"editor": "requestListSources"
"editor": "requestListSources",
"prefill": [{ "url": "https://apify.com/" }],
"default": [{ "url": "https://apify.com/" }]
},
"allowedDomains": {
"title": "Allowed domains",
"type": "array",
"description": "Domains that the scraper is allowed to crawl.",
"editor": "json",
"prefill": ["apify.com"],
"default": ["apify.com"]
},
"proxyConfiguration": {
"sectionCaption": "Proxy and HTTP configuration",
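
For orientation, the sketch below shows roughly what `Actor.get_input()` returns when the prefill/default values above are used. The `startUrls` and `allowedDomains` keys come from this schema; the `proxyConfiguration` shape is an assumption based on the standard Apify proxy input and is not part of this hunk.

# A sketch of the resolved Actor input for the defaults above.
actor_input = {
    'startUrls': [{'url': 'https://apify.com/'}],
    'allowedDomains': ['apify.com'],
    'proxyConfiguration': {'useApifyProxy': True},  # assumed shape, not shown in this hunk
}
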
3 changes: 1 addition & 2 deletions templates/python-scrapy/requirements.txt
@@ -2,5 +2,4 @@
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify[scrapy] < 3.0
nest-asyncio ~= 1.6
scrapy ~= 2.12
scrapy < 3.0
108 changes: 12 additions & 96 deletions templates/python-scrapy/src/__main__.py
@@ -1,4 +1,3 @@
# ruff: noqa: E402
"""Apify Actor integration for Scrapy projects.
This module transforms a Scrapy project into an Apify Actor, handling the configuration of logging, patching Scrapy's
@@ -11,109 +10,26 @@
We recommend you do not modify this file unless you really know what you are doing.
"""

# We need to configure the logging first before we import anything else, so that nothing else imports
# `scrapy.utils.log` before we patch it.
# ruff: noqa: E402
from __future__ import annotations

from logging import StreamHandler, getLogger
from typing import Any

from apify.log import ActorLogFormatter
from scrapy.utils import log as scrapy_logging
from scrapy.utils.project import get_project_settings

# Define names of the loggers.
MAIN_LOGGER_NAMES = ['apify', 'apify_client', 'scrapy']
OTHER_LOGGER_NAMES = ['filelock', 'hpack', 'httpcore', 'httpx', 'protego', 'twisted']
ALL_LOGGER_NAMES = MAIN_LOGGER_NAMES + OTHER_LOGGER_NAMES

# To change the logging level, modify the `LOG_LEVEL` field in `settings.py`. If the field is not present in the file,
# Scrapy will default to `DEBUG`. This setting applies to all loggers. If you wish to change the logging level for
# a specific logger, do it in this file.
settings = get_project_settings()
LOGGING_LEVEL = settings['LOG_LEVEL']

# Define a logging handler which will be used for the loggers.
apify_handler = StreamHandler()
apify_handler.setFormatter(ActorLogFormatter(include_logger_name=True))


def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamHandler) -> None:
"""Configure a logger with the specified settings.
Args:
logger_name: The name of the logger to be configured.
log_level: The desired logging level ('DEBUG', 'INFO', 'WARNING', 'ERROR', ...).
handlers: Optional list of logging handlers.
"""
logger = getLogger(logger_name)
logger.setLevel(log_level)
logger.handlers = []

for handler in handlers:
logger.addHandler(handler)

from twisted.internet import asyncioreactor

# Apify loggers have to be set up here and in the `new_configure_logging` as well to be able to use them both from
# the `main.py` and Scrapy components.
for logger_name in MAIN_LOGGER_NAMES:
configure_logger(logger_name, LOGGING_LEVEL, apify_handler)
# Install Twisted's asyncio reactor before importing any other Twisted or
# Scrapy components.
asyncioreactor.install() # type: ignore[no-untyped-call]

# We can't attach our log handler to the loggers normally, because Scrapy would remove them in the `configure_logging`
# call here: https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L113 (even though
# `disable_existing_loggers` is set to False :facepalm:). We need to monkeypatch Scrapy's `configure_logging` method
# like this, so that our handler is attached right after Scrapy calls the `configure_logging` method, because
# otherwise we would lose some log messages.
old_configure_logging = scrapy_logging.configure_logging


def new_configure_logging(*args: Any, **kwargs: Any) -> None:
"""Configure logging for Scrapy and root loggers to ensure consistent logging behavior.
We need to manually configure both the root logger and all Scrapy-associated loggers. Configuring only the root
logger is not sufficient, as Scrapy will override it with its own settings. Scrapy uses these four primary
loggers - https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L60:L77. Therefore, we configure here
these four loggers and the root logger.
"""
old_configure_logging(*args, **kwargs)

# We modify the root (None) logger to ensure proper display of logs from spiders when using the `self.logger`
# property within spiders. See details in the Spider logger property:
# https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/spiders/__init__.py#L43:L46.
configure_logger(None, LOGGING_LEVEL, apify_handler)

# We modify other loggers only by setting up their log level. A custom log handler is added
# only to the root logger to avoid duplicate log messages.
for logger_name in ALL_LOGGER_NAMES:
configure_logger(logger_name, LOGGING_LEVEL)

# Set the HTTPX logger explicitly to the WARNING level, because it is too verbose and spams the logs with useless
# messages, especially when running on the platform.
configure_logger('httpx', 'WARNING')


scrapy_logging.configure_logging = new_configure_logging

# Now we can do the rest of the setup.
import asyncio
import os

import nest_asyncio
from scrapy.utils.reactor import install_reactor
from apify.scrapy import initialize_logging, run_scrapy_actor

# Import your main Actor coroutine here.
from .main import main

# For compatibility between Twisted (used by Scrapy) and AsyncIO (used by Apify) asynchronous libraries, it is
# necessary to set the Twisted reactor to `AsyncioSelectorReactor`. This setup allows the two asynchronous libraries
# to work together.
#
# Note: The reactor must be installed before applying `nest_asyncio.apply()`, otherwise, it will not work correctly
# on Windows.
install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
nest_asyncio.apply()

# Specify the path to the Scrapy project settings module.
# Ensure the location to the Scrapy settings module is defined.
os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings'

# Run the Apify main coroutine in the event loop.
asyncio.run(main())

if __name__ == '__main__':
initialize_logging()
run_scrapy_actor(main())
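
Pieced together, the new entrypoint is roughly the following (a sketch assembled from the hunks above; the docstring is shortened here and exact comment placement may differ slightly from the committed file):

"""Apify Actor integration for Scrapy projects (entrypoint sketch)."""

# ruff: noqa: E402
from __future__ import annotations

from twisted.internet import asyncioreactor

# Install Twisted's asyncio reactor before importing any other Twisted or
# Scrapy components.
asyncioreactor.install()  # type: ignore[no-untyped-call]

import os

from apify.scrapy import initialize_logging, run_scrapy_actor

# Import your main Actor coroutine here.
from .main import main

# Ensure the location of the Scrapy settings module is defined.
os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings'

if __name__ == '__main__':
    initialize_logging()
    run_scrapy_actor(main())
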
37 changes: 15 additions & 22 deletions templates/python-scrapy/src/main.py
@@ -22,38 +22,31 @@
from __future__ import annotations

from apify import Actor
from apify.scrapy.utils import apply_apify_settings
from scrapy.crawler import CrawlerProcess
from apify.scrapy import apply_apify_settings
from scrapy.crawler import CrawlerRunner
from scrapy.utils.defer import deferred_to_future

# Import your Scrapy spider here.
from .spiders.title import TitleSpider as Spider

# Default input values for local execution using `apify run`.
LOCAL_DEFAULT_START_URLS = [{'url': 'https://apify.com'}]
from .spiders import TitleSpider as Spider


async def main() -> None:
"""Apify Actor main coroutine for executing the Scrapy spider."""
async with Actor:
Actor.log.info('Actor is being executed...')

# Retrieve and process Actor input.
actor_input = await Actor.get_input() or {}
start_urls = actor_input.get('startUrls', LOCAL_DEFAULT_START_URLS)
start_urls = [url['url'] for url in actor_input.get('startUrls', [])]
allowed_domains = actor_input.get('allowedDomains')
proxy_config = actor_input.get('proxyConfiguration')

# Open the default request queue for handling URLs to be processed.
request_queue = await Actor.open_request_queue()

# Enqueue the start URLs.
for start_url in start_urls:
url = start_url.get('url')
await request_queue.add_request(url)

# Apply Apify settings, it will override the Scrapy project settings.
# Apply Apify settings, which will override the Scrapy project settings.
settings = apply_apify_settings(proxy_config=proxy_config)

# Execute the spider using Scrapy `CrawlerProcess`.
process = CrawlerProcess(settings, install_root_handler=False)
process.crawl(Spider)
process.start()
# Create CrawlerRunner and execute the Scrapy spider.
crawler_runner = CrawlerRunner(settings)
crawl_deferred = crawler_runner.crawl(
Spider,
start_urls=start_urls,
allowed_domains=allowed_domains,
)
await deferred_to_future(crawl_deferred)
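
Assembled from this hunk, the updated `main()` reads roughly as follows (a sketch; comments condensed):

from __future__ import annotations

from apify import Actor
from apify.scrapy import apply_apify_settings
from scrapy.crawler import CrawlerRunner
from scrapy.utils.defer import deferred_to_future

# Import your Scrapy spider here.
from .spiders import TitleSpider as Spider


async def main() -> None:
    """Apify Actor main coroutine for executing the Scrapy spider."""
    async with Actor:
        Actor.log.info('Actor is being executed...')

        # Retrieve and process Actor input.
        actor_input = await Actor.get_input() or {}
        start_urls = [url['url'] for url in actor_input.get('startUrls', [])]
        allowed_domains = actor_input.get('allowedDomains')
        proxy_config = actor_input.get('proxyConfiguration')

        # Apply Apify settings, which will override the Scrapy project settings.
        settings = apply_apify_settings(proxy_config=proxy_config)

        # Create a CrawlerRunner, start the crawl, and wait for it to finish by
        # converting the Twisted Deferred into an awaitable future.
        crawler_runner = CrawlerRunner(settings)
        crawl_deferred = crawler_runner.crawl(
            Spider,
            start_urls=start_urls,
            allowed_domains=allowed_domains,
        )
        await deferred_to_future(crawl_deferred)
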
2 changes: 1 addition & 1 deletion templates/python-scrapy/src/middlewares.py
@@ -1,4 +1,3 @@
# ruff: noqa: D101, D102, ARG002, UP028
"""Scrapy middlewares module.
This module defines Scrapy middlewares. Middlewares are processing components that handle requests and
@@ -10,6 +9,7 @@
https://docs.scrapy.org/en/latest/topics/spider-middleware.html
"""

# ruff: noqa: D101, D102, ARG002, UP028
from __future__ import annotations

from typing import TYPE_CHECKING
2 changes: 1 addition & 1 deletion templates/python-scrapy/src/pipelines.py
@@ -1,4 +1,3 @@
# ruff: noqa: ARG002, D102
"""Scrapy item pipelines module.
This module defines Scrapy item pipelines for scraped data. Item pipelines are processing components
@@ -8,6 +7,7 @@
http://doc.scrapy.org/en/latest/topics/item-pipeline.html
"""

# ruff: noqa: ARG002, D102
from scrapy import Spider

from .items import TitleItem
5 changes: 3 additions & 2 deletions templates/python-scrapy/src/settings.py
@@ -6,14 +6,15 @@
http://doc.scrapy.org/en/latest/topics/settings.html
"""

# You can update these options and add new ones
BOT_NAME = 'titlebot'
DEPTH_LIMIT = 1
LOG_LEVEL = 'INFO'
NEWSPIDER_MODULE = 'src.spiders'
REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
ROBOTSTXT_OBEY = True
SPIDER_MODULES = ['src.spiders']
TELNETCONSOLE_ENABLED = False
# Do not change the Twisted reactor unless you really know what you are doing.
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
ITEM_PIPELINES = {
'src.pipelines.TitleItemPipeline': 123,
}
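
As a quick sanity check of how these values interact with the Apify overrides applied in src/main.py, a hedged local sketch (it assumes `apply_apify_settings()` builds on the project settings referenced by `SCRAPY_SETTINGS_MODULE` and returns a Scrapy Settings-like object; run it from the template's project root):

import os

os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings'

from apify.scrapy import apply_apify_settings

# Build the merged settings the same way src/main.py does.
settings = apply_apify_settings(proxy_config=None)
print(settings.get('LOG_LEVEL'))        # 'INFO' from settings.py, unless the SDK overrides it
print(settings.get('TWISTED_REACTOR'))  # expected: the asyncio selector reactor
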
4 changes: 4 additions & 0 deletions templates/python-scrapy/src/spiders/__init__.py
@@ -6,3 +6,7 @@
For detailed information on creating and utilizing spiders, refer to the official documentation:
https://docs.scrapy.org/en/latest/topics/spiders.html
"""

from .title import TitleSpider

__all__ = ['TitleSpider']
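
The re-export is what enables the shorter `from .spiders import TitleSpider` used in src/main.py; a small illustration, assuming it is run from the template's project root with the dependencies installed:

# Both imports refer to the same class; the package-level one is what the
# updated src/main.py relies on.
from src.spiders import TitleSpider
from src.spiders.title import TitleSpider as TitleSpiderDirect

assert TitleSpider is TitleSpiderDirect
assert TitleSpider.name == 'title_spider'
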
40 changes: 27 additions & 13 deletions templates/python-scrapy/src/spiders/title.py
@@ -1,7 +1,7 @@
# ruff: noqa: RUF012, TID252
# ruff: noqa: TID252
from __future__ import annotations

from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Any
from urllib.parse import urljoin

from scrapy import Request, Spider
@@ -15,19 +15,32 @@


class TitleSpider(Spider):
"""Scrapes title pages and enqueues all links found on the page."""
"""A spider that scrapes web pages to extract titles and discover new links.
name = 'title_spider'
This spider retrieves the content of the <title> element from each page and queues
any valid hyperlinks for further crawling.
"""

# The `start_urls` specified in this class will be merged with the `start_urls` value from your Actor input
# when the project is executed using Apify.
start_urls = ['https://apify.com/']
name = 'title_spider'

# Scrape only the pages within the Apify domain.
allowed_domains = ['apify.com']
def __init__(
self,
start_urls: list[str],
allowed_domains: list[str],
*args: Any,
**kwargs: Any,
) -> None:
"""A default costructor.
# Limit the number of pages to scrape.
custom_settings = {'CLOSESPIDER_PAGECOUNT': 10}
Args:
start_urls: URLs to start the scraping from.
allowed_domains: Domains that the scraper is allowed to crawl.
*args: Additional positional arguments.
**kwargs: Additional keyword arguments.
"""
super().__init__(*args, **kwargs)
self.start_urls = start_urls
self.allowed_domains = allowed_domains

def parse(self, response: Response) -> Generator[TitleItem | Request, None, None]:
"""Parse the web page response.
@@ -36,7 +49,7 @@ def parse(self, response: Response) -> Generator[TitleItem | Request, None, None]:
response: The web page response.
Yields:
Yields scraped TitleItem and Requests for links.
Yields scraped `TitleItem` and new `Request` objects for links.
"""
self.logger.info('TitleSpider is parsing %s...', response)

@@ -45,7 +58,8 @@ def parse(self, response: Response) -> Generator[TitleItem | Request, None, None]:
title = response.css('title::text').extract_first()
yield TitleItem(url=url, title=title)

# Extract all links from the page, create Requests out of them, and yield them
# Extract all links from the page, create `Request` objects out of them,
# and yield them.
for link_href in response.css('a::attr("href")'):
link_url = urljoin(response.url, link_href.get())
if link_url.startswith(('http://', 'https://')):
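
Since the spider now takes `start_urls` and `allowed_domains` through its constructor, Scrapy forwards any keyword arguments given to `crawl()` into `__init__`; a minimal local sketch outside the Apify Actor (assumptions: run from the template's project root, dependencies installed, network access available):

import os

os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings'

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from src.spiders import TitleSpider

# Keyword arguments passed to `crawl()` are forwarded by Scrapy to the
# spider's constructor, which is how src/main.py injects the Actor input.
process = CrawlerProcess(get_project_settings(), install_root_handler=False)
process.crawl(
    TitleSpider,
    start_urls=['https://apify.com/'],
    allowed_domains=['apify.com'],
)
process.start()
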
