From d8b2438a90f93709ff83b4e990295d1e59927a9b Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 24 Dec 2024 09:38:37 +0100 Subject: [PATCH 01/79] WIP --- .../_adaptive_playwright_crawler.py | 52 +++++++++++++++++++ .../crawlers/_adaptive_playwright/_example.py | 45 ++++++++++++++++ 2 files changed, 97 insertions(+) create mode 100644 src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py create mode 100644 src/crawlee/crawlers/_adaptive_playwright/_example.py diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py new file mode 100644 index 0000000000..8c1f15f0cb --- /dev/null +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -0,0 +1,52 @@ +from dataclasses import dataclass +from typing import Callable, Awaitable + +from bs4 import BeautifulSoup +from playwright.async_api import Page, Response + +from crawlee._types import EnqueueLinksFunction +from crawlee._utils.docs import docs_group +from crawlee.crawlers import BeautifulSoupCrawler, PlaywrightCrawler, ContextPipeline, AbstractHttpCrawler, \ + ParsedHttpCrawlingContext, PlaywrightCrawlingContext +from crawlee.crawlers._beautifulsoup._beautifulsoup_parser import BeautifulSoupParser + + +@dataclass(frozen=True) +@docs_group('Data structures') +class AdaptiveCrawlingContext(ParsedHttpCrawlingContext[BeautifulSoup], PlaywrightCrawlingContext): + _page: Page + _response: Response + _enqueue_links: EnqueueLinksFunction + _infinite_scroll: Callable[[], Awaitable[None]] + + @property + def page(self) -> Page: + """The Playwright `Page` object for the current page.""" + return self._page + + @property + def response(self) -> Response: + """The Playwright `Response` object containing the response details for the current URL.""" + return self._response + + @property + def enqueue_links(self) -> EnqueueLinksFunction: + """The Playwright `EnqueueLinksFunction` implementation.""" + return self._enqueue_links + + @property + def infinite_scroll(self) -> Callable[[], Awaitable[None]]: + """A function to perform infinite scrolling on the page. This scrolls to the bottom, triggering + the loading of additional content if present.""" + return self._infinite_scroll + +class AdaptivePlaywrightCrawler(AbstractHttpCrawler[AdaptiveCrawlingContext, BeautifulSoup], PlaywrightCrawler): + + def __init__(self): + context_pipeline = ContextPipeline().compose(self._open_page).compose(self._navigate).compose( + self._handle_blocked_request) + super().__init__(parser=BeautifulSoupParser(), _context_pipeline=context_pipeline) + self._context_pipeline = ContextPipeline().compose(self._open_page).compose(self._navigate) + + def _decide_crawler_type(self): + diff --git a/src/crawlee/crawlers/_adaptive_playwright/_example.py b/src/crawlee/crawlers/_adaptive_playwright/_example.py new file mode 100644 index 0000000000..2eeccd4d5f --- /dev/null +++ b/src/crawlee/crawlers/_adaptive_playwright/_example.py @@ -0,0 +1,45 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawlingContext +from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler import AdaptivePlaywrightCrawler + + + + +async def main(): + crawler = AdaptivePlaywrightCrawler() + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Extract data from the page using Playwright's API. 
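+        # query_selector_all returns a list of Playwright ElementHandle objects;
+        # 'athing' is the CSS class Hacker News assigns to each story row.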
+ posts = await context.page.query_selector_all('.athing') + data = [] + + for post in posts: + # Get the HTML elements for the title and rank within each post. + title_element = await post.query_selector('.title a') + rank_element = await post.query_selector('.rank') + + # Extract the data we want from the elements. + title = await title_element.inner_text() if title_element else None + rank = await rank_element.inner_text() if rank_element else None + href = await title_element.get_attribute('href') if title_element else None + + data.append({'title': title, 'rank': rank, 'href': href}) + + # Push the extracted data to the default dataset. In local configuration, + # the data will be stored as JSON files in ./storage/datasets/default. + await context.push_data(data) + + # Find a link to the next page and enqueue it if it exists. + await context.enqueue_links(selector='.morelink') + + + # Run the crawler with the initial list of URLs. + await crawler.run(['https://news.ycombinator.com/']) + + +if __name__ == '__main__': + asyncio.run(main()) From c12acdaa9a8722b56f5138fbc721a87ee15b96c4 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 31 Dec 2024 18:16:39 +0100 Subject: [PATCH 02/79] More feasible version of composition. Add method to BasicCrawler to handle just one request. --- poetry.lock | 361 +++++++++--------- .../_adaptive_playwright_crawler.py | 117 +++--- .../_adaptive_playwright_crawling_context.py | 81 ++++ .../crawlers/_adaptive_playwright/_example.py | 37 +- src/crawlee/crawlers/_basic/_basic_crawler.py | 25 ++ 5 files changed, 359 insertions(+), 262 deletions(-) create mode 100644 src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py diff --git a/poetry.lock b/poetry.lock index dd74f74144..2407270fa6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. [[package]] name = "annotated-types" @@ -541,127 +541,114 @@ files = [ [[package]] name = "charset-normalizer" -version = "3.4.0" +version = "3.4.1" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 
optional = false -python-versions = ">=3.7.0" +python-versions = ">=3.7" files = [ - {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:4f9fc98dad6c2eaa32fc3af1417d95b5e3d08aff968df0cd320066def971f9a6"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0de7b687289d3c1b3e8660d0741874abe7888100efe14bd0f9fd7141bcbda92b"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5ed2e36c3e9b4f21dd9422f6893dec0abf2cca553af509b10cd630f878d3eb99"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40d3ff7fc90b98c637bda91c89d51264a3dcf210cade3a2c6f838c7268d7a4ca"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1110e22af8ca26b90bd6364fe4c763329b0ebf1ee213ba32b68c73de5752323d"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:86f4e8cca779080f66ff4f191a685ced73d2f72d50216f7112185dc02b90b9b7"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f683ddc7eedd742e2889d2bfb96d69573fde1d92fcb811979cdb7165bb9c7d3"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:27623ba66c183eca01bf9ff833875b459cad267aeeb044477fedac35e19ba907"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f606a1881d2663630ea5b8ce2efe2111740df4b687bd78b34a8131baa007f79b"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:0b309d1747110feb25d7ed6b01afdec269c647d382c857ef4663bbe6ad95a912"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:136815f06a3ae311fae551c3df1f998a1ebd01ddd424aa5603a4336997629e95"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:14215b71a762336254351b00ec720a8e85cada43b987da5a042e4ce3e82bd68e"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:79983512b108e4a164b9c8d34de3992f76d48cadc9554c9e60b43f308988aabe"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-win32.whl", hash = "sha256:c94057af19bc953643a33581844649a7fdab902624d2eb739738a30e2b3e60fc"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:55f56e2ebd4e3bc50442fbc0888c9d8c94e4e06a933804e2af3e89e2f9c1c749"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0d99dd8ff461990f12d6e42c7347fd9ab2532fb70e9621ba520f9e8637161d7c"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c57516e58fd17d03ebe67e181a4e4e2ccab1168f8c2976c6a334d4f819fe5944"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6dba5d19c4dfab08e58d5b36304b3f92f3bd5d42c1a3fa37b5ba5cdf6dfcbcee"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf4475b82be41b07cc5e5ff94810e6a01f276e37c2d55571e3fe175e467a1a1c"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ce031db0408e487fd2775d745ce30a7cd2923667cf3b69d48d219f1d8f5ddeb6"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:8ff4e7cdfdb1ab5698e675ca622e72d58a6fa2a8aa58195de0c0061288e6e3ea"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3710a9751938947e6327ea9f3ea6332a09bf0ba0c09cae9cb1f250bd1f1549bc"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:82357d85de703176b5587dbe6ade8ff67f9f69a41c0733cf2425378b49954de5"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:47334db71978b23ebcf3c0f9f5ee98b8d65992b65c9c4f2d34c2eaf5bcaf0594"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8ce7fd6767a1cc5a92a639b391891bf1c268b03ec7e021c7d6d902285259685c"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:f1a2f519ae173b5b6a2c9d5fa3116ce16e48b3462c8b96dfdded11055e3d6365"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:63bc5c4ae26e4bc6be6469943b8253c0fd4e4186c43ad46e713ea61a0ba49129"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bcb4f8ea87d03bc51ad04add8ceaf9b0f085ac045ab4d74e73bbc2dc033f0236"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-win32.whl", hash = "sha256:9ae4ef0b3f6b41bad6366fb0ea4fc1d7ed051528e113a60fa2a65a9abb5b1d99"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:cee4373f4d3ad28f1ab6290684d8e2ebdb9e7a1b74fdc39e4c211995f77bec27"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:0713f3adb9d03d49d365b70b84775d0a0d18e4ab08d12bc46baa6132ba78aaf6"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:de7376c29d95d6719048c194a9cf1a1b0393fbe8488a22008610b0361d834ecf"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4a51b48f42d9358460b78725283f04bddaf44a9358197b889657deba38f329db"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b295729485b06c1a0683af02a9e42d2caa9db04a373dc38a6a58cdd1e8abddf1"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ee803480535c44e7f5ad00788526da7d85525cfefaf8acf8ab9a310000be4b03"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3d59d125ffbd6d552765510e3f31ed75ebac2c7470c7274195b9161a32350284"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8cda06946eac330cbe6598f77bb54e690b4ca93f593dee1568ad22b04f347c15"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07afec21bbbbf8a5cc3651aa96b980afe2526e7f048fdfb7f1014d84acc8b6d8"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6b40e8d38afe634559e398cc32b1472f376a4099c75fe6299ae607e404c033b2"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b8dcd239c743aa2f9c22ce674a145e0a25cb1566c495928440a181ca1ccf6719"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:84450ba661fb96e9fd67629b93d2941c871ca86fc38d835d19d4225ff946a631"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = 
"sha256:44aeb140295a2f0659e113b31cfe92c9061622cadbc9e2a2f7b8ef6b1e29ef4b"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:1db4e7fefefd0f548d73e2e2e041f9df5c59e178b4c72fbac4cc6f535cfb1565"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-win32.whl", hash = "sha256:5726cf76c982532c1863fb64d8c6dd0e4c90b6ece9feb06c9f202417a31f7dd7"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:b197e7094f232959f8f20541ead1d9862ac5ebea1d58e9849c1bf979255dfac9"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:dd4eda173a9fcccb5f2e2bd2a9f423d180194b1bf17cf59e3269899235b2a114"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9e3c4c9e1ed40ea53acf11e2a386383c3304212c965773704e4603d589343ed"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:92a7e36b000bf022ef3dbb9c46bfe2d52c047d5e3f3343f43204263c5addc250"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:54b6a92d009cbe2fb11054ba694bc9e284dad30a26757b1e372a1fdddaf21920"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ffd9493de4c922f2a38c2bf62b831dcec90ac673ed1ca182fe11b4d8e9f2a64"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:35c404d74c2926d0287fbd63ed5d27eb911eb9e4a3bb2c6d294f3cfd4a9e0c23"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4796efc4faf6b53a18e3d46343535caed491776a22af773f366534056c4e1fbc"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7fdd52961feb4c96507aa649550ec2a0d527c086d284749b2f582f2d40a2e0d"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:92db3c28b5b2a273346bebb24857fda45601aef6ae1c011c0a997106581e8a88"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ab973df98fc99ab39080bfb0eb3a925181454d7c3ac8a1e695fddfae696d9e90"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:4b67fdab07fdd3c10bb21edab3cbfe8cf5696f453afce75d815d9d7223fbe88b"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:aa41e526a5d4a9dfcfbab0716c7e8a1b215abd3f3df5a45cf18a12721d31cb5d"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ffc519621dce0c767e96b9c53f09c5d215578e10b02c285809f76509a3931482"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-win32.whl", hash = "sha256:f19c1585933c82098c2a520f8ec1227f20e339e33aca8fa6f956f6691b784e67"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:707b82d19e65c9bd28b81dde95249b07bf9f5b90ebe1ef17d9b57473f8a64b7b"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:dbe03226baf438ac4fda9e2d0715022fd579cb641c4cf639fa40d53b2fe6f3e2"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd9a8bd8900e65504a305bf8ae6fa9fbc66de94178c420791d0293702fce2df7"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b8831399554b92b72af5932cdbbd4ddc55c55f631bb13ff8fe4e6536a06c5c51"}, - {file = 
"charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a14969b8691f7998e74663b77b4c36c0337cb1df552da83d5c9004a93afdb574"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dcaf7c1524c0542ee2fc82cc8ec337f7a9f7edee2532421ab200d2b920fc97cf"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:425c5f215d0eecee9a56cdb703203dda90423247421bf0d67125add85d0c4455"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:d5b054862739d276e09928de37c79ddeec42a6e1bfc55863be96a36ba22926f6"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:f3e73a4255342d4eb26ef6df01e3962e73aa29baa3124a8e824c5d3364a65748"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:2f6c34da58ea9c1a9515621f4d9ac379871a8f21168ba1b5e09d74250de5ad62"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_s390x.whl", hash = "sha256:f09cb5a7bbe1ecae6e87901a2eb23e0256bb524a79ccc53eb0b7629fbe7677c4"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:0099d79bdfcf5c1f0c2c72f91516702ebf8b0b8ddd8905f97a8aecf49712c621"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-win32.whl", hash = "sha256:9c98230f5042f4945f957d006edccc2af1e03ed5e37ce7c373f00a5a4daa6149"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-win_amd64.whl", hash = "sha256:62f60aebecfc7f4b82e3f639a7d1433a20ec32824db2199a11ad4f5e146ef5ee"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:af73657b7a68211996527dbfeffbb0864e043d270580c5aef06dc4b659a4b578"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cab5d0b79d987c67f3b9e9c53f54a61360422a5a0bc075f43cab5621d530c3b6"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9289fd5dddcf57bab41d044f1756550f9e7cf0c8e373b8cdf0ce8773dc4bd417"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b493a043635eb376e50eedf7818f2f322eabbaa974e948bd8bdd29eb7ef2a51"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9fa2566ca27d67c86569e8c85297aaf413ffab85a8960500f12ea34ff98e4c41"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8e538f46104c815be19c975572d74afb53f29650ea2025bbfaef359d2de2f7f"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fd30dc99682dc2c603c2b315bded2799019cea829f8bf57dc6b61efde6611c8"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2006769bd1640bdf4d5641c69a3d63b71b81445473cac5ded39740a226fa88ab"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:dc15e99b2d8a656f8e666854404f1ba54765871104e50c8e9813af8a7db07f12"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:ab2e5bef076f5a235c3774b4f4028a680432cded7cad37bba0fd90d64b187d19"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:4ec9dd88a5b71abfc74e9df5ebe7921c35cbb3b641181a531ca65cdb5e8e4dea"}, - {file = 
"charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:43193c5cda5d612f247172016c4bb71251c784d7a4d9314677186a838ad34858"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:aa693779a8b50cd97570e5a0f343538a8dbd3e496fa5dcb87e29406ad0299654"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-win32.whl", hash = "sha256:7706f5850360ac01d80c89bcef1640683cc12ed87f42579dab6c5d3ed6888613"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:c3e446d253bd88f6377260d07c895816ebf33ffffd56c1c792b13bff9c3e1ade"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:980b4f289d1d90ca5efcf07958d3eb38ed9c0b7676bf2831a54d4f66f9c27dfa"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f28f891ccd15c514a0981f3b9db9aa23d62fe1a99997512b0491d2ed323d229a"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8aacce6e2e1edcb6ac625fb0f8c3a9570ccc7bfba1f63419b3769ccf6a00ed0"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd7af3717683bea4c87acd8c0d3d5b44d56120b26fd3f8a692bdd2d5260c620a"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5ff2ed8194587faf56555927b3aa10e6fb69d931e33953943bc4f837dfee2242"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e91f541a85298cf35433bf66f3fab2a4a2cff05c127eeca4af174f6d497f0d4b"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:309a7de0a0ff3040acaebb35ec45d18db4b28232f21998851cfa709eeff49d62"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:285e96d9d53422efc0d7a17c60e59f37fbf3dfa942073f666db4ac71e8d726d0"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:5d447056e2ca60382d460a604b6302d8db69476fd2015c81e7c35417cfabe4cd"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:20587d20f557fe189b7947d8e7ec5afa110ccf72a3128d61a2a387c3313f46be"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:130272c698667a982a5d0e626851ceff662565379baf0ff2cc58067b81d4f11d"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:ab22fbd9765e6954bc0bcff24c25ff71dcbfdb185fcdaca49e81bac68fe724d3"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7782afc9b6b42200f7362858f9e73b1f8316afb276d316336c0ec3bd73312742"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-win32.whl", hash = "sha256:2de62e8801ddfff069cd5c504ce3bc9672b23266597d4e4f50eda28846c322f2"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:95c3c157765b031331dd4db3c775e58deaee050a3042fcad72cbc4189d7c8dca"}, - {file = "charset_normalizer-3.4.0-py3-none-any.whl", hash = "sha256:fe9f97feb71aa9896b81973a7bbada8c49501dc73e58a10fcef6663af95e5079"}, - {file = "charset_normalizer-3.4.0.tar.gz", hash = "sha256:223217c3d4f82c3ac5e29032b3f1c2eb0fb591b72161f86d93f5719079dae93e"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:91b36a978b5ae0ee86c394f5a54d6ef44db1de0815eb43de826d41d21e4af3de"}, + {file = 
"charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7461baadb4dc00fd9e0acbe254e3d7d2112e7f92ced2adc96e54ef6501c5f176"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e218488cd232553829be0664c2292d3af2eeeb94b32bea483cf79ac6a694e037"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:80ed5e856eb7f30115aaf94e4a08114ccc8813e6ed1b5efa74f9f82e8509858f"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b010a7a4fd316c3c484d482922d13044979e78d1861f0e0650423144c616a46a"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4532bff1b8421fd0a320463030c7520f56a79c9024a4e88f01c537316019005a"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d973f03c0cb71c5ed99037b870f2be986c3c05e63622c017ea9816881d2dd247"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:3a3bd0dcd373514dcec91c411ddb9632c0d7d92aed7093b8c3bbb6d69ca74408"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:d9c3cdf5390dcd29aa8056d13e8e99526cda0305acc038b96b30352aff5ff2bb"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:2bdfe3ac2e1bbe5b59a1a63721eb3b95fc9b6817ae4a46debbb4e11f6232428d"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:eab677309cdb30d047996b36d34caeda1dc91149e4fdca0b1a039b3f79d9a807"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-win32.whl", hash = "sha256:c0429126cf75e16c4f0ad00ee0eae4242dc652290f940152ca8c75c3a4b6ee8f"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:9f0b8b1c6d84c8034a44893aba5e767bf9c7a211e313a9605d9c617d7083829f"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:8bfa33f4f2672964266e940dd22a195989ba31669bd84629f05fab3ef4e2d125"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28bf57629c75e810b6ae989f03c0828d64d6b26a5e205535585f96093e405ed1"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f08ff5e948271dc7e18a35641d2f11a4cd8dfd5634f55228b691e62b37125eb3"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:234ac59ea147c59ee4da87a0c0f098e9c8d169f4dc2a159ef720f1a61bbe27cd"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd4ec41f914fa74ad1b8304bbc634b3de73d2a0889bd32076342a573e0779e00"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eea6ee1db730b3483adf394ea72f808b6e18cf3cb6454b4d86e04fa8c4327a12"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c96836c97b1238e9c9e3fe90844c947d5afbf4f4c92762679acfe19927d81d77"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:4d86f7aff21ee58f26dcf5ae81a9addbd914115cdebcbb2217e4f0ed8982e146"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = 
"sha256:09b5e6733cbd160dcc09589227187e242a30a49ca5cefa5a7edd3f9d19ed53fd"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:5777ee0881f9499ed0f71cc82cf873d9a0ca8af166dfa0af8ec4e675b7df48e6"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:237bdbe6159cff53b4f24f397d43c6336c6b0b42affbe857970cefbb620911c8"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-win32.whl", hash = "sha256:8417cb1f36cc0bc7eaba8ccb0e04d55f0ee52df06df3ad55259b9a323555fc8b"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:d7f50a1f8c450f3925cb367d011448c39239bb3eb4117c36a6d354794de4ce76"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:73d94b58ec7fecbc7366247d3b0b10a21681004153238750bb67bd9012414545"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dad3e487649f498dd991eeb901125411559b22e8d7ab25d3aeb1af367df5efd7"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c30197aa96e8eed02200a83fba2657b4c3acd0f0aa4bdc9f6c1af8e8962e0757"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2369eea1ee4a7610a860d88f268eb39b95cb588acd7235e02fd5a5601773d4fa"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc2722592d8998c870fa4e290c2eec2c1569b87fe58618e67d38b4665dfa680d"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffc9202a29ab3920fa812879e95a9e78b2465fd10be7fcbd042899695d75e616"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:804a4d582ba6e5b747c625bf1255e6b1507465494a40a2130978bda7b932c90b"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:0f55e69f030f7163dffe9fd0752b32f070566451afe180f99dbeeb81f511ad8d"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c4c3e6da02df6fa1410a7680bd3f63d4f710232d3139089536310d027950696a"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:5df196eb874dae23dcfb968c83d4f8fdccb333330fe1fc278ac5ceeb101003a9"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e358e64305fe12299a08e08978f51fc21fac060dcfcddd95453eabe5b93ed0e1"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-win32.whl", hash = "sha256:9b23ca7ef998bc739bf6ffc077c2116917eabcc901f88da1b9856b210ef63f35"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:6ff8a4a60c227ad87030d76e99cd1698345d4491638dfa6673027c48b3cd395f"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:aabfa34badd18f1da5ec1bc2715cadc8dca465868a4e73a0173466b688f29dda"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22e14b5d70560b8dd51ec22863f370d1e595ac3d024cb8ad7d308b4cd95f8313"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8436c508b408b82d87dc5f62496973a1805cd46727c34440b0d29d8a2f50a6c9"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:2d074908e1aecee37a7635990b2c6d504cd4766c7bc9fc86d63f9c09af3fa11b"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:955f8851919303c92343d2f66165294848d57e9bba6cf6e3625485a70a038d11"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:44ecbf16649486d4aebafeaa7ec4c9fed8b88101f4dd612dcaf65d5e815f837f"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0924e81d3d5e70f8126529951dac65c1010cdf117bb75eb02dd12339b57749dd"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:2967f74ad52c3b98de4c3b32e1a44e32975e008a9cd2a8cc8966d6a5218c5cb2"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:c75cb2a3e389853835e84a2d8fb2b81a10645b503eca9bcb98df6b5a43eb8886"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:09b26ae6b1abf0d27570633b2b078a2a20419c99d66fb2823173d73f188ce601"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fa88b843d6e211393a37219e6a1c1df99d35e8fd90446f1118f4216e307e48cd"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-win32.whl", hash = "sha256:eb8178fe3dba6450a3e024e95ac49ed3400e506fd4e9e5c32d30adda88cbd407"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:b1ac5992a838106edb89654e0aebfc24f5848ae2547d22c2c3f66454daa11971"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f30bf9fd9be89ecb2360c7d94a711f00c09b976258846efe40db3d05828e8089"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:97f68b8d6831127e4787ad15e6757232e14e12060bec17091b85eb1486b91d8d"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7974a0b5ecd505609e3b19742b60cee7aa2aa2fb3151bc917e6e2646d7667dcf"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc54db6c8593ef7d4b2a331b58653356cf04f67c960f584edb7c3d8c97e8f39e"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:311f30128d7d333eebd7896965bfcfbd0065f1716ec92bd5638d7748eb6f936a"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:7d053096f67cd1241601111b698f5cad775f97ab25d81567d3f59219b5f1adbd"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:807f52c1f798eef6cf26beb819eeb8819b1622ddfeef9d0977a8502d4db6d534"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:dccbe65bd2f7f7ec22c4ff99ed56faa1e9f785482b9bbd7c717e26fd723a1d1e"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-musllinux_1_2_s390x.whl", hash = "sha256:2fb9bd477fdea8684f78791a6de97a953c51831ee2981f8e4f583ff3b9d9687e"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:01732659ba9b5b873fc117534143e4feefecf3b2078b0a6a2e925271bb6f4cfa"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-win32.whl", hash = "sha256:7a4f97a081603d2050bfaffdefa5b02a9ec823f8348a572e39032caa8404a487"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-win_amd64.whl", hash = "sha256:7b1bef6280950ee6c177b326508f86cad7ad4dff12454483b51d8b7d673a2c5d"}, + {file 
= "charset_normalizer-3.4.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ecddf25bee22fe4fe3737a399d0d177d72bc22be6913acfab364b40bce1ba83c"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c60ca7339acd497a55b0ea5d506b2a2612afb2826560416f6894e8b5770d4a9"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b7b2d86dd06bfc2ade3312a83a5c364c7ec2e3498f8734282c6c3d4b07b346b8"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dd78cfcda14a1ef52584dbb008f7ac81c1328c0f58184bf9a84c49c605002da6"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e27f48bcd0957c6d4cb9d6fa6b61d192d0b13d5ef563e5f2ae35feafc0d179c"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:01ad647cdd609225c5350561d084b42ddf732f4eeefe6e678765636791e78b9a"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:619a609aa74ae43d90ed2e89bdd784765de0a25ca761b93e196d938b8fd1dbbd"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:89149166622f4db9b4b6a449256291dc87a99ee53151c74cbd82a53c8c2f6ccd"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:7709f51f5f7c853f0fb938bcd3bc59cdfdc5203635ffd18bf354f6967ea0f824"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:345b0426edd4e18138d6528aed636de7a9ed169b4aaf9d61a8c19e39d26838ca"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:0907f11d019260cdc3f94fbdb23ff9125f6b5d1039b76003b5b0ac9d6a6c9d5b"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-win32.whl", hash = "sha256:ea0d8d539afa5eb2728aa1932a988a9a7af94f18582ffae4bc10b3fbdad0626e"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:329ce159e82018d646c7ac45b01a430369d526569ec08516081727a20e9e4af4"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:b97e690a2118911e39b4042088092771b4ae3fc3aa86518f84b8cf6888dbdb41"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78baa6d91634dfb69ec52a463534bc0df05dbd546209b79a3880a34487f4b84f"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1a2bc9f351a75ef49d664206d51f8e5ede9da246602dc2d2726837620ea034b2"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:75832c08354f595c760a804588b9357d34ec00ba1c940c15e31e96d902093770"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0af291f4fe114be0280cdd29d533696a77b5b49cfde5467176ecab32353395c4"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0167ddc8ab6508fe81860a57dd472b2ef4060e8d378f0cc555707126830f2537"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:2a75d49014d118e4198bcee5ee0a6f25856b29b12dbf7cd012791f8a6cc5c496"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:363e2f92b0f0174b2f8238240a1a30142e3db7b957a5dd5689b0e75fb717cc78"}, + {file = 
"charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:ab36c8eb7e454e34e60eb55ca5d241a5d18b2c6244f6827a30e451c42410b5f7"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:4c0907b1928a36d5a998d72d64d8eaa7244989f7aaaf947500d3a800c83a3fd6"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:04432ad9479fa40ec0f387795ddad4437a2b50417c69fa275e212933519ff294"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-win32.whl", hash = "sha256:3bed14e9c89dcb10e8f3a29f9ccac4955aebe93c71ae803af79265c9ca5644c5"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:49402233c892a461407c512a19435d1ce275543138294f7ef013f0b63d5d3765"}, + {file = "charset_normalizer-3.4.1-py3-none-any.whl", hash = "sha256:d98b1668f06378c6dbefec3b92299716b931cd4e6061f3c875a71ced1780ab85"}, + {file = "charset_normalizer-3.4.1.tar.gz", hash = "sha256:44251f18cd68a75b56585dd00dae26183e102cd5e0f9f1466e6df5da2ed64ea3"}, ] [[package]] name = "click" -version = "8.1.7" +version = "8.1.8" description = "Composable command line interface toolkit" optional = false python-versions = ">=3.7" files = [ - {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"}, - {file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"}, + {file = "click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2"}, + {file = "click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a"}, ] [package.dependencies] @@ -701,73 +688,73 @@ rich = "*" [[package]] name = "coverage" -version = "7.6.9" +version = "7.6.10" description = "Code coverage measurement for Python" optional = false python-versions = ">=3.9" files = [ - {file = "coverage-7.6.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:85d9636f72e8991a1706b2b55b06c27545448baf9f6dbf51c4004609aacd7dcb"}, - {file = "coverage-7.6.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:608a7fd78c67bee8936378299a6cb9f5149bb80238c7a566fc3e6717a4e68710"}, - {file = "coverage-7.6.9-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96d636c77af18b5cb664ddf12dab9b15a0cfe9c0bde715da38698c8cea748bfa"}, - {file = "coverage-7.6.9-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d75cded8a3cff93da9edc31446872d2997e327921d8eed86641efafd350e1df1"}, - {file = "coverage-7.6.9-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7b15f589593110ae767ce997775d645b47e5cbbf54fd322f8ebea6277466cec"}, - {file = "coverage-7.6.9-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:44349150f6811b44b25574839b39ae35291f6496eb795b7366fef3bd3cf112d3"}, - {file = "coverage-7.6.9-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:d891c136b5b310d0e702e186d70cd16d1119ea8927347045124cb286b29297e5"}, - {file = "coverage-7.6.9-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:db1dab894cc139f67822a92910466531de5ea6034ddfd2b11c0d4c6257168073"}, - {file = "coverage-7.6.9-cp310-cp310-win32.whl", hash = "sha256:41ff7b0da5af71a51b53f501a3bac65fb0ec311ebed1632e58fc6107f03b9198"}, - {file = "coverage-7.6.9-cp310-cp310-win_amd64.whl", hash = "sha256:35371f8438028fdccfaf3570b31d98e8d9eda8bb1d6ab9473f5a390969e98717"}, - {file = "coverage-7.6.9-cp311-cp311-macosx_10_9_x86_64.whl", hash 
= "sha256:932fc826442132dde42ee52cf66d941f581c685a6313feebed358411238f60f9"}, - {file = "coverage-7.6.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:085161be5f3b30fd9b3e7b9a8c301f935c8313dcf928a07b116324abea2c1c2c"}, - {file = "coverage-7.6.9-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ccc660a77e1c2bf24ddbce969af9447a9474790160cfb23de6be4fa88e3951c7"}, - {file = "coverage-7.6.9-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c69e42c892c018cd3c8d90da61d845f50a8243062b19d228189b0224150018a9"}, - {file = "coverage-7.6.9-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0824a28ec542a0be22f60c6ac36d679e0e262e5353203bea81d44ee81fe9c6d4"}, - {file = "coverage-7.6.9-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:4401ae5fc52ad8d26d2a5d8a7428b0f0c72431683f8e63e42e70606374c311a1"}, - {file = "coverage-7.6.9-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:98caba4476a6c8d59ec1eb00c7dd862ba9beca34085642d46ed503cc2d440d4b"}, - {file = "coverage-7.6.9-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ee5defd1733fd6ec08b168bd4f5387d5b322f45ca9e0e6c817ea6c4cd36313e3"}, - {file = "coverage-7.6.9-cp311-cp311-win32.whl", hash = "sha256:f2d1ec60d6d256bdf298cb86b78dd715980828f50c46701abc3b0a2b3f8a0dc0"}, - {file = "coverage-7.6.9-cp311-cp311-win_amd64.whl", hash = "sha256:0d59fd927b1f04de57a2ba0137166d31c1a6dd9e764ad4af552912d70428c92b"}, - {file = "coverage-7.6.9-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:99e266ae0b5d15f1ca8d278a668df6f51cc4b854513daab5cae695ed7b721cf8"}, - {file = "coverage-7.6.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9901d36492009a0a9b94b20e52ebfc8453bf49bb2b27bca2c9706f8b4f5a554a"}, - {file = "coverage-7.6.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:abd3e72dd5b97e3af4246cdada7738ef0e608168de952b837b8dd7e90341f015"}, - {file = "coverage-7.6.9-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ff74026a461eb0660366fb01c650c1d00f833a086b336bdad7ab00cc952072b3"}, - {file = "coverage-7.6.9-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:65dad5a248823a4996724a88eb51d4b31587aa7aa428562dbe459c684e5787ae"}, - {file = "coverage-7.6.9-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:22be16571504c9ccea919fcedb459d5ab20d41172056206eb2994e2ff06118a4"}, - {file = "coverage-7.6.9-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:0f957943bc718b87144ecaee70762bc2bc3f1a7a53c7b861103546d3a403f0a6"}, - {file = "coverage-7.6.9-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:0ae1387db4aecb1f485fb70a6c0148c6cdaebb6038f1d40089b1fc84a5db556f"}, - {file = "coverage-7.6.9-cp312-cp312-win32.whl", hash = "sha256:1a330812d9cc7ac2182586f6d41b4d0fadf9be9049f350e0efb275c8ee8eb692"}, - {file = "coverage-7.6.9-cp312-cp312-win_amd64.whl", hash = "sha256:b12c6b18269ca471eedd41c1b6a1065b2f7827508edb9a7ed5555e9a56dcfc97"}, - {file = "coverage-7.6.9-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:899b8cd4781c400454f2f64f7776a5d87bbd7b3e7f7bda0cb18f857bb1334664"}, - {file = "coverage-7.6.9-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:61f70dc68bd36810972e55bbbe83674ea073dd1dcc121040a08cdf3416c5349c"}, - {file = "coverage-7.6.9-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:8a289d23d4c46f1a82d5db4abeb40b9b5be91731ee19a379d15790e53031c014"}, - {file = "coverage-7.6.9-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7e216d8044a356fc0337c7a2a0536d6de07888d7bcda76febcb8adc50bdbbd00"}, - {file = "coverage-7.6.9-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c026eb44f744acaa2bda7493dad903aa5bf5fc4f2554293a798d5606710055d"}, - {file = "coverage-7.6.9-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e77363e8425325384f9d49272c54045bbed2f478e9dd698dbc65dbc37860eb0a"}, - {file = "coverage-7.6.9-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:777abfab476cf83b5177b84d7486497e034eb9eaea0d746ce0c1268c71652077"}, - {file = "coverage-7.6.9-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:447af20e25fdbe16f26e84eb714ba21d98868705cb138252d28bc400381f6ffb"}, - {file = "coverage-7.6.9-cp313-cp313-win32.whl", hash = "sha256:d872ec5aeb086cbea771c573600d47944eea2dcba8be5f3ee649bfe3cb8dc9ba"}, - {file = "coverage-7.6.9-cp313-cp313-win_amd64.whl", hash = "sha256:fd1213c86e48dfdc5a0cc676551db467495a95a662d2396ecd58e719191446e1"}, - {file = "coverage-7.6.9-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:ba9e7484d286cd5a43744e5f47b0b3fb457865baf07bafc6bee91896364e1419"}, - {file = "coverage-7.6.9-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e5ea1cf0872ee455c03e5674b5bca5e3e68e159379c1af0903e89f5eba9ccc3a"}, - {file = "coverage-7.6.9-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2d10e07aa2b91835d6abec555ec8b2733347956991901eea6ffac295f83a30e4"}, - {file = "coverage-7.6.9-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:13a9e2d3ee855db3dd6ea1ba5203316a1b1fd8eaeffc37c5b54987e61e4194ae"}, - {file = "coverage-7.6.9-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c38bf15a40ccf5619fa2fe8f26106c7e8e080d7760aeccb3722664c8656b030"}, - {file = "coverage-7.6.9-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:d5275455b3e4627c8e7154feaf7ee0743c2e7af82f6e3b561967b1cca755a0be"}, - {file = "coverage-7.6.9-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:8f8770dfc6e2c6a2d4569f411015c8d751c980d17a14b0530da2d7f27ffdd88e"}, - {file = "coverage-7.6.9-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8d2dfa71665a29b153a9681edb1c8d9c1ea50dfc2375fb4dac99ea7e21a0bcd9"}, - {file = "coverage-7.6.9-cp313-cp313t-win32.whl", hash = "sha256:5e6b86b5847a016d0fbd31ffe1001b63355ed309651851295315031ea7eb5a9b"}, - {file = "coverage-7.6.9-cp313-cp313t-win_amd64.whl", hash = "sha256:97ddc94d46088304772d21b060041c97fc16bdda13c6c7f9d8fcd8d5ae0d8611"}, - {file = "coverage-7.6.9-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:adb697c0bd35100dc690de83154627fbab1f4f3c0386df266dded865fc50a902"}, - {file = "coverage-7.6.9-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:be57b6d56e49c2739cdf776839a92330e933dd5e5d929966fbbd380c77f060be"}, - {file = "coverage-7.6.9-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f1592791f8204ae9166de22ba7e6705fa4ebd02936c09436a1bb85aabca3e599"}, - {file = "coverage-7.6.9-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4e12ae8cc979cf83d258acb5e1f1cf2f3f83524d1564a49d20b8bec14b637f08"}, - {file = 
"coverage-7.6.9-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bb5555cff66c4d3d6213a296b360f9e1a8e323e74e0426b6c10ed7f4d021e464"}, - {file = "coverage-7.6.9-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:b9389a429e0e5142e69d5bf4a435dd688c14478a19bb901735cdf75e57b13845"}, - {file = "coverage-7.6.9-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:592ac539812e9b46046620341498caf09ca21023c41c893e1eb9dbda00a70cbf"}, - {file = "coverage-7.6.9-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:a27801adef24cc30871da98a105f77995e13a25a505a0161911f6aafbd66e678"}, - {file = "coverage-7.6.9-cp39-cp39-win32.whl", hash = "sha256:8e3c3e38930cfb729cb8137d7f055e5a473ddaf1217966aa6238c88bd9fd50e6"}, - {file = "coverage-7.6.9-cp39-cp39-win_amd64.whl", hash = "sha256:e28bf44afa2b187cc9f41749138a64435bf340adfcacb5b2290c070ce99839d4"}, - {file = "coverage-7.6.9-pp39.pp310-none-any.whl", hash = "sha256:f3ca78518bc6bc92828cd11867b121891d75cae4ea9e908d72030609b996db1b"}, - {file = "coverage-7.6.9.tar.gz", hash = "sha256:4a8d8977b0c6ef5aeadcb644da9e69ae0dcfe66ec7f368c89c72e058bd71164d"}, + {file = "coverage-7.6.10-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5c912978f7fbf47ef99cec50c4401340436d200d41d714c7a4766f377c5b7b78"}, + {file = "coverage-7.6.10-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a01ec4af7dfeb96ff0078ad9a48810bb0cc8abcb0115180c6013a6b26237626c"}, + {file = "coverage-7.6.10-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a3b204c11e2b2d883946fe1d97f89403aa1811df28ce0447439178cc7463448a"}, + {file = "coverage-7.6.10-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:32ee6d8491fcfc82652a37109f69dee9a830e9379166cb73c16d8dc5c2915165"}, + {file = "coverage-7.6.10-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:675cefc4c06e3b4c876b85bfb7c59c5e2218167bbd4da5075cbe3b5790a28988"}, + {file = "coverage-7.6.10-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f4f620668dbc6f5e909a0946a877310fb3d57aea8198bde792aae369ee1c23b5"}, + {file = "coverage-7.6.10-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:4eea95ef275de7abaef630c9b2c002ffbc01918b726a39f5a4353916ec72d2f3"}, + {file = "coverage-7.6.10-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e2f0280519e42b0a17550072861e0bc8a80a0870de260f9796157d3fca2733c5"}, + {file = "coverage-7.6.10-cp310-cp310-win32.whl", hash = "sha256:bc67deb76bc3717f22e765ab3e07ee9c7a5e26b9019ca19a3b063d9f4b874244"}, + {file = "coverage-7.6.10-cp310-cp310-win_amd64.whl", hash = "sha256:0f460286cb94036455e703c66988851d970fdfd8acc2a1122ab7f4f904e4029e"}, + {file = "coverage-7.6.10-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ea3c8f04b3e4af80e17bab607c386a830ffc2fb88a5484e1df756478cf70d1d3"}, + {file = "coverage-7.6.10-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:507a20fc863cae1d5720797761b42d2d87a04b3e5aeb682ef3b7332e90598f43"}, + {file = "coverage-7.6.10-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d37a84878285b903c0fe21ac8794c6dab58150e9359f1aaebbeddd6412d53132"}, + {file = "coverage-7.6.10-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a534738b47b0de1995f85f582d983d94031dffb48ab86c95bdf88dc62212142f"}, + {file = "coverage-7.6.10-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:0d7a2bf79378d8fb8afaa994f91bfd8215134f8631d27eba3e0e2c13546ce994"}, + {file = "coverage-7.6.10-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6713ba4b4ebc330f3def51df1d5d38fad60b66720948112f114968feb52d3f99"}, + {file = "coverage-7.6.10-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:ab32947f481f7e8c763fa2c92fd9f44eeb143e7610c4ca9ecd6a36adab4081bd"}, + {file = "coverage-7.6.10-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:7bbd8c8f1b115b892e34ba66a097b915d3871db7ce0e6b9901f462ff3a975377"}, + {file = "coverage-7.6.10-cp311-cp311-win32.whl", hash = "sha256:299e91b274c5c9cdb64cbdf1b3e4a8fe538a7a86acdd08fae52301b28ba297f8"}, + {file = "coverage-7.6.10-cp311-cp311-win_amd64.whl", hash = "sha256:489a01f94aa581dbd961f306e37d75d4ba16104bbfa2b0edb21d29b73be83609"}, + {file = "coverage-7.6.10-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:27c6e64726b307782fa5cbe531e7647aee385a29b2107cd87ba7c0105a5d3853"}, + {file = "coverage-7.6.10-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c56e097019e72c373bae32d946ecf9858fda841e48d82df7e81c63ac25554078"}, + {file = "coverage-7.6.10-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c7827a5bc7bdb197b9e066cdf650b2887597ad124dd99777332776f7b7c7d0d0"}, + {file = "coverage-7.6.10-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:204a8238afe787323a8b47d8be4df89772d5c1e4651b9ffa808552bdf20e1d50"}, + {file = "coverage-7.6.10-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e67926f51821b8e9deb6426ff3164870976fe414d033ad90ea75e7ed0c2e5022"}, + {file = "coverage-7.6.10-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e78b270eadb5702938c3dbe9367f878249b5ef9a2fcc5360ac7bff694310d17b"}, + {file = "coverage-7.6.10-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:714f942b9c15c3a7a5fe6876ce30af831c2ad4ce902410b7466b662358c852c0"}, + {file = "coverage-7.6.10-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:abb02e2f5a3187b2ac4cd46b8ced85a0858230b577ccb2c62c81482ca7d18852"}, + {file = "coverage-7.6.10-cp312-cp312-win32.whl", hash = "sha256:55b201b97286cf61f5e76063f9e2a1d8d2972fc2fcfd2c1272530172fd28c359"}, + {file = "coverage-7.6.10-cp312-cp312-win_amd64.whl", hash = "sha256:e4ae5ac5e0d1e4edfc9b4b57b4cbecd5bc266a6915c500f358817a8496739247"}, + {file = "coverage-7.6.10-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:05fca8ba6a87aabdd2d30d0b6c838b50510b56cdcfc604d40760dae7153b73d9"}, + {file = "coverage-7.6.10-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:9e80eba8801c386f72e0712a0453431259c45c3249f0009aff537a517b52942b"}, + {file = "coverage-7.6.10-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a372c89c939d57abe09e08c0578c1d212e7a678135d53aa16eec4430adc5e690"}, + {file = "coverage-7.6.10-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ec22b5e7fe7a0fa8509181c4aac1db48f3dd4d3a566131b313d1efc102892c18"}, + {file = "coverage-7.6.10-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26bcf5c4df41cad1b19c84af71c22cbc9ea9a547fc973f1f2cc9a290002c8b3c"}, + {file = "coverage-7.6.10-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4e4630c26b6084c9b3cb53b15bd488f30ceb50b73c35c5ad7871b869cb7365fd"}, + {file = "coverage-7.6.10-cp313-cp313-musllinux_1_2_i686.whl", hash = 
"sha256:2396e8116db77789f819d2bc8a7e200232b7a282c66e0ae2d2cd84581a89757e"}, + {file = "coverage-7.6.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:79109c70cc0882e4d2d002fe69a24aa504dec0cc17169b3c7f41a1d341a73694"}, + {file = "coverage-7.6.10-cp313-cp313-win32.whl", hash = "sha256:9e1747bab246d6ff2c4f28b4d186b205adced9f7bd9dc362051cc37c4a0c7bd6"}, + {file = "coverage-7.6.10-cp313-cp313-win_amd64.whl", hash = "sha256:254f1a3b1eef5f7ed23ef265eaa89c65c8c5b6b257327c149db1ca9d4a35f25e"}, + {file = "coverage-7.6.10-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:2ccf240eb719789cedbb9fd1338055de2761088202a9a0b73032857e53f612fe"}, + {file = "coverage-7.6.10-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:0c807ca74d5a5e64427c8805de15b9ca140bba13572d6d74e262f46f50b13273"}, + {file = "coverage-7.6.10-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bcfa46d7709b5a7ffe089075799b902020b62e7ee56ebaed2f4bdac04c508d8"}, + {file = "coverage-7.6.10-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4e0de1e902669dccbf80b0415fb6b43d27edca2fbd48c74da378923b05316098"}, + {file = "coverage-7.6.10-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3f7b444c42bbc533aaae6b5a2166fd1a797cdb5eb58ee51a92bee1eb94a1e1cb"}, + {file = "coverage-7.6.10-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:b330368cb99ef72fcd2dc3ed260adf67b31499584dc8a20225e85bfe6f6cfed0"}, + {file = "coverage-7.6.10-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:9a7cfb50515f87f7ed30bc882f68812fd98bc2852957df69f3003d22a2aa0abf"}, + {file = "coverage-7.6.10-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:6f93531882a5f68c28090f901b1d135de61b56331bba82028489bc51bdd818d2"}, + {file = "coverage-7.6.10-cp313-cp313t-win32.whl", hash = "sha256:89d76815a26197c858f53c7f6a656686ec392b25991f9e409bcef020cd532312"}, + {file = "coverage-7.6.10-cp313-cp313t-win_amd64.whl", hash = "sha256:54a5f0f43950a36312155dae55c505a76cd7f2b12d26abeebbe7a0b36dbc868d"}, + {file = "coverage-7.6.10-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:656c82b8a0ead8bba147de9a89bda95064874c91a3ed43a00e687f23cc19d53a"}, + {file = "coverage-7.6.10-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ccc2b70a7ed475c68ceb548bf69cec1e27305c1c2606a5eb7c3afff56a1b3b27"}, + {file = "coverage-7.6.10-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5e37dc41d57ceba70956fa2fc5b63c26dba863c946ace9705f8eca99daecdc4"}, + {file = "coverage-7.6.10-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0aa9692b4fdd83a4647eeb7db46410ea1322b5ed94cd1715ef09d1d5922ba87f"}, + {file = "coverage-7.6.10-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa744da1820678b475e4ba3dfd994c321c5b13381d1041fe9c608620e6676e25"}, + {file = "coverage-7.6.10-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:c0b1818063dc9e9d838c09e3a473c1422f517889436dd980f5d721899e66f315"}, + {file = "coverage-7.6.10-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:59af35558ba08b758aec4d56182b222976330ef8d2feacbb93964f576a7e7a90"}, + {file = "coverage-7.6.10-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7ed2f37cfce1ce101e6dffdfd1c99e729dd2ffc291d02d3e2d0af8b53d13840d"}, + {file = "coverage-7.6.10-cp39-cp39-win32.whl", hash = "sha256:4bcc276261505d82f0ad426870c3b12cb177752834a633e737ec5ee79bbdff18"}, + {file = 
"coverage-7.6.10-cp39-cp39-win_amd64.whl", hash = "sha256:457574f4599d2b00f7f637a0700a6422243b3565509457b2dbd3f50703e11f59"}, + {file = "coverage-7.6.10-pp39.pp310-none-any.whl", hash = "sha256:fd34e7b3405f0cc7ab03d54a334c17a9e802897580d964bd8c2001f4b9fd488f"}, + {file = "coverage-7.6.10.tar.gz", hash = "sha256:7fb105327c8f8f0682e29843e2ff96af9dcbe5bab8eeb4b398c6a33a16d80a23"}, ] [package.dependencies] @@ -1024,13 +1011,13 @@ xmod = "*" [[package]] name = "eval-type-backport" -version = "0.2.0" +version = "0.2.2" description = "Like `typing._eval_type`, but lets older Python versions use newer typing features." optional = false python-versions = ">=3.8" files = [ - {file = "eval_type_backport-0.2.0-py3-none-any.whl", hash = "sha256:ac2f73d30d40c5a30a80b8739a789d6bb5e49fdffa66d7912667e2015d9c9933"}, - {file = "eval_type_backport-0.2.0.tar.gz", hash = "sha256:68796cfbc7371ebf923f03bdf7bef415f3ec098aeced24e054b253a0e78f7b37"}, + {file = "eval_type_backport-0.2.2-py3-none-any.whl", hash = "sha256:cb6ad7c393517f476f96d456d0412ea80f0a8cf96f6892834cd9340149111b0a"}, + {file = "eval_type_backport-0.2.2.tar.gz", hash = "sha256:f0576b4cf01ebb5bd358d02314d31846af5e07678387486e2c798af0e7d849c1"}, ] [package.extras] @@ -1300,13 +1287,13 @@ files = [ [[package]] name = "identify" -version = "2.6.3" +version = "2.6.4" description = "File identification library for Python" optional = false python-versions = ">=3.9" files = [ - {file = "identify-2.6.3-py2.py3-none-any.whl", hash = "sha256:9edba65473324c2ea9684b1f944fe3191db3345e50b6d04571d10ed164f8d7bd"}, - {file = "identify-2.6.3.tar.gz", hash = "sha256:62f5dae9b5fef52c84cc188514e9ea4f3f636b1d8799ab5ebc475471f9e47a02"}, + {file = "identify-2.6.4-py2.py3-none-any.whl", hash = "sha256:993b0f01b97e0568c179bb9196391ff391bfb88a99099dbf5ce392b68f42d0af"}, + {file = "identify-2.6.4.tar.gz", hash = "sha256:285a7d27e397652e8cafe537a6cc97dd470a970f48fb2e9d979aa38eae5513ac"}, ] [package.extras] @@ -1450,13 +1437,13 @@ testing = ["Django", "attrs", "colorama", "docopt", "pytest (<9.0.0)"] [[package]] name = "jinja2" -version = "3.1.4" +version = "3.1.5" description = "A very fast and expressive template engine." optional = false python-versions = ">=3.7" files = [ - {file = "jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d"}, - {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"}, + {file = "jinja2-3.1.5-py3-none-any.whl", hash = "sha256:aba0f4dc9ed8013c424088f68a5c226f7d6097ed89b246d7749c2ec4175c6adb"}, + {file = "jinja2-3.1.5.tar.gz", hash = "sha256:8fefff8dc3034e27bb80d67c671eb8a9bc424c0ef4c0826edbff304cceff43bb"}, ] [package.dependencies] @@ -3240,13 +3227,13 @@ files = [ [[package]] name = "urllib3" -version = "2.2.3" +version = "2.3.0" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "urllib3-2.2.3-py3-none-any.whl", hash = "sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac"}, - {file = "urllib3-2.2.3.tar.gz", hash = "sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9"}, + {file = "urllib3-2.3.0-py3-none-any.whl", hash = "sha256:1cee9ad369867bfdbbb48b7dd50374c0967a0bb7710050facf0dd6911440e3df"}, + {file = "urllib3-2.3.0.tar.gz", hash = "sha256:f8c5449b3cf0861679ce7e0503c7b44b5ec981bec0d1d3795a07f1ba96f0204d"}, ] [package.extras] diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 8c1f15f0cb..74021163b7 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -1,52 +1,77 @@ -from dataclasses import dataclass -from typing import Callable, Awaitable - -from bs4 import BeautifulSoup -from playwright.async_api import Page, Response - -from crawlee._types import EnqueueLinksFunction -from crawlee._utils.docs import docs_group -from crawlee.crawlers import BeautifulSoupCrawler, PlaywrightCrawler, ContextPipeline, AbstractHttpCrawler, \ - ParsedHttpCrawlingContext, PlaywrightCrawlingContext -from crawlee.crawlers._beautifulsoup._beautifulsoup_parser import BeautifulSoupParser - - -@dataclass(frozen=True) -@docs_group('Data structures') -class AdaptiveCrawlingContext(ParsedHttpCrawlingContext[BeautifulSoup], PlaywrightCrawlingContext): - _page: Page - _response: Response - _enqueue_links: EnqueueLinksFunction - _infinite_scroll: Callable[[], Awaitable[None]] - - @property - def page(self) -> Page: - """The Playwright `Page` object for the current page.""" - return self._page - - @property - def response(self) -> Response: - """The Playwright `Response` object containing the response details for the current URL.""" - return self._response - - @property - def enqueue_links(self) -> EnqueueLinksFunction: - """The Playwright `EnqueueLinksFunction` implementation.""" - return self._enqueue_links - - @property - def infinite_scroll(self) -> Callable[[], Awaitable[None]]: - """A function to perform infinite scrolling on the page. 
This scrolls to the bottom, triggering - the loading of additional content if present.""" - return self._infinite_scroll - -class AdaptivePlaywrightCrawler(AbstractHttpCrawler[AdaptiveCrawlingContext, BeautifulSoup], PlaywrightCrawler): - - def __init__(self): +from __future__ import annotations + +import asyncio +from collections.abc import Sequence + + +from crawlee import Request +from crawlee._types import BasicCrawlingContext, RequestHandlerRunResult + +from crawlee.crawlers import BeautifulSoupCrawler, BasicCrawler, BeautifulSoupCrawlingContext, PlaywrightCrawler, \ + PlaywrightCrawlingContext +from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import \ + AdaptivePlaywrightCrawlingContext +from crawlee.statistics import FinalStatistics + + +class AdaptivePlaywrightCrawler(BasicCrawler[AdaptivePlaywrightCrawlingContext]): + + """ + def __init__(self) -> None: context_pipeline = ContextPipeline().compose(self._open_page).compose(self._navigate).compose( self._handle_blocked_request) super().__init__(parser=BeautifulSoupParser(), _context_pipeline=context_pipeline) self._context_pipeline = ContextPipeline().compose(self._open_page).compose(self._navigate) + """ + def __init__(self, max_requests_per_crawl: int) -> None: + self.beautifulsoup_crawler = BeautifulSoupCrawler() + self.playwright_crawler = PlaywrightCrawler() + + @self.beautifulsoup_crawler.router.default_handler + async def request_handler_beautiful_soup(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing with BS: {context.request.url} ...') + adaptive_crawling_context = AdaptivePlaywrightCrawlingContext.from_beautifulsoup_crawling_context(context) + await self.router(adaptive_crawling_context) + + @self.playwright_crawler.router.default_handler + async def request_handler_playwright(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing with PW: {context.request.url} ...') + adaptive_crawling_context = await AdaptivePlaywrightCrawlingContext.from_playwright_crawling_context(context) + await self.router(adaptive_crawling_context) + + super().__init__(max_requests_per_crawl=max_requests_per_crawl) + + async def run( + self, + requests: Sequence[str | Request] | None = None, + *, + purge_request_queue: bool = True, + ) -> FinalStatistics: + async with (self.beautifulsoup_crawler.statistics, self.playwright_crawler.statistics, + self.playwright_crawler._additional_context_managers[0]): # TODO: Create something more robust that does not leak implementation so much + top_crawler_statistics = await super().run(requests=requests, purge_request_queue=purge_request_queue) + return top_crawler_statistics + + # Can't use override as mypy does not like it for double underscore pribvate method. 
+ async def _BasicCrawler__run_request_handler(self, context: BasicCrawlingContext) -> None: + + result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store) + + await self.beautifulsoup_crawler._crawl_one(context = context, + request_handler_timeout=self._request_handler_timeout, + result= result) + await self.playwright_crawler._crawl_one(context=context, + request_handler_timeout=self._request_handler_timeout, + result=result) + await self.commit_result(result = result, context = context) + + async def commit_result(self, result: RequestHandlerRunResult, context: BasicCrawlingContext) -> None: + result_tasks = [] + result_tasks.extend([ + asyncio.create_task(context.push_data(**kwargs)) for kwargs in result.push_data_calls]) + result_tasks.extend([ + asyncio.create_task(context.add_requests(**kwargs)) for kwargs in result.add_requests_calls]) + await asyncio.gather(*result_tasks) + - def _decide_crawler_type(self): diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py new file mode 100644 index 0000000000..0c73bb2da6 --- /dev/null +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py @@ -0,0 +1,81 @@ +from __future__ import annotations +from collections.abc import Awaitable, Callable +from dataclasses import dataclass, fields +from typing_extensions import Self + +from bs4 import BeautifulSoup +from playwright.async_api import Response, Page + +from crawlee import HttpHeaders +from crawlee._utils.docs import docs_group +from crawlee.crawlers import BeautifulSoupCrawlingContext, PlaywrightCrawlingContext, BeautifulSoupParserType + + +@dataclass(frozen=True) +@docs_group('Data structures') +class AdaptivePlaywrightCrawlingContext(BeautifulSoupCrawlingContext): + _response: Response | None = None + _infinite_scroll: Callable[[], Awaitable[None]] | None = None + _page : Page | None = None + # TODO: UseStateFunction + + @property + def page(self) -> Page: + if not self._page: + raise RuntimeError('Page was not crawled with PlaywrightCrawler') + return self._page + + @property + def infinite_scroll(self) -> Callable[[], Awaitable[None]]: + if not self._infinite_scroll: + raise RuntimeError('Page was not crawled with PlaywrightCrawler') + return self._infinite_scroll + + @property + def response(self) -> Response: + if not self._response: + raise RuntimeError('Page was not crawled with PlaywrightCrawler') + return self._response + + @classmethod + def from_beautifulsoup_crawling_context(cls, context: BeautifulSoupCrawlingContext) -> Self: + """Convenience constructor that creates new context from existing `BeautifulSoupCrawlingContext`.""" + return cls(**{field.name: getattr(context, field.name) for field in fields(context)}) + + @classmethod + async def from_playwright_crawling_context(cls, context: PlaywrightCrawlingContext, + beautiful_soup_parser_type: BeautifulSoupParserType | None) -> Self: + """Convenience constructor that creates new context from existing `PlaywrightCrawlingContext`.""" + context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)} + # Remove playwright specific attributes and pass them as private instead to be available as property. 
+ context_kwargs['_response'] = context_kwargs.pop('response') + context_kwargs['_page'] = context_kwargs.pop('page') + context_kwargs['_infinite_scroll'] = context_kwargs.pop('infinite_scroll') + http_response = await _HttpResponse.from_playwright_response(context.response) + return cls(parsed_content= BeautifulSoup(http_response.read(), features=beautiful_soup_parser_type), + http_response = http_response, + **context_kwargs) + + +@dataclass(frozen=True) +class _HttpResponse: + http_version : str + status_code : int + headers: HttpHeaders + _content: bytes + + def read(self) -> bytes: + return self._content + + @classmethod + async def from_playwright_response(cls, response: Response) -> Self: + headers = HttpHeaders(response.headers) + status_code = response.status + # Can't find this anywhere in PlayWright, but some headers can include information about protocol. + # In firefox for example: 'x-firefox-spdy' + # Might be also obtained by executing JS code in browser: performance.getEntries()[0].nextHopProtocol + # Response headers capitalization not respecting http1.1 Pascal case. Always lower case in PlayWright. + http_version = 'TODO' + _content = await response.body() + + return cls(http_version=http_version, status_code=status_code, headers=headers, _content=_content) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_example.py b/src/crawlee/crawlers/_adaptive_playwright/_example.py index 2eeccd4d5f..d43a86dcc2 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_example.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_example.py @@ -2,39 +2,18 @@ from crawlee.crawlers import PlaywrightCrawlingContext from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler import AdaptivePlaywrightCrawler +from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import \ + AdaptivePlaywrightCrawlingContext - - -async def main(): - crawler = AdaptivePlaywrightCrawler() +async def main() ->None: + crawler = AdaptivePlaywrightCrawler(max_requests_per_crawl=2) @crawler.router.default_handler - async def request_handler(context: PlaywrightCrawlingContext) -> None: - context.log.info(f'Processing {context.request.url} ...') - - # Extract data from the page using Playwright's API. - posts = await context.page.query_selector_all('.athing') - data = [] - - for post in posts: - # Get the HTML elements for the title and rank within each post. - title_element = await post.query_selector('.title a') - rank_element = await post.query_selector('.rank') - - # Extract the data we want from the elements. - title = await title_element.inner_text() if title_element else None - rank = await rank_element.inner_text() if rank_element else None - href = await title_element.get_attribute('href') if title_element else None - - data.append({'title': title, 'rank': rank, 'href': href}) - - # Push the extracted data to the default dataset. In local configuration, - # the data will be stored as JSON files in ./storage/datasets/default. - await context.push_data(data) - - # Find a link to the next page and enqueue it if it exists. - await context.enqueue_links(selector='.morelink') + async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: + context.log.info(f'Processing with Top adaptive_crawler: {context.request.url} ...') + await context.enqueue_links() + await context.push_data({'Top crwaler Url': context.request.url}) # Run the crawler with the initial list of URLs. 
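The `_basic_crawler.py` hunk below introduces `_crawl_one`, which runs the request handler against a `RequestHandlerRunResult` so that `push_data` and `add_requests` calls are buffered rather than applied immediately, and the adaptive crawler then commits them through `commit_result`. A minimal standalone sketch of that buffer-then-commit idea follows; the names `BufferedResult`, `handler` and `commit` are illustrative and not part of the patch.

from __future__ import annotations

import asyncio
from dataclasses import dataclass, field
from typing import Any


@dataclass
class BufferedResult:
    # Collects a handler's side effects instead of applying them right away.
    push_data_calls: list[dict[str, Any]] = field(default_factory=list)
    add_requests_calls: list[list[str]] = field(default_factory=list)

    async def push_data(self, data: dict[str, Any]) -> None:
        self.push_data_calls.append(data)

    async def add_requests(self, urls: list[str]) -> None:
        self.add_requests_calls.append(urls)


async def handler(result: BufferedResult) -> None:
    # The handler only writes into the buffer, never into storage directly.
    await result.push_data({'url': 'https://example.com', 'title': 'Example'})
    await result.add_requests(['https://example.com/next'])


async def commit(result: BufferedResult) -> None:
    # Only a result the crawler decides to keep gets applied; printing stands in for storage here.
    for data in result.push_data_calls:
        print('push_data:', data)
    for urls in result.add_requests_calls:
        print('add_requests:', urls)


async def main() -> None:
    result = BufferedResult()
    await handler(result)
    await commit(result)


asyncio.run(main())

Deferring the side effects this way is what makes it possible to run the same request through more than one sub-crawler and commit only the outcome that is actually wanted.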
diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index d659f84f18..9b61725849 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -1104,6 +1104,31 @@ async def __run_task_function(self) -> None: async def __run_request_handler(self, context: BasicCrawlingContext) -> None: await self._context_pipeline(context, self.router) + + async def _crawl_one(self,*, context: BasicCrawlingContext, request_handler_timeout: timedelta, + result: RequestHandlerRunResult) -> RequestHandlerRunResult: + """Populate result by crawling one request from input context. Use result to re-route context callbacks to it.""" + result_specific_context = BasicCrawlingContext( + request=context.request, + session=context.session, + proxy_info=context.proxy_info, + send_request=context.send_request, + add_requests=result.add_requests, + push_data=result.push_data, + get_key_value_store=result.get_key_value_store, + use_state=self._use_state, # Not sure about this one. TODO: Learn what it is used for + log=self._logger, # Not sure, maybe take from new context? + ) + + await wait_for( + lambda: self.__run_request_handler(result_specific_context), + timeout=request_handler_timeout, + timeout_message='Request handler timed out after ' + f'{self._request_handler_timeout.total_seconds()} seconds', + logger=self._logger, + ) + return result + def _is_session_blocked_status_code(self, session: Session | None, status_code: int) -> bool: """Check if the HTTP status code indicates that the session was blocked by the target website. From 623d341f969dd0b46d81e68b7ae9d86ae7e63211 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 31 Dec 2024 19:13:48 +0100 Subject: [PATCH 03/79] Pass properly all kwargs to subcrawlers --- .../_abstract_http/_abstract_http_crawler.py | 17 +-- .../crawlers/_adaptive_playwright/__init__.py | 0 .../_adaptive_playwright_crawler.py | 114 +++++++++++++----- .../_adaptive_playwright_crawling_context.py | 13 +- .../crawlers/_adaptive_playwright/_example.py | 6 +- src/crawlee/crawlers/_basic/_basic_crawler.py | 39 +++--- 6 files changed, 133 insertions(+), 56 deletions(-) create mode 100644 src/crawlee/crawlers/_adaptive_playwright/__init__.py diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py index 3b69ed19c2..ecf378dd73 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py @@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, Any, Callable, Generic from pydantic import ValidationError -from typing_extensions import NotRequired, TypeVar +from typing_extensions import NotRequired, TypedDict, TypeVar from crawlee import EnqueueStrategy from crawlee._request import BaseRequestData @@ -30,18 +30,21 @@ @docs_group('Data structures') -class HttpCrawlerOptions(Generic[TCrawlingContext], BasicCrawlerOptions[TCrawlingContext]): - """Arguments for the `AbstractHttpCrawler` constructor. - - It is intended for typing forwarded `__init__` arguments in the subclasses. 
- """ - +class _HttpCrawlerOptions(Generic[TCrawlingContext], TypedDict): additional_http_error_status_codes: NotRequired[Iterable[int]] """Additional HTTP status codes to treat as errors, triggering automatic retries when encountered.""" ignore_http_error_status_codes: NotRequired[Iterable[int]] """HTTP status codes typically considered errors but to be treated as successful responses.""" +@docs_group('Data structures') +class HttpCrawlerOptions(Generic[TCrawlingContext],_HttpCrawlerOptions, BasicCrawlerOptions[TCrawlingContext]): + """Arguments for the `AbstractHttpCrawler` constructor. + + It is intended for typing forwarded `__init__` arguments in the subclasses. + """ + + @docs_group('Abstract classes') class AbstractHttpCrawler(Generic[TCrawlingContext, TParseResult], BasicCrawler[TCrawlingContext], ABC): diff --git a/src/crawlee/crawlers/_adaptive_playwright/__init__.py b/src/crawlee/crawlers/_adaptive_playwright/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 74021163b7..224e526ec2 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -1,31 +1,91 @@ from __future__ import annotations import asyncio -from collections.abc import Sequence +from typing import TYPE_CHECKING, Any +from IPython.core.completer import TypedDict -from crawlee import Request from crawlee._types import BasicCrawlingContext, RequestHandlerRunResult +from crawlee._utils.docs import docs_group +from crawlee.crawlers import ( + BasicCrawler, + BeautifulSoupCrawler, + BeautifulSoupCrawlingContext, + BeautifulSoupParserType, + ContextPipeline, + PlaywrightCrawler, + PlaywrightCrawlingContext, +) +from crawlee.crawlers._abstract_http._abstract_http_crawler import _HttpCrawlerOptions +from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import ( + AdaptivePlaywrightCrawlingContext, +) -from crawlee.crawlers import BeautifulSoupCrawler, BasicCrawler, BeautifulSoupCrawlingContext, PlaywrightCrawler, \ - PlaywrightCrawlingContext -from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import \ - AdaptivePlaywrightCrawlingContext -from crawlee.statistics import FinalStatistics +if TYPE_CHECKING: + from collections.abc import Awaitable, Callable, Mapping, Sequence + + from typing_extensions import NotRequired, Unpack + + from crawlee import Request + from crawlee.browsers import BrowserPool + from crawlee.browsers._types import BrowserType + from crawlee.crawlers._basic._basic_crawler import _BasicCrawlerOptions + from crawlee.statistics import FinalStatistics + + +@docs_group('Data structures') +class _BeautifulsoupCrawlerAdditionalOptions(_HttpCrawlerOptions): + """Additional options that can be specified for BeautifulsoupCrawler.""" + + parser: NotRequired[BeautifulSoupParserType] + """Parser type used by BeautifulSoup.""" + + +@docs_group('Data structures') +class _PlaywrightCrawlerAdditionalOptions(TypedDict): + """Additional options that can be specified for PlaywrightCrawler.""" + + browser_pool: NotRequired[BrowserPool] + """A `BrowserPool` instance to be used for launching the browsers and getting pages.""" + + browser_type: NotRequired[BrowserType] + """The type of browser to launch ('chromium', 'firefox', or 'webkit'). 
+ This option should not be used if `browser_pool` is provided.""" + + + browser_launch_options: NotRequired[Mapping[str, Any]] + """Keyword arguments to pass to the browser launch method. These options are provided + directly to Playwright's `browser_type.launch` method. For more details, refer to the Playwright + documentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch. + This option should not be used if `browser_pool` is provided.""" + + browser_new_context_options: NotRequired[Mapping[str, Any]] + """Keyword arguments to pass to the browser new context method. These options + are provided directly to Playwright's `browser.new_context` method. For more details, refer to the + Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context. + This option should not be used if `browser_pool` is provided.""" + + headless: NotRequired[bool] + """Whether to run the browser in headless mode. + This option should not be used if `browser_pool` is provided.""" class AdaptivePlaywrightCrawler(BasicCrawler[AdaptivePlaywrightCrawlingContext]): + """Adaptive crawler that uses both BeautifulSoup crawler and PlaywrightCrawler.""" + + def __init__(self, + beautifulsoup_crawler_kwargs: _BeautifulsoupCrawlerAdditionalOptions | None = None, + playwright_crawler_args: _PlaywrightCrawlerAdditionalOptions | None = None, + request_handler: Callable[[AdaptivePlaywrightCrawlingContext], Awaitable[None]] | None = None, + _context_pipeline: ContextPipeline[AdaptivePlaywrightCrawlingContext] | None = None, + **kwargs: Unpack[_BasicCrawlerOptions]) -> None: - """ - def __init__(self) -> None: - context_pipeline = ContextPipeline().compose(self._open_page).compose(self._navigate).compose( - self._handle_blocked_request) - super().__init__(parser=BeautifulSoupParser(), _context_pipeline=context_pipeline) - self._context_pipeline = ContextPipeline().compose(self._open_page).compose(self._navigate) - """ - def __init__(self, max_requests_per_crawl: int) -> None: - self.beautifulsoup_crawler = BeautifulSoupCrawler() - self.playwright_crawler = PlaywrightCrawler() + beautifulsoup_crawler_kwargs = beautifulsoup_crawler_kwargs or {} + beautifulsoup_crawler_kwargs.setdefault('parser', 'lxml') + playwright_crawler_args = playwright_crawler_args or {} + + self.beautifulsoup_crawler = BeautifulSoupCrawler(**beautifulsoup_crawler_kwargs, **kwargs) + self.playwright_crawler = PlaywrightCrawler(**playwright_crawler_args, **kwargs) @self.beautifulsoup_crawler.router.default_handler async def request_handler_beautiful_soup(context: BeautifulSoupCrawlingContext) -> None: @@ -36,10 +96,11 @@ async def request_handler_beautiful_soup(context: BeautifulSoupCrawlingContext) @self.playwright_crawler.router.default_handler async def request_handler_playwright(context: PlaywrightCrawlingContext) -> None: context.log.info(f'Processing with PW: {context.request.url} ...') - adaptive_crawling_context = await AdaptivePlaywrightCrawlingContext.from_playwright_crawling_context(context) + adaptive_crawling_context = await AdaptivePlaywrightCrawlingContext.from_playwright_crawling_context( + context=context, beautiful_soup_parser_type=beautifulsoup_crawler_kwargs['parser']) await self.router(adaptive_crawling_context) - super().__init__(max_requests_per_crawl=max_requests_per_crawl) + super().__init__(request_handler=request_handler, _context_pipeline=_context_pipeline, **kwargs) async def run( self, @@ -49,20 +110,19 @@ async def run( ) -> FinalStatistics: async with 
(self.beautifulsoup_crawler.statistics, self.playwright_crawler.statistics, self.playwright_crawler._additional_context_managers[0]): # TODO: Create something more robust that does not leak implementation so much - top_crawler_statistics = await super().run(requests=requests, purge_request_queue=purge_request_queue) - return top_crawler_statistics + return await super().run(requests=requests, purge_request_queue=purge_request_queue) - # Can't use override as mypy does not like it for double underscore pribvate method. - async def _BasicCrawler__run_request_handler(self, context: BasicCrawlingContext) -> None: + # Can't use override as mypy does not like it for double underscore private method. + async def _BasicCrawler__run_request_handler(self, context: BasicCrawlingContext) -> None: # noqa: N802 result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store) - await self.beautifulsoup_crawler._crawl_one(context = context, + await self.beautifulsoup_crawler.crawl_one(context = context, request_handler_timeout=self._request_handler_timeout, result= result) - await self.playwright_crawler._crawl_one(context=context, - request_handler_timeout=self._request_handler_timeout, - result=result) + await self.playwright_crawler.crawl_one(context=context, + request_handler_timeout=self._request_handler_timeout, + result=result) await self.commit_result(result = result, context = context) async def commit_result(self, result: RequestHandlerRunResult, context: BasicCrawlingContext) -> None: diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py index 0c73bb2da6..72c2057bec 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py @@ -1,14 +1,19 @@ from __future__ import annotations -from collections.abc import Awaitable, Callable + from dataclasses import dataclass, fields -from typing_extensions import Self +from typing import TYPE_CHECKING from bs4 import BeautifulSoup -from playwright.async_api import Response, Page from crawlee import HttpHeaders from crawlee._utils.docs import docs_group -from crawlee.crawlers import BeautifulSoupCrawlingContext, PlaywrightCrawlingContext, BeautifulSoupParserType +from crawlee.crawlers import BeautifulSoupCrawlingContext, BeautifulSoupParserType, PlaywrightCrawlingContext + +if TYPE_CHECKING: + from collections.abc import Awaitable, Callable + + from playwright.async_api import Page, Response + from typing_extensions import Self @dataclass(frozen=True) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_example.py b/src/crawlee/crawlers/_adaptive_playwright/_example.py index d43a86dcc2..75d92d855e 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_example.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_example.py @@ -1,9 +1,9 @@ import asyncio -from crawlee.crawlers import PlaywrightCrawlingContext from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler import AdaptivePlaywrightCrawler -from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import \ - AdaptivePlaywrightCrawlingContext +from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import ( + AdaptivePlaywrightCrawlingContext, +) async def main() ->None: diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py 
index 9b61725849..faab29b570 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -65,11 +65,8 @@ @docs_group('Data structures') -class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]): - """Arguments for the `BasicCrawler` constructor. - - It is intended for typing forwarded `__init__` arguments in the subclasses. - """ +class _BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]): + """Non-generic options for basic crawler.""" configuration: NotRequired[Configuration] """The configuration object. Some of its properties are used as defaults for the crawler.""" @@ -92,9 +89,6 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]): http_client: NotRequired[BaseHttpClient] """HTTP client used by `BasicCrawlingContext.send_request` method.""" - request_handler: NotRequired[Callable[[TCrawlingContext], Awaitable[None]]] - """A callable responsible for handling requests.""" - max_request_retries: NotRequired[int] """Maximum number of attempts to process a single request.""" @@ -135,10 +129,6 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]): configure_logging: NotRequired[bool] """If True, the crawler will set up logging infrastructure automatically.""" - _context_pipeline: NotRequired[ContextPipeline[TCrawlingContext]] - """Enables extending the request lifecycle and modifying the crawling context. Intended for use by - subclasses rather than direct instantiation of `BasicCrawler`.""" - _additional_context_managers: NotRequired[Sequence[AbstractAsyncContextManager]] """Additional context managers used throughout the crawler lifecycle. Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.""" @@ -147,6 +137,25 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]): """A logger instance, typically provided by a subclass, for consistent logging labels. Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.""" +@docs_group('Data structures') +class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext], TypedDict): + """Generic options for basic crawler.""" + request_handler: NotRequired[Callable[[TCrawlingContext], Awaitable[None]]] + """A callable responsible for handling requests.""" + + _context_pipeline: NotRequired[ContextPipeline[TCrawlingContext]] + """Enables extending the request lifecycle and modifying the crawling context. Intended for use by + subclasses rather than direct instantiation of `BasicCrawler`.""" + +@docs_group('Data structures') +class BasicCrawlerOptions(Generic[TCrawlingContext], + _BasicCrawlerOptions , + _BasicCrawlerOptionsGeneric[TCrawlingContext]): + """Arguments for the `BasicCrawler` constructor. + + It is intended for typing forwarded `__init__` arguments in the subclasses. + """ + @docs_group('Classes') class BasicCrawler(Generic[TCrawlingContext]): @@ -1105,9 +1114,9 @@ async def __run_request_handler(self, context: BasicCrawlingContext) -> None: await self._context_pipeline(context, self.router) - async def _crawl_one(self,*, context: BasicCrawlingContext, request_handler_timeout: timedelta, - result: RequestHandlerRunResult) -> RequestHandlerRunResult: - """Populate result by crawling one request from input context. Use result to re-route context callbacks to it.""" + async def crawl_one(self, *, context: BasicCrawlingContext, request_handler_timeout: timedelta, + result: RequestHandlerRunResult) -> RequestHandlerRunResult: + """Populate result by crawling one request from input context. 
Route context callbacks to result.""" result_specific_context = BasicCrawlingContext( request=context.request, session=context.session, From 548349efa7e933068f6f3c39c0390d2261a1d51b Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 1 Jan 2025 11:38:51 +0100 Subject: [PATCH 04/79] Add run decision logic from JS version Add statistics TODO: Make mypy happy about statistics. Wrap existing statistics from init in adaptive statistics. Silent subcrawler statistics and loggers in general. (Set level to error?) --- .../_adaptive_playwright_crawler.py | 84 ++++++++++++++++--- ..._adaptive_playwright_crawler_statistics.py | 45 ++++++++++ .../_adaptive_playwright_crawling_context.py | 9 +- .../crawlers/_adaptive_playwright/_example.py | 4 +- .../_rendering_type_predictor.py | 39 +++++++++ .../_result_comparator.py | 29 +++++++ 6 files changed, 194 insertions(+), 16 deletions(-) create mode 100644 src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py create mode 100644 src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py create mode 100644 src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 224e526ec2..d384fad220 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -1,6 +1,7 @@ from __future__ import annotations import asyncio +from random import random from typing import TYPE_CHECKING, Any from IPython.core.completer import TypedDict @@ -17,9 +18,18 @@ PlaywrightCrawlingContext, ) from crawlee.crawlers._abstract_http._abstract_http_crawler import _HttpCrawlerOptions +from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler_statistics import ( + AdaptivePlaywrightCrawlerStatistics, +) from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import ( AdaptivePlaywrightCrawlingContext, ) +from crawlee.crawlers._adaptive_playwright._rendering_type_predictor import ( + DefaultRenderingTypePredictor, + RenderingType, + RenderingTypePredictor, +) +from crawlee.crawlers._adaptive_playwright._result_comparator import SubCrawlerRun, default_result_comparator if TYPE_CHECKING: from collections.abc import Awaitable, Callable, Mapping, Sequence @@ -74,6 +84,9 @@ class AdaptivePlaywrightCrawler(BasicCrawler[AdaptivePlaywrightCrawlingContext]) """Adaptive crawler that uses both BeautifulSoup crawler and PlaywrightCrawler.""" def __init__(self, + rendering_type_predictor: RenderingTypePredictor | None = None, + result_checker: Callable[[RequestHandlerRunResult], bool] | None = None, + result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None, beautifulsoup_crawler_kwargs: _BeautifulsoupCrawlerAdditionalOptions | None = None, playwright_crawler_args: _PlaywrightCrawlerAdditionalOptions | None = None, request_handler: Callable[[AdaptivePlaywrightCrawlingContext], Awaitable[None]] | None = None, @@ -84,6 +97,10 @@ def __init__(self, beautifulsoup_crawler_kwargs.setdefault('parser', 'lxml') playwright_crawler_args = playwright_crawler_args or {} + self.rendering_type_predictor = rendering_type_predictor or DefaultRenderingTypePredictor() + self.result_checker = result_checker or (lambda result: True) # noqa: ARG005 + self.result_comparator = result_comparator or default_result_comparator + 
self.beautifulsoup_crawler = BeautifulSoupCrawler(**beautifulsoup_crawler_kwargs, **kwargs) self.playwright_crawler = PlaywrightCrawler(**playwright_crawler_args, **kwargs) @@ -100,6 +117,12 @@ async def request_handler_playwright(context: PlaywrightCrawlingContext) -> None context=context, beautiful_soup_parser_type=beautifulsoup_crawler_kwargs['parser']) await self.router(adaptive_crawling_context) + # Make user adaptive statistics are used + if 'statistics' in kwargs: + statistics = AdaptivePlaywrightCrawlerStatistics.from_statistics() + else: + statistics = AdaptivePlaywrightCrawlerStatistics() + kwargs['statistics'] = statistics super().__init__(request_handler=request_handler, _context_pipeline=_context_pipeline, **kwargs) async def run( @@ -108,22 +131,61 @@ async def run( *, purge_request_queue: bool = True, ) -> FinalStatistics: + + # TODO: Create something more robust that does not leak implementation so much async with (self.beautifulsoup_crawler.statistics, self.playwright_crawler.statistics, - self.playwright_crawler._additional_context_managers[0]): # TODO: Create something more robust that does not leak implementation so much + self.playwright_crawler._additional_context_managers[0]): return await super().run(requests=requests, purge_request_queue=purge_request_queue) # Can't use override as mypy does not like it for double underscore private method. async def _BasicCrawler__run_request_handler(self, context: BasicCrawlingContext) -> None: # noqa: N802 - - result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store) - - await self.beautifulsoup_crawler.crawl_one(context = context, - request_handler_timeout=self._request_handler_timeout, - result= result) - await self.playwright_crawler.crawl_one(context=context, - request_handler_timeout=self._request_handler_timeout, - result=result) - await self.commit_result(result = result, context = context) + async def _run_subcrawler(crawler: BeautifulSoupCrawler | PlaywrightCrawler) -> SubCrawlerRun: + try: + crawl_result = await crawler.crawl_one( + context = context, + request_handler_timeout=self._request_handler_timeout, + result= RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store)) + return SubCrawlerRun(result=crawl_result) + except Exception as e: + return SubCrawlerRun(exception=e) + + + rendering_type_prediction = self.rendering_type_predictor.predict(context.request.url, context.request.label) + should_detect_rendering_type = random() < rendering_type_prediction.detection_probability_recommendation + + if not should_detect_rendering_type: + self.log.debug( + f'Predicted rendering type {rendering_type_prediction.rendering_type} for {context.request.url}') + if rendering_type_prediction.rendering_type == 'static': + self.statistics.track_http_only_request_handler_runs() + + bs_run = await _run_subcrawler(self.beautifulsoup_crawler) + if bs_run.ok and self.result_checker(bs_run.result): + await self.commit_result(result = bs_run.result, context=context) + return + if not bs_run.ok: + context.log.exception(msg=f'Static crawler: failed for {context.request.url}', + exc_info=bs_run.exception) + else: + context.log.warning(f'Static crawler: returned a suspicious result for {context.request.url}') + + pw_run = await _run_subcrawler(self.playwright_crawler) + + if pw_run.exception is not None: + raise pw_run.exception + await self.commit_result(result = pw_run.result, context=context) + + if should_detect_rendering_type: + detection_result: RenderingType + bs_run = await 
_run_subcrawler(self.beautifulsoup_crawler) + + if bs_run.ok and self.result_comparator(bs_run.result,pw_run.result): + detection_result = 'static' + else: + detection_result = 'client only' + + context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}') + self.rendering_type_predictor.store_result(context.request.url, context.request.label, detection_result) async def commit_result(self, result: RequestHandlerRunResult, context: BasicCrawlingContext) -> None: result_tasks = [] diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py new file mode 100644 index 0000000000..ae5490e5f3 --- /dev/null +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py @@ -0,0 +1,45 @@ +from typing import Annotated + +from pydantic import Field +from typing_extensions import Self, override + +from crawlee._utils.docs import docs_group +from crawlee.statistics import Statistics, StatisticsState + + +@docs_group('Data structures') +class AdaptivePlaywrightCrawlerStatisticState(StatisticsState): + http_only_request_handler_runs: Annotated[int, Field(alias='http_only_request_handler_runs')] = 0 + browser_request_handler_runs: Annotated[int, Field(alias='browser_request_handler_runs')] = 0 + rendering_type_mispredictions: Annotated[int, Field(alias='rendering_type_mispredictions')] = 0 + +@docs_group('Classes') +class AdaptivePlaywrightCrawlerStatistics(Statistics[AdaptivePlaywrightCrawlerStatisticState]): + + + def __init__(self, *args, **kwargs) -> None: + if 'state_model' not in kwargs: + kwargs['state_model'] = AdaptivePlaywrightCrawlerStatisticState + super().__init__(*args, **kwargs) + + @classmethod + def from_statistics(cls, statistics: Statistics) -> Self: + pass + + + @override + def reset(self) -> None: + super().reset() + self.state.http_only_request_handler_runs = 0 + self.state.browser_request_Handler_runs = 0 + self.state.rendering_type_mispredictions = 0 + + def track_http_only_request_handler_runs(self) -> None: + self.state.http_only_request_handler_runs += 1 + + def track_browser_request_handler_runs(self) -> None: + self.state.browser_request_Handler_runs += 1 + + def track_rendering_type_mispredictions(self) -> None: + self.state.rendering_type_mispredictions += 1 + diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py index 72c2057bec..6079969a34 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py @@ -56,7 +56,10 @@ async def from_playwright_crawling_context(cls, context: PlaywrightCrawlingConte context_kwargs['_response'] = context_kwargs.pop('response') context_kwargs['_page'] = context_kwargs.pop('page') context_kwargs['_infinite_scroll'] = context_kwargs.pop('infinite_scroll') - http_response = await _HttpResponse.from_playwright_response(context.response) + # This might be always available. 
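# (Illustrative note, not from the patch: `nextHopProtocol` comes from the browser's Resource Timing
# API and reports the negotiated ALPN protocol id such as 'http/1.1', 'h2' or 'h3', which is why it
# can stand in for the HTTP version that Playwright's `Response` does not expose directly.)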
+ protocol_guess = await context_kwargs['_page'].evaluate('() => performance.getEntries()[0].nextHopProtocol') + http_response = await _HttpResponse.from_playwright_response(response = context.response, + protocol = protocol_guess or '') return cls(parsed_content= BeautifulSoup(http_response.read(), features=beautiful_soup_parser_type), http_response = http_response, **context_kwargs) @@ -73,14 +76,14 @@ def read(self) -> bytes: return self._content @classmethod - async def from_playwright_response(cls, response: Response) -> Self: + async def from_playwright_response(cls, response: Response, protocol: str) -> Self: headers = HttpHeaders(response.headers) status_code = response.status # Can't find this anywhere in PlayWright, but some headers can include information about protocol. # In firefox for example: 'x-firefox-spdy' # Might be also obtained by executing JS code in browser: performance.getEntries()[0].nextHopProtocol # Response headers capitalization not respecting http1.1 Pascal case. Always lower case in PlayWright. - http_version = 'TODO' + http_version = protocol _content = await response.body() return cls(http_version=http_version, status_code=status_code, headers=headers, _content=_content) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_example.py b/src/crawlee/crawlers/_adaptive_playwright/_example.py index 75d92d855e..9bb5566437 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_example.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_example.py @@ -7,7 +7,7 @@ async def main() ->None: - crawler = AdaptivePlaywrightCrawler(max_requests_per_crawl=2) + crawler = AdaptivePlaywrightCrawler(max_requests_per_crawl=10) @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: @@ -17,7 +17,7 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: # Run the crawler with the initial list of URLs. - await crawler.run(['https://news.ycombinator.com/']) + await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': diff --git a/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py b/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py new file mode 100644 index 0000000000..2bef0cccce --- /dev/null +++ b/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py @@ -0,0 +1,39 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from random import choice +from typing import Literal + +from typing_extensions import override + +RenderingType = Literal['static', 'client only'] + +@dataclass(frozen=True) +class RenderingTypePrediction: + rendering_type: RenderingType + detection_probability_recommendation: float + + + +class RenderingTypePredictor(ABC): + + @abstractmethod + def predict(self, url: str, label: str | None) -> RenderingTypePrediction: + ... + + @abstractmethod + def store_result(self, url: str, label: str | None, crawl_type: RenderingType) -> None: + ... + + +class DefaultRenderingTypePredictor(RenderingTypePredictor): + #Dummy version of predictor. Proper version will be implemented in another change. 
+ + @override + def predict(self, url: str, label: str | None) -> RenderingTypePrediction: # Will be implemented later + return RenderingTypePrediction(choice(['static', 'client only']), 0.1) + + @override + def store_result(self, url: str, label: str | None, crawl_type: RenderingType) -> None: + pass diff --git a/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py b/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py new file mode 100644 index 0000000000..7190107df3 --- /dev/null +++ b/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from crawlee._types import RequestHandlerRunResult + + +@dataclass(frozen=True) +class SubCrawlerRun: + result: RequestHandlerRunResult | None= None + exception: Exception | None= None + + @property + def ok(self) -> bool: + return not bool(self.exception) + +def default_result_comparator(result_1: RequestHandlerRunResult, result_2: RequestHandlerRunResult) -> bool: + + # PlayWright can produce links with extra arguments compared to pure BS. Default comparator ignores this as it does + # not consider add_requests_kwargs + # https://sdk.apify.com/docs/guides/getting-started + # https://sdk.apify.com/docs/guides/getting-started?__hsfp=1136113150&__hssc=7591405.1.1735494277124&__hstc=7591405.e2b9302ed00c5bfaee3a870166792181.1735494277124.1735494277124.1735494277124.1 + + return ( + (result_1.push_data_calls == result_2.push_data_calls) and + (result_1.add_requests_calls == result_2.add_requests_calls) and + (result_1.key_value_store_changes == result_2.key_value_store_changes)) From 29d510ae01a263d22b8df60f2964dd3c78d904d1 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 1 Jan 2025 15:55:31 +0100 Subject: [PATCH 05/79] Statistics class change lead too many ripple changes. Ignore and create TODO follow up issue for refactoring Statistics class after technical discussion. --- .../_adaptive_playwright_crawler.py | 35 ++++++----- ..._adaptive_playwright_crawler_statistics.py | 61 +++++++++++++------ .../_result_comparator.py | 3 - src/crawlee/statistics/_statistics.py | 1 + 4 files changed, 62 insertions(+), 38 deletions(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index d384fad220..008d86df10 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -119,10 +119,10 @@ async def request_handler_playwright(context: PlaywrightCrawlingContext) -> None # Make user adaptive statistics are used if 'statistics' in kwargs: - statistics = AdaptivePlaywrightCrawlerStatistics.from_statistics() + statistics = AdaptivePlaywrightCrawlerStatistics.from_statistics(statistics=kwargs['statistics']) else: statistics = AdaptivePlaywrightCrawlerStatistics() - kwargs['statistics'] = statistics + kwargs['statistics'] = statistics # type:ignore[typeddict-item] # Statistics class would need refactoring beyond the scope of this change. 
TODO: super().__init__(request_handler=request_handler, _context_pipeline=_context_pipeline, **kwargs) async def run( @@ -157,35 +157,40 @@ async def _run_subcrawler(crawler: BeautifulSoupCrawler | PlaywrightCrawler) -> self.log.debug( f'Predicted rendering type {rendering_type_prediction.rendering_type} for {context.request.url}') if rendering_type_prediction.rendering_type == 'static': - self.statistics.track_http_only_request_handler_runs() + self.statistics.track_http_only_request_handler_runs() # type:ignore[attr-defined] # Statistics class would need refactoring beyond the scope of this change. TODO: bs_run = await _run_subcrawler(self.beautifulsoup_crawler) - if bs_run.ok and self.result_checker(bs_run.result): + if bs_run.result and self.result_checker(bs_run.result): await self.commit_result(result = bs_run.result, context=context) return - if not bs_run.ok: + if bs_run.exception: context.log.exception(msg=f'Static crawler: failed for {context.request.url}', exc_info=bs_run.exception) else: context.log.warning(f'Static crawler: returned a suspicious result for {context.request.url}') + self.stats.rendering_type_mispredictions() # type:ignore[attr-defined] # Statistics class would need refactoring beyond the scope of this change. TODO: + context.log.debug(f'Running browser request handler for {context.request.url}') pw_run = await _run_subcrawler(self.playwright_crawler) + self.stats.browser_request_handler_runs()# type:ignore[attr-defined] # Statistics class would need refactoring beyond the scope of this change. TODO: if pw_run.exception is not None: raise pw_run.exception - await self.commit_result(result = pw_run.result, context=context) - if should_detect_rendering_type: - detection_result: RenderingType - bs_run = await _run_subcrawler(self.beautifulsoup_crawler) + if pw_run.result: + await self.commit_result(result = pw_run.result, context=context) - if bs_run.ok and self.result_comparator(bs_run.result,pw_run.result): - detection_result = 'static' - else: - detection_result = 'client only' + if should_detect_rendering_type: + detection_result: RenderingType + bs_run = await _run_subcrawler(self.beautifulsoup_crawler) + + if bs_run.result and self.result_comparator(bs_run.result,pw_run.result): + detection_result = 'static' + else: + detection_result = 'client only' - context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}') - self.rendering_type_predictor.store_result(context.request.url, context.request.label, detection_result) + context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}') + self.rendering_type_predictor.store_result(context.request.url, context.request.label, detection_result) async def commit_result(self, result: RequestHandlerRunResult, context: BasicCrawlingContext) -> None: result_tasks = [] diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py index ae5490e5f3..04180e3191 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py @@ -1,11 +1,21 @@ -from typing import Annotated +from __future__ import annotations + +from datetime import timedelta +from typing import TYPE_CHECKING, Annotated from pydantic import Field -from typing_extensions import Self, override from crawlee._utils.docs import docs_group from crawlee.statistics 
import Statistics, StatisticsState +if TYPE_CHECKING: + from logging import Logger + + from typing_extensions import Self + + from crawlee.statistics._statistics import TStatisticsState + from crawlee.storages import KeyValueStore + @docs_group('Data structures') class AdaptivePlaywrightCrawlerStatisticState(StatisticsState): @@ -16,30 +26,41 @@ class AdaptivePlaywrightCrawlerStatisticState(StatisticsState): @docs_group('Classes') class AdaptivePlaywrightCrawlerStatistics(Statistics[AdaptivePlaywrightCrawlerStatisticState]): - - def __init__(self, *args, **kwargs) -> None: - if 'state_model' not in kwargs: - kwargs['state_model'] = AdaptivePlaywrightCrawlerStatisticState - super().__init__(*args, **kwargs) + def __init__(self,*, + persistence_enabled: bool = False, + persist_state_kvs_name: str = 'default', + persist_state_key: str | None = None, + key_value_store: KeyValueStore | None = None, + log_message: str = 'Statistics', + periodic_message_logger: Logger | None = None, + log_interval: timedelta = timedelta(minutes=1), + state_model: type[AdaptivePlaywrightCrawlerStatisticState] = AdaptivePlaywrightCrawlerStatisticState) -> None: + super().__init__(persistence_enabled=persistence_enabled, + persist_state_kvs_name=persist_state_kvs_name, + persist_state_key=persist_state_key, + key_value_store=key_value_store, + log_message=log_message, + periodic_message_logger=periodic_message_logger, + log_interval=log_interval, + state_model=state_model) @classmethod - def from_statistics(cls, statistics: Statistics) -> Self: - pass - - - @override - def reset(self) -> None: - super().reset() - self.state.http_only_request_handler_runs = 0 - self.state.browser_request_Handler_runs = 0 - self.state.rendering_type_mispredictions = 0 + def from_statistics(cls, statistics: Statistics[TStatisticsState]) -> Self: + return cls(persistence_enabled=statistics._persistence_enabled, # noqa:SLF001 # Statistics class would need refactoring beyond the scope of this change. TODO: + persist_state_kvs_name=statistics._persist_state_kvs_name, # noqa:SLF001 # Statistics class would need refactoring beyond the scope of this change. TODO: + persist_state_key=statistics._persist_state_key, # noqa:SLF001 # Statistics class would need refactoring beyond the scope of this change. TODO: + key_value_store=statistics._key_value_store, # noqa:SLF001 # Statistics class would need refactoring beyond the scope of this change. TODO: + log_message=statistics._log_message, # noqa:SLF001 # Statistics class would need refactoring beyond the scope of this change. TODO: + periodic_message_logger=statistics._periodic_message_logger, # noqa:SLF001 # Statistics class would need refactoring beyond the scope of this change. TODO: + log_interval=statistics._log_interval, # noqa:SLF001 # Statistics class would need refactoring beyond the scope of this change. TODO: + state_model=AdaptivePlaywrightCrawlerStatisticState) def track_http_only_request_handler_runs(self) -> None: - self.state.http_only_request_handler_runs += 1 + self.state.http_only_request_handler_runs += 1 # type:ignore[attr-defined] # Statistics class would need refactoring beyond the scope of this change. TODO: def track_browser_request_handler_runs(self) -> None: - self.state.browser_request_Handler_runs += 1 + self.state.browser_request_handler_runs += 1 # type:ignore[attr-defined] # Statistics class would need refactoring beyond the scope of this change. 
TODO: def track_rendering_type_mispredictions(self) -> None: - self.state.rendering_type_mispredictions += 1 + self.state.rendering_type_mispredictions += 1 # type:ignore[attr-defined] # Statistics class would need refactoring beyond the scope of this change. TODO: diff --git a/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py b/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py index 7190107df3..80111b3060 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py @@ -12,9 +12,6 @@ class SubCrawlerRun: result: RequestHandlerRunResult | None= None exception: Exception | None= None - @property - def ok(self) -> bool: - return not bool(self.exception) def default_result_comparator(result_1: RequestHandlerRunResult, result_2: RequestHandlerRunResult) -> bool: diff --git a/src/crawlee/statistics/_statistics.py b/src/crawlee/statistics/_statistics.py index 00612f4753..ca9a0a2a10 100644 --- a/src/crawlee/statistics/_statistics.py +++ b/src/crawlee/statistics/_statistics.py @@ -97,6 +97,7 @@ def __init__( self._log_message = log_message self._periodic_message_logger = periodic_message_logger or logger + self._log_interval = log_interval self._periodic_logger = RecurringTask(self._log, log_interval) # Flag to indicate the context state. From 04eefd94dcdfe789a96587da3900eafd80eeb8d1 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 1 Jan 2025 17:13:30 +0100 Subject: [PATCH 06/79] Handle sub crawlers loggers Handle use state. --- .../_adaptive_playwright_crawler.py | 52 +++++++++++++++---- src/crawlee/crawlers/_basic/_basic_crawler.py | 15 ++++-- 2 files changed, 54 insertions(+), 13 deletions(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 008d86df10..24843a6871 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -1,12 +1,15 @@ from __future__ import annotations import asyncio +import logging +from copy import deepcopy +from logging import getLogger from random import random from typing import TYPE_CHECKING, Any from IPython.core.completer import TypedDict -from crawlee._types import BasicCrawlingContext, RequestHandlerRunResult +from crawlee._types import BasicCrawlingContext, JsonSerializable, RequestHandlerRunResult from crawlee._utils.docs import docs_group from crawlee.crawlers import ( BasicCrawler, @@ -92,27 +95,40 @@ def __init__(self, request_handler: Callable[[AdaptivePlaywrightCrawlingContext], Awaitable[None]] | None = None, _context_pipeline: ContextPipeline[AdaptivePlaywrightCrawlingContext] | None = None, **kwargs: Unpack[_BasicCrawlerOptions]) -> None: + # TODO: pre navigation hooks support for subcrawlers. How to handle??? + + # Sub crawler kwargs might be modified + bs_kwargs = deepcopy(kwargs) + pw_kwargs = deepcopy(kwargs) beautifulsoup_crawler_kwargs = beautifulsoup_crawler_kwargs or {} beautifulsoup_crawler_kwargs.setdefault('parser', 'lxml') playwright_crawler_args = playwright_crawler_args or {} + # Each sub crawler will use custom logger. 
+ bs_logger = getLogger('Subcrawler_BS') + bs_logger.setLevel(logging.ERROR) + bs_kwargs['_logger'] = bs_logger + + pw_logger = getLogger('Subcrawler_PW') + pw_logger.setLevel(logging.ERROR) + pw_kwargs['_logger'] = pw_logger + + self.rendering_type_predictor = rendering_type_predictor or DefaultRenderingTypePredictor() self.result_checker = result_checker or (lambda result: True) # noqa: ARG005 self.result_comparator = result_comparator or default_result_comparator - self.beautifulsoup_crawler = BeautifulSoupCrawler(**beautifulsoup_crawler_kwargs, **kwargs) - self.playwright_crawler = PlaywrightCrawler(**playwright_crawler_args, **kwargs) + self.beautifulsoup_crawler = BeautifulSoupCrawler(**beautifulsoup_crawler_kwargs, **bs_kwargs) + self.playwright_crawler = PlaywrightCrawler(**playwright_crawler_args, **pw_kwargs) @self.beautifulsoup_crawler.router.default_handler async def request_handler_beautiful_soup(context: BeautifulSoupCrawlingContext) -> None: - context.log.info(f'Processing with BS: {context.request.url} ...') adaptive_crawling_context = AdaptivePlaywrightCrawlingContext.from_beautifulsoup_crawling_context(context) await self.router(adaptive_crawling_context) @self.playwright_crawler.router.default_handler async def request_handler_playwright(context: PlaywrightCrawlingContext) -> None: - context.log.info(f'Processing with PW: {context.request.url} ...') adaptive_crawling_context = await AdaptivePlaywrightCrawlingContext.from_playwright_crawling_context( context=context, beautiful_soup_parser_type=beautifulsoup_crawler_kwargs['parser']) await self.router(adaptive_crawling_context) @@ -122,6 +138,13 @@ async def request_handler_playwright(context: PlaywrightCrawlingContext) -> None statistics = AdaptivePlaywrightCrawlerStatistics.from_statistics(statistics=kwargs['statistics']) else: statistics = AdaptivePlaywrightCrawlerStatistics() + + # Each sub crawler will use custom logger. + if '_logger' not in kwargs: + top_logger = getLogger(__name__) + top_logger.setLevel(logging.DEBUG) + kwargs['_logger'] = top_logger + kwargs['statistics'] = statistics # type:ignore[typeddict-item] # Statistics class would need refactoring beyond the scope of this change. TODO: super().__init__(request_handler=request_handler, _context_pipeline=_context_pipeline, **kwargs) @@ -139,12 +162,13 @@ async def run( # Can't use override as mypy does not like it for double underscore private method. 
async def _BasicCrawler__run_request_handler(self, context: BasicCrawlingContext) -> None: # noqa: N802 - async def _run_subcrawler(crawler: BeautifulSoupCrawler | PlaywrightCrawler) -> SubCrawlerRun: + async def _run_subcrawler(crawler: BeautifulSoupCrawler | PlaywrightCrawler, use_state: dict | None = None) -> SubCrawlerRun: try: crawl_result = await crawler.crawl_one( context = context, request_handler_timeout=self._request_handler_timeout, - result= RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store)) + result= RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store), + use_state=use_state) return SubCrawlerRun(result=crawl_result) except Exception as e: return SubCrawlerRun(exception=e) @@ -157,6 +181,7 @@ async def _run_subcrawler(crawler: BeautifulSoupCrawler | PlaywrightCrawler) -> self.log.debug( f'Predicted rendering type {rendering_type_prediction.rendering_type} for {context.request.url}') if rendering_type_prediction.rendering_type == 'static': + context.log.debug(f'Running static request for {context.request.url}') self.statistics.track_http_only_request_handler_runs() # type:ignore[attr-defined] # Statistics class would need refactoring beyond the scope of this change. TODO: bs_run = await _run_subcrawler(self.beautifulsoup_crawler) @@ -168,11 +193,18 @@ async def _run_subcrawler(crawler: BeautifulSoupCrawler | PlaywrightCrawler) -> exc_info=bs_run.exception) else: context.log.warning(f'Static crawler: returned a suspicious result for {context.request.url}') - self.stats.rendering_type_mispredictions() # type:ignore[attr-defined] # Statistics class would need refactoring beyond the scope of this change. TODO: + self.stats.track_rendering_type_mispredictions() # type:ignore[attr-defined] # Statistics class would need refactoring beyond the scope of this change. TODO: context.log.debug(f'Running browser request handler for {context.request.url}') + # TODO: What is this used for actually??? + await context.use_state('CRAWLEE_STATE', {'some': 'state'}) + kvs = await context.get_key_value_store() + default_value =dict[str, JsonSerializable]() + old_state: dict[str, JsonSerializable] = await kvs.get_value('CRAWLEE_STATE', default_value) + old_state_copy = deepcopy(old_state) + pw_run = await _run_subcrawler(self.playwright_crawler) - self.stats.browser_request_handler_runs()# type:ignore[attr-defined] # Statistics class would need refactoring beyond the scope of this change. TODO: + self.statistics.track_browser_request_handler_runs()# type:ignore[attr-defined] # Statistics class would need refactoring beyond the scope of this change. 
TODO: if pw_run.exception is not None: raise pw_run.exception @@ -182,7 +214,7 @@ async def _run_subcrawler(crawler: BeautifulSoupCrawler | PlaywrightCrawler) -> if should_detect_rendering_type: detection_result: RenderingType - bs_run = await _run_subcrawler(self.beautifulsoup_crawler) + bs_run = await _run_subcrawler(self.beautifulsoup_crawler, use_state=old_state_copy) if bs_run.result and self.result_comparator(bs_run.result,pw_run.result): detection_result = 'static' diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index faab29b570..ba08e04519 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -1115,8 +1115,17 @@ async def __run_request_handler(self, context: BasicCrawlingContext) -> None: async def crawl_one(self, *, context: BasicCrawlingContext, request_handler_timeout: timedelta, - result: RequestHandlerRunResult) -> RequestHandlerRunResult: + result: RequestHandlerRunResult, use_state: dict[str, + JsonSerializable] | None = None) -> RequestHandlerRunResult: """Populate result by crawling one request from input context. Route context callbacks to result.""" + if use_state is not None: + async def get_old_use_state(key: str, default_value: dict[str, + JsonSerializable] | None = None) -> dict[str, JsonSerializable]: + return use_state + use_state_function = get_old_use_state + else: + use_state_function = context.use_state + result_specific_context = BasicCrawlingContext( request=context.request, session=context.session, @@ -1125,8 +1134,8 @@ async def crawl_one(self, *, context: BasicCrawlingContext, request_handler_time add_requests=result.add_requests, push_data=result.push_data, get_key_value_store=result.get_key_value_store, - use_state=self._use_state, # Not sure about this one. TODO: Learn what it is used for - log=self._logger, # Not sure, maybe take from new context? + use_state=use_state_function, + log=context.log ) await wait_for( From 33efed34a73e6bc1f3e5c02193dba0cb8c9518b9 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 2 Jan 2025 13:56:00 +0100 Subject: [PATCH 07/79] Statistics change to be usable without ignores. Pre-navigation hooks delegation to sub crawler hooks. 
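The hook registration split introduced here can be exercised as in the example script updated in this patch; a minimal sketch (handler bodies are illustrative only, the imports mirror the ones used in _example.py):

    from crawlee._types import BasicCrawlingContext
    from crawlee.crawlers import PlaywrightPreNavCrawlingContext
    from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler import AdaptivePlaywrightCrawler

    crawler = AdaptivePlaywrightCrawler(max_requests_per_crawl=10)

    @crawler.pre_navigation_hook_bs
    async def bs_hook(context: BasicCrawlingContext) -> None:
        # Runs only before the BeautifulSoup sub crawler navigates.
        context.log.info(f'BS pre navigation hook for: {context.request.url} ...')

    @crawler.pre_navigation_hook_pw
    async def pw_hook(context: PlaywrightPreNavCrawlingContext) -> None:
        # Runs only before the Playwright sub crawler navigates.
        context.log.info(f'PW pre navigation hook for: {context.request.url} ...')

Calling the inherited pre_navigation_hook directly now raises a RuntimeError, since a single hook cannot serve both sub crawler context types.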
--- .../_adaptive_playwright_crawler.py | 29 +++++++-- ..._adaptive_playwright_crawler_statistics.py | 58 ++++++++++++------ .../crawlers/_adaptive_playwright/_example.py | 9 +++ src/crawlee/statistics/_statistics.py | 59 ++++++++++--------- 4 files changed, 106 insertions(+), 49 deletions(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 24843a6871..86ec7f42ba 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -19,6 +19,7 @@ ContextPipeline, PlaywrightCrawler, PlaywrightCrawlingContext, + PlaywrightPreNavCrawlingContext, ) from crawlee.crawlers._abstract_http._abstract_http_crawler import _HttpCrawlerOptions from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler_statistics import ( @@ -33,6 +34,7 @@ RenderingTypePredictor, ) from crawlee.crawlers._adaptive_playwright._result_comparator import SubCrawlerRun, default_result_comparator +from crawlee.statistics import Statistics, StatisticsState if TYPE_CHECKING: from collections.abc import Awaitable, Callable, Mapping, Sequence @@ -95,7 +97,7 @@ def __init__(self, request_handler: Callable[[AdaptivePlaywrightCrawlingContext], Awaitable[None]] | None = None, _context_pipeline: ContextPipeline[AdaptivePlaywrightCrawlingContext] | None = None, **kwargs: Unpack[_BasicCrawlerOptions]) -> None: - # TODO: pre navigation hooks support for subcrawlers. How to handle??? + # TODO: pre navigation hooks support for subcrawlers. How to handle different contexts??? Not in JS. They have to be separate # Sub crawler kwargs might be modified bs_kwargs = deepcopy(kwargs) @@ -114,6 +116,9 @@ def __init__(self, pw_logger.setLevel(logging.ERROR) pw_kwargs['_logger'] = pw_logger + # Each sub crawler will use own statistics. + bs_kwargs['statistics'] = Statistics[StatisticsState]() + pw_kwargs['statistics'] = Statistics[StatisticsState]() self.rendering_type_predictor = rendering_type_predictor or DefaultRenderingTypePredictor() self.result_checker = result_checker or (lambda result: True) # noqa: ARG005 @@ -133,7 +138,8 @@ async def request_handler_playwright(context: PlaywrightCrawlingContext) -> None context=context, beautiful_soup_parser_type=beautifulsoup_crawler_kwargs['parser']) await self.router(adaptive_crawling_context) - # Make user adaptive statistics are used + + # Make user statistics are used. if 'statistics' in kwargs: statistics = AdaptivePlaywrightCrawlerStatistics.from_statistics(statistics=kwargs['statistics']) else: @@ -145,7 +151,7 @@ async def request_handler_playwright(context: PlaywrightCrawlingContext) -> None top_logger.setLevel(logging.DEBUG) kwargs['_logger'] = top_logger - kwargs['statistics'] = statistics # type:ignore[typeddict-item] # Statistics class would need refactoring beyond the scope of this change. TODO: + kwargs['statistics'] = statistics super().__init__(request_handler=request_handler, _context_pipeline=_context_pipeline, **kwargs) async def run( @@ -162,7 +168,8 @@ async def run( # Can't use override as mypy does not like it for double underscore private method. 
async def _BasicCrawler__run_request_handler(self, context: BasicCrawlingContext) -> None: # noqa: N802 - async def _run_subcrawler(crawler: BeautifulSoupCrawler | PlaywrightCrawler, use_state: dict | None = None) -> SubCrawlerRun: + async def _run_subcrawler(crawler: BeautifulSoupCrawler | PlaywrightCrawler, + use_state: dict | None = None) -> SubCrawlerRun: try: crawl_result = await crawler.crawl_one( context = context, @@ -233,4 +240,18 @@ async def commit_result(self, result: RequestHandlerRunResult, context: BasicCra await asyncio.gather(*result_tasks) + def pre_navigation_hook(self, hook: Callable[[Any], Awaitable[None]]) -> None: + """Pre navigation hooks for adaptive crawler are delegated to sub crawlers.""" + raise RuntimeError('Pre navigation hooks are ambiguous in adaptive crawling context. Use specific hook instead:' + '`pre_navigation_hook_pw` for playwright sub crawler related hooks or' + '`pre_navigation_hook_bs`for beautifulsoup sub crawler related hooks') + + def pre_navigation_hook_pw(self, hook: Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]]) -> None: + """Pre navigation hooks for playwright sub crawler of adaptive crawler.""" + self.playwright_crawler.pre_navigation_hook(hook) + + + def pre_navigation_hook_bs(self, hook: Callable[[BasicCrawlingContext], Awaitable[None]]) -> None: + """Pre navigation hooks for beautifulsoup sub crawler of adaptive crawler.""" + self.beautifulsoup_crawler.pre_navigation_hook(hook) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py index 04180e3191..8879290e18 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py @@ -1,9 +1,10 @@ from __future__ import annotations from datetime import timedelta -from typing import TYPE_CHECKING, Annotated +from typing import TYPE_CHECKING, Annotated, Any, cast -from pydantic import Field +from pydantic import BaseModel, ConfigDict, Field +from typing_extensions import override from crawlee._utils.docs import docs_group from crawlee.statistics import Statistics, StatisticsState @@ -13,18 +14,20 @@ from typing_extensions import Self - from crawlee.statistics._statistics import TStatisticsState from crawlee.storages import KeyValueStore @docs_group('Data structures') -class AdaptivePlaywrightCrawlerStatisticState(StatisticsState): +class PredictorState(BaseModel): + model_config = ConfigDict(populate_by_name=True, ser_json_inf_nan='constants') + http_only_request_handler_runs: Annotated[int, Field(alias='http_only_request_handler_runs')] = 0 browser_request_handler_runs: Annotated[int, Field(alias='browser_request_handler_runs')] = 0 rendering_type_mispredictions: Annotated[int, Field(alias='rendering_type_mispredictions')] = 0 @docs_group('Classes') -class AdaptivePlaywrightCrawlerStatistics(Statistics[AdaptivePlaywrightCrawlerStatisticState]): +class AdaptivePlaywrightCrawlerStatistics(Statistics[StatisticsState]): + def __init__(self,*, persistence_enabled: bool = False, @@ -34,7 +37,8 @@ def __init__(self,*, log_message: str = 'Statistics', periodic_message_logger: Logger | None = None, log_interval: timedelta = timedelta(minutes=1), - state_model: type[AdaptivePlaywrightCrawlerStatisticState] = AdaptivePlaywrightCrawlerStatisticState) -> None: + state_model: type[StatisticsState] = StatisticsState) -> None: + 
self._predictor_state = PredictorState() super().__init__(persistence_enabled=persistence_enabled, persist_state_kvs_name=persist_state_kvs_name, persist_state_key=persist_state_key, @@ -43,24 +47,42 @@ def __init__(self,*, periodic_message_logger=periodic_message_logger, log_interval=log_interval, state_model=state_model) + self._persist_predictor_state_key = self._get_default_persist_state_key() + '_PREDICTOR' @classmethod - def from_statistics(cls, statistics: Statistics[TStatisticsState]) -> Self: - return cls(persistence_enabled=statistics._persistence_enabled, # noqa:SLF001 # Statistics class would need refactoring beyond the scope of this change. TODO: - persist_state_kvs_name=statistics._persist_state_kvs_name, # noqa:SLF001 # Statistics class would need refactoring beyond the scope of this change. TODO: - persist_state_key=statistics._persist_state_key, # noqa:SLF001 # Statistics class would need refactoring beyond the scope of this change. TODO: - key_value_store=statistics._key_value_store, # noqa:SLF001 # Statistics class would need refactoring beyond the scope of this change. TODO: - log_message=statistics._log_message, # noqa:SLF001 # Statistics class would need refactoring beyond the scope of this change. TODO: - periodic_message_logger=statistics._periodic_message_logger, # noqa:SLF001 # Statistics class would need refactoring beyond the scope of this change. TODO: - log_interval=statistics._log_interval, # noqa:SLF001 # Statistics class would need refactoring beyond the scope of this change. TODO: - state_model=AdaptivePlaywrightCrawlerStatisticState) + def from_statistics(cls, statistics: Statistics[StatisticsState]) -> Self: + return cls(persistence_enabled=statistics._persistence_enabled, # noqa:SLF001 # Accessing private member to create copy like-object. + persist_state_kvs_name=statistics._persist_state_kvs_name, # noqa:SLF001 # Accessing private member to create copy like-object. + persist_state_key=statistics._persist_state_key, # noqa:SLF001 # Accessing private member to create copy like-object. + key_value_store=statistics._key_value_store, # noqa:SLF001 # Accessing private member to create copy like-object. + log_message=statistics._log_message, # noqa:SLF001 # Accessing private member to create copy like-object. + periodic_message_logger=statistics._periodic_message_logger, # noqa:SLF001 # Accessing private member to create copy like-object. + log_interval=statistics._log_interval, # noqa:SLF001 # Accessing private member to create copy like-object. + state_model=statistics._state_model, # noqa:SLF001 # Accessing private member to create copy like-object. + ) def track_http_only_request_handler_runs(self) -> None: - self.state.http_only_request_handler_runs += 1 # type:ignore[attr-defined] # Statistics class would need refactoring beyond the scope of this change. TODO: + self._predictor_state.http_only_request_handler_runs += 1 def track_browser_request_handler_runs(self) -> None: - self.state.browser_request_handler_runs += 1 # type:ignore[attr-defined] # Statistics class would need refactoring beyond the scope of this change. TODO: + self._predictor_state.browser_request_handler_runs += 1 def track_rendering_type_mispredictions(self) -> None: - self.state.rendering_type_mispredictions += 1 # type:ignore[attr-defined] # Statistics class would need refactoring beyond the scope of this change. 
TODO: + self._predictor_state.rendering_type_mispredictions += 1 + + @override + async def _persist_other_statistics(self, key_value_store: KeyValueStore) -> None: + """Persist state of predictor.""" + await key_value_store.set_value( + self._persist_predictor_state_key, + self._predictor_state.model_dump(mode='json', by_alias=True), + 'application/json', + ) + + + @override + async def _load_other_statistics(self, key_value_store: KeyValueStore) -> None: + """Load state of predictor.""" + stored_state = await key_value_store.get_value(self._persist_predictor_state_key, cast(Any, {})) + self._predictor_state = self._predictor_state.__class__.model_validate(stored_state) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_example.py b/src/crawlee/crawlers/_adaptive_playwright/_example.py index 9bb5566437..30dee537ef 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_example.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_example.py @@ -1,5 +1,7 @@ import asyncio +from crawlee._types import BasicCrawlingContext +from crawlee.crawlers import PlaywrightPreNavCrawlingContext from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler import AdaptivePlaywrightCrawler from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import ( AdaptivePlaywrightCrawlingContext, @@ -15,6 +17,13 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: await context.enqueue_links() await context.push_data({'Top crwaler Url': context.request.url}) + @crawler.pre_navigation_hook_bs + async def bs_hook(context: BasicCrawlingContext) -> None: + context.log.info(f'BS pre navigation hook for: {context.request.url} ...') + + @crawler.pre_navigation_hook_pw + async def pw_hook(context: PlaywrightPreNavCrawlingContext) -> None: + context.log.info(f'PW pre navigation hook for: {context.request.url} ...') # Run the crawler with the initial list of URLs. await crawler.run(['https://crawlee.dev/']) diff --git a/src/crawlee/statistics/_statistics.py b/src/crawlee/statistics/_statistics.py index ca9a0a2a10..621a0c312a 100644 --- a/src/crawlee/statistics/_statistics.py +++ b/src/crawlee/statistics/_statistics.py @@ -88,7 +88,7 @@ def __init__( self._requests_in_progress = dict[str, RequestProcessingRecord]() if persist_state_key is None: - persist_state_key = f'SDK_CRAWLER_STATISTICS_{self._id}' + persist_state_key = self._get_default_persist_state_key() self._persistence_enabled = persistence_enabled self._persist_state_key = persist_state_key @@ -103,6 +103,9 @@ def __init__( # Flag to indicate the context state. 
self._active = False + def _get_default_persist_state_key(self) -> str: + return f'SDK_CRAWLER_STATISTICS_{self._id}' + @property def active(self) -> bool: """Indicates whether the context is active.""" @@ -245,36 +248,27 @@ def _log(self) -> None: self._periodic_message_logger.info(f'{self._log_message}\n{stats.to_table()}') async def _maybe_load_statistics(self) -> None: - if not self._persistence_enabled: - return - - if not self._key_value_store: - return - - stored_state = await self._key_value_store.get_value(self._persist_state_key, cast(Any, {})) + if self._persistence_enabled and self._key_value_store is not None: + await self._load_crawler_statistics(self._key_value_store) + await self._load_other_statistics(self._key_value_store) - saved_state = self.state.__class__.model_validate(stored_state) - self.state = saved_state - - if saved_state.stats_persisted_at is not None and saved_state.crawler_last_started_at: - self._instance_start = datetime.now(timezone.utc) - ( - saved_state.stats_persisted_at - saved_state.crawler_last_started_at - ) - elif saved_state.crawler_last_started_at: - self._instance_start = saved_state.crawler_last_started_at - - async def _persist_state(self, event_data: EventPersistStateData) -> None: - logger.debug(f'Persisting state of the Statistics (event_data={event_data}).') + async def _load_crawler_statistics(self, key_value_store: KeyValueStore) -> None: + stored_state = await key_value_store.get_value(self._persist_state_key, cast(Any, {})) - if not self._persistence_enabled: - return + saved_state = self.state.__class__.model_validate(stored_state) + self.state = saved_state - if not self._key_value_store: - return + if saved_state.stats_persisted_at is not None and saved_state.crawler_last_started_at: + self._instance_start = datetime.now(timezone.utc) - ( + saved_state.stats_persisted_at - saved_state.crawler_last_started_at + ) + elif saved_state.crawler_last_started_at: + self._instance_start = saved_state.crawler_last_started_at - if not self._instance_start: - return + async def _load_other_statistics(self, key_value_store: KeyValueStore) -> None: + """Load other statistics. Not implemented in Statistics.""" + async def _persist_crawler_statistics(self, key_value_store: KeyValueStore) -> None: final_statistics = self.calculate() persisted_state = StatisticsPersistedState( stats_id=self._id, @@ -289,12 +283,23 @@ async def _persist_state(self, event_data: EventPersistStateData) -> None: logger.debug('Persisting state') - await self._key_value_store.set_value( + await key_value_store.set_value( self._persist_state_key, self.state.model_dump(mode='json', by_alias=True) | persisted_state.model_dump(mode='json', by_alias=True), 'application/json', ) + async def _persist_other_statistics(self, key_value_store: KeyValueStore) -> None: + """Save other statistics. 
Not implemented in Statistics.""" + + + async def _persist_state(self, event_data: EventPersistStateData) -> None: + logger.debug(f'Persisting state of the Statistics (event_data={event_data}).') + if self._persistence_enabled and self._key_value_store is not None and self._instance_start is not None: + await self._persist_crawler_statistics(self._key_value_store) + await self._persist_other_statistics(self._key_value_store) + + def _save_retry_count_for_request(self, record: RequestProcessingRecord) -> None: retry_count = record.retry_count From 202aceb7320cfd018d11f23b0757651a33f15742 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 2 Jan 2025 14:44:46 +0100 Subject: [PATCH 08/79] Align use_state with JS implementation. --- src/crawlee/_types.py | 1 - .../_adaptive_playwright_crawler.py | 47 +++++++++---------- .../crawlers/_adaptive_playwright/_example.py | 9 +++- src/crawlee/crawlers/_basic/_basic_crawler.py | 38 ++++++++++----- .../crawlers/_basic/test_basic_crawler.py | 12 ++--- 5 files changed, 63 insertions(+), 44 deletions(-) diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py index 1fba2248ac..30d6ebf401 100644 --- a/src/crawlee/_types.py +++ b/src/crawlee/_types.py @@ -298,7 +298,6 @@ class UseStateFunction(Protocol): def __call__( self, - key: str, default_value: dict[str, JsonSerializable] | None = None, ) -> Coroutine[None, None, dict[str, JsonSerializable]]: ... diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 86ec7f42ba..306bb0685f 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -97,17 +97,29 @@ def __init__(self, request_handler: Callable[[AdaptivePlaywrightCrawlingContext], Awaitable[None]] | None = None, _context_pipeline: ContextPipeline[AdaptivePlaywrightCrawlingContext] | None = None, **kwargs: Unpack[_BasicCrawlerOptions]) -> None: - # TODO: pre navigation hooks support for subcrawlers. How to handle different contexts??? Not in JS. They have to be separate - - # Sub crawler kwargs might be modified + # Some sub crawler kwargs are internally modified. Prepare copies. bs_kwargs = deepcopy(kwargs) pw_kwargs = deepcopy(kwargs) + # Adaptive crawling related. + self.rendering_type_predictor = rendering_type_predictor or DefaultRenderingTypePredictor() + self.result_checker = result_checker or (lambda result: True) # noqa: ARG005 + self.result_comparator = result_comparator or default_result_comparator + # Use AdaptivePlaywrightCrawlerStatistics. + if 'statistics' in kwargs: + # If statistics already specified by user, create AdaptivePlaywrightCrawlerStatistics from it. + statistics = AdaptivePlaywrightCrawlerStatistics.from_statistics(statistics=kwargs['statistics']) + else: + statistics = AdaptivePlaywrightCrawlerStatistics() + kwargs['statistics'] = statistics + + + # Sub crawlers related. beautifulsoup_crawler_kwargs = beautifulsoup_crawler_kwargs or {} beautifulsoup_crawler_kwargs.setdefault('parser', 'lxml') playwright_crawler_args = playwright_crawler_args or {} - # Each sub crawler will use custom logger. + # Each sub crawler will use custom logger . 
bs_logger = getLogger('Subcrawler_BS') bs_logger.setLevel(logging.ERROR) bs_kwargs['_logger'] = bs_logger @@ -120,38 +132,23 @@ def __init__(self, bs_kwargs['statistics'] = Statistics[StatisticsState]() pw_kwargs['statistics'] = Statistics[StatisticsState]() - self.rendering_type_predictor = rendering_type_predictor or DefaultRenderingTypePredictor() - self.result_checker = result_checker or (lambda result: True) # noqa: ARG005 - self.result_comparator = result_comparator or default_result_comparator - self.beautifulsoup_crawler = BeautifulSoupCrawler(**beautifulsoup_crawler_kwargs, **bs_kwargs) self.playwright_crawler = PlaywrightCrawler(**playwright_crawler_args, **pw_kwargs) @self.beautifulsoup_crawler.router.default_handler async def request_handler_beautiful_soup(context: BeautifulSoupCrawlingContext) -> None: + """Handler for routing from beautifulsoup_crawler to adaptive_crawler handler.""" adaptive_crawling_context = AdaptivePlaywrightCrawlingContext.from_beautifulsoup_crawling_context(context) await self.router(adaptive_crawling_context) @self.playwright_crawler.router.default_handler async def request_handler_playwright(context: PlaywrightCrawlingContext) -> None: + """Handler for routing from playwright_crawler to adaptive_crawler handler.""" adaptive_crawling_context = await AdaptivePlaywrightCrawlingContext.from_playwright_crawling_context( context=context, beautiful_soup_parser_type=beautifulsoup_crawler_kwargs['parser']) await self.router(adaptive_crawling_context) - # Make user statistics are used. - if 'statistics' in kwargs: - statistics = AdaptivePlaywrightCrawlerStatistics.from_statistics(statistics=kwargs['statistics']) - else: - statistics = AdaptivePlaywrightCrawlerStatistics() - - # Each sub crawler will use custom logger. - if '_logger' not in kwargs: - top_logger = getLogger(__name__) - top_logger.setLevel(logging.DEBUG) - kwargs['_logger'] = top_logger - - kwargs['statistics'] = statistics super().__init__(request_handler=request_handler, _context_pipeline=_context_pipeline, **kwargs) async def run( @@ -203,11 +200,10 @@ async def _run_subcrawler(crawler: BeautifulSoupCrawler | PlaywrightCrawler, self.stats.track_rendering_type_mispredictions() # type:ignore[attr-defined] # Statistics class would need refactoring beyond the scope of this change. TODO: context.log.debug(f'Running browser request handler for {context.request.url}') - # TODO: What is this used for actually??? - await context.use_state('CRAWLEE_STATE', {'some': 'state'}) + kvs = await context.get_key_value_store() default_value =dict[str, JsonSerializable]() - old_state: dict[str, JsonSerializable] = await kvs.get_value('CRAWLEE_STATE', default_value) + old_state: dict[str, JsonSerializable] = await kvs.get_value(BasicCrawler.CRAWLEE_STATE_KEY, default_value) old_state_copy = deepcopy(old_state) pw_run = await _run_subcrawler(self.playwright_crawler) @@ -244,7 +240,8 @@ def pre_navigation_hook(self, hook: Callable[[Any], Awaitable[None]]) -> None: """Pre navigation hooks for adaptive crawler are delegated to sub crawlers.""" raise RuntimeError('Pre navigation hooks are ambiguous in adaptive crawling context. Use specific hook instead:' '`pre_navigation_hook_pw` for playwright sub crawler related hooks or' - '`pre_navigation_hook_bs`for beautifulsoup sub crawler related hooks') + '`pre_navigation_hook_bs`for beautifulsoup sub crawler related hooks. 
\n' + f'{hook=} will not be used!!!') def pre_navigation_hook_pw(self, hook: Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]]) -> None: """Pre navigation hooks for playwright sub crawler of adaptive crawler.""" diff --git a/src/crawlee/crawlers/_adaptive_playwright/_example.py b/src/crawlee/crawlers/_adaptive_playwright/_example.py index 30dee537ef..5f91a4f6df 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_example.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_example.py @@ -1,4 +1,6 @@ import asyncio +import logging +from logging import getLogger from crawlee._types import BasicCrawlingContext from crawlee.crawlers import PlaywrightPreNavCrawlingContext @@ -9,7 +11,12 @@ async def main() ->None: - crawler = AdaptivePlaywrightCrawler(max_requests_per_crawl=10) + # TODO: remove in review + top_logger = getLogger(__name__) + top_logger.setLevel(logging.DEBUG) + + + crawler = AdaptivePlaywrightCrawler(max_requests_per_crawl=10, _logger=top_logger) @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index ba08e04519..45cba3e301 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -181,6 +181,7 @@ class BasicCrawler(Generic[TCrawlingContext]): - direct storage interaction helpers, - and more. """ + CRAWLEE_STATE_KEY = 'CRAWLEE_STATE' def __init__( self, @@ -574,10 +575,10 @@ async def add_requests( ) async def _use_state( - self, key: str, default_value: dict[str, JsonSerializable] | None = None + self, default_value: dict[str, JsonSerializable] | None = None ) -> dict[str, JsonSerializable]: store = await self.get_key_value_store() - return await store.get_auto_saved_value(key, default_value) + return await store.get_auto_saved_value(BasicCrawler.CRAWLEE_STATE_KEY, default_value) async def _save_crawler_state(self) -> None: store = await self.get_key_value_store() @@ -1114,19 +1115,34 @@ async def __run_request_handler(self, context: BasicCrawlingContext) -> None: await self._context_pipeline(context, self.router) - async def crawl_one(self, *, context: BasicCrawlingContext, request_handler_timeout: timedelta, - result: RequestHandlerRunResult, use_state: dict[str, - JsonSerializable] | None = None) -> RequestHandlerRunResult: - """Populate result by crawling one request from input context. Route context callbacks to result.""" + async def crawl_one(self, *, context: BasicCrawlingContext, + request_handler_timeout: timedelta, + result: RequestHandlerRunResult, + use_state: dict[str,JsonSerializable] | None = None + ) -> RequestHandlerRunResult: + """Populate result by crawling one request from input `context`. + + Context callbacks are routed to `result` and are not commited. + + Args: + context: Context used for crawling. It contains `request` that will be crawled. + request_handler_timeout: Timeout in seconds for request handling. + result: Record of calls to storage-related context helpers. + use_state: Existing state that will be used when `context.use_state` is called. + If none, take `use_state` from input `context`. + + Returns: + Same input result object that is mutated in the process. 
+ """ if use_state is not None: - async def get_old_use_state(key: str, default_value: dict[str, - JsonSerializable] | None = None) -> dict[str, JsonSerializable]: + async def get_input_state(default_value: dict[str, JsonSerializable] | None = None # noqa:ARG001 # Intentionally unused arguments. Closure, that generates same output regardless of inputs. + ) -> dict[str, JsonSerializable]: return use_state - use_state_function = get_old_use_state + use_state_function = get_input_state else: use_state_function = context.use_state - result_specific_context = BasicCrawlingContext( + context_linked_to_result = BasicCrawlingContext( request=context.request, session=context.session, proxy_info=context.proxy_info, @@ -1139,7 +1155,7 @@ async def get_old_use_state(key: str, default_value: dict[str, ) await wait_for( - lambda: self.__run_request_handler(result_specific_context), + lambda: self.__run_request_handler(context_linked_to_result), timeout=request_handler_timeout, timeout_message='Request handler timed out after ' f'{self._request_handler_timeout.total_seconds()} seconds', diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index b6e7693a7e..7c9693eeb1 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -710,13 +710,13 @@ async def test_context_use_state(key_value_store: KeyValueStore) -> None: @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: - await context.use_state('state', {'hello': 'world'}) + await context.use_state({'hello': 'world'}) await crawler.run(['https://hello.world']) store = await crawler.get_key_value_store() - assert (await store.get_value('state')) == {'hello': 'world'} + assert (await store.get_value(BasicCrawler.CRAWLEE_STATE_KEY)) == {'hello': 'world'} async def test_context_handlers_use_state(key_value_store: KeyValueStore) -> None: @@ -728,20 +728,20 @@ async def test_context_handlers_use_state(key_value_store: KeyValueStore) -> Non @crawler.router.handler('one') async def handler_one(context: BasicCrawlingContext) -> None: - state = await context.use_state('state', {'hello': 'world'}) + state = await context.use_state({'hello': 'world'}) state_in_handler_one.update(state) state['hello'] = 'new_world' await context.add_requests([Request.from_url('https://crawlee.dev/docs/quick-start', label='two')]) @crawler.router.handler('two') async def handler_two(context: BasicCrawlingContext) -> None: - state = await context.use_state('state', {'hello': 'world'}) + state = await context.use_state({'hello': 'world'}) state_in_handler_two.update(state) state['hello'] = 'last_world' @crawler.router.handler('three') async def handler_three(context: BasicCrawlingContext) -> None: - state = await context.use_state('state', {'hello': 'world'}) + state = await context.use_state({'hello': 'world'}) state_in_handler_three.update(state) await crawler.run([Request.from_url('https://crawlee.dev/', label='one')]) @@ -759,7 +759,7 @@ async def handler_three(context: BasicCrawlingContext) -> None: store = await crawler.get_key_value_store() # The state in the KVS must match with the last set state - assert (await store.get_value('state')) == {'hello': 'last_world'} + assert (await store.get_value(BasicCrawler.CRAWLEE_STATE_KEY)) == {'hello': 'last_world'} async def test_max_requests_per_crawl(httpbin: URL) -> None: From d474a94cf553dacd4f0ddb56b419c86c1cc8568f Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 2 Jan 
2025 15:15:37 +0100 Subject: [PATCH 09/79] Remove "fake generics" from Statistics. Statistics were marked as generics, but in reality were not. Hardcoding state_model to make it explicit and clear. --- .../_adaptive_playwright_crawler.py | 18 ++++++++++++------ .../_adaptive_playwright_crawler_statistics.py | 4 ++-- src/crawlee/crawlers/_basic/_basic_crawler.py | 6 +++--- src/crawlee/statistics/_statistics.py | 11 +++++------ 4 files changed, 22 insertions(+), 17 deletions(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 306bb0685f..44412a8175 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -34,7 +34,7 @@ RenderingTypePredictor, ) from crawlee.crawlers._adaptive_playwright._result_comparator import SubCrawlerRun, default_result_comparator -from crawlee.statistics import Statistics, StatisticsState +from crawlee.statistics import Statistics if TYPE_CHECKING: from collections.abc import Awaitable, Callable, Mapping, Sequence @@ -106,6 +106,7 @@ def __init__(self, self.result_checker = result_checker or (lambda result: True) # noqa: ARG005 self.result_comparator = result_comparator or default_result_comparator # Use AdaptivePlaywrightCrawlerStatistics. + # Very hard to work with current "fake generic" Statistics. TODO: Discuss best approach. if 'statistics' in kwargs: # If statistics already specified by user, create AdaptivePlaywrightCrawlerStatistics from it. statistics = AdaptivePlaywrightCrawlerStatistics.from_statistics(statistics=kwargs['statistics']) @@ -113,6 +114,11 @@ def __init__(self, statistics = AdaptivePlaywrightCrawlerStatistics() kwargs['statistics'] = statistics + # self.statistics is hard coded in BasicCrawler to Statistics, so even when we save children class in it, mypy + # will complain about using child-specific methods. Save same object to another attribute so that + # AdaptivePlaywrightCrawlerStatistics specific methods can be access in "type safe manner". + self.adaptive_statistics = statistics + # Sub crawlers related. beautifulsoup_crawler_kwargs = beautifulsoup_crawler_kwargs or {} @@ -129,8 +135,8 @@ def __init__(self, pw_kwargs['_logger'] = pw_logger # Each sub crawler will use own statistics. - bs_kwargs['statistics'] = Statistics[StatisticsState]() - pw_kwargs['statistics'] = Statistics[StatisticsState]() + bs_kwargs['statistics'] = Statistics() + pw_kwargs['statistics'] = Statistics() self.beautifulsoup_crawler = BeautifulSoupCrawler(**beautifulsoup_crawler_kwargs, **bs_kwargs) self.playwright_crawler = PlaywrightCrawler(**playwright_crawler_args, **pw_kwargs) @@ -186,7 +192,7 @@ async def _run_subcrawler(crawler: BeautifulSoupCrawler | PlaywrightCrawler, f'Predicted rendering type {rendering_type_prediction.rendering_type} for {context.request.url}') if rendering_type_prediction.rendering_type == 'static': context.log.debug(f'Running static request for {context.request.url}') - self.statistics.track_http_only_request_handler_runs() # type:ignore[attr-defined] # Statistics class would need refactoring beyond the scope of this change. 
TODO: + self.adaptive_statistics.track_http_only_request_handler_runs() bs_run = await _run_subcrawler(self.beautifulsoup_crawler) if bs_run.result and self.result_checker(bs_run.result): @@ -197,7 +203,7 @@ async def _run_subcrawler(crawler: BeautifulSoupCrawler | PlaywrightCrawler, exc_info=bs_run.exception) else: context.log.warning(f'Static crawler: returned a suspicious result for {context.request.url}') - self.stats.track_rendering_type_mispredictions() # type:ignore[attr-defined] # Statistics class would need refactoring beyond the scope of this change. TODO: + self.adaptive_statistics.track_rendering_type_mispredictions() context.log.debug(f'Running browser request handler for {context.request.url}') @@ -207,7 +213,7 @@ async def _run_subcrawler(crawler: BeautifulSoupCrawler | PlaywrightCrawler, old_state_copy = deepcopy(old_state) pw_run = await _run_subcrawler(self.playwright_crawler) - self.statistics.track_browser_request_handler_runs()# type:ignore[attr-defined] # Statistics class would need refactoring beyond the scope of this change. TODO: + self.adaptive_statistics.track_browser_request_handler_runs() if pw_run.exception is not None: raise pw_run.exception diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py index 8879290e18..17a9770af1 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py @@ -26,7 +26,7 @@ class PredictorState(BaseModel): rendering_type_mispredictions: Annotated[int, Field(alias='rendering_type_mispredictions')] = 0 @docs_group('Classes') -class AdaptivePlaywrightCrawlerStatistics(Statistics[StatisticsState]): +class AdaptivePlaywrightCrawlerStatistics(Statistics): def __init__(self,*, @@ -50,7 +50,7 @@ def __init__(self,*, self._persist_predictor_state_key = self._get_default_persist_state_key() + '_PREDICTOR' @classmethod - def from_statistics(cls, statistics: Statistics[StatisticsState]) -> Self: + def from_statistics(cls, statistics: Statistics) -> Self: return cls(persistence_enabled=statistics._persistence_enabled, # noqa:SLF001 # Accessing private member to create copy like-object. persist_state_kvs_name=statistics._persist_state_kvs_name, # noqa:SLF001 # Accessing private member to create copy like-object. persist_state_key=statistics._persist_state_key, # noqa:SLF001 # Accessing private member to create copy like-object. 
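# Sketch of the 'fake generic' this patch removes: the type parameter never
# constrained the runtime state object, because state_model defaulted to
# StatisticsState independently of it.
#
#     stats = Statistics[AdaptivePlaywrightCrawlerStatisticState]()
#     reveal_type(stats.state)   # type checker: AdaptivePlaywrightCrawlerStatisticState
#     print(type(stats.state))   # runtime: plain StatisticsState
#
# Hard-coding state_model (below) makes the actual behaviour explicit.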
diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 45cba3e301..9b09b15e54 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -54,7 +54,7 @@ from crawlee.proxy_configuration import ProxyConfiguration, ProxyInfo from crawlee.request_loaders import RequestManager from crawlee.sessions import Session - from crawlee.statistics import FinalStatistics, StatisticsState + from crawlee.statistics import FinalStatistics from crawlee.storage_clients import BaseStorageClient from crawlee.storage_clients.models import DatasetItemsListPage from crawlee.storages._dataset import ExportDataCsvKwargs, ExportDataJsonKwargs, GetDataKwargs, PushDataKwargs @@ -120,7 +120,7 @@ class _BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]): request_handler_timeout: NotRequired[timedelta] """Maximum duration allowed for a single request handler to run.""" - statistics: NotRequired[Statistics[StatisticsState]] + statistics: NotRequired[Statistics] """A custom `Statistics` instance, allowing the use of non-default configuration.""" abort_on_error: NotRequired[bool] @@ -354,7 +354,7 @@ def router(self, router: Router[TCrawlingContext]) -> None: self._router = router @property - def statistics(self) -> Statistics[StatisticsState]: + def statistics(self) -> Statistics: """Statistics about the current (or last) crawler run.""" return self._statistics diff --git a/src/crawlee/statistics/_statistics.py b/src/crawlee/statistics/_statistics.py index 621a0c312a..948ca4cd2a 100644 --- a/src/crawlee/statistics/_statistics.py +++ b/src/crawlee/statistics/_statistics.py @@ -4,9 +4,7 @@ import math from datetime import datetime, timedelta, timezone from logging import Logger, getLogger -from typing import TYPE_CHECKING, Any, Generic, cast - -from typing_extensions import Self, TypeVar +from typing import TYPE_CHECKING, Any, cast from crawlee import service_locator from crawlee._utils.context import ensure_context @@ -20,7 +18,8 @@ if TYPE_CHECKING: from types import TracebackType -TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState) + from typing_extensions import Self + logger = getLogger(__name__) @@ -54,7 +53,7 @@ def retry_count(self) -> int: @docs_group('Classes') -class Statistics(Generic[TStatisticsState]): +class Statistics: """An interface to collecting and logging runtime statistics for requests. All information is saved to the key value store so that it persists between migrations, abortions and resurrections. @@ -72,7 +71,7 @@ def __init__( log_message: str = 'Statistics', periodic_message_logger: Logger | None = None, log_interval: timedelta = timedelta(minutes=1), - state_model: type[TStatisticsState] = cast(Any, StatisticsState), # noqa: B008 - in an ideal world, TStatisticsState would be inferred from this argument, but I haven't managed to do that + state_model: type[StatisticsState] = StatisticsState, ) -> None: self._id = Statistics.__next_id Statistics.__next_id += 1 From e95ff192e7278318159b828a78491a91d932385f Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 2 Jan 2025 15:34:18 +0100 Subject: [PATCH 10/79] Align result comparator witrh JS implementation. 
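The create_comparator factory added below falls back to comparing only pushed data, or reuses a user supplied result_checker when one is given. A custom comparator can still be injected explicitly; a minimal sketch (import paths follow the modules added in this series):

    from crawlee._types import RequestHandlerRunResult
    from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler import AdaptivePlaywrightCrawler

    def push_data_comparator(a: RequestHandlerRunResult, b: RequestHandlerRunResult) -> bool:
        # Treat the static and browser runs as equivalent when they pushed the same items.
        return a.push_data_calls == b.push_data_calls

    crawler = AdaptivePlaywrightCrawler(result_comparator=push_data_comparator)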
--- .../_adaptive_playwright_crawler.py | 9 ++++++-- .../_result_comparator.py | 21 +++++++++++++++++-- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 44412a8175..5bdb43b05f 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -33,7 +33,10 @@ RenderingType, RenderingTypePredictor, ) -from crawlee.crawlers._adaptive_playwright._result_comparator import SubCrawlerRun, default_result_comparator +from crawlee.crawlers._adaptive_playwright._result_comparator import ( + SubCrawlerRun, + create_comparator, +) from crawlee.statistics import Statistics if TYPE_CHECKING: @@ -104,7 +107,9 @@ def __init__(self, # Adaptive crawling related. self.rendering_type_predictor = rendering_type_predictor or DefaultRenderingTypePredictor() self.result_checker = result_checker or (lambda result: True) # noqa: ARG005 - self.result_comparator = result_comparator or default_result_comparator + + self.result_comparator = result_comparator or create_comparator(result_checker) + # Use AdaptivePlaywrightCrawlerStatistics. # Very hard to work with current "fake generic" Statistics. TODO: Discuss best approach. if 'statistics' in kwargs: diff --git a/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py b/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py index 80111b3060..14c0dbb3a0 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py @@ -4,6 +4,8 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: + from collections.abc import Callable + from crawlee._types import RequestHandlerRunResult @@ -13,10 +15,21 @@ class SubCrawlerRun: exception: Exception | None= None -def default_result_comparator(result_1: RequestHandlerRunResult, result_2: RequestHandlerRunResult) -> bool: +def create_comparator(result_checker: Callable[[RequestHandlerRunResult], bool] | None + ) -> Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool]: + """Factory for creating comparator function.""" + if result_checker: + # Fallback comparator if only user-specific checker is defined. + return lambda result_1, result_2: result_checker(result_1) and result_checker(result_2) + # Fallback default comparator. + return push_data_only_comparator + + +def full_result_comparator(result_1: RequestHandlerRunResult, result_2: RequestHandlerRunResult) -> bool: + """Compare results by comparing all their parts.""" # PlayWright can produce links with extra arguments compared to pure BS. Default comparator ignores this as it does - # not consider add_requests_kwargs + # Maybe full comparator should have flag about taking into account only urls without parameters. 
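+ # One way to implement that flag (a sketch, not part of this change): normalize
+ # the enqueued URLs before comparing them, so that tracking parameters alone do
+ # not make the two runs look different, e.g.
+ #
+ #     from urllib.parse import urlsplit, urlunsplit
+ #
+ #     def _without_query(url: str) -> str:
+ #         scheme, netloc, path, _query, _fragment = urlsplit(url)
+ #         return urlunsplit((scheme, netloc, path, '', ''))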
# https://sdk.apify.com/docs/guides/getting-started # https://sdk.apify.com/docs/guides/getting-started?__hsfp=1136113150&__hssc=7591405.1.1735494277124&__hstc=7591405.e2b9302ed00c5bfaee3a870166792181.1735494277124.1735494277124.1735494277124.1 @@ -24,3 +37,7 @@ def default_result_comparator(result_1: RequestHandlerRunResult, result_2: Reque (result_1.push_data_calls == result_2.push_data_calls) and (result_1.add_requests_calls == result_2.add_requests_calls) and (result_1.key_value_store_changes == result_2.key_value_store_changes)) + +def push_data_only_comparator(result_1: RequestHandlerRunResult, result_2: RequestHandlerRunResult) -> bool: + """Compare results by comparing their push dara calls. Ignore other parts of results in comparison.""" + return result_1.push_data_calls == result_2.push_data_calls From 2408d85b3c58baee767e300a2d78ed68ea414b3a Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 2 Jan 2025 17:30:13 +0100 Subject: [PATCH 11/79] Add doc strings. WIP KVS handling. Currently it does not go through Result handler. --- docs/examples/code/beautifulsoup_crawler.py | 2 +- src/crawlee/_types.py | 4 ++ .../crawlers/_adaptive_playwright/__init__.py | 13 +++++ .../_adaptive_playwright_crawler.py | 54 ++++++++++++++++--- .../crawlers/_adaptive_playwright/_example.py | 9 +++- src/crawlee/crawlers/_basic/_basic_crawler.py | 9 +++- 6 files changed, 81 insertions(+), 10 deletions(-) diff --git a/docs/examples/code/beautifulsoup_crawler.py b/docs/examples/code/beautifulsoup_crawler.py index 94857bd736..0df6182677 100644 --- a/docs/examples/code/beautifulsoup_crawler.py +++ b/docs/examples/code/beautifulsoup_crawler.py @@ -25,7 +25,7 @@ async def main() -> None: @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') - + await context.use_state({"asd":"sad"}) # Extract data from the page. data = { 'url': context.request.url, diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py index 30d6ebf401..23888b77e0 100644 --- a/src/crawlee/_types.py +++ b/src/crawlee/_types.py @@ -450,3 +450,7 @@ async def get_key_value_store( ) return self.key_value_store_changes[id, name] + + + async def use_state(self): + # TODO: Somehow make crawlers add to kvs through this. 
Currently it does it directly diff --git a/src/crawlee/crawlers/_adaptive_playwright/__init__.py b/src/crawlee/crawlers/_adaptive_playwright/__init__.py index e69de29bb2..2634289eac 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/__init__.py +++ b/src/crawlee/crawlers/_adaptive_playwright/__init__.py @@ -0,0 +1,13 @@ +from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler import AdaptivePlaywrightCrawler +from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import \ + AdaptivePlaywrightCrawlingContext + +__all__ = [ + 'AdaptivePlaywrightCrawler', + 'AdaptivePlaywrightCrawlingContext', + 'HttpCrawlerOptions', + 'ParsedHttpCrawlingContext', +] + + + diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 5bdb43b05f..5ab1733288 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -89,7 +89,13 @@ class _PlaywrightCrawlerAdditionalOptions(TypedDict): class AdaptivePlaywrightCrawler(BasicCrawler[AdaptivePlaywrightCrawlingContext]): - """Adaptive crawler that uses both BeautifulSoup crawler and PlaywrightCrawler.""" + """Adaptive crawler that uses both `BeautifulSoupCrawler` and `PlaywrightCrawler`. + + It tries to detect whether it is sufficient to crawl with `BeautifulSoupCrawler` (which is faster) or if + `PlaywrightCrawler` should be used (in case `BeautifulSoupCrawler` did not work as expected for specific url.). + + # TODO: Add example + """ def __init__(self, rendering_type_predictor: RenderingTypePredictor | None = None, @@ -97,9 +103,21 @@ def __init__(self, result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None, beautifulsoup_crawler_kwargs: _BeautifulsoupCrawlerAdditionalOptions | None = None, playwright_crawler_args: _PlaywrightCrawlerAdditionalOptions | None = None, - request_handler: Callable[[AdaptivePlaywrightCrawlingContext], Awaitable[None]] | None = None, - _context_pipeline: ContextPipeline[AdaptivePlaywrightCrawlingContext] | None = None, **kwargs: Unpack[_BasicCrawlerOptions]) -> None: + """A default constructor. + + Args: + rendering_type_predictor: Object that implements RenderingTypePredictor and is capable of predicting which + rendering method should be used. If None, then `DefaultRenderingTypePredictor` is used. + result_checker: Function that evaluates whether crawling result is valid or not. + result_comparator: Function that compares two crawling results and decides whether they are equivalent. + beautifulsoup_crawler_kwargs: BeautifulsoupCrawler only kwargs that are passed to the sub crawler. + playwright_crawler_args: PlaywrightCrawler only kwargs that are passed to the sub crawler. + kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`. + """ + + + # Some sub crawler kwargs are internally modified. Prepare copies. bs_kwargs = deepcopy(kwargs) pw_kwargs = deepcopy(kwargs) @@ -140,8 +158,8 @@ def __init__(self, pw_kwargs['_logger'] = pw_logger # Each sub crawler will use own statistics. 
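+ # Separate Statistics instances presumably keep the sub crawler bookkeeping out
+ # of the adaptive crawler's own counters; pointing periodic_message_logger at
+ # the ERROR-level sub crawler loggers set up above also keeps their periodic
+ # reports out of the main log.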
- bs_kwargs['statistics'] = Statistics() - pw_kwargs['statistics'] = Statistics() + bs_kwargs['statistics'] = Statistics(periodic_message_logger=bs_logger) + pw_kwargs['statistics'] = Statistics(periodic_message_logger=pw_logger) self.beautifulsoup_crawler = BeautifulSoupCrawler(**beautifulsoup_crawler_kwargs, **bs_kwargs) self.playwright_crawler = PlaywrightCrawler(**playwright_crawler_args, **pw_kwargs) @@ -160,7 +178,7 @@ async def request_handler_playwright(context: PlaywrightCrawlingContext) -> None await self.router(adaptive_crawling_context) - super().__init__(request_handler=request_handler, _context_pipeline=_context_pipeline, **kwargs) + super().__init__(**kwargs) async def run( self, @@ -168,6 +186,13 @@ async def run( *, purge_request_queue: bool = True, ) -> FinalStatistics: + """Run the crawler until all requests are processed. + + Args: + requests: The requests to be enqueued before the crawler starts. + purge_request_queue: If this is `True` and the crawler is not being run for the first time, the default + request queue will be purged. + """ # TODO: Create something more robust that does not leak implementation so much async with (self.beautifulsoup_crawler.statistics, self.playwright_crawler.statistics, @@ -176,8 +201,20 @@ async def run( # Can't use override as mypy does not like it for double underscore private method. async def _BasicCrawler__run_request_handler(self, context: BasicCrawlingContext) -> None: # noqa: N802 + """Overrided BasicCrawler method that delegates request processing to sub crawlers. + + To decide which sub crawler should process the request it runs `rendering_type_predictor`. + To check if results are valid it uses `result_checker`. + To compare results of both sub crawlers it uses `result_comparator`. + + Reference implementation: https://github.com/apify/crawlee/blob/master/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts + """ async def _run_subcrawler(crawler: BeautifulSoupCrawler | PlaywrightCrawler, use_state: dict | None = None) -> SubCrawlerRun: + """Helper closure that creates new `RequestHandlerRunResult` and delegates request handling to sub crawler. + + Produces `SubCrawlerRun` that either contains filled `RequestHandlerRunResult` or exception. + """ try: crawl_result = await crawler.crawl_one( context = context, @@ -244,8 +281,13 @@ async def commit_result(self, result: RequestHandlerRunResult, context: BasicCra asyncio.create_task(context.push_data(**kwargs)) for kwargs in result.push_data_calls]) result_tasks.extend([ asyncio.create_task(context.add_requests(**kwargs)) for kwargs in result.add_requests_calls]) + + # What to do with KV changes???? 
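+ # The updates recorded in result.key_value_store_changes still have to be
+ # replayed into the real stores; the _commit_key_value_store_changes helper
+ # called just below is one way to do that.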
await asyncio.gather(*result_tasks) + # Optimize if needed + await self._commit_key_value_store_changes(result) + def pre_navigation_hook(self, hook: Callable[[Any], Awaitable[None]]) -> None: """Pre navigation hooks for adaptive crawler are delegated to sub crawlers.""" diff --git a/src/crawlee/crawlers/_adaptive_playwright/_example.py b/src/crawlee/crawlers/_adaptive_playwright/_example.py index 5f91a4f6df..61f675d9b3 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_example.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_example.py @@ -14,15 +14,20 @@ async def main() ->None: # TODO: remove in review top_logger = getLogger(__name__) top_logger.setLevel(logging.DEBUG) + i=0 - - crawler = AdaptivePlaywrightCrawler(max_requests_per_crawl=10, _logger=top_logger) + crawler = AdaptivePlaywrightCrawler(max_requests_per_crawl=10, + _logger=top_logger, + playwright_crawler_args={"headless":False}) @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: + nonlocal i + i = i+1 context.log.info(f'Processing with Top adaptive_crawler: {context.request.url} ...') await context.enqueue_links() await context.push_data({'Top crwaler Url': context.request.url}) + await context.use_state({"bla":i}) @crawler.pre_navigation_hook_bs async def bs_hook(context: BasicCrawlingContext) -> None: diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 9b09b15e54..d48f6a13dc 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -946,11 +946,18 @@ async def _commit_request_handler_result( for push_data_call in result.push_data_calls: await self._push_data(**push_data_call) + await self._commit_key_value_store_changes(result) + + + + async def _commit_key_value_store_changes(self, result: RequestHandlerRunResult) -> None: for (id, name), changes in result.key_value_store_changes.items(): store = await self.get_key_value_store(id=id, name=name) for key, value in changes.updates.items(): await store.set_value(key, value.content, value.content_type) + + async def __is_finished_function(self) -> bool: self._stop_if_max_requests_count_exceeded() if self._unexpected_stop: @@ -1140,7 +1147,7 @@ async def get_input_state(default_value: dict[str, JsonSerializable] | None = No return use_state use_state_function = get_input_state else: - use_state_function = context.use_state + use_state_function = result.use_state context_linked_to_result = BasicCrawlingContext( request=context.request, From d345259148da43dc0de6b458991a43fe56e015e8 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 3 Jan 2025 11:02:48 +0100 Subject: [PATCH 12/79] use_state through RequestHandlerRunResult --- docs/examples/code/beautifulsoup_crawler.py | 2 +- src/crawlee/_types.py | 36 +++++++++++++- .../crawlers/_adaptive_playwright/__init__.py | 5 +- .../_adaptive_playwright_crawler.py | 7 +-- .../crawlers/_adaptive_playwright/_example.py | 4 +- src/crawlee/crawlers/_basic/_basic_crawler.py | 9 +--- src/crawlee/storages/_key_value_store.py | 31 ------------ tests/unit/storages/test_key_value_store.py | 47 +------------------ 8 files changed, 45 insertions(+), 96 deletions(-) diff --git a/docs/examples/code/beautifulsoup_crawler.py b/docs/examples/code/beautifulsoup_crawler.py index 0df6182677..dbbc410a4d 100644 --- a/docs/examples/code/beautifulsoup_crawler.py +++ b/docs/examples/code/beautifulsoup_crawler.py @@ -25,7 +25,7 @@ async def main() -> None: 
@crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') - await context.use_state({"asd":"sad"}) + await context.use_state({'asd':'sad'}) # Extract data from the page. data = { 'url': context.request.url, diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py index 23888b77e0..6f27b237f1 100644 --- a/src/crawlee/_types.py +++ b/src/crawlee/_types.py @@ -1,6 +1,7 @@ from __future__ import annotations from collections.abc import Iterator, Mapping +from copy import deepcopy from dataclasses import dataclass from enum import Enum from typing import TYPE_CHECKING, Annotated, Any, Literal, Optional, Protocol, TypeVar, Union, cast, overload @@ -402,12 +403,17 @@ def __call__( class RequestHandlerRunResult: """Record of calls to storage-related context helpers.""" + CRAWLEE_STATE_KEY = 'CRAWLEE_STATE' def __init__(self, *, key_value_store_getter: GetKeyValueStoreFunction) -> None: self._key_value_store_getter = key_value_store_getter self.add_requests_calls = list[AddRequestsKwargs]() self.push_data_calls = list[PushDataFunctionCall]() self.key_value_store_changes = dict[tuple[Optional[str], Optional[str]], KeyValueStoreChangeRecords]() + # This is handle to dict available to user. If it gets mutated, it needs to be reflected in changes. + self._use_state_user: None | dict[str, JsonSerializable] = None + # Last known use_state by RequestHandlerRunResult. Used for mutation detection by user. + self._last_known_use_state: None | dict[str, JsonSerializable] = None async def add_requests( self, @@ -452,5 +458,31 @@ async def get_key_value_store( return self.key_value_store_changes[id, name] - async def use_state(self): - # TODO: Somehow make crawlers add to kvs through this. Currently it does it directly + async def use_state(self, default_value: dict[str, JsonSerializable] | None = None) -> dict[str, JsonSerializable]: + # Find if the value is already present i + _default: dict[str, JsonSerializable] = default_value or {} + default_kvs_changes = await self.get_key_value_store() + + use_state: dict[str, JsonSerializable] = await default_kvs_changes.get_value(self.CRAWLEE_STATE_KEY, _default) + + if use_state is _default: + # Set default value if there is no value in change records or actual kvs. + await default_kvs_changes.set_value(self.CRAWLEE_STATE_KEY, _default) + + # This will be same dict that is available to the user and can be mutated at any point. + self._use_state_user = use_state + # This will not be available to user and should not be change. 
+ self._last_known_use_state = deepcopy(self._use_state_user) + + return use_state + + async def update_mutated_use_state(self) -> None: + """Update use_state if it was mutated by the user.""" + if self._use_state_user != self._last_known_use_state: + default_kvs_changes = await self.get_key_value_store() + await default_kvs_changes.set_value(self.CRAWLEE_STATE_KEY, self._use_state_user) + + + + + diff --git a/src/crawlee/crawlers/_adaptive_playwright/__init__.py b/src/crawlee/crawlers/_adaptive_playwright/__init__.py index 2634289eac..daa9b14cb6 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/__init__.py +++ b/src/crawlee/crawlers/_adaptive_playwright/__init__.py @@ -1,6 +1,7 @@ from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler import AdaptivePlaywrightCrawler -from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import \ - AdaptivePlaywrightCrawlingContext +from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import ( + AdaptivePlaywrightCrawlingContext, +) __all__ = [ 'AdaptivePlaywrightCrawler', diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 5ab1733288..813f6c841c 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -16,7 +16,6 @@ BeautifulSoupCrawler, BeautifulSoupCrawlingContext, BeautifulSoupParserType, - ContextPipeline, PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext, @@ -115,9 +114,6 @@ def __init__(self, playwright_crawler_args: PlaywrightCrawler only kwargs that are passed to the sub crawler. kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`. """ - - - # Some sub crawler kwargs are internally modified. Prepare copies. bs_kwargs = deepcopy(kwargs) pw_kwargs = deepcopy(kwargs) @@ -193,7 +189,6 @@ async def run( purge_request_queue: If this is `True` and the crawler is not being run for the first time, the default request queue will be purged. 
""" - # TODO: Create something more robust that does not leak implementation so much async with (self.beautifulsoup_crawler.statistics, self.playwright_crawler.statistics, self.playwright_crawler._additional_context_managers[0]): @@ -249,6 +244,8 @@ async def _run_subcrawler(crawler: BeautifulSoupCrawler | PlaywrightCrawler, context.log.debug(f'Running browser request handler for {context.request.url}') + + # This might not be needed if kvs access is properly routed through results and we commit PW result in the end of the function kvs = await context.get_key_value_store() default_value =dict[str, JsonSerializable]() old_state: dict[str, JsonSerializable] = await kvs.get_value(BasicCrawler.CRAWLEE_STATE_KEY, default_value) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_example.py b/src/crawlee/crawlers/_adaptive_playwright/_example.py index 61f675d9b3..3a0908ea4b 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_example.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_example.py @@ -18,7 +18,7 @@ async def main() ->None: crawler = AdaptivePlaywrightCrawler(max_requests_per_crawl=10, _logger=top_logger, - playwright_crawler_args={"headless":False}) + playwright_crawler_args={'headless':False}) @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: @@ -27,7 +27,7 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: context.log.info(f'Processing with Top adaptive_crawler: {context.request.url} ...') await context.enqueue_links() await context.push_data({'Top crwaler Url': context.request.url}) - await context.use_state({"bla":i}) + await context.use_state({'bla':i}) @crawler.pre_navigation_hook_bs async def bs_hook(context: BasicCrawlingContext) -> None: diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index d48f6a13dc..7ca35fe928 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -574,12 +574,6 @@ async def add_requests( wait_for_all_requests_to_be_added_timeout=wait_for_all_requests_to_be_added_timeout, ) - async def _use_state( - self, default_value: dict[str, JsonSerializable] | None = None - ) -> dict[str, JsonSerializable]: - store = await self.get_key_value_store() - return await store.get_auto_saved_value(BasicCrawler.CRAWLEE_STATE_KEY, default_value) - async def _save_crawler_state(self) -> None: store = await self.get_key_value_store() await store.persist_autosaved_values() @@ -951,6 +945,7 @@ async def _commit_request_handler_result( async def _commit_key_value_store_changes(self, result: RequestHandlerRunResult) -> None: + await result.update_mutated_use_state() for (id, name), changes in result.key_value_store_changes.items(): store = await self.get_key_value_store(id=id, name=name) for key, value in changes.updates.items(): @@ -1011,7 +1006,7 @@ async def __run_task_function(self) -> None: add_requests=result.add_requests, push_data=result.push_data, get_key_value_store=result.get_key_value_store, - use_state=self._use_state, + use_state=result.use_state, log=self._logger, ) diff --git a/src/crawlee/storages/_key_value_store.py b/src/crawlee/storages/_key_value_store.py index 427032940d..be21ece989 100644 --- a/src/crawlee/storages/_key_value_store.py +++ b/src/crawlee/storages/_key_value_store.py @@ -182,37 +182,6 @@ async def get_public_url(self, key: str) -> str: """ return await self._resource_client.get_public_url(key) - async def get_auto_saved_value( - 
self, - key: str, - default_value: dict[str, JsonSerializable] | None = None, - ) -> dict[str, JsonSerializable]: - """Gets a value from KVS that will be automatically saved on changes. - - Args: - key: Key of the record, to store the value. - default_value: Value to be used if the record does not exist yet. Should be a dictionary. - - Returns: - Returns the value of the key. - """ - default_value = {} if default_value is None else default_value - - if key in self._cache: - return self._cache[key] - - value = await self.get_value(key, default_value) - - if not isinstance(value, dict): - raise TypeError( - f'Expected dictionary for persist state value at key "{key}, but got {type(value).__name__}' - ) - - self._cache[key] = value - - self._ensure_persist_event() - - return value @property def _cache(self) -> dict[str, dict[str, JsonSerializable]]: diff --git a/tests/unit/storages/test_key_value_store.py b/tests/unit/storages/test_key_value_store.py index b1ba14088b..b57b1ca98e 100644 --- a/tests/unit/storages/test_key_value_store.py +++ b/tests/unit/storages/test_key_value_store.py @@ -1,7 +1,7 @@ from __future__ import annotations import asyncio -from datetime import datetime, timedelta, timezone +from datetime import timedelta from typing import TYPE_CHECKING from unittest.mock import patch from urllib.parse import urlparse @@ -14,7 +14,6 @@ if TYPE_CHECKING: from collections.abc import AsyncGenerator - from crawlee._types import JsonSerializable @pytest.fixture @@ -134,47 +133,3 @@ async def test_get_public_url(key_value_store: KeyValueStore) -> None: with open(path) as f: # noqa: ASYNC230 content = await asyncio.to_thread(f.read) assert content == 'static' - - -async def test_get_auto_saved_value_default_value(key_value_store: KeyValueStore) -> None: - default_value: dict[str, JsonSerializable] = {'hello': 'world'} - value = await key_value_store.get_auto_saved_value('state', default_value) - assert value == default_value - - -async def test_get_auto_saved_value_cache_value(key_value_store: KeyValueStore) -> None: - default_value: dict[str, JsonSerializable] = {'hello': 'world'} - key_name = 'state' - - value = await key_value_store.get_auto_saved_value(key_name, default_value) - value['hello'] = 'new_world' - value_one = await key_value_store.get_auto_saved_value(key_name) - assert value_one == {'hello': 'new_world'} - - value_one['hello'] = ['new_world'] - value_two = await key_value_store.get_auto_saved_value(key_name) - assert value_two == {'hello': ['new_world']} - - -async def test_get_auto_saved_value_auto_save(key_value_store: KeyValueStore, mock_event_manager: EventManager) -> None: # noqa: ARG001 - # This is not a realtime system and timing constrains can be hard to enforce. - # For the test to avoid flakiness it needs some time tolerance. 
- autosave_deadline_time = 1 - autosave_check_period = 0.01 - - async def autosaved_within_deadline(key: str, expected_value: dict[str, str]) -> bool: - """Check if the `key_value_store` of `key` has expected value within `autosave_deadline_time` seconds.""" - deadline = datetime.now(tz=timezone.utc) + timedelta(seconds=autosave_deadline_time) - while datetime.now(tz=timezone.utc) < deadline: - await asyncio.sleep(autosave_check_period) - if await key_value_store.get_value(key) == expected_value: - return True - return False - - default_value: dict[str, JsonSerializable] = {'hello': 'world'} - key_name = 'state' - value = await key_value_store.get_auto_saved_value(key_name, default_value) - assert await autosaved_within_deadline(key=key_name, expected_value={'hello': 'world'}) - - value['hello'] = 'new_world' - assert await autosaved_within_deadline(key=key_name, expected_value={'hello': 'new_world'}) From bfa9290f552db9c58daffe85840b7f59e56d1a7f Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 6 Jan 2025 09:16:52 +0100 Subject: [PATCH 13/79] Revert "use_state through RequestHandlerRunResult" This reverts commit d345259148da43dc0de6b458991a43fe56e015e8. --- docs/examples/code/beautifulsoup_crawler.py | 2 +- src/crawlee/_types.py | 36 +------------- .../crawlers/_adaptive_playwright/__init__.py | 5 +- .../_adaptive_playwright_crawler.py | 7 ++- .../crawlers/_adaptive_playwright/_example.py | 4 +- src/crawlee/crawlers/_basic/_basic_crawler.py | 9 +++- src/crawlee/storages/_key_value_store.py | 31 ++++++++++++ tests/unit/storages/test_key_value_store.py | 47 ++++++++++++++++++- 8 files changed, 96 insertions(+), 45 deletions(-) diff --git a/docs/examples/code/beautifulsoup_crawler.py b/docs/examples/code/beautifulsoup_crawler.py index dbbc410a4d..0df6182677 100644 --- a/docs/examples/code/beautifulsoup_crawler.py +++ b/docs/examples/code/beautifulsoup_crawler.py @@ -25,7 +25,7 @@ async def main() -> None: @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') - await context.use_state({'asd':'sad'}) + await context.use_state({"asd":"sad"}) # Extract data from the page. data = { 'url': context.request.url, diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py index 6f27b237f1..23888b77e0 100644 --- a/src/crawlee/_types.py +++ b/src/crawlee/_types.py @@ -1,7 +1,6 @@ from __future__ import annotations from collections.abc import Iterator, Mapping -from copy import deepcopy from dataclasses import dataclass from enum import Enum from typing import TYPE_CHECKING, Annotated, Any, Literal, Optional, Protocol, TypeVar, Union, cast, overload @@ -403,17 +402,12 @@ def __call__( class RequestHandlerRunResult: """Record of calls to storage-related context helpers.""" - CRAWLEE_STATE_KEY = 'CRAWLEE_STATE' def __init__(self, *, key_value_store_getter: GetKeyValueStoreFunction) -> None: self._key_value_store_getter = key_value_store_getter self.add_requests_calls = list[AddRequestsKwargs]() self.push_data_calls = list[PushDataFunctionCall]() self.key_value_store_changes = dict[tuple[Optional[str], Optional[str]], KeyValueStoreChangeRecords]() - # This is handle to dict available to user. If it gets mutated, it needs to be reflected in changes. - self._use_state_user: None | dict[str, JsonSerializable] = None - # Last known use_state by RequestHandlerRunResult. Used for mutation detection by user. 
- self._last_known_use_state: None | dict[str, JsonSerializable] = None async def add_requests( self, @@ -458,31 +452,5 @@ async def get_key_value_store( return self.key_value_store_changes[id, name] - async def use_state(self, default_value: dict[str, JsonSerializable] | None = None) -> dict[str, JsonSerializable]: - # Find if the value is already present i - _default: dict[str, JsonSerializable] = default_value or {} - default_kvs_changes = await self.get_key_value_store() - - use_state: dict[str, JsonSerializable] = await default_kvs_changes.get_value(self.CRAWLEE_STATE_KEY, _default) - - if use_state is _default: - # Set default value if there is no value in change records or actual kvs. - await default_kvs_changes.set_value(self.CRAWLEE_STATE_KEY, _default) - - # This will be same dict that is available to the user and can be mutated at any point. - self._use_state_user = use_state - # This will not be available to user and should not be change. - self._last_known_use_state = deepcopy(self._use_state_user) - - return use_state - - async def update_mutated_use_state(self) -> None: - """Update use_state if it was mutated by the user.""" - if self._use_state_user != self._last_known_use_state: - default_kvs_changes = await self.get_key_value_store() - await default_kvs_changes.set_value(self.CRAWLEE_STATE_KEY, self._use_state_user) - - - - - + async def use_state(self): + # TODO: Somehow make crawlers add to kvs through this. Currently it does it directly diff --git a/src/crawlee/crawlers/_adaptive_playwright/__init__.py b/src/crawlee/crawlers/_adaptive_playwright/__init__.py index daa9b14cb6..2634289eac 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/__init__.py +++ b/src/crawlee/crawlers/_adaptive_playwright/__init__.py @@ -1,7 +1,6 @@ from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler import AdaptivePlaywrightCrawler -from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import ( - AdaptivePlaywrightCrawlingContext, -) +from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import \ + AdaptivePlaywrightCrawlingContext __all__ = [ 'AdaptivePlaywrightCrawler', diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 813f6c841c..5ab1733288 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -16,6 +16,7 @@ BeautifulSoupCrawler, BeautifulSoupCrawlingContext, BeautifulSoupParserType, + ContextPipeline, PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext, @@ -114,6 +115,9 @@ def __init__(self, playwright_crawler_args: PlaywrightCrawler only kwargs that are passed to the sub crawler. kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`. """ + + + # Some sub crawler kwargs are internally modified. Prepare copies. bs_kwargs = deepcopy(kwargs) pw_kwargs = deepcopy(kwargs) @@ -189,6 +193,7 @@ async def run( purge_request_queue: If this is `True` and the crawler is not being run for the first time, the default request queue will be purged. 
""" + # TODO: Create something more robust that does not leak implementation so much async with (self.beautifulsoup_crawler.statistics, self.playwright_crawler.statistics, self.playwright_crawler._additional_context_managers[0]): @@ -244,8 +249,6 @@ async def _run_subcrawler(crawler: BeautifulSoupCrawler | PlaywrightCrawler, context.log.debug(f'Running browser request handler for {context.request.url}') - - # This might not be needed if kvs access is properly routed through results and we commit PW result in the end of the function kvs = await context.get_key_value_store() default_value =dict[str, JsonSerializable]() old_state: dict[str, JsonSerializable] = await kvs.get_value(BasicCrawler.CRAWLEE_STATE_KEY, default_value) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_example.py b/src/crawlee/crawlers/_adaptive_playwright/_example.py index 3a0908ea4b..61f675d9b3 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_example.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_example.py @@ -18,7 +18,7 @@ async def main() ->None: crawler = AdaptivePlaywrightCrawler(max_requests_per_crawl=10, _logger=top_logger, - playwright_crawler_args={'headless':False}) + playwright_crawler_args={"headless":False}) @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: @@ -27,7 +27,7 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: context.log.info(f'Processing with Top adaptive_crawler: {context.request.url} ...') await context.enqueue_links() await context.push_data({'Top crwaler Url': context.request.url}) - await context.use_state({'bla':i}) + await context.use_state({"bla":i}) @crawler.pre_navigation_hook_bs async def bs_hook(context: BasicCrawlingContext) -> None: diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 7ca35fe928..d48f6a13dc 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -574,6 +574,12 @@ async def add_requests( wait_for_all_requests_to_be_added_timeout=wait_for_all_requests_to_be_added_timeout, ) + async def _use_state( + self, default_value: dict[str, JsonSerializable] | None = None + ) -> dict[str, JsonSerializable]: + store = await self.get_key_value_store() + return await store.get_auto_saved_value(BasicCrawler.CRAWLEE_STATE_KEY, default_value) + async def _save_crawler_state(self) -> None: store = await self.get_key_value_store() await store.persist_autosaved_values() @@ -945,7 +951,6 @@ async def _commit_request_handler_result( async def _commit_key_value_store_changes(self, result: RequestHandlerRunResult) -> None: - await result.update_mutated_use_state() for (id, name), changes in result.key_value_store_changes.items(): store = await self.get_key_value_store(id=id, name=name) for key, value in changes.updates.items(): @@ -1006,7 +1011,7 @@ async def __run_task_function(self) -> None: add_requests=result.add_requests, push_data=result.push_data, get_key_value_store=result.get_key_value_store, - use_state=result.use_state, + use_state=self._use_state, log=self._logger, ) diff --git a/src/crawlee/storages/_key_value_store.py b/src/crawlee/storages/_key_value_store.py index be21ece989..427032940d 100644 --- a/src/crawlee/storages/_key_value_store.py +++ b/src/crawlee/storages/_key_value_store.py @@ -182,6 +182,37 @@ async def get_public_url(self, key: str) -> str: """ return await self._resource_client.get_public_url(key) + async def get_auto_saved_value( + 
self, + key: str, + default_value: dict[str, JsonSerializable] | None = None, + ) -> dict[str, JsonSerializable]: + """Gets a value from KVS that will be automatically saved on changes. + + Args: + key: Key of the record, to store the value. + default_value: Value to be used if the record does not exist yet. Should be a dictionary. + + Returns: + Returns the value of the key. + """ + default_value = {} if default_value is None else default_value + + if key in self._cache: + return self._cache[key] + + value = await self.get_value(key, default_value) + + if not isinstance(value, dict): + raise TypeError( + f'Expected dictionary for persist state value at key "{key}, but got {type(value).__name__}' + ) + + self._cache[key] = value + + self._ensure_persist_event() + + return value @property def _cache(self) -> dict[str, dict[str, JsonSerializable]]: diff --git a/tests/unit/storages/test_key_value_store.py b/tests/unit/storages/test_key_value_store.py index b57b1ca98e..b1ba14088b 100644 --- a/tests/unit/storages/test_key_value_store.py +++ b/tests/unit/storages/test_key_value_store.py @@ -1,7 +1,7 @@ from __future__ import annotations import asyncio -from datetime import timedelta +from datetime import datetime, timedelta, timezone from typing import TYPE_CHECKING from unittest.mock import patch from urllib.parse import urlparse @@ -14,6 +14,7 @@ if TYPE_CHECKING: from collections.abc import AsyncGenerator + from crawlee._types import JsonSerializable @pytest.fixture @@ -133,3 +134,47 @@ async def test_get_public_url(key_value_store: KeyValueStore) -> None: with open(path) as f: # noqa: ASYNC230 content = await asyncio.to_thread(f.read) assert content == 'static' + + +async def test_get_auto_saved_value_default_value(key_value_store: KeyValueStore) -> None: + default_value: dict[str, JsonSerializable] = {'hello': 'world'} + value = await key_value_store.get_auto_saved_value('state', default_value) + assert value == default_value + + +async def test_get_auto_saved_value_cache_value(key_value_store: KeyValueStore) -> None: + default_value: dict[str, JsonSerializable] = {'hello': 'world'} + key_name = 'state' + + value = await key_value_store.get_auto_saved_value(key_name, default_value) + value['hello'] = 'new_world' + value_one = await key_value_store.get_auto_saved_value(key_name) + assert value_one == {'hello': 'new_world'} + + value_one['hello'] = ['new_world'] + value_two = await key_value_store.get_auto_saved_value(key_name) + assert value_two == {'hello': ['new_world']} + + +async def test_get_auto_saved_value_auto_save(key_value_store: KeyValueStore, mock_event_manager: EventManager) -> None: # noqa: ARG001 + # This is not a realtime system and timing constrains can be hard to enforce. + # For the test to avoid flakiness it needs some time tolerance. 
+ autosave_deadline_time = 1 + autosave_check_period = 0.01 + + async def autosaved_within_deadline(key: str, expected_value: dict[str, str]) -> bool: + """Check if the `key_value_store` of `key` has expected value within `autosave_deadline_time` seconds.""" + deadline = datetime.now(tz=timezone.utc) + timedelta(seconds=autosave_deadline_time) + while datetime.now(tz=timezone.utc) < deadline: + await asyncio.sleep(autosave_check_period) + if await key_value_store.get_value(key) == expected_value: + return True + return False + + default_value: dict[str, JsonSerializable] = {'hello': 'world'} + key_name = 'state' + value = await key_value_store.get_auto_saved_value(key_name, default_value) + assert await autosaved_within_deadline(key=key_name, expected_value={'hello': 'world'}) + + value['hello'] = 'new_world' + assert await autosaved_within_deadline(key=key_name, expected_value={'hello': 'new_world'}) From 63f278a01345adff6fe6d48714d42cc243cdd374 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 6 Jan 2025 10:34:14 +0100 Subject: [PATCH 14/79] Add basic delegation test. --- docs/examples/code/beautifulsoup_crawler.py | 2 +- src/crawlee/_types.py | 4 - .../crawlers/_adaptive_playwright/__init__.py | 5 +- .../_adaptive_playwright_crawler.py | 5 - .../_adaptive_playwright_crawling_context.py | 11 ++- .../crawlers/_adaptive_playwright/_example.py | 4 +- src/crawlee/crawlers/_basic/_basic_crawler.py | 2 +- .../test_adaptive_playwright_crawler.py | 94 +++++++++++++++++++ 8 files changed, 108 insertions(+), 19 deletions(-) create mode 100644 tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py diff --git a/docs/examples/code/beautifulsoup_crawler.py b/docs/examples/code/beautifulsoup_crawler.py index 0df6182677..dbbc410a4d 100644 --- a/docs/examples/code/beautifulsoup_crawler.py +++ b/docs/examples/code/beautifulsoup_crawler.py @@ -25,7 +25,7 @@ async def main() -> None: @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') - await context.use_state({"asd":"sad"}) + await context.use_state({'asd':'sad'}) # Extract data from the page. data = { 'url': context.request.url, diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py index 23888b77e0..30d6ebf401 100644 --- a/src/crawlee/_types.py +++ b/src/crawlee/_types.py @@ -450,7 +450,3 @@ async def get_key_value_store( ) return self.key_value_store_changes[id, name] - - - async def use_state(self): - # TODO: Somehow make crawlers add to kvs through this. 
Currently it does it directly diff --git a/src/crawlee/crawlers/_adaptive_playwright/__init__.py b/src/crawlee/crawlers/_adaptive_playwright/__init__.py index 2634289eac..daa9b14cb6 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/__init__.py +++ b/src/crawlee/crawlers/_adaptive_playwright/__init__.py @@ -1,6 +1,7 @@ from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler import AdaptivePlaywrightCrawler -from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import \ - AdaptivePlaywrightCrawlingContext +from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import ( + AdaptivePlaywrightCrawlingContext, +) __all__ = [ 'AdaptivePlaywrightCrawler', diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 5ab1733288..881fa0ea63 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -16,7 +16,6 @@ BeautifulSoupCrawler, BeautifulSoupCrawlingContext, BeautifulSoupParserType, - ContextPipeline, PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext, @@ -115,9 +114,6 @@ def __init__(self, playwright_crawler_args: PlaywrightCrawler only kwargs that are passed to the sub crawler. kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`. """ - - - # Some sub crawler kwargs are internally modified. Prepare copies. bs_kwargs = deepcopy(kwargs) pw_kwargs = deepcopy(kwargs) @@ -193,7 +189,6 @@ async def run( purge_request_queue: If this is `True` and the crawler is not being run for the first time, the default request queue will be purged. 
""" - # TODO: Create something more robust that does not leak implementation so much async with (self.beautifulsoup_crawler.statistics, self.playwright_crawler.statistics, self.playwright_crawler._additional_context_managers[0]): diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py index 6079969a34..f2a71d8f62 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py @@ -15,6 +15,10 @@ from playwright.async_api import Page, Response from typing_extensions import Self +class AdaptiveContextError(RuntimeError): + pass + + @dataclass(frozen=True) @docs_group('Data structures') @@ -22,24 +26,23 @@ class AdaptivePlaywrightCrawlingContext(BeautifulSoupCrawlingContext): _response: Response | None = None _infinite_scroll: Callable[[], Awaitable[None]] | None = None _page : Page | None = None - # TODO: UseStateFunction @property def page(self) -> Page: if not self._page: - raise RuntimeError('Page was not crawled with PlaywrightCrawler') + raise AdaptiveContextError('Page was not crawled with PlaywrightCrawler.') return self._page @property def infinite_scroll(self) -> Callable[[], Awaitable[None]]: if not self._infinite_scroll: - raise RuntimeError('Page was not crawled with PlaywrightCrawler') + raise AdaptiveContextError('Page was not crawled with PlaywrightCrawler.') return self._infinite_scroll @property def response(self) -> Response: if not self._response: - raise RuntimeError('Page was not crawled with PlaywrightCrawler') + raise AdaptiveContextError('Page was not crawled with PlaywrightCrawler.') return self._response @classmethod diff --git a/src/crawlee/crawlers/_adaptive_playwright/_example.py b/src/crawlee/crawlers/_adaptive_playwright/_example.py index 61f675d9b3..3a0908ea4b 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_example.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_example.py @@ -18,7 +18,7 @@ async def main() ->None: crawler = AdaptivePlaywrightCrawler(max_requests_per_crawl=10, _logger=top_logger, - playwright_crawler_args={"headless":False}) + playwright_crawler_args={'headless':False}) @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: @@ -27,7 +27,7 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: context.log.info(f'Processing with Top adaptive_crawler: {context.request.url} ...') await context.enqueue_links() await context.push_data({'Top crwaler Url': context.request.url}) - await context.use_state({"bla":i}) + await context.use_state({'bla':i}) @crawler.pre_navigation_hook_bs async def bs_hook(context: BasicCrawlingContext) -> None: diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index d48f6a13dc..1c5fe450bc 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -1147,7 +1147,7 @@ async def get_input_state(default_value: dict[str, JsonSerializable] | None = No return use_state use_state_function = get_input_state else: - use_state_function = result.use_state + use_state_function = context.use_state context_linked_to_result = BasicCrawlingContext( request=context.request, diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py 
b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py new file mode 100644 index 0000000000..27b42e21f3 --- /dev/null +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -0,0 +1,94 @@ +from __future__ import annotations + +from itertools import cycle +from typing import TYPE_CHECKING + +import pytest +from typing_extensions import override + +from crawlee.crawlers._adaptive_playwright import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext +from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import AdaptiveContextError +from crawlee.crawlers._adaptive_playwright._rendering_type_predictor import ( + RenderingType, + RenderingTypePrediction, + RenderingTypePredictor, +) + +if TYPE_CHECKING: + from collections.abc import Iterator + + from crawlee._types import BasicCrawlingContext + from crawlee.crawlers import PlaywrightPreNavCrawlingContext + + +class _SimpleRenderingTypePredictor(RenderingTypePredictor): + """Simplified predictor for tests.""" + + def __init__(self, rendering_types: Iterator[RenderingType], + detection_probability_recommendation: Iterator[int]) -> None: + self._rendering_types = rendering_types + self._detection_probability_recommendation = detection_probability_recommendation + + @override + def predict(self, url: str, label: str | None) -> RenderingTypePrediction: + return RenderingTypePrediction(next(self._rendering_types), next(self._detection_probability_recommendation)) + + @override + def store_result(self, url: str, label: str | None, crawl_type: RenderingType) -> None: + pass + + + +@pytest.mark.parametrize(('expected_pw_count', 'expected_bs_count', 'rendering_types'), [ + (0,2, cycle(['static'])), + (2,0, cycle(['client only'])), + (1,1, cycle(['static','client only'])), +]) +async def test_adaptive_crawling(expected_pw_count: int, expected_bs_count: int, + rendering_types: Iterator[RenderingType]) -> None: + """Tests correct routing to pre-nav hooks and correct handling through proper handler.""" + static_only_predictor = _SimpleRenderingTypePredictor( + rendering_types = rendering_types, + detection_probability_recommendation=cycle([0]) + ) + + + crawler = AdaptivePlaywrightCrawler(rendering_type_predictor=static_only_predictor) + + pw_handler_count = 0 + bs_handler_count = 0 + + pw_hook_count = 0 + bs_hook_count = 0 + + @crawler.router.default_handler + async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: + nonlocal pw_handler_count + nonlocal bs_handler_count + + try: + context.page # noqa:B018 Intentionally "useless expression". Can trigger exception. 
+ pw_handler_count += 1 + except AdaptiveContextError: + bs_handler_count += 1 + + + @crawler.pre_navigation_hook_bs + async def bs_hook(context: BasicCrawlingContext) -> None: # noqa:ARG001 # Intentionally unused arg + nonlocal bs_hook_count + bs_hook_count += 1 + + @crawler.pre_navigation_hook_pw + async def pw_hook(context: PlaywrightPreNavCrawlingContext) -> None: # noqa:ARG001 # Intentionally unused arg + nonlocal pw_hook_count + pw_hook_count += 1 + + + await crawler.run(['https://crawlee.dev/', + 'https://crawlee.dev/docs/quick-start']) + + assert pw_handler_count == expected_pw_count + assert pw_hook_count == expected_pw_count + + assert bs_handler_count == expected_bs_count + assert bs_hook_count == expected_bs_count From e190788c9026e0c843603340a2d0b1097bfd96dc Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 6 Jan 2025 12:59:53 +0100 Subject: [PATCH 15/79] Add context test. --- .../crawlers/_adaptive_playwright/_example.py | 2 +- .../test_adaptive_playwright_crawler.py | 54 +++++++++++++++---- 2 files changed, 45 insertions(+), 11 deletions(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_example.py b/src/crawlee/crawlers/_adaptive_playwright/_example.py index 3a0908ea4b..a6f828dc80 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_example.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_example.py @@ -11,7 +11,7 @@ async def main() ->None: - # TODO: remove in review + # TODO: remove in review. Move this to documentation examples instead. top_logger = getLogger(__name__) top_logger.setLevel(logging.DEBUG) i=0 diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index 27b42e21f3..f83f29e640 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -6,6 +6,8 @@ import pytest from typing_extensions import override +from crawlee._types import BasicCrawlingContext +from crawlee.crawlers import PlaywrightPreNavCrawlingContext from crawlee.crawlers._adaptive_playwright import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import AdaptiveContextError from crawlee.crawlers._adaptive_playwright._rendering_type_predictor import ( @@ -17,8 +19,6 @@ if TYPE_CHECKING: from collections.abc import Iterator - from crawlee._types import BasicCrawlingContext - from crawlee.crawlers import PlaywrightPreNavCrawlingContext class _SimpleRenderingTypePredictor(RenderingTypePredictor): @@ -39,17 +39,22 @@ def store_result(self, url: str, label: str | None, crawl_type: RenderingType) - -@pytest.mark.parametrize(('expected_pw_count', 'expected_bs_count', 'rendering_types'), [ - (0,2, cycle(['static'])), - (2,0, cycle(['client only'])), - (1,1, cycle(['static','client only'])), +@pytest.mark.parametrize(('expected_pw_count', 'expected_bs_count', 'rendering_types', + 'detection_probability_recommendation'), [ + pytest.param(0,2, cycle(['static']), cycle([0]), id='Static only.'), + pytest.param(2,0, cycle(['client only']), cycle([0]), id='Client only.'), + pytest.param(1,1, cycle(['static','client only']), cycle([0]),id='Mixed.'), + pytest.param(2,2, cycle(['static','client only']), cycle([1]),id='Enforced rendering type detection.'), ]) async def test_adaptive_crawling(expected_pw_count: int, expected_bs_count: int, - rendering_types: 
Iterator[RenderingType]) -> None: + rendering_types: Iterator[RenderingType], + detection_probability_recommendation: Iterator[int]) -> None: """Tests correct routing to pre-nav hooks and correct handling through proper handler.""" + requests = ['https://crawlee.dev/', 'https://crawlee.dev/docs/quick-start'] + static_only_predictor = _SimpleRenderingTypePredictor( rendering_types = rendering_types, - detection_probability_recommendation=cycle([0]) + detection_probability_recommendation=detection_probability_recommendation ) @@ -67,6 +72,7 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: nonlocal bs_handler_count try: + # page is available only if it was crawled by PlaywrightCrawler. context.page # noqa:B018 Intentionally "useless expression". Can trigger exception. pw_handler_count += 1 except AdaptiveContextError: @@ -84,11 +90,39 @@ async def pw_hook(context: PlaywrightPreNavCrawlingContext) -> None: # noqa:ARG pw_hook_count += 1 - await crawler.run(['https://crawlee.dev/', - 'https://crawlee.dev/docs/quick-start']) + await crawler.run(requests) assert pw_handler_count == expected_pw_count assert pw_hook_count == expected_pw_count assert bs_handler_count == expected_bs_count assert bs_hook_count == expected_bs_count + + +async def test_adaptive_crawling_context() -> None: + """Tests that correct context is used.""" + requests = ['https://crawlee.dev/'] + + static_only_predictor = _SimpleRenderingTypePredictor( + rendering_types = cycle(['static']), + detection_probability_recommendation=cycle([1]) + ) + + crawler = AdaptivePlaywrightCrawler(rendering_type_predictor=static_only_predictor) + + + @crawler.router.default_handler + async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: + assert context.request.url == requests[0] + + @crawler.pre_navigation_hook_bs + async def bs_hook(context: BasicCrawlingContext) -> None: + assert type(context) is BasicCrawlingContext + assert context.request.url == requests[0] + + @crawler.pre_navigation_hook_pw + async def pw_hook(context: PlaywrightPreNavCrawlingContext) -> None: + assert type(context) is PlaywrightPreNavCrawlingContext + assert context.request.url == requests[0] + + await crawler.run(requests) From 0ecb137dccddf1cfb43c6204fb0a26923c1fc32b Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 6 Jan 2025 14:06:15 +0100 Subject: [PATCH 16/79] Add tests for use_state and predictor. 
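
A minimal sketch of the handler pattern the use_state tests below exercise (illustrative
only; it assumes an `AdaptivePlaywrightCrawler` instance named `crawler`). Even when
rendering type detection runs the handler in both sub crawlers, the shared state kept
under `BasicCrawler.CRAWLEE_STATE_KEY` should be committed only once per request:

    @crawler.router.default_handler
    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        # `use_state` returns a dict that is shared across requests and persisted
        # to the default key-value store.
        state = await context.use_state({'counter': 0})
        # Both sub crawlers may run this handler during detection, but the
        # increment should end up being persisted only once.
        state['counter'] += 1
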
--- .../test_adaptive_playwright_crawler.py | 127 +++++++++++++++--- 1 file changed, 106 insertions(+), 21 deletions(-) diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index f83f29e640..b646663c0f 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -1,13 +1,15 @@ from __future__ import annotations from itertools import cycle -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast +from unittest.mock import Mock, patch import pytest from typing_extensions import override +from crawlee import Request from crawlee._types import BasicCrawlingContext -from crawlee.crawlers import PlaywrightPreNavCrawlingContext +from crawlee.crawlers import BasicCrawler, PlaywrightPreNavCrawlingContext from crawlee.crawlers._adaptive_playwright import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import AdaptiveContextError from crawlee.crawlers._adaptive_playwright._rendering_type_predictor import ( @@ -24,10 +26,10 @@ class _SimpleRenderingTypePredictor(RenderingTypePredictor): """Simplified predictor for tests.""" - def __init__(self, rendering_types: Iterator[RenderingType], - detection_probability_recommendation: Iterator[int]) -> None: - self._rendering_types = rendering_types - self._detection_probability_recommendation = detection_probability_recommendation + def __init__(self, rendering_types: Iterator[RenderingType] | None = None, + detection_probability_recommendation: None | Iterator[int] = None) -> None: + self._rendering_types = rendering_types or cycle(['static']) + self._detection_probability_recommendation = detection_probability_recommendation or cycle([1]) @override def predict(self, url: str, label: str | None) -> RenderingTypePrediction: @@ -41,10 +43,10 @@ def store_result(self, url: str, label: str | None, crawl_type: RenderingType) - @pytest.mark.parametrize(('expected_pw_count', 'expected_bs_count', 'rendering_types', 'detection_probability_recommendation'), [ - pytest.param(0,2, cycle(['static']), cycle([0]), id='Static only.'), - pytest.param(2,0, cycle(['client only']), cycle([0]), id='Client only.'), - pytest.param(1,1, cycle(['static','client only']), cycle([0]),id='Mixed.'), - pytest.param(2,2, cycle(['static','client only']), cycle([1]),id='Enforced rendering type detection.'), + pytest.param(0,2, cycle(['static']), cycle([0]), id='Static only'), + pytest.param(2,0, cycle(['client only']), cycle([0]), id='Client only'), + pytest.param(1,1, cycle(['static','client only']), cycle([0]),id='Mixed'), + pytest.param(2,2, cycle(['static','client only']), cycle([1]),id='Enforced rendering type detection'), ]) async def test_adaptive_crawling(expected_pw_count: int, expected_bs_count: int, rendering_types: Iterator[RenderingType], @@ -52,13 +54,13 @@ async def test_adaptive_crawling(expected_pw_count: int, expected_bs_count: int, """Tests correct routing to pre-nav hooks and correct handling through proper handler.""" requests = ['https://crawlee.dev/', 'https://crawlee.dev/docs/quick-start'] - static_only_predictor = _SimpleRenderingTypePredictor( + predictor = _SimpleRenderingTypePredictor( rendering_types = rendering_types, detection_probability_recommendation=detection_probability_recommendation ) - crawler = 
AdaptivePlaywrightCrawler(rendering_type_predictor=static_only_predictor) + crawler = AdaptivePlaywrightCrawler(rendering_type_predictor=predictor) pw_handler_count = 0 bs_handler_count = 0 @@ -100,16 +102,10 @@ async def pw_hook(context: PlaywrightPreNavCrawlingContext) -> None: # noqa:ARG async def test_adaptive_crawling_context() -> None: - """Tests that correct context is used.""" + """Tests that correct context is used. Enforced rendering type detection to run both sub crawlers.""" + static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor() requests = ['https://crawlee.dev/'] - - static_only_predictor = _SimpleRenderingTypePredictor( - rendering_types = cycle(['static']), - detection_probability_recommendation=cycle([1]) - ) - - crawler = AdaptivePlaywrightCrawler(rendering_type_predictor=static_only_predictor) - + crawler = AdaptivePlaywrightCrawler(rendering_type_predictor=static_only_predictor_enforce_detection) @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: @@ -126,3 +122,92 @@ async def pw_hook(context: PlaywrightPreNavCrawlingContext) -> None: assert context.request.url == requests[0] await crawler.run(requests) + + +async def test_adaptive_crawling_result() -> None: + """Tests that result only from one sub crawler is saved. + + Enforced rendering type detection to run both sub crawlers.""" + static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor() + requests = ['https://crawlee.dev/'] + crawler = AdaptivePlaywrightCrawler(rendering_type_predictor=static_only_predictor_enforce_detection) + + + @crawler.router.default_handler + async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: + try: + # page is available only if it was crawled by PlaywrightCrawler. + context.page # noqa:B018 Intentionally "useless expression". Can trigger exception. + await context.push_data({'handler': 'pw'}) + except AdaptiveContextError: + await context.push_data({'handler': 'bs'}) + + await crawler.run(requests) + + dataset = await crawler.get_dataset() + items = [item async for item in dataset.iterate_items()] + + # Enforced rendering type detection will trigger both sub crawlers, but only pw crawler result is saved. + assert items == [{'handler': 'pw'}] + + + +@pytest.mark.parametrize(('pw_saved_data', 'bs_saved_data', 'expected_result_renderingl_type'), [ + pytest.param({'some': 'data'}, {'some': 'data'}, 'static', id='Same results from sub crawlers'), + pytest.param({'some': 'data'}, {'different': 'data'}, 'client only', id='Different results from sub crawlers'), +]) +async def test_adaptive_crawling_predictor_calls(pw_saved_data: dict[str, str], bs_saved_data: dict[str, str], + expected_result_renderingl_type: RenderingType) -> None: + """Tests expected predictor calls. Same results.""" + some_label = 'bla' + some_url = 'https://crawlee.dev/' + static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor() + requests = [Request.from_url(url=some_url, label=some_label)] + crawler = AdaptivePlaywrightCrawler(rendering_type_predictor=static_only_predictor_enforce_detection) + + + @crawler.router.default_handler + async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: + try: + # page is available only if it was crawled by PlaywrightCrawler. + context.page # noqa:B018 Intentionally "useless expression". Can trigger exception. 
+ await context.push_data(pw_saved_data) + except AdaptiveContextError: + await context.push_data(bs_saved_data) + + with (patch.object(static_only_predictor_enforce_detection, 'store_result', Mock()) as mocked_store_result, + patch.object(static_only_predictor_enforce_detection, 'predict', Mock( + return_value=RenderingTypePrediction('static', 1))) as mocked_predict): + await crawler.run(requests) + + mocked_predict.assert_called_once_with(some_url, some_label) + # If `static` and `client only` results are same, `store_result` should be called with `static`. + mocked_store_result.assert_called_once_with(some_url, some_label, expected_result_renderingl_type) + + +async def test_adaptive_crawling_result_use_state_isolation() -> None: + """Tests that global state accessed through `use_state` is changed only by one sub crawler. + + Enforced rendering type detection to run both sub crawlers.""" + static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor() + requests = ['https://crawlee.dev/'] + crawler = AdaptivePlaywrightCrawler(rendering_type_predictor=static_only_predictor_enforce_detection) + store = await crawler.get_key_value_store() + await store.set_value(BasicCrawler.CRAWLEE_STATE_KEY, {'counter': 0}) + request_handler_calls = 0 + + @crawler.router.default_handler + async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: + nonlocal request_handler_calls + state = cast(dict[str, int],await context.use_state()) + request_handler_calls += 1 + state['counter'] += 1 + + await crawler.run(requests) + + await store.persist_autosaved_values() + + # Request handler was called twice + assert request_handler_calls == 2 + # Increment of global state happened only once + assert (await store.get_value(BasicCrawler.CRAWLEE_STATE_KEY))['counter'] == 1 From b73c702025a77641cd9427a84cb86d5e7c97371c Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 6 Jan 2025 14:08:39 +0100 Subject: [PATCH 17/79] Remove unintended edit. --- docs/examples/code/beautifulsoup_crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/examples/code/beautifulsoup_crawler.py b/docs/examples/code/beautifulsoup_crawler.py index dbbc410a4d..94857bd736 100644 --- a/docs/examples/code/beautifulsoup_crawler.py +++ b/docs/examples/code/beautifulsoup_crawler.py @@ -25,7 +25,7 @@ async def main() -> None: @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') - await context.use_state({'asd':'sad'}) + # Extract data from the page. data = { 'url': context.request.url, From 5c79d0de4d5df1ad682c350ecce053459dbabc30 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 7 Jan 2025 09:34:46 +0100 Subject: [PATCH 18/79] Add tests for statistics. Fix wrong id for predictor_state persistence. 
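
The predictor counters are persisted under `persist_state_key + '_PREDICTOR'`, so the
key derivation now starts from the `persist_state_key` that was actually configured
instead of the default key returned by `_get_default_persist_state_key()`. A rough,
illustrative sketch of the round trip the new persistence test exercises (not meant
as exact test code):

    async with AdaptivePlaywrightCrawlerStatistics(persistence_enabled=True) as stats:
        stats.track_browser_request_handler_runs()
        # The key is derived internally; read it here so it can be reused below.
        key = stats._persist_state_key

    # A fresh instance pointed at the same key should load the persisted predictor state.
    async with AdaptivePlaywrightCrawlerStatistics(
        persistence_enabled=True,
        persist_state_key=key,
    ) as restored:
        pass

    assert restored.predictor_state.browser_request_handler_runs == 1
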
--- .../_adaptive_playwright_crawler.py | 2 +- ..._adaptive_playwright_crawler_statistics.py | 14 ++++---- .../test_adaptive_playwright_crawler.py | 24 +++++++++++++ ..._adaptive_playwright_crawler_statistics.py | 34 +++++++++++++++++++ 4 files changed, 66 insertions(+), 8 deletions(-) create mode 100644 tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler_statistics.py diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 881fa0ea63..ea7e4c6ce8 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -120,7 +120,7 @@ def __init__(self, # Adaptive crawling related. self.rendering_type_predictor = rendering_type_predictor or DefaultRenderingTypePredictor() - self.result_checker = result_checker or (lambda result: True) # noqa: ARG005 + self.result_checker = result_checker or (lambda result: True) # noqa: ARG005 # Intentionally unused argument. self.result_comparator = result_comparator or create_comparator(result_checker) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py index 17a9770af1..8c5a0c3237 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py @@ -38,7 +38,7 @@ def __init__(self,*, periodic_message_logger: Logger | None = None, log_interval: timedelta = timedelta(minutes=1), state_model: type[StatisticsState] = StatisticsState) -> None: - self._predictor_state = PredictorState() + self.predictor_state = PredictorState() super().__init__(persistence_enabled=persistence_enabled, persist_state_kvs_name=persist_state_kvs_name, persist_state_key=persist_state_key, @@ -47,7 +47,7 @@ def __init__(self,*, periodic_message_logger=periodic_message_logger, log_interval=log_interval, state_model=state_model) - self._persist_predictor_state_key = self._get_default_persist_state_key() + '_PREDICTOR' + self._persist_predictor_state_key = self._persist_state_key + '_PREDICTOR' @classmethod def from_statistics(cls, statistics: Statistics) -> Self: @@ -62,20 +62,20 @@ def from_statistics(cls, statistics: Statistics) -> Self: ) def track_http_only_request_handler_runs(self) -> None: - self._predictor_state.http_only_request_handler_runs += 1 + self.predictor_state.http_only_request_handler_runs += 1 def track_browser_request_handler_runs(self) -> None: - self._predictor_state.browser_request_handler_runs += 1 + self.predictor_state.browser_request_handler_runs += 1 def track_rendering_type_mispredictions(self) -> None: - self._predictor_state.rendering_type_mispredictions += 1 + self.predictor_state.rendering_type_mispredictions += 1 @override async def _persist_other_statistics(self, key_value_store: KeyValueStore) -> None: """Persist state of predictor.""" await key_value_store.set_value( self._persist_predictor_state_key, - self._predictor_state.model_dump(mode='json', by_alias=True), + self.predictor_state.model_dump(mode='json', by_alias=True), 'application/json', ) @@ -84,5 +84,5 @@ async def _persist_other_statistics(self, key_value_store: KeyValueStore) -> Non async def _load_other_statistics(self, key_value_store: KeyValueStore) -> None: """Load state of predictor.""" stored_state = 
await key_value_store.get_value(self._persist_predictor_state_key, cast(Any, {})) - self._predictor_state = self._predictor_state.__class__.model_validate(stored_state) + self.predictor_state = self.predictor_state.__class__.model_validate(stored_state) diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index b646663c0f..ad29b71a16 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -211,3 +211,27 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: assert request_handler_calls == 2 # Increment of global state happened only once assert (await store.get_value(BasicCrawler.CRAWLEE_STATE_KEY))['counter'] == 1 + + +async def test_adaptive_crawling_statistics() -> None: + """Test adaptive crawling related statistics. + + Crawler set to static crawling, but due to result_checker returning False on static crawling result it + will do browser crawling instead well. This increments all three adaptive crawling related stats.""" + requests = ['https://crawlee.dev/'] + + static_only_predictor_no_detection = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])) + + crawler = AdaptivePlaywrightCrawler(rendering_type_predictor=static_only_predictor_no_detection, + result_checker=lambda result: False) # noqa: ARG005 # Intentionally unused argument. + + @crawler.router.default_handler + async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: + pass + + await crawler.run(requests) + + assert crawler.adaptive_statistics.predictor_state.http_only_request_handler_runs == 1 + assert crawler.adaptive_statistics.predictor_state.browser_request_handler_runs == 1 + assert crawler.adaptive_statistics.predictor_state.rendering_type_mispredictions == 1 + diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler_statistics.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler_statistics.py new file mode 100644 index 0000000000..24c180e168 --- /dev/null +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler_statistics.py @@ -0,0 +1,34 @@ +from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler_statistics import ( + AdaptivePlaywrightCrawlerStatistics, +) + + +async def test_predictor_state_persistence() -> None: + """Test that adaptive statistics can be correctly persisted and initialized from persisted values.""" + + async with AdaptivePlaywrightCrawlerStatistics(persistence_enabled=True) as adaptive_statistics: + adaptive_statistics.track_rendering_type_mispredictions() + adaptive_statistics.track_rendering_type_mispredictions() + + adaptive_statistics.track_http_only_request_handler_runs() + adaptive_statistics.track_http_only_request_handler_runs() + adaptive_statistics.track_http_only_request_handler_runs() + + adaptive_statistics.track_browser_request_handler_runs() + + persistence_state_key = adaptive_statistics._persist_state_key + # adaptive_statistics are persisted after leaving the context + + # new_adaptive_statistics are initialized from the persisted values. 
+ async with AdaptivePlaywrightCrawlerStatistics( + persistence_enabled=True, + persist_state_key=persistence_state_key) as new_adaptive_statistics: + pass + + assert new_adaptive_statistics.predictor_state.rendering_type_mispredictions == 2 + assert new_adaptive_statistics.predictor_state.http_only_request_handler_runs == 3 + assert new_adaptive_statistics.predictor_state.browser_request_handler_runs == 1 + + + + From 957915a7081b4169dd25507509d9a6b7242cf35d Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 7 Jan 2025 10:43:41 +0100 Subject: [PATCH 19/79] Add test for error handling and committing correct results. Add test for pre nav hook. Add test for statistics in crawler init --- .../_result_comparator.py | 2 +- .../test_adaptive_playwright_crawler.py | 88 +++++++++++++++++++ 2 files changed, 89 insertions(+), 1 deletion(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py b/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py index 14c0dbb3a0..a628819fb9 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py @@ -28,7 +28,7 @@ def create_comparator(result_checker: Callable[[RequestHandlerRunResult], bool] def full_result_comparator(result_1: RequestHandlerRunResult, result_2: RequestHandlerRunResult) -> bool: """Compare results by comparing all their parts.""" - # PlayWright can produce links with extra arguments compared to pure BS. Default comparator ignores this as it does + # PlayWright can produce links with extra arguments compared to pure BS. Default comparator ignores this. # Maybe full comparator should have flag about taking into account only urls without parameters. # https://sdk.apify.com/docs/guides/getting-started # https://sdk.apify.com/docs/guides/getting-started?__hsfp=1136113150&__hssc=7591405.1.1735494277124&__hstc=7591405.e2b9302ed00c5bfaee3a870166792181.1735494277124.1735494277124.1735494277124.1 return ( diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index ad29b71a16..5f748a624e 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -1,5 +1,7 @@ from __future__ import annotations +import logging +from datetime import timedelta from itertools import cycle from typing import TYPE_CHECKING, cast from unittest.mock import Mock, patch @@ -11,12 +13,16 @@ from crawlee._types import BasicCrawlingContext from crawlee.crawlers import BasicCrawler, PlaywrightPreNavCrawlingContext from crawlee.crawlers._adaptive_playwright import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext +from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler_statistics import ( + AdaptivePlaywrightCrawlerStatistics, +) from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import AdaptiveContextError from crawlee.crawlers._adaptive_playwright._rendering_type_predictor import ( RenderingType, RenderingTypePrediction, RenderingTypePredictor, ) +from crawlee.statistics import Statistics if TYPE_CHECKING: from collections.abc import Iterator @@ -235,3 +241,85 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: assert crawler.adaptive_statistics.predictor_state.browser_request_handler_runs == 1 assert 
crawler.adaptive_statistics.predictor_state.rendering_type_mispredictions == 1 +def test_adaptive_default_hooks_raise_exception() -> None: + """Trying to attach a usual pre-navigation hook raises an exception. + + It is ambiguous and so sub crawler specific hooks should be used instead.""" + + crawler = AdaptivePlaywrightCrawler() + + with pytest.raises(RuntimeError): + @crawler.pre_navigation_hook + def some_hook() -> None: + pass + + +@pytest.mark.parametrize('error_in_pw_crawler', [ + pytest.param(False, id='Error only in bs sub crawler'), + pytest.param(True, id='Error in both sub crawlers'), +]) +async def test_adaptive_crawler_exceptions_in_sub_crawlers(*,error_in_pw_crawler: bool) -> None: + """Test that correct results are committed when exceptions are raised in sub crawlers. + + Exception in bs sub crawler will be logged and pw sub crawler used instead. + Any result from bs sub crawler will be discarded, result from pw crawler will be saved instead. + (But global state modifications through `use_state` will not be reverted!!!) + + Exception in pw sub crawler will prevent any result from being committed. Even if `push_data` was called before + the exception. + """ + requests = ['https://crawlee.dev/'] + static_only_no_detection_predictor = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])) + + crawler = AdaptivePlaywrightCrawler(rendering_type_predictor=static_only_no_detection_predictor) + saved_data = {'some': 'data'} + + @crawler.router.default_handler + async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: + + try: + # page is available only if it was crawled by PlaywrightCrawler. + context.page # noqa:B018 Intentionally "useless expression". Can trigger exception. + await context.push_data(saved_data) + if error_in_pw_crawler: + raise RuntimeError('Some pw sub crawler related error') + + except AdaptiveContextError: + await context.push_data({'this': 'data should not be saved'}) + raise RuntimeError('Some bs sub crawler related error') from None + + await crawler.run(requests) + + dataset = await crawler.get_dataset() + stored_results = [item async for item in dataset.iterate_items()] + + if error_in_pw_crawler: + assert stored_results == [] + else: + assert stored_results == [saved_data] + + +def test_adaptive_playwright_crawler_statistics_in_init() -> None: + """Tests that adaptive crawler uses AdaptivePlaywrightCrawlerStatistics created from the provided Statistics.""" + persistence_enabled = True + persist_state_kvs_name = 'some name' + persist_state_key = 'come key' + log_message = 'some message' + periodic_message_logger = logging.getLogger('some logger') # Accessing private member to create copy like-object. 
+ log_interval = timedelta(minutes=2) + statistics = Statistics(persistence_enabled=persistence_enabled, + persist_state_kvs_name=persist_state_kvs_name, + persist_state_key=persist_state_key, + log_message=log_message, + periodic_message_logger=periodic_message_logger, + log_interval=log_interval) + + crawler = AdaptivePlaywrightCrawler(statistics=statistics) + + assert type(crawler._statistics) is AdaptivePlaywrightCrawlerStatistics + assert crawler._statistics._persistence_enabled == persistence_enabled + assert crawler._statistics._persist_state_kvs_name == persist_state_kvs_name + assert crawler._statistics._persist_state_key == persist_state_key + assert crawler._statistics._log_message == log_message + assert crawler._statistics._periodic_message_logger == periodic_message_logger + assert crawler._statistics._log_interval == log_interval From f12f605e3d4aab5a9c285587de5fe31e3b392cb9 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 7 Jan 2025 11:33:15 +0100 Subject: [PATCH 20/79] Add crawl_one_required_contexts property. (Alternative to accessing internals of sub crawlers) Cleanup commit results. --- .../_adaptive_playwright_crawler.py | 37 +++++++++++++------ src/crawlee/crawlers/_basic/_basic_crawler.py | 7 ++++ .../_playwright/_playwright_crawler.py | 9 +++++ .../test_adaptive_playwright_crawler.py | 4 +- 4 files changed, 43 insertions(+), 14 deletions(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index ea7e4c6ce8..bfc2925fc9 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -2,6 +2,7 @@ import asyncio import logging +from contextlib import AsyncExitStack from copy import deepcopy from logging import getLogger from random import random @@ -189,14 +190,27 @@ async def run( purge_request_queue: If this is `True` and the crawler is not being run for the first time, the default request queue will be purged. """ - # TODO: Create something more robust that does not leak implementation so much - async with (self.beautifulsoup_crawler.statistics, self.playwright_crawler.statistics, - self.playwright_crawler._additional_context_managers[0]): + contexts_to_enter = [ + cm + for cm in (self.beautifulsoup_crawler.crawl_one_required_contexts + + self.playwright_crawler.crawl_one_required_contexts) + if cm and getattr(cm, 'active', False) is False + ] + + # Enter contexts required by sub crawler for them to be able to do `crawl_one` + async with AsyncExitStack() as exit_stack: + for context in contexts_to_enter: + await exit_stack.enter_async_context(context) return await super().run(requests=requests, purge_request_queue=purge_request_queue) + # AsyncExitStack can in theory swallow exceptions and so the return might not execute. + # https://github.com/python/mypy/issues/7726 + raise RuntimeError('FinalStatistics not created.') + + # Can't use override as mypy does not like it for double underscore private method. async def _BasicCrawler__run_request_handler(self, context: BasicCrawlingContext) -> None: # noqa: N802 - """Overrided BasicCrawler method that delegates request processing to sub crawlers. + """Override BasicCrawler method that delegates request processing to sub crawlers. To decide which sub crawler should process the request it runs `rendering_type_predictor`. To check if results are valid it uses `result_checker`. 
@@ -271,17 +285,16 @@ async def _run_subcrawler(crawler: BeautifulSoupCrawler | PlaywrightCrawler, self.rendering_type_predictor.store_result(context.request.url, context.request.label, detection_result) async def commit_result(self, result: RequestHandlerRunResult, context: BasicCrawlingContext) -> None: - result_tasks = [] - result_tasks.extend([ - asyncio.create_task(context.push_data(**kwargs)) for kwargs in result.push_data_calls]) - result_tasks.extend([ - asyncio.create_task(context.add_requests(**kwargs)) for kwargs in result.add_requests_calls]) + result_tasks = [ + asyncio.create_task(context.push_data(**kwargs)) for kwargs in result.push_data_calls + ] + [ + asyncio.create_task(context.add_requests(**kwargs)) for kwargs in result.add_requests_calls + ] + [ + asyncio.create_task(self._commit_key_value_store_changes(result)) + ] - # What to do with KV changes???? await asyncio.gather(*result_tasks) - # Optimize if needed - await self._commit_key_value_store_changes(result) def pre_navigation_hook(self, hook: Callable[[Any], Awaitable[None]]) -> None: diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 1c5fe450bc..36c4efda84 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -1122,6 +1122,13 @@ async def __run_request_handler(self, context: BasicCrawlingContext) -> None: await self._context_pipeline(context, self.router) + @property + def crawl_one_required_contexts(self) -> list[AbstractAsyncContextManager]: + """Contexts that have to be active before `crawl_one` can be called.""" + contexts: list[AbstractAsyncContextManager] = [] + contexts.append(self.statistics) + return contexts + async def crawl_one(self, *, context: BasicCrawlingContext, request_handler_timeout: timedelta, result: RequestHandlerRunResult, diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index df1494f3a9..cb3bd94ad1 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -20,6 +20,7 @@ if TYPE_CHECKING: from collections.abc import AsyncGenerator, Awaitable, Mapping + from contextlib import AbstractAsyncContextManager from typing_extensions import Unpack @@ -285,3 +286,11 @@ def pre_navigation_hook(self, hook: Callable[[PlaywrightPreNavCrawlingContext], hook: A coroutine function to be called before each navigation. 
""" self._pre_navigation_hooks.append(hook) + + + @property + def crawl_one_required_contexts(self) -> list[AbstractAsyncContextManager]: + """Contexts that have to be active before `crawl_one` can be called.""" + contexts = super().crawl_one_required_contexts + contexts.append(self._browser_pool) + return contexts diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index 5f748a624e..a93ecbbe0c 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -3,7 +3,7 @@ import logging from datetime import timedelta from itertools import cycle -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING, Any, cast from unittest.mock import Mock, patch import pytest @@ -250,7 +250,7 @@ def test_adaptive_default_hooks_raise_exception() -> None: with pytest.raises(RuntimeError): @crawler.pre_navigation_hook - def some_hook() -> None: + async def some_hook(whatever: Any) -> None: pass From 5256af2593d8119ffbe5b5e2873145b904c98a93 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 7 Jan 2025 14:53:16 +0100 Subject: [PATCH 21/79] Lint --- .../_abstract_http/_abstract_http_crawler.py | 4 +- .../crawlers/_adaptive_playwright/__init__.py | 3 - .../_adaptive_playwright_crawler.py | 91 ++++++++------- ..._adaptive_playwright_crawler_statistics.py | 49 ++++---- .../_adaptive_playwright_crawling_context.py | 26 +++-- .../crawlers/_adaptive_playwright/_example.py | 14 +-- .../_rendering_type_predictor.py | 13 +-- .../_result_comparator.py | 18 +-- src/crawlee/crawlers/_basic/_basic_crawler.py | 43 +++---- .../_playwright/_playwright_crawler.py | 1 - src/crawlee/statistics/_statistics.py | 20 ++-- .../test_adaptive_playwright_crawler.py | 110 ++++++++++-------- ..._adaptive_playwright_crawler_statistics.py | 8 +- website/pydoc-markdown/generate_ast.py | 7 +- .../google_docstring_processor.py | 75 ++++++------ 15 files changed, 246 insertions(+), 236 deletions(-) diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py index ecf378dd73..7685e09570 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py @@ -37,15 +37,15 @@ class _HttpCrawlerOptions(Generic[TCrawlingContext], TypedDict): ignore_http_error_status_codes: NotRequired[Iterable[int]] """HTTP status codes typically considered errors but to be treated as successful responses.""" + @docs_group('Data structures') -class HttpCrawlerOptions(Generic[TCrawlingContext],_HttpCrawlerOptions, BasicCrawlerOptions[TCrawlingContext]): +class HttpCrawlerOptions(Generic[TCrawlingContext], _HttpCrawlerOptions, BasicCrawlerOptions[TCrawlingContext]): """Arguments for the `AbstractHttpCrawler` constructor. It is intended for typing forwarded `__init__` arguments in the subclasses. """ - @docs_group('Abstract classes') class AbstractHttpCrawler(Generic[TCrawlingContext, TParseResult], BasicCrawler[TCrawlingContext], ABC): """A web crawler for performing HTTP requests. 
diff --git a/src/crawlee/crawlers/_adaptive_playwright/__init__.py b/src/crawlee/crawlers/_adaptive_playwright/__init__.py index daa9b14cb6..46addf4483 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/__init__.py +++ b/src/crawlee/crawlers/_adaptive_playwright/__init__.py @@ -9,6 +9,3 @@ 'HttpCrawlerOptions', 'ParsedHttpCrawlingContext', ] - - - diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index bfc2925fc9..141bc2f516 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -70,7 +70,6 @@ class _PlaywrightCrawlerAdditionalOptions(TypedDict): """The type of browser to launch ('chromium', 'firefox', or 'webkit'). This option should not be used if `browser_pool` is provided.""" - browser_launch_options: NotRequired[Mapping[str, Any]] """Keyword arguments to pass to the browser launch method. These options are provided directly to Playwright's `browser_type.launch` method. For more details, refer to the Playwright @@ -97,13 +96,15 @@ class AdaptivePlaywrightCrawler(BasicCrawler[AdaptivePlaywrightCrawlingContext]) # TODO: Add example """ - def __init__(self, - rendering_type_predictor: RenderingTypePredictor | None = None, - result_checker: Callable[[RequestHandlerRunResult], bool] | None = None, - result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None, - beautifulsoup_crawler_kwargs: _BeautifulsoupCrawlerAdditionalOptions | None = None, - playwright_crawler_args: _PlaywrightCrawlerAdditionalOptions | None = None, - **kwargs: Unpack[_BasicCrawlerOptions]) -> None: + def __init__( + self, + rendering_type_predictor: RenderingTypePredictor | None = None, + result_checker: Callable[[RequestHandlerRunResult], bool] | None = None, + result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None, + beautifulsoup_crawler_kwargs: _BeautifulsoupCrawlerAdditionalOptions | None = None, + playwright_crawler_args: _PlaywrightCrawlerAdditionalOptions | None = None, + **kwargs: Unpack[_BasicCrawlerOptions], + ) -> None: """A default constructor. Args: @@ -121,7 +122,7 @@ def __init__(self, # Adaptive crawling related. self.rendering_type_predictor = rendering_type_predictor or DefaultRenderingTypePredictor() - self.result_checker = result_checker or (lambda result: True) # noqa: ARG005 # Intentionally unused argument. + self.result_checker = result_checker or (lambda result: True) # noqa: ARG005 # Intentionally unused argument. self.result_comparator = result_comparator or create_comparator(result_checker) @@ -139,7 +140,6 @@ def __init__(self, # AdaptivePlaywrightCrawlerStatistics specific methods can be access in "type safe manner". self.adaptive_statistics = statistics - # Sub crawlers related. 
beautifulsoup_crawler_kwargs = beautifulsoup_crawler_kwargs or {} beautifulsoup_crawler_kwargs.setdefault('parser', 'lxml') @@ -171,10 +171,10 @@ async def request_handler_beautiful_soup(context: BeautifulSoupCrawlingContext) async def request_handler_playwright(context: PlaywrightCrawlingContext) -> None: """Handler for routing from playwright_crawler to adaptive_crawler handler.""" adaptive_crawling_context = await AdaptivePlaywrightCrawlingContext.from_playwright_crawling_context( - context=context, beautiful_soup_parser_type=beautifulsoup_crawler_kwargs['parser']) + context=context, beautiful_soup_parser_type=beautifulsoup_crawler_kwargs['parser'] + ) await self.router(adaptive_crawling_context) - super().__init__(**kwargs) async def run( @@ -192,8 +192,10 @@ async def run( """ contexts_to_enter = [ cm - for cm in (self.beautifulsoup_crawler.crawl_one_required_contexts - + self.playwright_crawler.crawl_one_required_contexts) + for cm in ( + self.beautifulsoup_crawler.crawl_one_required_contexts + + self.playwright_crawler.crawl_one_required_contexts + ) if cm and getattr(cm, 'active', False) is False ] @@ -207,9 +209,8 @@ async def run( # https://github.com/python/mypy/issues/7726 raise RuntimeError('FinalStatistics not created.') - # Can't use override as mypy does not like it for double underscore private method. - async def _BasicCrawler__run_request_handler(self, context: BasicCrawlingContext) -> None: # noqa: N802 + async def _BasicCrawler__run_request_handler(self, context: BasicCrawlingContext) -> None: # noqa: N802 """Override BasicCrawler method that delegates request processing to sub crawlers. To decide which sub crawler should process the request it runs `rendering_type_predictor`. @@ -218,40 +219,44 @@ async def _BasicCrawler__run_request_handler(self, context: BasicCrawlingContext Reference implementation: https://github.com/apify/crawlee/blob/master/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts """ - async def _run_subcrawler(crawler: BeautifulSoupCrawler | PlaywrightCrawler, - use_state: dict | None = None) -> SubCrawlerRun: + + async def _run_subcrawler( + crawler: BeautifulSoupCrawler | PlaywrightCrawler, use_state: dict | None = None + ) -> SubCrawlerRun: """Helper closure that creates new `RequestHandlerRunResult` and delegates request handling to sub crawler. Produces `SubCrawlerRun` that either contains filled `RequestHandlerRunResult` or exception. 
""" try: crawl_result = await crawler.crawl_one( - context = context, - request_handler_timeout=self._request_handler_timeout, - result= RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store), - use_state=use_state) + context=context, + request_handler_timeout=self._request_handler_timeout, + result=RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store), + use_state=use_state, + ) return SubCrawlerRun(result=crawl_result) except Exception as e: return SubCrawlerRun(exception=e) - rendering_type_prediction = self.rendering_type_predictor.predict(context.request.url, context.request.label) should_detect_rendering_type = random() < rendering_type_prediction.detection_probability_recommendation if not should_detect_rendering_type: self.log.debug( - f'Predicted rendering type {rendering_type_prediction.rendering_type} for {context.request.url}') + f'Predicted rendering type {rendering_type_prediction.rendering_type} for {context.request.url}' + ) if rendering_type_prediction.rendering_type == 'static': context.log.debug(f'Running static request for {context.request.url}') self.adaptive_statistics.track_http_only_request_handler_runs() bs_run = await _run_subcrawler(self.beautifulsoup_crawler) if bs_run.result and self.result_checker(bs_run.result): - await self.commit_result(result = bs_run.result, context=context) + await self.commit_result(result=bs_run.result, context=context) return if bs_run.exception: - context.log.exception(msg=f'Static crawler: failed for {context.request.url}', - exc_info=bs_run.exception) + context.log.exception( + msg=f'Static crawler: failed for {context.request.url}', exc_info=bs_run.exception + ) else: context.log.warning(f'Static crawler: returned a suspicious result for {context.request.url}') self.adaptive_statistics.track_rendering_type_mispredictions() @@ -259,7 +264,7 @@ async def _run_subcrawler(crawler: BeautifulSoupCrawler | PlaywrightCrawler, context.log.debug(f'Running browser request handler for {context.request.url}') kvs = await context.get_key_value_store() - default_value =dict[str, JsonSerializable]() + default_value = dict[str, JsonSerializable]() old_state: dict[str, JsonSerializable] = await kvs.get_value(BasicCrawler.CRAWLEE_STATE_KEY, default_value) old_state_copy = deepcopy(old_state) @@ -270,13 +275,13 @@ async def _run_subcrawler(crawler: BeautifulSoupCrawler | PlaywrightCrawler, raise pw_run.exception if pw_run.result: - await self.commit_result(result = pw_run.result, context=context) + await self.commit_result(result=pw_run.result, context=context) if should_detect_rendering_type: detection_result: RenderingType bs_run = await _run_subcrawler(self.beautifulsoup_crawler, use_state=old_state_copy) - if bs_run.result and self.result_comparator(bs_run.result,pw_run.result): + if bs_run.result and self.result_comparator(bs_run.result, pw_run.result): detection_result = 'static' else: detection_result = 'client only' @@ -285,31 +290,27 @@ async def _run_subcrawler(crawler: BeautifulSoupCrawler | PlaywrightCrawler, self.rendering_type_predictor.store_result(context.request.url, context.request.label, detection_result) async def commit_result(self, result: RequestHandlerRunResult, context: BasicCrawlingContext) -> None: - result_tasks = [ - asyncio.create_task(context.push_data(**kwargs)) for kwargs in result.push_data_calls - ] + [ - asyncio.create_task(context.add_requests(**kwargs)) for kwargs in result.add_requests_calls - ] + [ - asyncio.create_task(self._commit_key_value_store_changes(result)) - ] + 
result_tasks = ( + [asyncio.create_task(context.push_data(**kwargs)) for kwargs in result.push_data_calls] + + [asyncio.create_task(context.add_requests(**kwargs)) for kwargs in result.add_requests_calls] + + [asyncio.create_task(self._commit_key_value_store_changes(result))] + ) await asyncio.gather(*result_tasks) - - def pre_navigation_hook(self, hook: Callable[[Any], Awaitable[None]]) -> None: """Pre navigation hooks for adaptive crawler are delegated to sub crawlers.""" - raise RuntimeError('Pre navigation hooks are ambiguous in adaptive crawling context. Use specific hook instead:' - '`pre_navigation_hook_pw` for playwright sub crawler related hooks or' - '`pre_navigation_hook_bs`for beautifulsoup sub crawler related hooks. \n' - f'{hook=} will not be used!!!') + raise RuntimeError( + 'Pre navigation hooks are ambiguous in adaptive crawling context. Use specific hook instead:' + '`pre_navigation_hook_pw` for playwright sub crawler related hooks or' + '`pre_navigation_hook_bs`for beautifulsoup sub crawler related hooks. \n' + f'{hook=} will not be used!!!' + ) def pre_navigation_hook_pw(self, hook: Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]]) -> None: """Pre navigation hooks for playwright sub crawler of adaptive crawler.""" self.playwright_crawler.pre_navigation_hook(hook) - def pre_navigation_hook_bs(self, hook: Callable[[BasicCrawlingContext], Awaitable[None]]) -> None: """Pre navigation hooks for beautifulsoup sub crawler of adaptive crawler.""" self.beautifulsoup_crawler.pre_navigation_hook(hook) - diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py index 8c5a0c3237..1391e2e587 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py @@ -25,11 +25,12 @@ class PredictorState(BaseModel): browser_request_handler_runs: Annotated[int, Field(alias='browser_request_handler_runs')] = 0 rendering_type_mispredictions: Annotated[int, Field(alias='rendering_type_mispredictions')] = 0 + @docs_group('Classes') class AdaptivePlaywrightCrawlerStatistics(Statistics): - - - def __init__(self,*, + def __init__( + self, + *, persistence_enabled: bool = False, persist_state_kvs_name: str = 'default', persist_state_key: str | None = None, @@ -37,29 +38,33 @@ def __init__(self,*, log_message: str = 'Statistics', periodic_message_logger: Logger | None = None, log_interval: timedelta = timedelta(minutes=1), - state_model: type[StatisticsState] = StatisticsState) -> None: + state_model: type[StatisticsState] = StatisticsState, + ) -> None: self.predictor_state = PredictorState() - super().__init__(persistence_enabled=persistence_enabled, - persist_state_kvs_name=persist_state_kvs_name, - persist_state_key=persist_state_key, - key_value_store=key_value_store, - log_message=log_message, - periodic_message_logger=periodic_message_logger, - log_interval=log_interval, - state_model=state_model) + super().__init__( + persistence_enabled=persistence_enabled, + persist_state_kvs_name=persist_state_kvs_name, + persist_state_key=persist_state_key, + key_value_store=key_value_store, + log_message=log_message, + periodic_message_logger=periodic_message_logger, + log_interval=log_interval, + state_model=state_model, + ) self._persist_predictor_state_key = self._persist_state_key + '_PREDICTOR' @classmethod def from_statistics(cls, statistics: 
Statistics) -> Self: - return cls(persistence_enabled=statistics._persistence_enabled, # noqa:SLF001 # Accessing private member to create copy like-object. - persist_state_kvs_name=statistics._persist_state_kvs_name, # noqa:SLF001 # Accessing private member to create copy like-object. - persist_state_key=statistics._persist_state_key, # noqa:SLF001 # Accessing private member to create copy like-object. - key_value_store=statistics._key_value_store, # noqa:SLF001 # Accessing private member to create copy like-object. - log_message=statistics._log_message, # noqa:SLF001 # Accessing private member to create copy like-object. - periodic_message_logger=statistics._periodic_message_logger, # noqa:SLF001 # Accessing private member to create copy like-object. - log_interval=statistics._log_interval, # noqa:SLF001 # Accessing private member to create copy like-object. - state_model=statistics._state_model, # noqa:SLF001 # Accessing private member to create copy like-object. - ) + return cls( + persistence_enabled=statistics._persistence_enabled, # noqa:SLF001 # Accessing private member to create copy like-object. + persist_state_kvs_name=statistics._persist_state_kvs_name, # noqa:SLF001 # Accessing private member to create copy like-object. + persist_state_key=statistics._persist_state_key, # noqa:SLF001 # Accessing private member to create copy like-object. + key_value_store=statistics._key_value_store, # noqa:SLF001 # Accessing private member to create copy like-object. + log_message=statistics._log_message, # noqa:SLF001 # Accessing private member to create copy like-object. + periodic_message_logger=statistics._periodic_message_logger, # noqa:SLF001 # Accessing private member to create copy like-object. + log_interval=statistics._log_interval, # noqa:SLF001 # Accessing private member to create copy like-object. + state_model=statistics._state_model, # noqa:SLF001 # Accessing private member to create copy like-object. 
+ ) def track_http_only_request_handler_runs(self) -> None: self.predictor_state.http_only_request_handler_runs += 1 @@ -79,10 +84,8 @@ async def _persist_other_statistics(self, key_value_store: KeyValueStore) -> Non 'application/json', ) - @override async def _load_other_statistics(self, key_value_store: KeyValueStore) -> None: """Load state of predictor.""" stored_state = await key_value_store.get_value(self._persist_predictor_state_key, cast(Any, {})) self.predictor_state = self.predictor_state.__class__.model_validate(stored_state) - diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py index f2a71d8f62..796aa68b02 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py @@ -15,17 +15,17 @@ from playwright.async_api import Page, Response from typing_extensions import Self + class AdaptiveContextError(RuntimeError): pass - @dataclass(frozen=True) @docs_group('Data structures') class AdaptivePlaywrightCrawlingContext(BeautifulSoupCrawlingContext): _response: Response | None = None _infinite_scroll: Callable[[], Awaitable[None]] | None = None - _page : Page | None = None + _page: Page | None = None @property def page(self) -> Page: @@ -51,8 +51,9 @@ def from_beautifulsoup_crawling_context(cls, context: BeautifulSoupCrawlingConte return cls(**{field.name: getattr(context, field.name) for field in fields(context)}) @classmethod - async def from_playwright_crawling_context(cls, context: PlaywrightCrawlingContext, - beautiful_soup_parser_type: BeautifulSoupParserType | None) -> Self: + async def from_playwright_crawling_context( + cls, context: PlaywrightCrawlingContext, beautiful_soup_parser_type: BeautifulSoupParserType | None + ) -> Self: """Convenience constructor that creates new context from existing `PlaywrightCrawlingContext`.""" context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)} # Remove playwright specific attributes and pass them as private instead to be available as property. @@ -61,17 +62,20 @@ async def from_playwright_crawling_context(cls, context: PlaywrightCrawlingConte context_kwargs['_infinite_scroll'] = context_kwargs.pop('infinite_scroll') # This might be always available. 
protocol_guess = await context_kwargs['_page'].evaluate('() => performance.getEntries()[0].nextHopProtocol') - http_response = await _HttpResponse.from_playwright_response(response = context.response, - protocol = protocol_guess or '') - return cls(parsed_content= BeautifulSoup(http_response.read(), features=beautiful_soup_parser_type), - http_response = http_response, - **context_kwargs) + http_response = await _HttpResponse.from_playwright_response( + response=context.response, protocol=protocol_guess or '' + ) + return cls( + parsed_content=BeautifulSoup(http_response.read(), features=beautiful_soup_parser_type), + http_response=http_response, + **context_kwargs, + ) @dataclass(frozen=True) class _HttpResponse: - http_version : str - status_code : int + http_version: str + status_code: int headers: HttpHeaders _content: bytes diff --git a/src/crawlee/crawlers/_adaptive_playwright/_example.py b/src/crawlee/crawlers/_adaptive_playwright/_example.py index a6f828dc80..e355ef27f2 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_example.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_example.py @@ -10,24 +10,24 @@ ) -async def main() ->None: +async def main() -> None: # TODO: remove in review. Move this to documentation examples instead. top_logger = getLogger(__name__) top_logger.setLevel(logging.DEBUG) - i=0 + i = 0 - crawler = AdaptivePlaywrightCrawler(max_requests_per_crawl=10, - _logger=top_logger, - playwright_crawler_args={'headless':False}) + crawler = AdaptivePlaywrightCrawler( + max_requests_per_crawl=10, _logger=top_logger, playwright_crawler_args={'headless': False} + ) @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: nonlocal i - i = i+1 + i = i + 1 context.log.info(f'Processing with Top adaptive_crawler: {context.request.url} ...') await context.enqueue_links() await context.push_data({'Top crwaler Url': context.request.url}) - await context.use_state({'bla':i}) + await context.use_state({'bla': i}) @crawler.pre_navigation_hook_bs async def bs_hook(context: BasicCrawlingContext) -> None: diff --git a/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py b/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py index 2bef0cccce..4ece912ad2 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py @@ -9,29 +9,26 @@ RenderingType = Literal['static', 'client only'] + @dataclass(frozen=True) class RenderingTypePrediction: rendering_type: RenderingType detection_probability_recommendation: float - class RenderingTypePredictor(ABC): - @abstractmethod - def predict(self, url: str, label: str | None) -> RenderingTypePrediction: - ... + def predict(self, url: str, label: str | None) -> RenderingTypePrediction: ... @abstractmethod - def store_result(self, url: str, label: str | None, crawl_type: RenderingType) -> None: - ... + def store_result(self, url: str, label: str | None, crawl_type: RenderingType) -> None: ... class DefaultRenderingTypePredictor(RenderingTypePredictor): - #Dummy version of predictor. Proper version will be implemented in another change. + # Dummy version of predictor. Proper version will be implemented in another change. 
@override - def predict(self, url: str, label: str | None) -> RenderingTypePrediction: # Will be implemented later + def predict(self, url: str, label: str | None) -> RenderingTypePrediction: # Will be implemented later return RenderingTypePrediction(choice(['static', 'client only']), 0.1) @override diff --git a/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py b/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py index a628819fb9..beef4405ed 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py @@ -11,13 +11,13 @@ @dataclass(frozen=True) class SubCrawlerRun: - result: RequestHandlerRunResult | None= None - exception: Exception | None= None + result: RequestHandlerRunResult | None = None + exception: Exception | None = None - -def create_comparator(result_checker: Callable[[RequestHandlerRunResult], bool] | None - ) -> Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool]: +def create_comparator( + result_checker: Callable[[RequestHandlerRunResult], bool] | None, +) -> Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool]: """Factory for creating comparator function.""" if result_checker: # Fallback comparator if only user-specific checker is defined. @@ -34,9 +34,11 @@ def full_result_comparator(result_1: RequestHandlerRunResult, result_2: RequestH # https://sdk.apify.com/docs/guides/getting-started?__hsfp=1136113150&__hssc=7591405.1.1735494277124&__hstc=7591405.e2b9302ed00c5bfaee3a870166792181.1735494277124.1735494277124.1735494277124.1 return ( - (result_1.push_data_calls == result_2.push_data_calls) and - (result_1.add_requests_calls == result_2.add_requests_calls) and - (result_1.key_value_store_changes == result_2.key_value_store_changes)) + (result_1.push_data_calls == result_2.push_data_calls) + and (result_1.add_requests_calls == result_2.add_requests_calls) + and (result_1.key_value_store_changes == result_2.key_value_store_changes) + ) + def push_data_only_comparator(result_1: RequestHandlerRunResult, result_2: RequestHandlerRunResult) -> bool: """Compare results by comparing their push dara calls. Ignore other parts of results in comparison.""" diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 36c4efda84..4724e27849 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -137,9 +137,11 @@ class _BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]): """A logger instance, typically provided by a subclass, for consistent logging labels. Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.""" + @docs_group('Data structures') class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext], TypedDict): """Generic options for basic crawler.""" + request_handler: NotRequired[Callable[[TCrawlingContext], Awaitable[None]]] """A callable responsible for handling requests.""" @@ -147,10 +149,11 @@ class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext], TypedDict): """Enables extending the request lifecycle and modifying the crawling context. 
Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.""" + @docs_group('Data structures') -class BasicCrawlerOptions(Generic[TCrawlingContext], - _BasicCrawlerOptions , - _BasicCrawlerOptionsGeneric[TCrawlingContext]): +class BasicCrawlerOptions( + Generic[TCrawlingContext], _BasicCrawlerOptions, _BasicCrawlerOptionsGeneric[TCrawlingContext] +): """Arguments for the `BasicCrawler` constructor. It is intended for typing forwarded `__init__` arguments in the subclasses. @@ -181,6 +184,7 @@ class BasicCrawler(Generic[TCrawlingContext]): - direct storage interaction helpers, - and more. """ + CRAWLEE_STATE_KEY = 'CRAWLEE_STATE' def __init__( @@ -574,9 +578,7 @@ async def add_requests( wait_for_all_requests_to_be_added_timeout=wait_for_all_requests_to_be_added_timeout, ) - async def _use_state( - self, default_value: dict[str, JsonSerializable] | None = None - ) -> dict[str, JsonSerializable]: + async def _use_state(self, default_value: dict[str, JsonSerializable] | None = None) -> dict[str, JsonSerializable]: store = await self.get_key_value_store() return await store.get_auto_saved_value(BasicCrawler.CRAWLEE_STATE_KEY, default_value) @@ -948,16 +950,12 @@ async def _commit_request_handler_result( await self._commit_key_value_store_changes(result) - - async def _commit_key_value_store_changes(self, result: RequestHandlerRunResult) -> None: for (id, name), changes in result.key_value_store_changes.items(): store = await self.get_key_value_store(id=id, name=name) for key, value in changes.updates.items(): await store.set_value(key, value.content, value.content_type) - - async def __is_finished_function(self) -> bool: self._stop_if_max_requests_count_exceeded() if self._unexpected_stop: @@ -1121,7 +1119,6 @@ async def __run_task_function(self) -> None: async def __run_request_handler(self, context: BasicCrawlingContext) -> None: await self._context_pipeline(context, self.router) - @property def crawl_one_required_contexts(self) -> list[AbstractAsyncContextManager]: """Contexts that have to be active before `crawl_one` can be called.""" @@ -1129,11 +1126,14 @@ def crawl_one_required_contexts(self) -> list[AbstractAsyncContextManager]: contexts.append(self.statistics) return contexts - async def crawl_one(self, *, context: BasicCrawlingContext, - request_handler_timeout: timedelta, - result: RequestHandlerRunResult, - use_state: dict[str,JsonSerializable] | None = None - ) -> RequestHandlerRunResult: + async def crawl_one( + self, + *, + context: BasicCrawlingContext, + request_handler_timeout: timedelta, + result: RequestHandlerRunResult, + use_state: dict[str, JsonSerializable] | None = None, + ) -> RequestHandlerRunResult: """Populate result by crawling one request from input `context`. Context callbacks are routed to `result` and are not commited. @@ -1149,9 +1149,12 @@ async def crawl_one(self, *, context: BasicCrawlingContext, Same input result object that is mutated in the process. """ if use_state is not None: - async def get_input_state(default_value: dict[str, JsonSerializable] | None = None # noqa:ARG001 # Intentionally unused arguments. Closure, that generates same output regardless of inputs. - ) -> dict[str, JsonSerializable]: + + async def get_input_state( + default_value: dict[str, JsonSerializable] | None = None, # noqa:ARG001 # Intentionally unused arguments. Closure, that generates same output regardless of inputs. 
+ ) -> dict[str, JsonSerializable]: return use_state + use_state_function = get_input_state else: use_state_function = context.use_state @@ -1165,14 +1168,14 @@ async def get_input_state(default_value: dict[str, JsonSerializable] | None = No push_data=result.push_data, get_key_value_store=result.get_key_value_store, use_state=use_state_function, - log=context.log + log=context.log, ) await wait_for( lambda: self.__run_request_handler(context_linked_to_result), timeout=request_handler_timeout, timeout_message='Request handler timed out after ' - f'{self._request_handler_timeout.total_seconds()} seconds', + f'{self._request_handler_timeout.total_seconds()} seconds', logger=self._logger, ) return result diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index cb3bd94ad1..da9c341461 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -287,7 +287,6 @@ def pre_navigation_hook(self, hook: Callable[[PlaywrightPreNavCrawlingContext], """ self._pre_navigation_hooks.append(hook) - @property def crawl_one_required_contexts(self) -> list[AbstractAsyncContextManager]: """Contexts that have to be active before `crawl_one` can be called.""" diff --git a/src/crawlee/statistics/_statistics.py b/src/crawlee/statistics/_statistics.py index 948ca4cd2a..09e9a5616a 100644 --- a/src/crawlee/statistics/_statistics.py +++ b/src/crawlee/statistics/_statistics.py @@ -252,17 +252,17 @@ async def _maybe_load_statistics(self) -> None: await self._load_other_statistics(self._key_value_store) async def _load_crawler_statistics(self, key_value_store: KeyValueStore) -> None: - stored_state = await key_value_store.get_value(self._persist_state_key, cast(Any, {})) + stored_state = await key_value_store.get_value(self._persist_state_key, cast(Any, {})) - saved_state = self.state.__class__.model_validate(stored_state) - self.state = saved_state + saved_state = self.state.__class__.model_validate(stored_state) + self.state = saved_state - if saved_state.stats_persisted_at is not None and saved_state.crawler_last_started_at: - self._instance_start = datetime.now(timezone.utc) - ( - saved_state.stats_persisted_at - saved_state.crawler_last_started_at - ) - elif saved_state.crawler_last_started_at: - self._instance_start = saved_state.crawler_last_started_at + if saved_state.stats_persisted_at is not None and saved_state.crawler_last_started_at: + self._instance_start = datetime.now(timezone.utc) - ( + saved_state.stats_persisted_at - saved_state.crawler_last_started_at + ) + elif saved_state.crawler_last_started_at: + self._instance_start = saved_state.crawler_last_started_at async def _load_other_statistics(self, key_value_store: KeyValueStore) -> None: """Load other statistics. Not implemented in Statistics.""" @@ -291,14 +291,12 @@ async def _persist_crawler_statistics(self, key_value_store: KeyValueStore) -> N async def _persist_other_statistics(self, key_value_store: KeyValueStore) -> None: """Save other statistics. 
Not implemented in Statistics.""" - async def _persist_state(self, event_data: EventPersistStateData) -> None: logger.debug(f'Persisting state of the Statistics (event_data={event_data}).') if self._persistence_enabled and self._key_value_store is not None and self._instance_start is not None: await self._persist_crawler_statistics(self._key_value_store) await self._persist_other_statistics(self._key_value_store) - def _save_retry_count_for_request(self, record: RequestProcessingRecord) -> None: retry_count = record.retry_count diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index a93ecbbe0c..8ac6c89dc6 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -28,13 +28,15 @@ from collections.abc import Iterator - class _SimpleRenderingTypePredictor(RenderingTypePredictor): """Simplified predictor for tests.""" - def __init__(self, rendering_types: Iterator[RenderingType] | None = None, - detection_probability_recommendation: None | Iterator[int] = None) -> None: - self._rendering_types = rendering_types or cycle(['static']) + def __init__( + self, + rendering_types: Iterator[RenderingType] | None = None, + detection_probability_recommendation: None | Iterator[int] = None, + ) -> None: + self._rendering_types = rendering_types or cycle(['static']) self._detection_probability_recommendation = detection_probability_recommendation or cycle([1]) @override @@ -46,26 +48,28 @@ def store_result(self, url: str, label: str | None, crawl_type: RenderingType) - pass - -@pytest.mark.parametrize(('expected_pw_count', 'expected_bs_count', 'rendering_types', - 'detection_probability_recommendation'), [ - pytest.param(0,2, cycle(['static']), cycle([0]), id='Static only'), - pytest.param(2,0, cycle(['client only']), cycle([0]), id='Client only'), - pytest.param(1,1, cycle(['static','client only']), cycle([0]),id='Mixed'), - pytest.param(2,2, cycle(['static','client only']), cycle([1]),id='Enforced rendering type detection'), -]) -async def test_adaptive_crawling(expected_pw_count: int, expected_bs_count: int, - rendering_types: Iterator[RenderingType], - detection_probability_recommendation: Iterator[int]) -> None: +@pytest.mark.parametrize( + ('expected_pw_count', 'expected_bs_count', 'rendering_types', 'detection_probability_recommendation'), + [ + pytest.param(0, 2, cycle(['static']), cycle([0]), id='Static only'), + pytest.param(2, 0, cycle(['client only']), cycle([0]), id='Client only'), + pytest.param(1, 1, cycle(['static', 'client only']), cycle([0]), id='Mixed'), + pytest.param(2, 2, cycle(['static', 'client only']), cycle([1]), id='Enforced rendering type detection'), + ], +) +async def test_adaptive_crawling( + expected_pw_count: int, + expected_bs_count: int, + rendering_types: Iterator[RenderingType], + detection_probability_recommendation: Iterator[int], +) -> None: """Tests correct routing to pre-nav hooks and correct handling through proper handler.""" requests = ['https://crawlee.dev/', 'https://crawlee.dev/docs/quick-start'] predictor = _SimpleRenderingTypePredictor( - rendering_types = rendering_types, - detection_probability_recommendation=detection_probability_recommendation + rendering_types=rendering_types, detection_probability_recommendation=detection_probability_recommendation ) - crawler = 
AdaptivePlaywrightCrawler(rendering_type_predictor=predictor) pw_handler_count = 0 @@ -86,7 +90,6 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: except AdaptiveContextError: bs_handler_count += 1 - @crawler.pre_navigation_hook_bs async def bs_hook(context: BasicCrawlingContext) -> None: # noqa:ARG001 # Intentionally unused arg nonlocal bs_hook_count @@ -97,7 +100,6 @@ async def pw_hook(context: PlaywrightPreNavCrawlingContext) -> None: # noqa:ARG nonlocal pw_hook_count pw_hook_count += 1 - await crawler.run(requests) assert pw_handler_count == expected_pw_count @@ -138,7 +140,6 @@ async def test_adaptive_crawling_result() -> None: requests = ['https://crawlee.dev/'] crawler = AdaptivePlaywrightCrawler(rendering_type_predictor=static_only_predictor_enforce_detection) - @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: try: @@ -157,13 +158,16 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: assert items == [{'handler': 'pw'}] - -@pytest.mark.parametrize(('pw_saved_data', 'bs_saved_data', 'expected_result_renderingl_type'), [ - pytest.param({'some': 'data'}, {'some': 'data'}, 'static', id='Same results from sub crawlers'), - pytest.param({'some': 'data'}, {'different': 'data'}, 'client only', id='Different results from sub crawlers'), -]) -async def test_adaptive_crawling_predictor_calls(pw_saved_data: dict[str, str], bs_saved_data: dict[str, str], - expected_result_renderingl_type: RenderingType) -> None: +@pytest.mark.parametrize( + ('pw_saved_data', 'bs_saved_data', 'expected_result_renderingl_type'), + [ + pytest.param({'some': 'data'}, {'some': 'data'}, 'static', id='Same results from sub crawlers'), + pytest.param({'some': 'data'}, {'different': 'data'}, 'client only', id='Different results from sub crawlers'), + ], +) +async def test_adaptive_crawling_predictor_calls( + pw_saved_data: dict[str, str], bs_saved_data: dict[str, str], expected_result_renderingl_type: RenderingType +) -> None: """Tests expected predictor calls. 
Same results.""" some_label = 'bla' some_url = 'https://crawlee.dev/' @@ -171,7 +175,6 @@ async def test_adaptive_crawling_predictor_calls(pw_saved_data: dict[str, str], requests = [Request.from_url(url=some_url, label=some_label)] crawler = AdaptivePlaywrightCrawler(rendering_type_predictor=static_only_predictor_enforce_detection) - @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: try: @@ -181,9 +184,12 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: except AdaptiveContextError: await context.push_data(bs_saved_data) - with (patch.object(static_only_predictor_enforce_detection, 'store_result', Mock()) as mocked_store_result, - patch.object(static_only_predictor_enforce_detection, 'predict', Mock( - return_value=RenderingTypePrediction('static', 1))) as mocked_predict): + with ( + patch.object(static_only_predictor_enforce_detection, 'store_result', Mock()) as mocked_store_result, + patch.object( + static_only_predictor_enforce_detection, 'predict', Mock(return_value=RenderingTypePrediction('static', 1)) + ) as mocked_predict, + ): await crawler.run(requests) mocked_predict.assert_called_once_with(some_url, some_label) @@ -205,7 +211,7 @@ async def test_adaptive_crawling_result_use_state_isolation() -> None: @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: nonlocal request_handler_calls - state = cast(dict[str, int],await context.use_state()) + state = cast(dict[str, int], await context.use_state()) request_handler_calls += 1 state['counter'] += 1 @@ -228,8 +234,10 @@ async def test_adaptive_crawling_statistics() -> None: static_only_predictor_no_detection = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])) - crawler = AdaptivePlaywrightCrawler(rendering_type_predictor=static_only_predictor_no_detection, - result_checker=lambda result: False) # noqa: ARG005 # Intentionally unused argument. + crawler = AdaptivePlaywrightCrawler( + rendering_type_predictor=static_only_predictor_no_detection, + result_checker=lambda result: False, # noqa: ARG005 # Intentionally unused argument. + ) @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: @@ -241,6 +249,7 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: assert crawler.adaptive_statistics.predictor_state.browser_request_handler_runs == 1 assert crawler.adaptive_statistics.predictor_state.rendering_type_mispredictions == 1 + def test_adaptive_default_hooks_raise_exception() -> None: """Trying to attach usual pre-navigation hook raises exception. 
@@ -249,16 +258,20 @@ def test_adaptive_default_hooks_raise_exception() -> None: crawler = AdaptivePlaywrightCrawler() with pytest.raises(RuntimeError): + @crawler.pre_navigation_hook async def some_hook(whatever: Any) -> None: pass -@pytest.mark.parametrize('error_in_pw_crawler', [ - pytest.param(False, id='Error only in bs sub crawler'), - pytest.param(True, id='Error in both sub crawlers'), -]) -async def test_adaptive_crawler_exceptions_in_sub_crawlers(*,error_in_pw_crawler: bool) -> None: +@pytest.mark.parametrize( + 'error_in_pw_crawler', + [ + pytest.param(False, id='Error only in bs sub crawler'), + pytest.param(True, id='Error in both sub crawlers'), + ], +) +async def test_adaptive_crawler_exceptions_in_sub_crawlers(*, error_in_pw_crawler: bool) -> None: """Test that correct results are commited when exceptions are raised in sub crawlers. Exception in bs sub crawler will be logged and pw sub crawler used instead. @@ -276,7 +289,6 @@ async def test_adaptive_crawler_exceptions_in_sub_crawlers(*,error_in_pw_crawle @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: - try: # page is available only if it was crawled by PlaywrightCrawler. context.page # noqa:B018 Intentionally "useless expression". Can trigger exception. @@ -291,7 +303,7 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: await crawler.run(requests) dataset = await crawler.get_dataset() - stored_results = [item async for item in dataset.iterate_items()] + stored_results = [item async for item in dataset.iterate_items()] if error_in_pw_crawler: assert stored_results == [] @@ -307,12 +319,14 @@ def test_adaptive_playwright_crawler_statistics_in_init() -> None: log_message = 'some message' periodic_message_logger = logging.getLogger('some logger') # Accessing private member to create copy like-object. log_interval = timedelta(minutes=2) - statistics = Statistics(persistence_enabled=persistence_enabled, - persist_state_kvs_name=persist_state_kvs_name, - persist_state_key=persist_state_key, - log_message=log_message, - periodic_message_logger=periodic_message_logger, - log_interval=log_interval) + statistics = Statistics( + persistence_enabled=persistence_enabled, + persist_state_kvs_name=persist_state_kvs_name, + persist_state_key=persist_state_key, + log_message=log_message, + periodic_message_logger=periodic_message_logger, + log_interval=log_interval, + ) crawler = AdaptivePlaywrightCrawler(statistics=statistics) diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler_statistics.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler_statistics.py index 24c180e168..c6bc7b9eaa 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler_statistics.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler_statistics.py @@ -21,14 +21,10 @@ async def test_predictor_state_persistence() -> None: # new_adaptive_statistics are initialized from the persisted values. 
async with AdaptivePlaywrightCrawlerStatistics( - persistence_enabled=True, - persist_state_key=persistence_state_key) as new_adaptive_statistics: + persistence_enabled=True, persist_state_key=persistence_state_key + ) as new_adaptive_statistics: pass assert new_adaptive_statistics.predictor_state.rendering_type_mispredictions == 2 assert new_adaptive_statistics.predictor_state.http_only_request_handler_runs == 3 assert new_adaptive_statistics.predictor_state.browser_request_handler_runs == 1 - - - - diff --git a/website/pydoc-markdown/generate_ast.py b/website/pydoc-markdown/generate_ast.py index 9163d1c117..8dfe580268 100644 --- a/website/pydoc-markdown/generate_ast.py +++ b/website/pydoc-markdown/generate_ast.py @@ -45,9 +45,4 @@ repo_root_path = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../')) -print( - json.dumps(dump, indent=4).replace( - repo_root_path, - 'REPO_ROOT_PLACEHOLDER' - ) -) +print(json.dumps(dump, indent=4).replace(repo_root_path, 'REPO_ROOT_PLACEHOLDER')) diff --git a/website/pydoc-markdown/google_docstring_processor.py b/website/pydoc-markdown/google_docstring_processor.py index 154462c4d2..9c6dcf41c2 100644 --- a/website/pydoc-markdown/google_docstring_processor.py +++ b/website/pydoc-markdown/google_docstring_processor.py @@ -79,37 +79,37 @@ class ApifyGoogleProcessor(Processor): """ _param_res = [ - re.compile(r"^(?P\S+):\s+(?P.+)$"), - re.compile(r"^(?P\S+)\s+\((?P[^)]+)\):\s+(?P.+)$"), - re.compile(r"^(?P\S+)\s+--\s+(?P.+)$"), - re.compile(r"^(?P\S+)\s+\{\[(?P\S+)\]\}\s+--\s+(?P.+)$"), - re.compile(r"^(?P\S+)\s+\{(?P\S+)\}\s+--\s+(?P.+)$"), + re.compile(r'^(?P\S+):\s+(?P.+)$'), + re.compile(r'^(?P\S+)\s+\((?P[^)]+)\):\s+(?P.+)$'), + re.compile(r'^(?P\S+)\s+--\s+(?P.+)$'), + re.compile(r'^(?P\S+)\s+\{\[(?P\S+)\]\}\s+--\s+(?P.+)$'), + re.compile(r'^(?P\S+)\s+\{(?P\S+)\}\s+--\s+(?P.+)$'), ] _keywords_map = { - "Args:": "Arguments", - "Arguments:": "Arguments", - "Attributes:": "Attributes", - "Example:": "Example", - "Examples:": "Examples", - "Keyword Args:": "Arguments", - "Keyword Arguments:": "Arguments", - "Methods:": "Methods", - "Note:": "Notes", - "Notes:": "Notes", - "Other Parameters:": "Arguments", - "Parameters:": "Arguments", - "Return:": "Returns", - "Returns:": "Returns", - "Raises:": "Raises", - "References:": "References", - "See Also:": "See Also", - "Todo:": "Todo", - "Warning:": "Warnings", - "Warnings:": "Warnings", - "Warns:": "Warns", - "Yield:": "Yields", - "Yields:": "Yields", + 'Args:': 'Arguments', + 'Arguments:': 'Arguments', + 'Attributes:': 'Attributes', + 'Example:': 'Example', + 'Examples:': 'Examples', + 'Keyword Args:': 'Arguments', + 'Keyword Arguments:': 'Arguments', + 'Methods:': 'Methods', + 'Note:': 'Notes', + 'Notes:': 'Notes', + 'Other Parameters:': 'Arguments', + 'Parameters:': 'Arguments', + 'Return:': 'Returns', + 'Returns:': 'Returns', + 'Raises:': 'Raises', + 'References:': 'References', + 'See Also:': 'See Also', + 'Todo:': 'Todo', + 'Warning:': 'Warnings', + 'Warnings:': 'Warnings', + 'Warns:': 'Warns', + 'Yield:': 'Yields', + 'Yields:': 'Yields', } def check_docstring_format(self, docstring: str) -> bool: @@ -139,9 +139,9 @@ def _commit(): lines.extend(current_lines) current_lines.clear() - for line in node.docstring.content.split("\n"): + for line in node.docstring.content.split('\n'): multiline_argument_offset += 1 - if line.lstrip().startswith("```"): + if line.lstrip().startswith('```'): in_codeblock = not in_codeblock current_lines.append(line) if not in_codeblock: @@ -171,15 
+171,16 @@ def _commit(): if not param_match: if multiline_argument_offset == 1: - current_lines[-1]["desc"] += "\n" + line + current_lines[-1]['desc'] += '\n' + line multiline_argument_offset = 0 else: current_lines.append(line) _commit() - node.docstring.content = json.dumps({ - "text": "\n".join(lines), - "sections": sections, - }, indent=None) - - + node.docstring.content = json.dumps( + { + 'text': '\n'.join(lines), + 'sections': sections, + }, + indent=None, + ) From 2fd7aae16950aff20876a47c875b1cda39c0fc77 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 8 Jan 2025 09:18:40 +0100 Subject: [PATCH 22/79] Remove BasicCrawler modifications. Add it in adaptive crawler instead at the cost of accessing many private members. --- .../_adaptive_playwright_crawler.py | 69 ++++++++++++++++--- src/crawlee/crawlers/_basic/_basic_crawler.py | 61 ---------------- .../_playwright/_playwright_crawler.py | 8 --- 3 files changed, 61 insertions(+), 77 deletions(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 141bc2f516..29af48af30 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -2,7 +2,7 @@ import asyncio import logging -from contextlib import AsyncExitStack +from contextlib import AbstractAsyncContextManager, AsyncExitStack from copy import deepcopy from logging import getLogger from random import random @@ -12,6 +12,7 @@ from crawlee._types import BasicCrawlingContext, JsonSerializable, RequestHandlerRunResult from crawlee._utils.docs import docs_group +from crawlee._utils.wait import wait_for from crawlee.crawlers import ( BasicCrawler, BeautifulSoupCrawler, @@ -41,6 +42,7 @@ if TYPE_CHECKING: from collections.abc import Awaitable, Callable, Mapping, Sequence + from datetime import timedelta from typing_extensions import NotRequired, Unpack @@ -177,6 +179,51 @@ async def request_handler_playwright(context: PlaywrightCrawlingContext) -> None super().__init__(**kwargs) + @staticmethod + async def crawl_one_with( + crawler: BeautifulSoupCrawler | PlaywrightCrawler, + context: BasicCrawlingContext, + timeout: timedelta, + result: RequestHandlerRunResult, + state: dict[str, JsonSerializable] | None = None, + ) -> RequestHandlerRunResult: + if state is not None: + + async def get_input_state( + default_value: dict[str, JsonSerializable] | None = None, # noqa:ARG001 # Intentionally unused arguments. Closure, that generates same output regardless of inputs. + ) -> dict[str, JsonSerializable]: + return state + + use_state_function = get_input_state + else: + use_state_function = context.use_state + + context_linked_to_result = BasicCrawlingContext( + request=context.request, + session=context.session, + proxy_info=context.proxy_info, + send_request=context.send_request, + add_requests=result.add_requests, + push_data=result.push_data, + get_key_value_store=result.get_key_value_store, + use_state=use_state_function, + log=context.log, + ) + + # Mypy needs type narrowing. + if type(crawler) is PlaywrightCrawler: + run_pipeline = crawler._context_pipeline(context_linked_to_result, crawler.router) # noqa:SLF001 # Intentional access to private member. + if type(crawler) is BeautifulSoupCrawler: + run_pipeline = crawler._context_pipeline(context_linked_to_result, crawler.router) # noqa:SLF001 # Intentional access to private member. 
+ + await wait_for( + lambda: run_pipeline, + timeout=timeout, + timeout_message=f'Sub crawler timed out after {timeout.total_seconds()} seconds', + logger=crawler._logger, # noqa:SLF001 # Intentional access to private member. + ) + return result + async def run( self, requests: Sequence[str | Request] | None = None, @@ -190,12 +237,17 @@ async def run( purge_request_queue: If this is `True` and the crawler is not being run for the first time, the default request queue will be purged. """ + required_contexts_pw_crawler: list[AbstractAsyncContextManager] = [ + self.playwright_crawler._statistics, # noqa:SLF001 # Intentional access to private member. + self.playwright_crawler._browser_pool, # noqa:SLF001 # Intentional access to private member. + ] + required_contexts_bs_crawler: list[AbstractAsyncContextManager] = [ + self.beautifulsoup_crawler._statistics, # noqa:SLF001 # Intentional access to private member. + ] + contexts_to_enter = [ cm - for cm in ( - self.beautifulsoup_crawler.crawl_one_required_contexts - + self.playwright_crawler.crawl_one_required_contexts - ) + for cm in (required_contexts_pw_crawler + required_contexts_bs_crawler) if cm and getattr(cm, 'active', False) is False ] @@ -228,11 +280,12 @@ async def _run_subcrawler( Produces `SubCrawlerRun` that either contains filled `RequestHandlerRunResult` or exception. """ try: - crawl_result = await crawler.crawl_one( + crawl_result = await self.crawl_one_with( + crawler=crawler, context=context, - request_handler_timeout=self._request_handler_timeout, + timeout=self._request_handler_timeout, result=RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store), - use_state=use_state, + state=use_state, ) return SubCrawlerRun(result=crawl_result) except Exception as e: diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 4724e27849..e3fa22af9a 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -1119,67 +1119,6 @@ async def __run_task_function(self) -> None: async def __run_request_handler(self, context: BasicCrawlingContext) -> None: await self._context_pipeline(context, self.router) - @property - def crawl_one_required_contexts(self) -> list[AbstractAsyncContextManager]: - """Contexts that have to be active before `crawl_one` can be called.""" - contexts: list[AbstractAsyncContextManager] = [] - contexts.append(self.statistics) - return contexts - - async def crawl_one( - self, - *, - context: BasicCrawlingContext, - request_handler_timeout: timedelta, - result: RequestHandlerRunResult, - use_state: dict[str, JsonSerializable] | None = None, - ) -> RequestHandlerRunResult: - """Populate result by crawling one request from input `context`. - - Context callbacks are routed to `result` and are not commited. - - Args: - context: Context used for crawling. It contains `request` that will be crawled. - request_handler_timeout: Timeout in seconds for request handling. - result: Record of calls to storage-related context helpers. - use_state: Existing state that will be used when `context.use_state` is called. - If none, take `use_state` from input `context`. - - Returns: - Same input result object that is mutated in the process. - """ - if use_state is not None: - - async def get_input_state( - default_value: dict[str, JsonSerializable] | None = None, # noqa:ARG001 # Intentionally unused arguments. Closure, that generates same output regardless of inputs. 
- ) -> dict[str, JsonSerializable]: - return use_state - - use_state_function = get_input_state - else: - use_state_function = context.use_state - - context_linked_to_result = BasicCrawlingContext( - request=context.request, - session=context.session, - proxy_info=context.proxy_info, - send_request=context.send_request, - add_requests=result.add_requests, - push_data=result.push_data, - get_key_value_store=result.get_key_value_store, - use_state=use_state_function, - log=context.log, - ) - - await wait_for( - lambda: self.__run_request_handler(context_linked_to_result), - timeout=request_handler_timeout, - timeout_message='Request handler timed out after ' - f'{self._request_handler_timeout.total_seconds()} seconds', - logger=self._logger, - ) - return result - def _is_session_blocked_status_code(self, session: Session | None, status_code: int) -> bool: """Check if the HTTP status code indicates that the session was blocked by the target website. diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index da9c341461..df1494f3a9 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -20,7 +20,6 @@ if TYPE_CHECKING: from collections.abc import AsyncGenerator, Awaitable, Mapping - from contextlib import AbstractAsyncContextManager from typing_extensions import Unpack @@ -286,10 +285,3 @@ def pre_navigation_hook(self, hook: Callable[[PlaywrightPreNavCrawlingContext], hook: A coroutine function to be called before each navigation. """ self._pre_navigation_hooks.append(hook) - - @property - def crawl_one_required_contexts(self) -> list[AbstractAsyncContextManager]: - """Contexts that have to be active before `crawl_one` can be called.""" - contexts = super().crawl_one_required_contexts - contexts.append(self._browser_pool) - return contexts From 714b5bd0378c105f594821fedb98caadb3fd2083 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 8 Jan 2025 09:56:38 +0100 Subject: [PATCH 23/79] Make _commit_result consistent with how other result components are handled. 
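
The gist, roughly: sub crawler handlers write into a result object instead of
touching the storages directly, and only the run that the adaptive crawler
decides to keep is replayed onto the real context. A self-contained sketch of
that record-and-replay pattern follows; the names (RecordedResult, commit) are
invented for illustration and are not the crawlee API.

    import asyncio
    from dataclasses import dataclass, field
    from typing import Any


    @dataclass
    class RecordedResult:
        # Storage calls are recorded here instead of being executed right away.
        push_data_calls: list[dict[str, Any]] = field(default_factory=list)
        kv_updates: dict[str, Any] = field(default_factory=dict)

        async def push_data(self, data: Any) -> None:
            self.push_data_calls.append({'data': data})

        async def set_value(self, key: str, value: Any) -> None:
            self.kv_updates[key] = value


    async def commit(result: RecordedResult, dataset: list[Any], kvs: dict[str, Any]) -> None:
        # Replay the recorded calls against the real storages in one place.
        for call in result.push_data_calls:
            dataset.append(call['data'])
        for key, value in result.kv_updates.items():
            kvs[key] = value


    async def main() -> None:
        result = RecordedResult()
        await result.push_data({'url': 'https://example.com'})
        await result.set_value('state', {'counter': 1})

        dataset: list[Any] = []
        kvs: dict[str, Any] = {}
        await commit(result, dataset, kvs)
        print(dataset, kvs)


    asyncio.run(main())
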
--- .../_adaptive_playwright_crawler.py | 11 ++++++----- src/crawlee/crawlers/_basic/_basic_crawler.py | 17 +++++++++++++---- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 29af48af30..2ae8ddfbd1 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -8,7 +8,7 @@ from random import random from typing import TYPE_CHECKING, Any -from IPython.core.completer import TypedDict +from typing_extensions import TypedDict from crawlee._types import BasicCrawlingContext, JsonSerializable, RequestHandlerRunResult from crawlee._utils.docs import docs_group @@ -304,7 +304,7 @@ async def _run_subcrawler( bs_run = await _run_subcrawler(self.beautifulsoup_crawler) if bs_run.result and self.result_checker(bs_run.result): - await self.commit_result(result=bs_run.result, context=context) + await self._commit_result(result=bs_run.result, context=context) return if bs_run.exception: context.log.exception( @@ -328,7 +328,7 @@ async def _run_subcrawler( raise pw_run.exception if pw_run.result: - await self.commit_result(result=pw_run.result, context=context) + await self._commit_result(result=pw_run.result, context=context) if should_detect_rendering_type: detection_result: RenderingType @@ -342,11 +342,12 @@ async def _run_subcrawler( context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}') self.rendering_type_predictor.store_result(context.request.url, context.request.label, detection_result) - async def commit_result(self, result: RequestHandlerRunResult, context: BasicCrawlingContext) -> None: + async def _commit_result(self, result: RequestHandlerRunResult, context: BasicCrawlingContext) -> None: + """Execute calls from `result` on the context.""" result_tasks = ( [asyncio.create_task(context.push_data(**kwargs)) for kwargs in result.push_data_calls] + [asyncio.create_task(context.add_requests(**kwargs)) for kwargs in result.add_requests_calls] - + [asyncio.create_task(self._commit_key_value_store_changes(result))] + + [asyncio.create_task(self._commit_key_value_store_changes(result, context.get_key_value_store))] ) await asyncio.gather(*result_tasks) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index e3fa22af9a..9ceafdb85f 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -22,7 +22,13 @@ from crawlee._autoscaling import AutoscaledPool, Snapshotter, SystemStatus from crawlee._log_config import configure_logger, get_configured_log_level from crawlee._request import Request, RequestState -from crawlee._types import BasicCrawlingContext, HttpHeaders, RequestHandlerRunResult, SendRequestFunction +from crawlee._types import ( + BasicCrawlingContext, + GetKeyValueStoreFromRequestHandlerFunction, + HttpHeaders, + RequestHandlerRunResult, + SendRequestFunction, +) from crawlee._utils.byte_size import ByteSize from crawlee._utils.docs import docs_group from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute @@ -948,11 +954,14 @@ async def _commit_request_handler_result( for push_data_call in result.push_data_calls: await self._push_data(**push_data_call) - await self._commit_key_value_store_changes(result) + await self._commit_key_value_store_changes(result, 
get_kvs = self.get_key_value_store) - async def _commit_key_value_store_changes(self, result: RequestHandlerRunResult) -> None: + @staticmethod + async def _commit_key_value_store_changes(result: RequestHandlerRunResult, + get_kvs: GetKeyValueStoreFromRequestHandlerFunction) -> None: + """Store key value store changes recorded in result.""" for (id, name), changes in result.key_value_store_changes.items(): - store = await self.get_key_value_store(id=id, name=name) + store = await get_kvs(id=id, name=name) for key, value in changes.updates.items(): await store.set_value(key, value.content, value.content_type) From b38dda1725707042950c7ed56ea8a8eccd1cb37d Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 8 Jan 2025 13:10:49 +0100 Subject: [PATCH 24/79] Remove subcrawlers and add _OrphanPipeline --- .../_adaptive_playwright_crawler.py | 116 +++++++++++------- src/crawlee/crawlers/_basic/_basic_crawler.py | 7 +- 2 files changed, 77 insertions(+), 46 deletions(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 2ae8ddfbd1..28c58c5fbd 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -4,6 +4,7 @@ import logging from contextlib import AbstractAsyncContextManager, AsyncExitStack from copy import deepcopy +from dataclasses import dataclass from logging import getLogger from random import random from typing import TYPE_CHECKING, Any @@ -18,6 +19,7 @@ BeautifulSoupCrawler, BeautifulSoupCrawlingContext, BeautifulSoupParserType, + ContextPipeline, PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext, @@ -41,7 +43,7 @@ from crawlee.statistics import Statistics if TYPE_CHECKING: - from collections.abc import Awaitable, Callable, Mapping, Sequence + from collections.abc import Awaitable, Callable, Coroutine, Mapping, Sequence from datetime import timedelta from typing_extensions import NotRequired, Unpack @@ -50,6 +52,7 @@ from crawlee.browsers import BrowserPool from crawlee.browsers._types import BrowserType from crawlee.crawlers._basic._basic_crawler import _BasicCrawlerOptions + from crawlee.router import Router from crawlee.statistics import FinalStatistics @@ -89,6 +92,38 @@ class _PlaywrightCrawlerAdditionalOptions(TypedDict): This option should not be used if `browser_pool` is provided.""" +@dataclass +class _OrphanPlaywrightContextPipeline: + pre_navigation_hook: Callable[[Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]]], None] + pipeline: ContextPipeline[PlaywrightCrawlingContext] + needed_contexts: list[AbstractAsyncContextManager] + top_router: Router[AdaptivePlaywrightCrawlingContext] + + def create_pipeline_call(self, top_context: BasicCrawlingContext) -> Coroutine[Any, Any, None]: + async def from_pw_to_router(context: PlaywrightCrawlingContext) -> None: + adaptive_crawling_context = await AdaptivePlaywrightCrawlingContext.from_playwright_crawling_context( + context=context, beautiful_soup_parser_type='lxml' + ) + await self.top_router(adaptive_crawling_context) + + return self.pipeline(top_context, from_pw_to_router) + + +@dataclass +class _OrphanBeautifulsoupContextPipeline: + pre_navigation_hook: Callable[[Callable[[BasicCrawlingContext], Awaitable[None]]], None] + pipeline: ContextPipeline[BeautifulSoupCrawlingContext] + needed_contexts: list[AbstractAsyncContextManager] + top_router: 
Router[AdaptivePlaywrightCrawlingContext] + + def create_pipeline_call(self, top_context: BasicCrawlingContext) -> Coroutine[Any, Any, None]: + async def from_pw_to_router(context: BeautifulSoupCrawlingContext) -> None: + adaptive_crawling_context = AdaptivePlaywrightCrawlingContext.from_beautifulsoup_crawling_context(context) + await self.top_router(adaptive_crawling_context) + + return self.pipeline(top_context, from_pw_to_router) + + class AdaptivePlaywrightCrawler(BasicCrawler[AdaptivePlaywrightCrawlingContext]): """Adaptive crawler that uses both `BeautifulSoupCrawler` and `PlaywrightCrawler`. @@ -142,6 +177,8 @@ def __init__( # AdaptivePlaywrightCrawlerStatistics specific methods can be access in "type safe manner". self.adaptive_statistics = statistics + super().__init__(**kwargs) + # Sub crawlers related. beautifulsoup_crawler_kwargs = beautifulsoup_crawler_kwargs or {} beautifulsoup_crawler_kwargs.setdefault('parser', 'lxml') @@ -160,28 +197,34 @@ def __init__( bs_kwargs['statistics'] = Statistics(periodic_message_logger=bs_logger) pw_kwargs['statistics'] = Statistics(periodic_message_logger=pw_logger) - self.beautifulsoup_crawler = BeautifulSoupCrawler(**beautifulsoup_crawler_kwargs, **bs_kwargs) - self.playwright_crawler = PlaywrightCrawler(**playwright_crawler_args, **pw_kwargs) - - @self.beautifulsoup_crawler.router.default_handler - async def request_handler_beautiful_soup(context: BeautifulSoupCrawlingContext) -> None: - """Handler for routing from beautifulsoup_crawler to adaptive_crawler handler.""" - adaptive_crawling_context = AdaptivePlaywrightCrawlingContext.from_beautifulsoup_crawling_context(context) - await self.router(adaptive_crawling_context) + # Initialize sub crawlers to create their pipelines. + beautifulsoup_crawler = BeautifulSoupCrawler(**beautifulsoup_crawler_kwargs, **bs_kwargs) + playwright_crawler = PlaywrightCrawler(**playwright_crawler_args, **pw_kwargs) - @self.playwright_crawler.router.default_handler - async def request_handler_playwright(context: PlaywrightCrawlingContext) -> None: - """Handler for routing from playwright_crawler to adaptive_crawler handler.""" - adaptive_crawling_context = await AdaptivePlaywrightCrawlingContext.from_playwright_crawling_context( - context=context, beautiful_soup_parser_type=beautifulsoup_crawler_kwargs['parser'] - ) - await self.router(adaptive_crawling_context) + required_contexts_pw_crawler: list[AbstractAsyncContextManager] = [ + playwright_crawler._statistics, # noqa:SLF001 # Intentional access to private member. + playwright_crawler._browser_pool, # noqa:SLF001 # Intentional access to private member. + ] + required_contexts_bs_crawler: list[AbstractAsyncContextManager] = [ + beautifulsoup_crawler._statistics, # noqa:SLF001 # Intentional access to private member. + ] - super().__init__(**kwargs) + self._pw_context_pipeline = _OrphanPlaywrightContextPipeline( + pipeline=playwright_crawler._context_pipeline, # noqa:SLF001 # Intentional access to private member. + needed_contexts=required_contexts_pw_crawler, + top_router=self.router, + pre_navigation_hook=playwright_crawler.pre_navigation_hook, + ) + self._bs_context_pipeline = _OrphanBeautifulsoupContextPipeline( + pipeline=beautifulsoup_crawler._context_pipeline, # noqa:SLF001 # Intentional access to private member. 
+ needed_contexts=required_contexts_bs_crawler, + top_router=self.router, + pre_navigation_hook=beautifulsoup_crawler.pre_navigation_hook, + ) - @staticmethod async def crawl_one_with( - crawler: BeautifulSoupCrawler | PlaywrightCrawler, + self, + subcrawler_pipeline: _OrphanPlaywrightContextPipeline | _OrphanBeautifulsoupContextPipeline, context: BasicCrawlingContext, timeout: timedelta, result: RequestHandlerRunResult, @@ -210,17 +253,11 @@ async def get_input_state( log=context.log, ) - # Mypy needs type narrowing. - if type(crawler) is PlaywrightCrawler: - run_pipeline = crawler._context_pipeline(context_linked_to_result, crawler.router) # noqa:SLF001 # Intentional access to private member. - if type(crawler) is BeautifulSoupCrawler: - run_pipeline = crawler._context_pipeline(context_linked_to_result, crawler.router) # noqa:SLF001 # Intentional access to private member. - await wait_for( - lambda: run_pipeline, + lambda: subcrawler_pipeline.create_pipeline_call(context_linked_to_result), timeout=timeout, timeout_message=f'Sub crawler timed out after {timeout.total_seconds()} seconds', - logger=crawler._logger, # noqa:SLF001 # Intentional access to private member. + logger=self._logger, ) return result @@ -237,17 +274,9 @@ async def run( purge_request_queue: If this is `True` and the crawler is not being run for the first time, the default request queue will be purged. """ - required_contexts_pw_crawler: list[AbstractAsyncContextManager] = [ - self.playwright_crawler._statistics, # noqa:SLF001 # Intentional access to private member. - self.playwright_crawler._browser_pool, # noqa:SLF001 # Intentional access to private member. - ] - required_contexts_bs_crawler: list[AbstractAsyncContextManager] = [ - self.beautifulsoup_crawler._statistics, # noqa:SLF001 # Intentional access to private member. - ] - contexts_to_enter = [ cm - for cm in (required_contexts_pw_crawler + required_contexts_bs_crawler) + for cm in self._bs_context_pipeline.needed_contexts + self._pw_context_pipeline.needed_contexts if cm and getattr(cm, 'active', False) is False ] @@ -273,7 +302,8 @@ async def _BasicCrawler__run_request_handler(self, context: BasicCrawlingContext """ async def _run_subcrawler( - crawler: BeautifulSoupCrawler | PlaywrightCrawler, use_state: dict | None = None + subcrawler_pipeline: _OrphanPlaywrightContextPipeline | _OrphanBeautifulsoupContextPipeline, + use_state: dict | None = None, ) -> SubCrawlerRun: """Helper closure that creates new `RequestHandlerRunResult` and delegates request handling to sub crawler. 
@@ -281,7 +311,7 @@ async def _run_subcrawler( """ try: crawl_result = await self.crawl_one_with( - crawler=crawler, + subcrawler_pipeline=subcrawler_pipeline, context=context, timeout=self._request_handler_timeout, result=RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store), @@ -302,7 +332,7 @@ async def _run_subcrawler( context.log.debug(f'Running static request for {context.request.url}') self.adaptive_statistics.track_http_only_request_handler_runs() - bs_run = await _run_subcrawler(self.beautifulsoup_crawler) + bs_run = await _run_subcrawler(self._bs_context_pipeline) if bs_run.result and self.result_checker(bs_run.result): await self._commit_result(result=bs_run.result, context=context) return @@ -321,7 +351,7 @@ async def _run_subcrawler( old_state: dict[str, JsonSerializable] = await kvs.get_value(BasicCrawler.CRAWLEE_STATE_KEY, default_value) old_state_copy = deepcopy(old_state) - pw_run = await _run_subcrawler(self.playwright_crawler) + pw_run = await _run_subcrawler(self._pw_context_pipeline) self.adaptive_statistics.track_browser_request_handler_runs() if pw_run.exception is not None: @@ -332,7 +362,7 @@ async def _run_subcrawler( if should_detect_rendering_type: detection_result: RenderingType - bs_run = await _run_subcrawler(self.beautifulsoup_crawler, use_state=old_state_copy) + bs_run = await _run_subcrawler(self._bs_context_pipeline, use_state=old_state_copy) if bs_run.result and self.result_comparator(bs_run.result, pw_run.result): detection_result = 'static' @@ -363,8 +393,8 @@ def pre_navigation_hook(self, hook: Callable[[Any], Awaitable[None]]) -> None: def pre_navigation_hook_pw(self, hook: Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]]) -> None: """Pre navigation hooks for playwright sub crawler of adaptive crawler.""" - self.playwright_crawler.pre_navigation_hook(hook) + self._pw_context_pipeline.pre_navigation_hook(hook) def pre_navigation_hook_bs(self, hook: Callable[[BasicCrawlingContext], Awaitable[None]]) -> None: """Pre navigation hooks for beautifulsoup sub crawler of adaptive crawler.""" - self.beautifulsoup_crawler.pre_navigation_hook(hook) + self._bs_context_pipeline.pre_navigation_hook(hook) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 9ceafdb85f..2d41b86d36 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -954,11 +954,12 @@ async def _commit_request_handler_result( for push_data_call in result.push_data_calls: await self._push_data(**push_data_call) - await self._commit_key_value_store_changes(result, get_kvs = self.get_key_value_store) + await self._commit_key_value_store_changes(result, get_kvs=self.get_key_value_store) @staticmethod - async def _commit_key_value_store_changes(result: RequestHandlerRunResult, - get_kvs: GetKeyValueStoreFromRequestHandlerFunction) -> None: + async def _commit_key_value_store_changes( + result: RequestHandlerRunResult, get_kvs: GetKeyValueStoreFromRequestHandlerFunction + ) -> None: """Store key value store changes recorded in result.""" for (id, name), changes in result.key_value_store_changes.items(): store = await get_kvs(id=id, name=name) From ffb2a78a8fbcd07ae64bf322f5b7973b3617114a Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 8 Jan 2025 13:32:54 +0100 Subject: [PATCH 25/79] Use dummy statistics in subcrawlers. Use different url for unit tests. 
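
The sub crawlers still need something that behaves like an async-context-managed
statistics object, but letting them count requests would only duplicate the
top-level numbers. A minimal sketch of that kind of no-op stand-in is below;
NoOpStatistics is an invented name, not the class added in this commit.

    from __future__ import annotations

    import asyncio
    from types import TracebackType


    class NoOpStatistics:
        """Quacks like an async-context-managed statistics object but records nothing."""

        def __init__(self) -> None:
            self.active = False

        async def __aenter__(self) -> NoOpStatistics:
            # Only flip the flag; skip persistence, logging and periodic reporting.
            self.active = True
            return self

        async def __aexit__(
            self,
            exc_type: type[BaseException] | None,
            exc_value: BaseException | None,
            exc_traceback: TracebackType | None,
        ) -> None:
            self.active = False


    async def main() -> None:
        async with NoOpStatistics() as stats:
            assert stats.active
        assert not stats.active


    asyncio.run(main())
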
--- .../_adaptive_playwright_crawler.py | 25 ++++++++++++++++--- .../crawlers/_adaptive_playwright/_example.py | 4 +-- .../test_adaptive_playwright_crawler.py | 17 +++++++------ 3 files changed, 33 insertions(+), 13 deletions(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 28c58c5fbd..2b6a1c59ea 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -9,7 +9,7 @@ from random import random from typing import TYPE_CHECKING, Any -from typing_extensions import TypedDict +from typing_extensions import Self, TypedDict from crawlee._types import BasicCrawlingContext, JsonSerializable, RequestHandlerRunResult from crawlee._utils.docs import docs_group @@ -45,6 +45,7 @@ if TYPE_CHECKING: from collections.abc import Awaitable, Callable, Coroutine, Mapping, Sequence from datetime import timedelta + from types import TracebackType from typing_extensions import NotRequired, Unpack @@ -92,6 +93,22 @@ class _PlaywrightCrawlerAdditionalOptions(TypedDict): This option should not be used if `browser_pool` is provided.""" +class _NoActiveStatistics(Statistics): + """Statistics compliant object that is not supposed to do anything when active. To be used in sub crawlers.""" + + async def __aenter__(self) -> Self: + self._active = True + return self + + async def __aexit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + exc_traceback: TracebackType | None, + ) -> None: + self._active = False + + @dataclass class _OrphanPlaywrightContextPipeline: pre_navigation_hook: Callable[[Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]]], None] @@ -193,9 +210,9 @@ def __init__( pw_logger.setLevel(logging.ERROR) pw_kwargs['_logger'] = pw_logger - # Each sub crawler will use own statistics. - bs_kwargs['statistics'] = Statistics(periodic_message_logger=bs_logger) - pw_kwargs['statistics'] = Statistics(periodic_message_logger=pw_logger) + # Each sub crawler will use own dummy statistics. + bs_kwargs['statistics'] = _NoActiveStatistics() + pw_kwargs['statistics'] = _NoActiveStatistics() # Initialize sub crawlers to create their pipelines. beautifulsoup_crawler = BeautifulSoupCrawler(**beautifulsoup_crawler_kwargs, **bs_kwargs) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_example.py b/src/crawlee/crawlers/_adaptive_playwright/_example.py index e355ef27f2..ef5b99bfb0 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_example.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_example.py @@ -11,7 +11,7 @@ async def main() -> None: - # TODO: remove in review. Move this to documentation examples instead. + # remove in review. Move this to documentation examples instead. top_logger = getLogger(__name__) top_logger.setLevel(logging.DEBUG) i = 0 @@ -38,7 +38,7 @@ async def pw_hook(context: PlaywrightPreNavCrawlingContext) -> None: context.log.info(f'PW pre navigation hook for: {context.request.url} ...') # Run the crawler with the initial list of URLs. 
- await crawler.run(['https://crawlee.dev/']) + await crawler.run(['https://warehouse-theme-metal.myshopify.com/']) if __name__ == '__main__': diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index 8ac6c89dc6..b180ab0818 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -64,7 +64,10 @@ async def test_adaptive_crawling( detection_probability_recommendation: Iterator[int], ) -> None: """Tests correct routing to pre-nav hooks and correct handling through proper handler.""" - requests = ['https://crawlee.dev/', 'https://crawlee.dev/docs/quick-start'] + requests = [ + 'https://warehouse-theme-metal.myshopify.com/', + 'https://warehouse-theme-metal.myshopify.com/collections', + ] predictor = _SimpleRenderingTypePredictor( rendering_types=rendering_types, detection_probability_recommendation=detection_probability_recommendation @@ -112,7 +115,7 @@ async def pw_hook(context: PlaywrightPreNavCrawlingContext) -> None: # noqa:ARG async def test_adaptive_crawling_context() -> None: """Tests that correct context is used. Enforced rendering type detection to run both sub crawlers.""" static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor() - requests = ['https://crawlee.dev/'] + requests = ['https://warehouse-theme-metal.myshopify.com/'] crawler = AdaptivePlaywrightCrawler(rendering_type_predictor=static_only_predictor_enforce_detection) @crawler.router.default_handler @@ -137,7 +140,7 @@ async def test_adaptive_crawling_result() -> None: Enforced rendering type detection to run both sub crawlers.""" static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor() - requests = ['https://crawlee.dev/'] + requests = ['https://warehouse-theme-metal.myshopify.com/'] crawler = AdaptivePlaywrightCrawler(rendering_type_predictor=static_only_predictor_enforce_detection) @crawler.router.default_handler @@ -170,7 +173,7 @@ async def test_adaptive_crawling_predictor_calls( ) -> None: """Tests expected predictor calls. Same results.""" some_label = 'bla' - some_url = 'https://crawlee.dev/' + some_url = 'https://warehouse-theme-metal.myshopify.com/' static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor() requests = [Request.from_url(url=some_url, label=some_label)] crawler = AdaptivePlaywrightCrawler(rendering_type_predictor=static_only_predictor_enforce_detection) @@ -202,7 +205,7 @@ async def test_adaptive_crawling_result_use_state_isolation() -> None: Enforced rendering type detection to run both sub crawlers.""" static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor() - requests = ['https://crawlee.dev/'] + requests = ['https://warehouse-theme-metal.myshopify.com/'] crawler = AdaptivePlaywrightCrawler(rendering_type_predictor=static_only_predictor_enforce_detection) store = await crawler.get_key_value_store() await store.set_value(BasicCrawler.CRAWLEE_STATE_KEY, {'counter': 0}) @@ -230,7 +233,7 @@ async def test_adaptive_crawling_statistics() -> None: Crawler set to static crawling, but due to result_checker returning False on static crawling result it will do browser crawling instead well. 
This increments all three adaptive crawling related stats.""" - requests = ['https://crawlee.dev/'] + requests = ['https://warehouse-theme-metal.myshopify.com/'] static_only_predictor_no_detection = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])) @@ -281,7 +284,7 @@ async def test_adaptive_crawler_exceptions_in_sub_crawlers(*, error_in_pw_crawle Exception in pw sub crawler will prevent any result from being commited. Even if `push_data` was called before the exception """ - requests = ['https://crawlee.dev/'] + requests = ['https://warehouse-theme-metal.myshopify.com/'] static_only_no_detection_predictor = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])) crawler = AdaptivePlaywrightCrawler(rendering_type_predictor=static_only_no_detection_predictor) From 3b0522843d0b79942dcaa4b12aa39ffe477ca479 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 8 Jan 2025 13:41:31 +0100 Subject: [PATCH 26/79] Keep predictor related functions on predictor_state --- .../_adaptive_playwright_crawler.py | 8 ++++---- .../_adaptive_playwright_crawler_statistics.py | 18 +++++++++--------- .../test_adaptive_playwright_crawler.py | 6 +++--- ...t_adaptive_playwright_crawler_statistics.py | 12 ++++++------ 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 2b6a1c59ea..e39039992e 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -192,7 +192,7 @@ def __init__( # self.statistics is hard coded in BasicCrawler to Statistics, so even when we save children class in it, mypy # will complain about using child-specific methods. Save same object to another attribute so that # AdaptivePlaywrightCrawlerStatistics specific methods can be access in "type safe manner". 
- self.adaptive_statistics = statistics + self.predictor_state = statistics.predictor_state super().__init__(**kwargs) @@ -347,7 +347,7 @@ async def _run_subcrawler( ) if rendering_type_prediction.rendering_type == 'static': context.log.debug(f'Running static request for {context.request.url}') - self.adaptive_statistics.track_http_only_request_handler_runs() + self.predictor_state.track_http_only_request_handler_runs() bs_run = await _run_subcrawler(self._bs_context_pipeline) if bs_run.result and self.result_checker(bs_run.result): @@ -359,7 +359,7 @@ async def _run_subcrawler( ) else: context.log.warning(f'Static crawler: returned a suspicious result for {context.request.url}') - self.adaptive_statistics.track_rendering_type_mispredictions() + self.predictor_state.track_rendering_type_mispredictions() context.log.debug(f'Running browser request handler for {context.request.url}') @@ -369,7 +369,7 @@ async def _run_subcrawler( old_state_copy = deepcopy(old_state) pw_run = await _run_subcrawler(self._pw_context_pipeline) - self.adaptive_statistics.track_browser_request_handler_runs() + self.predictor_state.track_browser_request_handler_runs() if pw_run.exception is not None: raise pw_run.exception diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py index 1391e2e587..9ff2ec887a 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py @@ -25,6 +25,15 @@ class PredictorState(BaseModel): browser_request_handler_runs: Annotated[int, Field(alias='browser_request_handler_runs')] = 0 rendering_type_mispredictions: Annotated[int, Field(alias='rendering_type_mispredictions')] = 0 + def track_http_only_request_handler_runs(self) -> None: + self.http_only_request_handler_runs += 1 + + def track_browser_request_handler_runs(self) -> None: + self.browser_request_handler_runs += 1 + + def track_rendering_type_mispredictions(self) -> None: + self.rendering_type_mispredictions += 1 + @docs_group('Classes') class AdaptivePlaywrightCrawlerStatistics(Statistics): @@ -66,15 +75,6 @@ def from_statistics(cls, statistics: Statistics) -> Self: state_model=statistics._state_model, # noqa:SLF001 # Accessing private member to create copy like-object. 
) - def track_http_only_request_handler_runs(self) -> None: - self.predictor_state.http_only_request_handler_runs += 1 - - def track_browser_request_handler_runs(self) -> None: - self.predictor_state.browser_request_handler_runs += 1 - - def track_rendering_type_mispredictions(self) -> None: - self.predictor_state.rendering_type_mispredictions += 1 - @override async def _persist_other_statistics(self, key_value_store: KeyValueStore) -> None: """Persist state of predictor.""" diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index b180ab0818..e901aaaa57 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -248,9 +248,9 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: await crawler.run(requests) - assert crawler.adaptive_statistics.predictor_state.http_only_request_handler_runs == 1 - assert crawler.adaptive_statistics.predictor_state.browser_request_handler_runs == 1 - assert crawler.adaptive_statistics.predictor_state.rendering_type_mispredictions == 1 + assert crawler.predictor_state.http_only_request_handler_runs == 1 + assert crawler.predictor_state.browser_request_handler_runs == 1 + assert crawler.predictor_state.rendering_type_mispredictions == 1 def test_adaptive_default_hooks_raise_exception() -> None: diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler_statistics.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler_statistics.py index c6bc7b9eaa..a885597d20 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler_statistics.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler_statistics.py @@ -7,14 +7,14 @@ async def test_predictor_state_persistence() -> None: """Test that adaptive statistics can be correctly persisted and initialized from persisted values.""" async with AdaptivePlaywrightCrawlerStatistics(persistence_enabled=True) as adaptive_statistics: - adaptive_statistics.track_rendering_type_mispredictions() - adaptive_statistics.track_rendering_type_mispredictions() + adaptive_statistics.predictor_state.track_rendering_type_mispredictions() + adaptive_statistics.predictor_state.track_rendering_type_mispredictions() - adaptive_statistics.track_http_only_request_handler_runs() - adaptive_statistics.track_http_only_request_handler_runs() - adaptive_statistics.track_http_only_request_handler_runs() + adaptive_statistics.predictor_state.track_http_only_request_handler_runs() + adaptive_statistics.predictor_state.track_http_only_request_handler_runs() + adaptive_statistics.predictor_state.track_http_only_request_handler_runs() - adaptive_statistics.track_browser_request_handler_runs() + adaptive_statistics.predictor_state.track_browser_request_handler_runs() persistence_state_key = adaptive_statistics._persist_state_key # adaptive_statistics are persisted after leaving the context From dc06490b3623bb29a0413e3ccddd47d7c7f9fb97 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 8 Jan 2025 16:50:03 +0100 Subject: [PATCH 27/79] Unify pre-nav hooks. (By temporal wrapper context.) 
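
With a single shared pre-navigation hook, a hook body has to tolerate the page
being absent when the static pipeline runs it. A self-contained sketch of that
usage pattern follows; MissingPageError and PreNavContext are invented stand-ins
for the real crawlee types.

    from __future__ import annotations

    import asyncio
    from dataclasses import dataclass


    class MissingPageError(Exception):
        """Raised when no browser page exists (the static path)."""


    @dataclass
    class PreNavContext:
        url: str
        browser_page: object | None = None  # set only when the browser pipeline runs

        @property
        def page(self) -> object:
            if self.browser_page is None:
                raise MissingPageError('Page is only available in the browser pipeline.')
            return self.browser_page


    async def hook(context: PreNavContext) -> None:
        # One hook serves both pipelines; the static path simply skips page setup.
        try:
            _ = context.page
            print(f'browser pre-nav hook for {context.url}')
        except MissingPageError:
            print(f'static pre-nav hook for {context.url}')


    asyncio.run(hook(PreNavContext(url='https://example.com')))
    asyncio.run(hook(PreNavContext(url='https://example.com', browser_page=object())))
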
--- .../_adaptive_playwright_crawler.py | 28 +++----- .../_adaptive_playwright_crawling_context.py | 65 ++++++++++++++++- .../crawlers/_adaptive_playwright/_example.py | 16 ++--- .../test_adaptive_playwright_crawler.py | 70 +++++++++---------- 4 files changed, 114 insertions(+), 65 deletions(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index e39039992e..b9f1d6b26e 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -30,6 +30,7 @@ ) from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import ( AdaptivePlaywrightCrawlingContext, + AdaptivePlaywrightPreNavCrawlingContext, ) from crawlee.crawlers._adaptive_playwright._rendering_type_predictor import ( DefaultRenderingTypePredictor, @@ -259,9 +260,9 @@ async def get_input_state( use_state_function = context.use_state context_linked_to_result = BasicCrawlingContext( - request=context.request, - session=context.session, - proxy_info=context.proxy_info, + request=deepcopy(context.request), + session=deepcopy(context.session), + proxy_info=deepcopy(context.proxy_info), send_request=context.send_request, add_requests=result.add_requests, push_data=result.push_data, @@ -400,18 +401,11 @@ async def _commit_result(self, result: RequestHandlerRunResult, context: BasicCr await asyncio.gather(*result_tasks) def pre_navigation_hook(self, hook: Callable[[Any], Awaitable[None]]) -> None: - """Pre navigation hooks for adaptive crawler are delegated to sub crawlers.""" - raise RuntimeError( - 'Pre navigation hooks are ambiguous in adaptive crawling context. Use specific hook instead:' - '`pre_navigation_hook_pw` for playwright sub crawler related hooks or' - '`pre_navigation_hook_bs`for beautifulsoup sub crawler related hooks. \n' - f'{hook=} will not be used!!!' - ) - - def pre_navigation_hook_pw(self, hook: Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]]) -> None: - """Pre navigation hooks for playwright sub crawler of adaptive crawler.""" - self._pw_context_pipeline.pre_navigation_hook(hook) + """Pre navigation hooks for adaptive crawler are delegated to sub crawlers. - def pre_navigation_hook_bs(self, hook: Callable[[BasicCrawlingContext], Awaitable[None]]) -> None: - """Pre navigation hooks for beautifulsoup sub crawler of adaptive crawler.""" - self._bs_context_pipeline.pre_navigation_hook(hook) + Hooks are wrapped in context that handles possibly missing `page` object by throwing `AdaptiveContextError`. + Hooks that try to access `context.page` will have to catch this exception if triggered by static pipeline. 
+ """ + wrapped_hook = AdaptivePlaywrightPreNavCrawlingContext.wrap_hook_in_temporal_adaptive_context(hook) + self._pw_context_pipeline.pre_navigation_hook(wrapped_hook) + self._bs_context_pipeline.pre_navigation_hook(wrapped_hook) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py index 796aa68b02..48a5dee102 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py @@ -1,13 +1,19 @@ from __future__ import annotations from dataclasses import dataclass, fields -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any, cast from bs4 import BeautifulSoup -from crawlee import HttpHeaders +from crawlee import HttpHeaders, Request +from crawlee._types import BasicCrawlingContext from crawlee._utils.docs import docs_group -from crawlee.crawlers import BeautifulSoupCrawlingContext, BeautifulSoupParserType, PlaywrightCrawlingContext +from crawlee.crawlers import ( + BeautifulSoupCrawlingContext, + BeautifulSoupParserType, + PlaywrightCrawlingContext, + PlaywrightPreNavCrawlingContext, +) if TYPE_CHECKING: from collections.abc import Awaitable, Callable @@ -72,6 +78,59 @@ async def from_playwright_crawling_context( ) +class AdaptivePlaywrightPreNavCrawlingContext: + """This is just wrapper around BasicCrawlingContext or AdaptivePlaywrightCrawlingContext. + + Changes done to this context are done to the wrapped context. + Trying to access `page` on this context will raise AdaptiveContextError if wrapped context is BasicCrawlingContext. + """ + + def __init__(self, context: BasicCrawlingContext | AdaptivePlaywrightCrawlingContext) -> None: + self._context = context + + @property + def page(self) -> Page: + if type(self._context) is PlaywrightPreNavCrawlingContext: + return self._context.page + raise AdaptiveContextError('Page is not crawled with PlaywrightCrawler.') + + @property + def request(self) -> Request: + return self._context.request + + def __getattr__(self, name: str) -> Any: + if name == '_context': + return super().__getattribute__(name) + + if name == 'page' and type(self._context) is BasicCrawlingContext: + raise AdaptiveContextError('Page is not crawled with PlaywrightCrawler.') + + return getattr(self._context, name) + + def __setattr__(self, name: str, value: Any) -> None: + """Maybe not needed if all allowed changes on context are mutation and not assignment???""" + if name == '_context': + super().__setattr__(name, value) + return + setattr(self._context, name, value) + + @classmethod + def wrap_hook_in_temporal_adaptive_context( + cls, hook: Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]] + ) -> Callable[[PlaywrightPreNavCrawlingContext | BasicCrawlingContext], Awaitable[None]]: + """This will wrap context entering hook functions in unified AdaptivePlaywrightPreNavCrawlingContext. + + Any changes done to this wrapped context happens to the original context. + """ + + def wrapped_hook(context: PlaywrightPreNavCrawlingContext | BasicCrawlingContext) -> Awaitable[None]: + # This cast is done to hide PlaywrightPreNavCrawlingContext from user. 
+ wrapped_context = cast(PlaywrightPreNavCrawlingContext, cls(context)) + return hook(wrapped_context) + + return wrapped_hook + + @dataclass(frozen=True) class _HttpResponse: http_version: str diff --git a/src/crawlee/crawlers/_adaptive_playwright/_example.py b/src/crawlee/crawlers/_adaptive_playwright/_example.py index ef5b99bfb0..4d90170e1a 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_example.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_example.py @@ -2,10 +2,10 @@ import logging from logging import getLogger -from crawlee._types import BasicCrawlingContext from crawlee.crawlers import PlaywrightPreNavCrawlingContext from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler import AdaptivePlaywrightCrawler from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import ( + AdaptiveContextError, AdaptivePlaywrightCrawlingContext, ) @@ -29,13 +29,13 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: await context.push_data({'Top crwaler Url': context.request.url}) await context.use_state({'bla': i}) - @crawler.pre_navigation_hook_bs - async def bs_hook(context: BasicCrawlingContext) -> None: - context.log.info(f'BS pre navigation hook for: {context.request.url} ...') - - @crawler.pre_navigation_hook_pw - async def pw_hook(context: PlaywrightPreNavCrawlingContext) -> None: - context.log.info(f'PW pre navigation hook for: {context.request.url} ...') + @crawler.pre_navigation_hook + async def hook(context: PlaywrightPreNavCrawlingContext) -> None: + try: + context.page # noqa:B018 Intentionally "useless expression". Can trigger exception. + context.log.info(f'PW pre navigation hook for: {context.request.url} ...') + except AdaptiveContextError: + context.log.info(f'BS pre navigation hook for: {context.request.url} ...') # Run the crawler with the initial list of URLs. 
await crawler.run(['https://warehouse-theme-metal.myshopify.com/']) diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index e901aaaa57..39185b7467 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -3,14 +3,13 @@ import logging from datetime import timedelta from itertools import cycle -from typing import TYPE_CHECKING, Any, cast +from typing import TYPE_CHECKING, cast from unittest.mock import Mock, patch import pytest from typing_extensions import override from crawlee import Request -from crawlee._types import BasicCrawlingContext from crawlee.crawlers import BasicCrawler, PlaywrightPreNavCrawlingContext from crawlee.crawlers._adaptive_playwright import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler_statistics import ( @@ -93,15 +92,17 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: except AdaptiveContextError: bs_handler_count += 1 - @crawler.pre_navigation_hook_bs - async def bs_hook(context: BasicCrawlingContext) -> None: # noqa:ARG001 # Intentionally unused arg + @crawler.pre_navigation_hook + async def bs_hook(context: PlaywrightPreNavCrawlingContext) -> None: # Intentionally unused arg nonlocal bs_hook_count - bs_hook_count += 1 - - @crawler.pre_navigation_hook_pw - async def pw_hook(context: PlaywrightPreNavCrawlingContext) -> None: # noqa:ARG001 # Intentionally unused arg nonlocal pw_hook_count - pw_hook_count += 1 + + try: + # page is available only if it was crawled by PlaywrightCrawler. + context.page # noqa:B018 Intentionally "useless expression". Can trigger exception. + pw_hook_count += 1 + except AdaptiveContextError: + bs_hook_count += 1 await crawler.run(requests) @@ -112,27 +113,33 @@ async def pw_hook(context: PlaywrightPreNavCrawlingContext) -> None: # noqa:ARG assert bs_hook_count == expected_bs_count -async def test_adaptive_crawling_context() -> None: - """Tests that correct context is used. 
Enforced rendering type detection to run both sub crawlers.""" +async def test_adaptive_crawling_pre_nav_change_to_context() -> None: + """Tests that context can be modified in pre-navigation hooks.""" static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor() - requests = ['https://warehouse-theme-metal.myshopify.com/'] + crawler = AdaptivePlaywrightCrawler(rendering_type_predictor=static_only_predictor_enforce_detection) + user_data_in_pre_nav_hook = [] + user_data_in_handler = [] @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: - assert context.request.url == requests[0] - - @crawler.pre_navigation_hook_bs - async def bs_hook(context: BasicCrawlingContext) -> None: - assert type(context) is BasicCrawlingContext - assert context.request.url == requests[0] + user_data_in_handler.append(context.request.user_data.get('data', None)) - @crawler.pre_navigation_hook_pw - async def pw_hook(context: PlaywrightPreNavCrawlingContext) -> None: - assert type(context) is PlaywrightPreNavCrawlingContext - assert context.request.url == requests[0] + @crawler.pre_navigation_hook + async def pre_nav_hook(context: PlaywrightPreNavCrawlingContext) -> None: + user_data_in_pre_nav_hook.append(context.request.user_data.get('data', None)) + try: + # page is available only if it was crawled by PlaywrightCrawler. + context.page # noqa:B018 Intentionally "useless expression". Can trigger exception. + context.request.user_data['data'] = 'pw' + except AdaptiveContextError: + context.request.user_data['data'] = 'bs' - await crawler.run(requests) + await crawler.run(['https://warehouse-theme-metal.myshopify.com/']) + # Check that pre nav hooks does not influence each other + assert user_data_in_pre_nav_hook == [None, None] + # Check that pre nav hooks can modify context + assert user_data_in_handler == ['pw', 'bs'] async def test_adaptive_crawling_result() -> None: @@ -253,20 +260,6 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: assert crawler.predictor_state.rendering_type_mispredictions == 1 -def test_adaptive_default_hooks_raise_exception() -> None: - """Trying to attach usual pre-navigation hook raises exception. - - It is ambiguous and so sub crawler specific hooks should be used instead.""" - - crawler = AdaptivePlaywrightCrawler() - - with pytest.raises(RuntimeError): - - @crawler.pre_navigation_hook - async def some_hook(whatever: Any) -> None: - pass - - @pytest.mark.parametrize( 'error_in_pw_crawler', [ @@ -340,3 +333,6 @@ def test_adaptive_playwright_crawler_statistics_in_init() -> None: assert crawler._statistics._log_message == log_message assert crawler._statistics._periodic_message_logger == periodic_message_logger assert crawler._statistics._log_interval == log_interval + + +# Add more tests for setattr, if needed at all and pre nav hooks From 0766b7ad5845930bc346bbcbff8b89b8ae56f0bf Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 9 Jan 2025 09:15:44 +0100 Subject: [PATCH 28/79] Simplify pre-nav hook common context. 
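
The simplified wrapper is built by copying the fields of whichever pre-navigation
context arrives into a frozen dataclass that adds an optional page on top. A loose
sketch of that field-copying trick with made-up context classes (BaseContext,
PreNavContext), not the real ones:

    from __future__ import annotations

    from dataclasses import dataclass, fields


    @dataclass(frozen=True)
    class BaseContext:
        url: str
        label: str | None = None


    @dataclass(frozen=True)
    class PreNavContext(BaseContext):
        # Extra optional field, filled only when a browser created the context.
        page: object | None = None

        @classmethod
        def from_base(cls, context: BaseContext, page: object | None = None) -> PreNavContext:
            # Copy every field of the source context and add the optional page on top.
            kwargs = {f.name: getattr(context, f.name) for f in fields(context)}
            return cls(**kwargs, page=page)


    print(PreNavContext.from_base(BaseContext(url='https://example.com')))
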
--- .../_adaptive_playwright_crawler.py | 19 +++++-- .../_adaptive_playwright_crawling_context.py | 57 +++++-------------- .../crawlers/_adaptive_playwright/_example.py | 4 +- .../test_adaptive_playwright_crawler.py | 14 ++--- 4 files changed, 38 insertions(+), 56 deletions(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index b9f1d6b26e..e388ddbc45 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -400,12 +400,23 @@ async def _commit_result(self, result: RequestHandlerRunResult, context: BasicCr await asyncio.gather(*result_tasks) - def pre_navigation_hook(self, hook: Callable[[Any], Awaitable[None]]) -> None: + def pre_navigation_hook( + self, + hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]], + ) -> None: """Pre navigation hooks for adaptive crawler are delegated to sub crawlers. Hooks are wrapped in context that handles possibly missing `page` object by throwing `AdaptiveContextError`. Hooks that try to access `context.page` will have to catch this exception if triggered by static pipeline. """ - wrapped_hook = AdaptivePlaywrightPreNavCrawlingContext.wrap_hook_in_temporal_adaptive_context(hook) - self._pw_context_pipeline.pre_navigation_hook(wrapped_hook) - self._bs_context_pipeline.pre_navigation_hook(wrapped_hook) + + def wrapped_bs_hook(context: BasicCrawlingContext) -> Awaitable[None]: + wrapped_context = AdaptivePlaywrightPreNavCrawlingContext.from_pre_navigation_contexts(context) + return hook(wrapped_context) + + def wrapped_pw_hook(context: PlaywrightPreNavCrawlingContext) -> Awaitable[None]: + wrapped_context = AdaptivePlaywrightPreNavCrawlingContext.from_pre_navigation_contexts(context) + return hook(wrapped_context) + + self._pw_context_pipeline.pre_navigation_hook(wrapped_pw_hook) + self._bs_context_pipeline.pre_navigation_hook(wrapped_bs_hook) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py index 48a5dee102..86f34eb5f6 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py @@ -1,18 +1,17 @@ from __future__ import annotations from dataclasses import dataclass, fields -from typing import TYPE_CHECKING, Any, cast +from typing import TYPE_CHECKING from bs4 import BeautifulSoup -from crawlee import HttpHeaders, Request +from crawlee import HttpHeaders from crawlee._types import BasicCrawlingContext from crawlee._utils.docs import docs_group from crawlee.crawlers import ( BeautifulSoupCrawlingContext, BeautifulSoupParserType, PlaywrightCrawlingContext, - PlaywrightPreNavCrawlingContext, ) if TYPE_CHECKING: @@ -78,57 +77,29 @@ async def from_playwright_crawling_context( ) -class AdaptivePlaywrightPreNavCrawlingContext: +@dataclass(frozen=True) +class AdaptivePlaywrightPreNavCrawlingContext(BasicCrawlingContext): """This is just wrapper around BasicCrawlingContext or AdaptivePlaywrightCrawlingContext. - Changes done to this context are done to the wrapped context. Trying to access `page` on this context will raise AdaptiveContextError if wrapped context is BasicCrawlingContext. 
""" - def __init__(self, context: BasicCrawlingContext | AdaptivePlaywrightCrawlingContext) -> None: - self._context = context + _page: Page | None = None @property def page(self) -> Page: - if type(self._context) is PlaywrightPreNavCrawlingContext: - return self._context.page + if self._page is not None: + return self._page raise AdaptiveContextError('Page is not crawled with PlaywrightCrawler.') - @property - def request(self) -> Request: - return self._context.request - - def __getattr__(self, name: str) -> Any: - if name == '_context': - return super().__getattribute__(name) - - if name == 'page' and type(self._context) is BasicCrawlingContext: - raise AdaptiveContextError('Page is not crawled with PlaywrightCrawler.') - - return getattr(self._context, name) - - def __setattr__(self, name: str, value: Any) -> None: - """Maybe not needed if all allowed changes on context are mutation and not assignment???""" - if name == '_context': - super().__setattr__(name, value) - return - setattr(self._context, name, value) - @classmethod - def wrap_hook_in_temporal_adaptive_context( - cls, hook: Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]] - ) -> Callable[[PlaywrightPreNavCrawlingContext | BasicCrawlingContext], Awaitable[None]]: - """This will wrap context entering hook functions in unified AdaptivePlaywrightPreNavCrawlingContext. - - Any changes done to this wrapped context happens to the original context. - """ - - def wrapped_hook(context: PlaywrightPreNavCrawlingContext | BasicCrawlingContext) -> Awaitable[None]: - # This cast is done to hide PlaywrightPreNavCrawlingContext from user. - wrapped_context = cast(PlaywrightPreNavCrawlingContext, cls(context)) - return hook(wrapped_context) - - return wrapped_hook + def from_pre_navigation_contexts( + cls, context: AdaptivePlaywrightPreNavCrawlingContext | BasicCrawlingContext + ) -> Self: + """Convenience constructor that creates new context from existing pre navigation contexts.""" + context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)} + context_kwargs['_page'] = context_kwargs.pop('page', None) + return cls(**context_kwargs) @dataclass(frozen=True) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_example.py b/src/crawlee/crawlers/_adaptive_playwright/_example.py index 4d90170e1a..c707d244c8 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_example.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_example.py @@ -2,11 +2,11 @@ import logging from logging import getLogger -from crawlee.crawlers import PlaywrightPreNavCrawlingContext from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler import AdaptivePlaywrightCrawler from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import ( AdaptiveContextError, AdaptivePlaywrightCrawlingContext, + AdaptivePlaywrightPreNavCrawlingContext, ) @@ -30,7 +30,7 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: await context.use_state({'bla': i}) @crawler.pre_navigation_hook - async def hook(context: PlaywrightPreNavCrawlingContext) -> None: + async def hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: try: context.page # noqa:B018 Intentionally "useless expression". Can trigger exception. 
context.log.info(f'PW pre navigation hook for: {context.request.url} ...') diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index 39185b7467..bf19615da2 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -10,12 +10,15 @@ from typing_extensions import override from crawlee import Request -from crawlee.crawlers import BasicCrawler, PlaywrightPreNavCrawlingContext +from crawlee.crawlers import BasicCrawler from crawlee.crawlers._adaptive_playwright import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler_statistics import ( AdaptivePlaywrightCrawlerStatistics, ) -from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import AdaptiveContextError +from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import ( + AdaptiveContextError, + AdaptivePlaywrightPreNavCrawlingContext, +) from crawlee.crawlers._adaptive_playwright._rendering_type_predictor import ( RenderingType, RenderingTypePrediction, @@ -93,7 +96,7 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: bs_handler_count += 1 @crawler.pre_navigation_hook - async def bs_hook(context: PlaywrightPreNavCrawlingContext) -> None: # Intentionally unused arg + async def pre_nav_hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: # Intentionally unused arg nonlocal bs_hook_count nonlocal pw_hook_count @@ -126,7 +129,7 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: user_data_in_handler.append(context.request.user_data.get('data', None)) @crawler.pre_navigation_hook - async def pre_nav_hook(context: PlaywrightPreNavCrawlingContext) -> None: + async def pre_nav_hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: user_data_in_pre_nav_hook.append(context.request.user_data.get('data', None)) try: # page is available only if it was crawled by PlaywrightCrawler. @@ -333,6 +336,3 @@ def test_adaptive_playwright_crawler_statistics_in_init() -> None: assert crawler._statistics._log_message == log_message assert crawler._statistics._periodic_message_logger == periodic_message_logger assert crawler._statistics._log_interval == log_interval - - -# Add more tests for setattr, if needed at all and pre nav hooks From 4a63c2c9b23fc95e5f15d136636bb15fc1f4184f Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 9 Jan 2025 15:08:57 +0100 Subject: [PATCH 29/79] Make static crawling part of AdaptiveCrawler generic. This adds some complexity, but adds more flexibility. 
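[Editorial sketch] The commit above makes the static part of the adaptive crawler generic over a parser type and binds it through a class factory (create_parsed_http_crawler_class). The following is a minimal, self-contained sketch of that class-factory pattern only; Parser, UpperCaseParser, Crawler and create_parsed_crawler_class are simplified stand-ins for illustration, not the real crawlee classes.

# Sketch of the class-factory pattern: bind a generic crawler class to one concrete parser.
from __future__ import annotations

from typing import Generic, TypeVar

TParseResult = TypeVar('TParseResult')


class Parser(Generic[TParseResult]):
    """Stand-in for AbstractHttpParser[TParseResult]."""

    def parse(self, raw: str) -> TParseResult:
        raise NotImplementedError


class UpperCaseParser(Parser[str]):
    def parse(self, raw: str) -> str:
        return raw.upper()


class Crawler(Generic[TParseResult]):
    """Stand-in for the generic static crawler."""

    def __init__(self, parser: Parser[TParseResult]) -> None:
        self.parser = parser


def create_parsed_crawler_class(static_parser: Parser[TParseResult]) -> type[Crawler[TParseResult]]:
    """Return a Crawler subclass whose default parser is already bound to `static_parser`."""

    class _ParsedCrawler(Crawler[TParseResult]):
        def __init__(self, parser: Parser[TParseResult] = static_parser) -> None:
            super().__init__(parser=parser)

    return _ParsedCrawler


# The caller can instantiate the returned class without re-specifying the parser.
crawler_class = create_parsed_crawler_class(UpperCaseParser())
print(crawler_class().parser.parse('hello'))  # -> HELLO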
--- .../_abstract_http/_abstract_http_crawler.py | 25 +++ .../_adaptive_playwright_crawler.py | 192 +++++++++++------- .../_adaptive_playwright_crawling_context.py | 25 ++- .../crawlers/_adaptive_playwright/_example.py | 10 +- .../test_adaptive_playwright_crawler.py | 26 ++- 5 files changed, 190 insertions(+), 88 deletions(-) diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py index 7685e09570..6514a70cc1 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py @@ -90,6 +90,31 @@ def __init__( kwargs.setdefault('_logger', logging.getLogger(__name__)) super().__init__(**kwargs) + @staticmethod + def create_parsed_http_crawler_class( + static_parser: AbstractHttpParser[TParseResult], + ) -> type[AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult]]: + """Convenience class factory that creates specific version of `AbstractHttpCrawler` class. + + In general typing sense two generic types of `AbstractHttpCrawler` do not have to be dependent on each other. + This is convenience constructor for specific cases when `TParseResult` is used to specify both generic + parameters in `AbstractHttpCrawler`. + """ + + class _ParsedHttpCrawler(AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult]): + def __init__( + self, + parser: AbstractHttpParser[TParseResult] = static_parser, + **kwargs: Unpack[HttpCrawlerOptions[ParsedHttpCrawlingContext[TParseResult]]], + ) -> None: + kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline() + super().__init__( + parser=parser, + **kwargs, + ) + + return _ParsedHttpCrawler + def _create_static_content_crawler_pipeline(self) -> ContextPipeline[ParsedHttpCrawlingContext[TParseResult]]: """Create static content crawler context pipeline with expected pipeline steps.""" return ( diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index e388ddbc45..16f537ad5b 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -7,24 +7,28 @@ from dataclasses import dataclass from logging import getLogger from random import random -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Generic -from typing_extensions import Self, TypedDict +from bs4 import BeautifulSoup +from parsel import Selector +from typing_extensions import Self, TypedDict, TypeVar from crawlee._types import BasicCrawlingContext, JsonSerializable, RequestHandlerRunResult from crawlee._utils.docs import docs_group from crawlee._utils.wait import wait_for from crawlee.crawlers import ( + AbstractHttpCrawler, + AbstractHttpParser, BasicCrawler, - BeautifulSoupCrawler, BeautifulSoupCrawlingContext, BeautifulSoupParserType, ContextPipeline, + ParsedHttpCrawlingContext, + ParselCrawlingContext, PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext, ) -from crawlee.crawlers._abstract_http._abstract_http_crawler import _HttpCrawlerOptions from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler_statistics import ( AdaptivePlaywrightCrawlerStatistics, ) @@ -41,6 +45,8 @@ SubCrawlerRun, create_comparator, ) +from crawlee.crawlers._beautifulsoup._beautifulsoup_parser import BeautifulSoupParser +from 
crawlee.crawlers._parsel._parsel_parser import ParselParser from crawlee.statistics import Statistics if TYPE_CHECKING: @@ -53,17 +59,14 @@ from crawlee import Request from crawlee.browsers import BrowserPool from crawlee.browsers._types import BrowserType + from crawlee.crawlers._abstract_http._abstract_http_crawler import _HttpCrawlerOptions from crawlee.crawlers._basic._basic_crawler import _BasicCrawlerOptions from crawlee.router import Router from crawlee.statistics import FinalStatistics -@docs_group('Data structures') -class _BeautifulsoupCrawlerAdditionalOptions(_HttpCrawlerOptions): - """Additional options that can be specified for BeautifulsoupCrawler.""" - - parser: NotRequired[BeautifulSoupParserType] - """Parser type used by BeautifulSoup.""" +TStaticParseResult = TypeVar('TStaticParseResult') +TStaticCrawlingContext = TypeVar('TStaticCrawlingContext', bound=ParsedHttpCrawlingContext) @docs_group('Data structures') @@ -111,16 +114,17 @@ async def __aexit__( @dataclass -class _OrphanPlaywrightContextPipeline: +class _OrphanPlaywrightContextPipeline(Generic[TStaticParseResult]): pre_navigation_hook: Callable[[Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]]], None] pipeline: ContextPipeline[PlaywrightCrawlingContext] needed_contexts: list[AbstractAsyncContextManager] top_router: Router[AdaptivePlaywrightCrawlingContext] + static_parser: AbstractHttpParser[TStaticParseResult] def create_pipeline_call(self, top_context: BasicCrawlingContext) -> Coroutine[Any, Any, None]: async def from_pw_to_router(context: PlaywrightCrawlingContext) -> None: adaptive_crawling_context = await AdaptivePlaywrightCrawlingContext.from_playwright_crawling_context( - context=context, beautiful_soup_parser_type='lxml' + context=context, parser=self.static_parser ) await self.top_router(adaptive_crawling_context) @@ -128,22 +132,24 @@ async def from_pw_to_router(context: PlaywrightCrawlingContext) -> None: @dataclass -class _OrphanBeautifulsoupContextPipeline: +class _OrphanStaticContextPipeline(Generic[TStaticCrawlingContext]): pre_navigation_hook: Callable[[Callable[[BasicCrawlingContext], Awaitable[None]]], None] - pipeline: ContextPipeline[BeautifulSoupCrawlingContext] + pipeline: ContextPipeline[TStaticCrawlingContext] needed_contexts: list[AbstractAsyncContextManager] top_router: Router[AdaptivePlaywrightCrawlingContext] def create_pipeline_call(self, top_context: BasicCrawlingContext) -> Coroutine[Any, Any, None]: - async def from_pw_to_router(context: BeautifulSoupCrawlingContext) -> None: - adaptive_crawling_context = AdaptivePlaywrightCrawlingContext.from_beautifulsoup_crawling_context(context) + async def from_pw_to_router(context: TStaticCrawlingContext) -> None: + adaptive_crawling_context = AdaptivePlaywrightCrawlingContext.from_parsed_http_crawling_context(context) await self.top_router(adaptive_crawling_context) return self.pipeline(top_context, from_pw_to_router) -class AdaptivePlaywrightCrawler(BasicCrawler[AdaptivePlaywrightCrawlingContext]): - """Adaptive crawler that uses both `BeautifulSoupCrawler` and `PlaywrightCrawler`. +class AdaptivePlaywrightCrawler( + Generic[TStaticCrawlingContext, TStaticParseResult], BasicCrawler[AdaptivePlaywrightCrawlingContext] +): + """Adaptive crawler that uses both specific implementation of `AbstractHttpCrawler` and `PlaywrightCrawler`. 
It tries to detect whether it is sufficient to crawl with `BeautifulSoupCrawler` (which is faster) or if `PlaywrightCrawler` should be used (in case `BeautifulSoupCrawler` did not work as expected for specific url.). @@ -153,11 +159,13 @@ class AdaptivePlaywrightCrawler(BasicCrawler[AdaptivePlaywrightCrawlingContext]) def __init__( self, + *, + static_parser: AbstractHttpParser[TStaticParseResult], rendering_type_predictor: RenderingTypePredictor | None = None, result_checker: Callable[[RequestHandlerRunResult], bool] | None = None, result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None, - beautifulsoup_crawler_kwargs: _BeautifulsoupCrawlerAdditionalOptions | None = None, - playwright_crawler_args: _PlaywrightCrawlerAdditionalOptions | None = None, + static_crawler_specific_kwargs: _HttpCrawlerOptions | None = None, + playwright_crawler_specific_kwargs: _PlaywrightCrawlerAdditionalOptions | None = None, **kwargs: Unpack[_BasicCrawlerOptions], ) -> None: """A default constructor. @@ -167,22 +175,21 @@ def __init__( rendering method should be used. If None, then `DefaultRenderingTypePredictor` is used. result_checker: Function that evaluates whether crawling result is valid or not. result_comparator: Function that compares two crawling results and decides whether they are equivalent. - beautifulsoup_crawler_kwargs: BeautifulsoupCrawler only kwargs that are passed to the sub crawler. - playwright_crawler_args: PlaywrightCrawler only kwargs that are passed to the sub crawler. + static_parser: Implementation of `AbstractHttpParser`. Parser that will be used for static crawling. + static_crawler_specific_kwargs: `AbstractHttpCrawler` only kwargs that are passed to the sub crawler. + playwright_crawler_specific_kwargs: `PlaywrightCrawler` only kwargs that are passed to the sub crawler. kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`. """ # Some sub crawler kwargs are internally modified. Prepare copies. - bs_kwargs = deepcopy(kwargs) - pw_kwargs = deepcopy(kwargs) + basic_crawler_kwargs_for_static_crawler = deepcopy(kwargs) + basic_crawler_kwargs_for_pw_crawler = deepcopy(kwargs) # Adaptive crawling related. self.rendering_type_predictor = rendering_type_predictor or DefaultRenderingTypePredictor() self.result_checker = result_checker or (lambda result: True) # noqa: ARG005 # Intentionally unused argument. - self.result_comparator = result_comparator or create_comparator(result_checker) # Use AdaptivePlaywrightCrawlerStatistics. - # Very hard to work with current "fake generic" Statistics. TODO: Discuss best approach. if 'statistics' in kwargs: # If statistics already specified by user, create AdaptivePlaywrightCrawlerStatistics from it. statistics = AdaptivePlaywrightCrawlerStatistics.from_statistics(statistics=kwargs['statistics']) @@ -190,41 +197,43 @@ def __init__( statistics = AdaptivePlaywrightCrawlerStatistics() kwargs['statistics'] = statistics - # self.statistics is hard coded in BasicCrawler to Statistics, so even when we save children class in it, mypy - # will complain about using child-specific methods. Save same object to another attribute so that - # AdaptivePlaywrightCrawlerStatistics specific methods can be access in "type safe manner". self.predictor_state = statistics.predictor_state super().__init__(**kwargs) # Sub crawlers related. 
- beautifulsoup_crawler_kwargs = beautifulsoup_crawler_kwargs or {} - beautifulsoup_crawler_kwargs.setdefault('parser', 'lxml') - playwright_crawler_args = playwright_crawler_args or {} + playwright_crawler_specific_kwargs = playwright_crawler_specific_kwargs or {} + static_crawler_specific_kwargs = static_crawler_specific_kwargs or {} # Each sub crawler will use custom logger . - bs_logger = getLogger('Subcrawler_BS') - bs_logger.setLevel(logging.ERROR) - bs_kwargs['_logger'] = bs_logger + static_logger = getLogger('Subcrawler_static') + static_logger.setLevel(logging.ERROR) + basic_crawler_kwargs_for_static_crawler['_logger'] = static_logger - pw_logger = getLogger('Subcrawler_PW') + pw_logger = getLogger('Subcrawler_playwright') pw_logger.setLevel(logging.ERROR) - pw_kwargs['_logger'] = pw_logger + basic_crawler_kwargs_for_pw_crawler['_logger'] = pw_logger # Each sub crawler will use own dummy statistics. - bs_kwargs['statistics'] = _NoActiveStatistics() - pw_kwargs['statistics'] = _NoActiveStatistics() + basic_crawler_kwargs_for_static_crawler['statistics'] = _NoActiveStatistics() + basic_crawler_kwargs_for_pw_crawler['statistics'] = _NoActiveStatistics() # Initialize sub crawlers to create their pipelines. - beautifulsoup_crawler = BeautifulSoupCrawler(**beautifulsoup_crawler_kwargs, **bs_kwargs) - playwright_crawler = PlaywrightCrawler(**playwright_crawler_args, **pw_kwargs) + static_crawler_class = AbstractHttpCrawler.create_parsed_http_crawler_class(static_parser=static_parser) + + static_crawler = static_crawler_class( + parser=static_parser, **static_crawler_specific_kwargs, **basic_crawler_kwargs_for_static_crawler + ) + playwright_crawler = PlaywrightCrawler( + **playwright_crawler_specific_kwargs, **basic_crawler_kwargs_for_pw_crawler + ) required_contexts_pw_crawler: list[AbstractAsyncContextManager] = [ playwright_crawler._statistics, # noqa:SLF001 # Intentional access to private member. playwright_crawler._browser_pool, # noqa:SLF001 # Intentional access to private member. ] - required_contexts_bs_crawler: list[AbstractAsyncContextManager] = [ - beautifulsoup_crawler._statistics, # noqa:SLF001 # Intentional access to private member. + required_contexts_static_crawler: list[AbstractAsyncContextManager] = [ + static_crawler._statistics, # noqa:SLF001 # Intentional access to private member. ] self._pw_context_pipeline = _OrphanPlaywrightContextPipeline( @@ -232,17 +241,61 @@ def __init__( needed_contexts=required_contexts_pw_crawler, top_router=self.router, pre_navigation_hook=playwright_crawler.pre_navigation_hook, + static_parser=static_parser, ) - self._bs_context_pipeline = _OrphanBeautifulsoupContextPipeline( - pipeline=beautifulsoup_crawler._context_pipeline, # noqa:SLF001 # Intentional access to private member. - needed_contexts=required_contexts_bs_crawler, + self._static_context_pipeline = _OrphanStaticContextPipeline[ParsedHttpCrawlingContext[TStaticParseResult]]( + pipeline=static_crawler._context_pipeline, # noqa:SLF001 # Intentional access to private member. 
+ needed_contexts=required_contexts_static_crawler, top_router=self.router, - pre_navigation_hook=beautifulsoup_crawler.pre_navigation_hook, + pre_navigation_hook=static_crawler.pre_navigation_hook, + ) + + @staticmethod + def with_beautifulsoup_static_parser( + rendering_type_predictor: RenderingTypePredictor | None = None, + result_checker: Callable[[RequestHandlerRunResult], bool] | None = None, + result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None, + parser_type: BeautifulSoupParserType = 'lxml', + static_crawler_specific_kwargs: _HttpCrawlerOptions[BeautifulSoupCrawlingContext] | None = None, + playwright_crawler_specific_kwargs: _PlaywrightCrawlerAdditionalOptions | None = None, + **kwargs: Unpack[_BasicCrawlerOptions], + ) -> AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[BeautifulSoup], BeautifulSoup]: + """Creates `AdaptivePlaywrightCrawler` that uses `BeautifulSoup` for parsing static content.""" + parser_kwargs = {'parser': parser_type} if parser_type else {} + + return AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[BeautifulSoup], BeautifulSoup]( + rendering_type_predictor=rendering_type_predictor, + result_checker=result_checker, + result_comparator=result_comparator, + static_parser=BeautifulSoupParser(**parser_kwargs), + static_crawler_specific_kwargs=static_crawler_specific_kwargs, + playwright_crawler_specific_kwargs=playwright_crawler_specific_kwargs, + **kwargs, + ) + + @staticmethod + def with_parsel_static_parser( + rendering_type_predictor: RenderingTypePredictor | None = None, + result_checker: Callable[[RequestHandlerRunResult], bool] | None = None, + result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None, + static_crawler_specific_kwargs: _HttpCrawlerOptions[ParselCrawlingContext] | None = None, + playwright_crawler_specific_kwargs: _PlaywrightCrawlerAdditionalOptions | None = None, + **kwargs: Unpack[_BasicCrawlerOptions], + ) -> AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[Selector], Selector]: + """Creates `AdaptivePlaywrightCrawler` that uses `Parcel` for parsing static content.""" + return AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[Selector], Selector]( + rendering_type_predictor=rendering_type_predictor, + result_checker=result_checker, + result_comparator=result_comparator, + static_parser=ParselParser(), + static_crawler_specific_kwargs=static_crawler_specific_kwargs, + playwright_crawler_specific_kwargs=playwright_crawler_specific_kwargs, + **kwargs, ) async def crawl_one_with( self, - subcrawler_pipeline: _OrphanPlaywrightContextPipeline | _OrphanBeautifulsoupContextPipeline, + subcrawler_pipeline: _OrphanPlaywrightContextPipeline | _OrphanStaticContextPipeline, context: BasicCrawlingContext, timeout: timedelta, result: RequestHandlerRunResult, @@ -294,7 +347,7 @@ async def run( """ contexts_to_enter = [ cm - for cm in self._bs_context_pipeline.needed_contexts + self._pw_context_pipeline.needed_contexts + for cm in self._static_context_pipeline.needed_contexts + self._pw_context_pipeline.needed_contexts if cm and getattr(cm, 'active', False) is False ] @@ -320,7 +373,7 @@ async def _BasicCrawler__run_request_handler(self, context: BasicCrawlingContext """ async def _run_subcrawler( - subcrawler_pipeline: _OrphanPlaywrightContextPipeline | _OrphanBeautifulsoupContextPipeline, + subcrawler_pipeline: _OrphanPlaywrightContextPipeline | _OrphanStaticContextPipeline, use_state: dict | None = None, ) -> SubCrawlerRun: """Helper closure that 
creates new `RequestHandlerRunResult` and delegates request handling to sub crawler. @@ -350,13 +403,13 @@ async def _run_subcrawler( context.log.debug(f'Running static request for {context.request.url}') self.predictor_state.track_http_only_request_handler_runs() - bs_run = await _run_subcrawler(self._bs_context_pipeline) - if bs_run.result and self.result_checker(bs_run.result): - await self._commit_result(result=bs_run.result, context=context) + static_run = await _run_subcrawler(self._static_context_pipeline) + if static_run.result and self.result_checker(static_run.result): + await self._commit_result(result=static_run.result, context=context) return - if bs_run.exception: + if static_run.exception: context.log.exception( - msg=f'Static crawler: failed for {context.request.url}', exc_info=bs_run.exception + msg=f'Static crawler: failed for {context.request.url}', exc_info=static_run.exception ) else: context.log.warning(f'Static crawler: returned a suspicious result for {context.request.url}') @@ -364,10 +417,15 @@ async def _run_subcrawler( context.log.debug(f'Running browser request handler for {context.request.url}') - kvs = await context.get_key_value_store() - default_value = dict[str, JsonSerializable]() - old_state: dict[str, JsonSerializable] = await kvs.get_value(BasicCrawler.CRAWLEE_STATE_KEY, default_value) - old_state_copy = deepcopy(old_state) + if should_detect_rendering_type: + # Save copy of global state from `use_state` before it can be mutated by browser crawl. + # This copy will be used in the static crawl to make sure they both run with same conditions and to + # avoid static crawl to modify the state. + # (This static crawl is performed only to evaluate rendering type detection.) + kvs = await context.get_key_value_store() + default_value = dict[str, JsonSerializable]() + old_state: dict[str, JsonSerializable] = await kvs.get_value(BasicCrawler.CRAWLEE_STATE_KEY, default_value) + old_state_copy = deepcopy(old_state) pw_run = await _run_subcrawler(self._pw_context_pipeline) self.predictor_state.track_browser_request_handler_runs() @@ -380,9 +438,9 @@ async def _run_subcrawler( if should_detect_rendering_type: detection_result: RenderingType - bs_run = await _run_subcrawler(self._bs_context_pipeline, use_state=old_state_copy) + static_run = await _run_subcrawler(self._static_context_pipeline, use_state=old_state_copy) - if bs_run.result and self.result_comparator(bs_run.result, pw_run.result): + if static_run.result and self.result_comparator(static_run.result, pw_run.result): detection_result = 'static' else: detection_result = 'client only' @@ -410,13 +468,11 @@ def pre_navigation_hook( Hooks that try to access `context.page` will have to catch this exception if triggered by static pipeline. 
""" - def wrapped_bs_hook(context: BasicCrawlingContext) -> Awaitable[None]: - wrapped_context = AdaptivePlaywrightPreNavCrawlingContext.from_pre_navigation_contexts(context) - return hook(wrapped_context) - - def wrapped_pw_hook(context: PlaywrightPreNavCrawlingContext) -> Awaitable[None]: + def hook_with_wrapped_context( + context: BasicCrawlingContext | PlaywrightPreNavCrawlingContext, + ) -> Awaitable[None]: wrapped_context = AdaptivePlaywrightPreNavCrawlingContext.from_pre_navigation_contexts(context) return hook(wrapped_context) - self._pw_context_pipeline.pre_navigation_hook(wrapped_pw_hook) - self._bs_context_pipeline.pre_navigation_hook(wrapped_bs_hook) + self._pw_context_pipeline.pre_navigation_hook(hook_with_wrapped_context) + self._static_context_pipeline.pre_navigation_hook(hook_with_wrapped_context) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py index 86f34eb5f6..0277ba5ca6 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py @@ -1,16 +1,16 @@ from __future__ import annotations from dataclasses import dataclass, fields -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Generic -from bs4 import BeautifulSoup +from typing_extensions import TypeVar from crawlee import HttpHeaders from crawlee._types import BasicCrawlingContext from crawlee._utils.docs import docs_group from crawlee.crawlers import ( - BeautifulSoupCrawlingContext, - BeautifulSoupParserType, + AbstractHttpParser, + ParsedHttpCrawlingContext, PlaywrightCrawlingContext, ) @@ -25,9 +25,12 @@ class AdaptiveContextError(RuntimeError): pass +TStaticParseResult = TypeVar('TStaticParseResult') + + @dataclass(frozen=True) @docs_group('Data structures') -class AdaptivePlaywrightCrawlingContext(BeautifulSoupCrawlingContext): +class AdaptivePlaywrightCrawlingContext(Generic[TStaticParseResult], ParsedHttpCrawlingContext[TStaticParseResult]): _response: Response | None = None _infinite_scroll: Callable[[], Awaitable[None]] | None = None _page: Page | None = None @@ -51,13 +54,15 @@ def response(self) -> Response: return self._response @classmethod - def from_beautifulsoup_crawling_context(cls, context: BeautifulSoupCrawlingContext) -> Self: - """Convenience constructor that creates new context from existing `BeautifulSoupCrawlingContext`.""" + def from_parsed_http_crawling_context( + cls, context: ParsedHttpCrawlingContext[TStaticParseResult] + ) -> AdaptivePlaywrightCrawlingContext[TStaticParseResult]: + """Convenience constructor that creates new context from existing `ParsedHttpCrawlingContext`.""" return cls(**{field.name: getattr(context, field.name) for field in fields(context)}) @classmethod async def from_playwright_crawling_context( - cls, context: PlaywrightCrawlingContext, beautiful_soup_parser_type: BeautifulSoupParserType | None + cls, context: PlaywrightCrawlingContext, parser: AbstractHttpParser[TStaticParseResult] ) -> Self: """Convenience constructor that creates new context from existing `PlaywrightCrawlingContext`.""" context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)} @@ -65,13 +70,13 @@ async def from_playwright_crawling_context( context_kwargs['_response'] = context_kwargs.pop('response') context_kwargs['_page'] = context_kwargs.pop('page') context_kwargs['_infinite_scroll'] = 
context_kwargs.pop('infinite_scroll') - # This might be always available. + # This might not be always available. protocol_guess = await context_kwargs['_page'].evaluate('() => performance.getEntries()[0].nextHopProtocol') http_response = await _HttpResponse.from_playwright_response( response=context.response, protocol=protocol_guess or '' ) return cls( - parsed_content=BeautifulSoup(http_response.read(), features=beautiful_soup_parser_type), + parsed_content=await parser.parse(http_response), http_response=http_response, **context_kwargs, ) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_example.py b/src/crawlee/crawlers/_adaptive_playwright/_example.py index c707d244c8..c3706df868 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_example.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_example.py @@ -16,9 +16,15 @@ async def main() -> None: top_logger.setLevel(logging.DEBUG) i = 0 - crawler = AdaptivePlaywrightCrawler( - max_requests_per_crawl=10, _logger=top_logger, playwright_crawler_args={'headless': False} + crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( + max_requests_per_crawl=10, _logger=top_logger, playwright_crawler_specific_kwargs={'headless': False} ) + """ + + crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser( + max_requests_per_crawl=10, _logger=top_logger, playwright_crawler_specific_kwargs={'headless': False} + ) + """ @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index bf19615da2..de9c8a29eb 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -75,7 +75,7 @@ async def test_adaptive_crawling( rendering_types=rendering_types, detection_probability_recommendation=detection_probability_recommendation ) - crawler = AdaptivePlaywrightCrawler(rendering_type_predictor=predictor) + crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(rendering_type_predictor=predictor) pw_handler_count = 0 bs_handler_count = 0 @@ -120,7 +120,9 @@ async def test_adaptive_crawling_pre_nav_change_to_context() -> None: """Tests that context can be modified in pre-navigation hooks.""" static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor() - crawler = AdaptivePlaywrightCrawler(rendering_type_predictor=static_only_predictor_enforce_detection) + crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( + rendering_type_predictor=static_only_predictor_enforce_detection + ) user_data_in_pre_nav_hook = [] user_data_in_handler = [] @@ -151,7 +153,9 @@ async def test_adaptive_crawling_result() -> None: Enforced rendering type detection to run both sub crawlers.""" static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor() requests = ['https://warehouse-theme-metal.myshopify.com/'] - crawler = AdaptivePlaywrightCrawler(rendering_type_predictor=static_only_predictor_enforce_detection) + crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( + rendering_type_predictor=static_only_predictor_enforce_detection + ) @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: @@ -186,7 +190,9 @@ async def test_adaptive_crawling_predictor_calls( some_url = 
'https://warehouse-theme-metal.myshopify.com/' static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor() requests = [Request.from_url(url=some_url, label=some_label)] - crawler = AdaptivePlaywrightCrawler(rendering_type_predictor=static_only_predictor_enforce_detection) + crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( + rendering_type_predictor=static_only_predictor_enforce_detection + ) @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: @@ -216,7 +222,9 @@ async def test_adaptive_crawling_result_use_state_isolation() -> None: Enforced rendering type detection to run both sub crawlers.""" static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor() requests = ['https://warehouse-theme-metal.myshopify.com/'] - crawler = AdaptivePlaywrightCrawler(rendering_type_predictor=static_only_predictor_enforce_detection) + crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( + rendering_type_predictor=static_only_predictor_enforce_detection + ) store = await crawler.get_key_value_store() await store.set_value(BasicCrawler.CRAWLEE_STATE_KEY, {'counter': 0}) request_handler_calls = 0 @@ -247,7 +255,7 @@ async def test_adaptive_crawling_statistics() -> None: static_only_predictor_no_detection = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])) - crawler = AdaptivePlaywrightCrawler( + crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( rendering_type_predictor=static_only_predictor_no_detection, result_checker=lambda result: False, # noqa: ARG005 # Intentionally unused argument. ) @@ -283,7 +291,9 @@ async def test_adaptive_crawler_exceptions_in_sub_crawlers(*, error_in_pw_crawle requests = ['https://warehouse-theme-metal.myshopify.com/'] static_only_no_detection_predictor = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])) - crawler = AdaptivePlaywrightCrawler(rendering_type_predictor=static_only_no_detection_predictor) + crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( + rendering_type_predictor=static_only_no_detection_predictor + ) saved_data = {'some': 'data'} @crawler.router.default_handler @@ -327,7 +337,7 @@ def test_adaptive_playwright_crawler_statistics_in_init() -> None: log_interval=log_interval, ) - crawler = AdaptivePlaywrightCrawler(statistics=statistics) + crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(statistics=statistics) assert type(crawler._statistics) is AdaptivePlaywrightCrawlerStatistics assert crawler._statistics._persistence_enabled == persistence_enabled From bd72a8446cf86d8a93bf528f8baeed58ca8e8da6 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 9 Jan 2025 16:05:30 +0100 Subject: [PATCH 30/79] Update tests to remove bs references. Add one sanity check test for parsel variant. Update some doc strings. 
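[Editorial sketch] The changes below expose the factory constructors through the package's __init__ and add a Parsel sanity-check test. A short usage sketch of those constructors, based on this patch's own example file; the target URL and the max_requests_per_crawl value are placeholders.

# Usage sketch of the factory constructors (placeholders: URL, request limit).
import asyncio

from crawlee.crawlers._adaptive_playwright import (
    AdaptivePlaywrightCrawler,
    AdaptivePlaywrightCrawlingContext,
)


async def main() -> None:
    # Parsel-backed variant; with_beautifulsoup_static_parser(parser_type='lxml') is used the same way.
    crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())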
--- .../crawlers/_adaptive_playwright/__init__.py | 8 +-- .../_adaptive_playwright_crawler.py | 68 +++++++------------ .../_playwright/_playwright_crawler.py | 33 +++++++++ .../test_adaptive_playwright_crawler.py | 62 +++++++++++++---- 4 files changed, 106 insertions(+), 65 deletions(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/__init__.py b/src/crawlee/crawlers/_adaptive_playwright/__init__.py index 46addf4483..b781396aae 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/__init__.py +++ b/src/crawlee/crawlers/_adaptive_playwright/__init__.py @@ -1,11 +1,7 @@ from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler import AdaptivePlaywrightCrawler from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import ( AdaptivePlaywrightCrawlingContext, + AdaptivePlaywrightPreNavCrawlingContext, ) -__all__ = [ - 'AdaptivePlaywrightCrawler', - 'AdaptivePlaywrightCrawlingContext', - 'HttpCrawlerOptions', - 'ParsedHttpCrawlingContext', -] +__all__ = ['AdaptivePlaywrightCrawler', 'AdaptivePlaywrightCrawlingContext', 'AdaptivePlaywrightPreNavCrawlingContext'] diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 16f537ad5b..2b5bccc7e7 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -11,10 +11,9 @@ from bs4 import BeautifulSoup from parsel import Selector -from typing_extensions import Self, TypedDict, TypeVar +from typing_extensions import Self, TypeVar from crawlee._types import BasicCrawlingContext, JsonSerializable, RequestHandlerRunResult -from crawlee._utils.docs import docs_group from crawlee._utils.wait import wait_for from crawlee.crawlers import ( AbstractHttpCrawler, @@ -50,17 +49,16 @@ from crawlee.statistics import Statistics if TYPE_CHECKING: - from collections.abc import Awaitable, Callable, Coroutine, Mapping, Sequence + from collections.abc import Awaitable, Callable, Coroutine, Sequence from datetime import timedelta from types import TracebackType - from typing_extensions import NotRequired, Unpack + from typing_extensions import Unpack from crawlee import Request - from crawlee.browsers import BrowserPool - from crawlee.browsers._types import BrowserType from crawlee.crawlers._abstract_http._abstract_http_crawler import _HttpCrawlerOptions from crawlee.crawlers._basic._basic_crawler import _BasicCrawlerOptions + from crawlee.crawlers._playwright._playwright_crawler import PlaywrightCrawlerAdditionalOptions from crawlee.router import Router from crawlee.statistics import FinalStatistics @@ -69,34 +67,6 @@ TStaticCrawlingContext = TypeVar('TStaticCrawlingContext', bound=ParsedHttpCrawlingContext) -@docs_group('Data structures') -class _PlaywrightCrawlerAdditionalOptions(TypedDict): - """Additional options that can be specified for PlaywrightCrawler.""" - - browser_pool: NotRequired[BrowserPool] - """A `BrowserPool` instance to be used for launching the browsers and getting pages.""" - - browser_type: NotRequired[BrowserType] - """The type of browser to launch ('chromium', 'firefox', or 'webkit'). - This option should not be used if `browser_pool` is provided.""" - - browser_launch_options: NotRequired[Mapping[str, Any]] - """Keyword arguments to pass to the browser launch method. These options are provided - directly to Playwright's `browser_type.launch` method. 
For more details, refer to the Playwright - documentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch. - This option should not be used if `browser_pool` is provided.""" - - browser_new_context_options: NotRequired[Mapping[str, Any]] - """Keyword arguments to pass to the browser new context method. These options - are provided directly to Playwright's `browser.new_context` method. For more details, refer to the - Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context. - This option should not be used if `browser_pool` is provided.""" - - headless: NotRequired[bool] - """Whether to run the browser in headless mode. - This option should not be used if `browser_pool` is provided.""" - - class _NoActiveStatistics(Statistics): """Statistics compliant object that is not supposed to do anything when active. To be used in sub crawlers.""" @@ -115,6 +85,8 @@ async def __aexit__( @dataclass class _OrphanPlaywrightContextPipeline(Generic[TStaticParseResult]): + """Minimal setup required by playwright context pipeline to work without crawler.""" + pre_navigation_hook: Callable[[Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]]], None] pipeline: ContextPipeline[PlaywrightCrawlingContext] needed_contexts: list[AbstractAsyncContextManager] @@ -122,28 +94,34 @@ class _OrphanPlaywrightContextPipeline(Generic[TStaticParseResult]): static_parser: AbstractHttpParser[TStaticParseResult] def create_pipeline_call(self, top_context: BasicCrawlingContext) -> Coroutine[Any, Any, None]: - async def from_pw_to_router(context: PlaywrightCrawlingContext) -> None: + """Call that will be used by the top crawler to run through the pipeline.""" + + async def from_pipeline_to_top_router(context: PlaywrightCrawlingContext) -> None: adaptive_crawling_context = await AdaptivePlaywrightCrawlingContext.from_playwright_crawling_context( context=context, parser=self.static_parser ) await self.top_router(adaptive_crawling_context) - return self.pipeline(top_context, from_pw_to_router) + return self.pipeline(top_context, from_pipeline_to_top_router) @dataclass class _OrphanStaticContextPipeline(Generic[TStaticCrawlingContext]): + """Minimal setup required by static context pipeline to work without crawler.""" + pre_navigation_hook: Callable[[Callable[[BasicCrawlingContext], Awaitable[None]]], None] pipeline: ContextPipeline[TStaticCrawlingContext] needed_contexts: list[AbstractAsyncContextManager] top_router: Router[AdaptivePlaywrightCrawlingContext] def create_pipeline_call(self, top_context: BasicCrawlingContext) -> Coroutine[Any, Any, None]: - async def from_pw_to_router(context: TStaticCrawlingContext) -> None: + """Call that will be used by the top crawler to run through the pipeline.""" + + async def from_pipeline_to_top_router(context: TStaticCrawlingContext) -> None: adaptive_crawling_context = AdaptivePlaywrightCrawlingContext.from_parsed_http_crawling_context(context) await self.top_router(adaptive_crawling_context) - return self.pipeline(top_context, from_pw_to_router) + return self.pipeline(top_context, from_pipeline_to_top_router) class AdaptivePlaywrightCrawler( @@ -165,7 +143,7 @@ def __init__( result_checker: Callable[[RequestHandlerRunResult], bool] | None = None, result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None, static_crawler_specific_kwargs: _HttpCrawlerOptions | None = None, - playwright_crawler_specific_kwargs: _PlaywrightCrawlerAdditionalOptions | None = None, + 
playwright_crawler_specific_kwargs: PlaywrightCrawlerAdditionalOptions | None = None, **kwargs: Unpack[_BasicCrawlerOptions], ) -> None: """A default constructor. @@ -257,7 +235,7 @@ def with_beautifulsoup_static_parser( result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None, parser_type: BeautifulSoupParserType = 'lxml', static_crawler_specific_kwargs: _HttpCrawlerOptions[BeautifulSoupCrawlingContext] | None = None, - playwright_crawler_specific_kwargs: _PlaywrightCrawlerAdditionalOptions | None = None, + playwright_crawler_specific_kwargs: PlaywrightCrawlerAdditionalOptions | None = None, **kwargs: Unpack[_BasicCrawlerOptions], ) -> AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[BeautifulSoup], BeautifulSoup]: """Creates `AdaptivePlaywrightCrawler` that uses `BeautifulSoup` for parsing static content.""" @@ -279,7 +257,7 @@ def with_parsel_static_parser( result_checker: Callable[[RequestHandlerRunResult], bool] | None = None, result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None, static_crawler_specific_kwargs: _HttpCrawlerOptions[ParselCrawlingContext] | None = None, - playwright_crawler_specific_kwargs: _PlaywrightCrawlerAdditionalOptions | None = None, + playwright_crawler_specific_kwargs: PlaywrightCrawlerAdditionalOptions | None = None, **kwargs: Unpack[_BasicCrawlerOptions], ) -> AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[Selector], Selector]: """Creates `AdaptivePlaywrightCrawler` that uses `Parcel` for parsing static content.""" @@ -372,7 +350,7 @@ async def _BasicCrawler__run_request_handler(self, context: BasicCrawlingContext Reference implementation: https://github.com/apify/crawlee/blob/master/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts """ - async def _run_subcrawler( + async def _run_subcrawler_pipeline( subcrawler_pipeline: _OrphanPlaywrightContextPipeline | _OrphanStaticContextPipeline, use_state: dict | None = None, ) -> SubCrawlerRun: @@ -403,7 +381,7 @@ async def _run_subcrawler( context.log.debug(f'Running static request for {context.request.url}') self.predictor_state.track_http_only_request_handler_runs() - static_run = await _run_subcrawler(self._static_context_pipeline) + static_run = await _run_subcrawler_pipeline(self._static_context_pipeline) if static_run.result and self.result_checker(static_run.result): await self._commit_result(result=static_run.result, context=context) return @@ -427,7 +405,7 @@ async def _run_subcrawler( old_state: dict[str, JsonSerializable] = await kvs.get_value(BasicCrawler.CRAWLEE_STATE_KEY, default_value) old_state_copy = deepcopy(old_state) - pw_run = await _run_subcrawler(self._pw_context_pipeline) + pw_run = await _run_subcrawler_pipeline(self._pw_context_pipeline) self.predictor_state.track_browser_request_handler_runs() if pw_run.exception is not None: @@ -438,7 +416,7 @@ async def _run_subcrawler( if should_detect_rendering_type: detection_result: RenderingType - static_run = await _run_subcrawler(self._static_context_pipeline, use_state=old_state_copy) + static_run = await _run_subcrawler_pipeline(self._static_context_pipeline, use_state=old_state_copy) if static_run.result and self.result_comparator(static_run.result, pw_run.result): detection_result = 'static' diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index df1494f3a9..c2119b3cb4 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ 
b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -4,6 +4,7 @@ from typing import TYPE_CHECKING, Any, Callable from pydantic import ValidationError +from typing_extensions import NotRequired, TypedDict from crawlee import EnqueueStrategy from crawlee._request import BaseRequestData @@ -285,3 +286,35 @@ def pre_navigation_hook(self, hook: Callable[[PlaywrightPreNavCrawlingContext], hook: A coroutine function to be called before each navigation. """ self._pre_navigation_hooks.append(hook) + + +@docs_group('Data structures') +class PlaywrightCrawlerAdditionalOptions(TypedDict): + """Additional arguments for the `PlaywrightCrawler` constructor. + + It is intended for typing forwarded `__init__` arguments in the subclasses. + All arguments are `BasicCrawlerOptions` + `PlaywrightCrawlerAdditionalOptions` + """ + + browser_pool: NotRequired[BrowserPool] + """A `BrowserPool` instance to be used for launching the browsers and getting pages.""" + + browser_type: NotRequired[BrowserType] + """The type of browser to launch ('chromium', 'firefox', or 'webkit'). + This option should not be used if `browser_pool` is provided.""" + + browser_launch_options: NotRequired[Mapping[str, Any]] + """Keyword arguments to pass to the browser launch method. These options are provided + directly to Playwright's `browser_type.launch` method. For more details, refer to the Playwright + documentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch. + This option should not be used if `browser_pool` is provided.""" + + browser_new_context_options: NotRequired[Mapping[str, Any]] + """Keyword arguments to pass to the browser new context method. These options + are provided directly to Playwright's `browser.new_context` method. For more details, refer to the + Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context. + This option should not be used if `browser_pool` is provided.""" + + headless: NotRequired[bool] + """Whether to run the browser in headless mode. 
+ This option should not be used if `browser_pool` is provided.""" diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index de9c8a29eb..6ca5f5c663 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -51,7 +51,7 @@ def store_result(self, url: str, label: str | None, crawl_type: RenderingType) - @pytest.mark.parametrize( - ('expected_pw_count', 'expected_bs_count', 'rendering_types', 'detection_probability_recommendation'), + ('expected_pw_count', 'expected_static_count', 'rendering_types', 'detection_probability_recommendation'), [ pytest.param(0, 2, cycle(['static']), cycle([0]), id='Static only'), pytest.param(2, 0, cycle(['client only']), cycle([0]), id='Client only'), @@ -61,7 +61,7 @@ def store_result(self, url: str, label: str | None, crawl_type: RenderingType) - ) async def test_adaptive_crawling( expected_pw_count: int, - expected_bs_count: int, + expected_static_count: int, rendering_types: Iterator[RenderingType], detection_probability_recommendation: Iterator[int], ) -> None: @@ -78,26 +78,26 @@ async def test_adaptive_crawling( crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(rendering_type_predictor=predictor) pw_handler_count = 0 - bs_handler_count = 0 + static_handler_count = 0 pw_hook_count = 0 - bs_hook_count = 0 + static_hook_count = 0 @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: nonlocal pw_handler_count - nonlocal bs_handler_count + nonlocal static_handler_count try: # page is available only if it was crawled by PlaywrightCrawler. context.page # noqa:B018 Intentionally "useless expression". Can trigger exception. pw_handler_count += 1 except AdaptiveContextError: - bs_handler_count += 1 + static_handler_count += 1 @crawler.pre_navigation_hook async def pre_nav_hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: # Intentionally unused arg - nonlocal bs_hook_count + nonlocal static_hook_count nonlocal pw_hook_count try: @@ -105,15 +105,49 @@ async def pre_nav_hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None context.page # noqa:B018 Intentionally "useless expression". Can trigger exception. pw_hook_count += 1 except AdaptiveContextError: - bs_hook_count += 1 + static_hook_count += 1 await crawler.run(requests) assert pw_handler_count == expected_pw_count assert pw_hook_count == expected_pw_count - assert bs_handler_count == expected_bs_count - assert bs_hook_count == expected_bs_count + assert static_handler_count == expected_static_count + assert static_hook_count == expected_static_count + + +async def test_adaptive_crawling_parcel() -> None: + """Top level test for parcel. Only one argument combination. 
(The rest of code is tested with bs variant.)""" + requests = [ + 'https://warehouse-theme-metal.myshopify.com/', + 'https://warehouse-theme-metal.myshopify.com/collections', + ] + + predictor = _SimpleRenderingTypePredictor( + rendering_types=cycle(['static', 'client only']), detection_probability_recommendation=cycle([0]) + ) + + crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(rendering_type_predictor=predictor) + + pw_handler_count = 0 + static_handler_count = 0 + + @crawler.router.default_handler + async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: + nonlocal pw_handler_count + nonlocal static_handler_count + + try: + # page is available only if it was crawled by PlaywrightCrawler. + context.page # noqa:B018 Intentionally "useless expression". Can trigger exception. + pw_handler_count += 1 + except AdaptiveContextError: + static_handler_count += 1 + + await crawler.run(requests) + + assert pw_handler_count == 1 + assert static_handler_count == 1 async def test_adaptive_crawling_pre_nav_change_to_context() -> None: @@ -152,10 +186,10 @@ async def test_adaptive_crawling_result() -> None: Enforced rendering type detection to run both sub crawlers.""" static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor() - requests = ['https://warehouse-theme-metal.myshopify.com/'] crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( rendering_type_predictor=static_only_predictor_enforce_detection ) + requests = ['https://warehouse-theme-metal.myshopify.com/'] @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: @@ -176,14 +210,14 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: @pytest.mark.parametrize( - ('pw_saved_data', 'bs_saved_data', 'expected_result_renderingl_type'), + ('pw_saved_data', 'static_saved_data', 'expected_result_renderingl_type'), [ pytest.param({'some': 'data'}, {'some': 'data'}, 'static', id='Same results from sub crawlers'), pytest.param({'some': 'data'}, {'different': 'data'}, 'client only', id='Different results from sub crawlers'), ], ) async def test_adaptive_crawling_predictor_calls( - pw_saved_data: dict[str, str], bs_saved_data: dict[str, str], expected_result_renderingl_type: RenderingType + pw_saved_data: dict[str, str], static_saved_data: dict[str, str], expected_result_renderingl_type: RenderingType ) -> None: """Tests expected predictor calls. Same results.""" some_label = 'bla' @@ -201,7 +235,7 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: context.page # noqa:B018 Intentionally "useless expression". Can trigger exception. 
await context.push_data(pw_saved_data) except AdaptiveContextError: - await context.push_data(bs_saved_data) + await context.push_data(static_saved_data) with ( patch.object(static_only_predictor_enforce_detection, 'store_result', Mock()) as mocked_store_result, From c964d44723810c177ac6ed8d69d1562ef66d5e28 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 10 Jan 2025 09:49:10 +0100 Subject: [PATCH 31/79] Revert accidental Lint edits to website/*.py --- website/pydoc-markdown/generate_ast.py | 7 +- .../google_docstring_processor.py | 75 +++++++++---------- 2 files changed, 43 insertions(+), 39 deletions(-) diff --git a/website/pydoc-markdown/generate_ast.py b/website/pydoc-markdown/generate_ast.py index 8dfe580268..9163d1c117 100644 --- a/website/pydoc-markdown/generate_ast.py +++ b/website/pydoc-markdown/generate_ast.py @@ -45,4 +45,9 @@ repo_root_path = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../')) -print(json.dumps(dump, indent=4).replace(repo_root_path, 'REPO_ROOT_PLACEHOLDER')) +print( + json.dumps(dump, indent=4).replace( + repo_root_path, + 'REPO_ROOT_PLACEHOLDER' + ) +) diff --git a/website/pydoc-markdown/google_docstring_processor.py b/website/pydoc-markdown/google_docstring_processor.py index 9c6dcf41c2..154462c4d2 100644 --- a/website/pydoc-markdown/google_docstring_processor.py +++ b/website/pydoc-markdown/google_docstring_processor.py @@ -79,37 +79,37 @@ class ApifyGoogleProcessor(Processor): """ _param_res = [ - re.compile(r'^(?P\S+):\s+(?P.+)$'), - re.compile(r'^(?P\S+)\s+\((?P[^)]+)\):\s+(?P.+)$'), - re.compile(r'^(?P\S+)\s+--\s+(?P.+)$'), - re.compile(r'^(?P\S+)\s+\{\[(?P\S+)\]\}\s+--\s+(?P.+)$'), - re.compile(r'^(?P\S+)\s+\{(?P\S+)\}\s+--\s+(?P.+)$'), + re.compile(r"^(?P\S+):\s+(?P.+)$"), + re.compile(r"^(?P\S+)\s+\((?P[^)]+)\):\s+(?P.+)$"), + re.compile(r"^(?P\S+)\s+--\s+(?P.+)$"), + re.compile(r"^(?P\S+)\s+\{\[(?P\S+)\]\}\s+--\s+(?P.+)$"), + re.compile(r"^(?P\S+)\s+\{(?P\S+)\}\s+--\s+(?P.+)$"), ] _keywords_map = { - 'Args:': 'Arguments', - 'Arguments:': 'Arguments', - 'Attributes:': 'Attributes', - 'Example:': 'Example', - 'Examples:': 'Examples', - 'Keyword Args:': 'Arguments', - 'Keyword Arguments:': 'Arguments', - 'Methods:': 'Methods', - 'Note:': 'Notes', - 'Notes:': 'Notes', - 'Other Parameters:': 'Arguments', - 'Parameters:': 'Arguments', - 'Return:': 'Returns', - 'Returns:': 'Returns', - 'Raises:': 'Raises', - 'References:': 'References', - 'See Also:': 'See Also', - 'Todo:': 'Todo', - 'Warning:': 'Warnings', - 'Warnings:': 'Warnings', - 'Warns:': 'Warns', - 'Yield:': 'Yields', - 'Yields:': 'Yields', + "Args:": "Arguments", + "Arguments:": "Arguments", + "Attributes:": "Attributes", + "Example:": "Example", + "Examples:": "Examples", + "Keyword Args:": "Arguments", + "Keyword Arguments:": "Arguments", + "Methods:": "Methods", + "Note:": "Notes", + "Notes:": "Notes", + "Other Parameters:": "Arguments", + "Parameters:": "Arguments", + "Return:": "Returns", + "Returns:": "Returns", + "Raises:": "Raises", + "References:": "References", + "See Also:": "See Also", + "Todo:": "Todo", + "Warning:": "Warnings", + "Warnings:": "Warnings", + "Warns:": "Warns", + "Yield:": "Yields", + "Yields:": "Yields", } def check_docstring_format(self, docstring: str) -> bool: @@ -139,9 +139,9 @@ def _commit(): lines.extend(current_lines) current_lines.clear() - for line in node.docstring.content.split('\n'): + for line in node.docstring.content.split("\n"): multiline_argument_offset += 1 - if line.lstrip().startswith('```'): + if 
line.lstrip().startswith("```"): in_codeblock = not in_codeblock current_lines.append(line) if not in_codeblock: @@ -171,16 +171,15 @@ def _commit(): if not param_match: if multiline_argument_offset == 1: - current_lines[-1]['desc'] += '\n' + line + current_lines[-1]["desc"] += "\n" + line multiline_argument_offset = 0 else: current_lines.append(line) _commit() - node.docstring.content = json.dumps( - { - 'text': '\n'.join(lines), - 'sections': sections, - }, - indent=None, - ) + node.docstring.content = json.dumps({ + "text": "\n".join(lines), + "sections": sections, + }, indent=None) + + From a47139564c834ebedbb5bea3f4bab162eaa902c6 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 14 Jan 2025 16:38:22 +0100 Subject: [PATCH 32/79] Review comments. --- .../_adaptive_playwright_crawler.py | 66 +++++++------------ .../_rendering_type_predictor.py | 15 +++-- .../_result_comparator.py | 2 +- .../test_adaptive_playwright_crawler.py | 8 +-- 4 files changed, 39 insertions(+), 52 deletions(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 2b5bccc7e7..1684a715cb 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -2,7 +2,6 @@ import asyncio import logging -from contextlib import AbstractAsyncContextManager, AsyncExitStack from copy import deepcopy from dataclasses import dataclass from logging import getLogger @@ -36,13 +35,15 @@ AdaptivePlaywrightPreNavCrawlingContext, ) from crawlee.crawlers._adaptive_playwright._rendering_type_predictor import ( - DefaultRenderingTypePredictor, + RandomRenderingTypePredictor as DefaultRenderingTypePredictor, +) +from crawlee.crawlers._adaptive_playwright._rendering_type_predictor import ( RenderingType, RenderingTypePredictor, ) from crawlee.crawlers._adaptive_playwright._result_comparator import ( SubCrawlerRun, - create_comparator, + create_default_comparator, ) from crawlee.crawlers._beautifulsoup._beautifulsoup_parser import BeautifulSoupParser from crawlee.crawlers._parsel._parsel_parser import ParselParser @@ -50,6 +51,7 @@ if TYPE_CHECKING: from collections.abc import Awaitable, Callable, Coroutine, Sequence + from contextlib import AbstractAsyncContextManager from datetime import timedelta from types import TracebackType @@ -70,6 +72,10 @@ class _NoActiveStatistics(Statistics): """Statistics compliant object that is not supposed to do anything when active. 
To be used in sub crawlers.""" + def __init__(self) -> None: + super().__init__() + self._active = True + async def __aenter__(self) -> Self: self._active = True return self @@ -89,7 +95,7 @@ class _OrphanPlaywrightContextPipeline(Generic[TStaticParseResult]): pre_navigation_hook: Callable[[Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]]], None] pipeline: ContextPipeline[PlaywrightCrawlingContext] - needed_contexts: list[AbstractAsyncContextManager] + needed_context: AbstractAsyncContextManager top_router: Router[AdaptivePlaywrightCrawlingContext] static_parser: AbstractHttpParser[TStaticParseResult] @@ -111,7 +117,6 @@ class _OrphanStaticContextPipeline(Generic[TStaticCrawlingContext]): pre_navigation_hook: Callable[[Callable[[BasicCrawlingContext], Awaitable[None]]], None] pipeline: ContextPipeline[TStaticCrawlingContext] - needed_contexts: list[AbstractAsyncContextManager] top_router: Router[AdaptivePlaywrightCrawlingContext] def create_pipeline_call(self, top_context: BasicCrawlingContext) -> Coroutine[Any, Any, None]: @@ -129,8 +134,8 @@ class AdaptivePlaywrightCrawler( ): """Adaptive crawler that uses both specific implementation of `AbstractHttpCrawler` and `PlaywrightCrawler`. - It tries to detect whether it is sufficient to crawl with `BeautifulSoupCrawler` (which is faster) or if - `PlaywrightCrawler` should be used (in case `BeautifulSoupCrawler` did not work as expected for specific url.). + It tries to detect whether it is sufficient to crawl without browser (which is faster) or if + `PlaywrightCrawler` should be used (in case previous method did not work as expected for specific url.). # TODO: Add example """ @@ -146,7 +151,7 @@ def __init__( playwright_crawler_specific_kwargs: PlaywrightCrawlerAdditionalOptions | None = None, **kwargs: Unpack[_BasicCrawlerOptions], ) -> None: - """A default constructor. + """A default constructor. Recommended way to create instance is to call factory methods `with_*_static_parser`. Args: rendering_type_predictor: Object that implements RenderingTypePredictor and is capable of predicting which @@ -165,7 +170,7 @@ def __init__( # Adaptive crawling related. self.rendering_type_predictor = rendering_type_predictor or DefaultRenderingTypePredictor() self.result_checker = result_checker or (lambda result: True) # noqa: ARG005 # Intentionally unused argument. - self.result_comparator = result_comparator or create_comparator(result_checker) + self.result_comparator = result_comparator or create_default_comparator(result_checker) # Use AdaptivePlaywrightCrawlerStatistics. if 'statistics' in kwargs: @@ -206,24 +211,15 @@ def __init__( **playwright_crawler_specific_kwargs, **basic_crawler_kwargs_for_pw_crawler ) - required_contexts_pw_crawler: list[AbstractAsyncContextManager] = [ - playwright_crawler._statistics, # noqa:SLF001 # Intentional access to private member. - playwright_crawler._browser_pool, # noqa:SLF001 # Intentional access to private member. - ] - required_contexts_static_crawler: list[AbstractAsyncContextManager] = [ - static_crawler._statistics, # noqa:SLF001 # Intentional access to private member. - ] - self._pw_context_pipeline = _OrphanPlaywrightContextPipeline( pipeline=playwright_crawler._context_pipeline, # noqa:SLF001 # Intentional access to private member. - needed_contexts=required_contexts_pw_crawler, + needed_context=playwright_crawler._browser_pool, # noqa:SLF001 # Intentional access to private member. 
top_router=self.router, pre_navigation_hook=playwright_crawler.pre_navigation_hook, static_parser=static_parser, ) self._static_context_pipeline = _OrphanStaticContextPipeline[ParsedHttpCrawlingContext[TStaticParseResult]]( pipeline=static_crawler._context_pipeline, # noqa:SLF001 # Intentional access to private member. - needed_contexts=required_contexts_static_crawler, top_router=self.router, pre_navigation_hook=static_crawler.pre_navigation_hook, ) @@ -239,13 +235,11 @@ def with_beautifulsoup_static_parser( **kwargs: Unpack[_BasicCrawlerOptions], ) -> AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[BeautifulSoup], BeautifulSoup]: """Creates `AdaptivePlaywrightCrawler` that uses `BeautifulSoup` for parsing static content.""" - parser_kwargs = {'parser': parser_type} if parser_type else {} - return AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[BeautifulSoup], BeautifulSoup]( rendering_type_predictor=rendering_type_predictor, result_checker=result_checker, result_comparator=result_comparator, - static_parser=BeautifulSoupParser(**parser_kwargs), + static_parser=BeautifulSoupParser(parser=parser_type), static_crawler_specific_kwargs=static_crawler_specific_kwargs, playwright_crawler_specific_kwargs=playwright_crawler_specific_kwargs, **kwargs, @@ -323,22 +317,12 @@ async def run( purge_request_queue: If this is `True` and the crawler is not being run for the first time, the default request queue will be purged. """ - contexts_to_enter = [ - cm - for cm in self._static_context_pipeline.needed_contexts + self._pw_context_pipeline.needed_contexts - if cm and getattr(cm, 'active', False) is False - ] - - # Enter contexts required by sub crawler for them to be able to do `crawl_one` - async with AsyncExitStack() as exit_stack: - for context in contexts_to_enter: - await exit_stack.enter_async_context(context) + if not getattr(self._pw_context_pipeline.needed_context, 'active', False): + async with self._pw_context_pipeline.needed_context: + return await super().run(requests=requests, purge_request_queue=purge_request_queue) + else: return await super().run(requests=requests, purge_request_queue=purge_request_queue) - # AsyncExitStack can in theory swallow exceptions and so the return might not execute. - # https://github.com/python/mypy/issues/7726 - raise RuntimeError('FinalStatistics not created.') - # Can't use override as mypy does not like it for double underscore private method. async def _BasicCrawler__run_request_handler(self, context: BasicCrawlingContext) -> None: # noqa: N802 """Override BasicCrawler method that delegates request processing to sub crawlers. 
@@ -370,7 +354,7 @@ async def _run_subcrawler_pipeline( except Exception as e: return SubCrawlerRun(exception=e) - rendering_type_prediction = self.rendering_type_predictor.predict(context.request.url, context.request.label) + rendering_type_prediction = self.rendering_type_predictor.predict(context.request) should_detect_rendering_type = random() < rendering_type_prediction.detection_probability_recommendation if not should_detect_rendering_type: @@ -383,7 +367,7 @@ async def _run_subcrawler_pipeline( static_run = await _run_subcrawler_pipeline(self._static_context_pipeline) if static_run.result and self.result_checker(static_run.result): - await self._commit_result(result=static_run.result, context=context) + await self._push_result_to_context(result=static_run.result, context=context) return if static_run.exception: context.log.exception( @@ -412,7 +396,7 @@ async def _run_subcrawler_pipeline( raise pw_run.exception if pw_run.result: - await self._commit_result(result=pw_run.result, context=context) + await self._push_result_to_context(result=pw_run.result, context=context) if should_detect_rendering_type: detection_result: RenderingType @@ -424,9 +408,9 @@ async def _run_subcrawler_pipeline( detection_result = 'client only' context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}') - self.rendering_type_predictor.store_result(context.request.url, context.request.label, detection_result) + self.rendering_type_predictor.store_result(context.request, detection_result) - async def _commit_result(self, result: RequestHandlerRunResult, context: BasicCrawlingContext) -> None: + async def _push_result_to_context(self, result: RequestHandlerRunResult, context: BasicCrawlingContext) -> None: """Execute calls from `result` on the context.""" result_tasks = ( [asyncio.create_task(context.push_data(**kwargs)) for kwargs in result.push_data_calls] diff --git a/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py b/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py index 4ece912ad2..bdf49d9509 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py @@ -3,10 +3,13 @@ from abc import ABC, abstractmethod from dataclasses import dataclass from random import choice -from typing import Literal +from typing import TYPE_CHECKING, Literal from typing_extensions import override +if TYPE_CHECKING: + from crawlee import Request + RenderingType = Literal['static', 'client only'] @@ -18,19 +21,19 @@ class RenderingTypePrediction: class RenderingTypePredictor(ABC): @abstractmethod - def predict(self, url: str, label: str | None) -> RenderingTypePrediction: ... + def predict(self, request: Request) -> RenderingTypePrediction: ... @abstractmethod - def store_result(self, url: str, label: str | None, crawl_type: RenderingType) -> None: ... + def store_result(self, request: Request, crawl_type: RenderingType) -> None: ... -class DefaultRenderingTypePredictor(RenderingTypePredictor): +class RandomRenderingTypePredictor(RenderingTypePredictor): # Dummy version of predictor. Proper version will be implemented in another change. 
@override - def predict(self, url: str, label: str | None) -> RenderingTypePrediction: # Will be implemented later + def predict(self, request: Request) -> RenderingTypePrediction: # Will be implemented later return RenderingTypePrediction(choice(['static', 'client only']), 0.1) @override - def store_result(self, url: str, label: str | None, crawl_type: RenderingType) -> None: + def store_result(self, request: Request, crawl_type: RenderingType) -> None: pass diff --git a/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py b/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py index beef4405ed..d6a6c376f6 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py @@ -15,7 +15,7 @@ class SubCrawlerRun: exception: Exception | None = None -def create_comparator( +def create_default_comparator( result_checker: Callable[[RequestHandlerRunResult], bool] | None, ) -> Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool]: """Factory for creating comparator function.""" diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index 6ca5f5c663..710d89f9ef 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -42,11 +42,11 @@ def __init__( self._detection_probability_recommendation = detection_probability_recommendation or cycle([1]) @override - def predict(self, url: str, label: str | None) -> RenderingTypePrediction: + def predict(self, request: Request) -> RenderingTypePrediction: return RenderingTypePrediction(next(self._rendering_types), next(self._detection_probability_recommendation)) @override - def store_result(self, url: str, label: str | None, crawl_type: RenderingType) -> None: + def store_result(self, request: Request, crawl_type: RenderingType) -> None: pass @@ -245,9 +245,9 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: ): await crawler.run(requests) - mocked_predict.assert_called_once_with(some_url, some_label) + mocked_predict.assert_called_once_with(requests[0]) # If `static` and `client only` results are same, `store_result` should be called with `static`. 
- mocked_store_result.assert_called_once_with(some_url, some_label, expected_result_renderingl_type) + mocked_store_result.assert_called_once_with(requests[0], expected_result_renderingl_type) async def test_adaptive_crawling_result_use_state_isolation() -> None: From dbf6310e4245afdb4705a526da10507d00d243e0 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 15 Jan 2025 09:51:25 +0100 Subject: [PATCH 33/79] Sub crawler timeout handling + test --- .../_adaptive_playwright_crawler.py | 9 ++-- .../test_adaptive_playwright_crawler.py | 45 ++++++++++++++++++- 2 files changed, 48 insertions(+), 6 deletions(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 1684a715cb..4052d16003 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -52,7 +52,6 @@ if TYPE_CHECKING: from collections.abc import Awaitable, Callable, Coroutine, Sequence from contextlib import AbstractAsyncContextManager - from datetime import timedelta from types import TracebackType from typing_extensions import Unpack @@ -95,8 +94,8 @@ class _OrphanPlaywrightContextPipeline(Generic[TStaticParseResult]): pre_navigation_hook: Callable[[Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]]], None] pipeline: ContextPipeline[PlaywrightCrawlingContext] - needed_context: AbstractAsyncContextManager top_router: Router[AdaptivePlaywrightCrawlingContext] + needed_context: AbstractAsyncContextManager static_parser: AbstractHttpParser[TStaticParseResult] def create_pipeline_call(self, top_context: BasicCrawlingContext) -> Coroutine[Any, Any, None]: @@ -269,10 +268,13 @@ async def crawl_one_with( self, subcrawler_pipeline: _OrphanPlaywrightContextPipeline | _OrphanStaticContextPipeline, context: BasicCrawlingContext, - timeout: timedelta, result: RequestHandlerRunResult, state: dict[str, JsonSerializable] | None = None, ) -> RequestHandlerRunResult: + # Timeout to ensure that both sub crawlers can finish one request withing top crawler `request_handler_timeout`. + sub_crawler_timeout_coefficient = 0.45 + timeout = self._request_handler_timeout * sub_crawler_timeout_coefficient + if state is not None: async def get_input_state( @@ -346,7 +348,6 @@ async def _run_subcrawler_pipeline( crawl_result = await self.crawl_one_with( subcrawler_pipeline=subcrawler_pipeline, context=context, - timeout=self._request_handler_timeout, result=RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store), state=use_state, ) diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index 710d89f9ef..1b123675c4 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -1,5 +1,6 @@ from __future__ import annotations +import asyncio import logging from datetime import timedelta from itertools import cycle @@ -281,10 +282,10 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: async def test_adaptive_crawling_statistics() -> None: - """Test adaptive crawling related statistics. + """Test adaptive crawler statistics. 
Crawler set to static crawling, but due to result_checker returning False on static crawling result it - will do browser crawling instead well. This increments all three adaptive crawling related stats.""" + will do browser crawling instead as well. This increments all three adaptive crawling related stats.""" requests = ['https://warehouse-theme-metal.myshopify.com/'] static_only_predictor_no_detection = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])) @@ -304,6 +305,10 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: assert crawler.predictor_state.browser_request_handler_runs == 1 assert crawler.predictor_state.rendering_type_mispredictions == 1 + # Despite running both sub crawlers the top crawler statistics should count this as one request finished. + assert crawler.statistics.state.requests_finished == 1 + assert crawler.statistics.state.requests_failed == 0 + @pytest.mark.parametrize( 'error_in_pw_crawler', @@ -380,3 +385,39 @@ def test_adaptive_playwright_crawler_statistics_in_init() -> None: assert crawler._statistics._log_message == log_message assert crawler._statistics._periodic_message_logger == periodic_message_logger assert crawler._statistics._log_interval == log_interval + + +async def test_adaptive_playwright_crawler_timeout_in_sub_crawler() -> None: + """Tests that timeout used by sub crawlers ensure that both have chance to run within top crawler timeout. + + Create situation where static sub crawler blocks(should timeout), such error should start browser sub + crawler, which must be capable of running without top crawler handler timing out.""" + requests = ['https://warehouse-theme-metal.myshopify.com/'] + + static_only_predictor_no_detection = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])) + top_crawler_handler_timeout = timedelta(seconds=2) + + crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( + rendering_type_predictor=static_only_predictor_no_detection, + result_checker=lambda result: False, # noqa: ARG005 # Intentionally unused argument. + request_handler_timeout=top_crawler_handler_timeout, + ) + mocked_static_handler = Mock() + mocked_browser_handler = Mock() + + @crawler.router.default_handler + async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: + try: + # page is available only if it was crawled by PlaywrightCrawler. + context.page # noqa:B018 Intentionally "useless expression". Can trigger exception. + mocked_browser_handler() + except AdaptiveContextError: + mocked_static_handler() + # Sleep for time obviously larger than top crawler timeout. + await asyncio.sleep(top_crawler_handler_timeout.total_seconds() * 2) + + await crawler.run(requests) + + mocked_static_handler.assert_called_once_with() + # Browser handler was capable of running despite static handler having sleep time larger than top handler timeout. 
+ mocked_browser_handler.assert_called_once_with() From 8ee8f995bcca1502f3d10cb5ba508102a59c1274 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Wed, 15 Jan 2025 14:06:57 +0100 Subject: [PATCH 34/79] Simplify prenav hooks --- .../_adaptive_playwright_crawler.py | 26 +++++++++---------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 4052d16003..1ef69da0e6 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -2,6 +2,7 @@ import asyncio import logging +from collections.abc import Awaitable, Callable from copy import deepcopy from dataclasses import dataclass from logging import getLogger @@ -50,7 +51,7 @@ from crawlee.statistics import Statistics if TYPE_CHECKING: - from collections.abc import Awaitable, Callable, Coroutine, Sequence + from collections.abc import Coroutine, Sequence from contextlib import AbstractAsyncContextManager from types import TracebackType @@ -92,7 +93,6 @@ async def __aexit__( class _OrphanPlaywrightContextPipeline(Generic[TStaticParseResult]): """Minimal setup required by playwright context pipeline to work without crawler.""" - pre_navigation_hook: Callable[[Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]]], None] pipeline: ContextPipeline[PlaywrightCrawlingContext] top_router: Router[AdaptivePlaywrightCrawlingContext] needed_context: AbstractAsyncContextManager @@ -114,7 +114,6 @@ async def from_pipeline_to_top_router(context: PlaywrightCrawlingContext) -> Non class _OrphanStaticContextPipeline(Generic[TStaticCrawlingContext]): """Minimal setup required by static context pipeline to work without crawler.""" - pre_navigation_hook: Callable[[Callable[[BasicCrawlingContext], Awaitable[None]]], None] pipeline: ContextPipeline[TStaticCrawlingContext] top_router: Router[AdaptivePlaywrightCrawlingContext] @@ -210,17 +209,24 @@ def __init__( **playwright_crawler_specific_kwargs, **basic_crawler_kwargs_for_pw_crawler ) + self._pre_navigation_hooks = list[Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]]]() + + async def adaptive_pre_navigation_hook(context: BasicCrawlingContext | PlaywrightPreNavCrawlingContext) -> None: + for hook in self._pre_navigation_hooks: + await hook(AdaptivePlaywrightPreNavCrawlingContext.from_pre_navigation_contexts(context)) + + playwright_crawler.pre_navigation_hook(adaptive_pre_navigation_hook) + static_crawler.pre_navigation_hook(adaptive_pre_navigation_hook) + self._pw_context_pipeline = _OrphanPlaywrightContextPipeline( pipeline=playwright_crawler._context_pipeline, # noqa:SLF001 # Intentional access to private member. needed_context=playwright_crawler._browser_pool, # noqa:SLF001 # Intentional access to private member. top_router=self.router, - pre_navigation_hook=playwright_crawler.pre_navigation_hook, static_parser=static_parser, ) self._static_context_pipeline = _OrphanStaticContextPipeline[ParsedHttpCrawlingContext[TStaticParseResult]]( pipeline=static_crawler._context_pipeline, # noqa:SLF001 # Intentional access to private member. top_router=self.router, - pre_navigation_hook=static_crawler.pre_navigation_hook, ) @staticmethod @@ -430,12 +436,4 @@ def pre_navigation_hook( Hooks are wrapped in context that handles possibly missing `page` object by throwing `AdaptiveContextError`. 
Hooks that try to access `context.page` will have to catch this exception if triggered by static pipeline. """ - - def hook_with_wrapped_context( - context: BasicCrawlingContext | PlaywrightPreNavCrawlingContext, - ) -> Awaitable[None]: - wrapped_context = AdaptivePlaywrightPreNavCrawlingContext.from_pre_navigation_contexts(context) - return hook(wrapped_context) - - self._pw_context_pipeline.pre_navigation_hook(hook_with_wrapped_context) - self._static_context_pipeline.pre_navigation_hook(hook_with_wrapped_context) + self._pre_navigation_hooks.append(hook) From c291d3ecab1bdcf6d60e4c4364315bc701940130 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Wed, 15 Jan 2025 14:19:37 +0100 Subject: [PATCH 35/79] Simplify context manager handling --- .../_adaptive_playwright_crawler.py | 28 ++----------------- 1 file changed, 3 insertions(+), 25 deletions(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 1ef69da0e6..a26b345dcb 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -51,18 +51,15 @@ from crawlee.statistics import Statistics if TYPE_CHECKING: - from collections.abc import Coroutine, Sequence - from contextlib import AbstractAsyncContextManager + from collections.abc import Coroutine from types import TracebackType from typing_extensions import Unpack - from crawlee import Request from crawlee.crawlers._abstract_http._abstract_http_crawler import _HttpCrawlerOptions from crawlee.crawlers._basic._basic_crawler import _BasicCrawlerOptions from crawlee.crawlers._playwright._playwright_crawler import PlaywrightCrawlerAdditionalOptions from crawlee.router import Router - from crawlee.statistics import FinalStatistics TStaticParseResult = TypeVar('TStaticParseResult') @@ -95,7 +92,6 @@ class _OrphanPlaywrightContextPipeline(Generic[TStaticParseResult]): pipeline: ContextPipeline[PlaywrightCrawlingContext] top_router: Router[AdaptivePlaywrightCrawlingContext] - needed_context: AbstractAsyncContextManager static_parser: AbstractHttpParser[TStaticParseResult] def create_pipeline_call(self, top_context: BasicCrawlingContext) -> Coroutine[Any, Any, None]: @@ -218,9 +214,10 @@ async def adaptive_pre_navigation_hook(context: BasicCrawlingContext | Playwrigh playwright_crawler.pre_navigation_hook(adaptive_pre_navigation_hook) static_crawler.pre_navigation_hook(adaptive_pre_navigation_hook) + self._additional_context_managers = [*self._additional_context_managers, playwright_crawler._browser_pool] # noqa: SLF001 # Intentional access to private member. + self._pw_context_pipeline = _OrphanPlaywrightContextPipeline( pipeline=playwright_crawler._context_pipeline, # noqa:SLF001 # Intentional access to private member. - needed_context=playwright_crawler._browser_pool, # noqa:SLF001 # Intentional access to private member. top_router=self.router, static_parser=static_parser, ) @@ -312,25 +309,6 @@ async def get_input_state( ) return result - async def run( - self, - requests: Sequence[str | Request] | None = None, - *, - purge_request_queue: bool = True, - ) -> FinalStatistics: - """Run the crawler until all requests are processed. - - Args: - requests: The requests to be enqueued before the crawler starts. - purge_request_queue: If this is `True` and the crawler is not being run for the first time, the default - request queue will be purged. 
- """ - if not getattr(self._pw_context_pipeline.needed_context, 'active', False): - async with self._pw_context_pipeline.needed_context: - return await super().run(requests=requests, purge_request_queue=purge_request_queue) - else: - return await super().run(requests=requests, purge_request_queue=purge_request_queue) - # Can't use override as mypy does not like it for double underscore private method. async def _BasicCrawler__run_request_handler(self, context: BasicCrawlingContext) -> None: # noqa: N802 """Override BasicCrawler method that delegates request processing to sub crawlers. From 2c2323842b0dacdfd05514eb9c6edd55d49d9507 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 15 Jan 2025 15:20:28 +0100 Subject: [PATCH 36/79] Review comments - _run_request_handler + timeouts --- .../_adaptive_playwright_crawler.py | 9 +++------ src/crawlee/crawlers/_basic/_basic_crawler.py | 18 +++++++++--------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index a26b345dcb..5c03db3fcc 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -275,9 +275,6 @@ async def crawl_one_with( state: dict[str, JsonSerializable] | None = None, ) -> RequestHandlerRunResult: # Timeout to ensure that both sub crawlers can finish one request withing top crawler `request_handler_timeout`. - sub_crawler_timeout_coefficient = 0.45 - timeout = self._request_handler_timeout * sub_crawler_timeout_coefficient - if state is not None: async def get_input_state( @@ -303,14 +300,14 @@ async def get_input_state( await wait_for( lambda: subcrawler_pipeline.create_pipeline_call(context_linked_to_result), - timeout=timeout, - timeout_message=f'Sub crawler timed out after {timeout.total_seconds()} seconds', + timeout=self._request_handler_timeout, + timeout_message=f'Sub crawler timed out after {self._request_handler_timeout.total_seconds()} seconds', logger=self._logger, ) return result # Can't use override as mypy does not like it for double underscore private method. - async def _BasicCrawler__run_request_handler(self, context: BasicCrawlingContext) -> None: # noqa: N802 + async def _run_request_handler(self, context: BasicCrawlingContext) -> None: """Override BasicCrawler method that delegates request processing to sub crawlers. To decide which sub crawler should process the request it runs `rendering_type_predictor`. 
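
The control flow that this override implements can be pictured with a minimal self-contained sketch: try the static sub crawler first, fall back to the browser sub crawler when the result looks suspicious or fails, and give each attempt its own timeout so that a hanging static run cannot starve the browser fallback. The coroutine names below are invented for illustration only; the real method drives the static and Playwright sub crawler pipelines and consults `rendering_type_predictor` and `result_checker` rather than these toy functions:

    from __future__ import annotations

    import asyncio


    async def static_attempt(url: str) -> str | None:
        await asyncio.sleep(0.1)  # stands in for the cheap HTTP-based sub crawler
        return None               # pretend the result looks suspicious

    async def browser_attempt(url: str) -> str:
        await asyncio.sleep(0.3)  # stands in for the slower Playwright sub crawler
        return f'rendered content of {url}'

    async def handle_request(url: str, handler_timeout: float = 2.0) -> str:
        # Each attempt runs under its own timeout, mirroring the `wait_for`
        # wrapper around every sub crawler pipeline call.
        try:
            result = await asyncio.wait_for(static_attempt(url), handler_timeout)
            if result is not None:  # stands in for the result_checker call
                return result
        except asyncio.TimeoutError:
            pass
        return await asyncio.wait_for(browser_attempt(url), handler_timeout)

    print(asyncio.run(handle_request('https://example.com')))
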
diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 2d41b86d36..65f0755fff 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -1029,13 +1029,7 @@ async def __run_task_function(self) -> None: try: request.state = RequestState.REQUEST_HANDLER - await wait_for( - lambda: self.__run_request_handler(context), - timeout=self._request_handler_timeout, - timeout_message='Request handler timed out after ' - f'{self._request_handler_timeout.total_seconds()} seconds', - logger=self._logger, - ) + await self._run_request_handler(context=context) await self._commit_request_handler_result(context, result) @@ -1126,8 +1120,14 @@ async def __run_task_function(self) -> None: ) raise - async def __run_request_handler(self, context: BasicCrawlingContext) -> None: - await self._context_pipeline(context, self.router) + async def _run_request_handler(self, context: BasicCrawlingContext) -> None: + await wait_for( + lambda: self._context_pipeline(context, self.router), + timeout=self._request_handler_timeout, + timeout_message='Request handler timed out after ' + f'{self._request_handler_timeout.total_seconds()} seconds', + logger=self._logger, + ) def _is_session_blocked_status_code(self, session: Session | None, status_code: int) -> bool: """Check if the HTTP status code indicates that the session was blocked by the target website. From b2a29c1cfb1a41abc8c64b56a1e42dcfaa139d9d Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 15 Jan 2025 16:06:41 +0100 Subject: [PATCH 37/79] Statistics. TODO: Propagate generics through crawlers. --- .../_adaptive_playwright_crawler.py | 12 ++- ..._adaptive_playwright_crawler_statistics.py | 45 ++++------- src/crawlee/crawlers/_basic/_basic_crawler.py | 5 +- src/crawlee/statistics/_statistics.py | 76 ++++++++++++------- .../unit/_statistics/test_periodic_logging.py | 2 +- tests/unit/_statistics/test_persistence.py | 4 +- .../test_adaptive_playwright_crawler.py | 10 +-- ..._adaptive_playwright_crawler_statistics.py | 18 ++--- .../http_clients/test_curl_impersonate.py | 4 +- tests/unit/http_clients/test_httpx.py | 4 +- 10 files changed, 90 insertions(+), 90 deletions(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 5c03db3fcc..0cab809f68 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -48,7 +48,7 @@ ) from crawlee.crawlers._beautifulsoup._beautifulsoup_parser import BeautifulSoupParser from crawlee.crawlers._parsel._parsel_parser import ParselParser -from crawlee.statistics import Statistics +from crawlee.statistics import Statistics, StatisticsState if TYPE_CHECKING: from collections.abc import Coroutine @@ -70,7 +70,7 @@ class _NoActiveStatistics(Statistics): """Statistics compliant object that is not supposed to do anything when active. To be used in sub crawlers.""" def __init__(self) -> None: - super().__init__() + super().__init__(state_model=StatisticsState) self._active = True async def __aenter__(self) -> Self: @@ -174,8 +174,6 @@ def __init__( statistics = AdaptivePlaywrightCrawlerStatistics() kwargs['statistics'] = statistics - self.predictor_state = statistics.predictor_state - super().__init__(**kwargs) # Sub crawlers related. 
@@ -345,7 +343,7 @@ async def _run_subcrawler_pipeline( ) if rendering_type_prediction.rendering_type == 'static': context.log.debug(f'Running static request for {context.request.url}') - self.predictor_state.track_http_only_request_handler_runs() + self.statistics.track_http_only_request_handler_runs() static_run = await _run_subcrawler_pipeline(self._static_context_pipeline) if static_run.result and self.result_checker(static_run.result): @@ -357,7 +355,7 @@ async def _run_subcrawler_pipeline( ) else: context.log.warning(f'Static crawler: returned a suspicious result for {context.request.url}') - self.predictor_state.track_rendering_type_mispredictions() + self.statistics.track_rendering_type_mispredictions() context.log.debug(f'Running browser request handler for {context.request.url}') @@ -372,7 +370,7 @@ async def _run_subcrawler_pipeline( old_state_copy = deepcopy(old_state) pw_run = await _run_subcrawler_pipeline(self._pw_context_pipeline) - self.predictor_state.track_browser_request_handler_runs() + self.statistics.track_browser_request_handler_runs() if pw_run.exception is not None: raise pw_run.exception diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py index 9ff2ec887a..9d0073fce3 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py @@ -1,10 +1,9 @@ from __future__ import annotations from datetime import timedelta -from typing import TYPE_CHECKING, Annotated, Any, cast +from typing import TYPE_CHECKING, Annotated -from pydantic import BaseModel, ConfigDict, Field -from typing_extensions import override +from pydantic import ConfigDict, Field from crawlee._utils.docs import docs_group from crawlee.statistics import Statistics, StatisticsState @@ -18,25 +17,16 @@ @docs_group('Data structures') -class PredictorState(BaseModel): +class AdaptivePlaywrightCrawlerStatisticState(StatisticsState): model_config = ConfigDict(populate_by_name=True, ser_json_inf_nan='constants') http_only_request_handler_runs: Annotated[int, Field(alias='http_only_request_handler_runs')] = 0 browser_request_handler_runs: Annotated[int, Field(alias='browser_request_handler_runs')] = 0 rendering_type_mispredictions: Annotated[int, Field(alias='rendering_type_mispredictions')] = 0 - def track_http_only_request_handler_runs(self) -> None: - self.http_only_request_handler_runs += 1 - - def track_browser_request_handler_runs(self) -> None: - self.browser_request_handler_runs += 1 - - def track_rendering_type_mispredictions(self) -> None: - self.rendering_type_mispredictions += 1 - @docs_group('Classes') -class AdaptivePlaywrightCrawlerStatistics(Statistics): +class AdaptivePlaywrightCrawlerStatistics(Statistics[AdaptivePlaywrightCrawlerStatisticState]): def __init__( self, *, @@ -47,9 +37,7 @@ def __init__( log_message: str = 'Statistics', periodic_message_logger: Logger | None = None, log_interval: timedelta = timedelta(minutes=1), - state_model: type[StatisticsState] = StatisticsState, ) -> None: - self.predictor_state = PredictorState() super().__init__( persistence_enabled=persistence_enabled, persist_state_kvs_name=persist_state_kvs_name, @@ -58,9 +46,9 @@ def __init__( log_message=log_message, periodic_message_logger=periodic_message_logger, log_interval=log_interval, - state_model=state_model, + 
state_model=AdaptivePlaywrightCrawlerStatisticState, ) - self._persist_predictor_state_key = self._persist_state_key + '_PREDICTOR' + self.state @classmethod def from_statistics(cls, statistics: Statistics) -> Self: @@ -72,20 +60,13 @@ def from_statistics(cls, statistics: Statistics) -> Self: log_message=statistics._log_message, # noqa:SLF001 # Accessing private member to create copy like-object. periodic_message_logger=statistics._periodic_message_logger, # noqa:SLF001 # Accessing private member to create copy like-object. log_interval=statistics._log_interval, # noqa:SLF001 # Accessing private member to create copy like-object. - state_model=statistics._state_model, # noqa:SLF001 # Accessing private member to create copy like-object. ) - @override - async def _persist_other_statistics(self, key_value_store: KeyValueStore) -> None: - """Persist state of predictor.""" - await key_value_store.set_value( - self._persist_predictor_state_key, - self.predictor_state.model_dump(mode='json', by_alias=True), - 'application/json', - ) + def track_http_only_request_handler_runs(self) -> None: + self.state.http_only_request_handler_runs += 1 - @override - async def _load_other_statistics(self, key_value_store: KeyValueStore) -> None: - """Load state of predictor.""" - stored_state = await key_value_store.get_value(self._persist_predictor_state_key, cast(Any, {})) - self.predictor_state = self.predictor_state.__class__.model_validate(stored_state) + def track_browser_request_handler_runs(self) -> None: + self.state.browser_request_handler_runs += 1 + + def track_rendering_type_mispredictions(self) -> None: + self.state.rendering_type_mispredictions += 1 diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 65f0755fff..bc2f938879 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -44,7 +44,7 @@ from crawlee.http_clients import HttpxHttpClient from crawlee.router import Router from crawlee.sessions import SessionPool -from crawlee.statistics import Statistics +from crawlee.statistics import Statistics, StatisticsState from crawlee.storages import Dataset, KeyValueStore, RequestQueue from ._context_pipeline import ContextPipeline @@ -66,6 +66,7 @@ from crawlee.storages._dataset import ExportDataCsvKwargs, ExportDataJsonKwargs, GetDataKwargs, PushDataKwargs TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext) +TStatisticsState = TypeVar('TStatisticsState', bound=BasicCrawlingContext, default=StatisticsState) ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Union[Request, None]]] FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]] @@ -313,7 +314,7 @@ def __init__( self._logger = _logger or logging.getLogger(__name__) # Statistics - self._statistics = statistics or Statistics( + self._statistics = statistics or Statistics.with_default_state( periodic_message_logger=self._logger, log_message='Current request statistics:', ) diff --git a/src/crawlee/statistics/_statistics.py b/src/crawlee/statistics/_statistics.py index 09e9a5616a..9b7bf31864 100644 --- a/src/crawlee/statistics/_statistics.py +++ b/src/crawlee/statistics/_statistics.py @@ -4,7 +4,9 @@ import math from datetime import datetime, timedelta, timezone from logging import Logger, getLogger -from typing import TYPE_CHECKING, Any, cast +from typing import TYPE_CHECKING, Any, Generic, cast + +from typing_extensions import Self, TypeVar 
from crawlee import service_locator from crawlee._utils.context import ensure_context @@ -18,9 +20,7 @@ if TYPE_CHECKING: from types import TracebackType - from typing_extensions import Self - - +TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState) logger = getLogger(__name__) @@ -53,7 +53,7 @@ def retry_count(self) -> int: @docs_group('Classes') -class Statistics: +class Statistics(Generic[TStatisticsState]): """An interface to collecting and logging runtime statistics for requests. All information is saved to the key value store so that it persists between migrations, abortions and resurrections. @@ -71,13 +71,13 @@ def __init__( log_message: str = 'Statistics', periodic_message_logger: Logger | None = None, log_interval: timedelta = timedelta(minutes=1), - state_model: type[StatisticsState] = StatisticsState, + state_model: type[TStatisticsState], ) -> None: self._id = Statistics.__next_id Statistics.__next_id += 1 self._state_model = state_model - self.state: StatisticsState = self._state_model() + self.state = self._state_model() self._instance_start: datetime | None = None self._retry_histogram = dict[int, int]() @@ -87,7 +87,7 @@ def __init__( self._requests_in_progress = dict[str, RequestProcessingRecord]() if persist_state_key is None: - persist_state_key = self._get_default_persist_state_key() + persist_state_key = f'SDK_CRAWLER_STATISTICS_{self._id}' self._persistence_enabled = persistence_enabled self._persist_state_key = persist_state_key @@ -102,8 +102,28 @@ def __init__( # Flag to indicate the context state. self._active = False - def _get_default_persist_state_key(self) -> str: - return f'SDK_CRAWLER_STATISTICS_{self._id}' + @staticmethod + def with_default_state( + *, + persistence_enabled: bool = False, + persist_state_kvs_name: str = 'default', + persist_state_key: str | None = None, + key_value_store: KeyValueStore | None = None, + log_message: str = 'Statistics', + periodic_message_logger: Logger | None = None, + log_interval: timedelta = timedelta(minutes=1), + ) -> Statistics[StatisticsState]: + """Convenience constructor for creating a `Statistics` with default state model `StatisticsState`.""" + return Statistics[StatisticsState]( + persistence_enabled=persistence_enabled, + persist_state_kvs_name=persist_state_kvs_name, + persist_state_key=persist_state_key, + key_value_store=key_value_store, + log_message=log_message, + periodic_message_logger=periodic_message_logger, + log_interval=log_interval, + state_model=StatisticsState, + ) @property def active(self) -> bool: @@ -247,12 +267,13 @@ def _log(self) -> None: self._periodic_message_logger.info(f'{self._log_message}\n{stats.to_table()}') async def _maybe_load_statistics(self) -> None: - if self._persistence_enabled and self._key_value_store is not None: - await self._load_crawler_statistics(self._key_value_store) - await self._load_other_statistics(self._key_value_store) + if not self._persistence_enabled: + return - async def _load_crawler_statistics(self, key_value_store: KeyValueStore) -> None: - stored_state = await key_value_store.get_value(self._persist_state_key, cast(Any, {})) + if not self._key_value_store: + return + + stored_state = await self._key_value_store.get_value(self._persist_state_key, cast(Any, {})) saved_state = self.state.__class__.model_validate(stored_state) self.state = saved_state @@ -264,10 +285,18 @@ async def _load_crawler_statistics(self, key_value_store: KeyValueStore) -> None elif saved_state.crawler_last_started_at: self._instance_start 
= saved_state.crawler_last_started_at - async def _load_other_statistics(self, key_value_store: KeyValueStore) -> None: - """Load other statistics. Not implemented in Statistics.""" + async def _persist_state(self, event_data: EventPersistStateData) -> None: + logger.debug(f'Persisting state of the Statistics (event_data={event_data}).') + + if not self._persistence_enabled: + return + + if not self._key_value_store: + return + + if not self._instance_start: + return - async def _persist_crawler_statistics(self, key_value_store: KeyValueStore) -> None: final_statistics = self.calculate() persisted_state = StatisticsPersistedState( stats_id=self._id, @@ -282,21 +311,12 @@ async def _persist_crawler_statistics(self, key_value_store: KeyValueStore) -> N logger.debug('Persisting state') - await key_value_store.set_value( + await self._key_value_store.set_value( self._persist_state_key, self.state.model_dump(mode='json', by_alias=True) | persisted_state.model_dump(mode='json', by_alias=True), 'application/json', ) - async def _persist_other_statistics(self, key_value_store: KeyValueStore) -> None: - """Save other statistics. Not implemented in Statistics.""" - - async def _persist_state(self, event_data: EventPersistStateData) -> None: - logger.debug(f'Persisting state of the Statistics (event_data={event_data}).') - if self._persistence_enabled and self._key_value_store is not None and self._instance_start is not None: - await self._persist_crawler_statistics(self._key_value_store) - await self._persist_other_statistics(self._key_value_store) - def _save_retry_count_for_request(self, record: RequestProcessingRecord) -> None: retry_count = record.retry_count diff --git a/tests/unit/_statistics/test_periodic_logging.py b/tests/unit/_statistics/test_periodic_logging.py index 38938a9c9e..1a72e613ae 100644 --- a/tests/unit/_statistics/test_periodic_logging.py +++ b/tests/unit/_statistics/test_periodic_logging.py @@ -15,7 +15,7 @@ async def test_periodic_logging(caplog: pytest.LogCaptureFixture) -> None: caplog.set_level(logging.INFO) log_message = 'Periodic statistics XYZ' - statistics = Statistics(log_interval=timedelta(milliseconds=50), log_message=log_message) + statistics = Statistics.with_default_state(log_interval=timedelta(milliseconds=50), log_message=log_message) async with statistics: await asyncio.sleep(0.1) diff --git a/tests/unit/_statistics/test_persistence.py b/tests/unit/_statistics/test_persistence.py index 1cda18d2cf..3c6e06d02d 100644 --- a/tests/unit/_statistics/test_persistence.py +++ b/tests/unit/_statistics/test_persistence.py @@ -6,10 +6,10 @@ async def test_basic_persistence() -> None: key = 'statistics_foo' - async with Statistics(persistence_enabled=True, persist_state_key=key) as statistics: + async with Statistics.with_default_state(persistence_enabled=True, persist_state_key=key) as statistics: statistics.state.requests_failed = 42 - async with Statistics(persistence_enabled=True, persist_state_key=key) as statistics: + async with Statistics.with_default_state(persistence_enabled=True, persist_state_key=key) as statistics: pass assert statistics.state.requests_failed == 42 diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index 1b123675c4..1889a26d44 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -301,9 +301,9 @@ async def 
request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: await crawler.run(requests) - assert crawler.predictor_state.http_only_request_handler_runs == 1 - assert crawler.predictor_state.browser_request_handler_runs == 1 - assert crawler.predictor_state.rendering_type_mispredictions == 1 + assert crawler.statistics.state.http_only_request_handler_runs == 1 + assert crawler.statistics.state.browser_request_handler_runs == 1 + assert crawler.statistics.state.rendering_type_mispredictions == 1 # Despite running both sub crawlers the top crawler statistics should count this as one request finished. assert crawler.statistics.state.requests_finished == 1 @@ -313,7 +313,7 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: @pytest.mark.parametrize( 'error_in_pw_crawler', [ - pytest.param(False, id='Error only in bs sub crawler'), + pytest.param(False, id='Error only in static sub crawler'), pytest.param(True, id='Error in both sub crawlers'), ], ) @@ -367,7 +367,7 @@ def test_adaptive_playwright_crawler_statistics_in_init() -> None: log_message = 'some message' periodic_message_logger = logging.getLogger('some logger') # Accessing private member to create copy like-object. log_interval = timedelta(minutes=2) - statistics = Statistics( + statistics = Statistics.with_default_state( persistence_enabled=persistence_enabled, persist_state_kvs_name=persist_state_kvs_name, persist_state_key=persist_state_key, diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler_statistics.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler_statistics.py index a885597d20..fb685b71a3 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler_statistics.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler_statistics.py @@ -7,14 +7,14 @@ async def test_predictor_state_persistence() -> None: """Test that adaptive statistics can be correctly persisted and initialized from persisted values.""" async with AdaptivePlaywrightCrawlerStatistics(persistence_enabled=True) as adaptive_statistics: - adaptive_statistics.predictor_state.track_rendering_type_mispredictions() - adaptive_statistics.predictor_state.track_rendering_type_mispredictions() + adaptive_statistics.track_rendering_type_mispredictions() + adaptive_statistics.track_rendering_type_mispredictions() - adaptive_statistics.predictor_state.track_http_only_request_handler_runs() - adaptive_statistics.predictor_state.track_http_only_request_handler_runs() - adaptive_statistics.predictor_state.track_http_only_request_handler_runs() + adaptive_statistics.track_http_only_request_handler_runs() + adaptive_statistics.track_http_only_request_handler_runs() + adaptive_statistics.track_http_only_request_handler_runs() - adaptive_statistics.predictor_state.track_browser_request_handler_runs() + adaptive_statistics.track_browser_request_handler_runs() persistence_state_key = adaptive_statistics._persist_state_key # adaptive_statistics are persisted after leaving the context @@ -25,6 +25,6 @@ async def test_predictor_state_persistence() -> None: ) as new_adaptive_statistics: pass - assert new_adaptive_statistics.predictor_state.rendering_type_mispredictions == 2 - assert new_adaptive_statistics.predictor_state.http_only_request_handler_runs == 3 - assert new_adaptive_statistics.predictor_state.browser_request_handler_runs == 1 + assert new_adaptive_statistics.state.rendering_type_mispredictions == 2 + assert 
new_adaptive_statistics.state.http_only_request_handler_runs == 3 + assert new_adaptive_statistics.state.browser_request_handler_runs == 1 diff --git a/tests/unit/http_clients/test_curl_impersonate.py b/tests/unit/http_clients/test_curl_impersonate.py index aa90a51823..7f5c8312de 100644 --- a/tests/unit/http_clients/test_curl_impersonate.py +++ b/tests/unit/http_clients/test_curl_impersonate.py @@ -32,7 +32,7 @@ async def test_crawl_with_proxy( url = str(httpbin / 'status/222') request = Request.from_url(url) - async with Statistics() as statistics: + async with Statistics.with_default_state() as statistics: result = await http_client.crawl(request, proxy_info=proxy, statistics=statistics) assert result.http_response.status_code == 222 # 222 - authentication successful @@ -48,7 +48,7 @@ async def test_crawl_with_proxy_disabled( request = Request.from_url(url) with pytest.raises(ProxyError): - async with Statistics() as statistics: + async with Statistics.with_default_state() as statistics: await http_client.crawl(request, proxy_info=disabled_proxy, statistics=statistics) diff --git a/tests/unit/http_clients/test_httpx.py b/tests/unit/http_clients/test_httpx.py index 2224a0c146..08ad340b2c 100644 --- a/tests/unit/http_clients/test_httpx.py +++ b/tests/unit/http_clients/test_httpx.py @@ -44,7 +44,7 @@ async def test_proxy( url = str(httpbin / 'status/222') request = Request.from_url(url) - async with Statistics() as statistics: + async with Statistics.with_default_state() as statistics: result = await http_client.crawl(request, proxy_info=proxy, statistics=statistics) assert result.http_response.status_code == 222 @@ -60,7 +60,7 @@ async def test_proxy_disabled( request = Request.from_url(url) with pytest.raises(ProxyError): - async with Statistics() as statistics: + async with Statistics.with_default_state() as statistics: await http_client.crawl(request, proxy_info=disabled_proxy, statistics=statistics) From 0786f870c80772c332b79868ef69c4ba652c2188 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 15 Jan 2025 17:01:13 +0100 Subject: [PATCH 38/79] Make statistics generic again! With consequences of making crawler generic with another parameter. 
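
Statistics is now generic over its state model and the crawlers gain a matching type parameter, so the adaptive crawler can expose its extra counters through `crawler.statistics.state` with correct typing. The shape of the change can be pictured with a simplified, self-contained stand-in (plain dataclasses instead of the pydantic models and persistence machinery used in the real code):

    from __future__ import annotations

    from dataclasses import dataclass
    from typing import Generic, TypeVar


    @dataclass
    class StatisticsState:
        requests_finished: int = 0
        requests_failed: int = 0


    @dataclass
    class AdaptiveStatisticsState(StatisticsState):
        http_only_request_handler_runs: int = 0
        browser_request_handler_runs: int = 0
        rendering_type_mispredictions: int = 0


    TState = TypeVar('TState', bound=StatisticsState)


    class Statistics(Generic[TState]):
        def __init__(self, state_model: type[TState]) -> None:
            # The concrete state class is injected, so one implementation
            # serves both the default and the adaptive state models.
            self.state: TState = state_model()

        @staticmethod
        def with_default_state() -> Statistics[StatisticsState]:
            return Statistics(state_model=StatisticsState)


    basic = Statistics.with_default_state()
    adaptive = Statistics(state_model=AdaptiveStatisticsState)
    adaptive.state.browser_request_handler_runs += 1
    print(basic.state.requests_finished, adaptive.state.browser_request_handler_runs)

Keeping the counters on a typed state model (introduced in the previous patch) lets them share the same persistence path as the rest of the crawler statistics, while the generics added here make those extra fields visible to the type checker.
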
--- .../_abstract_http/_abstract_http_crawler.py | 14 ++++- .../_adaptive_playwright_crawler.py | 57 ++++++++++++------- ..._adaptive_playwright_crawler_statistics.py | 10 ---- src/crawlee/crawlers/_basic/_basic_crawler.py | 31 +++++----- .../_playwright/_playwright_crawler.py | 5 +- ..._adaptive_playwright_crawler_statistics.py | 13 ++--- 6 files changed, 74 insertions(+), 56 deletions(-) diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py index 6514a70cc1..76696c6852 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py @@ -14,6 +14,7 @@ from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline from crawlee.errors import SessionError from crawlee.http_clients import HttpxHttpClient +from crawlee.statistics import StatisticsState from ._http_crawling_context import HttpCrawlingContext, ParsedHttpCrawlingContext, TParseResult @@ -27,6 +28,7 @@ from ._abstract_http_parser import AbstractHttpParser TCrawlingContext = TypeVar('TCrawlingContext', bound=ParsedHttpCrawlingContext) +TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState) @docs_group('Data structures') @@ -39,7 +41,11 @@ class _HttpCrawlerOptions(Generic[TCrawlingContext], TypedDict): @docs_group('Data structures') -class HttpCrawlerOptions(Generic[TCrawlingContext], _HttpCrawlerOptions, BasicCrawlerOptions[TCrawlingContext]): +class HttpCrawlerOptions( + Generic[TCrawlingContext, TStatisticsState], + _HttpCrawlerOptions, + BasicCrawlerOptions[TCrawlingContext, StatisticsState], +): """Arguments for the `AbstractHttpCrawler` constructor. It is intended for typing forwarded `__init__` arguments in the subclasses. @@ -47,7 +53,9 @@ class HttpCrawlerOptions(Generic[TCrawlingContext], _HttpCrawlerOptions, BasicCr @docs_group('Abstract classes') -class AbstractHttpCrawler(Generic[TCrawlingContext, TParseResult], BasicCrawler[TCrawlingContext], ABC): +class AbstractHttpCrawler( + Generic[TCrawlingContext, TParseResult], BasicCrawler[TCrawlingContext, StatisticsState], ABC +): """A web crawler for performing HTTP requests. The `AbstractHttpCrawler` builds on top of the `BasicCrawler`, which means it inherits all of its features. 
On top @@ -68,7 +76,7 @@ def __init__( parser: AbstractHttpParser[TParseResult], additional_http_error_status_codes: Iterable[int] = (), ignore_http_error_status_codes: Iterable[int] = (), - **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext]], + **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]], ) -> None: self._parser = parser self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = [] diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 0cab809f68..823b738eb0 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -30,6 +30,7 @@ ) from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler_statistics import ( AdaptivePlaywrightCrawlerStatistics, + AdaptivePlaywrightCrawlerStatisticState, ) from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import ( AdaptivePlaywrightCrawlingContext, @@ -124,7 +125,8 @@ async def from_pipeline_to_top_router(context: TStaticCrawlingContext) -> None: class AdaptivePlaywrightCrawler( - Generic[TStaticCrawlingContext, TStaticParseResult], BasicCrawler[AdaptivePlaywrightCrawlingContext] + Generic[TStaticCrawlingContext, TStaticParseResult], + BasicCrawler[AdaptivePlaywrightCrawlingContext, AdaptivePlaywrightCrawlerStatisticState], ): """Adaptive crawler that uses both specific implementation of `AbstractHttpCrawler` and `PlaywrightCrawler`. @@ -143,6 +145,7 @@ def __init__( result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None, static_crawler_specific_kwargs: _HttpCrawlerOptions | None = None, playwright_crawler_specific_kwargs: PlaywrightCrawlerAdditionalOptions | None = None, + statistics: Statistics[AdaptivePlaywrightCrawlerStatisticState] | None = None, **kwargs: Unpack[_BasicCrawlerOptions], ) -> None: """A default constructor. Recommended way to create instance is to call factory methods `with_*_static_parser`. @@ -155,6 +158,8 @@ def __init__( static_parser: Implementation of `AbstractHttpParser`. Parser that will be used for static crawling. static_crawler_specific_kwargs: `AbstractHttpCrawler` only kwargs that are passed to the sub crawler. playwright_crawler_specific_kwargs: `PlaywrightCrawler` only kwargs that are passed to the sub crawler. + statistics: A custom `Statistics[AdaptivePlaywrightCrawlerStatisticState]` instance, allowing the use of + non-default configuration. kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`. """ # Some sub crawler kwargs are internally modified. Prepare copies. @@ -166,15 +171,7 @@ def __init__( self.result_checker = result_checker or (lambda result: True) # noqa: ARG005 # Intentionally unused argument. self.result_comparator = result_comparator or create_default_comparator(result_checker) - # Use AdaptivePlaywrightCrawlerStatistics. - if 'statistics' in kwargs: - # If statistics already specified by user, create AdaptivePlaywrightCrawlerStatistics from it. - statistics = AdaptivePlaywrightCrawlerStatistics.from_statistics(statistics=kwargs['statistics']) - else: - statistics = AdaptivePlaywrightCrawlerStatistics() - kwargs['statistics'] = statistics - - super().__init__(**kwargs) + super().__init__(statistics=statistics, **kwargs) # Sub crawlers related. 
playwright_crawler_specific_kwargs = playwright_crawler_specific_kwargs or {} @@ -189,18 +186,19 @@ def __init__( pw_logger.setLevel(logging.ERROR) basic_crawler_kwargs_for_pw_crawler['_logger'] = pw_logger - # Each sub crawler will use own dummy statistics. - basic_crawler_kwargs_for_static_crawler['statistics'] = _NoActiveStatistics() - basic_crawler_kwargs_for_pw_crawler['statistics'] = _NoActiveStatistics() - # Initialize sub crawlers to create their pipelines. static_crawler_class = AbstractHttpCrawler.create_parsed_http_crawler_class(static_parser=static_parser) static_crawler = static_crawler_class( - parser=static_parser, **static_crawler_specific_kwargs, **basic_crawler_kwargs_for_static_crawler + parser=static_parser, + statistics=_NoActiveStatistics(), + **static_crawler_specific_kwargs, + **basic_crawler_kwargs_for_static_crawler, ) playwright_crawler = PlaywrightCrawler( - **playwright_crawler_specific_kwargs, **basic_crawler_kwargs_for_pw_crawler + statistics=_NoActiveStatistics(), + **playwright_crawler_specific_kwargs, + **basic_crawler_kwargs_for_pw_crawler, ) self._pre_navigation_hooks = list[Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]]]() @@ -232,9 +230,14 @@ def with_beautifulsoup_static_parser( parser_type: BeautifulSoupParserType = 'lxml', static_crawler_specific_kwargs: _HttpCrawlerOptions[BeautifulSoupCrawlingContext] | None = None, playwright_crawler_specific_kwargs: PlaywrightCrawlerAdditionalOptions | None = None, + statistics: Statistics[StatisticsState] | None = None, **kwargs: Unpack[_BasicCrawlerOptions], ) -> AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[BeautifulSoup], BeautifulSoup]: """Creates `AdaptivePlaywrightCrawler` that uses `BeautifulSoup` for parsing static content.""" + if statistics is not None: + adaptive_statistics = AdaptivePlaywrightCrawlerStatistics.from_statistics(statistics=statistics) + else: + adaptive_statistics = AdaptivePlaywrightCrawlerStatistics() return AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[BeautifulSoup], BeautifulSoup]( rendering_type_predictor=rendering_type_predictor, result_checker=result_checker, @@ -242,6 +245,7 @@ def with_beautifulsoup_static_parser( static_parser=BeautifulSoupParser(parser=parser_type), static_crawler_specific_kwargs=static_crawler_specific_kwargs, playwright_crawler_specific_kwargs=playwright_crawler_specific_kwargs, + statistics=adaptive_statistics, **kwargs, ) @@ -252,9 +256,14 @@ def with_parsel_static_parser( result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None, static_crawler_specific_kwargs: _HttpCrawlerOptions[ParselCrawlingContext] | None = None, playwright_crawler_specific_kwargs: PlaywrightCrawlerAdditionalOptions | None = None, + statistics: Statistics[StatisticsState] | None = None, **kwargs: Unpack[_BasicCrawlerOptions], ) -> AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[Selector], Selector]: """Creates `AdaptivePlaywrightCrawler` that uses `Parcel` for parsing static content.""" + if statistics is not None: + adaptive_statistics = AdaptivePlaywrightCrawlerStatistics.from_statistics(statistics=statistics) + else: + adaptive_statistics = AdaptivePlaywrightCrawlerStatistics() return AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[Selector], Selector]( rendering_type_predictor=rendering_type_predictor, result_checker=result_checker, @@ -262,6 +271,7 @@ def with_parsel_static_parser( static_parser=ParselParser(), static_crawler_specific_kwargs=static_crawler_specific_kwargs, 
playwright_crawler_specific_kwargs=playwright_crawler_specific_kwargs, + statistics=adaptive_statistics, **kwargs, ) @@ -343,7 +353,7 @@ async def _run_subcrawler_pipeline( ) if rendering_type_prediction.rendering_type == 'static': context.log.debug(f'Running static request for {context.request.url}') - self.statistics.track_http_only_request_handler_runs() + self.track_http_only_request_handler_runs() static_run = await _run_subcrawler_pipeline(self._static_context_pipeline) if static_run.result and self.result_checker(static_run.result): @@ -355,7 +365,7 @@ async def _run_subcrawler_pipeline( ) else: context.log.warning(f'Static crawler: returned a suspicious result for {context.request.url}') - self.statistics.track_rendering_type_mispredictions() + self.track_rendering_type_mispredictions() context.log.debug(f'Running browser request handler for {context.request.url}') @@ -370,7 +380,7 @@ async def _run_subcrawler_pipeline( old_state_copy = deepcopy(old_state) pw_run = await _run_subcrawler_pipeline(self._pw_context_pipeline) - self.statistics.track_browser_request_handler_runs() + self.track_browser_request_handler_runs() if pw_run.exception is not None: raise pw_run.exception @@ -410,3 +420,12 @@ def pre_navigation_hook( Hooks that try to access `context.page` will have to catch this exception if triggered by static pipeline. """ self._pre_navigation_hooks.append(hook) + + def track_http_only_request_handler_runs(self) -> None: + self.statistics.state.http_only_request_handler_runs += 1 + + def track_browser_request_handler_runs(self) -> None: + self.statistics.state.browser_request_handler_runs += 1 + + def track_rendering_type_mispredictions(self) -> None: + self.statistics.state.rendering_type_mispredictions += 1 diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py index 9d0073fce3..5045102022 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py @@ -48,7 +48,6 @@ def __init__( log_interval=log_interval, state_model=AdaptivePlaywrightCrawlerStatisticState, ) - self.state @classmethod def from_statistics(cls, statistics: Statistics) -> Self: @@ -61,12 +60,3 @@ def from_statistics(cls, statistics: Statistics) -> Self: periodic_message_logger=statistics._periodic_message_logger, # noqa:SLF001 # Accessing private member to create copy like-object. log_interval=statistics._log_interval, # noqa:SLF001 # Accessing private member to create copy like-object. 
) - - def track_http_only_request_handler_runs(self) -> None: - self.state.http_only_request_handler_runs += 1 - - def track_browser_request_handler_runs(self) -> None: - self.state.browser_request_handler_runs += 1 - - def track_rendering_type_mispredictions(self) -> None: - self.state.rendering_type_mispredictions += 1 diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index bc2f938879..2b1cccfb5a 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -66,13 +66,13 @@ from crawlee.storages._dataset import ExportDataCsvKwargs, ExportDataJsonKwargs, GetDataKwargs, PushDataKwargs TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext) -TStatisticsState = TypeVar('TStatisticsState', bound=BasicCrawlingContext, default=StatisticsState) +TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState) ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Union[Request, None]]] FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]] @docs_group('Data structures') -class _BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]): +class _BasicCrawlerOptions(TypedDict): """Non-generic options for basic crawler.""" configuration: NotRequired[Configuration] @@ -127,9 +127,6 @@ class _BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]): request_handler_timeout: NotRequired[timedelta] """Maximum duration allowed for a single request handler to run.""" - statistics: NotRequired[Statistics] - """A custom `Statistics` instance, allowing the use of non-default configuration.""" - abort_on_error: NotRequired[bool] """If True, the crawler stops immediately when any request handler error occurs.""" @@ -146,7 +143,7 @@ class _BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]): @docs_group('Data structures') -class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext], TypedDict): +class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], TypedDict): """Generic options for basic crawler.""" request_handler: NotRequired[Callable[[TCrawlingContext], Awaitable[None]]] @@ -156,10 +153,15 @@ class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext], TypedDict): """Enables extending the request lifecycle and modifying the crawling context. Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.""" + statistics: NotRequired[Statistics[TStatisticsState]] + """A custom `Statistics` instance, allowing the use of non-default configuration.""" + @docs_group('Data structures') class BasicCrawlerOptions( - Generic[TCrawlingContext], _BasicCrawlerOptions, _BasicCrawlerOptionsGeneric[TCrawlingContext] + Generic[TCrawlingContext, TStatisticsState], + _BasicCrawlerOptions, + _BasicCrawlerOptionsGeneric[TCrawlingContext, TStatisticsState], ): """Arguments for the `BasicCrawler` constructor. @@ -168,7 +170,7 @@ class BasicCrawlerOptions( @docs_group('Classes') -class BasicCrawler(Generic[TCrawlingContext]): +class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]): """A basic web crawler providing a framework for crawling websites. 
The `BasicCrawler` provides a low-level functionality for crawling websites, allowing users to define their @@ -213,7 +215,7 @@ def __init__( retry_on_blocked: bool = True, concurrency_settings: ConcurrencySettings | None = None, request_handler_timeout: timedelta = timedelta(minutes=1), - statistics: Statistics | None = None, + statistics: Statistics[TStatisticsState] | None = None, abort_on_error: bool = False, configure_logging: bool = True, _context_pipeline: ContextPipeline[TCrawlingContext] | None = None, @@ -314,9 +316,12 @@ def __init__( self._logger = _logger or logging.getLogger(__name__) # Statistics - self._statistics = statistics or Statistics.with_default_state( - periodic_message_logger=self._logger, - log_message='Current request statistics:', + self._statistics = statistics or cast( + Statistics[TStatisticsState], + Statistics.with_default_state( + periodic_message_logger=self._logger, + log_message='Current request statistics:', + ), ) # Additional context managers to enter and exit @@ -365,7 +370,7 @@ def router(self, router: Router[TCrawlingContext]) -> None: self._router = router @property - def statistics(self) -> Statistics: + def statistics(self) -> Statistics[TStatisticsState]: """Statistics about the current (or last) crawler run.""" return self._statistics diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index c2119b3cb4..3313b7af6e 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -14,6 +14,7 @@ from crawlee.browsers import BrowserPool from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline from crawlee.errors import SessionError +from crawlee.statistics import StatisticsState from ._playwright_crawling_context import PlaywrightCrawlingContext from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext @@ -29,7 +30,7 @@ @docs_group('Classes') -class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext]): +class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]): """A web crawler that leverages the `Playwright` browser automation library. The `PlaywrightCrawler` builds on top of the `BasicCrawler`, which means it inherits all of its features. @@ -76,7 +77,7 @@ def __init__( browser_launch_options: Mapping[str, Any] | None = None, browser_new_context_options: Mapping[str, Any] | None = None, headless: bool | None = None, - **kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext]], + **kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext, StatisticsState]], ) -> None: """A default constructor. 
diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler_statistics.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler_statistics.py index fb685b71a3..35081cddf3 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler_statistics.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler_statistics.py @@ -7,14 +7,9 @@ async def test_predictor_state_persistence() -> None: """Test that adaptive statistics can be correctly persisted and initialized from persisted values.""" async with AdaptivePlaywrightCrawlerStatistics(persistence_enabled=True) as adaptive_statistics: - adaptive_statistics.track_rendering_type_mispredictions() - adaptive_statistics.track_rendering_type_mispredictions() - - adaptive_statistics.track_http_only_request_handler_runs() - adaptive_statistics.track_http_only_request_handler_runs() - adaptive_statistics.track_http_only_request_handler_runs() - - adaptive_statistics.track_browser_request_handler_runs() + adaptive_statistics.state.browser_request_handler_runs = 1 + adaptive_statistics.state.rendering_type_mispredictions = 2 + adaptive_statistics.state.http_only_request_handler_runs = 3 persistence_state_key = adaptive_statistics._persist_state_key # adaptive_statistics are persisted after leaving the context @@ -25,6 +20,6 @@ async def test_predictor_state_persistence() -> None: ) as new_adaptive_statistics: pass + assert new_adaptive_statistics.state.browser_request_handler_runs == 1 assert new_adaptive_statistics.state.rendering_type_mispredictions == 2 assert new_adaptive_statistics.state.http_only_request_handler_runs == 3 - assert new_adaptive_statistics.state.browser_request_handler_runs == 1 From 1234ea71b158e316ba6dad660e65eb62bbe8f12c Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 16 Jan 2025 14:45:43 +0100 Subject: [PATCH 39/79] Mock requests in tests. 
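
This commit routes the tests' static HTTP traffic through respx instead of the real site. A minimal, self-contained sketch of that mocking pattern, assuming the `respx` package is installed (the URL and response body here are placeholders; the actual fixture below registers the warehouse-theme pages on the `respx_mock` pytest fixture):

    import httpx
    import respx

    @respx.mock
    def fetch_example() -> httpx.Response:
        # Register a canned 200 response so no real network request is made.
        respx.get('https://example.com/').return_value = httpx.Response(
            status_code=200, content=b'<html><body>What a body!</body></html>'
        )
        return httpx.get('https://example.com/')

    assert fetch_example().status_code == 200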
--- .../test_adaptive_playwright_crawler.py | 75 +++++++++++-------- 1 file changed, 42 insertions(+), 33 deletions(-) diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index 87ad7481eb..3f8daa1ced 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, cast from unittest.mock import Mock, patch +import httpx import pytest from typing_extensions import override @@ -30,6 +31,23 @@ if TYPE_CHECKING: from collections.abc import Iterator + import respx + + +@pytest.fixture +def mocked_urls(respx_mock: respx.MockRouter) -> list[str]: + """Mock example pages used in the test to reduce test time and avoid flakiness.""" + urls = [ + 'https://warehouse-theme-metal.myshopify.com/', + 'https://warehouse-theme-metal.myshopify.com/collections', + ] + for url in urls: + respx_mock.get(url).return_value = httpx.Response( + status_code=200, content=b'bla' + ) + + return urls + class _SimpleRenderingTypePredictor(RenderingTypePredictor): """Simplified predictor for tests.""" @@ -65,12 +83,9 @@ async def test_adaptive_crawling( expected_static_count: int, rendering_types: Iterator[RenderingType], detection_probability_recommendation: Iterator[int], + mocked_urls: list[str], ) -> None: """Tests correct routing to pre-nav hooks and correct handling through proper handler.""" - requests = [ - 'https://warehouse-theme-metal.myshopify.com/', - 'https://warehouse-theme-metal.myshopify.com/collections', - ] predictor = _SimpleRenderingTypePredictor( rendering_types=rendering_types, detection_probability_recommendation=detection_probability_recommendation @@ -108,7 +123,7 @@ async def pre_nav_hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None except AdaptiveContextError: static_hook_count += 1 - await crawler.run(requests) + await crawler.run(mocked_urls) assert pw_handler_count == expected_pw_count assert pw_hook_count == expected_pw_count @@ -117,13 +132,8 @@ async def pre_nav_hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None assert static_hook_count == expected_static_count -async def test_adaptive_crawling_parcel() -> None: +async def test_adaptive_crawling_parcel(mocked_urls: list[str]) -> None: """Top level test for parcel. Only one argument combination. 
(The rest of code is tested with bs variant.)""" - requests = [ - 'https://warehouse-theme-metal.myshopify.com/', - 'https://warehouse-theme-metal.myshopify.com/collections', - ] - predictor = _SimpleRenderingTypePredictor( rendering_types=cycle(['static', 'client only']), detection_probability_recommendation=cycle([0]) ) @@ -145,13 +155,13 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: except AdaptiveContextError: static_handler_count += 1 - await crawler.run(requests) + await crawler.run(mocked_urls) assert pw_handler_count == 1 assert static_handler_count == 1 -async def test_adaptive_crawling_pre_nav_change_to_context() -> None: +async def test_adaptive_crawling_pre_nav_change_to_context(mocked_urls: list[str]) -> None: """Tests that context can be modified in pre-navigation hooks.""" static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor() @@ -175,14 +185,14 @@ async def pre_nav_hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None except AdaptiveContextError: context.request.user_data['data'] = 'bs' - await crawler.run(['https://warehouse-theme-metal.myshopify.com/']) + await crawler.run(mocked_urls[:1]) # Check that pre nav hooks does not influence each other assert user_data_in_pre_nav_hook == [None, None] # Check that pre nav hooks can modify context assert user_data_in_handler == ['pw', 'bs'] -async def test_adaptive_crawling_result() -> None: +async def test_adaptive_crawling_result(mocked_urls: list[str]) -> None: """Tests that result only from one sub crawler is saved. Enforced rendering type detection to run both sub crawlers.""" @@ -190,7 +200,6 @@ async def test_adaptive_crawling_result() -> None: crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( rendering_type_predictor=static_only_predictor_enforce_detection ) - requests = ['https://warehouse-theme-metal.myshopify.com/'] @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: @@ -201,7 +210,7 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: except AdaptiveContextError: await context.push_data({'handler': 'bs'}) - await crawler.run(requests) + await crawler.run(mocked_urls[:1]) dataset = await crawler.get_dataset() items = [item async for item in dataset.iterate_items()] @@ -218,11 +227,14 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: ], ) async def test_adaptive_crawling_predictor_calls( - pw_saved_data: dict[str, str], static_saved_data: dict[str, str], expected_result_renderingl_type: RenderingType + pw_saved_data: dict[str, str], + static_saved_data: dict[str, str], + expected_result_renderingl_type: RenderingType, + mocked_urls: list[str], ) -> None: """Tests expected predictor calls. 
Same results.""" some_label = 'bla' - some_url = 'https://warehouse-theme-metal.myshopify.com/' + some_url = mocked_urls[0] static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor() requests = [Request.from_url(url=some_url, label=some_label)] crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( @@ -251,12 +263,11 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: mocked_store_result.assert_called_once_with(requests[0], expected_result_renderingl_type) -async def test_adaptive_crawling_result_use_state_isolation() -> None: +async def test_adaptive_crawling_result_use_state_isolation(mocked_urls: list[str]) -> None: """Tests that global state accessed through `use_state` is changed only by one sub crawler. Enforced rendering type detection to run both sub crawlers.""" static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor() - requests = ['https://warehouse-theme-metal.myshopify.com/'] crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( rendering_type_predictor=static_only_predictor_enforce_detection ) @@ -271,7 +282,7 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: request_handler_calls += 1 state['counter'] += 1 - await crawler.run(requests) + await crawler.run(mocked_urls[:1]) await store.persist_autosaved_values() @@ -281,13 +292,11 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: assert (await store.get_value(BasicCrawler._CRAWLEE_STATE_KEY))['counter'] == 1 -async def test_adaptive_crawling_statistics() -> None: +async def test_adaptive_crawling_statistics(mocked_urls: list[str]) -> None: """Test adaptive crawler statistics. Crawler set to static crawling, but due to result_checker returning False on static crawling result it will do browser crawling instead as well. This increments all three adaptive crawling related stats.""" - requests = ['https://warehouse-theme-metal.myshopify.com/'] - static_only_predictor_no_detection = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])) crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( @@ -299,7 +308,7 @@ async def test_adaptive_crawling_statistics() -> None: async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: pass - await crawler.run(requests) + await crawler.run(mocked_urls[:1]) assert crawler.statistics.state.http_only_request_handler_runs == 1 assert crawler.statistics.state.browser_request_handler_runs == 1 @@ -317,7 +326,9 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: pytest.param(True, id='Error in both sub crawlers'), ], ) -async def test_adaptive_crawler_exceptions_in_sub_crawlers(*, error_in_pw_crawler: bool) -> None: +async def test_adaptive_crawler_exceptions_in_sub_crawlers( + *, error_in_pw_crawler: bool, mocked_urls: list[str] +) -> None: """Test that correct results are commited when exceptions are raised in sub crawlers. Exception in bs sub crawler will be logged and pw sub crawler used instead. @@ -327,7 +338,6 @@ async def test_adaptive_crawler_exceptions_in_sub_crawlers(*, error_in_pw_crawle Exception in pw sub crawler will prevent any result from being commited. 
Even if `push_data` was called before the exception """ - requests = ['https://warehouse-theme-metal.myshopify.com/'] static_only_no_detection_predictor = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])) crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( @@ -348,7 +358,7 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: await context.push_data({'this': 'data should not be saved'}) raise RuntimeError('Some bs sub crawler related error') from None - await crawler.run(requests) + await crawler.run(mocked_urls[:1]) dataset = await crawler.get_dataset() stored_results = [item async for item in dataset.iterate_items()] @@ -387,15 +397,14 @@ def test_adaptive_playwright_crawler_statistics_in_init() -> None: assert crawler._statistics._log_interval == log_interval -async def test_adaptive_playwright_crawler_timeout_in_sub_crawler() -> None: +async def test_adaptive_playwright_crawler_timeout_in_sub_crawler(mocked_urls: list[str]) -> None: """Tests that timeout used by sub crawlers ensure that both have chance to run within top crawler timeout. Create situation where static sub crawler blocks(should timeout), such error should start browser sub crawler, which must be capable of running without top crawler handler timing out.""" - requests = ['https://warehouse-theme-metal.myshopify.com/'] static_only_predictor_no_detection = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])) - top_crawler_handler_timeout = timedelta(seconds=2) + top_crawler_handler_timeout = timedelta(seconds=3) crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( rendering_type_predictor=static_only_predictor_no_detection, @@ -416,7 +425,7 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: # Sleep for time obviously larger than top crawler timeout. await asyncio.sleep(top_crawler_handler_timeout.total_seconds() * 2) - await crawler.run(requests) + await crawler.run(mocked_urls[:1]) mocked_static_handler.assert_called_once_with() # Browser handler was capable of running despite static handler having sleep time larger than top handler timeout. From e85ec9f696088508d169d097e9a348125a9dffbf Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 17 Jan 2025 10:26:10 +0100 Subject: [PATCH 40/79] Improve error readability. Increase test timeout. 
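
The more readable timeout message in this commit comes from giving each orphan pipeline wrapper a `__str__` and interpolating it with the f-string `=!s` conversion. A small illustration of that formatting behaviour (the class and values are invented for the example, not part of the crawler):

    from dataclasses import dataclass


    @dataclass
    class _ExamplePipeline:
        def __str__(self) -> str:
            return 'Static context pipeline'


    subcrawler_pipeline = _ExamplePipeline()
    timeout = 2.0

    # `=` echoes the expression name, `!s` uses str() instead of repr(),
    # so the message names the pipeline rather than dumping a dataclass repr.
    print(f'{subcrawler_pipeline=!s} timed out after {timeout} seconds')
    # Prints: subcrawler_pipeline=Static context pipeline timed out after 2.0 seconds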
--- .../_adaptive_playwright_crawler.py | 10 +++- .../test_adaptive_playwright_crawler.py | 59 ++++++++----------- 2 files changed, 33 insertions(+), 36 deletions(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 7c5f5ec5ae..4673abccf7 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -106,6 +106,9 @@ async def from_pipeline_to_top_router(context: PlaywrightCrawlingContext) -> Non return self.pipeline(top_context, from_pipeline_to_top_router) + def __str__(self) -> str: + return 'Playwright context pipeline' + @dataclass class _OrphanStaticContextPipeline(Generic[TStaticCrawlingContext]): @@ -123,6 +126,9 @@ async def from_pipeline_to_top_router(context: TStaticCrawlingContext) -> None: return self.pipeline(top_context, from_pipeline_to_top_router) + def __str__(self) -> str: + return 'Static context pipeline' + class AdaptivePlaywrightCrawler( Generic[TStaticCrawlingContext, TStaticParseResult], @@ -309,7 +315,9 @@ async def get_input_state( await wait_for( lambda: subcrawler_pipeline.create_pipeline_call(context_linked_to_result), timeout=self._request_handler_timeout, - timeout_message=f'Sub crawler timed out after {self._request_handler_timeout.total_seconds()} seconds', + timeout_message=( + f'{subcrawler_pipeline=!s} timed out after {self._request_handler_timeout.total_seconds()}seconds' + ), logger=self._logger, ) return result diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index 3f8daa1ced..a40ba998ee 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -7,7 +7,6 @@ from typing import TYPE_CHECKING, cast from unittest.mock import Mock, patch -import httpx import pytest from typing_extensions import override @@ -31,22 +30,14 @@ if TYPE_CHECKING: from collections.abc import Iterator - import respx - @pytest.fixture -def mocked_urls(respx_mock: respx.MockRouter) -> list[str]: - """Mock example pages used in the test to reduce test time and avoid flakiness.""" - urls = [ +def test_urls() -> list[str]: + """Example pages used in the test to reduce test.""" + return [ 'https://warehouse-theme-metal.myshopify.com/', 'https://warehouse-theme-metal.myshopify.com/collections', ] - for url in urls: - respx_mock.get(url).return_value = httpx.Response( - status_code=200, content=b'bla' - ) - - return urls class _SimpleRenderingTypePredictor(RenderingTypePredictor): @@ -83,7 +74,7 @@ async def test_adaptive_crawling( expected_static_count: int, rendering_types: Iterator[RenderingType], detection_probability_recommendation: Iterator[int], - mocked_urls: list[str], + test_urls: list[str], ) -> None: """Tests correct routing to pre-nav hooks and correct handling through proper handler.""" @@ -123,7 +114,7 @@ async def pre_nav_hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None except AdaptiveContextError: static_hook_count += 1 - await crawler.run(mocked_urls) + await crawler.run(test_urls) assert pw_handler_count == expected_pw_count assert pw_hook_count == expected_pw_count @@ -132,7 +123,7 @@ async def pre_nav_hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None assert static_hook_count == 
expected_static_count -async def test_adaptive_crawling_parcel(mocked_urls: list[str]) -> None: +async def test_adaptive_crawling_parcel(test_urls: list[str]) -> None: """Top level test for parcel. Only one argument combination. (The rest of code is tested with bs variant.)""" predictor = _SimpleRenderingTypePredictor( rendering_types=cycle(['static', 'client only']), detection_probability_recommendation=cycle([0]) @@ -155,13 +146,13 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: except AdaptiveContextError: static_handler_count += 1 - await crawler.run(mocked_urls) + await crawler.run(test_urls) assert pw_handler_count == 1 assert static_handler_count == 1 -async def test_adaptive_crawling_pre_nav_change_to_context(mocked_urls: list[str]) -> None: +async def test_adaptive_crawling_pre_nav_change_to_context(test_urls: list[str]) -> None: """Tests that context can be modified in pre-navigation hooks.""" static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor() @@ -185,14 +176,14 @@ async def pre_nav_hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None except AdaptiveContextError: context.request.user_data['data'] = 'bs' - await crawler.run(mocked_urls[:1]) + await crawler.run(test_urls[:1]) # Check that pre nav hooks does not influence each other assert user_data_in_pre_nav_hook == [None, None] # Check that pre nav hooks can modify context assert user_data_in_handler == ['pw', 'bs'] -async def test_adaptive_crawling_result(mocked_urls: list[str]) -> None: +async def test_adaptive_crawling_result(test_urls: list[str]) -> None: """Tests that result only from one sub crawler is saved. Enforced rendering type detection to run both sub crawlers.""" @@ -210,7 +201,7 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: except AdaptiveContextError: await context.push_data({'handler': 'bs'}) - await crawler.run(mocked_urls[:1]) + await crawler.run(test_urls[:1]) dataset = await crawler.get_dataset() items = [item async for item in dataset.iterate_items()] @@ -230,11 +221,11 @@ async def test_adaptive_crawling_predictor_calls( pw_saved_data: dict[str, str], static_saved_data: dict[str, str], expected_result_renderingl_type: RenderingType, - mocked_urls: list[str], + test_urls: list[str], ) -> None: """Tests expected predictor calls. Same results.""" some_label = 'bla' - some_url = mocked_urls[0] + some_url = test_urls[0] static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor() requests = [Request.from_url(url=some_url, label=some_label)] crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( @@ -263,7 +254,7 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: mocked_store_result.assert_called_once_with(requests[0], expected_result_renderingl_type) -async def test_adaptive_crawling_result_use_state_isolation(mocked_urls: list[str]) -> None: +async def test_adaptive_crawling_result_use_state_isolation(test_urls: list[str]) -> None: """Tests that global state accessed through `use_state` is changed only by one sub crawler. 
Enforced rendering type detection to run both sub crawlers.""" @@ -282,7 +273,7 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: request_handler_calls += 1 state['counter'] += 1 - await crawler.run(mocked_urls[:1]) + await crawler.run(test_urls[:1]) await store.persist_autosaved_values() @@ -292,7 +283,7 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: assert (await store.get_value(BasicCrawler._CRAWLEE_STATE_KEY))['counter'] == 1 -async def test_adaptive_crawling_statistics(mocked_urls: list[str]) -> None: +async def test_adaptive_crawling_statistics(test_urls: list[str]) -> None: """Test adaptive crawler statistics. Crawler set to static crawling, but due to result_checker returning False on static crawling result it @@ -308,7 +299,7 @@ async def test_adaptive_crawling_statistics(mocked_urls: list[str]) -> None: async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: pass - await crawler.run(mocked_urls[:1]) + await crawler.run(test_urls[:1]) assert crawler.statistics.state.http_only_request_handler_runs == 1 assert crawler.statistics.state.browser_request_handler_runs == 1 @@ -326,9 +317,7 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: pytest.param(True, id='Error in both sub crawlers'), ], ) -async def test_adaptive_crawler_exceptions_in_sub_crawlers( - *, error_in_pw_crawler: bool, mocked_urls: list[str] -) -> None: +async def test_adaptive_crawler_exceptions_in_sub_crawlers(*, error_in_pw_crawler: bool, test_urls: list[str]) -> None: """Test that correct results are commited when exceptions are raised in sub crawlers. Exception in bs sub crawler will be logged and pw sub crawler used instead. @@ -358,7 +347,7 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: await context.push_data({'this': 'data should not be saved'}) raise RuntimeError('Some bs sub crawler related error') from None - await crawler.run(mocked_urls[:1]) + await crawler.run(test_urls[:1]) dataset = await crawler.get_dataset() stored_results = [item async for item in dataset.iterate_items()] @@ -397,19 +386,19 @@ def test_adaptive_playwright_crawler_statistics_in_init() -> None: assert crawler._statistics._log_interval == log_interval -async def test_adaptive_playwright_crawler_timeout_in_sub_crawler(mocked_urls: list[str]) -> None: +async def test_adaptive_playwright_crawler_timeout_in_sub_crawler(test_urls: list[str]) -> None: """Tests that timeout used by sub crawlers ensure that both have chance to run within top crawler timeout. Create situation where static sub crawler blocks(should timeout), such error should start browser sub crawler, which must be capable of running without top crawler handler timing out.""" static_only_predictor_no_detection = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])) - top_crawler_handler_timeout = timedelta(seconds=3) + request_handler_timeout = timedelta(seconds=5) crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( rendering_type_predictor=static_only_predictor_no_detection, result_checker=lambda result: False, # noqa: ARG005 # Intentionally unused argument. 
- request_handler_timeout=top_crawler_handler_timeout, + request_handler_timeout=request_handler_timeout, ) mocked_static_handler = Mock() mocked_browser_handler = Mock() @@ -423,9 +412,9 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: except AdaptiveContextError: mocked_static_handler() # Sleep for time obviously larger than top crawler timeout. - await asyncio.sleep(top_crawler_handler_timeout.total_seconds() * 2) + await asyncio.sleep(request_handler_timeout.total_seconds() * 2) - await crawler.run(mocked_urls[:1]) + await crawler.run(test_urls[:1]) mocked_static_handler.assert_called_once_with() # Browser handler was capable of running despite static handler having sleep time larger than top handler timeout. From 6e8635ae79e477f247756d32df690ce04ded99b9 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 17 Jan 2025 11:26:05 +0100 Subject: [PATCH 41/79] Mock both static and browser requests in tests. --- .../test_adaptive_playwright_crawler.py | 71 +++++++++++++++---- 1 file changed, 59 insertions(+), 12 deletions(-) diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index a40ba998ee..b6a45c4051 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -7,10 +7,12 @@ from typing import TYPE_CHECKING, cast from unittest.mock import Mock, patch +import httpx import pytest from typing_extensions import override from crawlee import Request +from crawlee.browsers import BrowserPool from crawlee.crawlers import BasicCrawler from crawlee.crawlers._adaptive_playwright import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler_statistics import ( @@ -30,15 +32,47 @@ if TYPE_CHECKING: from collections.abc import Iterator + import respx + + from crawlee.browsers._base_browser_plugin import BaseBrowserPlugin + from crawlee.browsers._types import CrawleePage + from crawlee.proxy_configuration import ProxyInfo + @pytest.fixture -def test_urls() -> list[str]: - """Example pages used in the test to reduce test.""" - return [ +def test_urls(respx_mock: respx.MockRouter) -> list[str]: + """Example pages used in the test to reduce test. Mock for static requests.""" + urls = [ 'https://warehouse-theme-metal.myshopify.com/', 'https://warehouse-theme-metal.myshopify.com/collections', ] + for url in urls: + respx_mock.get(url).return_value = httpx.Response( + status_code=200, content=b'What a body!' + ) + return urls + + +class _StaticRedirectBrowserPool(BrowserPool): + """BrowserPool for redirecting browser requests to static content.""" + + async def new_page( + self, + *, + page_id: str | None = None, + browser_plugin: BaseBrowserPlugin | None = None, + proxy_info: ProxyInfo | None = None, + ) -> CrawleePage: + crawlee_page = await super().new_page(page_id=page_id, browser_plugin=browser_plugin, proxy_info=proxy_info) + await crawlee_page.page.route( + '**/*', + lambda route: route.fulfill( + status=200, content_type='text/plain', body='What a body!' 
+ ), + ) + return crawlee_page + class _SimpleRenderingTypePredictor(RenderingTypePredictor): """Simplified predictor for tests.""" @@ -82,7 +116,10 @@ async def test_adaptive_crawling( rendering_types=rendering_types, detection_probability_recommendation=detection_probability_recommendation ) - crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(rendering_type_predictor=predictor) + crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( + rendering_type_predictor=predictor, + playwright_crawler_specific_kwargs={'browser_pool': _StaticRedirectBrowserPool.with_default_plugin()}, + ) pw_handler_count = 0 static_handler_count = 0 @@ -129,7 +166,10 @@ async def test_adaptive_crawling_parcel(test_urls: list[str]) -> None: rendering_types=cycle(['static', 'client only']), detection_probability_recommendation=cycle([0]) ) - crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(rendering_type_predictor=predictor) + crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser( + rendering_type_predictor=predictor, + playwright_crawler_specific_kwargs={'browser_pool': _StaticRedirectBrowserPool.with_default_plugin()}, + ) pw_handler_count = 0 static_handler_count = 0 @@ -157,7 +197,8 @@ async def test_adaptive_crawling_pre_nav_change_to_context(test_urls: list[str]) static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor() crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( - rendering_type_predictor=static_only_predictor_enforce_detection + rendering_type_predictor=static_only_predictor_enforce_detection, + playwright_crawler_specific_kwargs={'browser_pool': _StaticRedirectBrowserPool.with_default_plugin()}, ) user_data_in_pre_nav_hook = [] user_data_in_handler = [] @@ -189,7 +230,8 @@ async def test_adaptive_crawling_result(test_urls: list[str]) -> None: Enforced rendering type detection to run both sub crawlers.""" static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor() crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( - rendering_type_predictor=static_only_predictor_enforce_detection + rendering_type_predictor=static_only_predictor_enforce_detection, + playwright_crawler_specific_kwargs={'browser_pool': _StaticRedirectBrowserPool.with_default_plugin()}, ) @crawler.router.default_handler @@ -229,7 +271,8 @@ async def test_adaptive_crawling_predictor_calls( static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor() requests = [Request.from_url(url=some_url, label=some_label)] crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( - rendering_type_predictor=static_only_predictor_enforce_detection + rendering_type_predictor=static_only_predictor_enforce_detection, + playwright_crawler_specific_kwargs={'browser_pool': _StaticRedirectBrowserPool.with_default_plugin()}, ) @crawler.router.default_handler @@ -260,7 +303,8 @@ async def test_adaptive_crawling_result_use_state_isolation(test_urls: list[str] Enforced rendering type detection to run both sub crawlers.""" static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor() crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( - rendering_type_predictor=static_only_predictor_enforce_detection + rendering_type_predictor=static_only_predictor_enforce_detection, + playwright_crawler_specific_kwargs={'browser_pool': _StaticRedirectBrowserPool.with_default_plugin()}, ) store = await crawler.get_key_value_store() await store.set_value(BasicCrawler._CRAWLEE_STATE_KEY, {'counter': 0}) @@ 
-293,6 +337,7 @@ async def test_adaptive_crawling_statistics(test_urls: list[str]) -> None: crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( rendering_type_predictor=static_only_predictor_no_detection, result_checker=lambda result: False, # noqa: ARG005 # Intentionally unused argument. + playwright_crawler_specific_kwargs={'browser_pool': _StaticRedirectBrowserPool.with_default_plugin()}, ) @crawler.router.default_handler @@ -330,7 +375,8 @@ async def test_adaptive_crawler_exceptions_in_sub_crawlers(*, error_in_pw_crawle static_only_no_detection_predictor = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])) crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( - rendering_type_predictor=static_only_no_detection_predictor + rendering_type_predictor=static_only_no_detection_predictor, + playwright_crawler_specific_kwargs={'browser_pool': _StaticRedirectBrowserPool.with_default_plugin()}, ) saved_data = {'some': 'data'} @@ -389,16 +435,17 @@ def test_adaptive_playwright_crawler_statistics_in_init() -> None: async def test_adaptive_playwright_crawler_timeout_in_sub_crawler(test_urls: list[str]) -> None: """Tests that timeout used by sub crawlers ensure that both have chance to run within top crawler timeout. - Create situation where static sub crawler blocks(should timeout), such error should start browser sub + Create situation where static sub crawler blocks(should time out), such error should start browser sub crawler, which must be capable of running without top crawler handler timing out.""" static_only_predictor_no_detection = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])) - request_handler_timeout = timedelta(seconds=5) + request_handler_timeout = timedelta(seconds=2) crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( rendering_type_predictor=static_only_predictor_no_detection, result_checker=lambda result: False, # noqa: ARG005 # Intentionally unused argument. request_handler_timeout=request_handler_timeout, + playwright_crawler_specific_kwargs={'browser_pool': _StaticRedirectBrowserPool.with_default_plugin()}, ) mocked_static_handler = Mock() mocked_browser_handler = Mock() From 0e9146a434740eb383cac0e3a4e1979c91e34ca5 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 17 Jan 2025 12:26:45 +0100 Subject: [PATCH 42/79] Create proper example code (Not yet used in docs. 
Will be part of future PRs)
---
 .../code/adaptive_playwright_crawler.py       | 55 +++++++++++++++++++
 .../crawlers/_adaptive_playwright/_example.py | 51 -----------------
 .../test_adaptive_playwright_crawler.py       |  2 +-
 3 files changed, 56 insertions(+), 52 deletions(-)
 create mode 100644 docs/examples/code/adaptive_playwright_crawler.py
 delete mode 100644 src/crawlee/crawlers/_adaptive_playwright/_example.py

diff --git a/docs/examples/code/adaptive_playwright_crawler.py b/docs/examples/code/adaptive_playwright_crawler.py
new file mode 100644
index 0000000000..43314ab59e
--- /dev/null
+++ b/docs/examples/code/adaptive_playwright_crawler.py
@@ -0,0 +1,55 @@
+import asyncio
+
+from playwright.async_api import Route
+
+from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler import AdaptivePlaywrightCrawler
+from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import (
+    AdaptiveContextError,
+    AdaptivePlaywrightCrawlingContext,
+    AdaptivePlaywrightPreNavCrawlingContext,
+)
+
+
+async def main() -> None:
+    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
+        max_requests_per_crawl=5, playwright_crawler_specific_kwargs={'headless': False}
+    )
+
+    @crawler.router.default_handler
+    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
+        # Code that will be executed in both crawl types
+        context.log.info(f'User handler processing: {context.request.url} ...')
+
+        try:
+            some_locator = context.page.locator('div').first
+            # Code that will be executed only in Playwright crawl.
+            # Trying to access `context.page` in static crawl will throw `AdaptiveContextError`.
+
+            await some_locator.wait_for()
+            # Do stuff with locator...
+            context.log.info(f'Playwright processing of: {context.request.url} ...')
+        except AdaptiveContextError:
+            # Code that will be executed only in static crawl
+            context.log.info(f'Static processing of: {context.request.url} ...')
+
+        # Find more links and enqueue them.
+        await context.enqueue_links()
+        await context.push_data({'Top crawler Url': context.request.url})
+
+    @crawler.pre_navigation_hook
+    async def hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
+        async def some_routing_function(route: Route) -> None:
+            await route.continue_()
+
+        try:
+            await context.page.route('*/**', some_routing_function)
+            context.log.info(f'Playwright pre navigation hook for: {context.request.url} ...')
+        except AdaptiveContextError:
+            context.log.info(f'Static pre navigation hook for: {context.request.url} ...')
+
+    # Run the crawler with the initial list of URLs.
+    await crawler.run(['https://warehouse-theme-metal.myshopify.com/'])
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
diff --git a/src/crawlee/crawlers/_adaptive_playwright/_example.py b/src/crawlee/crawlers/_adaptive_playwright/_example.py
deleted file mode 100644
index c3706df868..0000000000
--- a/src/crawlee/crawlers/_adaptive_playwright/_example.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import asyncio
-import logging
-from logging import getLogger
-
-from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler import AdaptivePlaywrightCrawler
-from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import (
-    AdaptiveContextError,
-    AdaptivePlaywrightCrawlingContext,
-    AdaptivePlaywrightPreNavCrawlingContext,
-)
-
-
-async def main() -> None:
-    # remove in review. Move this to documentation examples instead.
- top_logger = getLogger(__name__) - top_logger.setLevel(logging.DEBUG) - i = 0 - - crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( - max_requests_per_crawl=10, _logger=top_logger, playwright_crawler_specific_kwargs={'headless': False} - ) - """ - - crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser( - max_requests_per_crawl=10, _logger=top_logger, playwright_crawler_specific_kwargs={'headless': False} - ) - """ - - @crawler.router.default_handler - async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: - nonlocal i - i = i + 1 - context.log.info(f'Processing with Top adaptive_crawler: {context.request.url} ...') - await context.enqueue_links() - await context.push_data({'Top crwaler Url': context.request.url}) - await context.use_state({'bla': i}) - - @crawler.pre_navigation_hook - async def hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: - try: - context.page # noqa:B018 Intentionally "useless expression". Can trigger exception. - context.log.info(f'PW pre navigation hook for: {context.request.url} ...') - except AdaptiveContextError: - context.log.info(f'BS pre navigation hook for: {context.request.url} ...') - - # Run the crawler with the initial list of URLs. - await crawler.run(['https://warehouse-theme-metal.myshopify.com/']) - - -if __name__ == '__main__': - asyncio.run(main()) diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index b6a45c4051..860d1e59ba 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -41,7 +41,7 @@ @pytest.fixture def test_urls(respx_mock: respx.MockRouter) -> list[str]: - """Example pages used in the test to reduce test. Mock for static requests.""" + """Example pages used in the test are mocked for static requests.""" urls = [ 'https://warehouse-theme-metal.myshopify.com/', 'https://warehouse-theme-metal.myshopify.com/collections', From 44ad898b27a76cb8b66c1c752a40d5f64cb79c67 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 20 Jan 2025 08:03:18 +0100 Subject: [PATCH 43/79] Relax timeout in test to avoid flakiness in CI --- .../test_adaptive_playwright_crawler.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index 860d1e59ba..db04a7aa82 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -433,15 +433,16 @@ def test_adaptive_playwright_crawler_statistics_in_init() -> None: async def test_adaptive_playwright_crawler_timeout_in_sub_crawler(test_urls: list[str]) -> None: - """Tests that timeout used by sub crawlers ensure that both have chance to run within top crawler timeout. + """Tests that timeout in static sub crawler forces fall back to browser sub crawler. 
Create situation where static sub crawler blocks(should time out), such error should start browser sub - crawler, which must be capable of running without top crawler handler timing out.""" + crawler.""" static_only_predictor_no_detection = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])) - request_handler_timeout = timedelta(seconds=2) + request_handler_timeout = timedelta(seconds=0.1) crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( + max_request_retries=1, rendering_type_predictor=static_only_predictor_no_detection, result_checker=lambda result: False, # noqa: ARG005 # Intentionally unused argument. request_handler_timeout=request_handler_timeout, @@ -458,6 +459,8 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: mocked_browser_handler() except AdaptiveContextError: mocked_static_handler() + # Relax timeout for the fallback browser request to avoid flakiness in test + crawler._request_handler_timeout = timedelta(seconds=5) # Sleep for time obviously larger than top crawler timeout. await asyncio.sleep(request_handler_timeout.total_seconds() * 2) From f219453f258c027620d30e296f8cd5aefe873b98 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 21 Jan 2025 09:51:13 +0100 Subject: [PATCH 44/79] Remove AdaptivePlaywrightCrawlerStatistics Replace it by new helper factory-like method on statistics class. --- .../_adaptive_playwright_crawler.py | 10 ++-- ..._adaptive_playwright_crawler_statistics.py | 49 +------------------ src/crawlee/statistics/_statistics.py | 16 +++++- .../test_adaptive_playwright_crawler.py | 5 +- ..._adaptive_playwright_crawler_statistics.py | 13 +++-- 5 files changed, 33 insertions(+), 60 deletions(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 4673abccf7..a9ac59e897 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -29,7 +29,6 @@ PlaywrightPreNavCrawlingContext, ) from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler_statistics import ( - AdaptivePlaywrightCrawlerStatistics, AdaptivePlaywrightCrawlerStatisticState, ) from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import ( @@ -241,9 +240,9 @@ def with_beautifulsoup_static_parser( ) -> AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[BeautifulSoup], BeautifulSoup]: """Creates `AdaptivePlaywrightCrawler` that uses `BeautifulSoup` for parsing static content.""" if statistics is not None: - adaptive_statistics = AdaptivePlaywrightCrawlerStatistics.from_statistics(statistics=statistics) + adaptive_statistics = statistics.replace_state_model(AdaptivePlaywrightCrawlerStatisticState) else: - adaptive_statistics = AdaptivePlaywrightCrawlerStatistics() + adaptive_statistics = Statistics(state_model=AdaptivePlaywrightCrawlerStatisticState) return AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[BeautifulSoup], BeautifulSoup]( rendering_type_predictor=rendering_type_predictor, result_checker=result_checker, @@ -266,10 +265,11 @@ def with_parsel_static_parser( **kwargs: Unpack[_BasicCrawlerOptions], ) -> AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[Selector], Selector]: """Creates `AdaptivePlaywrightCrawler` that uses `Parcel` for parsing static content.""" + """Creates `AdaptivePlaywrightCrawler` that uses `BeautifulSoup` for parsing static 
content.""" if statistics is not None: - adaptive_statistics = AdaptivePlaywrightCrawlerStatistics.from_statistics(statistics=statistics) + adaptive_statistics = statistics.replace_state_model(AdaptivePlaywrightCrawlerStatisticState) else: - adaptive_statistics = AdaptivePlaywrightCrawlerStatistics() + adaptive_statistics = Statistics(state_model=AdaptivePlaywrightCrawlerStatisticState) return AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[Selector], Selector]( rendering_type_predictor=rendering_type_predictor, result_checker=result_checker, diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py index 5045102022..8564e273dc 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py @@ -1,19 +1,11 @@ from __future__ import annotations -from datetime import timedelta -from typing import TYPE_CHECKING, Annotated +from typing import Annotated from pydantic import ConfigDict, Field from crawlee._utils.docs import docs_group -from crawlee.statistics import Statistics, StatisticsState - -if TYPE_CHECKING: - from logging import Logger - - from typing_extensions import Self - - from crawlee.storages import KeyValueStore +from crawlee.statistics import StatisticsState @docs_group('Data structures') @@ -23,40 +15,3 @@ class AdaptivePlaywrightCrawlerStatisticState(StatisticsState): http_only_request_handler_runs: Annotated[int, Field(alias='http_only_request_handler_runs')] = 0 browser_request_handler_runs: Annotated[int, Field(alias='browser_request_handler_runs')] = 0 rendering_type_mispredictions: Annotated[int, Field(alias='rendering_type_mispredictions')] = 0 - - -@docs_group('Classes') -class AdaptivePlaywrightCrawlerStatistics(Statistics[AdaptivePlaywrightCrawlerStatisticState]): - def __init__( - self, - *, - persistence_enabled: bool = False, - persist_state_kvs_name: str = 'default', - persist_state_key: str | None = None, - key_value_store: KeyValueStore | None = None, - log_message: str = 'Statistics', - periodic_message_logger: Logger | None = None, - log_interval: timedelta = timedelta(minutes=1), - ) -> None: - super().__init__( - persistence_enabled=persistence_enabled, - persist_state_kvs_name=persist_state_kvs_name, - persist_state_key=persist_state_key, - key_value_store=key_value_store, - log_message=log_message, - periodic_message_logger=periodic_message_logger, - log_interval=log_interval, - state_model=AdaptivePlaywrightCrawlerStatisticState, - ) - - @classmethod - def from_statistics(cls, statistics: Statistics) -> Self: - return cls( - persistence_enabled=statistics._persistence_enabled, # noqa:SLF001 # Accessing private member to create copy like-object. - persist_state_kvs_name=statistics._persist_state_kvs_name, # noqa:SLF001 # Accessing private member to create copy like-object. - persist_state_key=statistics._persist_state_key, # noqa:SLF001 # Accessing private member to create copy like-object. - key_value_store=statistics._key_value_store, # noqa:SLF001 # Accessing private member to create copy like-object. - log_message=statistics._log_message, # noqa:SLF001 # Accessing private member to create copy like-object. - periodic_message_logger=statistics._periodic_message_logger, # noqa:SLF001 # Accessing private member to create copy like-object. 
- log_interval=statistics._log_interval, # noqa:SLF001 # Accessing private member to create copy like-object. - ) diff --git a/src/crawlee/statistics/_statistics.py b/src/crawlee/statistics/_statistics.py index fa6585fac0..14b0eff771 100644 --- a/src/crawlee/statistics/_statistics.py +++ b/src/crawlee/statistics/_statistics.py @@ -21,6 +21,7 @@ from types import TracebackType TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState) +TNewStatisticsState = TypeVar('TNewStatisticsState', bound=StatisticsState, default=StatisticsState) logger = getLogger(__name__) @@ -99,12 +100,25 @@ def __init__( self._log_message = log_message self._periodic_message_logger = periodic_message_logger or logger - self._log_interval = log_interval self._periodic_logger = RecurringTask(self._log, log_interval) # Flag to indicate the context state. self._active = False + def replace_state_model(self, state_model: type[TNewStatisticsState]) -> Statistics[TNewStatisticsState]: + """Method that produces near copy of the Statistics with replaced state model.""" + new_statistics: Statistics[TNewStatisticsState] = Statistics( + persistence_enabled=self._persistence_enabled, + persist_state_kvs_name=self._persist_state_kvs_name, + persist_state_key=self._persist_state_key, + key_value_store=self._key_value_store, + log_message=self._log_message, + periodic_message_logger=self._periodic_message_logger, + state_model=state_model, + ) + new_statistics._periodic_logger = self._periodic_logger # noqa:SLF001 # Accessing private member to create copy like-object. + return new_statistics + @staticmethod def with_default_state( *, diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index db04a7aa82..b91d898218 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -16,7 +16,7 @@ from crawlee.crawlers import BasicCrawler from crawlee.crawlers._adaptive_playwright import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler_statistics import ( - AdaptivePlaywrightCrawlerStatistics, + AdaptivePlaywrightCrawlerStatisticState, ) from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import ( AdaptiveContextError, @@ -423,13 +423,12 @@ def test_adaptive_playwright_crawler_statistics_in_init() -> None: crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(statistics=statistics) - assert type(crawler._statistics) is AdaptivePlaywrightCrawlerStatistics + assert type(crawler._statistics.state) is AdaptivePlaywrightCrawlerStatisticState assert crawler._statistics._persistence_enabled == persistence_enabled assert crawler._statistics._persist_state_kvs_name == persist_state_kvs_name assert crawler._statistics._persist_state_key == persist_state_key assert crawler._statistics._log_message == log_message assert crawler._statistics._periodic_message_logger == periodic_message_logger - assert crawler._statistics._log_interval == log_interval async def test_adaptive_playwright_crawler_timeout_in_sub_crawler(test_urls: list[str]) -> None: diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler_statistics.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler_statistics.py index 35081cddf3..435e374ae3 
100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler_statistics.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler_statistics.py @@ -1,12 +1,15 @@ from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler_statistics import ( - AdaptivePlaywrightCrawlerStatistics, + AdaptivePlaywrightCrawlerStatisticState, ) +from crawlee.statistics import Statistics async def test_predictor_state_persistence() -> None: """Test that adaptive statistics can be correctly persisted and initialized from persisted values.""" - async with AdaptivePlaywrightCrawlerStatistics(persistence_enabled=True) as adaptive_statistics: + async with Statistics( + state_model=AdaptivePlaywrightCrawlerStatisticState, persistence_enabled=True + ) as adaptive_statistics: adaptive_statistics.state.browser_request_handler_runs = 1 adaptive_statistics.state.rendering_type_mispredictions = 2 adaptive_statistics.state.http_only_request_handler_runs = 3 @@ -15,8 +18,10 @@ async def test_predictor_state_persistence() -> None: # adaptive_statistics are persisted after leaving the context # new_adaptive_statistics are initialized from the persisted values. - async with AdaptivePlaywrightCrawlerStatistics( - persistence_enabled=True, persist_state_key=persistence_state_key + async with Statistics( + state_model=AdaptivePlaywrightCrawlerStatisticState, + persistence_enabled=True, + persist_state_key=persistence_state_key, ) as new_adaptive_statistics: pass From 64d9e5464912c4279261c9f0d7418cffde34b05f Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 21 Jan 2025 10:35:46 +0100 Subject: [PATCH 45/79] WIP --- src/crawlee/statistics/_statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/crawlee/statistics/_statistics.py b/src/crawlee/statistics/_statistics.py index 14b0eff771..38042073ef 100644 --- a/src/crawlee/statistics/_statistics.py +++ b/src/crawlee/statistics/_statistics.py @@ -106,7 +106,7 @@ def __init__( self._active = False def replace_state_model(self, state_model: type[TNewStatisticsState]) -> Statistics[TNewStatisticsState]: - """Method that produces near copy of the Statistics with replaced state model.""" + """Create near copy of the `Statistics` with replaced `state_model`.""" new_statistics: Statistics[TNewStatisticsState] = Statistics( persistence_enabled=self._persistence_enabled, persist_state_kvs_name=self._persist_state_kvs_name, From 08fc81fc5a103b81c57865f00dfbb56e4e15bd69 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 21 Jan 2025 13:56:05 +0100 Subject: [PATCH 46/79] Update options typed dicts Do not expose the _ starting options --- .../_abstract_http/_abstract_http_crawler.py | 5 +-- .../_adaptive_playwright_crawler.py | 21 +++++---- src/crawlee/crawlers/_basic/_basic_crawler.py | 6 +-- .../_playwright/_playwright_crawler.py | 43 ++++++++++++------- 4 files changed, 42 insertions(+), 33 deletions(-) diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py index 24926e9a1a..5f821f29c2 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py @@ -31,8 +31,7 @@ TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState) -@docs_group('Data structures') -class _HttpCrawlerOptions(Generic[TCrawlingContext], TypedDict): +class _HttpCrawlerAdditionalOptions(TypedDict): 
additional_http_error_status_codes: NotRequired[Iterable[int]] """Additional HTTP status codes to treat as errors, triggering automatic retries when encountered.""" @@ -43,7 +42,7 @@ class _HttpCrawlerOptions(Generic[TCrawlingContext], TypedDict): @docs_group('Data structures') class HttpCrawlerOptions( Generic[TCrawlingContext, TStatisticsState], - _HttpCrawlerOptions, + _HttpCrawlerAdditionalOptions, BasicCrawlerOptions[TCrawlingContext, StatisticsState], ): """Arguments for the `AbstractHttpCrawler` constructor. diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index a9ac59e897..24a58bc89b 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -19,11 +19,9 @@ AbstractHttpCrawler, AbstractHttpParser, BasicCrawler, - BeautifulSoupCrawlingContext, BeautifulSoupParserType, ContextPipeline, ParsedHttpCrawlingContext, - ParselCrawlingContext, PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext, @@ -56,9 +54,11 @@ from typing_extensions import Unpack - from crawlee.crawlers._abstract_http._abstract_http_crawler import _HttpCrawlerOptions + from crawlee.crawlers._abstract_http._abstract_http_crawler import ( + _HttpCrawlerAdditionalOptions, + ) from crawlee.crawlers._basic._basic_crawler import _BasicCrawlerOptions - from crawlee.crawlers._playwright._playwright_crawler import PlaywrightCrawlerAdditionalOptions + from crawlee.crawlers._playwright._playwright_crawler import _PlaywrightCrawlerAdditionalOptions from crawlee.router import Router @@ -148,8 +148,8 @@ def __init__( rendering_type_predictor: RenderingTypePredictor | None = None, result_checker: Callable[[RequestHandlerRunResult], bool] | None = None, result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None, - static_crawler_specific_kwargs: _HttpCrawlerOptions | None = None, - playwright_crawler_specific_kwargs: PlaywrightCrawlerAdditionalOptions | None = None, + static_crawler_specific_kwargs: _HttpCrawlerAdditionalOptions | None = None, + playwright_crawler_specific_kwargs: _PlaywrightCrawlerAdditionalOptions | None = None, statistics: Statistics[AdaptivePlaywrightCrawlerStatisticState] | None = None, **kwargs: Unpack[_BasicCrawlerOptions], ) -> None: @@ -233,8 +233,8 @@ def with_beautifulsoup_static_parser( result_checker: Callable[[RequestHandlerRunResult], bool] | None = None, result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None, parser_type: BeautifulSoupParserType = 'lxml', - static_crawler_specific_kwargs: _HttpCrawlerOptions[BeautifulSoupCrawlingContext] | None = None, - playwright_crawler_specific_kwargs: PlaywrightCrawlerAdditionalOptions | None = None, + static_crawler_specific_kwargs: _HttpCrawlerAdditionalOptions | None = None, + playwright_crawler_specific_kwargs: _PlaywrightCrawlerAdditionalOptions | None = None, statistics: Statistics[StatisticsState] | None = None, **kwargs: Unpack[_BasicCrawlerOptions], ) -> AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[BeautifulSoup], BeautifulSoup]: @@ -259,13 +259,12 @@ def with_parsel_static_parser( rendering_type_predictor: RenderingTypePredictor | None = None, result_checker: Callable[[RequestHandlerRunResult], bool] | None = None, result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None, - 
static_crawler_specific_kwargs: _HttpCrawlerOptions[ParselCrawlingContext] | None = None, - playwright_crawler_specific_kwargs: PlaywrightCrawlerAdditionalOptions | None = None, + static_crawler_specific_kwargs: _HttpCrawlerAdditionalOptions | None = None, + playwright_crawler_specific_kwargs: _PlaywrightCrawlerAdditionalOptions | None = None, statistics: Statistics[StatisticsState] | None = None, **kwargs: Unpack[_BasicCrawlerOptions], ) -> AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[Selector], Selector]: """Creates `AdaptivePlaywrightCrawler` that uses `Parcel` for parsing static content.""" - """Creates `AdaptivePlaywrightCrawler` that uses `BeautifulSoup` for parsing static content.""" if statistics is not None: adaptive_statistics = statistics.replace_state_model(AdaptivePlaywrightCrawlerStatisticState) else: diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 62e66ba43e..e4b2922dc5 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -70,9 +70,8 @@ FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]] -@docs_group('Data structures') class _BasicCrawlerOptions(TypedDict): - """Non-generic options for basic crawler.""" + """Non-generic options the `BasicCrawler` constructor.""" configuration: NotRequired[Configuration] """The `Configuration` instance. Some of its properties are used as defaults for the crawler.""" @@ -141,9 +140,8 @@ class _BasicCrawlerOptions(TypedDict): subclasses rather than direct instantiation of `BasicCrawler`.""" -@docs_group('Data structures') class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], TypedDict): - """Generic options for basic crawler.""" + """Generic options the `BasicCrawler` constructor.""" request_handler: NotRequired[Callable[[TCrawlingContext], Awaitable[None]]] """A callable responsible for handling requests.""" diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index 5d19d41f6e..e384d929c8 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -1,10 +1,10 @@ from __future__ import annotations import logging -from typing import TYPE_CHECKING, Any, Callable +from typing import TYPE_CHECKING, Any, Callable, Generic from pydantic import ValidationError -from typing_extensions import NotRequired, TypedDict +from typing_extensions import NotRequired, TypedDict, TypeVar from crawlee import EnqueueStrategy from crawlee._request import BaseRequestData @@ -20,6 +20,9 @@ from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext from ._utils import infinite_scroll +TCrawlingContext = TypeVar('TCrawlingContext', bound=PlaywrightCrawlingContext) +TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState) + if TYPE_CHECKING: from collections.abc import AsyncGenerator, Awaitable, Mapping @@ -289,12 +292,11 @@ def pre_navigation_hook(self, hook: Callable[[PlaywrightPreNavCrawlingContext], self._pre_navigation_hooks.append(hook) -@docs_group('Data structures') -class PlaywrightCrawlerAdditionalOptions(TypedDict): +class _PlaywrightCrawlerAdditionalOptions(TypedDict): """Additional arguments for the `PlaywrightCrawler` constructor. It is intended for typing forwarded `__init__` arguments in the subclasses. 
- All arguments are `BasicCrawlerOptions` + `PlaywrightCrawlerAdditionalOptions` + All arguments are `BasicCrawlerOptions` + `_PlaywrightCrawlerAdditionalOptions` """ browser_pool: NotRequired[BrowserPool] @@ -302,20 +304,31 @@ class PlaywrightCrawlerAdditionalOptions(TypedDict): browser_type: NotRequired[BrowserType] """The type of browser to launch ('chromium', 'firefox', or 'webkit'). - This option should not be used if `browser_pool` is provided.""" + This option should not be used if `browser_pool` is provided.""" browser_launch_options: NotRequired[Mapping[str, Any]] """Keyword arguments to pass to the browser launch method. These options are provided - directly to Playwright's `browser_type.launch` method. For more details, refer to the Playwright - documentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch. - This option should not be used if `browser_pool` is provided.""" + directly to Playwright's `browser_type.launch` method. For more details, refer to the Playwright + documentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch. + This option should not be used if `browser_pool` is provided.""" browser_new_context_options: NotRequired[Mapping[str, Any]] - """Keyword arguments to pass to the browser new context method. These options - are provided directly to Playwright's `browser.new_context` method. For more details, refer to the - Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context. - This option should not be used if `browser_pool` is provided.""" + """Keyword arguments to pass to the browser new context method. These options are provided directly to Playwright's + `browser.new_context` method. For more details, refer to the Playwright documentation: + https://playwright.dev/python/docs/api/class-browser#browser-new-context. This option should not be used if + `browser_pool` is provided.""" headless: NotRequired[bool] - """Whether to run the browser in headless mode. - This option should not be used if `browser_pool` is provided.""" + """Whether to run the browser in headless mode. This option should not be used if `browser_pool` is provided.""" + + +@docs_group('Data structures') +class PlaywrightCrawlerOptions( + Generic[TCrawlingContext, TStatisticsState], + _PlaywrightCrawlerAdditionalOptions, + BasicCrawlerOptions[TCrawlingContext, StatisticsState], +): + """Arguments for the `PlaywrightCrawler` constructor. + + It is intended for typing forwarded `__init__` arguments in the subclasses. + """ From 9bde9dcc50db86e19db28c60089d14c7df2b517e Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 21 Jan 2025 14:15:45 +0100 Subject: [PATCH 47/79] Add docstrings to adaptive context public stuff Fix typo --- .../_adaptive_playwright_crawling_context.py | 17 +++++++++++++++++ .../_adaptive_playwright/_result_comparator.py | 2 +- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py index 0277ba5ca6..2a13181811 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py @@ -37,18 +37,31 @@ class AdaptivePlaywrightCrawlingContext(Generic[TStaticParseResult], ParsedHttpC @property def page(self) -> Page: + """The Playwright `Page` object for the current page.
+ + Raises `AdaptiveContextError` if accessed during static crawling. + """ if not self._page: raise AdaptiveContextError('Page was not crawled with PlaywrightCrawler.') return self._page @property def infinite_scroll(self) -> Callable[[], Awaitable[None]]: + """A function to perform infinite scrolling on the page. + + This scrolls to the bottom, triggering the loading of additional content if present. + Raises `AdaptiveContextError` if accessed during static crawling. + """ if not self._infinite_scroll: raise AdaptiveContextError('Page was not crawled with PlaywrightCrawler.') return self._infinite_scroll @property def response(self) -> Response: + """The Playwright `Response` object containing the response details for the current URL. + + Raises `AdaptiveContextError` if accessed during static crawling. + """ if not self._response: raise AdaptiveContextError('Page was not crawled with PlaywrightCrawler.') return self._response @@ -93,6 +106,10 @@ class AdaptivePlaywrightPreNavCrawlingContext(BasicCrawlingContext): @property def page(self) -> Page: + """The Playwright `Page` object for the current page. + + Raises `AdaptiveContextError` if accessed during static crawling. + """ if self._page is not None: return self._page raise AdaptiveContextError('Page is not crawled with PlaywrightCrawler.') diff --git a/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py b/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py index d6a6c376f6..c7c21eae73 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py @@ -41,5 +41,5 @@ def full_result_comparator(result_1: RequestHandlerRunResult, result_2: RequestH def push_data_only_comparator(result_1: RequestHandlerRunResult, result_2: RequestHandlerRunResult) -> bool: - """Compare results by comparing their push dara calls. Ignore other parts of results in comparison.""" + """Compare results by comparing their push data calls. Ignore other parts of results in comparison.""" return result_1.push_data_calls == result_2.push_data_calls From 9a145698d4a259719213dc71b1322172424856db Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 21 Jan 2025 14:23:43 +0100 Subject: [PATCH 48/79] Make crawl_one_with private. Update comments. 
--- .../_adaptive_playwright_crawler.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 24a58bc89b..2be9f7ff3a 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -11,7 +11,7 @@ from bs4 import BeautifulSoup from parsel import Selector -from typing_extensions import Self, TypeVar +from typing_extensions import Self, TypeVar, override from crawlee._types import BasicCrawlingContext, JsonSerializable, RequestHandlerRunResult from crawlee._utils.wait import wait_for @@ -280,14 +280,17 @@ def with_parsel_static_parser( **kwargs, ) - async def crawl_one_with( + async def _crawl_one_with( self, subcrawler_pipeline: _OrphanPlaywrightContextPipeline | _OrphanStaticContextPipeline, context: BasicCrawlingContext, result: RequestHandlerRunResult, state: dict[str, JsonSerializable] | None = None, ) -> RequestHandlerRunResult: - # Timeout to ensure that both sub crawlers can finish one request withing top crawler `request_handler_timeout`. + """Perform a one request crawl with specific context pipeline and return result of this crawl. + + Use `context`, `result` and `state` to create new copy-like context that is passed to the `subcrawler_pipeline`. + """ if state is not None: async def get_input_state( @@ -321,7 +324,7 @@ async def get_input_state( ) return result - # Can't use override as mypy does not like it for double underscore private method. + @override async def _run_request_handler(self, context: BasicCrawlingContext) -> None: """Override BasicCrawler method that delegates request processing to sub crawlers. @@ -341,7 +344,7 @@ async def _run_subcrawler_pipeline( Produces `SubCrawlerRun` that either contains filled `RequestHandlerRunResult` or exception. 
""" try: - crawl_result = await self.crawl_one_with( + crawl_result = await self._crawl_one_with( subcrawler_pipeline=subcrawler_pipeline, context=context, result=RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store), From fd8dd8221389a7bb6d2036c9a00a85338443679e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Josef=20Proch=C3=A1zka?= Date: Wed, 22 Jan 2025 14:17:09 +0100 Subject: [PATCH 49/79] Update tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py Co-authored-by: Jan Buchar --- .../_adaptive_playwright/test_adaptive_playwright_crawler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index b91d898218..fc7facb976 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -218,9 +218,9 @@ async def pre_nav_hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None context.request.user_data['data'] = 'bs' await crawler.run(test_urls[:1]) - # Check that pre nav hooks does not influence each other + # Check that repeated pre nav hook invocations do not influence each other while probing assert user_data_in_pre_nav_hook == [None, None] - # Check that pre nav hooks can modify context + # Check that the request handler sees changes to user data done by pre nav hooks assert user_data_in_handler == ['pw', 'bs'] From 422121902e8c007960b799b1bfd0a1de8caa4f95 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 22 Jan 2025 14:36:49 +0100 Subject: [PATCH 50/79] Review comments --- .../_adaptive_playwright_crawler.py | 3 +-- .../_adaptive_playwright_crawling_context.py | 10 +++++----- .../test_adaptive_playwright_crawler.py | 11 ++++------- 3 files changed, 10 insertions(+), 14 deletions(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 2be9f7ff3a..c180aa1548 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -24,7 +24,6 @@ ParsedHttpCrawlingContext, PlaywrightCrawler, PlaywrightCrawlingContext, - PlaywrightPreNavCrawlingContext, ) from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler_statistics import ( AdaptivePlaywrightCrawlerStatisticState, @@ -208,7 +207,7 @@ def __init__( self._pre_navigation_hooks = list[Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]]]() - async def adaptive_pre_navigation_hook(context: BasicCrawlingContext | PlaywrightPreNavCrawlingContext) -> None: + async def adaptive_pre_navigation_hook(context: BasicCrawlingContext) -> None: for hook in self._pre_navigation_hooks: await hook(AdaptivePlaywrightPreNavCrawlingContext.from_pre_navigation_contexts(context)) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py index 2a13181811..ed6a514984 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py @@ -85,7 +85,7 @@ async def from_playwright_crawling_context( context_kwargs['_infinite_scroll'] = 
context_kwargs.pop('infinite_scroll') # This might not be always available. protocol_guess = await context_kwargs['_page'].evaluate('() => performance.getEntries()[0].nextHopProtocol') - http_response = await _HttpResponse.from_playwright_response( + http_response = await _PlaywrightHttpResponse.from_playwright_response( response=context.response, protocol=protocol_guess or '' ) return cls( @@ -115,9 +115,7 @@ def page(self) -> Page: raise AdaptiveContextError('Page is not crawled with PlaywrightCrawler.') @classmethod - def from_pre_navigation_contexts( - cls, context: AdaptivePlaywrightPreNavCrawlingContext | BasicCrawlingContext - ) -> Self: + def from_pre_navigation_contexts(cls, context: BasicCrawlingContext) -> Self: """Convenience constructor that creates new context from existing pre navigation contexts.""" context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)} context_kwargs['_page'] = context_kwargs.pop('page', None) @@ -125,7 +123,9 @@ def from_pre_navigation_contexts( @dataclass(frozen=True) -class _HttpResponse: +class _PlaywrightHttpResponse: + """Wrapper class for playwright `Response` object to implement `HttpResponse` protocol.""" + http_version: str status_code: int headers: HttpHeaders diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index fc7facb976..6243251df0 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -245,15 +245,12 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: await crawler.run(test_urls[:1]) - dataset = await crawler.get_dataset() - items = [item async for item in dataset.iterate_items()] - # Enforced rendering type detection will trigger both sub crawlers, but only pw crawler result is saved. - assert items == [{'handler': 'pw'}] + assert (await crawler.get_data()).items == [{'handler': 'pw'}] @pytest.mark.parametrize( - ('pw_saved_data', 'static_saved_data', 'expected_result_renderingl_type'), + ('pw_saved_data', 'static_saved_data', 'expected_result_rendering_type'), [ pytest.param({'some': 'data'}, {'some': 'data'}, 'static', id='Same results from sub crawlers'), pytest.param({'some': 'data'}, {'different': 'data'}, 'client only', id='Different results from sub crawlers'), @@ -262,7 +259,7 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: async def test_adaptive_crawling_predictor_calls( pw_saved_data: dict[str, str], static_saved_data: dict[str, str], - expected_result_renderingl_type: RenderingType, + expected_result_rendering_type: RenderingType, test_urls: list[str], ) -> None: """Tests expected predictor calls. Same results.""" @@ -294,7 +291,7 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: mocked_predict.assert_called_once_with(requests[0]) # If `static` and `client only` results are same, `store_result` should be called with `static`. 
- mocked_store_result.assert_called_once_with(requests[0], expected_result_renderingl_type) + mocked_store_result.assert_called_once_with(requests[0], expected_result_rendering_type) async def test_adaptive_crawling_result_use_state_isolation(test_urls: list[str]) -> None: From 565d36b0c2669aba27eb5e6945d0550bd11c592f Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 22 Jan 2025 15:03:25 +0100 Subject: [PATCH 51/79] Remove _run_subcrawler_pipeline --- .../_adaptive_playwright_crawler.py | 56 +++++++------------ 1 file changed, 21 insertions(+), 35 deletions(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index c180aa1548..5bce2e1eb3 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -283,13 +283,14 @@ async def _crawl_one_with( self, subcrawler_pipeline: _OrphanPlaywrightContextPipeline | _OrphanStaticContextPipeline, context: BasicCrawlingContext, - result: RequestHandlerRunResult, state: dict[str, JsonSerializable] | None = None, - ) -> RequestHandlerRunResult: - """Perform a one request crawl with specific context pipeline and return result of this crawl. + ) -> SubCrawlerRun: + """Perform a one request crawl with specific context pipeline and return `SubCrawlerRun`. + `SubCrawlerRun` container either result of the crawl or the exception that was thrown during the crawl. Use `context`, `result` and `state` to create new copy-like context that is passed to the `subcrawler_pipeline`. """ + result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store) if state is not None: async def get_input_state( @@ -313,15 +314,18 @@ async def get_input_state( log=context.log, ) - await wait_for( - lambda: subcrawler_pipeline.create_pipeline_call(context_linked_to_result), - timeout=self._request_handler_timeout, - timeout_message=( - f'{subcrawler_pipeline=!s} timed out after {self._request_handler_timeout.total_seconds()}seconds' - ), - logger=self._logger, - ) - return result + try: + await wait_for( + lambda: subcrawler_pipeline.create_pipeline_call(context_linked_to_result), + timeout=self._request_handler_timeout, + timeout_message=( + f'{subcrawler_pipeline=!s} timed out after {self._request_handler_timeout.total_seconds()}seconds' + ), + logger=self._logger, + ) + return SubCrawlerRun(result=result) + except Exception as e: + return SubCrawlerRun(exception=e) @override async def _run_request_handler(self, context: BasicCrawlingContext) -> None: @@ -333,26 +337,6 @@ async def _run_request_handler(self, context: BasicCrawlingContext) -> None: Reference implementation: https://github.com/apify/crawlee/blob/master/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts """ - - async def _run_subcrawler_pipeline( - subcrawler_pipeline: _OrphanPlaywrightContextPipeline | _OrphanStaticContextPipeline, - use_state: dict | None = None, - ) -> SubCrawlerRun: - """Helper closure that creates new `RequestHandlerRunResult` and delegates request handling to sub crawler. - - Produces `SubCrawlerRun` that either contains filled `RequestHandlerRunResult` or exception. 
- """ - try: - crawl_result = await self._crawl_one_with( - subcrawler_pipeline=subcrawler_pipeline, - context=context, - result=RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store), - state=use_state, - ) - return SubCrawlerRun(result=crawl_result) - except Exception as e: - return SubCrawlerRun(exception=e) - rendering_type_prediction = self.rendering_type_predictor.predict(context.request) should_detect_rendering_type = random() < rendering_type_prediction.detection_probability_recommendation @@ -364,7 +348,7 @@ async def _run_subcrawler_pipeline( context.log.debug(f'Running static request for {context.request.url}') self.track_http_only_request_handler_runs() - static_run = await _run_subcrawler_pipeline(self._static_context_pipeline) + static_run = await self._crawl_one_with(self._static_context_pipeline, context=context) if static_run.result and self.result_checker(static_run.result): await self._push_result_to_context(result=static_run.result, context=context) return @@ -388,7 +372,7 @@ async def _run_subcrawler_pipeline( old_state: dict[str, JsonSerializable] = await kvs.get_value(self._CRAWLEE_STATE_KEY, default_value) old_state_copy = deepcopy(old_state) - pw_run = await _run_subcrawler_pipeline(self._pw_context_pipeline) + pw_run = await self._crawl_one_with(self._pw_context_pipeline, context=context) self.track_browser_request_handler_runs() if pw_run.exception is not None: @@ -399,7 +383,9 @@ async def _run_subcrawler_pipeline( if should_detect_rendering_type: detection_result: RenderingType - static_run = await _run_subcrawler_pipeline(self._static_context_pipeline, use_state=old_state_copy) + static_run = await self._crawl_one_with( + self._static_context_pipeline, context=context, state=old_state_copy + ) if static_run.result and self.result_comparator(static_run.result, pw_run.result): detection_result = 'static' From 4bd8251acd9123555732c0dd62c8c5f3d0b9460c Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 22 Jan 2025 16:24:38 +0100 Subject: [PATCH 52/79] Remove Orphans --- .../_adaptive_playwright_crawler.py | 118 +++++++----------- 1 file changed, 48 insertions(+), 70 deletions(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 5bce2e1eb3..f1d3cc2c6d 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -2,12 +2,11 @@ import asyncio import logging -from collections.abc import Awaitable, Callable +from collections.abc import Awaitable, Callable, Coroutine from copy import deepcopy -from dataclasses import dataclass from logging import getLogger from random import random -from typing import TYPE_CHECKING, Any, Generic +from typing import TYPE_CHECKING, Any, Generic, get_args from bs4 import BeautifulSoup from parsel import Selector @@ -20,7 +19,6 @@ AbstractHttpParser, BasicCrawler, BeautifulSoupParserType, - ContextPipeline, ParsedHttpCrawlingContext, PlaywrightCrawler, PlaywrightCrawlingContext, @@ -48,7 +46,6 @@ from crawlee.statistics import Statistics, StatisticsState if TYPE_CHECKING: - from collections.abc import Coroutine from types import TracebackType from typing_extensions import Unpack @@ -58,7 +55,6 @@ ) from crawlee.crawlers._basic._basic_crawler import _BasicCrawlerOptions from crawlee.crawlers._playwright._playwright_crawler import _PlaywrightCrawlerAdditionalOptions - from crawlee.router import 
Router TStaticParseResult = TypeVar('TStaticParseResult') @@ -85,49 +81,6 @@ async def __aexit__( self._active = False -@dataclass -class _OrphanPlaywrightContextPipeline(Generic[TStaticParseResult]): - """Minimal setup required by playwright context pipeline to work without crawler.""" - - pipeline: ContextPipeline[PlaywrightCrawlingContext] - top_router: Router[AdaptivePlaywrightCrawlingContext] - static_parser: AbstractHttpParser[TStaticParseResult] - - def create_pipeline_call(self, top_context: BasicCrawlingContext) -> Coroutine[Any, Any, None]: - """Call that will be used by the top crawler to run through the pipeline.""" - - async def from_pipeline_to_top_router(context: PlaywrightCrawlingContext) -> None: - adaptive_crawling_context = await AdaptivePlaywrightCrawlingContext.from_playwright_crawling_context( - context=context, parser=self.static_parser - ) - await self.top_router(adaptive_crawling_context) - - return self.pipeline(top_context, from_pipeline_to_top_router) - - def __str__(self) -> str: - return 'Playwright context pipeline' - - -@dataclass -class _OrphanStaticContextPipeline(Generic[TStaticCrawlingContext]): - """Minimal setup required by static context pipeline to work without crawler.""" - - pipeline: ContextPipeline[TStaticCrawlingContext] - top_router: Router[AdaptivePlaywrightCrawlingContext] - - def create_pipeline_call(self, top_context: BasicCrawlingContext) -> Coroutine[Any, Any, None]: - """Call that will be used by the top crawler to run through the pipeline.""" - - async def from_pipeline_to_top_router(context: TStaticCrawlingContext) -> None: - adaptive_crawling_context = AdaptivePlaywrightCrawlingContext.from_parsed_http_crawling_context(context) - await self.top_router(adaptive_crawling_context) - - return self.pipeline(top_context, from_pipeline_to_top_router) - - def __str__(self) -> str: - return 'Static context pipeline' - - class AdaptivePlaywrightCrawler( Generic[TStaticCrawlingContext, TStaticParseResult], BasicCrawler[AdaptivePlaywrightCrawlingContext, AdaptivePlaywrightCrawlerStatisticState], @@ -216,15 +169,10 @@ async def adaptive_pre_navigation_hook(context: BasicCrawlingContext) -> None: self._additional_context_managers = [*self._additional_context_managers, playwright_crawler._browser_pool] # noqa: SLF001 # Intentional access to private member. - self._pw_context_pipeline = _OrphanPlaywrightContextPipeline( - pipeline=playwright_crawler._context_pipeline, # noqa:SLF001 # Intentional access to private member. - top_router=self.router, - static_parser=static_parser, - ) - self._static_context_pipeline = _OrphanStaticContextPipeline[ParsedHttpCrawlingContext[TStaticParseResult]]( - pipeline=static_crawler._context_pipeline, # noqa:SLF001 # Intentional access to private member. - top_router=self.router, - ) + # Sub crawler pipeline related + self._pw_context_pipeline = playwright_crawler._context_pipeline # noqa:SLF001 # Intentional access to private member. + self._static_context_pipeline = static_crawler._context_pipeline # noqa:SLF001 # Intentional access to private member. 
+ self._static_parser = static_parser @staticmethod def with_beautifulsoup_static_parser( @@ -279,18 +227,18 @@ def with_parsel_static_parser( **kwargs, ) - async def _crawl_one_with( + async def _crawl_one( self, - subcrawler_pipeline: _OrphanPlaywrightContextPipeline | _OrphanStaticContextPipeline, + rendering_type: RenderingType, context: BasicCrawlingContext, state: dict[str, JsonSerializable] | None = None, ) -> SubCrawlerRun: """Perform a one request crawl with specific context pipeline and return `SubCrawlerRun`. - `SubCrawlerRun` container either result of the crawl or the exception that was thrown during the crawl. - Use `context`, `result` and `state` to create new copy-like context that is passed to the `subcrawler_pipeline`. + `SubCrawlerRun` contains either result of the crawl or the exception that was thrown during the crawl. + Sub crawler pipeline call is dynamically created based on the `rendering_type`. + New copy-like context is created from passed `context` and `state` and is passed to sub crawler pipeline. """ - result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store) if state is not None: async def get_input_state( @@ -302,6 +250,8 @@ async def get_input_state( else: use_state_function = context.use_state + # New result is created and injected to newly created context. This is done to ensure isolation of sub crawlers. + result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store) context_linked_to_result = BasicCrawlingContext( request=deepcopy(context.request), session=deepcopy(context.session), @@ -316,10 +266,12 @@ async def get_input_state( try: await wait_for( - lambda: subcrawler_pipeline.create_pipeline_call(context_linked_to_result), + lambda: self._pipeline_call_factory( + rendering_type=rendering_type, context_linked_to_result=context_linked_to_result + ), timeout=self._request_handler_timeout, timeout_message=( - f'{subcrawler_pipeline=!s} timed out after {self._request_handler_timeout.total_seconds()}seconds' + f'{rendering_type=!s} timed out after {self._request_handler_timeout.total_seconds()}seconds' ), logger=self._logger, ) @@ -327,6 +279,34 @@ async def get_input_state( except Exception as e: return SubCrawlerRun(exception=e) + def _pipeline_call_factory( + self, rendering_type: RenderingType, context_linked_to_result: BasicCrawlingContext + ) -> Coroutine[Any, Any, None]: + """Create sub crawler pipeline call.""" + if rendering_type == 'static': + + async def from_static_pipeline_to_top_router( + context: ParsedHttpCrawlingContext[TStaticParseResult], + ) -> None: + adaptive_crawling_context = AdaptivePlaywrightCrawlingContext.from_parsed_http_crawling_context(context) + await self.router(adaptive_crawling_context) + + return self._static_context_pipeline(context_linked_to_result, from_static_pipeline_to_top_router) + + if rendering_type == 'client only': + + async def from_pw_pipeline_to_top_router(context: PlaywrightCrawlingContext) -> None: + adaptive_crawling_context = await AdaptivePlaywrightCrawlingContext.from_playwright_crawling_context( + context=context, parser=self._static_parser + ) + await self.router(adaptive_crawling_context) + + return self._pw_context_pipeline(context_linked_to_result, from_pw_pipeline_to_top_router) + + raise RuntimeError( + f"Not a valid rendering type. 
Must be one of the following: {', '.join(get_args(RenderingType))}" + ) + @override async def _run_request_handler(self, context: BasicCrawlingContext) -> None: """Override BasicCrawler method that delegates request processing to sub crawlers. @@ -348,7 +328,7 @@ async def _run_request_handler(self, context: BasicCrawlingContext) -> None: context.log.debug(f'Running static request for {context.request.url}') self.track_http_only_request_handler_runs() - static_run = await self._crawl_one_with(self._static_context_pipeline, context=context) + static_run = await self._crawl_one(rendering_type='static', context=context) if static_run.result and self.result_checker(static_run.result): await self._push_result_to_context(result=static_run.result, context=context) return @@ -372,7 +352,7 @@ async def _run_request_handler(self, context: BasicCrawlingContext) -> None: old_state: dict[str, JsonSerializable] = await kvs.get_value(self._CRAWLEE_STATE_KEY, default_value) old_state_copy = deepcopy(old_state) - pw_run = await self._crawl_one_with(self._pw_context_pipeline, context=context) + pw_run = await self._crawl_one('client only', context=context) self.track_browser_request_handler_runs() if pw_run.exception is not None: @@ -383,9 +363,7 @@ async def _run_request_handler(self, context: BasicCrawlingContext) -> None: if should_detect_rendering_type: detection_result: RenderingType - static_run = await self._crawl_one_with( - self._static_context_pipeline, context=context, state=old_state_copy - ) + static_run = await self._crawl_one('static', context=context, state=old_state_copy) if static_run.result and self.result_comparator(static_run.result, pw_run.result): detection_result = 'static' From fc95132340e0fe88700368e68cb8622aab914c46 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 22 Jan 2025 16:32:52 +0100 Subject: [PATCH 53/79] Move SubCrawlerRun to where it is used --- .../_adaptive_playwright/_adaptive_playwright_crawler.py | 8 +++++++- .../crawlers/_adaptive_playwright/_result_comparator.py | 7 ------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index f1d3cc2c6d..78cfda6487 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -4,6 +4,7 @@ import logging from collections.abc import Awaitable, Callable, Coroutine from copy import deepcopy +from dataclasses import dataclass from logging import getLogger from random import random from typing import TYPE_CHECKING, Any, Generic, get_args @@ -38,7 +39,6 @@ RenderingTypePredictor, ) from crawlee.crawlers._adaptive_playwright._result_comparator import ( - SubCrawlerRun, create_default_comparator, ) from crawlee.crawlers._beautifulsoup._beautifulsoup_parser import BeautifulSoupParser @@ -402,3 +402,9 @@ def track_browser_request_handler_runs(self) -> None: def track_rendering_type_mispredictions(self) -> None: self.statistics.state.rendering_type_mispredictions += 1 + + +@dataclass(frozen=True) +class SubCrawlerRun: + result: RequestHandlerRunResult | None = None + exception: Exception | None = None diff --git a/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py b/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py index c7c21eae73..a71e25a40d 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py +++ 
b/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py @@ -1,6 +1,5 @@ from __future__ import annotations -from dataclasses import dataclass from typing import TYPE_CHECKING if TYPE_CHECKING: @@ -9,12 +8,6 @@ from crawlee._types import RequestHandlerRunResult -@dataclass(frozen=True) -class SubCrawlerRun: - result: RequestHandlerRunResult | None = None - exception: Exception | None = None - - def create_default_comparator( result_checker: Callable[[RequestHandlerRunResult], bool] | None, ) -> Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool]: From 4f316b50d95599248f84f395f0eecccd6cd37b62 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 22 Jan 2025 17:02:37 +0100 Subject: [PATCH 54/79] Use custom _TestInput dataclass --- .../test_adaptive_playwright_crawler.py | 69 +++++++++++++++---- 1 file changed, 54 insertions(+), 15 deletions(-) diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index 6243251df0..0ab1fedc22 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -2,6 +2,7 @@ import asyncio import logging +from dataclasses import dataclass from datetime import timedelta from itertools import cycle from typing import TYPE_CHECKING, cast @@ -80,7 +81,7 @@ class _SimpleRenderingTypePredictor(RenderingTypePredictor): def __init__( self, rendering_types: Iterator[RenderingType] | None = None, - detection_probability_recommendation: None | Iterator[int] = None, + detection_probability_recommendation: None | Iterator[float] = None, ) -> None: self._rendering_types = rendering_types or cycle(['static']) self._detection_probability_recommendation = detection_probability_recommendation or cycle([1]) @@ -94,26 +95,64 @@ def store_result(self, request: Request, crawl_type: RenderingType) -> None: pass +@dataclass(frozen=True) +class _TestInput: + expected_pw_count: int + expected_static_count: int + rendering_types: Iterator[RenderingType] + detection_probability_recommendation: Iterator[float] + + @pytest.mark.parametrize( - ('expected_pw_count', 'expected_static_count', 'rendering_types', 'detection_probability_recommendation'), + 'test_input', [ - pytest.param(0, 2, cycle(['static']), cycle([0]), id='Static only'), - pytest.param(2, 0, cycle(['client only']), cycle([0]), id='Client only'), - pytest.param(1, 1, cycle(['static', 'client only']), cycle([0]), id='Mixed'), - pytest.param(2, 2, cycle(['static', 'client only']), cycle([1]), id='Enforced rendering type detection'), + pytest.param( + _TestInput( + expected_pw_count=0, + expected_static_count=2, + rendering_types=cycle(['static']), + detection_probability_recommendation=cycle([0]), + ), + id='Static only', + ), + pytest.param( + _TestInput( + expected_pw_count=2, + expected_static_count=0, + rendering_types=cycle(['client only']), + detection_probability_recommendation=cycle([0]), + ), + id='Client only', + ), + pytest.param( + _TestInput( + expected_pw_count=1, + expected_static_count=1, + rendering_types=cycle(['static', 'client only']), + detection_probability_recommendation=cycle([0]), + ), + id='Mixed', + ), + pytest.param( + _TestInput( + expected_pw_count=2, + expected_static_count=2, + rendering_types=cycle(['static', 'client only']), + detection_probability_recommendation=cycle([1]), + ), + id='Enforced rendering type detection', + ), ], ) async def 
test_adaptive_crawling( - expected_pw_count: int, - expected_static_count: int, - rendering_types: Iterator[RenderingType], - detection_probability_recommendation: Iterator[int], + test_input: _TestInput, test_urls: list[str], ) -> None: """Tests correct routing to pre-nav hooks and correct handling through proper handler.""" predictor = _SimpleRenderingTypePredictor( - rendering_types=rendering_types, detection_probability_recommendation=detection_probability_recommendation + rendering_types=test_input.rendering_types, + detection_probability_recommendation=test_input.detection_probability_recommendation, ) crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( @@ -153,11 +192,11 @@ async def pre_nav_hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None await crawler.run(test_urls) - assert pw_handler_count == expected_pw_count - assert pw_hook_count == expected_pw_count + assert pw_handler_count == test_input.expected_pw_count + assert pw_hook_count == test_input.expected_pw_count - assert static_handler_count == expected_static_count - assert static_hook_count == expected_static_count + assert static_handler_count == test_input.expected_static_count + assert static_hook_count == test_input.expected_static_count async def test_adaptive_crawling_parcel(test_urls: list[str]) -> None: From 949c4ffc87785abcbde3f8b98d12e208fcbbea85 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 23 Jan 2025 08:44:33 +0100 Subject: [PATCH 55/79] Review comments --- .../code/adaptive_playwright_crawler.py | 25 ++++++++----------- .../_adaptive_playwright_crawler.py | 13 ++++++---- .../_adaptive_playwright_crawling_context.py | 2 +- .../test_adaptive_playwright_crawler.py | 4 +-- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/docs/examples/code/adaptive_playwright_crawler.py b/docs/examples/code/adaptive_playwright_crawler.py index 43314ab59e..02692c3ce9 100644 --- a/docs/examples/code/adaptive_playwright_crawler.py +++ b/docs/examples/code/adaptive_playwright_crawler.py @@ -15,24 +15,21 @@ async def main() -> None: max_requests_per_crawl=5, playwright_crawler_specific_kwargs={'headless': False} ) + @crawler.router.handler(label='label') + async def request_handler_for_label(context: AdaptivePlaywrightCrawlingContext) -> None: + # Do some processing using `page` + some_locator = context.page.locator('div').first + await some_locator.wait_for() + # Do stuff with locator... + context.log.info(f'Playwright processing of: {context.request.url} ...') + @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: - # Code that will be executed in both crawl types context.log.info(f'User handler processing: {context.request.url} ...') + # Do some processing using `parsed_content` + context.log.info(context.parsed_content.title) - try: - some_locator = context.page.locator('div').first - # Code that will be executed only in Playwright crawl. - # Trying to access `context.page` in static crawl will throw `AdaptiveContextError`. - - await some_locator.wait_for() - # Do stuff with locator... - context.log.info(f'Playwright processing of: {context.request.url} ...') - except AdaptiveContextError: - # Code that will be executed in only in static crawl - context.log.info(f'Static processing of: {context.request.url} ...') - - # FInd more links and enqueue them. + # Find more links and enqueue them. 
await context.enqueue_links() await context.push_data({'Top crawler Url': context.request.url}) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 78cfda6487..66542a978b 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -61,8 +61,11 @@ TStaticCrawlingContext = TypeVar('TStaticCrawlingContext', bound=ParsedHttpCrawlingContext) -class _NoActiveStatistics(Statistics): - """Statistics compliant object that is not supposed to do anything when active. To be used in sub crawlers.""" +class _NonPersistentStatistics(Statistics): + """Statistics compliant object that is not supposed to do anything when entering/exiting context. + + To be used in sub crawlers. + """ def __init__(self) -> None: super().__init__(state_model=StatisticsState) @@ -148,12 +151,12 @@ def __init__( static_crawler = static_crawler_class( parser=static_parser, - statistics=_NoActiveStatistics(), + statistics=_NonPersistentStatistics(), **static_crawler_specific_kwargs, **basic_crawler_kwargs_for_static_crawler, ) playwright_crawler = PlaywrightCrawler( - statistics=_NoActiveStatistics(), + statistics=_NonPersistentStatistics(), **playwright_crawler_specific_kwargs, **basic_crawler_kwargs_for_pw_crawler, ) @@ -162,7 +165,7 @@ def __init__( async def adaptive_pre_navigation_hook(context: BasicCrawlingContext) -> None: for hook in self._pre_navigation_hooks: - await hook(AdaptivePlaywrightPreNavCrawlingContext.from_pre_navigation_contexts(context)) + await hook(AdaptivePlaywrightPreNavCrawlingContext.from_pre_navigation_context(context)) playwright_crawler.pre_navigation_hook(adaptive_pre_navigation_hook) static_crawler.pre_navigation_hook(adaptive_pre_navigation_hook) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py index ed6a514984..cb36569dab 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py @@ -115,7 +115,7 @@ def page(self) -> Page: raise AdaptiveContextError('Page is not crawled with PlaywrightCrawler.') @classmethod - def from_pre_navigation_contexts(cls, context: BasicCrawlingContext) -> Self: + def from_pre_navigation_context(cls, context: BasicCrawlingContext) -> Self: """Convenience constructor that creates new context from existing pre navigation contexts.""" context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)} context_kwargs['_page'] = context_kwargs.pop('page', None) diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index 0ab1fedc22..02131e0768 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -199,8 +199,8 @@ async def pre_nav_hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None assert static_hook_count == test_input.expected_static_count -async def test_adaptive_crawling_parcel(test_urls: list[str]) -> None: - """Top level test for parcel. Only one argument combination. 
(The rest of code is tested with bs variant.)""" +async def test_adaptive_crawling_parsel(test_urls: list[str]) -> None: + """Top level test for parsel. Only one argument combination. (The rest of code is tested with bs variant.)""" predictor = _SimpleRenderingTypePredictor( rendering_types=cycle(['static', 'client only']), detection_probability_recommendation=cycle([0]) ) From 14d6f4f94d88e30b4f27dfc77838f64b02642a90 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 24 Jan 2025 15:27:59 +0100 Subject: [PATCH 56/79] WIP TODO: finish the test draft --- .../_abstract_http/_abstract_http_parser.py | 9 +++ .../_adaptive_playwright_crawler.py | 9 +-- .../_adaptive_playwright_crawling_context.py | 32 ++++++++++- .../_beautifulsoup/_beautifulsoup_crawler.py | 6 +- .../_beautifulsoup_crawling_context.py | 11 ++-- .../_beautifulsoup/_beautifulsoup_parser.py | 16 ++++-- src/crawlee/crawlers/_beautifulsoup/_utils.py | 2 +- src/crawlee/crawlers/_http/_http_parser.py | 8 +++ .../crawlers/_parsel/_parsel_parser.py | 10 ++++ .../test_adaptive_playwright_crawler.py | 56 ++++++++++++++++++- ...st_adaptive_playwright_crawling_context.py | 0 11 files changed, 137 insertions(+), 22 deletions(-) create mode 100644 tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawling_context.py diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py index f6fba50ad6..a195070b3e 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py @@ -30,6 +30,15 @@ async def parse(self, response: HttpResponse) -> TParseResult: Parsed HTTP response. """ + @abstractmethod + async def parse_text(self, text: str) -> TParseResult: + ... + + @abstractmethod + async def select(self,parsed_content: TParseResult, selector: str) -> TParseResult|None: + ... + + def is_blocked(self, parsed_content: TParseResult) -> BlockedInfo: """Detect if blocked and return BlockedInfo with additional information. 
diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 66542a978b..e991c3807f 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -9,7 +9,7 @@ from random import random from typing import TYPE_CHECKING, Any, Generic, get_args -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, Tag from parsel import Selector from typing_extensions import Self, TypeVar, override @@ -187,13 +187,13 @@ def with_beautifulsoup_static_parser( playwright_crawler_specific_kwargs: _PlaywrightCrawlerAdditionalOptions | None = None, statistics: Statistics[StatisticsState] | None = None, **kwargs: Unpack[_BasicCrawlerOptions], - ) -> AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[BeautifulSoup], BeautifulSoup]: + ) -> AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[Tag], Tag]: """Creates `AdaptivePlaywrightCrawler` that uses `BeautifulSoup` for parsing static content.""" if statistics is not None: adaptive_statistics = statistics.replace_state_model(AdaptivePlaywrightCrawlerStatisticState) else: adaptive_statistics = Statistics(state_model=AdaptivePlaywrightCrawlerStatisticState) - return AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[BeautifulSoup], BeautifulSoup]( + return AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[Tag], Tag]( rendering_type_predictor=rendering_type_predictor, result_checker=result_checker, result_comparator=result_comparator, @@ -291,7 +291,8 @@ def _pipeline_call_factory( async def from_static_pipeline_to_top_router( context: ParsedHttpCrawlingContext[TStaticParseResult], ) -> None: - adaptive_crawling_context = AdaptivePlaywrightCrawlingContext.from_parsed_http_crawling_context(context) + adaptive_crawling_context = AdaptivePlaywrightCrawlingContext.from_parsed_http_crawling_context( + context=context, parser=self._static_parser) await self.router(adaptive_crawling_context) return self._static_context_pipeline(context_linked_to_result, from_static_pipeline_to_top_router) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py index cb36569dab..d50329b2ce 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py @@ -31,10 +31,12 @@ class AdaptiveContextError(RuntimeError): @dataclass(frozen=True) @docs_group('Data structures') class AdaptivePlaywrightCrawlingContext(Generic[TStaticParseResult], ParsedHttpCrawlingContext[TStaticParseResult]): + _static_parser: AbstractHttpParser _response: Response | None = None _infinite_scroll: Callable[[], Awaitable[None]] | None = None _page: Page | None = None + @property def page(self) -> Page: """The Playwright `Page` object for the current page. @@ -66,12 +68,37 @@ def response(self) -> Response: raise AdaptiveContextError('Page was not crawled with PlaywrightCrawler.') return self._response + async def wait_for_selector(self, selector: str, timeout: int = 5) -> None: + # `select` is an async parser method, so its result must be awaited before checking it. + if await self._static_parser.select(self.parsed_content, selector) is not None: + return + await self.page.locator(selector).wait_for(timeout=timeout * 1000) + # Should we parse the whole page again?
+ + + async def query_selector(self, selector: str, timeout: int= 5) -> TStaticParseResult: + static_content:TStaticParseResult|None = await self._static_parser.select(self.parsed_content, selector) + if static_content is not None: + return static_content + else: + locator = self.page.locator(selector) + await locator.wait_for(timeout=timeout*1000) + # Should we parse the whole page again? + static_content = await self._static_parser.select(locator.inner_html(), selector) + if static_content is not None: + return static_content + else: + # Selector worked in Playwright, but not in static parser + raise AdaptiveContextError("Used selector is not a valid static selector") + + + @classmethod def from_parsed_http_crawling_context( - cls, context: ParsedHttpCrawlingContext[TStaticParseResult] + cls, context: ParsedHttpCrawlingContext[TStaticParseResult], parser: AbstractHttpParser[TStaticParseResult] ) -> AdaptivePlaywrightCrawlingContext[TStaticParseResult]: """Convenience constructor that creates new context from existing `ParsedHttpCrawlingContext`.""" - return cls(**{field.name: getattr(context, field.name) for field in fields(context)}) + return cls(_static_parser=parser, **{field.name: getattr(context, field.name) for field in fields(context)}) @classmethod async def from_playwright_crawling_context( @@ -91,6 +118,7 @@ async def from_playwright_crawling_context( return cls( parsed_content=await parser.parse(http_response), http_response=http_response, + _static_parser=parser, **context_kwargs, ) diff --git a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py index 68c3b4bbad..c1f778b237 100644 --- a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +++ b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, Tag from crawlee._utils.docs import docs_group from crawlee.crawlers._abstract_http import AbstractHttpCrawler, HttpCrawlerOptions @@ -19,7 +19,7 @@ @docs_group('Classes') -class BeautifulSoupCrawler(AbstractHttpCrawler[BeautifulSoupCrawlingContext, BeautifulSoup]): +class BeautifulSoupCrawler(AbstractHttpCrawler[BeautifulSoupCrawlingContext, Tag]): """A web crawler for performing HTTP requests and parsing HTML/XML content. The `BeautifulSoupCrawler` builds on top of the `AbstractHttpCrawler`, which means it inherits all of its features. 
@@ -68,7 +68,7 @@ def __init__( """ async def final_step( - context: ParsedHttpCrawlingContext[BeautifulSoup], + context: ParsedHttpCrawlingContext[Tag], ) -> AsyncGenerator[BeautifulSoupCrawlingContext, None]: """Enhance `ParsedHttpCrawlingContext[BeautifulSoup]` with `soup` property.""" yield BeautifulSoupCrawlingContext.from_parsed_http_crawling_context(context) diff --git a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawling_context.py b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawling_context.py index 46fc6678a8..d3dd551275 100644 --- a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawling_context.py +++ b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawling_context.py @@ -1,6 +1,7 @@ from dataclasses import dataclass, fields +from typing import cast -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, Tag from typing_extensions import Self from crawlee._utils.docs import docs_group @@ -11,19 +12,19 @@ @dataclass(frozen=True) @docs_group('Data structures') -class BeautifulSoupCrawlingContext(ParsedHttpCrawlingContext[BeautifulSoup]): +class BeautifulSoupCrawlingContext(ParsedHttpCrawlingContext[Tag]): """The crawling context used by the `BeautifulSoupCrawler`. It provides access to key objects as well as utility functions for handling crawling tasks. """ @property - def soup(self) -> BeautifulSoup: + def soup(self) -> Tag: """Convenience alias.""" - return self.parsed_content + return cast(BeautifulSoup, self.parsed_content) @classmethod - def from_parsed_http_crawling_context(cls, context: ParsedHttpCrawlingContext[BeautifulSoup]) -> Self: + def from_parsed_http_crawling_context(cls, context: ParsedHttpCrawlingContext[Tag]) -> Self: """Convenience constructor that creates new context from existing `ParsedHttpCrawlingContext[BeautifulSoup]`.""" return cls(**{field.name: getattr(context, field.name) for field in fields(context)}) diff --git a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py index 155498b5d1..fab7bc2970 100644 --- a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py +++ b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py @@ -13,22 +13,30 @@ from crawlee.http_clients import HttpResponse -class BeautifulSoupParser(AbstractHttpParser[BeautifulSoup]): +class BeautifulSoupParser(AbstractHttpParser[Tag]): """Parser for parsing HTTP response using `BeautifulSoup`.""" def __init__(self, parser: BeautifulSoupParserType = 'lxml') -> None: self._parser = parser @override - async def parse(self, response: HttpResponse) -> BeautifulSoup: + async def parse(self, response: HttpResponse) -> Tag: return BeautifulSoup(response.read(), features=self._parser) @override - def is_matching_selector(self, parsed_content: BeautifulSoup, selector: str) -> bool: + async def parse_text(self, text: str) -> Tag: + return BeautifulSoup(text, features=self._parser) + + @override + def is_matching_selector(self, parsed_content: Tag, selector: str) -> bool: return parsed_content.select_one(selector) is not None @override - def find_links(self, parsed_content: BeautifulSoup, selector: str) -> Iterable[str]: + async def select(self,parsed_content: Tag, selector: str) -> Tag|None: + return parsed_content.select_one(selector) + + @override + def find_links(self, parsed_content: Tag, selector: str) -> Iterable[str]: link: Tag urls: list[str] = [] for link in parsed_content.select(selector): diff --git a/src/crawlee/crawlers/_beautifulsoup/_utils.py 
b/src/crawlee/crawlers/_beautifulsoup/_utils.py index f92990ba9b..4c8719335e 100644 --- a/src/crawlee/crawlers/_beautifulsoup/_utils.py +++ b/src/crawlee/crawlers/_beautifulsoup/_utils.py @@ -17,7 +17,7 @@ from collections.abc import Iterable -def html_to_text(source: str | BeautifulSoup) -> str: +def html_to_text(source: str | Tag) -> str: """Converts markup string or `BeautifulSoup` to newline separated plain text without tags using BeautifulSoup. Args: diff --git a/src/crawlee/crawlers/_http/_http_parser.py b/src/crawlee/crawlers/_http/_http_parser.py index 90b8463fce..526479b88a 100644 --- a/src/crawlee/crawlers/_http/_http_parser.py +++ b/src/crawlee/crawlers/_http/_http_parser.py @@ -23,6 +23,14 @@ class NoParser(AbstractHttpParser[bytes]): async def parse(self, response: HttpResponse) -> bytes: return response.read() + @override + async def parse_text(self, text: str) -> bytes: + raise NotImplementedError() + + @override + async def select(self,parsed_content: bytes, selector: str) -> bytes|None: + raise NotImplementedError() + @override def is_blocked(self, parsed_content: bytes) -> BlockedInfo: # Intentional unused argument. return BlockedInfo(reason='') diff --git a/src/crawlee/crawlers/_parsel/_parsel_parser.py b/src/crawlee/crawlers/_parsel/_parsel_parser.py index 4a61395111..abf1607e4d 100644 --- a/src/crawlee/crawlers/_parsel/_parsel_parser.py +++ b/src/crawlee/crawlers/_parsel/_parsel_parser.py @@ -1,3 +1,4 @@ +from __future__ import annotations import asyncio from collections.abc import Iterable @@ -15,6 +16,15 @@ class ParselParser(AbstractHttpParser[Selector]): async def parse(self, response: HttpResponse) -> Selector: return await asyncio.to_thread(lambda: Selector(body=response.read())) + @override + async def parse_text(self, text: str) -> Selector: + raise NotImplementedError() + + @override + async def select(self,parsed_content: Selector, selector: str) -> Selector|None: + raise NotImplementedError() + + @override def is_matching_selector(self, parsed_content: Selector, selector: str) -> bool: return parsed_content.type in ('html', 'xml') and parsed_content.css(selector).get() is not None diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index 02131e0768..296a83b7df 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -6,10 +6,11 @@ from datetime import timedelta from itertools import cycle from typing import TYPE_CHECKING, cast -from unittest.mock import Mock, patch +from unittest.mock import Mock, patch, call import httpx import pytest +from bs4 import Tag, BeautifulSoup from typing_extensions import override from crawlee import Request @@ -50,7 +51,7 @@ def test_urls(respx_mock: respx.MockRouter) -> list[str]: for url in urls: respx_mock.get(url).return_value = httpx.Response( - status_code=200, content=b'What a body!' + status_code=200, content=b'
<h1>Bla</h1>
' ) return urls @@ -69,9 +70,14 @@ async def new_page( await crawlee_page.page.route( '**/*', lambda route: route.fulfill( - status=200, content_type='text/plain', body='What a body!' + status=200, content_type='text/plain', body='
<h1>Bla</h1>
' ), ) + # + add_h2_js = '''() => {let h2 = document.createElement('h2'); h2.innerText = "Blo"; document.getElementsByTagName("body")[0].append(h2)}''' + + asyncio.create_task(crawlee_page.page.evaluate(add_h2_js)) + return crawlee_page @@ -504,3 +510,47 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: mocked_static_handler.assert_called_once_with() # Browser handler was capable of running despite static handler having sleep time larger than top handler timeout. mocked_browser_handler.assert_called_once_with() + + + +async def test_adaptive_context_helpers(test_urls): + """Test that context helpers work regardless of the crawl type. + + Handler tries to locate two elements h1 and h2. + h1 exists immediately, h2 is created dynamically after small timeout + Create situation where page is crawled with static sub crawler first. + Static sub crawler should be able to locate only h1. It wil try to wait for h2, trying to wait for h2 will trigger + `AdaptiveContextError` which will force the adaptive crawler to try playwright sub crawler instead. Playwright sub + crawler is able to wait for the h2 element.""" + # Get page with injected JS code that will add some element after timeout + static_only_predictor_no_detection = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])) + request_handler_timeout = timedelta(seconds=5) + + crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( + max_request_retries=1, + rendering_type_predictor=static_only_predictor_no_detection, + result_checker=lambda result: False, # noqa: ARG005 # Intentionally unused argument. + request_handler_timeout=request_handler_timeout, + playwright_crawler_specific_kwargs={'browser_pool': _StaticRedirectBrowserPool.with_default_plugin()}, + ) + + mocked_h1_handler = Mock() + mocked_h2_handler = Mock() + + @crawler.router.default_handler + async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: + h1 = await context.query_selector("h1",2) + mocked_h1_handler(h1) + h2 = await context.query_selector("h2",2) + mocked_h2_handler(h2) + + await crawler.run(test_urls[:1]) + + expected_h1_tag = Tag(name = "h1") + expected_h1_tag.append("Bla") + + expected_h2_tag = Tag(name = "h2") + expected_h2_tag.append("Blo") + + mocked_h1_handler.assert_has_calls([call(expected_h1_tag), call(expected_h1_tag)]) + mocked_h2_handler.assert_has_calls([call(expected_h2_tag)]) diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawling_context.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawling_context.py new file mode 100644 index 0000000000..e69de29bb2 From 56ad33a5a5dd4e395bc7411ce5072ced1a28fa77 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 27 Jan 2025 13:54:04 +0100 Subject: [PATCH 57/79] Add optional argument to pre navigation hook decorator --- .../code/adaptive_playwright_crawler.py | 17 +++++--- .../_adaptive_playwright_crawler.py | 39 +++++++++++++++---- .../_adaptive_playwright_crawling_context.py | 6 ++- .../test_adaptive_playwright_crawler.py | 36 ++++++++++++++++- 4 files changed, 82 insertions(+), 16 deletions(-) diff --git a/docs/examples/code/adaptive_playwright_crawler.py b/docs/examples/code/adaptive_playwright_crawler.py index 02692c3ce9..c7398a76cf 100644 --- a/docs/examples/code/adaptive_playwright_crawler.py +++ b/docs/examples/code/adaptive_playwright_crawler.py @@ -4,7 +4,6 @@ from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler import 
AdaptivePlaywrightCrawler from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import ( - AdaptiveContextError, AdaptivePlaywrightCrawlingContext, AdaptivePlaywrightPreNavCrawlingContext, ) @@ -35,14 +34,20 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: @crawler.pre_navigation_hook async def hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: + """Hook executed both in static sub crawler and playwright sub crawler.""" + # Trying to access context.page in this hook would raise `AdaptiveContextError` for pages crawled + # without playwright. + context.log.info(f'pre navigation hook for: {context.request.url} ...') + + @crawler.pre_navigation_hook(playwright_only=True) + async def hook_playwright(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: + """Hook executed only in playwright sub crawler.""" + async def some_routing_function(route: Route) -> None: await route.continue_() - try: - await context.page.route('*/**', some_routing_function) - context.log.info(f'Playwright pre navigation hook for: {context.request.url} ...') - except AdaptiveContextError: - context.log.info(f'Static pre navigation hook for: {context.request.url} ...') + await context.page.route('*/**', some_routing_function) + context.log.info(f'Playwright only pre navigation hook for: {context.request.url} ...') # Run the crawler with the initial list of URLs. await crawler.run(['https://warehouse-theme-metal.myshopify.com/']) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 66542a978b..168a3afc0e 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -23,6 +23,7 @@ ParsedHttpCrawlingContext, PlaywrightCrawler, PlaywrightCrawlingContext, + PlaywrightPreNavCrawlingContext, ) from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler_statistics import ( AdaptivePlaywrightCrawlerStatisticState, @@ -161,14 +162,22 @@ def __init__( **basic_crawler_kwargs_for_pw_crawler, ) + # Register pre navigation hooks on sub crawlers self._pre_navigation_hooks = list[Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]]]() + self._pre_navigation_hooks_pw_only = list[ + Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]] + ]() - async def adaptive_pre_navigation_hook(context: BasicCrawlingContext) -> None: + async def adaptive_pre_navigation_hook_static(context: BasicCrawlingContext) -> None: for hook in self._pre_navigation_hooks: await hook(AdaptivePlaywrightPreNavCrawlingContext.from_pre_navigation_context(context)) - playwright_crawler.pre_navigation_hook(adaptive_pre_navigation_hook) - static_crawler.pre_navigation_hook(adaptive_pre_navigation_hook) + async def adaptive_pre_navigation_hook_pw(context: PlaywrightPreNavCrawlingContext) -> None: + for hook in self._pre_navigation_hooks + self._pre_navigation_hooks_pw_only: + await hook(AdaptivePlaywrightPreNavCrawlingContext.from_pre_navigation_context(context)) + + static_crawler.pre_navigation_hook(adaptive_pre_navigation_hook_static) + playwright_crawler.pre_navigation_hook(adaptive_pre_navigation_hook_pw) self._additional_context_managers = [*self._additional_context_managers, playwright_crawler._browser_pool] # noqa: SLF001 # Intentional access to private member. 
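(Illustrative aside, not part of the patch: the hunk above keeps two hook lists and registers a wrapper hook on each sub crawler, and the next hunk turns `pre_navigation_hook` into an optionally parametrized decorator. A minimal, self-contained sketch of that decorator pattern follows; the `_ExampleCrawler` class and its names are made up for illustration and are not the crawlee API.)

    from __future__ import annotations

    from collections.abc import Awaitable, Callable

    Hook = Callable[[object], Awaitable[None]]

    class _ExampleCrawler:
        def __init__(self) -> None:
            self._common_hooks: list[Hook] = []
            self._pw_only_hooks: list[Hook] = []

        def pre_navigation_hook(
            self, hook: Hook | None = None, *, playwright_only: bool = False
        ) -> Callable[[Hook], None]:
            def register(hook: Hook) -> None:
                # Playwright-only hooks go to a separate list so the static sub crawler never runs them.
                target = self._pw_only_hooks if playwright_only else self._common_hooks
                target.append(hook)

            if hook is not None:
                # Bare usage: @crawler.pre_navigation_hook
                register(hook)
            # Parametrized usage: @crawler.pre_navigation_hook(playwright_only=True)
            return register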
@@ -388,14 +397,28 @@ async def _push_result_to_context(self, result: RequestHandlerRunResult, context def pre_navigation_hook( self, - hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]], - ) -> None: + hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]] | None = None, + *, + playwright_only: bool = False, + ) -> Callable[[Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]]], None]: """Pre navigation hooks for adaptive crawler are delegated to sub crawlers. - Hooks are wrapped in context that handles possibly missing `page` object by throwing `AdaptiveContextError`. - Hooks that try to access `context.page` will have to catch this exception if triggered by static pipeline. + Optionally parametrized decorator. + Hooks are wrapped in context that handles possibly missing `page` object by raising `AdaptiveContextError`. """ - self._pre_navigation_hooks.append(hook) + + def register_hooks(hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]]) -> None: + if playwright_only: + self._pre_navigation_hooks_pw_only.append(hook) + else: + self._pre_navigation_hooks.append(hook) + + # No parameter in decorator. Execute directly. + if hook: + register_hooks(hook) + + # Return parametrized decorator that will be executed through decorator syntax if called with parameter. + return register_hooks def track_http_only_request_handler_runs(self) -> None: self.statistics.state.http_only_request_handler_runs += 1 diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py index cb36569dab..b34a2afecc 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py @@ -112,7 +112,11 @@ def page(self) -> Page: """ if self._page is not None: return self._page - raise AdaptiveContextError('Page is not crawled with PlaywrightCrawler.') + raise AdaptiveContextError( + 'Page was crawled with static sub crawler and not with crawled with PlaywrightCrawler. For Playwright only ' + 'hooks please use `playwright_only`=True when registering the hook. ' + 'For example: @crawler.pre_navigation_hook(playwright_only=True)' + ) @classmethod def from_pre_navigation_context(cls, context: BasicCrawlingContext) -> Self: diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index 02131e0768..875affc3d2 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -6,7 +6,7 @@ from datetime import timedelta from itertools import cycle from typing import TYPE_CHECKING, cast -from unittest.mock import Mock, patch +from unittest.mock import Mock, call, patch import httpx import pytest @@ -263,6 +263,40 @@ async def pre_nav_hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None assert user_data_in_handler == ['pw', 'bs'] +async def test_playwright_only_hook(test_urls: list[str]) -> None: + """Test that hook can be registered for playwright only sub crawler. + + Create a situation where one page is crawled by both sub crawlers. 
One common pre navigation hook is registered and + one playwright only pre navigation hook is registered.""" + static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor() + + crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( + rendering_type_predictor=static_only_predictor_enforce_detection, + playwright_crawler_specific_kwargs={'browser_pool': _StaticRedirectBrowserPool.with_default_plugin()}, + ) + pre_nav_hook_common = Mock() + pre_nav_hook_playwright = Mock() + + @crawler.router.default_handler + async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: + pass + + @crawler.pre_navigation_hook + async def pre_nav_hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: + pre_nav_hook_common(context.request.url) + + @crawler.pre_navigation_hook(playwright_only=True) + async def pre_nav_hook_pw_only(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: + pre_nav_hook_playwright(context.page.url) + + await crawler.run(test_urls[:1]) + + # Default behavior. Hook is called everytime, both static sub crawler and playwright sub crawler. + pre_nav_hook_common.assert_has_calls([call(test_urls[0]), call(test_urls[0])]) + # Hook is called only by playwright sub crawler. + pre_nav_hook_playwright.assert_called_once_with('about:blank') + + async def test_adaptive_crawling_result(test_urls: list[str]) -> None: """Tests that result only from one sub crawler is saved. From 781d5ff3631200230c944785ba0035d4edb6d343 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 27 Jan 2025 14:24:18 +0100 Subject: [PATCH 58/79] Remove _push_result_to_context and add result argument/return to _run_request_handler. --- .../_adaptive_playwright_crawler.py | 28 ++++++------------- src/crawlee/crawlers/_basic/_basic_crawler.py | 15 ++++++---- 2 files changed, 18 insertions(+), 25 deletions(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 168a3afc0e..e26816e3c7 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -1,6 +1,5 @@ from __future__ import annotations -import asyncio import logging from collections.abc import Awaitable, Callable, Coroutine from copy import deepcopy @@ -320,7 +319,9 @@ async def from_pw_pipeline_to_top_router(context: PlaywrightCrawlingContext) -> ) @override - async def _run_request_handler(self, context: BasicCrawlingContext) -> None: + async def _run_request_handler( + self, context: BasicCrawlingContext, result: RequestHandlerRunResult + ) -> RequestHandlerRunResult: """Override BasicCrawler method that delegates request processing to sub crawlers. To decide which sub crawler should process the request it runs `rendering_type_predictor`. 
@@ -342,8 +343,7 @@ async def _run_request_handler(self, context: BasicCrawlingContext) -> None: static_run = await self._crawl_one(rendering_type='static', context=context) if static_run.result and self.result_checker(static_run.result): - await self._push_result_to_context(result=static_run.result, context=context) - return + return static_run.result if static_run.exception: context.log.exception( msg=f'Static crawler: failed for {context.request.url}', exc_info=static_run.exception @@ -367,12 +367,7 @@ async def _run_request_handler(self, context: BasicCrawlingContext) -> None: pw_run = await self._crawl_one('client only', context=context) self.track_browser_request_handler_runs() - if pw_run.exception is not None: - raise pw_run.exception - if pw_run.result: - await self._push_result_to_context(result=pw_run.result, context=context) - if should_detect_rendering_type: detection_result: RenderingType static_run = await self._crawl_one('static', context=context, state=old_state_copy) @@ -384,16 +379,11 @@ async def _run_request_handler(self, context: BasicCrawlingContext) -> None: context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}') self.rendering_type_predictor.store_result(context.request, detection_result) - - async def _push_result_to_context(self, result: RequestHandlerRunResult, context: BasicCrawlingContext) -> None: - """Execute calls from `result` on the context.""" - result_tasks = ( - [asyncio.create_task(context.push_data(**kwargs)) for kwargs in result.push_data_calls] - + [asyncio.create_task(context.add_requests(**kwargs)) for kwargs in result.add_requests_calls] - + [asyncio.create_task(self._commit_key_value_store_changes(result, context.get_key_value_store))] - ) - - await asyncio.gather(*result_tasks) + return pw_run.result + if pw_run.exception is not None: + raise pw_run.exception + # Unreachable code, but mypy can't know it. 
+ raise RuntimeError('Missing both result and exception.') def pre_navigation_hook( self, diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 4f6889de7e..4528dd9121 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -1018,16 +1018,16 @@ async def __run_task_function(self) -> None: session = await self._get_session() proxy_info = await self._get_proxy_info(request, session) - result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store) + empty_result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store) context = BasicCrawlingContext( request=request, session=session, proxy_info=proxy_info, send_request=self._prepare_send_request_function(session, proxy_info), - add_requests=result.add_requests, - push_data=result.push_data, - get_key_value_store=result.get_key_value_store, + add_requests=empty_result.add_requests, + push_data=empty_result.push_data, + get_key_value_store=empty_result.get_key_value_store, use_state=self._use_state, log=self._logger, ) @@ -1039,7 +1039,7 @@ async def __run_task_function(self) -> None: request.state = RequestState.REQUEST_HANDLER try: - await self._run_request_handler(context=context) + result = await self._run_request_handler(context=context, result=empty_result) except asyncio.TimeoutError as e: raise RequestHandlerError(e, context) from e @@ -1132,7 +1132,9 @@ async def __run_task_function(self) -> None: ) raise - async def _run_request_handler(self, context: BasicCrawlingContext) -> None: + async def _run_request_handler( + self, context: BasicCrawlingContext, result: RequestHandlerRunResult + ) -> RequestHandlerRunResult: await wait_for( lambda: self._context_pipeline(context, self.router), timeout=self._request_handler_timeout, @@ -1140,6 +1142,7 @@ async def _run_request_handler(self, context: BasicCrawlingContext) -> None: f'{self._request_handler_timeout.total_seconds()} seconds', logger=self._logger, ) + return result def _is_session_blocked_status_code(self, session: Session | None, status_code: int) -> bool: """Check if the HTTP status code indicates that the session was blocked by the target website. From 23f149470224ce0f62fe5f8904439322ebf4412f Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 27 Jan 2025 15:36:45 +0100 Subject: [PATCH 59/79] Context helper for bs + test. TODO: implement for parsel. --- .../_abstract_http/_abstract_http_parser.py | 7 +-- .../_adaptive_playwright_crawler.py | 5 +- .../_adaptive_playwright_crawling_context.py | 36 ++++++------- .../_beautifulsoup/_beautifulsoup_crawler.py | 2 +- .../_beautifulsoup/_beautifulsoup_parser.py | 2 +- src/crawlee/crawlers/_http/_http_parser.py | 6 +-- .../crawlers/_parsel/_parsel_parser.py | 16 +++--- .../test_adaptive_playwright_crawler.py | 54 +++++++++---------- 8 files changed, 64 insertions(+), 64 deletions(-) diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py index a195070b3e..a4f7c68e47 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py @@ -31,13 +31,10 @@ async def parse(self, response: HttpResponse) -> TParseResult: """ @abstractmethod - async def parse_text(self, text: str) -> TParseResult: - ... + async def parse_text(self, text: str) -> TParseResult: ... 
@abstractmethod - async def select(self,parsed_content: TParseResult, selector: str) -> TParseResult|None: - ... - + async def select(self, parsed_content: TParseResult, selector: str) -> TParseResult | None: ... def is_blocked(self, parsed_content: TParseResult) -> BlockedInfo: """Detect if blocked and return BlockedInfo with additional information. diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index e991c3807f..fee9e133b4 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -9,7 +9,7 @@ from random import random from typing import TYPE_CHECKING, Any, Generic, get_args -from bs4 import BeautifulSoup, Tag +from bs4 import Tag from parsel import Selector from typing_extensions import Self, TypeVar, override @@ -292,7 +292,8 @@ async def from_static_pipeline_to_top_router( context: ParsedHttpCrawlingContext[TStaticParseResult], ) -> None: adaptive_crawling_context = AdaptivePlaywrightCrawlingContext.from_parsed_http_crawling_context( - context=context, parser=self._static_parser) + context=context, parser=self._static_parser + ) await self.router(adaptive_crawling_context) return self._static_context_pipeline(context_linked_to_result, from_static_pipeline_to_top_router) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py index d50329b2ce..df9ed2520d 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py @@ -1,8 +1,10 @@ from __future__ import annotations from dataclasses import dataclass, fields +from datetime import timedelta from typing import TYPE_CHECKING, Generic +from bs4 import BeautifulSoup from typing_extensions import TypeVar from crawlee import HttpHeaders @@ -36,7 +38,6 @@ class AdaptivePlaywrightCrawlingContext(Generic[TStaticParseResult], ParsedHttpC _infinite_scroll: Callable[[], Awaitable[None]] | None = None _page: Page | None = None - @property def page(self) -> Page: """The Playwright `Page` object for the current page. @@ -69,29 +70,26 @@ def response(self) -> Response: return self._response async def wait_for_selector(self, selector: str, timeout: int = 5) -> None: - if self._static_parser.select(self.parsed_content,selector): + if self._static_parser.select(self.parsed_content, selector): return - else: - await self.page.locator(selector).wait_for(timeout=timeout*1000) - # Should we parse the whole page again? - + await self.page.locator(selector).wait_for(timeout=timeout * 1000) + # Should we parse the whole page again? - async def query_selector(self, selector: str, timeout: int= 5) -> TStaticParseResult: - static_content:TStaticParseResult|None = await self._static_parser.select(self.parsed_content, selector) + async def query_selector(self, selector: str, timeout: timedelta = timedelta(seconds=5)) -> TStaticParseResult: + static_content: TStaticParseResult | None = await self._static_parser.select(self.parsed_content, selector) if static_content is not None: return static_content - else: - locator = self.page.locator(selector) - await locator.wait_for(timeout=timeout*1000) - # Should we parse the whole page again? 
- static_content = await self._static_parser.select(locator.inner_html(), selector) - if static_content is not None: - return static_content - else: - # Selector worked in Playwright, but not in static parser - raise AdaptiveContextError("Used selector is not a valid static selector") - + locator = self.page.locator(selector) + await locator.wait_for(timeout=timeout.total_seconds() * 1000) + # Should we parse the whole page again? But context is not mutable... + parsed_selector: TStaticParseResult | None = await self._static_parser.select( + BeautifulSoup(await locator.evaluate('el => el.outerHTML')), selector + ) + if parsed_selector is not None: + return parsed_selector + # Selector worked in Playwright, but not in static parser + raise AdaptiveContextError('Used selector is not a valid static selector') @classmethod def from_parsed_http_crawling_context( diff --git a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py index c1f778b237..d86083410d 100644 --- a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +++ b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING -from bs4 import BeautifulSoup, Tag +from bs4 import Tag from crawlee._utils.docs import docs_group from crawlee.crawlers._abstract_http import AbstractHttpCrawler, HttpCrawlerOptions diff --git a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py index fab7bc2970..86533fbac5 100644 --- a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py +++ b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py @@ -32,7 +32,7 @@ def is_matching_selector(self, parsed_content: Tag, selector: str) -> bool: return parsed_content.select_one(selector) is not None @override - async def select(self,parsed_content: Tag, selector: str) -> Tag|None: + async def select(self, parsed_content: Tag, selector: str) -> Tag | None: return parsed_content.select_one(selector) @override diff --git a/src/crawlee/crawlers/_http/_http_parser.py b/src/crawlee/crawlers/_http/_http_parser.py index 526479b88a..868f59d839 100644 --- a/src/crawlee/crawlers/_http/_http_parser.py +++ b/src/crawlee/crawlers/_http/_http_parser.py @@ -25,11 +25,11 @@ async def parse(self, response: HttpResponse) -> bytes: @override async def parse_text(self, text: str) -> bytes: - raise NotImplementedError() + raise NotImplementedError @override - async def select(self,parsed_content: bytes, selector: str) -> bytes|None: - raise NotImplementedError() + async def select(self, parsed_content: bytes, selector: str) -> bytes | None: + raise NotImplementedError @override def is_blocked(self, parsed_content: bytes) -> BlockedInfo: # Intentional unused argument. 
diff --git a/src/crawlee/crawlers/_parsel/_parsel_parser.py b/src/crawlee/crawlers/_parsel/_parsel_parser.py index abf1607e4d..abd6ccd96f 100644 --- a/src/crawlee/crawlers/_parsel/_parsel_parser.py +++ b/src/crawlee/crawlers/_parsel/_parsel_parser.py @@ -1,12 +1,17 @@ from __future__ import annotations + import asyncio -from collections.abc import Iterable +from typing import TYPE_CHECKING from parsel import Selector from typing_extensions import override from crawlee.crawlers._abstract_http import AbstractHttpParser -from crawlee.http_clients import HttpResponse + +if TYPE_CHECKING: + from collections.abc import Iterable + + from crawlee.http_clients import HttpResponse class ParselParser(AbstractHttpParser[Selector]): @@ -18,12 +23,11 @@ async def parse(self, response: HttpResponse) -> Selector: @override async def parse_text(self, text: str) -> Selector: - raise NotImplementedError() + raise NotImplementedError @override - async def select(self,parsed_content: Selector, selector: str) -> Selector|None: - raise NotImplementedError() - + async def select(self, parsed_content: Selector, selector: str) -> Selector | None: + raise NotImplementedError @override def is_matching_selector(self, parsed_content: Selector, selector: str) -> bool: diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index 296a83b7df..230043c5cf 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -6,11 +6,11 @@ from datetime import timedelta from itertools import cycle from typing import TYPE_CHECKING, cast -from unittest.mock import Mock, patch, call +from unittest.mock import Mock, call, patch import httpx import pytest -from bs4 import Tag, BeautifulSoup +from bs4 import Tag from typing_extensions import override from crawlee import Request @@ -40,6 +40,17 @@ from crawlee.browsers._types import CrawleePage from crawlee.proxy_configuration import ProxyInfo +_H1_TEXT = 'Static' +_H2_TEXT = 'Only in browser' +_PAGE_CONTENT = f""" +
<h1>{_H1_TEXT}</h1>
+ +""" + @pytest.fixture def test_urls(respx_mock: respx.MockRouter) -> list[str]: @@ -50,9 +61,7 @@ def test_urls(respx_mock: respx.MockRouter) -> list[str]: ] for url in urls: - respx_mock.get(url).return_value = httpx.Response( - status_code=200, content=b'
<h1>Bla</h1>
' - ) + respx_mock.get(url).return_value = httpx.Response(status_code=200, content=_PAGE_CONTENT.encode()) return urls @@ -69,15 +78,8 @@ async def new_page( crawlee_page = await super().new_page(page_id=page_id, browser_plugin=browser_plugin, proxy_info=proxy_info) await crawlee_page.page.route( '**/*', - lambda route: route.fulfill( - status=200, content_type='text/plain', body='
<h1>Bla</h1>
' - ), + lambda route: route.fulfill(status=200, content_type='text/html', body=_PAGE_CONTENT), ) - # - add_h2_js = '''() => {let h2 = document.createElement('h2'); h2.innerText = "Blo"; document.getElementsByTagName("body")[0].append(h2)}''' - - asyncio.create_task(crawlee_page.page.evaluate(add_h2_js)) - return crawlee_page @@ -485,7 +487,6 @@ async def test_adaptive_playwright_crawler_timeout_in_sub_crawler(test_urls: lis crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( max_request_retries=1, rendering_type_predictor=static_only_predictor_no_detection, - result_checker=lambda result: False, # noqa: ARG005 # Intentionally unused argument. request_handler_timeout=request_handler_timeout, playwright_crawler_specific_kwargs={'browser_pool': _StaticRedirectBrowserPool.with_default_plugin()}, ) @@ -512,25 +513,22 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: mocked_browser_handler.assert_called_once_with() - -async def test_adaptive_context_helpers(test_urls): +async def test_adaptive_context_helpers(test_urls: list[str]) -> None: """Test that context helpers work regardless of the crawl type. Handler tries to locate two elements h1 and h2. - h1 exists immediately, h2 is created dynamically after small timeout + h1 exists immediately, h2 is created dynamically by inline JS snippet embedded in the html. Create situation where page is crawled with static sub crawler first. Static sub crawler should be able to locate only h1. It wil try to wait for h2, trying to wait for h2 will trigger `AdaptiveContextError` which will force the adaptive crawler to try playwright sub crawler instead. Playwright sub - crawler is able to wait for the h2 element.""" + crawler is able to wait for the h2 element.""" + # Get page with injected JS code that will add some element after timeout static_only_predictor_no_detection = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])) - request_handler_timeout = timedelta(seconds=5) crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( max_request_retries=1, rendering_type_predictor=static_only_predictor_no_detection, - result_checker=lambda result: False, # noqa: ARG005 # Intentionally unused argument. 
- request_handler_timeout=request_handler_timeout, playwright_crawler_specific_kwargs={'browser_pool': _StaticRedirectBrowserPool.with_default_plugin()}, ) @@ -539,18 +537,20 @@ async def test_adaptive_context_helpers(test_urls): @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: - h1 = await context.query_selector("h1",2) + h1 = await context.query_selector('h1', timedelta(milliseconds=1000)) mocked_h1_handler(h1) - h2 = await context.query_selector("h2",2) + h2 = await context.query_selector('h2', timedelta(milliseconds=1000)) mocked_h2_handler(h2) await crawler.run(test_urls[:1]) - expected_h1_tag = Tag(name = "h1") - expected_h1_tag.append("Bla") + expected_h1_tag = Tag(name='h1') + expected_h1_tag.append(_H1_TEXT) - expected_h2_tag = Tag(name = "h2") - expected_h2_tag.append("Blo") + expected_h2_tag = Tag(name='h2') + expected_h2_tag.append(_H2_TEXT) + # Called by both sub crawlers mocked_h1_handler.assert_has_calls([call(expected_h1_tag), call(expected_h1_tag)]) + # Called only by pw sub crawler mocked_h2_handler.assert_has_calls([call(expected_h2_tag)]) From 2ae55acf976d11c7a50592313441d93e3b3d2c61 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 27 Jan 2025 16:19:50 +0100 Subject: [PATCH 60/79] parsel wip --- src/crawlee/crawlers/_parsel/_parsel_parser.py | 6 ++++-- .../test_adaptive_playwright_crawler.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/crawlee/crawlers/_parsel/_parsel_parser.py b/src/crawlee/crawlers/_parsel/_parsel_parser.py index abd6ccd96f..1ddc205b22 100644 --- a/src/crawlee/crawlers/_parsel/_parsel_parser.py +++ b/src/crawlee/crawlers/_parsel/_parsel_parser.py @@ -23,11 +23,13 @@ async def parse(self, response: HttpResponse) -> Selector: @override async def parse_text(self, text: str) -> Selector: - raise NotImplementedError + return Selector(text=text) @override async def select(self, parsed_content: Selector, selector: str) -> Selector | None: - raise NotImplementedError + # This is super weird + if found_content:=parsed_content.css(selector).get(): + return Selector(found_content) @override def is_matching_selector(self, parsed_content: Selector, selector: str) -> bool: diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index 230043c5cf..027e30d12d 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -526,7 +526,7 @@ async def test_adaptive_context_helpers(test_urls: list[str]) -> None: # Get page with injected JS code that will add some element after timeout static_only_predictor_no_detection = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])) - crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( + crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser( max_request_retries=1, rendering_type_predictor=static_only_predictor_no_detection, playwright_crawler_specific_kwargs={'browser_pool': _StaticRedirectBrowserPool.with_default_plugin()}, From b4ba31bc9486feeee2249a24e212ede45daacd3a Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 28 Jan 2025 10:11:16 +0100 Subject: [PATCH 61/79] Add `block_request` to adaptive pre nav context --- .../_adaptive_playwright_crawling_context.py | 14 ++++++++++++++ .../test_adaptive_playwright_crawler.py | 2 +- 2 files changed, 15 
insertions(+), 1 deletion(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py index b34a2afecc..f20bf3815c 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py @@ -20,6 +20,8 @@ from playwright.async_api import Page, Response from typing_extensions import Self + from crawlee.crawlers._playwright._types import BlockRequestsFunction + class AdaptiveContextError(RuntimeError): pass @@ -88,6 +90,8 @@ async def from_playwright_crawling_context( http_response = await _PlaywrightHttpResponse.from_playwright_response( response=context.response, protocol=protocol_guess or '' ) + # block_requests is useful only on pre-navigation contexts. It is useless here. + context_kwargs.pop('block_requests') return cls( parsed_content=await parser.parse(http_response), http_response=http_response, @@ -103,6 +107,7 @@ class AdaptivePlaywrightPreNavCrawlingContext(BasicCrawlingContext): """ _page: Page | None = None + block_requests: BlockRequestsFunction | None = None @property def page(self) -> Page: @@ -123,6 +128,15 @@ def from_pre_navigation_context(cls, context: BasicCrawlingContext) -> Self: """Convenience constructor that creates new context from existing pre navigation contexts.""" context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)} context_kwargs['_page'] = context_kwargs.pop('page', None) + + # For static sub crawler replace block requests by function doing nothing. + async def dummy_block_requests( + url_patterns: list[str] | None = None, # noqa:ARG001 + extra_url_patterns: list[str] | None = None, # noqa:ARG001 + ) -> None: + return + + context_kwargs['block_requests'] = context_kwargs.pop('block_requests', dummy_block_requests) return cls(**context_kwargs) diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index 875affc3d2..57d7eee38a 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -69,7 +69,7 @@ async def new_page( await crawlee_page.page.route( '**/*', lambda route: route.fulfill( - status=200, content_type='text/plain', body='What a body!' + status=200, content_type='text/html', body='What a body!' 
), ) return crawlee_page From 0b7bfca2f1c105cc780c2fa36fd346eb12b0bf53 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 28 Jan 2025 12:02:36 +0100 Subject: [PATCH 62/79] Draft of context helpers both for Parsel and BeautifulSoup --- .../_adaptive_playwright_crawling_context.py | 19 ++++--- .../crawlers/_parsel/_parsel_parser.py | 6 +-- .../test_adaptive_playwright_crawler.py | 50 +++++++++++++++++-- 3 files changed, 60 insertions(+), 15 deletions(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py index df9ed2520d..9dd5703a6f 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py @@ -4,7 +4,6 @@ from datetime import timedelta from typing import TYPE_CHECKING, Generic -from bs4 import BeautifulSoup from typing_extensions import TypeVar from crawlee import HttpHeaders @@ -33,7 +32,7 @@ class AdaptiveContextError(RuntimeError): @dataclass(frozen=True) @docs_group('Data structures') class AdaptivePlaywrightCrawlingContext(Generic[TStaticParseResult], ParsedHttpCrawlingContext[TStaticParseResult]): - _static_parser: AbstractHttpParser + _static_parser: AbstractHttpParser[TStaticParseResult] _response: Response | None = None _infinite_scroll: Callable[[], Awaitable[None]] | None = None _page: Page | None = None @@ -73,24 +72,30 @@ async def wait_for_selector(self, selector: str, timeout: int = 5) -> None: if self._static_parser.select(self.parsed_content, selector): return await self.page.locator(selector).wait_for(timeout=timeout * 1000) - # Should we parse the whole page again? + # Should we parse the whole page again? But context is not mutable... + # without reparsing, what is the use case for wait_for_selector? Seems useless and + # all use cases are satisfied with query_selector async def query_selector(self, selector: str, timeout: timedelta = timedelta(seconds=5)) -> TStaticParseResult: - static_content: TStaticParseResult | None = await self._static_parser.select(self.parsed_content, selector) + static_content = await self._static_parser.select(self.parsed_content, selector) if static_content is not None: return static_content + locator = self.page.locator(selector) await locator.wait_for(timeout=timeout.total_seconds() * 1000) # Should we parse the whole page again? But context is not mutable... 
- parsed_selector: TStaticParseResult | None = await self._static_parser.select( - BeautifulSoup(await locator.evaluate('el => el.outerHTML')), selector + parsed_selector = await self._static_parser.select( + await self.parse_with_static_parser(await locator.evaluate('el => el.outerHTML')), selector ) if parsed_selector is not None: return parsed_selector - # Selector worked in Playwright, but not in static parser + # Selector worked in Playwright, but not in static parser and raise AdaptiveContextError('Used selector is not a valid static selector') + async def parse_with_static_parser(self, text: str) -> TStaticParseResult: + return await self._static_parser.parse_text(text) + @classmethod def from_parsed_http_crawling_context( cls, context: ParsedHttpCrawlingContext[TStaticParseResult], parser: AbstractHttpParser[TStaticParseResult] diff --git a/src/crawlee/crawlers/_parsel/_parsel_parser.py b/src/crawlee/crawlers/_parsel/_parsel_parser.py index 1ddc205b22..0bc98d9be7 100644 --- a/src/crawlee/crawlers/_parsel/_parsel_parser.py +++ b/src/crawlee/crawlers/_parsel/_parsel_parser.py @@ -27,9 +27,9 @@ async def parse_text(self, text: str) -> Selector: @override async def select(self, parsed_content: Selector, selector: str) -> Selector | None: - # This is super weird - if found_content:=parsed_content.css(selector).get(): - return Selector(found_content) + if selector_list := parsed_content.css(selector): + return selector_list[0] + return None @override def is_matching_selector(self, parsed_content: Selector, selector: str) -> bool: diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index 027e30d12d..7e21c3849b 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -11,6 +11,7 @@ import httpx import pytest from bs4 import Tag +from parsel import Selector from typing_extensions import override from crawlee import Request @@ -42,7 +43,7 @@ _H1_TEXT = 'Static' _H2_TEXT = 'Only in browser' -_PAGE_CONTENT = f""" +_PAGE_CONTENT_STATIC = f"""
<h1>{_H1_TEXT}</h1>
""" @@ -514,8 +519,8 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: mocked_browser_handler.assert_called_once_with() -async def test_adaptive_context_helpers_beautiful_soup(test_urls: list[str]) -> None: - """Test that context helpers work regardless of the crawl type. +async def test_adaptive_context_query_selector_beautiful_soup(test_urls: list[str]) -> None: + """Test that `context.query_selector` works regardless of the crawl type for BeautifulSoup variant. Handler tries to locate two elements h1 and h2. h1 exists immediately, h2 is created dynamically by inline JS snippet embedded in the html. @@ -538,9 +543,9 @@ async def test_adaptive_context_helpers_beautiful_soup(test_urls: list[str]) -> @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: - h1 = await context.query_selector('h1', timedelta(milliseconds=1000)) + h1 = await context.query_selector('h1', timedelta(milliseconds=_INJECTED_JS_DELAY_MS * 2)) mocked_h1_handler(h1) - h2 = await context.query_selector('h2', timedelta(milliseconds=1000)) + h2 = await context.query_selector('h2', timedelta(milliseconds=_INJECTED_JS_DELAY_MS * 2)) mocked_h2_handler(h2) await crawler.run(test_urls[:1]) @@ -557,8 +562,8 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: mocked_h2_handler.assert_has_calls([call(expected_h2_tag)]) -async def test_adaptive_context_helpers_parsel(test_urls: list[str]) -> None: - """Test that context helpers work regardless of the crawl type. +async def test_adaptive_context_query_selector_parsel(test_urls: list[str]) -> None: + """Test that `context.query_selector` works regardless of the crawl type for Parsel variant. Handler tries to locate two elements h1 and h2. h1 exists immediately, h2 is created dynamically by inline JS snippet embedded in the html. @@ -583,9 +588,9 @@ async def test_adaptive_context_helpers_parsel(test_urls: list[str]) -> None: @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: - h1 = await context.query_selector('h1', timedelta(milliseconds=1000)) + h1 = await context.query_selector('h1', timedelta(milliseconds=_INJECTED_JS_DELAY_MS * 2)) mocked_h1_handler(type(h1), h1.get()) - h2 = await context.query_selector('h2', timedelta(milliseconds=1000)) + h2 = await context.query_selector('h2', timedelta(milliseconds=_INJECTED_JS_DELAY_MS * 2)) mocked_h2_handler(type(h2), h2.get()) await crawler.run(test_urls[:1]) @@ -594,3 +599,42 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: mocked_h1_handler.assert_has_calls([call(Selector, expected_h1_tag), call(Selector, expected_h1_tag)]) # Called only by pw sub crawler mocked_h2_handler.assert_has_calls([call(Selector, expected_h2_tag)]) + + +async def test_adaptive_context_parse_with_static_parser_parsel(test_urls: list[str]) -> None: + """Test `context.parse_with_static_parser` works regardless of the crawl type for Parsel variant. + + (Test covers also `context.wait_for_selector`, which is called by `context.parse_with_static_parser`) + Create situation where + """ + static_only_predictor_no_detection = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])) + expected_h2_tag = f'
<h2>{_H2_TEXT}</h2>
' + + crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser( + max_request_retries=1, + rendering_type_predictor=static_only_predictor_no_detection, + playwright_crawler_specific_kwargs={'browser_pool': _StaticRedirectBrowserPool.with_default_plugin()}, + ) + + mocked_h2_handler = Mock() + + @crawler.router.default_handler + async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: + h2_static = context.parsed_content.css('h2') # Should not find anything + mocked_h2_handler(h2_static) + + # Reparse whole page after h2 appears + parsed_content_after_h2_appeared = await context.parse_with_static_parser( + selector='h2', timeout=timedelta(milliseconds=_INJECTED_JS_DELAY_MS * 2) + ) + mocked_h2_handler(parsed_content_after_h2_appeared.css('h2')[0].get()) + + await crawler.run(test_urls[:1]) + + mocked_h2_handler.assert_has_calls( + [ + call([]), # Static sub crawler tried and did not find h2. + call([]), # Playwright sub crawler tried and did not find h2 without waiting. + call(expected_h2_tag), + ] + ) # Playwright waited for h2 to appear. From dc286be395057075dd866b4e12bc95bcef64b6f8 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 30 Jan 2025 10:35:49 +0100 Subject: [PATCH 67/79] Parser with two generics variant --- .../_abstract_http/_abstract_http_crawler.py | 16 +++++----- .../_abstract_http/_abstract_http_parser.py | 6 ++-- .../_abstract_http/_http_crawling_context.py | 1 + .../_adaptive_playwright_crawler.py | 15 +++++----- .../_adaptive_playwright_crawling_context.py | 29 ++++++++++--------- .../_beautifulsoup/_beautifulsoup_crawler.py | 6 ++-- .../_beautifulsoup_crawling_context.py | 9 +++--- .../_beautifulsoup/_beautifulsoup_parser.py | 6 ++-- src/crawlee/crawlers/_http/_http_crawler.py | 2 +- src/crawlee/crawlers/_http/_http_parser.py | 2 +- .../crawlers/_parsel/_parsel_crawler.py | 2 +- .../crawlers/_parsel/_parsel_parser.py | 2 +- 12 files changed, 51 insertions(+), 45 deletions(-) diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py index 5f821f29c2..8021cb29c9 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py @@ -16,7 +16,7 @@ from crawlee.http_clients import HttpxHttpClient from crawlee.statistics import StatisticsState -from ._http_crawling_context import HttpCrawlingContext, ParsedHttpCrawlingContext, TParseResult +from ._http_crawling_context import HttpCrawlingContext, ParsedHttpCrawlingContext, TParseResult, TSelectResult if TYPE_CHECKING: from collections.abc import AsyncGenerator, Awaitable, Iterable @@ -53,7 +53,7 @@ class HttpCrawlerOptions( @docs_group('Abstract classes') class AbstractHttpCrawler( - Generic[TCrawlingContext, TParseResult], BasicCrawler[TCrawlingContext, StatisticsState], ABC + Generic[TCrawlingContext, TParseResult, TSelectResult], BasicCrawler[TCrawlingContext, StatisticsState], ABC ): """A web crawler for performing HTTP requests. 
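(Illustrative aside, not part of the patch: the second generic parameter introduced in this commit lets a parser declare one type for a fully parsed document and another for a single selected element — `BeautifulSoup` vs `Tag` in the BeautifulSoup variant, `Selector` for both in the Parsel variant. A rough standalone sketch of that shape, using hypothetical `Example*` names rather than the real crawlee classes:)

    from __future__ import annotations

    from abc import ABC, abstractmethod
    from typing import Generic, TypeVar

    from bs4 import BeautifulSoup, Tag

    TParse = TypeVar('TParse')
    TSelect = TypeVar('TSelect')

    class ExampleParser(ABC, Generic[TParse, TSelect]):
        """Parses whole documents into TParse and selects single elements as TSelect."""

        @abstractmethod
        def parse_text(self, text: str) -> TParse: ...

        @abstractmethod
        def select(self, parsed: TParse, selector: str) -> TSelect | None: ...

    class ExampleSoupParser(ExampleParser[BeautifulSoup, Tag]):
        def parse_text(self, text: str) -> BeautifulSoup:
            return BeautifulSoup(text, features='lxml')

        def select(self, parsed: BeautifulSoup, selector: str) -> Tag | None:
            # select_one returns an element (Tag), not a whole BeautifulSoup document.
            return parsed.select_one(selector)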
@@ -72,7 +72,7 @@ class AbstractHttpCrawler( def __init__( self, *, - parser: AbstractHttpParser[TParseResult], + parser: AbstractHttpParser[TParseResult, TSelectResult], additional_http_error_status_codes: Iterable[int] = (), ignore_http_error_status_codes: Iterable[int] = (), **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]], @@ -99,8 +99,8 @@ def __init__( @staticmethod def create_parsed_http_crawler_class( - static_parser: AbstractHttpParser[TParseResult], - ) -> type[AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult]]: + static_parser: AbstractHttpParser[TParseResult, TSelectResult], + ) -> type[AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult, TSelectResult]]: """Convenience class factory that creates specific version of `AbstractHttpCrawler` class. In general typing sense two generic types of `AbstractHttpCrawler` do not have to be dependent on each other. @@ -108,10 +108,12 @@ def create_parsed_http_crawler_class( parameters in `AbstractHttpCrawler`. """ - class _ParsedHttpCrawler(AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult]): + class _ParsedHttpCrawler( + AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult, TSelectResult] + ): def __init__( self, - parser: AbstractHttpParser[TParseResult] = static_parser, + parser: AbstractHttpParser[TParseResult, TSelectResult] = static_parser, **kwargs: Unpack[HttpCrawlerOptions[ParsedHttpCrawlingContext[TParseResult]]], ) -> None: kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline() diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py index a4f7c68e47..60a90a78ed 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py @@ -7,7 +7,7 @@ from crawlee._utils.docs import docs_group from crawlee.crawlers._types import BlockedInfo -from ._http_crawling_context import TParseResult +from ._http_crawling_context import TParseResult, TSelectResult if TYPE_CHECKING: from collections.abc import Iterable @@ -16,7 +16,7 @@ @docs_group('Abstract classes') -class AbstractHttpParser(Generic[TParseResult], ABC): +class AbstractHttpParser(Generic[TParseResult, TSelectResult], ABC): """Parser used for parsing http response and inspecting parsed result to find links or detect blocking.""" @abstractmethod @@ -34,7 +34,7 @@ async def parse(self, response: HttpResponse) -> TParseResult: async def parse_text(self, text: str) -> TParseResult: ... @abstractmethod - async def select(self, parsed_content: TParseResult, selector: str) -> TParseResult | None: ... + async def select(self, parsed_content: TParseResult, selector: str) -> TSelectResult | None: ... def is_blocked(self, parsed_content: TParseResult) -> BlockedInfo: """Detect if blocked and return BlockedInfo with additional information. 
diff --git a/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py b/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py index 7475b85eb3..2278cb783d 100644 --- a/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py +++ b/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py @@ -10,6 +10,7 @@ from crawlee.http_clients import HttpCrawlingResult, HttpResponse TParseResult = TypeVar('TParseResult') +TSelectResult = TypeVar('TSelectResult') @dataclass(frozen=True) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index fee9e133b4..47f93e7a8e 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -9,7 +9,7 @@ from random import random from typing import TYPE_CHECKING, Any, Generic, get_args -from bs4 import Tag +from bs4 import BeautifulSoup, Tag from parsel import Selector from typing_extensions import Self, TypeVar, override @@ -58,6 +58,7 @@ TStaticParseResult = TypeVar('TStaticParseResult') +TStaticSelectResult = TypeVar('TStaticSelectResult') TStaticCrawlingContext = TypeVar('TStaticCrawlingContext', bound=ParsedHttpCrawlingContext) @@ -85,7 +86,7 @@ async def __aexit__( class AdaptivePlaywrightCrawler( - Generic[TStaticCrawlingContext, TStaticParseResult], + Generic[TStaticCrawlingContext, TStaticParseResult, TStaticSelectResult], BasicCrawler[AdaptivePlaywrightCrawlingContext, AdaptivePlaywrightCrawlerStatisticState], ): """Adaptive crawler that uses both specific implementation of `AbstractHttpCrawler` and `PlaywrightCrawler`. @@ -99,7 +100,7 @@ class AdaptivePlaywrightCrawler( def __init__( self, *, - static_parser: AbstractHttpParser[TStaticParseResult], + static_parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult], rendering_type_predictor: RenderingTypePredictor | None = None, result_checker: Callable[[RequestHandlerRunResult], bool] | None = None, result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None, @@ -187,13 +188,13 @@ def with_beautifulsoup_static_parser( playwright_crawler_specific_kwargs: _PlaywrightCrawlerAdditionalOptions | None = None, statistics: Statistics[StatisticsState] | None = None, **kwargs: Unpack[_BasicCrawlerOptions], - ) -> AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[Tag], Tag]: + ) -> AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[BeautifulSoup], BeautifulSoup, Tag]: """Creates `AdaptivePlaywrightCrawler` that uses `BeautifulSoup` for parsing static content.""" if statistics is not None: adaptive_statistics = statistics.replace_state_model(AdaptivePlaywrightCrawlerStatisticState) else: adaptive_statistics = Statistics(state_model=AdaptivePlaywrightCrawlerStatisticState) - return AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[Tag], Tag]( + return AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[BeautifulSoup], BeautifulSoup, Tag]( rendering_type_predictor=rendering_type_predictor, result_checker=result_checker, result_comparator=result_comparator, @@ -213,13 +214,13 @@ def with_parsel_static_parser( playwright_crawler_specific_kwargs: _PlaywrightCrawlerAdditionalOptions | None = None, statistics: Statistics[StatisticsState] | None = None, **kwargs: Unpack[_BasicCrawlerOptions], - ) -> AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[Selector], Selector]: + ) -> 
AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[Selector], Selector, Selector]: """Creates `AdaptivePlaywrightCrawler` that uses `Parcel` for parsing static content.""" if statistics is not None: adaptive_statistics = statistics.replace_state_model(AdaptivePlaywrightCrawlerStatisticState) else: adaptive_statistics = Statistics(state_model=AdaptivePlaywrightCrawlerStatisticState) - return AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[Selector], Selector]( + return AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[Selector], Selector, Selector]( rendering_type_predictor=rendering_type_predictor, result_checker=result_checker, result_comparator=result_comparator, diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py index 5cd4b27fa8..541870ab70 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py @@ -2,9 +2,7 @@ from dataclasses import dataclass, fields from datetime import timedelta -from typing import TYPE_CHECKING, Generic - -from typing_extensions import TypeVar +from typing import TYPE_CHECKING, Generic, TypeVar from crawlee import HttpHeaders from crawlee._types import BasicCrawlingContext @@ -22,17 +20,20 @@ from typing_extensions import Self -class AdaptiveContextError(RuntimeError): - pass +TStaticParseResult = TypeVar('TStaticParseResult') +TStaticSelectResult = TypeVar('TStaticSelectResult') -TStaticParseResult = TypeVar('TStaticParseResult') +class AdaptiveContextError(RuntimeError): + pass @dataclass(frozen=True) @docs_group('Data structures') -class AdaptivePlaywrightCrawlingContext(Generic[TStaticParseResult], ParsedHttpCrawlingContext[TStaticParseResult]): - _static_parser: AbstractHttpParser[TStaticParseResult] +class AdaptivePlaywrightCrawlingContext( + Generic[TStaticParseResult, TStaticSelectResult], ParsedHttpCrawlingContext[TStaticParseResult] +): + _static_parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult] _response: Response | None = None _infinite_scroll: Callable[[], Awaitable[None]] | None = None _page: Page | None = None @@ -73,7 +74,7 @@ async def wait_for_selector(self, selector: str, timeout: timedelta = timedelta( return await self.page.locator(selector).wait_for(timeout=timeout.total_seconds() * 1000) - async def query_selector(self, selector: str, timeout: timedelta = timedelta(seconds=5)) -> TStaticParseResult: + async def query_selector(self, selector: str, timeout: timedelta = timedelta(seconds=5)) -> TStaticSelectResult: static_content = await self._static_parser.select(self.parsed_content, selector) if static_content is not None: return static_content @@ -98,15 +99,17 @@ async def parse_with_static_parser( @classmethod def from_parsed_http_crawling_context( - cls, context: ParsedHttpCrawlingContext[TStaticParseResult], parser: AbstractHttpParser[TStaticParseResult] - ) -> AdaptivePlaywrightCrawlingContext[TStaticParseResult]: + cls, + context: ParsedHttpCrawlingContext[TStaticParseResult], + parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult], + ) -> AdaptivePlaywrightCrawlingContext[TStaticParseResult, TStaticSelectResult]: """Convenience constructor that creates new context from existing `ParsedHttpCrawlingContext`.""" return cls(_static_parser=parser, **{field.name: getattr(context, field.name) for field in fields(context)}) @classmethod async def 
from_playwright_crawling_context( - cls, context: PlaywrightCrawlingContext, parser: AbstractHttpParser[TStaticParseResult] - ) -> Self: + cls, context: PlaywrightCrawlingContext, parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult] + ) -> AdaptivePlaywrightCrawlingContext[TStaticParseResult, TStaticSelectResult]: """Convenience constructor that creates new context from existing `PlaywrightCrawlingContext`.""" context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)} # Remove playwright specific attributes and pass them as private instead to be available as property. diff --git a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py index d86083410d..b9bb33c21b 100644 --- a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +++ b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING -from bs4 import Tag +from bs4 import BeautifulSoup, Tag from crawlee._utils.docs import docs_group from crawlee.crawlers._abstract_http import AbstractHttpCrawler, HttpCrawlerOptions @@ -19,7 +19,7 @@ @docs_group('Classes') -class BeautifulSoupCrawler(AbstractHttpCrawler[BeautifulSoupCrawlingContext, Tag]): +class BeautifulSoupCrawler(AbstractHttpCrawler[BeautifulSoupCrawlingContext, BeautifulSoup, Tag]): """A web crawler for performing HTTP requests and parsing HTML/XML content. The `BeautifulSoupCrawler` builds on top of the `AbstractHttpCrawler`, which means it inherits all of its features. @@ -68,7 +68,7 @@ def __init__( """ async def final_step( - context: ParsedHttpCrawlingContext[Tag], + context: ParsedHttpCrawlingContext[BeautifulSoup], ) -> AsyncGenerator[BeautifulSoupCrawlingContext, None]: """Enhance `ParsedHttpCrawlingContext[BeautifulSoup]` with `soup` property.""" yield BeautifulSoupCrawlingContext.from_parsed_http_crawling_context(context) diff --git a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawling_context.py b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawling_context.py index dfbbbf1b91..46fc6678a8 100644 --- a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawling_context.py +++ b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawling_context.py @@ -1,7 +1,6 @@ from dataclasses import dataclass, fields -from typing import cast -from bs4 import BeautifulSoup, Tag +from bs4 import BeautifulSoup from typing_extensions import Self from crawlee._utils.docs import docs_group @@ -12,7 +11,7 @@ @dataclass(frozen=True) @docs_group('Data structures') -class BeautifulSoupCrawlingContext(ParsedHttpCrawlingContext[Tag]): +class BeautifulSoupCrawlingContext(ParsedHttpCrawlingContext[BeautifulSoup]): """The crawling context used by the `BeautifulSoupCrawler`. It provides access to key objects as well as utility functions for handling crawling tasks. 
@@ -21,10 +20,10 @@ class BeautifulSoupCrawlingContext(ParsedHttpCrawlingContext[Tag]): @property def soup(self) -> BeautifulSoup: """Convenience alias.""" - return cast(BeautifulSoup, self.parsed_content) + return self.parsed_content @classmethod - def from_parsed_http_crawling_context(cls, context: ParsedHttpCrawlingContext[Tag]) -> Self: + def from_parsed_http_crawling_context(cls, context: ParsedHttpCrawlingContext[BeautifulSoup]) -> Self: """Convenience constructor that creates new context from existing `ParsedHttpCrawlingContext[BeautifulSoup]`.""" return cls(**{field.name: getattr(context, field.name) for field in fields(context)}) diff --git a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py index 86533fbac5..7fd24651a3 100644 --- a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py +++ b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py @@ -13,18 +13,18 @@ from crawlee.http_clients import HttpResponse -class BeautifulSoupParser(AbstractHttpParser[Tag]): +class BeautifulSoupParser(AbstractHttpParser[BeautifulSoup, Tag]): """Parser for parsing HTTP response using `BeautifulSoup`.""" def __init__(self, parser: BeautifulSoupParserType = 'lxml') -> None: self._parser = parser @override - async def parse(self, response: HttpResponse) -> Tag: + async def parse(self, response: HttpResponse) -> BeautifulSoup: return BeautifulSoup(response.read(), features=self._parser) @override - async def parse_text(self, text: str) -> Tag: + async def parse_text(self, text: str) -> BeautifulSoup: return BeautifulSoup(text, features=self._parser) @override diff --git a/src/crawlee/crawlers/_http/_http_crawler.py b/src/crawlee/crawlers/_http/_http_crawler.py index 1ee89861b9..282d9370b9 100644 --- a/src/crawlee/crawlers/_http/_http_crawler.py +++ b/src/crawlee/crawlers/_http/_http_crawler.py @@ -12,7 +12,7 @@ @docs_group('Classes') -class HttpCrawler(AbstractHttpCrawler[ParsedHttpCrawlingContext[bytes], bytes]): +class HttpCrawler(AbstractHttpCrawler[ParsedHttpCrawlingContext[bytes], bytes, bytes]): """Specific version of generic `AbstractHttpCrawler`. It uses a dummy parser that simply returns the HTTP response body as-is. Use this only if you know what you are diff --git a/src/crawlee/crawlers/_http/_http_parser.py b/src/crawlee/crawlers/_http/_http_parser.py index 868f59d839..68578f6638 100644 --- a/src/crawlee/crawlers/_http/_http_parser.py +++ b/src/crawlee/crawlers/_http/_http_parser.py @@ -13,7 +13,7 @@ from crawlee.http_clients import HttpResponse -class NoParser(AbstractHttpParser[bytes]): +class NoParser(AbstractHttpParser[bytes, bytes]): """Dummy parser for backwards compatibility. To enable using `HttpCrawler` without need for additional specific parser. diff --git a/src/crawlee/crawlers/_parsel/_parsel_crawler.py b/src/crawlee/crawlers/_parsel/_parsel_crawler.py index 4ed7cffac9..2769c6bd8e 100644 --- a/src/crawlee/crawlers/_parsel/_parsel_crawler.py +++ b/src/crawlee/crawlers/_parsel/_parsel_crawler.py @@ -19,7 +19,7 @@ @docs_group('Classes') -class ParselCrawler(AbstractHttpCrawler[ParselCrawlingContext, Selector]): +class ParselCrawler(AbstractHttpCrawler[ParselCrawlingContext, Selector, Selector]): """A web crawler for performing HTTP requests and parsing HTML/XML content. The `ParselCrawler` builds on top of the `AbstractHttpCrawler`, which means it inherits all of its features. 
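For reference, the two backends select fragments with different result types, which is exactly what the extra generic captures: bs4's `select_one` yields a `Tag`, while Parsel's `css` yields `Selector` objects. A small standalone check, assuming `beautifulsoup4`, `lxml` and `parsel` are installed (the HTML literal is invented):

    from bs4 import BeautifulSoup
    from parsel import Selector

    html = '<html><body><h2>Hello</h2></body></html>'

    soup = BeautifulSoup(html, features='lxml')
    tag = soup.select_one('h2')            # bs4 selection: Tag | None
    print(type(tag))                       # <class 'bs4.element.Tag'>

    sel = Selector(text=html)
    nodes = sel.css('h2')
    first = nodes[0] if nodes else None    # parsel selection: Selector | None
    print(type(first))                     # <class 'parsel.selector.Selector'>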
diff --git a/src/crawlee/crawlers/_parsel/_parsel_parser.py b/src/crawlee/crawlers/_parsel/_parsel_parser.py index 0bc98d9be7..27fef4f00e 100644 --- a/src/crawlee/crawlers/_parsel/_parsel_parser.py +++ b/src/crawlee/crawlers/_parsel/_parsel_parser.py @@ -14,7 +14,7 @@ from crawlee.http_clients import HttpResponse -class ParselParser(AbstractHttpParser[Selector]): +class ParselParser(AbstractHttpParser[Selector, Selector]): """Parser for parsing HTTP response using Parsel.""" @override From e164fa16a897f1541f0beb06aeece1f3a6b42526 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 30 Jan 2025 11:30:53 +0100 Subject: [PATCH 68/79] Wrapping parser only for adaptive crawler variant --- .../_abstract_http/_abstract_http_crawler.py | 16 +++--- .../_abstract_http/_abstract_http_parser.py | 10 +--- .../_abstract_http/_http_crawling_context.py | 1 - .../_adaptive_playwright/_adaptive_parser.py | 55 +++++++++++++++++++ .../_adaptive_playwright_crawler.py | 27 +++++---- .../_adaptive_playwright_crawling_context.py | 11 ++-- .../_beautifulsoup/_beautifulsoup_crawler.py | 4 +- .../_beautifulsoup/_beautifulsoup_parser.py | 14 ++--- src/crawlee/crawlers/_http/_http_crawler.py | 2 +- src/crawlee/crawlers/_http/_http_parser.py | 10 +--- .../crawlers/_parsel/_parsel_crawler.py | 2 +- .../crawlers/_parsel/_parsel_parser.py | 12 +--- 12 files changed, 98 insertions(+), 66 deletions(-) create mode 100644 src/crawlee/crawlers/_adaptive_playwright/_adaptive_parser.py diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py index 8021cb29c9..5f821f29c2 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py @@ -16,7 +16,7 @@ from crawlee.http_clients import HttpxHttpClient from crawlee.statistics import StatisticsState -from ._http_crawling_context import HttpCrawlingContext, ParsedHttpCrawlingContext, TParseResult, TSelectResult +from ._http_crawling_context import HttpCrawlingContext, ParsedHttpCrawlingContext, TParseResult if TYPE_CHECKING: from collections.abc import AsyncGenerator, Awaitable, Iterable @@ -53,7 +53,7 @@ class HttpCrawlerOptions( @docs_group('Abstract classes') class AbstractHttpCrawler( - Generic[TCrawlingContext, TParseResult, TSelectResult], BasicCrawler[TCrawlingContext, StatisticsState], ABC + Generic[TCrawlingContext, TParseResult], BasicCrawler[TCrawlingContext, StatisticsState], ABC ): """A web crawler for performing HTTP requests. @@ -72,7 +72,7 @@ class AbstractHttpCrawler( def __init__( self, *, - parser: AbstractHttpParser[TParseResult, TSelectResult], + parser: AbstractHttpParser[TParseResult], additional_http_error_status_codes: Iterable[int] = (), ignore_http_error_status_codes: Iterable[int] = (), **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]], @@ -99,8 +99,8 @@ def __init__( @staticmethod def create_parsed_http_crawler_class( - static_parser: AbstractHttpParser[TParseResult, TSelectResult], - ) -> type[AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult, TSelectResult]]: + static_parser: AbstractHttpParser[TParseResult], + ) -> type[AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult]]: """Convenience class factory that creates specific version of `AbstractHttpCrawler` class. In general typing sense two generic types of `AbstractHttpCrawler` do not have to be dependent on each other. 
@@ -108,12 +108,10 @@ def create_parsed_http_crawler_class( parameters in `AbstractHttpCrawler`. """ - class _ParsedHttpCrawler( - AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult, TSelectResult] - ): + class _ParsedHttpCrawler(AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult]): def __init__( self, - parser: AbstractHttpParser[TParseResult, TSelectResult] = static_parser, + parser: AbstractHttpParser[TParseResult] = static_parser, **kwargs: Unpack[HttpCrawlerOptions[ParsedHttpCrawlingContext[TParseResult]]], ) -> None: kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline() diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py index 60a90a78ed..f6fba50ad6 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py @@ -7,7 +7,7 @@ from crawlee._utils.docs import docs_group from crawlee.crawlers._types import BlockedInfo -from ._http_crawling_context import TParseResult, TSelectResult +from ._http_crawling_context import TParseResult if TYPE_CHECKING: from collections.abc import Iterable @@ -16,7 +16,7 @@ @docs_group('Abstract classes') -class AbstractHttpParser(Generic[TParseResult, TSelectResult], ABC): +class AbstractHttpParser(Generic[TParseResult], ABC): """Parser used for parsing http response and inspecting parsed result to find links or detect blocking.""" @abstractmethod @@ -30,12 +30,6 @@ async def parse(self, response: HttpResponse) -> TParseResult: Parsed HTTP response. """ - @abstractmethod - async def parse_text(self, text: str) -> TParseResult: ... - - @abstractmethod - async def select(self, parsed_content: TParseResult, selector: str) -> TSelectResult | None: ... - def is_blocked(self, parsed_content: TParseResult) -> BlockedInfo: """Detect if blocked and return BlockedInfo with additional information. 
diff --git a/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py b/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py index 2278cb783d..7475b85eb3 100644 --- a/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py +++ b/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py @@ -10,7 +10,6 @@ from crawlee.http_clients import HttpCrawlingResult, HttpResponse TParseResult = TypeVar('TParseResult') -TSelectResult = TypeVar('TSelectResult') @dataclass(frozen=True) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_parser.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_parser.py new file mode 100644 index 0000000000..464ad3cd11 --- /dev/null +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_parser.py @@ -0,0 +1,55 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Generic, TypeVar + +from bs4 import BeautifulSoup, Tag +from parsel import Selector +from typing_extensions import override + +if TYPE_CHECKING: + from crawlee.crawlers import AbstractHttpParser + from crawlee.crawlers._beautifulsoup._beautifulsoup_parser import BeautifulSoupParser + from crawlee.crawlers._parsel._parsel_parser import ParselParser + +TParseResult = TypeVar('TParseResult') +TSelectResult = TypeVar('TSelectResult') + + +class ParserWithSelect(Generic[TParseResult, TSelectResult], ABC): + def __init__(self, static_parser: AbstractHttpParser[TParseResult]) -> None: + self.static_parser = static_parser + + @abstractmethod + async def select(self, parsed_content: TParseResult, selector: str) -> TSelectResult | None: ... + + @abstractmethod + async def parse_text(self, text: str) -> TParseResult: ... + + +class ParselParserWithSelect(ParserWithSelect[Selector, Selector]): + def __init__(self, static_parser: ParselParser) -> None: + self.static_parser = static_parser + + @override + async def select(self, parsed_content: Selector, selector: str) -> Selector | None: + if selector_list := parsed_content.css(selector): + return selector_list[0] + return None + + @override + async def parse_text(self, text: str) -> Selector: + return Selector(text=text) + + +class BeautifulSoupParserWithSelect(ParserWithSelect[BeautifulSoup, Tag]): + def __init__(self, static_parser: BeautifulSoupParser) -> None: + self.static_parser = static_parser + + @override + async def select(self, parsed_content: Tag, selector: str) -> Tag | None: + return parsed_content.select_one(selector) + + @override + async def parse_text(self, text: str) -> BeautifulSoup: + return BeautifulSoup(text, features=self.static_parser.parser) # type:ignore[attr-defined] # Mypy bug? 
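The wrapper approach above keeps the single-generic static parsers untouched and layers the adaptive-only `select`/`parse_text` capability on top of them. A rough usage sketch of this (later reverted) design, using module paths and names taken from the new file; the HTML literal is invented:

    import asyncio

    from crawlee.crawlers._adaptive_playwright._adaptive_parser import BeautifulSoupParserWithSelect
    from crawlee.crawlers._beautifulsoup._beautifulsoup_parser import BeautifulSoupParser


    async def demo() -> None:
        # Compose the existing static parser with the select/parse_text wrapper.
        parser = BeautifulSoupParserWithSelect(BeautifulSoupParser(parser='lxml'))

        soup = await parser.parse_text('<html><body><h2>Hello</h2></body></html>')
        heading = await parser.select(soup, 'h2')  # -> Tag | None
        print(heading.get_text() if heading is not None else 'not found')


    if __name__ == '__main__':
        asyncio.run(demo())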
diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 47f93e7a8e..daf410290c 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -17,13 +17,17 @@ from crawlee._utils.wait import wait_for from crawlee.crawlers import ( AbstractHttpCrawler, - AbstractHttpParser, BasicCrawler, BeautifulSoupParserType, ParsedHttpCrawlingContext, PlaywrightCrawler, PlaywrightCrawlingContext, ) +from crawlee.crawlers._adaptive_playwright._adaptive_parser import ( + BeautifulSoupParserWithSelect, + ParselParserWithSelect, + ParserWithSelect, +) from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler_statistics import ( AdaptivePlaywrightCrawlerStatisticState, ) @@ -100,7 +104,7 @@ class AdaptivePlaywrightCrawler( def __init__( self, *, - static_parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult], + static_parser_with_select: ParserWithSelect[TStaticParseResult, TStaticSelectResult], rendering_type_predictor: RenderingTypePredictor | None = None, result_checker: Callable[[RequestHandlerRunResult], bool] | None = None, result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None, @@ -116,7 +120,8 @@ def __init__( rendering method should be used. If None, then `DefaultRenderingTypePredictor` is used. result_checker: Function that evaluates whether crawling result is valid or not. result_comparator: Function that compares two crawling results and decides whether they are equivalent. - static_parser: Implementation of `AbstractHttpParser`. Parser that will be used for static crawling. + static_parser_with_select: Implementation of `AbstractHttpParser`. Parser that contains parser for + static crawling with extra methods `select` and `parse_text`. static_crawler_specific_kwargs: `AbstractHttpCrawler` only kwargs that are passed to the sub crawler. playwright_crawler_specific_kwargs: `PlaywrightCrawler` only kwargs that are passed to the sub crawler. statistics: A custom `Statistics[AdaptivePlaywrightCrawlerStatisticState]` instance, allowing the use of @@ -148,10 +153,12 @@ def __init__( basic_crawler_kwargs_for_pw_crawler['_logger'] = pw_logger # Initialize sub crawlers to create their pipelines. - static_crawler_class = AbstractHttpCrawler.create_parsed_http_crawler_class(static_parser=static_parser) + static_crawler_class = AbstractHttpCrawler.create_parsed_http_crawler_class( + static_parser=static_parser_with_select.static_parser + ) static_crawler = static_crawler_class( - parser=static_parser, + parser=static_parser_with_select.static_parser, statistics=_NonPersistentStatistics(), **static_crawler_specific_kwargs, **basic_crawler_kwargs_for_static_crawler, @@ -176,7 +183,7 @@ async def adaptive_pre_navigation_hook(context: BasicCrawlingContext) -> None: # Sub crawler pipeline related self._pw_context_pipeline = playwright_crawler._context_pipeline # noqa:SLF001 # Intentional access to private member. self._static_context_pipeline = static_crawler._context_pipeline # noqa:SLF001 # Intentional access to private member. 
- self._static_parser = static_parser + self._static_parser_with_select = static_parser_with_select @staticmethod def with_beautifulsoup_static_parser( @@ -198,7 +205,7 @@ def with_beautifulsoup_static_parser( rendering_type_predictor=rendering_type_predictor, result_checker=result_checker, result_comparator=result_comparator, - static_parser=BeautifulSoupParser(parser=parser_type), + static_parser_with_select=BeautifulSoupParserWithSelect(BeautifulSoupParser(parser=parser_type)), static_crawler_specific_kwargs=static_crawler_specific_kwargs, playwright_crawler_specific_kwargs=playwright_crawler_specific_kwargs, statistics=adaptive_statistics, @@ -224,7 +231,7 @@ def with_parsel_static_parser( rendering_type_predictor=rendering_type_predictor, result_checker=result_checker, result_comparator=result_comparator, - static_parser=ParselParser(), + static_parser_with_select=ParselParserWithSelect(ParselParser()), static_crawler_specific_kwargs=static_crawler_specific_kwargs, playwright_crawler_specific_kwargs=playwright_crawler_specific_kwargs, statistics=adaptive_statistics, @@ -293,7 +300,7 @@ async def from_static_pipeline_to_top_router( context: ParsedHttpCrawlingContext[TStaticParseResult], ) -> None: adaptive_crawling_context = AdaptivePlaywrightCrawlingContext.from_parsed_http_crawling_context( - context=context, parser=self._static_parser + context=context, parser=self._static_parser_with_select ) await self.router(adaptive_crawling_context) @@ -303,7 +310,7 @@ async def from_static_pipeline_to_top_router( async def from_pw_pipeline_to_top_router(context: PlaywrightCrawlingContext) -> None: adaptive_crawling_context = await AdaptivePlaywrightCrawlingContext.from_playwright_crawling_context( - context=context, parser=self._static_parser + context=context, parser=self._static_parser_with_select ) await self.router(adaptive_crawling_context) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py index 541870ab70..3fd82dbf94 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py @@ -8,7 +8,6 @@ from crawlee._types import BasicCrawlingContext from crawlee._utils.docs import docs_group from crawlee.crawlers import ( - AbstractHttpParser, ParsedHttpCrawlingContext, PlaywrightCrawlingContext, ) @@ -19,6 +18,8 @@ from playwright.async_api import Page, Response from typing_extensions import Self + from crawlee.crawlers._adaptive_playwright._adaptive_parser import ParserWithSelect + TStaticParseResult = TypeVar('TStaticParseResult') TStaticSelectResult = TypeVar('TStaticSelectResult') @@ -33,7 +34,7 @@ class AdaptiveContextError(RuntimeError): class AdaptivePlaywrightCrawlingContext( Generic[TStaticParseResult, TStaticSelectResult], ParsedHttpCrawlingContext[TStaticParseResult] ): - _static_parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult] + _static_parser: ParserWithSelect[TStaticParseResult, TStaticSelectResult] _response: Response | None = None _infinite_scroll: Callable[[], Awaitable[None]] | None = None _page: Page | None = None @@ -101,14 +102,14 @@ async def parse_with_static_parser( def from_parsed_http_crawling_context( cls, context: ParsedHttpCrawlingContext[TStaticParseResult], - parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult], + parser: ParserWithSelect[TStaticParseResult, TStaticSelectResult], ) 
-> AdaptivePlaywrightCrawlingContext[TStaticParseResult, TStaticSelectResult]: """Convenience constructor that creates new context from existing `ParsedHttpCrawlingContext`.""" return cls(_static_parser=parser, **{field.name: getattr(context, field.name) for field in fields(context)}) @classmethod async def from_playwright_crawling_context( - cls, context: PlaywrightCrawlingContext, parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult] + cls, context: PlaywrightCrawlingContext, parser: ParserWithSelect[TStaticParseResult, TStaticSelectResult] ) -> AdaptivePlaywrightCrawlingContext[TStaticParseResult, TStaticSelectResult]: """Convenience constructor that creates new context from existing `PlaywrightCrawlingContext`.""" context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)} @@ -122,7 +123,7 @@ async def from_playwright_crawling_context( response=context.response, protocol=protocol_guess or '' ) return cls( - parsed_content=await parser.parse(http_response), + parsed_content=await parser.static_parser.parse(http_response), http_response=http_response, _static_parser=parser, **context_kwargs, diff --git a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py index b9bb33c21b..68c3b4bbad 100644 --- a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +++ b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING -from bs4 import BeautifulSoup, Tag +from bs4 import BeautifulSoup from crawlee._utils.docs import docs_group from crawlee.crawlers._abstract_http import AbstractHttpCrawler, HttpCrawlerOptions @@ -19,7 +19,7 @@ @docs_group('Classes') -class BeautifulSoupCrawler(AbstractHttpCrawler[BeautifulSoupCrawlingContext, BeautifulSoup, Tag]): +class BeautifulSoupCrawler(AbstractHttpCrawler[BeautifulSoupCrawlingContext, BeautifulSoup]): """A web crawler for performing HTTP requests and parsing HTML/XML content. The `BeautifulSoupCrawler` builds on top of the `AbstractHttpCrawler`, which means it inherits all of its features. 
diff --git a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py index 7fd24651a3..16be22b6c8 100644 --- a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py +++ b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py @@ -13,7 +13,7 @@ from crawlee.http_clients import HttpResponse -class BeautifulSoupParser(AbstractHttpParser[BeautifulSoup, Tag]): +class BeautifulSoupParser(AbstractHttpParser[BeautifulSoup]): """Parser for parsing HTTP response using `BeautifulSoup`.""" def __init__(self, parser: BeautifulSoupParserType = 'lxml') -> None: @@ -23,18 +23,10 @@ def __init__(self, parser: BeautifulSoupParserType = 'lxml') -> None: async def parse(self, response: HttpResponse) -> BeautifulSoup: return BeautifulSoup(response.read(), features=self._parser) - @override - async def parse_text(self, text: str) -> BeautifulSoup: - return BeautifulSoup(text, features=self._parser) - @override def is_matching_selector(self, parsed_content: Tag, selector: str) -> bool: return parsed_content.select_one(selector) is not None - @override - async def select(self, parsed_content: Tag, selector: str) -> Tag | None: - return parsed_content.select_one(selector) - @override def find_links(self, parsed_content: Tag, selector: str) -> Iterable[str]: link: Tag @@ -45,5 +37,9 @@ def find_links(self, parsed_content: Tag, selector: str) -> Iterable[str]: urls.append(url.strip()) return urls + @property + def parser(self) -> BeautifulSoupParserType: + return self._parser + BeautifulSoupParserType = Literal['html.parser', 'lxml', 'xml', 'html5lib'] diff --git a/src/crawlee/crawlers/_http/_http_crawler.py b/src/crawlee/crawlers/_http/_http_crawler.py index 282d9370b9..1ee89861b9 100644 --- a/src/crawlee/crawlers/_http/_http_crawler.py +++ b/src/crawlee/crawlers/_http/_http_crawler.py @@ -12,7 +12,7 @@ @docs_group('Classes') -class HttpCrawler(AbstractHttpCrawler[ParsedHttpCrawlingContext[bytes], bytes, bytes]): +class HttpCrawler(AbstractHttpCrawler[ParsedHttpCrawlingContext[bytes], bytes]): """Specific version of generic `AbstractHttpCrawler`. It uses a dummy parser that simply returns the HTTP response body as-is. Use this only if you know what you are diff --git a/src/crawlee/crawlers/_http/_http_parser.py b/src/crawlee/crawlers/_http/_http_parser.py index 68578f6638..90b8463fce 100644 --- a/src/crawlee/crawlers/_http/_http_parser.py +++ b/src/crawlee/crawlers/_http/_http_parser.py @@ -13,7 +13,7 @@ from crawlee.http_clients import HttpResponse -class NoParser(AbstractHttpParser[bytes, bytes]): +class NoParser(AbstractHttpParser[bytes]): """Dummy parser for backwards compatibility. To enable using `HttpCrawler` without need for additional specific parser. @@ -23,14 +23,6 @@ class NoParser(AbstractHttpParser[bytes, bytes]): async def parse(self, response: HttpResponse) -> bytes: return response.read() - @override - async def parse_text(self, text: str) -> bytes: - raise NotImplementedError - - @override - async def select(self, parsed_content: bytes, selector: str) -> bytes | None: - raise NotImplementedError - @override def is_blocked(self, parsed_content: bytes) -> BlockedInfo: # Intentional unused argument. 
return BlockedInfo(reason='') diff --git a/src/crawlee/crawlers/_parsel/_parsel_crawler.py b/src/crawlee/crawlers/_parsel/_parsel_crawler.py index 2769c6bd8e..4ed7cffac9 100644 --- a/src/crawlee/crawlers/_parsel/_parsel_crawler.py +++ b/src/crawlee/crawlers/_parsel/_parsel_crawler.py @@ -19,7 +19,7 @@ @docs_group('Classes') -class ParselCrawler(AbstractHttpCrawler[ParselCrawlingContext, Selector, Selector]): +class ParselCrawler(AbstractHttpCrawler[ParselCrawlingContext, Selector]): """A web crawler for performing HTTP requests and parsing HTML/XML content. The `ParselCrawler` builds on top of the `AbstractHttpCrawler`, which means it inherits all of its features. diff --git a/src/crawlee/crawlers/_parsel/_parsel_parser.py b/src/crawlee/crawlers/_parsel/_parsel_parser.py index 27fef4f00e..59846859b9 100644 --- a/src/crawlee/crawlers/_parsel/_parsel_parser.py +++ b/src/crawlee/crawlers/_parsel/_parsel_parser.py @@ -14,23 +14,13 @@ from crawlee.http_clients import HttpResponse -class ParselParser(AbstractHttpParser[Selector, Selector]): +class ParselParser(AbstractHttpParser[Selector]): """Parser for parsing HTTP response using Parsel.""" @override async def parse(self, response: HttpResponse) -> Selector: return await asyncio.to_thread(lambda: Selector(body=response.read())) - @override - async def parse_text(self, text: str) -> Selector: - return Selector(text=text) - - @override - async def select(self, parsed_content: Selector, selector: str) -> Selector | None: - if selector_list := parsed_content.css(selector): - return selector_list[0] - return None - @override def is_matching_selector(self, parsed_content: Selector, selector: str) -> bool: return parsed_content.type in ('html', 'xml') and parsed_content.css(selector).get() is not None From e65d883efa32f70694fe1134d3e41ecbf8204a53 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 30 Jan 2025 12:54:51 +0100 Subject: [PATCH 69/79] Revert "Wrapping parser only for adaptive crawler variant" This reverts commit e164fa16a897f1541f0beb06aeece1f3a6b42526. 
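With the revert, the adaptive context once again talks to the two-generic parser directly, so the fragment type returned by `context.query_selector` follows the factory used to build the crawler. A hedged usage sketch (imports use the public names finalized later in this series; the URL and handler body are illustrative):

    import asyncio

    from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext


    async def main() -> None:
        # with_beautifulsoup_static_parser() selects bs4 Tag fragments;
        # with_parsel_static_parser() would select parsel Selector fragments instead.
        crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser()

        @crawler.router.default_handler
        async def handler(context: AdaptivePlaywrightCrawlingContext) -> None:
            # Tries the static parser first and falls back to the Playwright page.
            heading = await context.query_selector('h1')
            context.log.info(f'First h1: {heading}')

        await crawler.run(['https://crawlee.dev/'])


    if __name__ == '__main__':
        asyncio.run(main())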
--- .../_abstract_http/_abstract_http_crawler.py | 16 +++--- .../_abstract_http/_abstract_http_parser.py | 10 +++- .../_abstract_http/_http_crawling_context.py | 1 + .../_adaptive_playwright/_adaptive_parser.py | 55 ------------------- .../_adaptive_playwright_crawler.py | 27 ++++----- .../_adaptive_playwright_crawling_context.py | 11 ++-- .../_beautifulsoup/_beautifulsoup_crawler.py | 4 +- .../_beautifulsoup/_beautifulsoup_parser.py | 14 +++-- src/crawlee/crawlers/_http/_http_crawler.py | 2 +- src/crawlee/crawlers/_http/_http_parser.py | 10 +++- .../crawlers/_parsel/_parsel_crawler.py | 2 +- .../crawlers/_parsel/_parsel_parser.py | 12 +++- 12 files changed, 66 insertions(+), 98 deletions(-) delete mode 100644 src/crawlee/crawlers/_adaptive_playwright/_adaptive_parser.py diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py index 5f821f29c2..8021cb29c9 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py @@ -16,7 +16,7 @@ from crawlee.http_clients import HttpxHttpClient from crawlee.statistics import StatisticsState -from ._http_crawling_context import HttpCrawlingContext, ParsedHttpCrawlingContext, TParseResult +from ._http_crawling_context import HttpCrawlingContext, ParsedHttpCrawlingContext, TParseResult, TSelectResult if TYPE_CHECKING: from collections.abc import AsyncGenerator, Awaitable, Iterable @@ -53,7 +53,7 @@ class HttpCrawlerOptions( @docs_group('Abstract classes') class AbstractHttpCrawler( - Generic[TCrawlingContext, TParseResult], BasicCrawler[TCrawlingContext, StatisticsState], ABC + Generic[TCrawlingContext, TParseResult, TSelectResult], BasicCrawler[TCrawlingContext, StatisticsState], ABC ): """A web crawler for performing HTTP requests. @@ -72,7 +72,7 @@ class AbstractHttpCrawler( def __init__( self, *, - parser: AbstractHttpParser[TParseResult], + parser: AbstractHttpParser[TParseResult, TSelectResult], additional_http_error_status_codes: Iterable[int] = (), ignore_http_error_status_codes: Iterable[int] = (), **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]], @@ -99,8 +99,8 @@ def __init__( @staticmethod def create_parsed_http_crawler_class( - static_parser: AbstractHttpParser[TParseResult], - ) -> type[AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult]]: + static_parser: AbstractHttpParser[TParseResult, TSelectResult], + ) -> type[AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult, TSelectResult]]: """Convenience class factory that creates specific version of `AbstractHttpCrawler` class. In general typing sense two generic types of `AbstractHttpCrawler` do not have to be dependent on each other. @@ -108,10 +108,12 @@ def create_parsed_http_crawler_class( parameters in `AbstractHttpCrawler`. 
""" - class _ParsedHttpCrawler(AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult]): + class _ParsedHttpCrawler( + AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult, TSelectResult] + ): def __init__( self, - parser: AbstractHttpParser[TParseResult] = static_parser, + parser: AbstractHttpParser[TParseResult, TSelectResult] = static_parser, **kwargs: Unpack[HttpCrawlerOptions[ParsedHttpCrawlingContext[TParseResult]]], ) -> None: kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline() diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py index f6fba50ad6..60a90a78ed 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py @@ -7,7 +7,7 @@ from crawlee._utils.docs import docs_group from crawlee.crawlers._types import BlockedInfo -from ._http_crawling_context import TParseResult +from ._http_crawling_context import TParseResult, TSelectResult if TYPE_CHECKING: from collections.abc import Iterable @@ -16,7 +16,7 @@ @docs_group('Abstract classes') -class AbstractHttpParser(Generic[TParseResult], ABC): +class AbstractHttpParser(Generic[TParseResult, TSelectResult], ABC): """Parser used for parsing http response and inspecting parsed result to find links or detect blocking.""" @abstractmethod @@ -30,6 +30,12 @@ async def parse(self, response: HttpResponse) -> TParseResult: Parsed HTTP response. """ + @abstractmethod + async def parse_text(self, text: str) -> TParseResult: ... + + @abstractmethod + async def select(self, parsed_content: TParseResult, selector: str) -> TSelectResult | None: ... + def is_blocked(self, parsed_content: TParseResult) -> BlockedInfo: """Detect if blocked and return BlockedInfo with additional information. diff --git a/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py b/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py index 7475b85eb3..2278cb783d 100644 --- a/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py +++ b/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py @@ -10,6 +10,7 @@ from crawlee.http_clients import HttpCrawlingResult, HttpResponse TParseResult = TypeVar('TParseResult') +TSelectResult = TypeVar('TSelectResult') @dataclass(frozen=True) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_parser.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_parser.py deleted file mode 100644 index 464ad3cd11..0000000000 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_parser.py +++ /dev/null @@ -1,55 +0,0 @@ -from __future__ import annotations - -from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Generic, TypeVar - -from bs4 import BeautifulSoup, Tag -from parsel import Selector -from typing_extensions import override - -if TYPE_CHECKING: - from crawlee.crawlers import AbstractHttpParser - from crawlee.crawlers._beautifulsoup._beautifulsoup_parser import BeautifulSoupParser - from crawlee.crawlers._parsel._parsel_parser import ParselParser - -TParseResult = TypeVar('TParseResult') -TSelectResult = TypeVar('TSelectResult') - - -class ParserWithSelect(Generic[TParseResult, TSelectResult], ABC): - def __init__(self, static_parser: AbstractHttpParser[TParseResult]) -> None: - self.static_parser = static_parser - - @abstractmethod - async def select(self, parsed_content: TParseResult, selector: str) -> TSelectResult | None: ... 
- - @abstractmethod - async def parse_text(self, text: str) -> TParseResult: ... - - -class ParselParserWithSelect(ParserWithSelect[Selector, Selector]): - def __init__(self, static_parser: ParselParser) -> None: - self.static_parser = static_parser - - @override - async def select(self, parsed_content: Selector, selector: str) -> Selector | None: - if selector_list := parsed_content.css(selector): - return selector_list[0] - return None - - @override - async def parse_text(self, text: str) -> Selector: - return Selector(text=text) - - -class BeautifulSoupParserWithSelect(ParserWithSelect[BeautifulSoup, Tag]): - def __init__(self, static_parser: BeautifulSoupParser) -> None: - self.static_parser = static_parser - - @override - async def select(self, parsed_content: Tag, selector: str) -> Tag | None: - return parsed_content.select_one(selector) - - @override - async def parse_text(self, text: str) -> BeautifulSoup: - return BeautifulSoup(text, features=self.static_parser.parser) # type:ignore[attr-defined] # Mypy bug? diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index daf410290c..47f93e7a8e 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -17,17 +17,13 @@ from crawlee._utils.wait import wait_for from crawlee.crawlers import ( AbstractHttpCrawler, + AbstractHttpParser, BasicCrawler, BeautifulSoupParserType, ParsedHttpCrawlingContext, PlaywrightCrawler, PlaywrightCrawlingContext, ) -from crawlee.crawlers._adaptive_playwright._adaptive_parser import ( - BeautifulSoupParserWithSelect, - ParselParserWithSelect, - ParserWithSelect, -) from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler_statistics import ( AdaptivePlaywrightCrawlerStatisticState, ) @@ -104,7 +100,7 @@ class AdaptivePlaywrightCrawler( def __init__( self, *, - static_parser_with_select: ParserWithSelect[TStaticParseResult, TStaticSelectResult], + static_parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult], rendering_type_predictor: RenderingTypePredictor | None = None, result_checker: Callable[[RequestHandlerRunResult], bool] | None = None, result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None, @@ -120,8 +116,7 @@ def __init__( rendering method should be used. If None, then `DefaultRenderingTypePredictor` is used. result_checker: Function that evaluates whether crawling result is valid or not. result_comparator: Function that compares two crawling results and decides whether they are equivalent. - static_parser_with_select: Implementation of `AbstractHttpParser`. Parser that contains parser for - static crawling with extra methods `select` and `parse_text`. + static_parser: Implementation of `AbstractHttpParser`. Parser that will be used for static crawling. static_crawler_specific_kwargs: `AbstractHttpCrawler` only kwargs that are passed to the sub crawler. playwright_crawler_specific_kwargs: `PlaywrightCrawler` only kwargs that are passed to the sub crawler. statistics: A custom `Statistics[AdaptivePlaywrightCrawlerStatisticState]` instance, allowing the use of @@ -153,12 +148,10 @@ def __init__( basic_crawler_kwargs_for_pw_crawler['_logger'] = pw_logger # Initialize sub crawlers to create their pipelines. 
- static_crawler_class = AbstractHttpCrawler.create_parsed_http_crawler_class( - static_parser=static_parser_with_select.static_parser - ) + static_crawler_class = AbstractHttpCrawler.create_parsed_http_crawler_class(static_parser=static_parser) static_crawler = static_crawler_class( - parser=static_parser_with_select.static_parser, + parser=static_parser, statistics=_NonPersistentStatistics(), **static_crawler_specific_kwargs, **basic_crawler_kwargs_for_static_crawler, @@ -183,7 +176,7 @@ async def adaptive_pre_navigation_hook(context: BasicCrawlingContext) -> None: # Sub crawler pipeline related self._pw_context_pipeline = playwright_crawler._context_pipeline # noqa:SLF001 # Intentional access to private member. self._static_context_pipeline = static_crawler._context_pipeline # noqa:SLF001 # Intentional access to private member. - self._static_parser_with_select = static_parser_with_select + self._static_parser = static_parser @staticmethod def with_beautifulsoup_static_parser( @@ -205,7 +198,7 @@ def with_beautifulsoup_static_parser( rendering_type_predictor=rendering_type_predictor, result_checker=result_checker, result_comparator=result_comparator, - static_parser_with_select=BeautifulSoupParserWithSelect(BeautifulSoupParser(parser=parser_type)), + static_parser=BeautifulSoupParser(parser=parser_type), static_crawler_specific_kwargs=static_crawler_specific_kwargs, playwright_crawler_specific_kwargs=playwright_crawler_specific_kwargs, statistics=adaptive_statistics, @@ -231,7 +224,7 @@ def with_parsel_static_parser( rendering_type_predictor=rendering_type_predictor, result_checker=result_checker, result_comparator=result_comparator, - static_parser_with_select=ParselParserWithSelect(ParselParser()), + static_parser=ParselParser(), static_crawler_specific_kwargs=static_crawler_specific_kwargs, playwright_crawler_specific_kwargs=playwright_crawler_specific_kwargs, statistics=adaptive_statistics, @@ -300,7 +293,7 @@ async def from_static_pipeline_to_top_router( context: ParsedHttpCrawlingContext[TStaticParseResult], ) -> None: adaptive_crawling_context = AdaptivePlaywrightCrawlingContext.from_parsed_http_crawling_context( - context=context, parser=self._static_parser_with_select + context=context, parser=self._static_parser ) await self.router(adaptive_crawling_context) @@ -310,7 +303,7 @@ async def from_static_pipeline_to_top_router( async def from_pw_pipeline_to_top_router(context: PlaywrightCrawlingContext) -> None: adaptive_crawling_context = await AdaptivePlaywrightCrawlingContext.from_playwright_crawling_context( - context=context, parser=self._static_parser_with_select + context=context, parser=self._static_parser ) await self.router(adaptive_crawling_context) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py index 3fd82dbf94..541870ab70 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py @@ -8,6 +8,7 @@ from crawlee._types import BasicCrawlingContext from crawlee._utils.docs import docs_group from crawlee.crawlers import ( + AbstractHttpParser, ParsedHttpCrawlingContext, PlaywrightCrawlingContext, ) @@ -18,8 +19,6 @@ from playwright.async_api import Page, Response from typing_extensions import Self - from crawlee.crawlers._adaptive_playwright._adaptive_parser import ParserWithSelect - TStaticParseResult = 
TypeVar('TStaticParseResult') TStaticSelectResult = TypeVar('TStaticSelectResult') @@ -34,7 +33,7 @@ class AdaptiveContextError(RuntimeError): class AdaptivePlaywrightCrawlingContext( Generic[TStaticParseResult, TStaticSelectResult], ParsedHttpCrawlingContext[TStaticParseResult] ): - _static_parser: ParserWithSelect[TStaticParseResult, TStaticSelectResult] + _static_parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult] _response: Response | None = None _infinite_scroll: Callable[[], Awaitable[None]] | None = None _page: Page | None = None @@ -102,14 +101,14 @@ async def parse_with_static_parser( def from_parsed_http_crawling_context( cls, context: ParsedHttpCrawlingContext[TStaticParseResult], - parser: ParserWithSelect[TStaticParseResult, TStaticSelectResult], + parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult], ) -> AdaptivePlaywrightCrawlingContext[TStaticParseResult, TStaticSelectResult]: """Convenience constructor that creates new context from existing `ParsedHttpCrawlingContext`.""" return cls(_static_parser=parser, **{field.name: getattr(context, field.name) for field in fields(context)}) @classmethod async def from_playwright_crawling_context( - cls, context: PlaywrightCrawlingContext, parser: ParserWithSelect[TStaticParseResult, TStaticSelectResult] + cls, context: PlaywrightCrawlingContext, parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult] ) -> AdaptivePlaywrightCrawlingContext[TStaticParseResult, TStaticSelectResult]: """Convenience constructor that creates new context from existing `PlaywrightCrawlingContext`.""" context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)} @@ -123,7 +122,7 @@ async def from_playwright_crawling_context( response=context.response, protocol=protocol_guess or '' ) return cls( - parsed_content=await parser.static_parser.parse(http_response), + parsed_content=await parser.parse(http_response), http_response=http_response, _static_parser=parser, **context_kwargs, diff --git a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py index 68c3b4bbad..b9bb33c21b 100644 --- a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +++ b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, Tag from crawlee._utils.docs import docs_group from crawlee.crawlers._abstract_http import AbstractHttpCrawler, HttpCrawlerOptions @@ -19,7 +19,7 @@ @docs_group('Classes') -class BeautifulSoupCrawler(AbstractHttpCrawler[BeautifulSoupCrawlingContext, BeautifulSoup]): +class BeautifulSoupCrawler(AbstractHttpCrawler[BeautifulSoupCrawlingContext, BeautifulSoup, Tag]): """A web crawler for performing HTTP requests and parsing HTML/XML content. The `BeautifulSoupCrawler` builds on top of the `AbstractHttpCrawler`, which means it inherits all of its features. 
diff --git a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py index 16be22b6c8..7fd24651a3 100644 --- a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py +++ b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py @@ -13,7 +13,7 @@ from crawlee.http_clients import HttpResponse -class BeautifulSoupParser(AbstractHttpParser[BeautifulSoup]): +class BeautifulSoupParser(AbstractHttpParser[BeautifulSoup, Tag]): """Parser for parsing HTTP response using `BeautifulSoup`.""" def __init__(self, parser: BeautifulSoupParserType = 'lxml') -> None: @@ -23,10 +23,18 @@ def __init__(self, parser: BeautifulSoupParserType = 'lxml') -> None: async def parse(self, response: HttpResponse) -> BeautifulSoup: return BeautifulSoup(response.read(), features=self._parser) + @override + async def parse_text(self, text: str) -> BeautifulSoup: + return BeautifulSoup(text, features=self._parser) + @override def is_matching_selector(self, parsed_content: Tag, selector: str) -> bool: return parsed_content.select_one(selector) is not None + @override + async def select(self, parsed_content: Tag, selector: str) -> Tag | None: + return parsed_content.select_one(selector) + @override def find_links(self, parsed_content: Tag, selector: str) -> Iterable[str]: link: Tag @@ -37,9 +45,5 @@ def find_links(self, parsed_content: Tag, selector: str) -> Iterable[str]: urls.append(url.strip()) return urls - @property - def parser(self) -> BeautifulSoupParserType: - return self._parser - BeautifulSoupParserType = Literal['html.parser', 'lxml', 'xml', 'html5lib'] diff --git a/src/crawlee/crawlers/_http/_http_crawler.py b/src/crawlee/crawlers/_http/_http_crawler.py index 1ee89861b9..282d9370b9 100644 --- a/src/crawlee/crawlers/_http/_http_crawler.py +++ b/src/crawlee/crawlers/_http/_http_crawler.py @@ -12,7 +12,7 @@ @docs_group('Classes') -class HttpCrawler(AbstractHttpCrawler[ParsedHttpCrawlingContext[bytes], bytes]): +class HttpCrawler(AbstractHttpCrawler[ParsedHttpCrawlingContext[bytes], bytes, bytes]): """Specific version of generic `AbstractHttpCrawler`. It uses a dummy parser that simply returns the HTTP response body as-is. Use this only if you know what you are diff --git a/src/crawlee/crawlers/_http/_http_parser.py b/src/crawlee/crawlers/_http/_http_parser.py index 90b8463fce..68578f6638 100644 --- a/src/crawlee/crawlers/_http/_http_parser.py +++ b/src/crawlee/crawlers/_http/_http_parser.py @@ -13,7 +13,7 @@ from crawlee.http_clients import HttpResponse -class NoParser(AbstractHttpParser[bytes]): +class NoParser(AbstractHttpParser[bytes, bytes]): """Dummy parser for backwards compatibility. To enable using `HttpCrawler` without need for additional specific parser. @@ -23,6 +23,14 @@ class NoParser(AbstractHttpParser[bytes]): async def parse(self, response: HttpResponse) -> bytes: return response.read() + @override + async def parse_text(self, text: str) -> bytes: + raise NotImplementedError + + @override + async def select(self, parsed_content: bytes, selector: str) -> bytes | None: + raise NotImplementedError + @override def is_blocked(self, parsed_content: bytes) -> BlockedInfo: # Intentional unused argument. 
return BlockedInfo(reason='') diff --git a/src/crawlee/crawlers/_parsel/_parsel_crawler.py b/src/crawlee/crawlers/_parsel/_parsel_crawler.py index 4ed7cffac9..2769c6bd8e 100644 --- a/src/crawlee/crawlers/_parsel/_parsel_crawler.py +++ b/src/crawlee/crawlers/_parsel/_parsel_crawler.py @@ -19,7 +19,7 @@ @docs_group('Classes') -class ParselCrawler(AbstractHttpCrawler[ParselCrawlingContext, Selector]): +class ParselCrawler(AbstractHttpCrawler[ParselCrawlingContext, Selector, Selector]): """A web crawler for performing HTTP requests and parsing HTML/XML content. The `ParselCrawler` builds on top of the `AbstractHttpCrawler`, which means it inherits all of its features. diff --git a/src/crawlee/crawlers/_parsel/_parsel_parser.py b/src/crawlee/crawlers/_parsel/_parsel_parser.py index 59846859b9..27fef4f00e 100644 --- a/src/crawlee/crawlers/_parsel/_parsel_parser.py +++ b/src/crawlee/crawlers/_parsel/_parsel_parser.py @@ -14,13 +14,23 @@ from crawlee.http_clients import HttpResponse -class ParselParser(AbstractHttpParser[Selector]): +class ParselParser(AbstractHttpParser[Selector, Selector]): """Parser for parsing HTTP response using Parsel.""" @override async def parse(self, response: HttpResponse) -> Selector: return await asyncio.to_thread(lambda: Selector(body=response.read())) + @override + async def parse_text(self, text: str) -> Selector: + return Selector(text=text) + + @override + async def select(self, parsed_content: Selector, selector: str) -> Selector | None: + if selector_list := parsed_content.css(selector): + return selector_list[0] + return None + @override def is_matching_selector(self, parsed_content: Selector, selector: str) -> bool: return parsed_content.type in ('html', 'xml') and parsed_content.css(selector).get() is not None From ab1c40e469d06d406b6ece51d614c96850457e42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Josef=20Proch=C3=A1zka?= Date: Thu, 30 Jan 2025 13:09:54 +0100 Subject: [PATCH 70/79] Update src/crawlee/crawlers/_basic/_basic_crawler.py Co-authored-by: Jan Buchar --- src/crawlee/crawlers/_basic/_basic_crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 5c53cb2ba0..67bb1c6b49 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -914,7 +914,7 @@ async def send_request( return send_request async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None: - """Commit request handler result for the input `context`. Result is stored in `_context_result_map`.""" + """Commit request handler result for the input `context`. 
Result is taken from `_context_result_map`.""" result = self._context_result_map[context] request_manager = await self.get_request_manager() From 2bf43f63d5bb701a9c79f0997dafd417ada6d3f1 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 30 Jan 2025 13:27:07 +0100 Subject: [PATCH 71/79] Finalize exports and re exports --- src/crawlee/crawlers/__init__.py | 24 +++++++++++++++++++ .../crawlers/_adaptive_playwright/__init__.py | 4 +++- .../_adaptive_playwright_crawler.py | 2 ++ .../_adaptive_playwright_crawling_context.py | 1 + .../_rendering_type_predictor.py | 4 ++++ .../test_adaptive_playwright_crawler.py | 17 ++++++------- 6 files changed, 43 insertions(+), 9 deletions(-) diff --git a/src/crawlee/crawlers/__init__.py b/src/crawlee/crawlers/__init__.py index ca0cf5c801..ec280f94c4 100644 --- a/src/crawlee/crawlers/__init__.py +++ b/src/crawlee/crawlers/__init__.py @@ -18,10 +18,31 @@ with _try_import(__name__, 'PlaywrightCrawler', 'PlaywrightCrawlingContext', 'PlaywrightPreNavCrawlingContext'): from ._playwright import PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext +with _try_import( + __name__, + 'AdaptivePlaywrightCrawler', + 'AdaptivePlaywrightCrawlingContext', + 'AdaptivePlaywrightPreNavCrawlingContext', + 'RenderingType', + 'RenderingTypePrediction', + 'RenderingTypePredictor', +): + from ._adaptive_playwright import ( + AdaptivePlaywrightCrawler, + AdaptivePlaywrightCrawlingContext, + AdaptivePlaywrightPreNavCrawlingContext, + RenderingType, + RenderingTypePrediction, + RenderingTypePredictor, + ) + __all__ = [ 'AbstractHttpCrawler', 'AbstractHttpParser', + 'AdaptivePlaywrightCrawler', + 'AdaptivePlaywrightCrawlingContext', + 'AdaptivePlaywrightPreNavCrawlingContext', 'BasicCrawler', 'BasicCrawlerOptions', 'BasicCrawlingContext', @@ -39,4 +60,7 @@ 'PlaywrightCrawler', 'PlaywrightCrawlingContext', 'PlaywrightPreNavCrawlingContext', + 'RenderingType', + 'RenderingTypePrediction', + 'RenderingTypePredictor', ] diff --git a/src/crawlee/crawlers/_adaptive_playwright/__init__.py b/src/crawlee/crawlers/_adaptive_playwright/__init__.py index 7f05047e78..15626f12e9 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/__init__.py +++ b/src/crawlee/crawlers/_adaptive_playwright/__init__.py @@ -1,5 +1,5 @@ try: - from ._rendering_type_predictor import RenderingTypePredictor + from ._rendering_type_predictor import RenderingType, RenderingTypePrediction, RenderingTypePredictor except ImportError as exc: raise ImportError( "To import this, you need to install the 'adaptive-playwright' extra. 
" @@ -16,5 +16,7 @@ 'AdaptivePlaywrightCrawler', 'AdaptivePlaywrightCrawlingContext', 'AdaptivePlaywrightPreNavCrawlingContext', + 'RenderingType', + 'RenderingTypePrediction', 'RenderingTypePredictor', ] diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index e94b151469..06f6e63db1 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -13,6 +13,7 @@ from typing_extensions import Self, TypeVar, override from crawlee._types import BasicCrawlingContext, JsonSerializable, RequestHandlerRunResult +from crawlee._utils.docs import docs_group from crawlee._utils.wait import wait_for from crawlee.crawlers import ( AbstractHttpCrawler, @@ -82,6 +83,7 @@ async def __aexit__( self._active = False +@docs_group('Classes') class AdaptivePlaywrightCrawler( Generic[TStaticCrawlingContext, TStaticParseResult], BasicCrawler[AdaptivePlaywrightCrawlingContext, AdaptivePlaywrightCrawlerStatisticState], diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py index f20bf3815c..65c765ed5a 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py @@ -100,6 +100,7 @@ async def from_playwright_crawling_context( @dataclass(frozen=True) +@docs_group('Data structures') class AdaptivePlaywrightPreNavCrawlingContext(BasicCrawlingContext): """This is just wrapper around BasicCrawlingContext or AdaptivePlaywrightCrawlingContext. diff --git a/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py b/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py index 653e7adc60..094a424200 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py @@ -11,18 +11,21 @@ from typing_extensions import override from crawlee import Request +from crawlee._utils.docs import docs_group UrlComponents = list[str] RenderingType = Literal['static', 'client only'] FeatureVector = tuple[float, float] +@docs_group('Data structures') @dataclass(frozen=True) class RenderingTypePrediction: rendering_type: RenderingType detection_probability_recommendation: float +@docs_group('Classes') class RenderingTypePredictor(ABC): @abstractmethod def predict(self, request: Request) -> RenderingTypePrediction: @@ -42,6 +45,7 @@ def store_result(self, request: Request, rendering_type: RenderingType) -> None: """ +@docs_group('Classes') class DefaultRenderingTypePredictor(RenderingTypePredictor): """Stores rendering type for previously crawled URLs and predicts the rendering type for unvisited urls. 
diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index d9b3feb697..d0d40531b6 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -14,19 +14,20 @@ from crawlee import Request from crawlee.browsers import BrowserPool -from crawlee.crawlers import BasicCrawler -from crawlee.crawlers._adaptive_playwright import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext +from crawlee.crawlers import ( + AdaptivePlaywrightCrawler, + AdaptivePlaywrightCrawlingContext, + AdaptivePlaywrightPreNavCrawlingContext, + BasicCrawler, + RenderingType, + RenderingTypePrediction, + RenderingTypePredictor, +) from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler_statistics import ( AdaptivePlaywrightCrawlerStatisticState, ) from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import ( AdaptiveContextError, - AdaptivePlaywrightPreNavCrawlingContext, -) -from crawlee.crawlers._adaptive_playwright._rendering_type_predictor import ( - RenderingType, - RenderingTypePrediction, - RenderingTypePredictor, ) from crawlee.statistics import Statistics From 9de0b5c545f1c6b0dcc2d14e9a553e7ea89dee79 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 30 Jan 2025 15:55:40 +0100 Subject: [PATCH 72/79] Docs (guide, examples, docstrings) TODO: Merge from master and update --- .../adaptive_playwright_crawler_handler.py | 13 +++ ...e_playwright_crawler_init_beautifulsoup.py | 10 ++ ...adaptive_playwright_crawler_init_parsel.py | 10 ++ ...tive_playwright_crawler_init_prediction.py | 47 ++++++++++ ...aptive_playwright_crawler_pre_nav_hooks.py | 24 +++++ docs/guides/playwright_crawler_adaptive.mdx | 91 +++++++++++++++++++ .../_abstract_http/_abstract_http_parser.py | 24 ++++- .../_adaptive_playwright_crawling_context.py | 31 ++++++- .../_rendering_type_predictor.py | 1 + 9 files changed, 248 insertions(+), 3 deletions(-) create mode 100644 docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_handler.py create mode 100644 docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_beautifulsoup.py create mode 100644 docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_parsel.py create mode 100644 docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_prediction.py create mode 100644 docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_pre_nav_hooks.py create mode 100644 docs/guides/playwright_crawler_adaptive.mdx diff --git a/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_handler.py b/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_handler.py new file mode 100644 index 0000000000..565ce0d669 --- /dev/null +++ b/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_handler.py @@ -0,0 +1,13 @@ +from datetime import timedelta + +from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext + +crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser() + + +@crawler.router.default_handler +async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: + # Locate element h2 within 5 seconds + h2 = await context.query_selector('h2', timedelta(milliseconds=5000)) + # Do stuff with element found by the selector + 
context.log.info(h2) diff --git a/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_beautifulsoup.py b/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_beautifulsoup.py new file mode 100644 index 0000000000..4c8da9a966 --- /dev/null +++ b/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_beautifulsoup.py @@ -0,0 +1,10 @@ +from crawlee.crawlers import AdaptivePlaywrightCrawler + +crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( + # Arguments relevant only for PlaywrightCrawler + playwright_crawler_specific_kwargs={'headless': False, 'browser_type': 'chrome'}, + # Arguments relevant only for BeautifulSoupCrawler + static_crawler_specific_kwargs={'additional_http_error_status_codes': [204]}, + # Common arguments relevant to all crawlers + max_crawl_depth=5, +) diff --git a/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_parsel.py b/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_parsel.py new file mode 100644 index 0000000000..24e07a3a80 --- /dev/null +++ b/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_parsel.py @@ -0,0 +1,10 @@ +from crawlee.crawlers import AdaptivePlaywrightCrawler + +crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser( + # Arguments relevant only for PlaywrightCrawler + playwright_crawler_specific_kwargs={'headless': False, 'browser_type': 'chrome'}, + # Arguments relevant only for ParselCrawler + static_crawler_specific_kwargs={'additional_http_error_status_codes': [204]}, + # Common arguments relevant to all crawlers + max_crawl_depth=5, +) diff --git a/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_prediction.py b/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_prediction.py new file mode 100644 index 0000000000..560a02666f --- /dev/null +++ b/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_prediction.py @@ -0,0 +1,47 @@ +from crawlee import Request +from crawlee._types import RequestHandlerRunResult +from crawlee.crawlers import AdaptivePlaywrightCrawler, RenderingType, RenderingTypePrediction, RenderingTypePredictor + + +class CustomRenderingTypePredictor(RenderingTypePredictor): + def __init__(self) -> None: + self._learning_data = [] + + def predict(self, request: Request) -> RenderingTypePrediction: + # Some custom logic that produces some `RenderingTypePrediction` based on the `request` input. + rendering_type = 'static' if 'abc' in request.url else 'client only' + + return RenderingTypePrediction( + # Recommends `static` rendering type -> HTTP-based sub crawler will be used. + rendering_type=rendering_type, + # Recommends that both sub crawlers should run with 20% chance. + # When both sub crawlers are running, the predictor can compare results and learn. + # When this number is high, then predictor is not very confident about the `rendering_type`. + # When this number is low, then predictor is very confident about the `rendering_type`. + detection_probability_recommendation=0.2, + ) + + def store_result(self, request: Request, rendering_type: RenderingType) -> None: + # This function allows predictor to store new learning data and retrain itself if needed. + # `request` is input for prediction and `rendering_type` is the correct prediction output. 
+ self._learning_data.append((request.url, rendering_type)) + # retrain + + +def result_checker(result: RequestHandlerRunResult) -> bool: + # Some function that inspects produced `result` and returns `True` if the result is correct. + return bool(result) # Check something on result + + +def result_comparator(result_1: RequestHandlerRunResult, result_2: RequestHandlerRunResult) -> bool: + # Some function that inspects two results and returns `True` if they are considered equivalent. + # It is used when comparing results produced by HTTP-based sub crawler and playwright based sub crawler. + + return result_1.push_data_calls == result_2.push_data_calls # For example compare `push_data` calls. + + +crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser( + rendering_type_predictor=CustomRenderingTypePredictor(), + result_checker=result_checker, + result_comparator=result_comparator, +) diff --git a/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_pre_nav_hooks.py b/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_pre_nav_hooks.py new file mode 100644 index 0000000000..4568907e49 --- /dev/null +++ b/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_pre_nav_hooks.py @@ -0,0 +1,24 @@ +from playwright.async_api import Route + +from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightPreNavCrawlingContext + +crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser() + + +@crawler.pre_navigation_hook +async def hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: + """Hook executed both in static sub crawler and playwright sub crawler.""" + # Trying to access context.page in this hook would raise `AdaptiveContextError` for pages crawled + # without playwright. + context.log.info(f'pre navigation hook for: {context.request.url} ...') + + +@crawler.pre_navigation_hook(playwright_only=True) +async def hook_playwright(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: + """Hook executed only in playwright sub crawler.""" + + async def some_routing_function(route: Route) -> None: + await route.continue_() + + await context.page.route('*/**', some_routing_function) + context.log.info(f'Playwright only pre navigation hook for: {context.request.url} ...') diff --git a/docs/guides/playwright_crawler_adaptive.mdx b/docs/guides/playwright_crawler_adaptive.mdx new file mode 100644 index 0000000000..f835691bc9 --- /dev/null +++ b/docs/guides/playwright_crawler_adaptive.mdx @@ -0,0 +1,91 @@ +--- +id: adaptive playwright-crawler +title: Adaptive Playwright crawler +description: How to use the AdaptivePlaywrightCrawler. 
+--- + +import ApiLink from '@site/src/components/ApiLink'; +import CodeBlock from '@theme/CodeBlock'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +import AdaptivePlaywrightCrawlerInitBeautifulSoup from '!!raw-loader!./code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_beautifulsoup.py'; +import AdaptivePlaywrightCrawlerInitParsel from '!!raw-loader!./code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_parsel.py'; +import AdaptivePlaywrightCrawlerInitPrediction from '!!raw-loader!./code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_prediction.py'; +import AdaptivePlaywrightCrawlerHandler from '!!raw-loader!./code/adaptive_playwright_crawler/adaptive_playwright_crawler_handler.py'; +import AdaptivePlaywrightCrawlerPreNavHooks from '!!raw-loader!./code/adaptive_playwright_crawler/adaptive_playwright_crawler_pre_nav_hooks.py'; + + + +An `AdaptivePlaywrightCrawler` is a combination of `PlaywrightCrawler` and some implementation of HTTP-based crawler like `ParselCrawler` or `BeautifulSoupCrawler`. +It uses more limited request handler interface so that it is able to switch to HTTP-only crawling when it detects it may be possible. + +Detection is done based on the `RenderingTypePredictor` with default implementation ...(add after merged to master...). It predicts which crawling method should be used and learns from already crawled pages. + +## When to use AdaptivePlaywrightCrawler + +Use `AdaptivePlaywrightCrawler` in scenarios where some target pages have to be crawled with `PlaywrightCrawler`, but for others faster HTTP-based crawler is sufficient. This way, you can achieve lower costs when crawling multiple different websites. + +## Request handler and adaptive context helpers + +Request handler for `AdaptivePlaywrightCrawler` works on special context type - `AdaptivePlaywrightCrawlingContext`. This context is sometimes created by HTTP-based sub crawler and sometimes by playwright based sub crawler. Due to its dynamic nature, you can't always access `page` object for example. To overcome this there are three helper methods on this context that can be called regardless of how the context was created. + +`wait_for_selector` accepts `css` selector as first argument and timeout as second argument. The function will try to locate this selector a return once it is found(within timeout). In practice this means that if HTTP-based sub crawler was used, the function will find the selector only if it is part of the static content. If not, the adaptive crawler will fall back to the playwright sub crawler and will wait try to locate the selector within the timeout using playwright. + +`query_selector` accepts `css` selector as first argument and timeout as second argument. This function acts similar to `wait_for_selector`, but it also returns this selector if found. Return value type is determined by used HTTP-based sub crawler. For example, it will be `Selector` for `ParselCrawler` and `Tag` for `BeautifulSoupCrawler`. + +`parse_with_static_parser` will re-parse the whole page. Return value type is determined by used HTTP-based sub crawler. It has optional arguments: `selector` and `timeout`. If those optional arguments are used then the function first calls `wait_for_selector` and then do the parsing. This can be used in scenario where some specific element can signal, that page is already complete. 
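For orientation, a minimal handler sketch that exercises all three helpers might look like the following (the `h2` selector, the Parsel factory variant and the crawled URL are illustrative placeholders, not part of the official example):

```python
import asyncio
from datetime import timedelta

from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext

crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(max_requests_per_crawl=10)


@crawler.router.default_handler
async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
    # Block until the selector is present; falls back to the playwright sub crawler
    # when the static content does not contain it.
    await context.wait_for_selector('h2', timeout=timedelta(seconds=5))

    # Return the first matching element; with the Parsel variant this is a parsel Selector.
    heading = await context.query_selector('h2')
    context.log.info(f'First h2: {heading}')

    # Re-parse the whole page with the static parser, waiting for the selector first.
    parsed = await context.parse_with_static_parser(selector='h2')
    context.log.info(f'Page title: {parsed.css("title::text").get()}')


async def main() -> None:
    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
```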
+ +See following example about how to create request handler and use context helpers: + + + {AdaptivePlaywrightCrawlerHandler} + + + +## Advanced configuration + +`AdaptivePlaywrightCrawler` is internally composed of two sub crawlers you can do a detailed configuration of both of them. For detailed configuration options of the sub crawlers, please refer to their pages: `PlaywrightCrawler`, `ParselCrawler`, `BeautifulSoupCrawler`. + +In the following example you can see how to create and configure `AdaptivePlaywrightCrawler` with two different HTTP-based sub crawlers: + + + + + + {AdaptivePlaywrightCrawlerInitBeautifulSoup} + + + + + {AdaptivePlaywrightCrawlerInitParsel} + + + + +### Prediction related arguments + +To control which pages are crawled by which method you can use following arguments: + +`RenderingTypePredictor` - Class that can give recommendations about which sub crawler should be used for specific url. Predictor will also recommend to use both sub crawlers for some page from time to time, to check that the given recommendation was correct. Predictor should be able to learn from previous results and gradually give more reliable recommendations. + +`result_checker` - Is a function that checks result created from crawling a page. By default, it always returns `True`. + +`result_comparator` - Is a function that compares two results (HTTP-based sub crawler result and playwright based sub crawler result) and returns `True` if they are considered the same. By default, this function compares calls of context helper `push_data` by each sub crawler. This function is used by `rendering_type_predictor` to evaluate whether HTTP-based crawler has the same results as playwright based crawler. + +See following example about how to pass prediction related arguments: + + + {AdaptivePlaywrightCrawlerInitPrediction} + + + + +## Page configuration with pre-navigation hooks +In some use cases, you may need to configure the [page](https://playwright.dev/python/docs/api/class-page) before it navigates to the target URL. For instance, you might set navigation timeouts or manipulate other page-level settings. For such cases you can use the `pre_navigation_hook` method of the `AdaptivePlaywrightCrawler`. This method is called before the page navigates to the target URL and allows you to configure the page instance. Due to the dynamic nature of `AdaptivePlaywrightCrawler` it is possible that the hook will be for HTTP-based sub crawler or playwright-based sub crawler. Using `page` object for hook that will be executed on HTTP-based sub crawler will raise an exception. To overcome this you can use optional argument `playwright_only` = `True` when registering the hook. + +See following example about how to register the pre navigation hooks: + + + {AdaptivePlaywrightCrawlerPreNavHooks} + diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py index 60a90a78ed..9fbe532836 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py @@ -31,10 +31,30 @@ async def parse(self, response: HttpResponse) -> TParseResult: """ @abstractmethod - async def parse_text(self, text: str) -> TParseResult: ... + async def parse_text(self, text: str) -> TParseResult: + """Parse test. + + Args: + text: String containing html. + + Returns: + Parsed text. 
+ """ @abstractmethod - async def select(self, parsed_content: TParseResult, selector: str) -> TSelectResult | None: ... + async def select(self, parsed_content: TParseResult, selector: str) -> TSelectResult | None: + """Use css selector to select page element and return it. + + Default implementation that expects `is_matching_selector` abstract method to be implemented. + Override this method if your parser has different way of blockage detection. + + Args: + parsed_content: content where the page element will be located + selector: css selector + + Returns: + Selected element. + """ def is_blocked(self, parsed_content: TParseResult) -> BlockedInfo: """Detect if blocked and return BlockedInfo with additional information. diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py index 541870ab70..3f4a77b770 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py @@ -70,11 +70,30 @@ def response(self) -> Response: return self._response async def wait_for_selector(self, selector: str, timeout: timedelta = timedelta(seconds=5)) -> None: + """Locate element by css selector a return once it is found. + + If element is not found within timeout, TimeoutError is raised. + + Args: + selector: css selector to be used to locate specific element on page. + timeout: timeout that defines how long the function wait for the selector to appear. + """ if await self._static_parser.select(self.parsed_content, selector): return await self.page.locator(selector).wait_for(timeout=timeout.total_seconds() * 1000) async def query_selector(self, selector: str, timeout: timedelta = timedelta(seconds=5)) -> TStaticSelectResult: + """Locate element by css selector a return it once it is found. + + If element is not found within timeout, TimeoutError is raised. + + Args: + selector: css selector to be used to locate specific element on page. + timeout: timeout that defines how long the function wait for the selector to appear. + + Returns: + `TStaticSelectResult` which is result of used static parser `select` method. + """ static_content = await self._static_parser.select(self.parsed_content, selector) if static_content is not None: return static_content @@ -87,12 +106,22 @@ async def query_selector(self, selector: str, timeout: timedelta = timedelta(sec ) if parsed_selector is not None: return parsed_selector - # Selector worked in Playwright, but not in static parser and raise AdaptiveContextError('Used selector is not a valid static selector') async def parse_with_static_parser( self, selector: str | None, timeout: timedelta = timedelta(seconds=5) ) -> TStaticParseResult: + """Parse whole page with static parser. If `selector` argument is used wait for selector first. + + If element is not found within timeout, TimeoutError is raised. + + Args: + selector: css selector to be used to locate specific element on page. + timeout: timeout that defines how long the function wait for the selector to appear. + + Returns: + `TStaticParseResult` which is result of used static parser `parse_text` method. 
+ """ if selector: await self.wait_for_selector(selector, timeout) return await self._static_parser.parse_text(await self.page.content()) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py b/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py index bdf49d9509..e2190c3d65 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py @@ -19,6 +19,7 @@ class RenderingTypePrediction: detection_probability_recommendation: float +@dataclass(frozen=True) class RenderingTypePredictor(ABC): @abstractmethod def predict(self, request: Request) -> RenderingTypePrediction: ... From e133712c445ab77b957582d67514b6698dfb7c53 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 31 Jan 2025 09:30:50 +0100 Subject: [PATCH 73/79] Docs after merge from main adaptive-crawler branch --- .../code/adaptive_playwright_crawler.py | 27 ++++++++-------- docs/examples/playwright_crawler_adaptive.mdx | 20 ++++++++++++ ...e_playwright_crawler_init_beautifulsoup.py | 2 +- ...adaptive_playwright_crawler_init_parsel.py | 2 +- ...tive_playwright_crawler_init_prediction.py | 6 ++-- docs/guides/playwright_crawler_adaptive.mdx | 23 ++++++------- .../_adaptive_playwright_crawler.py | 32 ++++++++++++++++--- .../_rendering_type_predictor.py | 1 - 8 files changed, 78 insertions(+), 35 deletions(-) create mode 100644 docs/examples/playwright_crawler_adaptive.mdx diff --git a/docs/examples/code/adaptive_playwright_crawler.py b/docs/examples/code/adaptive_playwright_crawler.py index c7398a76cf..84674e0ea6 100644 --- a/docs/examples/code/adaptive_playwright_crawler.py +++ b/docs/examples/code/adaptive_playwright_crawler.py @@ -1,36 +1,35 @@ import asyncio +from datetime import timedelta from playwright.async_api import Route -from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler import AdaptivePlaywrightCrawler -from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import ( +from crawlee.crawlers import ( + AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext, AdaptivePlaywrightPreNavCrawlingContext, ) async def main() -> None: + # Crawler created by following factory method will use `beautifulsoup` for parsing static content. crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( max_requests_per_crawl=5, playwright_crawler_specific_kwargs={'headless': False} ) - @crawler.router.handler(label='label') - async def request_handler_for_label(context: AdaptivePlaywrightCrawlingContext) -> None: - # Do some processing using `page` - some_locator = context.page.locator('div').first - await some_locator.wait_for() - # Do stuff with locator... - context.log.info(f'Playwright processing of: {context.request.url} ...') - @crawler.router.default_handler - async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: - context.log.info(f'User handler processing: {context.request.url} ...') + async def request_handler_for_label(context: AdaptivePlaywrightCrawlingContext) -> None: # Do some processing using `parsed_content` context.log.info(context.parsed_content.title) + # Locate element h2 within 5 seconds + h2 = await context.query_selector('h2', timedelta(milliseconds=5000)) + # Do stuff with element found by the selector + context.log.info(h2) + # Find more links and enqueue them. await context.enqueue_links() - await context.push_data({'Top crawler Url': context.request.url}) + # Save some data. 
+ await context.push_data({'Visited url': context.request.url}) @crawler.pre_navigation_hook async def hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: @@ -41,7 +40,7 @@ async def hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: @crawler.pre_navigation_hook(playwright_only=True) async def hook_playwright(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: - """Hook executed only in playwright sub crawler.""" + """Hook executed only in playwright sub crawler. It is safe to access `page` object.""" async def some_routing_function(route: Route) -> None: await route.continue_() diff --git a/docs/examples/playwright_crawler_adaptive.mdx b/docs/examples/playwright_crawler_adaptive.mdx new file mode 100644 index 0000000000..870c39b502 --- /dev/null +++ b/docs/examples/playwright_crawler_adaptive.mdx @@ -0,0 +1,20 @@ +--- +id: adaptive-playwright-crawler +title: AdaptivePlaywrightCrawler +--- + +import ApiLink from '@site/src/components/ApiLink'; +import CodeBlock from '@theme/CodeBlock'; + +import AdaptivePlaywrightCrawlerExample from '!!raw-loader!./code/adaptive_playwright_crawler.py'; + +This example demonstrates how to use `AdaptivePlaywrightCrawler`. An `AdaptivePlaywrightCrawler` is a combination of `PlaywrightCrawler` and some implementation of HTTP-based crawler like `ParselCrawler` or `BeautifulSoupCrawler`. +It uses more limited request handler interface so that it is able to switch to HTTP-only crawling when it detects it may be possible. + +A **pre-navigation hook** can be used to perform actions before navigating to the URL. This hook provides further flexibility in controlling environment and preparing for navigation. Hooks will be executed both for the pages crawled by HTTP-bases sub crawler and playwright based sub crawler. Use `playwright_only`=`True` to mark hooks that should be executed only for playwright sub crawler. 
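A bare-bones sketch of the two hook flavours could look like this (the viewport size and log message are arbitrary placeholders, assuming the BeautifulSoup factory variant):

```python
from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightPreNavCrawlingContext

crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser()


@crawler.pre_navigation_hook
async def common_hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
    # Executed for both sub crawlers; accessing context.page here would raise
    # AdaptiveContextError when the page is crawled by the HTTP-based sub crawler.
    context.log.info(f'Before navigation: {context.request.url}')


@crawler.pre_navigation_hook(playwright_only=True)
async def browser_only_hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
    # Executed only by the playwright sub crawler, so context.page is safe to use.
    await context.page.set_viewport_size({'width': 1280, 'height': 720})
```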
+ +For more detailed description please see [AdaptivePlaywrightCrawler guide](/python/docs/guides/adaptive-playwright-crawler 'Network payload investigation.') + + + {AdaptivePlaywrightCrawlerExample} + diff --git a/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_beautifulsoup.py b/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_beautifulsoup.py index 4c8da9a966..b3ed447f2a 100644 --- a/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_beautifulsoup.py +++ b/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_beautifulsoup.py @@ -2,7 +2,7 @@ crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( # Arguments relevant only for PlaywrightCrawler - playwright_crawler_specific_kwargs={'headless': False, 'browser_type': 'chrome'}, + playwright_crawler_specific_kwargs={'headless': False, 'browser_type': 'chromium'}, # Arguments relevant only for BeautifulSoupCrawler static_crawler_specific_kwargs={'additional_http_error_status_codes': [204]}, # Common arguments relevant to all crawlers diff --git a/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_parsel.py b/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_parsel.py index 24e07a3a80..76010fe224 100644 --- a/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_parsel.py +++ b/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_parsel.py @@ -2,7 +2,7 @@ crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser( # Arguments relevant only for PlaywrightCrawler - playwright_crawler_specific_kwargs={'headless': False, 'browser_type': 'chrome'}, + playwright_crawler_specific_kwargs={'headless': False, 'browser_type': 'chromium'}, # Arguments relevant only for ParselCrawler static_crawler_specific_kwargs={'additional_http_error_status_codes': [204]}, # Common arguments relevant to all crawlers diff --git a/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_prediction.py b/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_prediction.py index 560a02666f..d8875016b7 100644 --- a/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_prediction.py +++ b/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_prediction.py @@ -5,11 +5,11 @@ class CustomRenderingTypePredictor(RenderingTypePredictor): def __init__(self) -> None: - self._learning_data = [] + self._learning_data = list[tuple[Request, RenderingType]]() def predict(self, request: Request) -> RenderingTypePrediction: # Some custom logic that produces some `RenderingTypePrediction` based on the `request` input. - rendering_type = 'static' if 'abc' in request.url else 'client only' + rendering_type: RenderingType = 'static' if 'abc' in request.url else 'client only' return RenderingTypePrediction( # Recommends `static` rendering type -> HTTP-based sub crawler will be used. @@ -24,7 +24,7 @@ def predict(self, request: Request) -> RenderingTypePrediction: def store_result(self, request: Request, rendering_type: RenderingType) -> None: # This function allows predictor to store new learning data and retrain itself if needed. # `request` is input for prediction and `rendering_type` is the correct prediction output. 
- self._learning_data.append((request.url, rendering_type)) + self._learning_data.append((request, rendering_type)) # retrain diff --git a/docs/guides/playwright_crawler_adaptive.mdx b/docs/guides/playwright_crawler_adaptive.mdx index f835691bc9..04f1f180ae 100644 --- a/docs/guides/playwright_crawler_adaptive.mdx +++ b/docs/guides/playwright_crawler_adaptive.mdx @@ -1,6 +1,6 @@ --- -id: adaptive playwright-crawler -title: Adaptive Playwright crawler +id: adaptive-playwright-crawler +title: AdaptivePlaywrightCrawler description: How to use the AdaptivePlaywrightCrawler. --- @@ -20,7 +20,7 @@ import AdaptivePlaywrightCrawlerPreNavHooks from '!!raw-loader!./code/adaptive_p An `AdaptivePlaywrightCrawler` is a combination of `PlaywrightCrawler` and some implementation of HTTP-based crawler like `ParselCrawler` or `BeautifulSoupCrawler`. It uses more limited request handler interface so that it is able to switch to HTTP-only crawling when it detects it may be possible. -Detection is done based on the `RenderingTypePredictor` with default implementation ...(add after merged to master...). It predicts which crawling method should be used and learns from already crawled pages. +Detection is done based on the `RenderingTypePredictor` with default implementation `DefaultRenderingTypePredictor`. It predicts which crawling method should be used and learns from already crawled pages. ## When to use AdaptivePlaywrightCrawler @@ -28,7 +28,7 @@ Use `AdaptivePlaywrightCrawler``AdaptivePlaywrightCrawler` works on special context type - `AdaptivePlaywrightCrawlingContext`. This context is sometimes created by HTTP-based sub crawler and sometimes by playwright based sub crawler. Due to its dynamic nature, you can't always access `page` object for example. To overcome this there are three helper methods on this context that can be called regardless of how the context was created. +Request handler for `AdaptivePlaywrightCrawler` works on special context type - `AdaptivePlaywrightCrawlingContext`. This context is sometimes created by HTTP-based sub crawler and sometimes by playwright based sub crawler. Due to its dynamic nature, you can't always access [page](https://playwright.dev/python/docs/api/class-page) object for example. To overcome this there are three helper methods on this context that can be called regardless of how the context was created. `wait_for_selector` accepts `css` selector as first argument and timeout as second argument. The function will try to locate this selector a return once it is found(within timeout). In practice this means that if HTTP-based sub crawler was used, the function will find the selector only if it is part of the static content. If not, the adaptive crawler will fall back to the playwright sub crawler and will wait try to locate the selector within the timeout using playwright. @@ -36,16 +36,17 @@ Request handler for `AdaptivePlayw `parse_with_static_parser` will re-parse the whole page. Return value type is determined by used HTTP-based sub crawler. It has optional arguments: `selector` and `timeout`. If those optional arguments are used then the function first calls `wait_for_selector` and then do the parsing. This can be used in scenario where some specific element can signal, that page is already complete. 
-See following example about how to create request handler and use context helpers: +See the following example about how to create request handler and use context helpers: {AdaptivePlaywrightCrawlerHandler} -## Advanced configuration +## Crawler configuration +To use `AdaptivePlaywrightCrawler` it is recommended to use one of the prepared factory methods that will create the crawler with specific HTTP-based sub crawler variant: `AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser` or `AdaptivePlaywrightCrawler.with_parsel_static_parser`. -`AdaptivePlaywrightCrawler` is internally composed of two sub crawlers you can do a detailed configuration of both of them. For detailed configuration options of the sub crawlers, please refer to their pages: `PlaywrightCrawler`, `ParselCrawler`, `BeautifulSoupCrawler`. +`AdaptivePlaywrightCrawler` is internally composed of two sub crawlers and you can do a detailed configuration of both of them. For detailed configuration options of the sub crawlers, please refer to their pages: `PlaywrightCrawler`, `ParselCrawler`, `BeautifulSoupCrawler`. In the following example you can see how to create and configure `AdaptivePlaywrightCrawler` with two different HTTP-based sub crawlers: @@ -71,9 +72,9 @@ To control which pages are crawled by which method you can use following argumen `result_checker` - Is a function that checks result created from crawling a page. By default, it always returns `True`. -`result_comparator` - Is a function that compares two results (HTTP-based sub crawler result and playwright based sub crawler result) and returns `True` if they are considered the same. By default, this function compares calls of context helper `push_data` by each sub crawler. This function is used by `rendering_type_predictor` to evaluate whether HTTP-based crawler has the same results as playwright based crawler. +`result_comparator` - Is a function that compares two results (HTTP-based sub crawler result and playwright based sub crawler result) and returns `True` if they are considered the same. By default, this function compares calls of context helper `push_data` by each sub crawler. This function is used by `rendering_type_predictor` to evaluate whether HTTP-based crawler has the same results as playwright based sub crawler. -See following example about how to pass prediction related arguments: +See the following example about how to pass prediction related arguments: {AdaptivePlaywrightCrawlerInitPrediction} @@ -82,9 +83,9 @@ See following example about how to pass prediction related arguments: ## Page configuration with pre-navigation hooks -In some use cases, you may need to configure the [page](https://playwright.dev/python/docs/api/class-page) before it navigates to the target URL. For instance, you might set navigation timeouts or manipulate other page-level settings. For such cases you can use the `pre_navigation_hook` method of the `AdaptivePlaywrightCrawler`. This method is called before the page navigates to the target URL and allows you to configure the page instance. Due to the dynamic nature of `AdaptivePlaywrightCrawler` it is possible that the hook will be for HTTP-based sub crawler or playwright-based sub crawler. Using `page` object for hook that will be executed on HTTP-based sub crawler will raise an exception. To overcome this you can use optional argument `playwright_only` = `True` when registering the hook. 
+In some use cases, you may need to configure the [page](https://playwright.dev/python/docs/api/class-page) before it navigates to the target URL. For instance, you might set navigation timeouts or manipulate other page-level settings. For such cases you can use the `pre_navigation_hook` method of the `AdaptivePlaywrightCrawler`. This method is called before the page navigates to the target URL and allows you to configure the page instance. Due to the dynamic nature of `AdaptivePlaywrightCrawler` it is possible that the hook will be for HTTP-based sub crawler or playwright-based sub crawler. Using [page](https://playwright.dev/python/docs/api/class-page) object for hook that will be executed on HTTP-based sub crawler will raise an exception. To overcome this you can use optional argument `playwright_only` = `True` when registering the hook. -See following example about how to register the pre navigation hooks: +See the following example about how to register the pre navigation hooks: {AdaptivePlaywrightCrawlerPreNavHooks} diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 12929229fa..0b6ffc2274 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -89,12 +89,36 @@ class AdaptivePlaywrightCrawler( Generic[TStaticCrawlingContext, TStaticParseResult, TStaticSelectResult], BasicCrawler[AdaptivePlaywrightCrawlingContext, AdaptivePlaywrightCrawlerStatisticState], ): - """Adaptive crawler that uses both specific implementation of `AbstractHttpCrawler` and `PlaywrightCrawler`. + """Adaptive crawler that uses specific implementation of `AbstractHttpCrawler` and `PlaywrightCrawler`. - It tries to detect whether it is sufficient to crawl without browser (which is faster) or if - `PlaywrightCrawler` should be used (in case previous method did not work as expected for specific url.). + It uses more limited request handler interface so that it is able to switch to HTTP-only crawling when it detects it + may be possible. - # TODO: Add example + ### Usage + ```python + from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext + + crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( + max_requests_per_crawl=5, playwright_crawler_specific_kwargs={'headless': False} + ) + + @crawler.router.default_handler + async def request_handler_for_label(context: AdaptivePlaywrightCrawlingContext) -> None: + # Do some processing using `parsed_content` + context.log.info(context.parsed_content.title) + + # Locate element h2 within 5 seconds + h2 = await context.query_selector('h2', timedelta(milliseconds=5000)) + # Do stuff with element found by the selector + context.log.info(h2) + + # Find more links and enqueue them. + await context.enqueue_links() + # Save some data. 
+ await context.push_data({'Visited url': context.request.url}) + + await crawler.run(['https://crawlee.dev/']) + ``` """ def __init__( diff --git a/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py b/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py index 48c0164b1b..094a424200 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py @@ -26,7 +26,6 @@ class RenderingTypePrediction: @docs_group('Classes') -@dataclass(frozen=True) class RenderingTypePredictor(ABC): @abstractmethod def predict(self, request: Request) -> RenderingTypePrediction: From 4effa19bc6c79929e2af2213e8f796b7890b9ce3 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 7 Feb 2025 08:49:10 +0100 Subject: [PATCH 74/79] Docs, docstrings and comments edtis. --- docs/examples/code/adaptive_playwright_crawler.py | 7 ++++--- .../adaptive_playwright_crawler_pre_nav_hooks.py | 8 +++++--- docs/guides/playwright_crawler_adaptive.mdx | 4 ++-- .../crawlers/_abstract_http/_abstract_http_parser.py | 5 +---- .../_adaptive_playwright_crawling_context.py | 6 +++--- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/docs/examples/code/adaptive_playwright_crawler.py b/docs/examples/code/adaptive_playwright_crawler.py index 84674e0ea6..86afb5f087 100644 --- a/docs/examples/code/adaptive_playwright_crawler.py +++ b/docs/examples/code/adaptive_playwright_crawler.py @@ -33,9 +33,10 @@ async def request_handler_for_label(context: AdaptivePlaywrightCrawlingContext) @crawler.pre_navigation_hook async def hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: - """Hook executed both in static sub crawler and playwright sub crawler.""" - # Trying to access context.page in this hook would raise `AdaptiveContextError` for pages crawled - # without playwright. + """Hook executed both in static sub crawler and playwright sub crawler. + + Trying to access `context.page` in this hook would raise `AdaptiveContextError` for pages crawled without + playwright.""" context.log.info(f'pre navigation hook for: {context.request.url} ...') @crawler.pre_navigation_hook(playwright_only=True) diff --git a/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_pre_nav_hooks.py b/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_pre_nav_hooks.py index 4568907e49..d6b449482a 100644 --- a/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_pre_nav_hooks.py +++ b/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_pre_nav_hooks.py @@ -7,9 +7,11 @@ @crawler.pre_navigation_hook async def hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: - """Hook executed both in static sub crawler and playwright sub crawler.""" - # Trying to access context.page in this hook would raise `AdaptiveContextError` for pages crawled - # without playwright. + """Hook executed both in static sub crawler and playwright sub crawler. 
+ + Trying to access `context.page` in this hook would raise `AdaptiveContextError` for pages crawled without + playwright.""" + context.log.info(f'pre navigation hook for: {context.request.url} ...') diff --git a/docs/guides/playwright_crawler_adaptive.mdx b/docs/guides/playwright_crawler_adaptive.mdx index 04f1f180ae..21147c4f4f 100644 --- a/docs/guides/playwright_crawler_adaptive.mdx +++ b/docs/guides/playwright_crawler_adaptive.mdx @@ -28,7 +28,7 @@ Use `AdaptivePlaywrightCrawler``AdaptivePlaywrightCrawler` works on special context type - `AdaptivePlaywrightCrawlingContext`. This context is sometimes created by HTTP-based sub crawler and sometimes by playwright based sub crawler. Due to its dynamic nature, you can't always access [page](https://playwright.dev/python/docs/api/class-page) object for example. To overcome this there are three helper methods on this context that can be called regardless of how the context was created. +Request handler for `AdaptivePlaywrightCrawler` works on special context type - `AdaptivePlaywrightCrawlingContext`. This context is sometimes created by HTTP-based sub crawler and sometimes by playwright based sub crawler. Due to its dynamic nature, you can't always access [page](https://playwright.dev/python/docs/api/class-page) object. To overcome this limitation, there are three helper methods on this context that can be called regardless of how the context was created. `wait_for_selector` accepts `css` selector as first argument and timeout as second argument. The function will try to locate this selector a return once it is found(within timeout). In practice this means that if HTTP-based sub crawler was used, the function will find the selector only if it is part of the static content. If not, the adaptive crawler will fall back to the playwright sub crawler and will wait try to locate the selector within the timeout using playwright. @@ -83,7 +83,7 @@ See the following example about how to pass prediction related arguments: ## Page configuration with pre-navigation hooks -In some use cases, you may need to configure the [page](https://playwright.dev/python/docs/api/class-page) before it navigates to the target URL. For instance, you might set navigation timeouts or manipulate other page-level settings. For such cases you can use the `pre_navigation_hook` method of the `AdaptivePlaywrightCrawler`. This method is called before the page navigates to the target URL and allows you to configure the page instance. Due to the dynamic nature of `AdaptivePlaywrightCrawler` it is possible that the hook will be for HTTP-based sub crawler or playwright-based sub crawler. Using [page](https://playwright.dev/python/docs/api/class-page) object for hook that will be executed on HTTP-based sub crawler will raise an exception. To overcome this you can use optional argument `playwright_only` = `True` when registering the hook. +In some use cases, you may need to configure the [page](https://playwright.dev/python/docs/api/class-page) before it navigates to the target URL. For instance, you might set navigation timeouts or manipulate other page-level settings. For such cases you can use the `pre_navigation_hook` method of the `AdaptivePlaywrightCrawler`. This method is called before the page navigates to the target URL and allows you to configure the page instance. Due to the dynamic nature of `AdaptivePlaywrightCrawler` it is possible that the hook will be executed for HTTP-based sub crawler or playwright-based sub crawler. 
Using [page](https://playwright.dev/python/docs/api/class-page) object for hook that will be executed on HTTP-based sub crawler will raise an exception. To overcome this you can use optional argument `playwright_only` = `True` when registering the hook. See the following example about how to register the pre navigation hooks: diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py index 9fbe532836..4913897ee7 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py @@ -32,7 +32,7 @@ async def parse(self, response: HttpResponse) -> TParseResult: @abstractmethod async def parse_text(self, text: str) -> TParseResult: - """Parse test. + """Parse text. Args: text: String containing html. @@ -45,9 +45,6 @@ async def parse_text(self, text: str) -> TParseResult: async def select(self, parsed_content: TParseResult, selector: str) -> TSelectResult | None: """Use css selector to select page element and return it. - Default implementation that expects `is_matching_selector` abstract method to be implemented. - Override this method if your parser has different way of blockage detection. - Args: parsed_content: content where the page element will be located selector: css selector diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py index db4b33f55c..3f22d9b401 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py @@ -79,7 +79,7 @@ def response(self) -> Response: async def wait_for_selector(self, selector: str, timeout: timedelta = timedelta(seconds=5)) -> None: """Locate element by css selector a return once it is found. - If element is not found within timeout, TimeoutError is raised. + If element is not found within timeout, `TimeoutError` is raised. Args: selector: css selector to be used to locate specific element on page. @@ -92,7 +92,7 @@ async def wait_for_selector(self, selector: str, timeout: timedelta = timedelta( async def query_selector(self, selector: str, timeout: timedelta = timedelta(seconds=5)) -> TStaticSelectResult: """Locate element by css selector a return it once it is found. - If element is not found within timeout, TimeoutError is raised. + If element is not found within timeout, `TimeoutError` is raised. Args: selector: css selector to be used to locate specific element on page. @@ -118,7 +118,7 @@ async def query_selector(self, selector: str, timeout: timedelta = timedelta(sec async def parse_with_static_parser( self, selector: str | None, timeout: timedelta = timedelta(seconds=5) ) -> TStaticParseResult: - """Parse whole page with static parser. If `selector` argument is used wait for selector first. + """Parse whole page with static parser. If `selector` argument is used, wait for selector first. If element is not found within timeout, TimeoutError is raised. 
From 410dcffc15074d962ce23e8d29946417aaaeb487 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 10 Feb 2025 12:59:00 +0100 Subject: [PATCH 75/79] Review comments --- docs/examples/playwright_crawler_adaptive.mdx | 8 ++++---- ...daptive_playwright_crawler_init_prediction.py | 1 - docs/guides/playwright_crawler_adaptive.mdx | 6 ++++-- .../_abstract_http/_abstract_http_parser.py | 6 +++--- .../_adaptive_playwright_crawler.py | 7 ++++--- .../_adaptive_playwright_crawling_context.py | 16 ++++++++-------- 6 files changed, 23 insertions(+), 21 deletions(-) diff --git a/docs/examples/playwright_crawler_adaptive.mdx b/docs/examples/playwright_crawler_adaptive.mdx index 870c39b502..5acbde2cc8 100644 --- a/docs/examples/playwright_crawler_adaptive.mdx +++ b/docs/examples/playwright_crawler_adaptive.mdx @@ -8,12 +8,12 @@ import CodeBlock from '@theme/CodeBlock'; import AdaptivePlaywrightCrawlerExample from '!!raw-loader!./code/adaptive_playwright_crawler.py'; -This example demonstrates how to use `AdaptivePlaywrightCrawler`. An `AdaptivePlaywrightCrawler` is a combination of `PlaywrightCrawler` and some implementation of HTTP-based crawler like `ParselCrawler` or `BeautifulSoupCrawler`. -It uses more limited request handler interface so that it is able to switch to HTTP-only crawling when it detects it may be possible. +This example demonstrates how to use `AdaptivePlaywrightCrawler`. An `AdaptivePlaywrightCrawler` is a combination of `PlaywrightCrawler` and some implementation of HTTP-based crawler such as `ParselCrawler` or `BeautifulSoupCrawler`. +It uses a more limited crawling context interface so that it is able to switch to HTTP-only crawling when it detects that it may bring a performance benefit. -A **pre-navigation hook** can be used to perform actions before navigating to the URL. This hook provides further flexibility in controlling environment and preparing for navigation. Hooks will be executed both for the pages crawled by HTTP-bases sub crawler and playwright based sub crawler. Use `playwright_only`=`True` to mark hooks that should be executed only for playwright sub crawler. +A **pre-navigation hook** can be used to perform actions before navigating to the URL. This hook provides further flexibility in controlling environment and preparing for navigation. Hooks will be executed both for the pages crawled by HTTP-bases sub crawler and playwright based sub crawler. Use `playwright_only=True` to mark hooks that should be executed only for playwright sub crawler. 
-For more detailed description please see [AdaptivePlaywrightCrawler guide](/python/docs/guides/adaptive-playwright-crawler 'Network payload investigation.') +For more detailed description please see [AdaptivePlaywrightCrawler guide](/python/docs/guides/adaptive-playwright-crawler 'AdaptivePlaywrightCrawler guide') {AdaptivePlaywrightCrawlerExample} diff --git a/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_prediction.py b/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_prediction.py index d8875016b7..5e0623c789 100644 --- a/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_prediction.py +++ b/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_prediction.py @@ -36,7 +36,6 @@ def result_checker(result: RequestHandlerRunResult) -> bool: def result_comparator(result_1: RequestHandlerRunResult, result_2: RequestHandlerRunResult) -> bool: # Some function that inspects two results and returns `True` if they are considered equivalent. # It is used when comparing results produced by HTTP-based sub crawler and playwright based sub crawler. - return result_1.push_data_calls == result_2.push_data_calls # For example compare `push_data` calls. diff --git a/docs/guides/playwright_crawler_adaptive.mdx b/docs/guides/playwright_crawler_adaptive.mdx index 21147c4f4f..58f1fbdc38 100644 --- a/docs/guides/playwright_crawler_adaptive.mdx +++ b/docs/guides/playwright_crawler_adaptive.mdx @@ -17,8 +17,8 @@ import AdaptivePlaywrightCrawlerPreNavHooks from '!!raw-loader!./code/adaptive_p -An `AdaptivePlaywrightCrawler` is a combination of `PlaywrightCrawler` and some implementation of HTTP-based crawler like `ParselCrawler` or `BeautifulSoupCrawler`. -It uses more limited request handler interface so that it is able to switch to HTTP-only crawling when it detects it may be possible. +An `AdaptivePlaywrightCrawler` is a combination of `PlaywrightCrawler` and some implementation of HTTP-based crawler such as `ParselCrawler` or `BeautifulSoupCrawler`. +It uses a more limited crawling context interface so that it is able to switch to HTTP-only crawling when it detects that it may bring a performance benefit. Detection is done based on the `RenderingTypePredictor` with default implementation `DefaultRenderingTypePredictor`. It predicts which crawling method should be used and learns from already crawled pages. @@ -26,6 +26,8 @@ Detection is done based on the `Rende Use `AdaptivePlaywrightCrawler` in scenarios where some target pages have to be crawled with `PlaywrightCrawler`, but for others faster HTTP-based crawler is sufficient. This way, you can achieve lower costs when crawling multiple different websites. +Another use case is performing selector-based data extraction without prior knowledge of whether the selector exists in the static page or is dynamically added by a code executed in a browsing client. + ## Request handler and adaptive context helpers Request handler for `AdaptivePlaywrightCrawler` works on special context type - `AdaptivePlaywrightCrawlingContext`. This context is sometimes created by HTTP-based sub crawler and sometimes by playwright based sub crawler. Due to its dynamic nature, you can't always access [page](https://playwright.dev/python/docs/api/class-page) object. To overcome this limitation, there are three helper methods on this context that can be called regardless of how the context was created. 
diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py index 4913897ee7..40f012dcf8 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py @@ -32,7 +32,7 @@ async def parse(self, response: HttpResponse) -> TParseResult: @abstractmethod async def parse_text(self, text: str) -> TParseResult: - """Parse text. + """Parse text containing html. Args: text: String containing html. @@ -46,8 +46,8 @@ async def select(self, parsed_content: TParseResult, selector: str) -> TSelectRe """Use css selector to select page element and return it. Args: - parsed_content: content where the page element will be located - selector: css selector + parsed_content: Content where the page element will be located. + selector: Css selector used to locate desired html element. Returns: Selected element. diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index c891cd7f7a..18f752c06c 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -90,10 +90,11 @@ class AdaptivePlaywrightCrawler( Generic[TStaticCrawlingContext, TStaticParseResult, TStaticSelectResult], BasicCrawler[AdaptivePlaywrightCrawlingContext, AdaptivePlaywrightCrawlerStatisticState], ): - """Adaptive crawler that uses both specific implementation of `AbstractHttpCrawler` and `PlaywrightCrawler`. + """An adaptive web crawler capable of using both static HTTP request based crawling and browser based crawling. - It uses more limited request handler interface so that it is able to switch to HTTP-only crawling when it detects it - may be possible. + It uses a more limited crawling context interface so that it is able to switch to HTTP-only crawling when it detects + that it may bring a performance benefit. + It uses specific implementation of `AbstractHttpCrawler` and `PlaywrightCrawler`. ### Usage ```python diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py index 3f22d9b401..dd7de3e137 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py @@ -77,29 +77,29 @@ def response(self) -> Response: return self._response async def wait_for_selector(self, selector: str, timeout: timedelta = timedelta(seconds=5)) -> None: - """Locate element by css selector a return once it is found. + """Locate element by css selector and return `None` once it is found. If element is not found within timeout, `TimeoutError` is raised. Args: - selector: css selector to be used to locate specific element on page. - timeout: timeout that defines how long the function wait for the selector to appear. + selector: Css selector to be used to locate specific element on page. + timeout: Timeout that defines how long the function wait for the selector to appear. 
""" if await self._static_parser.select(self.parsed_content, selector): return await self.page.locator(selector).wait_for(timeout=timeout.total_seconds() * 1000) async def query_selector(self, selector: str, timeout: timedelta = timedelta(seconds=5)) -> TStaticSelectResult: - """Locate element by css selector a return it once it is found. + """Locate element by css selector and return the element once it is found. If element is not found within timeout, `TimeoutError` is raised. Args: - selector: css selector to be used to locate specific element on page. - timeout: timeout that defines how long the function wait for the selector to appear. + selector: Css selector to be used to locate specific element on page. + timeout: Timeout that defines how long the function wait for the selector to appear. Returns: - `TStaticSelectResult` which is result of used static parser `select` method. + Result of used static parser `select` method. """ static_content = await self._static_parser.select(self.parsed_content, selector) if static_content is not None: @@ -127,7 +127,7 @@ async def parse_with_static_parser( timeout: timeout that defines how long the function wait for the selector to appear. Returns: - `TStaticParseResult` which is result of used static parser `parse_text` method. + Result of used static parser `parse_text` method. """ if selector: await self.wait_for_selector(selector, timeout) From 109847dfbcb03e216221ef81deafc3baa8dde6ed Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 10 Feb 2025 15:00:06 +0100 Subject: [PATCH 76/79] TODO: Add test for autoparsing --- .../_adaptive_playwright_crawling_context.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py index dd7de3e137..4e4c8831a8 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py @@ -76,6 +76,14 @@ def response(self) -> Response: raise AdaptiveContextError('Page was not crawled with PlaywrightCrawler.') return self._response + @property + def _latest_parsed_content(self) -> TStaticParseResult: + """Return parsed page. No need to parse again for in static context. Reparse in Playwright context.""" + if self._page: + return self.parse_with_static_parser() + return self.parsed_content + + async def wait_for_selector(self, selector: str, timeout: timedelta = timedelta(seconds=5)) -> None: """Locate element by css selector and return `None` once it is found. @@ -85,7 +93,7 @@ async def wait_for_selector(self, selector: str, timeout: timedelta = timedelta( selector: Css selector to be used to locate specific element on page. timeout: Timeout that defines how long the function wait for the selector to appear. """ - if await self._static_parser.select(self.parsed_content, selector): + if await self._static_parser.select(self._latest_parsed_content, selector): return await self.page.locator(selector).wait_for(timeout=timeout.total_seconds() * 1000) @@ -101,7 +109,7 @@ async def query_selector(self, selector: str, timeout: timedelta = timedelta(sec Returns: Result of used static parser `select` method. 
""" - static_content = await self._static_parser.select(self.parsed_content, selector) + static_content = await self._static_parser.select(self._latest_parsed_content, selector) if static_content is not None: return static_content From 39bb7f8884f80c39efe9d3862481e125063833ff Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 11 Feb 2025 09:16:37 +0100 Subject: [PATCH 77/79] Review comments --- docs/examples/playwright_crawler_adaptive.mdx | 2 +- .../_adaptive_playwright_crawling_context.py | 14 +++---- .../test_adaptive_playwright_crawler.py | 37 +++++++++++++++++-- 3 files changed, 41 insertions(+), 12 deletions(-) diff --git a/docs/examples/playwright_crawler_adaptive.mdx b/docs/examples/playwright_crawler_adaptive.mdx index 5acbde2cc8..a611251216 100644 --- a/docs/examples/playwright_crawler_adaptive.mdx +++ b/docs/examples/playwright_crawler_adaptive.mdx @@ -11,7 +11,7 @@ import AdaptivePlaywrightCrawlerExample from '!!raw-loader!./code/adaptive_playw This example demonstrates how to use `AdaptivePlaywrightCrawler`. An `AdaptivePlaywrightCrawler` is a combination of `PlaywrightCrawler` and some implementation of HTTP-based crawler such as `ParselCrawler` or `BeautifulSoupCrawler`. It uses a more limited crawling context interface so that it is able to switch to HTTP-only crawling when it detects that it may bring a performance benefit. -A **pre-navigation hook** can be used to perform actions before navigating to the URL. This hook provides further flexibility in controlling environment and preparing for navigation. Hooks will be executed both for the pages crawled by HTTP-bases sub crawler and playwright based sub crawler. Use `playwright_only=True` to mark hooks that should be executed only for playwright sub crawler. +A [pre-navigation hook](/python/docs/guides/adaptive-playwright-crawler#page-configuration-with-pre-navigation-hooks) can be used to perform actions before navigating to the URL. This hook provides further flexibility in controlling environment and preparing for navigation. Hooks will be executed both for the pages crawled by HTTP-bases sub crawler and playwright based sub crawler. Use `playwright_only=True` to mark hooks that should be executed only for playwright sub crawler. For more detailed description please see [AdaptivePlaywrightCrawler guide](/python/docs/guides/adaptive-playwright-crawler 'AdaptivePlaywrightCrawler guide') diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py index 4e4c8831a8..f829308ea0 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py @@ -76,14 +76,12 @@ def response(self) -> Response: raise AdaptiveContextError('Page was not crawled with PlaywrightCrawler.') return self._response - @property - def _latest_parsed_content(self) -> TStaticParseResult: - """Return parsed page. No need to parse again for in static context. Reparse in Playwright context.""" + async def _get_latest_parsed_content(self) -> TStaticParseResult: + """Return parsed page. No need to parse again in static context. 
Reparse in Playwright context.""" if self._page: - return self.parse_with_static_parser() + return await self.parse_with_static_parser() return self.parsed_content - async def wait_for_selector(self, selector: str, timeout: timedelta = timedelta(seconds=5)) -> None: """Locate element by css selector and return `None` once it is found. @@ -93,7 +91,7 @@ async def wait_for_selector(self, selector: str, timeout: timedelta = timedelta( selector: Css selector to be used to locate specific element on page. timeout: Timeout that defines how long the function wait for the selector to appear. """ - if await self._static_parser.select(self._latest_parsed_content, selector): + if await self._static_parser.select(await self._get_latest_parsed_content(), selector): return await self.page.locator(selector).wait_for(timeout=timeout.total_seconds() * 1000) @@ -109,7 +107,7 @@ async def query_selector(self, selector: str, timeout: timedelta = timedelta(sec Returns: Result of used static parser `select` method. """ - static_content = await self._static_parser.select(self._latest_parsed_content, selector) + static_content = await self._static_parser.select(await self._get_latest_parsed_content(), selector) if static_content is not None: return static_content @@ -124,7 +122,7 @@ async def query_selector(self, selector: str, timeout: timedelta = timedelta(sec raise AdaptiveContextError('Used selector is not a valid static selector') async def parse_with_static_parser( - self, selector: str | None, timeout: timedelta = timedelta(seconds=5) + self, selector: str | None = None, timeout: timedelta = timedelta(seconds=5) ) -> TStaticParseResult: """Parse whole page with static parser. If `selector` argument is used, wait for selector first. diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index 9c6478c9d8..79dc837890 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -44,15 +44,17 @@ _H1_TEXT = 'Static' _H2_TEXT = 'Only in browser' +_H3_CHANGED_TEXT = 'Changed by JS' _INJECTED_JS_DELAY_MS = 100 _PAGE_CONTENT_STATIC = f"""
 <h1>{_H1_TEXT}</h1>
+<h3>Initial text</h3>
@@ -665,7 +667,6 @@ async def test_adaptive_context_parse_with_static_parser_parsel(test_urls: list[ """Test `context.parse_with_static_parser` works regardless of the crawl type for Parsel variant. (Test covers also `context.wait_for_selector`, which is called by `context.parse_with_static_parser`) - Create situation where """ static_only_predictor_no_detection = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])) expected_h2_tag = f'
<h2>{_H2_TEXT}</h2>
' @@ -698,3 +699,33 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: call(expected_h2_tag), # Playwright waited for h2 to appear. ] ) + + +async def test_adaptive_context_helpers_on_changed_selector(test_urls: list[str]) -> None: + """Test that context helpers work on latest version of the page. + + Scenario where page is changed after a while. H2 element is added and text of H3 element is modified. + Test that context helpers automatically work on latest version of the page by reading H3 element and expecting it's + dynamically changed text instead of the original static text. + """ + browser_only_predictor_no_detection = _SimpleRenderingTypePredictor( + rendering_types=cycle(['client only']), detection_probability_recommendation=cycle([0]) + ) + expected_h3_tag = f'
<h3>{_H3_CHANGED_TEXT}</h3>
' + + crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser( + max_request_retries=1, + rendering_type_predictor=browser_only_predictor_no_detection, + playwright_crawler_specific_kwargs={'browser_pool': _StaticRedirectBrowserPool.with_default_plugin()}, + ) + + mocked_h3_handler = Mock() + + @crawler.router.default_handler + async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: + await context.query_selector('h2') # Wait for change that is indicated by appearance of h2 element. + mocked_h3_handler((await context.query_selector('h3')).get()) # Get updated h3 element. + + await crawler.run(test_urls[:1]) + + mocked_h3_handler.assert_called_once_with(expected_h3_tag) From 3aff36fa7154c7249895aa23eed07add2167ba80 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 13 Feb 2025 17:05:06 +0100 Subject: [PATCH 78/79] Review comments --- .../code/adaptive_playwright_crawler.py | 4 +- .../adaptive_playwright_crawler_handler.py | 2 +- .../_abstract_http/_abstract_http_parser.py | 2 +- .../_adaptive_playwright_crawler.py | 4 +- .../_adaptive_playwright_crawling_context.py | 56 ++++++++++++++----- .../_beautifulsoup/_beautifulsoup_parser.py | 4 +- src/crawlee/crawlers/_http/_http_parser.py | 2 +- .../crawlers/_parsel/_parsel_parser.py | 6 +- .../test_adaptive_playwright_crawler.py | 44 +++++++++++---- 9 files changed, 87 insertions(+), 37 deletions(-) diff --git a/docs/examples/code/adaptive_playwright_crawler.py b/docs/examples/code/adaptive_playwright_crawler.py index c220dd6147..add09041b2 100644 --- a/docs/examples/code/adaptive_playwright_crawler.py +++ b/docs/examples/code/adaptive_playwright_crawler.py @@ -25,7 +25,7 @@ async def request_handler_for_label( context.log.info(context.parsed_content.title) # Locate element h2 within 5 seconds - h2 = await context.query_selector('h2', timedelta(milliseconds=5000)) + h2 = await context.query_selector_one('h2', timedelta(milliseconds=5000)) # Do stuff with element found by the selector context.log.info(h2) @@ -39,7 +39,7 @@ async def hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: """Hook executed both in static sub crawler and playwright sub crawler. 
Trying to access `context.page` in this hook would raise `AdaptiveContextError` - for pages crawled without playwright.""" + for pages crawled without playwright.""" context.log.info(f'pre navigation hook for: {context.request.url} ...') @crawler.pre_navigation_hook(playwright_only=True) diff --git a/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_handler.py b/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_handler.py index 565ce0d669..5dc12e54ef 100644 --- a/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_handler.py +++ b/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_handler.py @@ -8,6 +8,6 @@ @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: # Locate element h2 within 5 seconds - h2 = await context.query_selector('h2', timedelta(milliseconds=5000)) + h2 = await context.query_selector_one('h2', timedelta(milliseconds=5000)) # Do stuff with element found by the selector context.log.info(h2) diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py index 40f012dcf8..5281838181 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py @@ -42,7 +42,7 @@ async def parse_text(self, text: str) -> TParseResult: """ @abstractmethod - async def select(self, parsed_content: TParseResult, selector: str) -> TSelectResult | None: + async def select(self, parsed_content: TParseResult, selector: str) -> tuple[TSelectResult, ...]: """Use css selector to select page element and return it. Args: diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 18f752c06c..d0bbeff2b3 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -101,7 +101,7 @@ class AdaptivePlaywrightCrawler( from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( - max_requests_per_crawl=5, playwright_crawler_specific_kwargs={'headless': False} + max_requests_per_crawl=5, playwright_crawler_specific_kwargs={'browser_type': 'chromium'} ) @crawler.router.default_handler @@ -110,7 +110,7 @@ async def request_handler_for_label(context: AdaptivePlaywrightCrawlingContext) context.log.info(context.parsed_content.title) # Locate element h2 within 5 seconds - h2 = await context.query_selector('h2', timedelta(milliseconds=5000)) + h2 = await context.query_selector_one('h2', timedelta(milliseconds=5000)) # Do stuff with element found by the selector context.log.info(h2) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py index f829308ea0..c0450410ed 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py @@ -4,6 +4,8 @@ from datetime import timedelta from typing import TYPE_CHECKING, Generic, TypeVar +from playwright.async_api import TimeoutError as PlaywrightTimeoutError + from crawlee import HttpHeaders from crawlee._types import BasicCrawlingContext 
from crawlee._utils.docs import docs_group @@ -76,12 +78,6 @@ def response(self) -> Response: raise AdaptiveContextError('Page was not crawled with PlaywrightCrawler.') return self._response - async def _get_latest_parsed_content(self) -> TStaticParseResult: - """Return parsed page. No need to parse again in static context. Reparse in Playwright context.""" - if self._page: - return await self.parse_with_static_parser() - return self.parsed_content - async def wait_for_selector(self, selector: str, timeout: timedelta = timedelta(seconds=5)) -> None: """Locate element by css selector and return `None` once it is found. @@ -91,12 +87,14 @@ async def wait_for_selector(self, selector: str, timeout: timedelta = timedelta( selector: Css selector to be used to locate specific element on page. timeout: Timeout that defines how long the function wait for the selector to appear. """ - if await self._static_parser.select(await self._get_latest_parsed_content(), selector): + if await self._static_parser.select(await self.parse_with_static_parser(), selector): return await self.page.locator(selector).wait_for(timeout=timeout.total_seconds() * 1000) - async def query_selector(self, selector: str, timeout: timedelta = timedelta(seconds=5)) -> TStaticSelectResult: - """Locate element by css selector and return the element once it is found. + async def query_selector_one( + self, selector: str, timeout: timedelta = timedelta(seconds=5) + ) -> TStaticSelectResult | None: + """Locate element by css selector and return first element found. If element is not found within timeout, `TimeoutError` is raised. @@ -107,19 +105,47 @@ async def query_selector(self, selector: str, timeout: timedelta = timedelta(sec Returns: Result of used static parser `select` method. """ - static_content = await self._static_parser.select(await self._get_latest_parsed_content(), selector) - if static_content is not None: + if matches := await self.query_selector_all(selector=selector, timeout=timeout): + return matches[0] + return None + + async def query_selector_all( + self, selector: str, timeout: timedelta = timedelta(seconds=5) + ) -> tuple[TStaticSelectResult, ...]: + """Locate element by css selector and return all elements found. + + If element is not found within timeout, `TimeoutError` is raised. + + Args: + selector: Css selector to be used to locate specific element on page. + timeout: Timeout that defines how long the function wait for the selector to appear. + + Returns: + List of results of used static parser `select` method. + """ + if static_content := await self._static_parser.select(await self.parse_with_static_parser(), selector): + # Selector found in static content. return static_content locator = self.page.locator(selector) - await locator.wait_for(timeout=timeout.total_seconds() * 1000) + try: + await locator.wait_for(timeout=timeout.total_seconds() * 1000) + except PlaywrightTimeoutError: + # Selector not found at all. + return () parsed_selector = await self._static_parser.select( await self._static_parser.parse_text(await locator.evaluate('el => el.outerHTML')), selector ) if parsed_selector is not None: + # Selector found by browser after some wait time and selected by static parser. return parsed_selector - raise AdaptiveContextError('Used selector is not a valid static selector') + + # Selector found by browser after some wait time, but could not be selected by static parser. 
+ raise AdaptiveContextError( + 'Element exists on the page and Playwright was able to locate it, but the static content parser of selected' + 'static crawler does support such selector.' + ) async def parse_with_static_parser( self, selector: str | None = None, timeout: timedelta = timedelta(seconds=5) @@ -137,7 +163,9 @@ async def parse_with_static_parser( """ if selector: await self.wait_for_selector(selector, timeout) - return await self._static_parser.parse_text(await self.page.content()) + if self._page: + return await self._static_parser.parse_text(await self.page.content()) + return self.parsed_content @classmethod def from_parsed_http_crawling_context( diff --git a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py index 7fd24651a3..87b056367d 100644 --- a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py +++ b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py @@ -32,8 +32,8 @@ def is_matching_selector(self, parsed_content: Tag, selector: str) -> bool: return parsed_content.select_one(selector) is not None @override - async def select(self, parsed_content: Tag, selector: str) -> Tag | None: - return parsed_content.select_one(selector) + async def select(self, parsed_content: Tag, selector: str) -> tuple[Tag, ...]: + return tuple(match for match in parsed_content.select(selector)) @override def find_links(self, parsed_content: Tag, selector: str) -> Iterable[str]: diff --git a/src/crawlee/crawlers/_http/_http_parser.py b/src/crawlee/crawlers/_http/_http_parser.py index 68578f6638..75fa534af1 100644 --- a/src/crawlee/crawlers/_http/_http_parser.py +++ b/src/crawlee/crawlers/_http/_http_parser.py @@ -28,7 +28,7 @@ async def parse_text(self, text: str) -> bytes: raise NotImplementedError @override - async def select(self, parsed_content: bytes, selector: str) -> bytes | None: + async def select(self, parsed_content: bytes, selector: str) -> tuple[bytes, ...]: raise NotImplementedError @override diff --git a/src/crawlee/crawlers/_parsel/_parsel_parser.py b/src/crawlee/crawlers/_parsel/_parsel_parser.py index 27fef4f00e..2bb9b1056c 100644 --- a/src/crawlee/crawlers/_parsel/_parsel_parser.py +++ b/src/crawlee/crawlers/_parsel/_parsel_parser.py @@ -26,10 +26,8 @@ async def parse_text(self, text: str) -> Selector: return Selector(text=text) @override - async def select(self, parsed_content: Selector, selector: str) -> Selector | None: - if selector_list := parsed_content.css(selector): - return selector_list[0] - return None + async def select(self, parsed_content: Selector, selector: str) -> tuple[Selector, ...]: + return tuple(match for match in parsed_content.css(selector)) @override def is_matching_selector(self, parsed_content: Selector, selector: str) -> bool: diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index 79dc837890..1615a97dc7 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -582,7 +582,7 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: async def test_adaptive_context_query_selector_beautiful_soup(test_urls: list[str]) -> None: - """Test that `context.query_selector` works regardless of the crawl type for BeautifulSoup variant. 
+ """Test that `context.query_selector_one` works regardless of the crawl type for BeautifulSoup variant. Handler tries to locate two elements h1 and h2. h1 exists immediately, h2 is created dynamically by inline JS snippet embedded in the html. @@ -605,9 +605,9 @@ async def test_adaptive_context_query_selector_beautiful_soup(test_urls: list[st @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: - h1 = await context.query_selector('h1', timedelta(milliseconds=_INJECTED_JS_DELAY_MS * 2)) + h1 = await context.query_selector_one('h1', timedelta(milliseconds=_INJECTED_JS_DELAY_MS * 2)) mocked_h1_handler(h1) - h2 = await context.query_selector('h2', timedelta(milliseconds=_INJECTED_JS_DELAY_MS * 2)) + h2 = await context.query_selector_one('h2', timedelta(milliseconds=_INJECTED_JS_DELAY_MS * 2)) mocked_h2_handler(h2) await crawler.run(test_urls[:1]) @@ -625,7 +625,7 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: async def test_adaptive_context_query_selector_parsel(test_urls: list[str]) -> None: - """Test that `context.query_selector` works regardless of the crawl type for Parsel variant. + """Test that `context.query_selector_one` works regardless of the crawl type for Parsel variant. Handler tries to locate two elements h1 and h2. h1 exists immediately, h2 is created dynamically by inline JS snippet embedded in the html. @@ -650,10 +650,10 @@ async def test_adaptive_context_query_selector_parsel(test_urls: list[str]) -> N @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: - h1 = await context.query_selector('h1', timedelta(milliseconds=_INJECTED_JS_DELAY_MS * 2)) - mocked_h1_handler(type(h1), h1.get()) - h2 = await context.query_selector('h2', timedelta(milliseconds=_INJECTED_JS_DELAY_MS * 2)) - mocked_h2_handler(type(h2), h2.get()) + if h1 := await context.query_selector_one('h1', timedelta(milliseconds=_INJECTED_JS_DELAY_MS * 2)): + mocked_h1_handler(type(h1), h1.get()) + if h2 := await context.query_selector_one('h2', timedelta(milliseconds=_INJECTED_JS_DELAY_MS * 2)): + mocked_h2_handler(type(h2), h2.get()) await crawler.run(test_urls[:1]) @@ -723,9 +723,33 @@ async def test_adaptive_context_helpers_on_changed_selector(test_urls: list[str] @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: - await context.query_selector('h2') # Wait for change that is indicated by appearance of h2 element. - mocked_h3_handler((await context.query_selector('h3')).get()) # Get updated h3 element. + await context.query_selector_one('h2') # Wait for change that is indicated by appearance of h2 element. + if h3 := await context.query_selector_one('h3'): + mocked_h3_handler(h3.get()) # Get updated h3 element. 
await crawler.run(test_urls[:1]) mocked_h3_handler.assert_called_once_with(expected_h3_tag) + + +async def test_adaptive_context_query_non_existing_element(test_urls: list[str]) -> None: + """Test that querying non-existing selector returns `None`""" + browser_only_predictor_no_detection = _SimpleRenderingTypePredictor( + rendering_types=cycle(['client only']), detection_probability_recommendation=cycle([0]) + ) + + crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser( + max_request_retries=1, + rendering_type_predictor=browser_only_predictor_no_detection, + playwright_crawler_specific_kwargs={'browser_pool': _StaticRedirectBrowserPool.with_default_plugin()}, + ) + + mocked_h3_handler = Mock() + + @crawler.router.default_handler + async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: + mocked_h3_handler(await context.query_selector_one('non sense selector', timeout=timedelta(milliseconds=1))) + + await crawler.run(test_urls[:1]) + + mocked_h3_handler.assert_called_once_with(None) From 760f2728f6808611c11960c6826faff5ef575530 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 13 Feb 2025 17:20:48 +0100 Subject: [PATCH 79/79] Update type hint to not brak docusaurus --- docs/guides/playwright_crawler_adaptive.mdx | 4 +++- src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py | 4 ++-- .../_adaptive_playwright_crawling_context.py | 4 ++-- src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py | 4 ++-- src/crawlee/crawlers/_http/_http_parser.py | 4 ++-- src/crawlee/crawlers/_parsel/_parsel_parser.py | 4 ++-- 6 files changed, 13 insertions(+), 11 deletions(-) diff --git a/docs/guides/playwright_crawler_adaptive.mdx b/docs/guides/playwright_crawler_adaptive.mdx index 58f1fbdc38..0beb81954f 100644 --- a/docs/guides/playwright_crawler_adaptive.mdx +++ b/docs/guides/playwright_crawler_adaptive.mdx @@ -34,7 +34,9 @@ Request handler for `AdaptivePlayw `wait_for_selector` accepts `css` selector as first argument and timeout as second argument. The function will try to locate this selector a return once it is found(within timeout). In practice this means that if HTTP-based sub crawler was used, the function will find the selector only if it is part of the static content. If not, the adaptive crawler will fall back to the playwright sub crawler and will wait try to locate the selector within the timeout using playwright. -`query_selector` accepts `css` selector as first argument and timeout as second argument. This function acts similar to `wait_for_selector`, but it also returns this selector if found. Return value type is determined by used HTTP-based sub crawler. For example, it will be `Selector` for `ParselCrawler` and `Tag` for `BeautifulSoupCrawler`. +`query_selector_one` accepts `css` selector as first argument and timeout as second argument. This function acts similar to `wait_for_selector`, but it also returns one selector if any selector is found. Return value type is determined by used HTTP-based sub crawler. For example, it will be `Selector` for `ParselCrawler` and `Tag` for `BeautifulSoupCrawler`. + +`query_selector_all` same as `query_selector_one`, but returns all found selectors. `parse_with_static_parser` will re-parse the whole page. Return value type is determined by used HTTP-based sub crawler. It has optional arguments: `selector` and `timeout`. If those optional arguments are used then the function first calls `wait_for_selector` and then do the parsing. 
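To make the four helpers described in the guide above concrete, a hedged sketch of a handler exercising them (with a Parsel static parser, so matches are `Selector` objects; the CSS selectors below are illustrative placeholders) might read:

```python
from datetime import timedelta

from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext

crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(max_requests_per_crawl=10)


@crawler.router.default_handler
async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
    # First match or `None`; falls back to Playwright only when the static
    # content does not contain the selector.
    heading = await context.query_selector_one('h2', timedelta(seconds=5))

    # All matches; an empty sequence means the selector was never found.
    rows = await context.query_selector_all('.result-row')

    # Re-parse the whole page once the element signalling completeness appears.
    page = await context.parse_with_static_parser(selector='#content-loaded')

    context.log.info(
        f'{context.request.url}: heading={heading.get() if heading else None}, '
        f'rows={len(rows)}, title={page.css("title").get()}'
    )
```

As implemented in this patch, the two query helpers degrade to `None` or an empty sequence when the selector never appears, while `wait_for_selector` and `parse_with_static_parser` raise a timeout error instead.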
This can be used in scenario where some specific element can signal, that page is already complete. diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py index 5281838181..c8c77965ef 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py @@ -10,7 +10,7 @@ from ._http_crawling_context import TParseResult, TSelectResult if TYPE_CHECKING: - from collections.abc import Iterable + from collections.abc import Iterable, Sequence from crawlee.http_clients import HttpResponse @@ -42,7 +42,7 @@ async def parse_text(self, text: str) -> TParseResult: """ @abstractmethod - async def select(self, parsed_content: TParseResult, selector: str) -> tuple[TSelectResult, ...]: + async def select(self, parsed_content: TParseResult, selector: str) -> Sequence[TSelectResult]: """Use css selector to select page element and return it. Args: diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py index c0450410ed..ea39b13402 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py @@ -16,7 +16,7 @@ ) if TYPE_CHECKING: - from collections.abc import Awaitable, Callable + from collections.abc import Awaitable, Callable, Sequence from playwright.async_api import Page, Response from typing_extensions import Self @@ -111,7 +111,7 @@ async def query_selector_one( async def query_selector_all( self, selector: str, timeout: timedelta = timedelta(seconds=5) - ) -> tuple[TStaticSelectResult, ...]: + ) -> Sequence[TStaticSelectResult]: """Locate element by css selector and return all elements found. If element is not found within timeout, `TimeoutError` is raised. 
diff --git a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py index 87b056367d..f0543b9a8e 100644 --- a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py +++ b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py @@ -8,7 +8,7 @@ from crawlee.crawlers._abstract_http import AbstractHttpParser if TYPE_CHECKING: - from collections.abc import Iterable + from collections.abc import Iterable, Sequence from crawlee.http_clients import HttpResponse @@ -32,7 +32,7 @@ def is_matching_selector(self, parsed_content: Tag, selector: str) -> bool: return parsed_content.select_one(selector) is not None @override - async def select(self, parsed_content: Tag, selector: str) -> tuple[Tag, ...]: + async def select(self, parsed_content: Tag, selector: str) -> Sequence[Tag]: return tuple(match for match in parsed_content.select(selector)) @override diff --git a/src/crawlee/crawlers/_http/_http_parser.py b/src/crawlee/crawlers/_http/_http_parser.py index 75fa534af1..0a9af538dc 100644 --- a/src/crawlee/crawlers/_http/_http_parser.py +++ b/src/crawlee/crawlers/_http/_http_parser.py @@ -8,7 +8,7 @@ from crawlee.crawlers._types import BlockedInfo if TYPE_CHECKING: - from collections.abc import Iterable + from collections.abc import Iterable, Sequence from crawlee.http_clients import HttpResponse @@ -28,7 +28,7 @@ async def parse_text(self, text: str) -> bytes: raise NotImplementedError @override - async def select(self, parsed_content: bytes, selector: str) -> tuple[bytes, ...]: + async def select(self, parsed_content: bytes, selector: str) -> Sequence[bytes]: raise NotImplementedError @override diff --git a/src/crawlee/crawlers/_parsel/_parsel_parser.py b/src/crawlee/crawlers/_parsel/_parsel_parser.py index 2bb9b1056c..9baa1eba7c 100644 --- a/src/crawlee/crawlers/_parsel/_parsel_parser.py +++ b/src/crawlee/crawlers/_parsel/_parsel_parser.py @@ -9,7 +9,7 @@ from crawlee.crawlers._abstract_http import AbstractHttpParser if TYPE_CHECKING: - from collections.abc import Iterable + from collections.abc import Iterable, Sequence from crawlee.http_clients import HttpResponse @@ -26,7 +26,7 @@ async def parse_text(self, text: str) -> Selector: return Selector(text=text) @override - async def select(self, parsed_content: Selector, selector: str) -> tuple[Selector, ...]: + async def select(self, parsed_content: Selector, selector: str) -> Sequence[Selector]: return tuple(match for match in parsed_content.css(selector)) @override
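Since these final hunks settle the parser contract on `select` returning a sequence of matches rather than a single optional element, a self-contained sketch of that convention (using Parsel directly, outside crawlee's classes) may help when reviewing the call sites above:

```python
import asyncio
from collections.abc import Sequence

from parsel import Selector


async def select(parsed_content: Selector, selector: str) -> Sequence[Selector]:
    # Return every match; an empty tuple means "not found", and callers that
    # only need one element simply take the first item.
    return tuple(parsed_content.css(selector))


async def main() -> None:
    page = Selector(text='<html><body><h2>a</h2><h2>b</h2></body></html>')
    matches = await select(page, 'h2')
    print([m.get() for m in matches])  # Both <h2> elements.
    print(matches[0].get() if matches else None)  # "First match" convention.


asyncio.run(main())
```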