diff --git a/docs/examples/code/adaptive_playwright_crawler.py b/docs/examples/code/adaptive_playwright_crawler.py index bf6145b585..add09041b2 100644 --- a/docs/examples/code/adaptive_playwright_crawler.py +++ b/docs/examples/code/adaptive_playwright_crawler.py @@ -1,4 +1,5 @@ import asyncio +from datetime import timedelta from playwright.async_api import Route @@ -10,40 +11,43 @@ async def main() -> None: + # Crawler created by following factory method will use `beautifulsoup` + # for parsing static content. crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( max_requests_per_crawl=5, playwright_crawler_specific_kwargs={'headless': False} ) - @crawler.router.handler(label='label') + @crawler.router.default_handler async def request_handler_for_label( context: AdaptivePlaywrightCrawlingContext, ) -> None: - # Do some processing using `page` - some_locator = context.page.locator('div').first - await some_locator.wait_for() - # Do stuff with locator... - context.log.info(f'Playwright processing of: {context.request.url} ...') - - @crawler.router.default_handler - async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: - context.log.info(f'User handler processing: {context.request.url} ...') # Do some processing using `parsed_content` context.log.info(context.parsed_content.title) + # Locate element h2 within 5 seconds + h2 = await context.query_selector_one('h2', timedelta(milliseconds=5000)) + # Do stuff with element found by the selector + context.log.info(h2) + # Find more links and enqueue them. await context.enqueue_links() - await context.push_data({'Top crawler Url': context.request.url}) + # Save some data. + await context.push_data({'Visited url': context.request.url}) @crawler.pre_navigation_hook async def hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: - """Hook executed both in static sub crawler and playwright sub crawler.""" - # Trying to access context.page in this hook would raise `AdaptiveContextError` - # for pages crawled without playwright. + """Hook executed both in static sub crawler and playwright sub crawler. + + Trying to access `context.page` in this hook would raise `AdaptiveContextError` + for pages crawled without playwright.""" context.log.info(f'pre navigation hook for: {context.request.url} ...') @crawler.pre_navigation_hook(playwright_only=True) async def hook_playwright(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: - """Hook executed only in playwright sub crawler.""" + """Hook executed only in playwright sub crawler. + + It is safe to access `page` object. + """ async def some_routing_function(route: Route) -> None: await route.continue_() diff --git a/docs/examples/playwright_crawler_adaptive.mdx b/docs/examples/playwright_crawler_adaptive.mdx new file mode 100644 index 0000000000..a611251216 --- /dev/null +++ b/docs/examples/playwright_crawler_adaptive.mdx @@ -0,0 +1,20 @@ +--- +id: adaptive-playwright-crawler +title: AdaptivePlaywrightCrawler +--- + +import ApiLink from '@site/src/components/ApiLink'; +import CodeBlock from '@theme/CodeBlock'; + +import AdaptivePlaywrightCrawlerExample from '!!raw-loader!./code/adaptive_playwright_crawler.py'; + +This example demonstrates how to use `AdaptivePlaywrightCrawler`. An `AdaptivePlaywrightCrawler` is a combination of `PlaywrightCrawler` and some implementation of HTTP-based crawler such as `ParselCrawler` or `BeautifulSoupCrawler`. 
+It uses a more limited crawling context interface so that it is able to switch to HTTP-only crawling when it detects that it may bring a performance benefit. + +A [pre-navigation hook](/python/docs/guides/adaptive-playwright-crawler#page-configuration-with-pre-navigation-hooks) can be used to perform actions before navigating to the URL. This hook provides further flexibility in controlling the environment and preparing for navigation. Hooks will be executed both for pages crawled by the HTTP-based sub crawler and for pages crawled by the playwright based sub crawler. Use `playwright_only=True` to mark hooks that should be executed only for the playwright sub crawler. + +For a more detailed description, please see the [AdaptivePlaywrightCrawler guide](/python/docs/guides/adaptive-playwright-crawler 'AdaptivePlaywrightCrawler guide'). + +<CodeBlock className="language-python"> + {AdaptivePlaywrightCrawlerExample} +</CodeBlock> diff --git a/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_handler.py b/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_handler.py new file mode 100644 index 0000000000..5dc12e54ef --- /dev/null +++ b/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_handler.py @@ -0,0 +1,13 @@ +from datetime import timedelta + +from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext + +crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser() + + +@crawler.router.default_handler +async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: + # Locate element h2 within 5 seconds + h2 = await context.query_selector_one('h2', timedelta(milliseconds=5000)) + # Do stuff with element found by the selector + context.log.info(h2) diff --git a/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_beautifulsoup.py b/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_beautifulsoup.py new file mode 100644 index 0000000000..b3ed447f2a --- /dev/null +++ b/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_beautifulsoup.py @@ -0,0 +1,10 @@ +from crawlee.crawlers import AdaptivePlaywrightCrawler + +crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( + # Arguments relevant only for PlaywrightCrawler + playwright_crawler_specific_kwargs={'headless': False, 'browser_type': 'chromium'}, + # Arguments relevant only for BeautifulSoupCrawler + static_crawler_specific_kwargs={'additional_http_error_status_codes': [204]}, + # Common arguments relevant to all crawlers + max_crawl_depth=5, +) diff --git a/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_parsel.py b/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_parsel.py new file mode 100644 index 0000000000..76010fe224 --- /dev/null +++ b/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_parsel.py @@ -0,0 +1,10 @@ +from crawlee.crawlers import AdaptivePlaywrightCrawler + +crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser( + # Arguments relevant only for PlaywrightCrawler + playwright_crawler_specific_kwargs={'headless': False, 'browser_type': 'chromium'}, + # Arguments relevant only for ParselCrawler + static_crawler_specific_kwargs={'additional_http_error_status_codes': [204]}, + # Common arguments relevant to all crawlers + max_crawl_depth=5, +) diff --git a/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_prediction.py
b/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_prediction.py new file mode 100644 index 0000000000..f1d8ec1001 --- /dev/null +++ b/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_prediction.py @@ -0,0 +1,61 @@ +from crawlee import Request +from crawlee._types import RequestHandlerRunResult +from crawlee.crawlers import ( + AdaptivePlaywrightCrawler, + RenderingType, + RenderingTypePrediction, + RenderingTypePredictor, +) + + +class CustomRenderingTypePredictor(RenderingTypePredictor): + def __init__(self) -> None: + self._learning_data = list[tuple[Request, RenderingType]]() + + def predict(self, request: Request) -> RenderingTypePrediction: + # Some custom logic that produces some `RenderingTypePrediction` + # based on the `request` input. + rendering_type: RenderingType = ( + 'static' if 'abc' in request.url else 'client only' + ) + + return RenderingTypePrediction( + # Recommends `static` rendering type -> HTTP-based sub crawler will be used. + rendering_type=rendering_type, + # Recommends that both sub crawlers should run with 20% chance. When both sub + # crawlers are running, the predictor can compare results and learn. + # High number means that predictor is not very confident about the + # `rendering_type`, low number means that predictor is very confident. + detection_probability_recommendation=0.2, + ) + + def store_result(self, request: Request, rendering_type: RenderingType) -> None: + # This function allows predictor to store new learning data and retrain itself + # if needed. `request` is input for prediction and `rendering_type` is the correct + # prediction. + self._learning_data.append((request, rendering_type)) + # retrain + + +def result_checker(result: RequestHandlerRunResult) -> bool: + # Some function that inspects produced `result` and returns `True` if the result + # is correct. + return bool(result) # Check something on result + + +def result_comparator( + result_1: RequestHandlerRunResult, result_2: RequestHandlerRunResult +) -> bool: + # Some function that inspects two results and returns `True` if they are + # considered equivalent. It is used when comparing results produced by HTTP-based + # sub crawler and playwright based sub crawler. + return ( + result_1.push_data_calls == result_2.push_data_calls + ) # For example compare `push_data` calls. + + +crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser( + rendering_type_predictor=CustomRenderingTypePredictor(), + result_checker=result_checker, + result_comparator=result_comparator, +) diff --git a/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_pre_nav_hooks.py b/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_pre_nav_hooks.py new file mode 100644 index 0000000000..656997576d --- /dev/null +++ b/docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_pre_nav_hooks.py @@ -0,0 +1,29 @@ +from playwright.async_api import Route + +from crawlee.crawlers import ( + AdaptivePlaywrightCrawler, + AdaptivePlaywrightPreNavCrawlingContext, +) + +crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser() + + +@crawler.pre_navigation_hook +async def hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: + """Hook executed both in static sub crawler and playwright sub crawler. 
+ + Trying to access `context.page` in this hook would raise `AdaptiveContextError` + for pages crawled without playwright.""" + + context.log.info(f'pre navigation hook for: {context.request.url}') + + +@crawler.pre_navigation_hook(playwright_only=True) +async def hook_playwright(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: + """Hook executed only in playwright sub crawler.""" + + async def some_routing_function(route: Route) -> None: + await route.continue_() + + await context.page.route('*/**', some_routing_function) + context.log.info(f'Playwright only pre navigation hook for: {context.request.url}') diff --git a/docs/guides/playwright_crawler_adaptive.mdx b/docs/guides/playwright_crawler_adaptive.mdx new file mode 100644 index 0000000000..0beb81954f --- /dev/null +++ b/docs/guides/playwright_crawler_adaptive.mdx @@ -0,0 +1,96 @@ +--- +id: adaptive-playwright-crawler +title: AdaptivePlaywrightCrawler +description: How to use the AdaptivePlaywrightCrawler. +--- + +import ApiLink from '@site/src/components/ApiLink'; +import CodeBlock from '@theme/CodeBlock'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +import AdaptivePlaywrightCrawlerInitBeautifulSoup from '!!raw-loader!./code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_beautifulsoup.py'; +import AdaptivePlaywrightCrawlerInitParsel from '!!raw-loader!./code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_parsel.py'; +import AdaptivePlaywrightCrawlerInitPrediction from '!!raw-loader!./code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_prediction.py'; +import AdaptivePlaywrightCrawlerHandler from '!!raw-loader!./code/adaptive_playwright_crawler/adaptive_playwright_crawler_handler.py'; +import AdaptivePlaywrightCrawlerPreNavHooks from '!!raw-loader!./code/adaptive_playwright_crawler/adaptive_playwright_crawler_pre_nav_hooks.py'; + + + +An `AdaptivePlaywrightCrawler` is a combination of `PlaywrightCrawler` and some implementation of HTTP-based crawler such as `ParselCrawler` or `BeautifulSoupCrawler`. +It uses a more limited crawling context interface so that it is able to switch to HTTP-only crawling when it detects that it may bring a performance benefit. + +Detection is based on the `RenderingTypePredictor`, with the default implementation `DefaultRenderingTypePredictor`. It predicts which crawling method should be used and learns from already crawled pages. + +## When to use AdaptivePlaywrightCrawler + +Use `AdaptivePlaywrightCrawler` in scenarios where some target pages have to be crawled with `PlaywrightCrawler`, but for others a faster HTTP-based crawler is sufficient. This way, you can achieve lower costs when crawling multiple different websites. + +Another use case is performing selector-based data extraction without prior knowledge of whether the selector exists in the static page or is dynamically added by code executed in a browsing client. + +## Request handler and adaptive context helpers + +The request handler for `AdaptivePlaywrightCrawler` works with a special context type - `AdaptivePlaywrightCrawlingContext`. This context is sometimes created by the HTTP-based sub crawler and sometimes by the playwright based sub crawler. Due to its dynamic nature, you can't always access the [page](https://playwright.dev/python/docs/api/class-page) object. To overcome this limitation, there are four helper methods on this context that can be called regardless of how the context was created.
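A rough sketch of a handler that combines these helpers is shown below; this is an illustrative sketch only, assuming the Parsel variant, and the selectors `h1`, `h2`, and `a` are placeholder values. Each helper is then described in detail.

```python
from datetime import timedelta

from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext

crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser()


@crawler.router.default_handler
async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
    # Block until the selector appears, either in the static content or, after the
    # fallback to the playwright sub crawler, in the browser-rendered page.
    await context.wait_for_selector('h1', timedelta(seconds=5))

    # First matching element; a `parsel.Selector` when the Parsel variant is used.
    h1 = await context.query_selector_one('h1', timedelta(seconds=5))
    if h1 is not None:
        context.log.info(h1.get())

    # All matching elements.
    links = await context.query_selector_all('a', timedelta(seconds=5))
    context.log.info(f'Found {len(links)} links')

    # Re-parse the whole page with the static parser once `h2` is present.
    parsed = await context.parse_with_static_parser(selector='h2', timeout=timedelta(seconds=5))
    context.log.info(parsed.css('title::text').get())
```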
+ +`wait_for_selector` accepts a `css` selector as the first argument and a timeout as the second argument. The function will try to locate this selector and return once it is found (within the timeout). In practice this means that if the HTTP-based sub crawler was used, the function will find the selector only if it is part of the static content. If not, the adaptive crawler will fall back to the playwright sub crawler and will try to locate the selector within the timeout using playwright. + +`query_selector_one` accepts a `css` selector as the first argument and a timeout as the second argument. This function acts similarly to `wait_for_selector`, but it also returns the first matching element if any is found. The return value type is determined by the HTTP-based sub crawler used. For example, it will be `Selector` for `ParselCrawler` and `Tag` for `BeautifulSoupCrawler`. + +`query_selector_all` is the same as `query_selector_one`, but returns all matching elements. + +`parse_with_static_parser` will re-parse the whole page. The return value type is determined by the HTTP-based sub crawler used. It has optional arguments: `selector` and `timeout`. If those optional arguments are used, the function first calls `wait_for_selector` and then does the parsing. This can be used in a scenario where a specific element signals that the page is complete. + +See the following example of how to create a request handler and use the context helpers: + +<CodeBlock className="language-python"> + {AdaptivePlaywrightCrawlerHandler} +</CodeBlock> + +## Crawler configuration +To use `AdaptivePlaywrightCrawler` it is recommended to use one of the prepared factory methods that will create the crawler with a specific HTTP-based sub crawler variant: `AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser` or `AdaptivePlaywrightCrawler.with_parsel_static_parser`. + +`AdaptivePlaywrightCrawler` is internally composed of two sub crawlers, and you can configure both of them in detail. For detailed configuration options of the sub crawlers, please refer to their pages: `PlaywrightCrawler`, `ParselCrawler`, `BeautifulSoupCrawler`. + +In the following example you can see how to create and configure `AdaptivePlaywrightCrawler` with two different HTTP-based sub crawlers: + +<Tabs> + <TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler" default> + <CodeBlock className="language-python"> + {AdaptivePlaywrightCrawlerInitBeautifulSoup} + </CodeBlock> + </TabItem> + <TabItem value="ParselCrawler" label="ParselCrawler"> + <CodeBlock className="language-python"> + {AdaptivePlaywrightCrawlerInitParsel} + </CodeBlock> + </TabItem> +</Tabs> + +### Prediction related arguments + +To control which pages are crawled by which method, you can use the following arguments: + +`RenderingTypePredictor` - A class that can give recommendations about which sub crawler should be used for a specific url. The predictor will also recommend using both sub crawlers for some pages from time to time, to check that the given recommendation was correct. The predictor should be able to learn from previous results and gradually give more reliable recommendations. + +`result_checker` - A function that checks the result created from crawling a page. By default, it always returns `True`. + +`result_comparator` - A function that compares two results (the HTTP-based sub crawler result and the playwright based sub crawler result) and returns `True` if they are considered the same. By default, this function compares the calls of the context helper `push_data` by each sub crawler. This function is used by `rendering_type_predictor` to evaluate whether the HTTP-based crawler has the same results as the playwright based sub crawler.
+ +See the following example of how to pass prediction related arguments: + +<CodeBlock className="language-python"> + {AdaptivePlaywrightCrawlerInitPrediction} +</CodeBlock> + + +## Page configuration with pre-navigation hooks +In some use cases, you may need to configure the [page](https://playwright.dev/python/docs/api/class-page) before it navigates to the target URL. For instance, you might set navigation timeouts or manipulate other page-level settings. For such cases you can use the `pre_navigation_hook` method of the `AdaptivePlaywrightCrawler`. The registered hook is called before the page navigates to the target URL and allows you to configure the page instance. Due to the dynamic nature of `AdaptivePlaywrightCrawler`, the hook may be executed by either the HTTP-based sub crawler or the playwright-based sub crawler. Accessing the [page](https://playwright.dev/python/docs/api/class-page) object in a hook executed by the HTTP-based sub crawler will raise an exception. To overcome this, you can use the optional argument `playwright_only=True` when registering the hook. + +See the following example of how to register pre-navigation hooks: + +<CodeBlock className="language-python"> + {AdaptivePlaywrightCrawlerPreNavHooks} +</CodeBlock> diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py index 6df0983863..7c270c3744 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py @@ -16,7 +16,7 @@ from crawlee.http_clients import HttpxHttpClient from crawlee.statistics import StatisticsState -from ._http_crawling_context import HttpCrawlingContext, ParsedHttpCrawlingContext, TParseResult +from ._http_crawling_context import HttpCrawlingContext, ParsedHttpCrawlingContext, TParseResult, TSelectResult if TYPE_CHECKING: from collections.abc import AsyncGenerator, Awaitable, Iterable @@ -53,7 +53,7 @@ class HttpCrawlerOptions( @docs_group('Abstract classes') class AbstractHttpCrawler( - Generic[TCrawlingContext, TParseResult], BasicCrawler[TCrawlingContext, StatisticsState], ABC + Generic[TCrawlingContext, TParseResult, TSelectResult], BasicCrawler[TCrawlingContext, StatisticsState], ABC ): """A web crawler for performing HTTP requests. @@ -72,7 +72,7 @@ class AbstractHttpCrawler( def __init__( self, *, - parser: AbstractHttpParser[TParseResult], + parser: AbstractHttpParser[TParseResult, TSelectResult], additional_http_error_status_codes: Iterable[int] = (), ignore_http_error_status_codes: Iterable[int] = (), **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]], @@ -100,8 +100,8 @@ def __init__( @classmethod def create_parsed_http_crawler_class( cls, - static_parser: AbstractHttpParser[TParseResult], - ) -> type[AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult]]: + static_parser: AbstractHttpParser[TParseResult, TSelectResult], + ) -> type[AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult, TSelectResult]]: """Convenience class factory that creates specific version of `AbstractHttpCrawler` class. In general typing sense two generic types of `AbstractHttpCrawler` do not have to be dependent on each other. @@ -109,10 +109,12 @@ def create_parsed_http_crawler_class( parameters in `AbstractHttpCrawler`.
""" - class _ParsedHttpCrawler(AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult]): + class _ParsedHttpCrawler( + AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult, TSelectResult] + ): def __init__( self, - parser: AbstractHttpParser[TParseResult] = static_parser, + parser: AbstractHttpParser[TParseResult, TSelectResult] = static_parser, **kwargs: Unpack[HttpCrawlerOptions[ParsedHttpCrawlingContext[TParseResult]]], ) -> None: kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline() diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py index f6fba50ad6..c8c77965ef 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py @@ -7,16 +7,16 @@ from crawlee._utils.docs import docs_group from crawlee.crawlers._types import BlockedInfo -from ._http_crawling_context import TParseResult +from ._http_crawling_context import TParseResult, TSelectResult if TYPE_CHECKING: - from collections.abc import Iterable + from collections.abc import Iterable, Sequence from crawlee.http_clients import HttpResponse @docs_group('Abstract classes') -class AbstractHttpParser(Generic[TParseResult], ABC): +class AbstractHttpParser(Generic[TParseResult, TSelectResult], ABC): """Parser used for parsing http response and inspecting parsed result to find links or detect blocking.""" @abstractmethod @@ -30,6 +30,29 @@ async def parse(self, response: HttpResponse) -> TParseResult: Parsed HTTP response. """ + @abstractmethod + async def parse_text(self, text: str) -> TParseResult: + """Parse text containing html. + + Args: + text: String containing html. + + Returns: + Parsed text. + """ + + @abstractmethod + async def select(self, parsed_content: TParseResult, selector: str) -> Sequence[TSelectResult]: + """Use css selector to select page element and return it. + + Args: + parsed_content: Content where the page element will be located. + selector: Css selector used to locate desired html element. + + Returns: + Selected element. + """ + def is_blocked(self, parsed_content: TParseResult) -> BlockedInfo: """Detect if blocked and return BlockedInfo with additional information. 
diff --git a/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py b/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py index 7475b85eb3..2278cb783d 100644 --- a/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py +++ b/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py @@ -10,6 +10,7 @@ from crawlee.http_clients import HttpCrawlingResult, HttpResponse TParseResult = TypeVar('TParseResult') +TSelectResult = TypeVar('TSelectResult') @dataclass(frozen=True) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index b550e7dce2..d0bbeff2b3 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -8,7 +8,7 @@ from random import random from typing import TYPE_CHECKING, Any, Generic, get_args -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, Tag from parsel import Selector from typing_extensions import Self, TypeVar, override @@ -58,6 +58,7 @@ TStaticParseResult = TypeVar('TStaticParseResult') +TStaticSelectResult = TypeVar('TStaticSelectResult') TStaticCrawlingContext = TypeVar('TStaticCrawlingContext', bound=ParsedHttpCrawlingContext) @@ -86,19 +87,46 @@ async def __aexit__( @docs_group('Classes') class AdaptivePlaywrightCrawler( - Generic[TStaticCrawlingContext, TStaticParseResult], + Generic[TStaticCrawlingContext, TStaticParseResult, TStaticSelectResult], BasicCrawler[AdaptivePlaywrightCrawlingContext, AdaptivePlaywrightCrawlerStatisticState], ): - """Adaptive crawler that uses both specific implementation of `AbstractHttpCrawler` and `PlaywrightCrawler`. + """An adaptive web crawler capable of using both static HTTP request based crawling and browser based crawling. - It tries to detect whether it is sufficient to crawl without browser (which is faster) or if - `PlaywrightCrawler` should be used (in case previous method did not work as expected for specific url.). + It uses a more limited crawling context interface so that it is able to switch to HTTP-only crawling when it detects + that it may bring a performance benefit. + It uses specific implementation of `AbstractHttpCrawler` and `PlaywrightCrawler`. + + ### Usage + ```python + from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext + + crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( + max_requests_per_crawl=5, playwright_crawler_specific_kwargs={'browser_type': 'chromium'} + ) + + @crawler.router.default_handler + async def request_handler_for_label(context: AdaptivePlaywrightCrawlingContext) -> None: + # Do some processing using `parsed_content` + context.log.info(context.parsed_content.title) + + # Locate element h2 within 5 seconds + h2 = await context.query_selector_one('h2', timedelta(milliseconds=5000)) + # Do stuff with element found by the selector + context.log.info(h2) + + # Find more links and enqueue them. + await context.enqueue_links() + # Save some data. 
+ await context.push_data({'Visited url': context.request.url}) + + await crawler.run(['https://crawlee.dev/']) + ``` """ def __init__( self, *, - static_parser: AbstractHttpParser[TStaticParseResult], + static_parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult], rendering_type_predictor: RenderingTypePredictor | None = None, result_checker: Callable[[RequestHandlerRunResult], bool] | None = None, result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None, @@ -197,13 +225,13 @@ def with_beautifulsoup_static_parser( playwright_crawler_specific_kwargs: _PlaywrightCrawlerAdditionalOptions | None = None, statistics: Statistics[StatisticsState] | None = None, **kwargs: Unpack[_BasicCrawlerOptions], - ) -> AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[BeautifulSoup], BeautifulSoup]: + ) -> AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[BeautifulSoup], BeautifulSoup, Tag]: """Creates `AdaptivePlaywrightCrawler` that uses `BeautifulSoup` for parsing static content.""" if statistics is not None: adaptive_statistics = statistics.replace_state_model(AdaptivePlaywrightCrawlerStatisticState) else: adaptive_statistics = Statistics(state_model=AdaptivePlaywrightCrawlerStatisticState) - return AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[BeautifulSoup], BeautifulSoup]( + return AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[BeautifulSoup], BeautifulSoup, Tag]( rendering_type_predictor=rendering_type_predictor, result_checker=result_checker, result_comparator=result_comparator, @@ -224,13 +252,13 @@ def with_parsel_static_parser( playwright_crawler_specific_kwargs: _PlaywrightCrawlerAdditionalOptions | None = None, statistics: Statistics[StatisticsState] | None = None, **kwargs: Unpack[_BasicCrawlerOptions], - ) -> AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[Selector], Selector]: + ) -> AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[Selector], Selector, Selector]: """Creates `AdaptivePlaywrightCrawler` that uses `Parcel` for parsing static content.""" if statistics is not None: adaptive_statistics = statistics.replace_state_model(AdaptivePlaywrightCrawlerStatisticState) else: adaptive_statistics = Statistics(state_model=AdaptivePlaywrightCrawlerStatisticState) - return AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[Selector], Selector]( + return AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[Selector], Selector, Selector]( rendering_type_predictor=rendering_type_predictor, result_checker=result_checker, result_comparator=result_comparator, @@ -302,7 +330,9 @@ def _pipeline_call_factory( async def from_static_pipeline_to_top_router( context: ParsedHttpCrawlingContext[TStaticParseResult], ) -> None: - adaptive_crawling_context = AdaptivePlaywrightCrawlingContext.from_parsed_http_crawling_context(context) + adaptive_crawling_context = AdaptivePlaywrightCrawlingContext.from_parsed_http_crawling_context( + context=context, parser=self._static_parser + ) await self.router(adaptive_crawling_context) return self._static_context_pipeline(context_linked_to_result, from_static_pipeline_to_top_router) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py index 64912e0434..ea39b13402 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py @@ -1,9 +1,10 @@ from 
__future__ import annotations from dataclasses import dataclass, fields -from typing import TYPE_CHECKING, Generic +from datetime import timedelta +from typing import TYPE_CHECKING, Generic, TypeVar -from typing_extensions import TypeVar +from playwright.async_api import TimeoutError as PlaywrightTimeoutError from crawlee import HttpHeaders from crawlee._types import BasicCrawlingContext @@ -15,7 +16,7 @@ ) if TYPE_CHECKING: - from collections.abc import Awaitable, Callable + from collections.abc import Awaitable, Callable, Sequence from playwright.async_api import Page, Response from typing_extensions import Self @@ -23,16 +24,20 @@ from crawlee.crawlers._playwright._types import BlockRequestsFunction -class AdaptiveContextError(RuntimeError): - pass +TStaticParseResult = TypeVar('TStaticParseResult') +TStaticSelectResult = TypeVar('TStaticSelectResult') -TStaticParseResult = TypeVar('TStaticParseResult') +class AdaptiveContextError(RuntimeError): + pass @dataclass(frozen=True) @docs_group('Data structures') -class AdaptivePlaywrightCrawlingContext(Generic[TStaticParseResult], ParsedHttpCrawlingContext[TStaticParseResult]): +class AdaptivePlaywrightCrawlingContext( + Generic[TStaticParseResult, TStaticSelectResult], ParsedHttpCrawlingContext[TStaticParseResult] +): + _static_parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult] """The crawling context used by `AdaptivePlaywrightCrawler`. It provides access to key objects as well as utility functions for handling crawling tasks. @@ -73,17 +78,108 @@ def response(self) -> Response: raise AdaptiveContextError('Page was not crawled with PlaywrightCrawler.') return self._response + async def wait_for_selector(self, selector: str, timeout: timedelta = timedelta(seconds=5)) -> None: + """Locate element by css selector and return `None` once it is found. + + If element is not found within timeout, `TimeoutError` is raised. + + Args: + selector: Css selector to be used to locate specific element on page. + timeout: Timeout that defines how long the function wait for the selector to appear. + """ + if await self._static_parser.select(await self.parse_with_static_parser(), selector): + return + await self.page.locator(selector).wait_for(timeout=timeout.total_seconds() * 1000) + + async def query_selector_one( + self, selector: str, timeout: timedelta = timedelta(seconds=5) + ) -> TStaticSelectResult | None: + """Locate element by css selector and return first element found. + + If element is not found within timeout, `TimeoutError` is raised. + + Args: + selector: Css selector to be used to locate specific element on page. + timeout: Timeout that defines how long the function wait for the selector to appear. + + Returns: + Result of used static parser `select` method. + """ + if matches := await self.query_selector_all(selector=selector, timeout=timeout): + return matches[0] + return None + + async def query_selector_all( + self, selector: str, timeout: timedelta = timedelta(seconds=5) + ) -> Sequence[TStaticSelectResult]: + """Locate element by css selector and return all elements found. + + If element is not found within timeout, `TimeoutError` is raised. + + Args: + selector: Css selector to be used to locate specific element on page. + timeout: Timeout that defines how long the function wait for the selector to appear. + + Returns: + List of results of used static parser `select` method. 
+ """ + if static_content := await self._static_parser.select(await self.parse_with_static_parser(), selector): + # Selector found in static content. + return static_content + + locator = self.page.locator(selector) + try: + await locator.wait_for(timeout=timeout.total_seconds() * 1000) + except PlaywrightTimeoutError: + # Selector not found at all. + return () + + parsed_selector = await self._static_parser.select( + await self._static_parser.parse_text(await locator.evaluate('el => el.outerHTML')), selector + ) + if parsed_selector is not None: + # Selector found by browser after some wait time and selected by static parser. + return parsed_selector + + # Selector found by browser after some wait time, but could not be selected by static parser. + raise AdaptiveContextError( + 'Element exists on the page and Playwright was able to locate it, but the static content parser of selected' + 'static crawler does support such selector.' + ) + + async def parse_with_static_parser( + self, selector: str | None = None, timeout: timedelta = timedelta(seconds=5) + ) -> TStaticParseResult: + """Parse whole page with static parser. If `selector` argument is used, wait for selector first. + + If element is not found within timeout, TimeoutError is raised. + + Args: + selector: css selector to be used to locate specific element on page. + timeout: timeout that defines how long the function wait for the selector to appear. + + Returns: + Result of used static parser `parse_text` method. + """ + if selector: + await self.wait_for_selector(selector, timeout) + if self._page: + return await self._static_parser.parse_text(await self.page.content()) + return self.parsed_content + @classmethod def from_parsed_http_crawling_context( - cls, context: ParsedHttpCrawlingContext[TStaticParseResult] - ) -> AdaptivePlaywrightCrawlingContext[TStaticParseResult]: + cls, + context: ParsedHttpCrawlingContext[TStaticParseResult], + parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult], + ) -> AdaptivePlaywrightCrawlingContext[TStaticParseResult, TStaticSelectResult]: """Convenience constructor that creates new context from existing `ParsedHttpCrawlingContext`.""" - return cls(**{field.name: getattr(context, field.name) for field in fields(context)}) + return cls(_static_parser=parser, **{field.name: getattr(context, field.name) for field in fields(context)}) @classmethod async def from_playwright_crawling_context( - cls, context: PlaywrightCrawlingContext, parser: AbstractHttpParser[TStaticParseResult] - ) -> Self: + cls, context: PlaywrightCrawlingContext, parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult] + ) -> AdaptivePlaywrightCrawlingContext[TStaticParseResult, TStaticSelectResult]: """Convenience constructor that creates new context from existing `PlaywrightCrawlingContext`.""" context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)} # Remove playwright specific attributes and pass them as private instead to be available as property. 
@@ -100,6 +196,7 @@ async def from_playwright_crawling_context( return cls( parsed_content=await parser.parse(http_response), http_response=http_response, + _static_parser=parser, **context_kwargs, ) diff --git a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py index 68c3b4bbad..b9bb33c21b 100644 --- a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +++ b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, Tag from crawlee._utils.docs import docs_group from crawlee.crawlers._abstract_http import AbstractHttpCrawler, HttpCrawlerOptions @@ -19,7 +19,7 @@ @docs_group('Classes') -class BeautifulSoupCrawler(AbstractHttpCrawler[BeautifulSoupCrawlingContext, BeautifulSoup]): +class BeautifulSoupCrawler(AbstractHttpCrawler[BeautifulSoupCrawlingContext, BeautifulSoup, Tag]): """A web crawler for performing HTTP requests and parsing HTML/XML content. The `BeautifulSoupCrawler` builds on top of the `AbstractHttpCrawler`, which means it inherits all of its features. diff --git a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py index 155498b5d1..f0543b9a8e 100644 --- a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py +++ b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py @@ -8,12 +8,12 @@ from crawlee.crawlers._abstract_http import AbstractHttpParser if TYPE_CHECKING: - from collections.abc import Iterable + from collections.abc import Iterable, Sequence from crawlee.http_clients import HttpResponse -class BeautifulSoupParser(AbstractHttpParser[BeautifulSoup]): +class BeautifulSoupParser(AbstractHttpParser[BeautifulSoup, Tag]): """Parser for parsing HTTP response using `BeautifulSoup`.""" def __init__(self, parser: BeautifulSoupParserType = 'lxml') -> None: @@ -24,11 +24,19 @@ async def parse(self, response: HttpResponse) -> BeautifulSoup: return BeautifulSoup(response.read(), features=self._parser) @override - def is_matching_selector(self, parsed_content: BeautifulSoup, selector: str) -> bool: + async def parse_text(self, text: str) -> BeautifulSoup: + return BeautifulSoup(text, features=self._parser) + + @override + def is_matching_selector(self, parsed_content: Tag, selector: str) -> bool: return parsed_content.select_one(selector) is not None @override - def find_links(self, parsed_content: BeautifulSoup, selector: str) -> Iterable[str]: + async def select(self, parsed_content: Tag, selector: str) -> Sequence[Tag]: + return tuple(match for match in parsed_content.select(selector)) + + @override + def find_links(self, parsed_content: Tag, selector: str) -> Iterable[str]: link: Tag urls: list[str] = [] for link in parsed_content.select(selector): diff --git a/src/crawlee/crawlers/_beautifulsoup/_utils.py b/src/crawlee/crawlers/_beautifulsoup/_utils.py index f92990ba9b..4c8719335e 100644 --- a/src/crawlee/crawlers/_beautifulsoup/_utils.py +++ b/src/crawlee/crawlers/_beautifulsoup/_utils.py @@ -17,7 +17,7 @@ from collections.abc import Iterable -def html_to_text(source: str | BeautifulSoup) -> str: +def html_to_text(source: str | Tag) -> str: """Converts markup string or `BeautifulSoup` to newline separated plain text without tags using BeautifulSoup. 
Args: diff --git a/src/crawlee/crawlers/_http/_http_crawler.py b/src/crawlee/crawlers/_http/_http_crawler.py index 1ee89861b9..282d9370b9 100644 --- a/src/crawlee/crawlers/_http/_http_crawler.py +++ b/src/crawlee/crawlers/_http/_http_crawler.py @@ -12,7 +12,7 @@ @docs_group('Classes') -class HttpCrawler(AbstractHttpCrawler[ParsedHttpCrawlingContext[bytes], bytes]): +class HttpCrawler(AbstractHttpCrawler[ParsedHttpCrawlingContext[bytes], bytes, bytes]): """Specific version of generic `AbstractHttpCrawler`. It uses a dummy parser that simply returns the HTTP response body as-is. Use this only if you know what you are diff --git a/src/crawlee/crawlers/_http/_http_parser.py b/src/crawlee/crawlers/_http/_http_parser.py index 90b8463fce..0a9af538dc 100644 --- a/src/crawlee/crawlers/_http/_http_parser.py +++ b/src/crawlee/crawlers/_http/_http_parser.py @@ -8,12 +8,12 @@ from crawlee.crawlers._types import BlockedInfo if TYPE_CHECKING: - from collections.abc import Iterable + from collections.abc import Iterable, Sequence from crawlee.http_clients import HttpResponse -class NoParser(AbstractHttpParser[bytes]): +class NoParser(AbstractHttpParser[bytes, bytes]): """Dummy parser for backwards compatibility. To enable using `HttpCrawler` without need for additional specific parser. @@ -23,6 +23,14 @@ class NoParser(AbstractHttpParser[bytes]): async def parse(self, response: HttpResponse) -> bytes: return response.read() + @override + async def parse_text(self, text: str) -> bytes: + raise NotImplementedError + + @override + async def select(self, parsed_content: bytes, selector: str) -> Sequence[bytes]: + raise NotImplementedError + @override def is_blocked(self, parsed_content: bytes) -> BlockedInfo: # Intentional unused argument. return BlockedInfo(reason='') diff --git a/src/crawlee/crawlers/_parsel/_parsel_crawler.py b/src/crawlee/crawlers/_parsel/_parsel_crawler.py index 4ed7cffac9..2769c6bd8e 100644 --- a/src/crawlee/crawlers/_parsel/_parsel_crawler.py +++ b/src/crawlee/crawlers/_parsel/_parsel_crawler.py @@ -19,7 +19,7 @@ @docs_group('Classes') -class ParselCrawler(AbstractHttpCrawler[ParselCrawlingContext, Selector]): +class ParselCrawler(AbstractHttpCrawler[ParselCrawlingContext, Selector, Selector]): """A web crawler for performing HTTP requests and parsing HTML/XML content. The `ParselCrawler` builds on top of the `AbstractHttpCrawler`, which means it inherits all of its features. 
diff --git a/src/crawlee/crawlers/_parsel/_parsel_parser.py b/src/crawlee/crawlers/_parsel/_parsel_parser.py index 4a61395111..9baa1eba7c 100644 --- a/src/crawlee/crawlers/_parsel/_parsel_parser.py +++ b/src/crawlee/crawlers/_parsel/_parsel_parser.py @@ -1,20 +1,34 @@ +from __future__ import annotations + import asyncio -from collections.abc import Iterable +from typing import TYPE_CHECKING from parsel import Selector from typing_extensions import override from crawlee.crawlers._abstract_http import AbstractHttpParser -from crawlee.http_clients import HttpResponse +if TYPE_CHECKING: + from collections.abc import Iterable, Sequence + + from crawlee.http_clients import HttpResponse -class ParselParser(AbstractHttpParser[Selector]): + +class ParselParser(AbstractHttpParser[Selector, Selector]): """Parser for parsing HTTP response using Parsel.""" @override async def parse(self, response: HttpResponse) -> Selector: return await asyncio.to_thread(lambda: Selector(body=response.read())) + @override + async def parse_text(self, text: str) -> Selector: + return Selector(text=text) + + @override + async def select(self, parsed_content: Selector, selector: str) -> Sequence[Selector]: + return tuple(match for match in parsed_content.css(selector)) + @override def is_matching_selector(self, parsed_content: Selector, selector: str) -> bool: return parsed_content.type in ('html', 'xml') and parsed_content.css(selector).get() is not None diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index d0d40531b6..1615a97dc7 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -10,6 +10,8 @@ import httpx import pytest +from bs4 import Tag +from parsel import Selector from typing_extensions import override from crawlee import Request @@ -40,6 +42,24 @@ from crawlee.browsers._types import CrawleePage from crawlee.proxy_configuration import ProxyInfo +_H1_TEXT = 'Static' +_H2_TEXT = 'Only in browser' +_H3_CHANGED_TEXT = 'Changed by JS' +_INJECTED_JS_DELAY_MS = 100 +_PAGE_CONTENT_STATIC = f""" +

<h1>{_H1_TEXT}</h1>
+<h3>Initial text</h3>
+<script>
+    setTimeout(function () {{
+        // Simulate client-side rendering: change the h3 text and add an h2 element after a delay.
+        document.querySelector('h3').textContent = '{_H3_CHANGED_TEXT}';
+        const h2 = document.createElement('h2');
+        h2.textContent = '{_H2_TEXT}';
+        document.body.appendChild(h2);
+    }}, {_INJECTED_JS_DELAY_MS});
+</script>
+ +""" + @pytest.fixture def test_urls(respx_mock: respx.MockRouter) -> list[str]: @@ -50,9 +70,7 @@ def test_urls(respx_mock: respx.MockRouter) -> list[str]: ] for url in urls: - respx_mock.get(url).return_value = httpx.Response( - status_code=200, content=b'What a body!' - ) + respx_mock.get(url).return_value = httpx.Response(status_code=200, content=_PAGE_CONTENT_STATIC.encode()) return urls @@ -69,9 +87,7 @@ async def new_page( crawlee_page = await super().new_page(page_id=page_id, browser_plugin=browser_plugin, proxy_info=proxy_info) await crawlee_page.page.route( '**/*', - lambda route: route.fulfill( - status=200, content_type='text/html', body='What a body!' - ), + lambda route: route.fulfill(status=200, content_type='text/html', body=_PAGE_CONTENT_STATIC), ) return crawlee_page @@ -514,7 +530,6 @@ async def test_adaptive_playwright_crawler_timeout_in_sub_crawler(test_urls: lis crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( max_request_retries=1, rendering_type_predictor=static_only_predictor_no_detection, - result_checker=lambda result: False, # noqa: ARG005 # Intentionally unused argument. request_handler_timeout=request_handler_timeout, playwright_crawler_specific_kwargs={'browser_pool': _StaticRedirectBrowserPool.with_default_plugin()}, ) @@ -564,3 +579,177 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: # First prediction should trigger rendering type detection as the predictor does not have any data for prediction. mocked_static_handler.assert_called_once_with() mocked_browser_handler.assert_called_once_with() + + +async def test_adaptive_context_query_selector_beautiful_soup(test_urls: list[str]) -> None: + """Test that `context.query_selector_one` works regardless of the crawl type for BeautifulSoup variant. + + Handler tries to locate two elements h1 and h2. + h1 exists immediately, h2 is created dynamically by inline JS snippet embedded in the html. + Create situation where page is crawled with static sub crawler first. + Static sub crawler should be able to locate only h1. It wil try to wait for h2, trying to wait for h2 will trigger + `AdaptiveContextError` which will force the adaptive crawler to try playwright sub crawler instead. 
Playwright sub + crawler is able to wait for the h2 element.""" + + # Get page with injected JS code that will add some element after timeout + static_only_predictor_no_detection = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])) + + crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( + max_request_retries=1, + rendering_type_predictor=static_only_predictor_no_detection, + playwright_crawler_specific_kwargs={'browser_pool': _StaticRedirectBrowserPool.with_default_plugin()}, + ) + + mocked_h1_handler = Mock() + mocked_h2_handler = Mock() + + @crawler.router.default_handler + async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: + h1 = await context.query_selector_one('h1', timedelta(milliseconds=_INJECTED_JS_DELAY_MS * 2)) + mocked_h1_handler(h1) + h2 = await context.query_selector_one('h2', timedelta(milliseconds=_INJECTED_JS_DELAY_MS * 2)) + mocked_h2_handler(h2) + + await crawler.run(test_urls[:1]) + + expected_h1_tag = Tag(name='h1') + expected_h1_tag.append(_H1_TEXT) + + expected_h2_tag = Tag(name='h2') + expected_h2_tag.append(_H2_TEXT) + + # Called by both sub crawlers + mocked_h1_handler.assert_has_calls([call(expected_h1_tag), call(expected_h1_tag)]) + # Called only by pw sub crawler + mocked_h2_handler.assert_has_calls([call(expected_h2_tag)]) + + +async def test_adaptive_context_query_selector_parsel(test_urls: list[str]) -> None: + """Test that `context.query_selector_one` works regardless of the crawl type for Parsel variant. + + Handler tries to locate two elements h1 and h2. + h1 exists immediately, h2 is created dynamically by inline JS snippet embedded in the html. + Create situation where page is crawled with static sub crawler first. + Static sub crawler should be able to locate only h1. It wil try to wait for h2, trying to wait for h2 will trigger + `AdaptiveContextError` which will force the adaptive crawler to try playwright sub crawler instead. Playwright sub + crawler is able to wait for the h2 element.""" + + # Get page with injected JS code that will add some element after timeout + static_only_predictor_no_detection = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])) + expected_h1_tag = f'

<h1>{_H1_TEXT}</h1>' + expected_h2_tag = f'<h2>{_H2_TEXT}</h2>
' + + crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser( + max_request_retries=1, + rendering_type_predictor=static_only_predictor_no_detection, + playwright_crawler_specific_kwargs={'browser_pool': _StaticRedirectBrowserPool.with_default_plugin()}, + ) + + mocked_h1_handler = Mock() + mocked_h2_handler = Mock() + + @crawler.router.default_handler + async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: + if h1 := await context.query_selector_one('h1', timedelta(milliseconds=_INJECTED_JS_DELAY_MS * 2)): + mocked_h1_handler(type(h1), h1.get()) + if h2 := await context.query_selector_one('h2', timedelta(milliseconds=_INJECTED_JS_DELAY_MS * 2)): + mocked_h2_handler(type(h2), h2.get()) + + await crawler.run(test_urls[:1]) + + # Called by both sub crawlers + mocked_h1_handler.assert_has_calls([call(Selector, expected_h1_tag), call(Selector, expected_h1_tag)]) + # Called only by pw sub crawler + mocked_h2_handler.assert_has_calls([call(Selector, expected_h2_tag)]) + + +async def test_adaptive_context_parse_with_static_parser_parsel(test_urls: list[str]) -> None: + """Test `context.parse_with_static_parser` works regardless of the crawl type for Parsel variant. + + (Test covers also `context.wait_for_selector`, which is called by `context.parse_with_static_parser`) + """ + static_only_predictor_no_detection = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])) + expected_h2_tag = f'

<h2>{_H2_TEXT}</h2>
' + + crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser( + max_request_retries=1, + rendering_type_predictor=static_only_predictor_no_detection, + playwright_crawler_specific_kwargs={'browser_pool': _StaticRedirectBrowserPool.with_default_plugin()}, + ) + + mocked_h2_handler = Mock() + + @crawler.router.default_handler + async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: + h2_static = context.parsed_content.css('h2') # Should not find anything + mocked_h2_handler(h2_static) + + # Reparse whole page after h2 appears + parsed_content_after_h2_appeared = await context.parse_with_static_parser( + selector='h2', timeout=timedelta(milliseconds=_INJECTED_JS_DELAY_MS * 2) + ) + mocked_h2_handler(parsed_content_after_h2_appeared.css('h2')[0].get()) + + await crawler.run(test_urls[:1]) + + mocked_h2_handler.assert_has_calls( + [ + call([]), # Static sub crawler tried and did not find h2. + call([]), # Playwright sub crawler tried and did not find h2 without waiting. + call(expected_h2_tag), # Playwright waited for h2 to appear. + ] + ) + + +async def test_adaptive_context_helpers_on_changed_selector(test_urls: list[str]) -> None: + """Test that context helpers work on latest version of the page. + + Scenario where page is changed after a while. H2 element is added and text of H3 element is modified. + Test that context helpers automatically work on latest version of the page by reading H3 element and expecting it's + dynamically changed text instead of the original static text. + """ + browser_only_predictor_no_detection = _SimpleRenderingTypePredictor( + rendering_types=cycle(['client only']), detection_probability_recommendation=cycle([0]) + ) + expected_h3_tag = f'

<h3>{_H3_CHANGED_TEXT}</h3>
' + + crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser( + max_request_retries=1, + rendering_type_predictor=browser_only_predictor_no_detection, + playwright_crawler_specific_kwargs={'browser_pool': _StaticRedirectBrowserPool.with_default_plugin()}, + ) + + mocked_h3_handler = Mock() + + @crawler.router.default_handler + async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: + await context.query_selector_one('h2') # Wait for change that is indicated by appearance of h2 element. + if h3 := await context.query_selector_one('h3'): + mocked_h3_handler(h3.get()) # Get updated h3 element. + + await crawler.run(test_urls[:1]) + + mocked_h3_handler.assert_called_once_with(expected_h3_tag) + + +async def test_adaptive_context_query_non_existing_element(test_urls: list[str]) -> None: + """Test that querying non-existing selector returns `None`""" + browser_only_predictor_no_detection = _SimpleRenderingTypePredictor( + rendering_types=cycle(['client only']), detection_probability_recommendation=cycle([0]) + ) + + crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser( + max_request_retries=1, + rendering_type_predictor=browser_only_predictor_no_detection, + playwright_crawler_specific_kwargs={'browser_pool': _StaticRedirectBrowserPool.with_default_plugin()}, + ) + + mocked_h3_handler = Mock() + + @crawler.router.default_handler + async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: + mocked_h3_handler(await context.query_selector_one('non sense selector', timeout=timedelta(milliseconds=1))) + + await crawler.run(test_urls[:1]) + + mocked_h3_handler.assert_called_once_with(None) diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawling_context.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawling_context.py new file mode 100644 index 0000000000..e69de29bb2