From 39740cb3623ee2ec79c124c28bcb928d1e26cd28 Mon Sep 17 00:00:00 2001
From: Pawel Miech
Date: Fri, 3 Apr 2015 13:46:55 +0200
Subject: [PATCH 1/7] [middleware] allow enabling splash per spider

---
 scrapyjs/middleware.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/scrapyjs/middleware.py b/scrapyjs/middleware.py
index c51c1d3..3830cec 100644
--- a/scrapyjs/middleware.py
+++ b/scrapyjs/middleware.py
@@ -32,6 +32,14 @@ def __init__(self, crawler, splash_base_url, slot_policy):
         self.splash_base_url = splash_base_url
         self.slot_policy = slot_policy
 
+    def get_splash_options(self, request, spider):
+        if request.meta.get("dont_proxy"):
+            return
+
+        spider_options = getattr(spider, "splash", {})
+        request_options = request.meta.get("splash")
+        return request_options or spider_options
+
     @classmethod
     def from_crawler(cls, crawler):
         splash_base_url = crawler.settings.get('SPLASH_URL', cls.default_splash_url)
@@ -43,24 +51,26 @@ def from_crawler(cls, crawler):
         return cls(crawler, splash_base_url, slot_policy)
 
     def process_request(self, request, spider):
-        splash_options = request.meta.get('splash')
+        splash_options = self.get_splash_options(request, spider)
         if not splash_options:
             return
+        elif request.meta.get("_splash_processed"):
+            return
+
         if request.method != 'GET':
             log.msg("Currently only GET requests are supported by SplashMiddleware; %s "
                     "will be handled without Splash" % request, logging.WARNING)
             return request
 
         meta = request.meta
-        del meta['splash']
-        meta['_splash_processed'] = splash_options
 
         slot_policy = splash_options.get('slot_policy', self.slot_policy)
         self._set_download_slot(request, meta, slot_policy)
 
         args = splash_options.setdefault('args', {})
-        args.setdefault('url', request.url)
+        args['url'] = request.url
+
         body = json.dumps(args, ensure_ascii=False)
 
         if 'timeout' in args:
@@ -86,6 +96,7 @@ def process_request(self, request, spider):
         endpoint = splash_options.setdefault('endpoint', self.default_endpoint)
         splash_base_url = splash_options.get('splash_url', self.splash_base_url)
         splash_url = urljoin(splash_base_url, endpoint)
+        meta['_splash_processed'] = True
 
         req_rep = request.replace(
             url=splash_url,

From 9efd145d0b7c6b7d5e743948ecbb124aa2b9def5 Mon Sep 17 00:00:00 2001
From: Pawel Miech
Date: Fri, 3 Apr 2015 13:47:46 +0200
Subject: [PATCH 2/7] [middleware] return HtmlResponse to spider

---
 scrapyjs/middleware.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/scrapyjs/middleware.py b/scrapyjs/middleware.py
index 3830cec..ee254ae 100644
--- a/scrapyjs/middleware.py
+++ b/scrapyjs/middleware.py
@@ -6,6 +6,7 @@
 
 from scrapy.exceptions import NotConfigured
 from scrapy import log
+from scrapy.http.response.html import HtmlResponse
 from scrapy.http.headers import Headers
 
 
@@ -107,20 +108,31 @@ def process_request(self, request, spider):
             # are not respected.
             headers=Headers({'Content-Type': 'application/json'}),
         )
-
         self.crawler.stats.inc_value('splash/%s/request_count' % endpoint)
         return req_rep
 
     def process_response(self, request, response, spider):
-        splash_options = request.meta.get("_splash_processed")
+        splash_options = self.get_splash_options(request, spider)
         if splash_options:
             endpoint = splash_options['endpoint']
             self.crawler.stats.inc_value(
                 'splash/%s/response_count/%s' % (endpoint, response.status)
             )
-
+            response = self.html_response(response, request)
         return response
 
+    def html_response(self, response, request):
+        """Give the user the HTML response they probably
+        expect.
+        """
+        data = json.loads(response.body)
+        html = data.get("html")
+        if not html:
+            return response
+
+        return HtmlResponse(data["url"], body=html, encoding='utf8',
+                            status=response.status, request=request)
+
     def _set_download_slot(self, request, meta, slot_policy):
         if slot_policy == SlotPolicy.PER_DOMAIN:
             # Use the same download slot to (sort of) respect download

From 652fd6ee050287d0f69432b1bec98d38407ffca0 Mon Sep 17 00:00:00 2001
From: Pawel Miech
Date: Fri, 3 Apr 2015 13:48:10 +0200
Subject: [PATCH 3/7] [scrashtest] add another test spider

---
 example/scrashtest/spiders/dmoz_two.py | 31 ++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 example/scrashtest/spiders/dmoz_two.py

diff --git a/example/scrashtest/spiders/dmoz_two.py b/example/scrashtest/spiders/dmoz_two.py
new file mode 100644
index 0000000..e441fa6
--- /dev/null
+++ b/example/scrashtest/spiders/dmoz_two.py
@@ -0,0 +1,31 @@
+# -*- coding: utf-8 -*-
+from urlparse import urljoin
+import json
+
+import scrapy
+from scrapy.contrib.linkextractors import LinkExtractor
+
+
+class DmozSpider(scrapy.Spider):
+    name = "js_spider"
+    start_urls = ['http://www.isjavascriptenabled.com/']
+    splash = {'args': {'har': 1, 'html': 1}}
+
+    def parse(self, response):
+        is_js = response.xpath("//h1/text()").extract()
+        if "".join(is_js).lower() == "yes":
+            self.log("JS enabled!")
+        else:
+            self.log("Error! JS disabled!", scrapy.log.ERROR)
+        le = LinkExtractor()
+
+        for link in le.extract_links(response):
+            url = urljoin(response.url, link.url)
+            yield scrapy.Request(url, self.parse_link)
+            break
+
+    def parse_link(self, response):
+        title = response.xpath("//title").extract()
+        yes = response.xpath("//h1").extract()
+        self.log("response is: {}".format(repr(response)))
+        self.log(u"Html in response contains {} {}".format("".join(title), "".join(yes)))

From 2e7407de724c7dce25b1b149bc8c1552cfa86505 Mon Sep 17 00:00:00 2001
From: Pawel Miech
Date: Tue, 26 May 2015 09:10:01 +0200
Subject: [PATCH 4/7] [spider attr] removed html_response

* removed html_response (the render-response-as-HTML feature) from this PR
* removed remote request tests with extra spider

---
 example/scrashtest/spiders/dmoz_two.py | 31 --------------------------
 scrapyjs/middleware.py                 | 13 -----------
 2 files changed, 44 deletions(-)
 delete mode 100644 example/scrashtest/spiders/dmoz_two.py

diff --git a/example/scrashtest/spiders/dmoz_two.py b/example/scrashtest/spiders/dmoz_two.py
deleted file mode 100644
index e441fa6..0000000
--- a/example/scrashtest/spiders/dmoz_two.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# -*- coding: utf-8 -*-
-from urlparse import urljoin
-import json
-
-import scrapy
-from scrapy.contrib.linkextractors import LinkExtractor
-
-
-class DmozSpider(scrapy.Spider):
-    name = "js_spider"
-    start_urls = ['http://www.isjavascriptenabled.com/']
-    splash = {'args': {'har': 1, 'html': 1}}
-
-    def parse(self, response):
-        is_js = response.xpath("//h1/text()").extract()
-        if "".join(is_js).lower() == "yes":
-            self.log("JS enabled!")
-        else:
-            self.log("Error! JS disabled!", scrapy.log.ERROR)
-        le = LinkExtractor()
-
-        for link in le.extract_links(response):
-            url = urljoin(response.url, link.url)
-            yield scrapy.Request(url, self.parse_link)
-            break
-
-    def parse_link(self, response):
-        title = response.xpath("//title").extract()
-        yes = response.xpath("//h1").extract()
-        self.log("response is: {}".format(repr(response)))
-        self.log(u"Html in response contains {} {}".format("".join(title), "".join(yes)))
diff --git a/scrapyjs/middleware.py b/scrapyjs/middleware.py
index ee254ae..2a5acb8 100644
--- a/scrapyjs/middleware.py
+++ b/scrapyjs/middleware.py
@@ -118,21 +118,8 @@ def process_response(self, request, response, spider):
             self.crawler.stats.inc_value(
                 'splash/%s/response_count/%s' % (endpoint, response.status)
             )
-            response = self.html_response(response, request)
         return response
 
-    def html_response(self, response, request):
-        """Give the user the HTML response they probably
-        expect.
-        """
-        data = json.loads(response.body)
-        html = data.get("html")
-        if not html:
-            return response
-
-        return HtmlResponse(data["url"], body=html, encoding='utf8',
-                            status=response.status, request=request)
-
     def _set_download_slot(self, request, meta, slot_policy):
         if slot_policy == SlotPolicy.PER_DOMAIN:
             # Use the same download slot to (sort of) respect download

From bed8998a8a3a2db84d9ac115246d2de18b24b0ca Mon Sep 17 00:00:00 2001
From: Pawel Miech
Date: Tue, 26 May 2015 09:13:37 +0200
Subject: [PATCH 5/7] [tests] refactors tests, adds tests for spider attr

* refactors tests from functions to classes inheriting from unittest.TestCase
* adds tests for enabling middleware with spider attribute

---
 tests/test_middleware.py | 275 +++++++++++++++++++++------------------
 1 file changed, 150 insertions(+), 125 deletions(-)

diff --git a/tests/test_middleware.py b/tests/test_middleware.py
index aa84efd..d3f232a 100644
--- a/tests/test_middleware.py
+++ b/tests/test_middleware.py
@@ -2,154 +2,179 @@
 from __future__ import absolute_import
 import copy
 import json
+from twisted.trial import unittest
 
 import scrapy
-from scrapy.core.engine import ExecutionEngine
 from scrapy.utils.test import get_crawler
+from scrapy.utils.httpobj import urlparse_cached
 
 import scrapyjs
 from scrapyjs.middleware import SplashMiddleware
 from scrapyjs.request import SplashRequest
 
 
-def _get_mw():
-    crawler = get_crawler(settings_dict={
-        'DOWNLOAD_HANDLERS': {'s3': None},  # for faster test running
-    })
-    if not hasattr(crawler, 'logformatter'):
-        crawler.logformatter = None
-    crawler.engine = ExecutionEngine(crawler, lambda _: None)
-    # spider = crawler._create_spider("foo")
-    return SplashMiddleware.from_crawler(crawler)
-
-
-def test_nosplash():
-    mw = _get_mw()
-    req = scrapy.Request("http://example.com")
-    old_meta = copy.deepcopy(req.meta)
-    assert mw.process_request(req, None) is None
-    assert old_meta == req.meta
-
-
-def test_splash_request():
-    mw = _get_mw()
-    req = SplashRequest("http://example.com?foo=bar&url=1&wait=100")
-
-    req2 = mw.process_request(req, None)
-    assert req2 is not None
-    assert req2 is not req
-    assert req2.url == "http://127.0.0.1:8050/render.html"
-    assert req2.headers == {'Content-Type': ['application/json']}
-    assert req2.method == 'POST'
-
-    expected_body = {'url': req.url}
-    expected_body.update(SplashRequest.default_splash_meta['args'])
-    assert json.loads(req2.body) == expected_body
-
-
-def test_splash_request_no_url():
-    mw = _get_mw()
-    lua_source = "function main(splash) return {result='ok'} end"
-    req1 = SplashRequest(meta={'splash': {
-        'args': {'lua_source': lua_source},
-        'endpoint': 'execute',
-    }})
-    req = mw.process_request(req1, None)
-    assert req.url == 'http://127.0.0.1:8050/execute'
-    assert json.loads(req.body) == {
-        'url': 'about:blank',
-        'lua_source': lua_source
-    }
-
-
-def test_override_splash_url():
-    mw = _get_mw()
-    req1 = scrapy.Request("http://example.com", meta={
-        'splash': {
-            'endpoint': 'render.png',
-            'splash_url': 'http://splash.example.com'
-        }
-    })
-    req = mw.process_request(req1, None)
-    assert req.url == 'http://splash.example.com/render.png'
-    assert json.loads(req.body) == {'url': req1.url}
-
-
-def test_float_wait_arg():
-    mw = _get_mw()
-    req1 = scrapy.Request("http://example.com", meta={
-        'splash': {
-            'endpoint': 'render.html',
-            'args': {'wait': 0.5}
-        }
-    })
-    req = mw.process_request(req1, None)
-    assert json.loads(req.body) == {'url': req1.url, 'wait': 0.5}
-
-
-def test_slot_policy_single_slot():
-    mw = _get_mw()
-    meta = {'splash': {
-        'slot_policy': scrapyjs.SlotPolicy.SINGLE_SLOT
-    }}
-
-    req1 = scrapy.Request("http://example.com/path?key=value", meta=meta)
-    req1 = mw.process_request(req1, None)
-
-    req2 = scrapy.Request("http://fooexample.com/path?key=value", meta=meta)
-    req2 = mw.process_request(req2, None)
-
-    assert req1.meta.get('download_slot')
-    assert req1.meta['download_slot'] == req2.meta['download_slot']
-
-
-def test_slot_policy_per_domain():
-    mw = _get_mw()
-    meta = {'splash': {
-        'slot_policy': scrapyjs.SlotPolicy.PER_DOMAIN
-    }}
-
-    req1 = scrapy.Request("http://example.com/path?key=value", meta=meta)
-    req1 = mw.process_request(req1, None)
-
-    req2 = scrapy.Request("http://example.com/path2", meta=meta)
-    req2 = mw.process_request(req2, None)
-
-    req3 = scrapy.Request("http://fooexample.com/path?key=value", meta=meta)
-    req3 = mw.process_request(req3, None)
-
-    assert req1.meta.get('download_slot')
-    assert req3.meta.get('download_slot')
-
-    assert req1.meta['download_slot'] == req2.meta['download_slot']
-    assert req1.meta['download_slot'] != req3.meta['download_slot']
-
-
-def test_slot_policy_scrapy_default():
-    mw = _get_mw()
-    req = scrapy.Request("http://example.com", meta = {'splash': {
-        'slot_policy': scrapyjs.SlotPolicy.SCRAPY_DEFAULT
-    }})
-    req = mw.process_request(req, None)
-    assert 'download_slot' not in req.meta
-
-
-def test_adjust_timeout():
-    mw = _get_mw()
-    req1 = scrapy.Request("http://example.com", meta = {
-        'splash': {'args': {'timeout': 60, 'html': 1}},
-
-        # download_timeout is always present,
-        # it is set by DownloadTimeoutMiddleware
-        'download_timeout': 30,
-    })
-    req1 = mw.process_request(req1, None)
-    assert req1.meta['download_timeout'] > 60
-
-    req2 = scrapy.Request("http://example.com", meta = {
-        'splash': {'args': {'html': 1}},
-        'download_timeout': 30,
-    })
-    req2 = mw.process_request(req2, None)
-    assert req2.meta['download_timeout'] == 30
+class MockedSlot(object):
+
+    def __init__(self, delay=0.0):
+        self.delay = delay
+
+
+class MockedDownloader(object):
+
+    def __init__(self):
+        self.slots = {}
+
+    def _get_slot_key(self, request, spider):
+        if 'download_slot' in request.meta:
+            return request.meta['download_slot']
+
+        key = urlparse_cached(request).hostname or ''
+        return key
+
+
+class MockedEngine(object):
+    downloader = MockedDownloader()
+
+
+class MiddlewareTest(unittest.TestCase):
+
+    def setUp(self):
+        self.crawler = get_crawler(settings_dict={
+            'DOWNLOAD_HANDLERS': {'s3': None},  # for faster test running
+        })
+        if not hasattr(self.crawler, 'logformatter'):
+            self.crawler.logformatter = None
+        self.crawler.engine = MockedEngine()
+        self.mw = SplashMiddleware.from_crawler(self.crawler)
+
+    def test_nosplash(self):
+        req = scrapy.Request("http://example.com")
+        old_meta = copy.deepcopy(req.meta)
+        assert self.mw.process_request(req, None) is None
+        assert old_meta == req.meta
+
+    def test_splash_request(self):
+        req = SplashRequest("http://example.com?foo=bar&url=1&wait=100")
+
+        req2 = self.mw.process_request(req, None)
+        assert req2 is not None
+        assert req2 is not req
+        assert req2.url == "http://127.0.0.1:8050/render.html"
+        assert req2.headers == {'Content-Type': ['application/json']}
+        assert req2.method == 'POST'
+
+        expected_body = {'url': req.url}
+        expected_body.update(SplashRequest.default_splash_meta['args'])
+        assert json.loads(req2.body) == expected_body
+
+    def test_splash_request_no_url(self):
+        lua_source = "function main(splash) return {result='ok'} end"
+        req1 = SplashRequest(meta={'splash': {
+            'args': {'lua_source': lua_source},
+            'endpoint': 'execute',
+        }})
+        req = self.mw.process_request(req1, None)
+        assert req.url == 'http://127.0.0.1:8050/execute'
+        assert json.loads(req.body) == {
+            'url': 'about:blank',
+            'lua_source': lua_source
+        }
+
+    def test_override_splash_url(self):
+        req1 = scrapy.Request("http://example.com", meta={
+            'splash': {
+                'endpoint': 'render.png',
+                'splash_url': 'http://splash.example.com'
+            }
+        })
+        req = self.mw.process_request(req1, None)
+        assert req.url == 'http://splash.example.com/render.png'
+        assert json.loads(req.body) == {'url': req1.url}
+
+    def test_float_wait_arg(self):
+        req1 = scrapy.Request("http://example.com", meta={
+            'splash': {
+                'endpoint': 'render.html',
+                'args': {'wait': 0.5}
+            }
+        })
+        req = self.mw.process_request(req1, None)
+        assert json.loads(req.body) == {'url': req1.url, 'wait': 0.5}
+
+    def test_slot_policy_single_slot(self):
+        meta = {'splash': {
+            'slot_policy': scrapyjs.SlotPolicy.SINGLE_SLOT
+        }}
+
+        req1 = scrapy.Request("http://example.com/path?key=value", meta=meta)
+        req1 = self.mw.process_request(req1, None)
+
+        req2 = scrapy.Request("http://fooexample.com/path?key=value", meta=meta)
+        req2 = self.mw.process_request(req2, None)
+
+        assert req1.meta.get('download_slot')
+        assert req1.meta['download_slot'] == req2.meta['download_slot']
+
+    def test_slot_policy_per_domain(self):
+        meta = {'splash': {
+            'slot_policy': scrapyjs.SlotPolicy.PER_DOMAIN
+        }}
+
+        req1 = scrapy.Request("http://example.com/path?key=value", meta=meta)
+        req1 = self.mw.process_request(req1, None)
+
+        req2 = scrapy.Request("http://example.com/path2", meta=meta)
+        req2 = self.mw.process_request(req2, None)
+
+        req3 = scrapy.Request("http://fooexample.com/path?key=value", meta=meta)
+        req3 = self.mw.process_request(req3, None)
+
+        assert req1.meta.get('download_slot')
+        assert req3.meta.get('download_slot')
+
+        assert req1.meta['download_slot'] == req2.meta['download_slot']
+        assert req1.meta['download_slot'] != req3.meta['download_slot']
+
+    def test_slot_policy_scrapy_default(self):
+        req = scrapy.Request("http://example.com", meta = {'splash': {
+            'slot_policy': scrapyjs.SlotPolicy.SCRAPY_DEFAULT
+        }})
+        req = self.mw.process_request(req, None)
+        assert 'download_slot' not in req.meta
+
+    def test_adjust_timeout(self):
+        req1 = scrapy.Request("http://example.com", meta = {
+            'splash': {'args': {'timeout': 60, 'html': 1}},
+
+            # download_timeout is always present,
+            # it is set by DownloadTimeoutMiddleware
+            'download_timeout': 30,
+        })
+        req1 = self.mw.process_request(req1, None)
+        assert req1.meta['download_timeout'] > 60
+
+        req2 = scrapy.Request("http://example.com", meta = {
+            'splash': {'args': {'html': 1}},
+            'download_timeout': 30,
+        })
+        req2 = self.mw.process_request(req2, None)
+        assert req2.meta['download_timeout'] == 30
+
+    def test_spider_attribute(self):
+        req_url = "http://scrapy.org"
+        req1 = scrapy.Request(req_url)
+
+        spider = self.crawler._create_spider("foo")
+        spider.splash = {"args": {"images": 0}}
+
+        req1 = self.mw.process_request(req1, spider)
+        self.assertIn("_splash_processed", req1.meta)
+        self.assertIn("render.json", req1.url)
+        self.assertIn("url", json.loads(req1.body))
+        self.assertEqual(json.loads(req1.body).get("url"), req_url)
+        self.assertIn("images", json.loads(req1.body))
+
+        # blank spider attribute disables the middleware
+        spider.splash = {}
+        req2 = self.mw.process_request(req1, spider)
+        self.assertIsNone(req2)

From a9b532322727129ee2dd2ea1516ddd6d86a898e4 Mon Sep 17 00:00:00 2001
From: Pawel Miech
Date: Fri, 19 Jun 2015 14:27:48 +0200
Subject: [PATCH 6/7] Revert "[tests] refactors tests, adds tests for spider attr"

This reverts commit bed8998a8a3a2db84d9ac115246d2de18b24b0ca.

---
 tests/test_middleware.py | 275 ++++++++++++++++++---------------------
 1 file changed, 125 insertions(+), 150 deletions(-)

diff --git a/tests/test_middleware.py b/tests/test_middleware.py
index d3f232a..aa84efd 100644
--- a/tests/test_middleware.py
+++ b/tests/test_middleware.py
@@ -2,179 +2,154 @@
 from __future__ import absolute_import
 import copy
 import json
-from twisted.trial import unittest
 
 import scrapy
+from scrapy.core.engine import ExecutionEngine
 from scrapy.utils.test import get_crawler
-from scrapy.utils.httpobj import urlparse_cached
 
 import scrapyjs
 from scrapyjs.middleware import SplashMiddleware
 from scrapyjs.request import SplashRequest
 
 
-class MockedSlot(object):
-
-    def __init__(self, delay=0.0):
-        self.delay = delay
-
-
-class MockedDownloader(object):
-
-    def __init__(self):
-        self.slots = {}
-
-    def _get_slot_key(self, request, spider):
-        if 'download_slot' in request.meta:
-            return request.meta['download_slot']
-
-        key = urlparse_cached(request).hostname or ''
-        return key
-
-
-class MockedEngine(object):
-    downloader = MockedDownloader()
-
-
-class MiddlewareTest(unittest.TestCase):
-
-    def setUp(self):
-        self.crawler = get_crawler(settings_dict={
-            'DOWNLOAD_HANDLERS': {'s3': None},  # for faster test running
-        })
-        if not hasattr(self.crawler, 'logformatter'):
-            self.crawler.logformatter = None
-        self.crawler.engine = MockedEngine()
-        self.mw = SplashMiddleware.from_crawler(self.crawler)
-
-    def test_nosplash(self):
-        req = scrapy.Request("http://example.com")
-        old_meta = copy.deepcopy(req.meta)
-        assert self.mw.process_request(req, None) is None
-        assert old_meta == req.meta
-
-    def test_splash_request(self):
-        req = SplashRequest("http://example.com?foo=bar&url=1&wait=100")
-
-        req2 = self.mw.process_request(req, None)
-        assert req2 is not None
-        assert req2 is not req
-        assert req2.url == "http://127.0.0.1:8050/render.html"
-        assert req2.headers == {'Content-Type': ['application/json']}
-        assert req2.method == 'POST'
-
-        expected_body = {'url': req.url}
-        expected_body.update(SplashRequest.default_splash_meta['args'])
-        assert json.loads(req2.body) == expected_body
-
-    def test_splash_request_no_url(self):
-        lua_source = "function main(splash) return {result='ok'} end"
-        req1 = SplashRequest(meta={'splash': {
-            'args': {'lua_source': lua_source},
-            'endpoint': 'execute',
-        }})
-        req = self.mw.process_request(req1, None)
-        assert req.url == 'http://127.0.0.1:8050/execute'
-        assert json.loads(req.body) == {
-            'url': 'about:blank',
-            'lua_source': lua_source
-        }
-
-    def test_override_splash_url(self):
-        req1 = scrapy.Request("http://example.com", meta={
-            'splash': {
-                'endpoint': 'render.png',
-                'splash_url': 'http://splash.example.com'
-            }
-        })
-        req = self.mw.process_request(req1, None)
-        assert req.url == 'http://splash.example.com/render.png'
-        assert json.loads(req.body) == {'url': req1.url}
-
-    def test_float_wait_arg(self):
-        req1 = scrapy.Request("http://example.com", meta={
-            'splash': {
-                'endpoint': 'render.html',
-                'args': {'wait': 0.5}
-            }
-        })
-        req = self.mw.process_request(req1, None)
-        assert json.loads(req.body) == {'url': req1.url, 'wait': 0.5}
-
-    def test_slot_policy_single_slot(self):
-        meta = {'splash': {
-            'slot_policy': scrapyjs.SlotPolicy.SINGLE_SLOT
-        }}
-
-        req1 = scrapy.Request("http://example.com/path?key=value", meta=meta)
-        req1 = self.mw.process_request(req1, None)
-
-        req2 = scrapy.Request("http://fooexample.com/path?key=value", meta=meta)
-        req2 = self.mw.process_request(req2, None)
-
-        assert req1.meta.get('download_slot')
-        assert req1.meta['download_slot'] == req2.meta['download_slot']
-
-    def test_slot_policy_per_domain(self):
-        meta = {'splash': {
-            'slot_policy': scrapyjs.SlotPolicy.PER_DOMAIN
-        }}
-
-        req1 = scrapy.Request("http://example.com/path?key=value", meta=meta)
-        req1 = self.mw.process_request(req1, None)
-
-        req2 = scrapy.Request("http://example.com/path2", meta=meta)
-        req2 = self.mw.process_request(req2, None)
-
-        req3 = scrapy.Request("http://fooexample.com/path?key=value", meta=meta)
-        req3 = self.mw.process_request(req3, None)
-
-        assert req1.meta.get('download_slot')
-        assert req3.meta.get('download_slot')
-
-        assert req1.meta['download_slot'] == req2.meta['download_slot']
-        assert req1.meta['download_slot'] != req3.meta['download_slot']
-
-    def test_slot_policy_scrapy_default(self):
-        req = scrapy.Request("http://example.com", meta = {'splash': {
-            'slot_policy': scrapyjs.SlotPolicy.SCRAPY_DEFAULT
-        }})
-        req = self.mw.process_request(req, None)
-        assert 'download_slot' not in req.meta
-
-    def test_adjust_timeout(self):
-        req1 = scrapy.Request("http://example.com", meta = {
-            'splash': {'args': {'timeout': 60, 'html': 1}},
-
-            # download_timeout is always present,
-            # it is set by DownloadTimeoutMiddleware
-            'download_timeout': 30,
-        })
-        req1 = self.mw.process_request(req1, None)
-        assert req1.meta['download_timeout'] > 60
-
-        req2 = scrapy.Request("http://example.com", meta = {
-            'splash': {'args': {'html': 1}},
-            'download_timeout': 30,
-        })
-        req2 = self.mw.process_request(req2, None)
-        assert req2.meta['download_timeout'] == 30
-
-    def test_spider_attribute(self):
-        req_url = "http://scrapy.org"
-        req1 = scrapy.Request(req_url)
-
-        spider = self.crawler._create_spider("foo")
-        spider.splash = {"args": {"images": 0}}
-
-        req1 = self.mw.process_request(req1, spider)
-        self.assertIn("_splash_processed", req1.meta)
-        self.assertIn("render.json", req1.url)
-        self.assertIn("url", json.loads(req1.body))
-        self.assertEqual(json.loads(req1.body).get("url"), req_url)
-        self.assertIn("images", json.loads(req1.body))
-
-        # blank spider attribute disables the middleware
-        spider.splash = {}
-        req2 = self.mw.process_request(req1, spider)
-        self.assertIsNone(req2)
+def _get_mw():
+    crawler = get_crawler(settings_dict={
+        'DOWNLOAD_HANDLERS': {'s3': None},  # for faster test running
+    })
+    if not hasattr(crawler, 'logformatter'):
+        crawler.logformatter = None
+    crawler.engine = ExecutionEngine(crawler, lambda _: None)
+    # spider = crawler._create_spider("foo")
+    return SplashMiddleware.from_crawler(crawler)
+
+
+def test_nosplash():
+    mw = _get_mw()
+    req = scrapy.Request("http://example.com")
+    old_meta = copy.deepcopy(req.meta)
+    assert mw.process_request(req, None) is None
+    assert old_meta == req.meta
+
+
+def test_splash_request():
+    mw = _get_mw()
+    req = SplashRequest("http://example.com?foo=bar&url=1&wait=100")
+
+    req2 = mw.process_request(req, None)
+    assert req2 is not None
+    assert req2 is not req
+    assert req2.url == "http://127.0.0.1:8050/render.html"
+    assert req2.headers == {'Content-Type': ['application/json']}
+    assert req2.method == 'POST'
+
+    expected_body = {'url': req.url}
+    expected_body.update(SplashRequest.default_splash_meta['args'])
+    assert json.loads(req2.body) == expected_body
+
+
+def test_splash_request_no_url():
+    mw = _get_mw()
+    lua_source = "function main(splash) return {result='ok'} end"
+    req1 = SplashRequest(meta={'splash': {
+        'args': {'lua_source': lua_source},
+        'endpoint': 'execute',
+    }})
+    req = mw.process_request(req1, None)
+    assert req.url == 'http://127.0.0.1:8050/execute'
+    assert json.loads(req.body) == {
+        'url': 'about:blank',
+        'lua_source': lua_source
+    }
+
+
+def test_override_splash_url():
+    mw = _get_mw()
+    req1 = scrapy.Request("http://example.com", meta={
+        'splash': {
+            'endpoint': 'render.png',
+            'splash_url': 'http://splash.example.com'
+        }
+    })
+    req = mw.process_request(req1, None)
+    assert req.url == 'http://splash.example.com/render.png'
+    assert json.loads(req.body) == {'url': req1.url}
+
+
+def test_float_wait_arg():
+    mw = _get_mw()
+    req1 = scrapy.Request("http://example.com", meta={
+        'splash': {
+            'endpoint': 'render.html',
+            'args': {'wait': 0.5}
+        }
+    })
+    req = mw.process_request(req1, None)
+    assert json.loads(req.body) == {'url': req1.url, 'wait': 0.5}
+
+
+def test_slot_policy_single_slot():
+    mw = _get_mw()
+    meta = {'splash': {
+        'slot_policy': scrapyjs.SlotPolicy.SINGLE_SLOT
+    }}
+
+    req1 = scrapy.Request("http://example.com/path?key=value", meta=meta)
+    req1 = mw.process_request(req1, None)
+
+    req2 = scrapy.Request("http://fooexample.com/path?key=value", meta=meta)
+    req2 = mw.process_request(req2, None)
+
+    assert req1.meta.get('download_slot')
+    assert req1.meta['download_slot'] == req2.meta['download_slot']
+
+
+def test_slot_policy_per_domain():
+    mw = _get_mw()
+    meta = {'splash': {
+        'slot_policy': scrapyjs.SlotPolicy.PER_DOMAIN
+    }}
+
+    req1 = scrapy.Request("http://example.com/path?key=value", meta=meta)
+    req1 = mw.process_request(req1, None)
+
+    req2 = scrapy.Request("http://example.com/path2", meta=meta)
+    req2 = mw.process_request(req2, None)
+
+    req3 = scrapy.Request("http://fooexample.com/path?key=value", meta=meta)
+    req3 = mw.process_request(req3, None)
+
+    assert req1.meta.get('download_slot')
+    assert req3.meta.get('download_slot')
+
+    assert req1.meta['download_slot'] == req2.meta['download_slot']
+    assert req1.meta['download_slot'] != req3.meta['download_slot']
+
+
+def test_slot_policy_scrapy_default():
+    mw = _get_mw()
+    req = scrapy.Request("http://example.com", meta = {'splash': {
+        'slot_policy': scrapyjs.SlotPolicy.SCRAPY_DEFAULT
+    }})
+    req = mw.process_request(req, None)
+    assert 'download_slot' not in req.meta
+
+
+def test_adjust_timeout():
+    mw = _get_mw()
+    req1 = scrapy.Request("http://example.com", meta = {
+        'splash': {'args': {'timeout': 60, 'html': 1}},
+
+        # download_timeout is always present,
+        # it is set by DownloadTimeoutMiddleware
+        'download_timeout': 30,
+    })
+    req1 = mw.process_request(req1, None)
+    assert req1.meta['download_timeout'] > 60
+
+    req2 = scrapy.Request("http://example.com", meta = {
+        'splash': {'args': {'html': 1}},
+        'download_timeout': 30,
+    })
+    req2 = mw.process_request(req2, None)
+    assert req2.meta['download_timeout'] == 30

From 586ab58f2424e693781eebc017bfd6b5e78d93ff Mon Sep 17 00:00:00 2001
From: Pawel Miech
Date: Fri, 19 Jun 2015 14:42:52 +0200
Subject: [PATCH 7/7] [#15/spider_attribute] adds proper tests

---
 scrapyjs/middleware.py   |  1 -
 tests/test_middleware.py | 22 ++++++++++++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/scrapyjs/middleware.py b/scrapyjs/middleware.py
index 2a5acb8..d418c72 100644
--- a/scrapyjs/middleware.py
+++ b/scrapyjs/middleware.py
@@ -6,7 +6,6 @@
 
 from scrapy.exceptions import NotConfigured
 from scrapy import log
-from scrapy.http.response.html import HtmlResponse
 from scrapy.http.headers import Headers
 
 
diff --git a/tests/test_middleware.py b/tests/test_middleware.py
index aa84efd..363b876 100644
--- a/tests/test_middleware.py
+++ b/tests/test_middleware.py
@@ -153,3 +153,25 @@ def test_adjust_timeout():
     })
     req2 = mw.process_request(req2, None)
     assert req2.meta['download_timeout'] == 30
+
+
+def test_spider_attribute():
+    req_url = "http://scrapy.org"
+    req1 = scrapy.Request(req_url)
+
+    spider = scrapy.Spider("example")
+    spider.splash = {"args": {"images": 0}}
+
+    mw = _get_mw()
+    req1 = mw.process_request(req1, spider)
+    assert "_splash_processed" in req1.meta
+    assert "render.json" in req1.url
+    assert "url" in json.loads(req1.body)
+    assert json.loads(req1.body).get("url") == req_url
+    assert "images" in json.loads(req1.body)
+    assert req1.method == 'POST'
+
+    # blank spider attribute disables the middleware
+    spider.splash = {}
+    req2 = mw.process_request(req1, spider)
+    assert req2 is None
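
Usage sketch (not part of the patches above): taken together, the series lets a spider opt in to Splash by declaring a `splash` class attribute instead of attaching request meta to every request. The spider below is a minimal sketch of that usage; the spider name and URLs are made up, and it assumes SPLASH_URL and the SplashMiddleware are enabled in settings as in the example project.

# -*- coding: utf-8 -*-
# Hypothetical spider illustrating the per-spider `splash` attribute
# added in PATCH 1/7. Names and URLs are placeholders, not from the PR.
import scrapy


class JsSketchSpider(scrapy.Spider):
    name = "js_sketch"
    start_urls = ['http://example.com/']

    # Picked up by SplashMiddleware.get_splash_options() for every request
    # from this spider; with no 'endpoint' given, the middleware default
    # applies (render.json, as asserted by test_spider_attribute).
    splash = {'args': {'html': 1, 'images': 0}}

    def parse(self, response):
        # Request-level options win over the spider attribute, because
        # get_splash_options() returns request_options or spider_options.
        yield scrapy.Request('http://example.com/next',
                             callback=self.parse_next,
                             meta={'splash': {'endpoint': 'render.html',
                                              'args': {'wait': 0.5}}})

    def parse_next(self, response):
        self.log("rendered page: {}".format(response.url))

Setting `splash = {}` (or omitting the attribute and the meta key) leaves the spider's requests untouched, which is exactly what the final test in PATCH 7/7 asserts.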