Skip to content

Commit 95f0079

Browse files
committed
Fix SplashRequest
1 parent 810e8db commit 95f0079

File tree

2 files changed: +21 -7 lines changed

scrapyjs/middleware.py

+16 -7
Original file line number | Diff line number | Diff line change
@@ -36,8 +36,10 @@ def __init__(self, crawler, splash_base_url, slot_policy):
3636

3737
@classmethod
3838
def from_crawler(cls, crawler):
39-
splash_base_url = crawler.settings.get('SPLASH_URL', cls.default_splash_url)
40-
slot_policy = crawler.settings.get('SPLASH_SLOT_POLICY', cls.default_policy)
39+
splash_base_url = crawler.settings.get('SPLASH_URL',
40+
cls.default_splash_url)
41+
slot_policy = crawler.settings.get('SPLASH_SLOT_POLICY',
42+
cls.default_policy)
4143

4244
if slot_policy not in SlotPolicy._known:
4345
raise NotConfigured("Incorrect slot policy: %r" % slot_policy)
@@ -49,17 +51,20 @@ def process_request(self, request, spider):
4951
if not splash_options:
5052
return
5153

54+
if request.meta.get("_splash_processed"):
55+
# don't process the same request more than once
56+
return
57+
5258
if request.method != 'GET':
5359
logger.warn(
54-
"Currently only GET requests are supported by SplashMiddleware; "
55-
"%(request)s will be handled without Splash",
60+
"Currently only GET requests are supported by SplashMiddleware;"
61+
" %(request)s will be handled without Splash",
5662
{'request': request},
5763
extra={'spider': spider}
5864
)
5965
return request
6066

6167
meta = request.meta
62-
del meta['splash']
6368
meta['_splash_processed'] = splash_options
6469

6570
slot_policy = splash_options.get('slot_policy', self.slot_policy)
@@ -83,7 +88,9 @@ def process_request(self, request, spider):
8388
# But we can change Scrapy `download_timeout`: increase
8489
# it when it's too small. Decreasing `download_timeout` is not
8590
# safe.
86-
timeout_current = meta.get('download_timeout', 1e6) # no timeout means infinite timeout
91+
92+
# no timeout means infinite timeout
93+
timeout_current = meta.get('download_timeout', 1e6)
8794
timeout_expected = float(args['timeout']) + self.splash_extra_timeout
8895

8996
if timeout_expected > timeout_current:
@@ -131,4 +138,6 @@ def _set_download_slot(self, request, meta, slot_policy):
131138
pass
132139

133140
def _get_slot_key(self, request_or_response):
134-
return self.crawler.engine.downloader._get_slot_key(request_or_response, None)
141+
return self.crawler.engine.downloader._get_slot_key(
142+
request_or_response, None
143+
)

scrapyjs/request.py

+5
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,11 @@ def __init__(self, url=None, *args, **kwargs):
2323
meta['splash'] = copy.deepcopy(self.default_splash_meta)
2424
super(SplashRequest, self).__init__(url, *args, meta=meta, **kwargs)
2525

26+
def replace(self, *args, **kwargs):
27+
cls = super(SplashRequest, self).replace(*args, **kwargs)
28+
cls._original_url = self._original_url
29+
return cls
30+
2631
def __str__(self):
2732
return "<%s %s %s>" % (self.method, self.url, self._original_url)
2833

0 commit comments

Comments
 (0)