middleware.py
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import json
import logging

from urlparse import urljoin

from scrapy.exceptions import NotConfigured
from scrapy import log
from scrapy.http.response.html import HtmlResponse
from scrapy.http.headers import Headers


class SlotPolicy(object):
    PER_DOMAIN = 'per_domain'
    SINGLE_SLOT = 'single_slot'
    SCRAPY_DEFAULT = 'scrapy_default'

    _known = {PER_DOMAIN, SINGLE_SLOT, SCRAPY_DEFAULT}


class SplashMiddleware(object):
    """
    Scrapy downloader middleware that passes requests through Splash
    when 'splash' Request.meta key is set.
    """
    default_splash_url = 'http://127.0.0.1:8050'
    default_endpoint = "render.json"
    splash_extra_timeout = 5.0
    default_policy = SlotPolicy.PER_DOMAIN

    def __init__(self, crawler, splash_base_url, slot_policy):
        self.crawler = crawler
        self.splash_base_url = splash_base_url
        self.slot_policy = slot_policy

    def get_splash_options(self, request, spider):
        if request.meta.get("dont_proxy"):
            return

        spider_options = getattr(spider, "splash", {})
        request_options = request.meta.get("splash")
        return request_options or spider_options
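
    # Splash handling is enabled either by a ``splash`` dict defined on the
    # spider class or by a ``splash`` key in ``request.meta``; per-request
    # options take precedence, and ``meta['dont_proxy'] = True`` bypasses
    # Splash entirely. A minimal sketch of what a spider might set (the
    # 'html'/'png' arguments are illustrative render.json options, not taken
    # from this file):
    #
    #   yield Request(url, self.parse_result, meta={
    #       'splash': {
    #           'args': {'html': 1, 'png': 1},   # extra Splash arguments
    #           'endpoint': 'render.json',       # optional, this is the default
    #       }
    #   })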

    @classmethod
    def from_crawler(cls, crawler):
        splash_base_url = crawler.settings.get('SPLASH_URL', cls.default_splash_url)
        slot_policy = crawler.settings.get('SPLASH_SLOT_POLICY', cls.default_policy)

        if slot_policy not in SlotPolicy._known:
            raise NotConfigured("Incorrect slot policy: %r" % slot_policy)

        return cls(crawler, splash_base_url, slot_policy)
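
    # The middleware is configured through Scrapy settings. A sketch of the
    # relevant settings.py entries; the middleware import path and priority
    # below are illustrative assumptions, only SPLASH_URL and
    # SPLASH_SLOT_POLICY are read by this file:
    #
    #   SPLASH_URL = 'http://127.0.0.1:8050'   # the default used above
    #   SPLASH_SLOT_POLICY = 'per_domain'      # or 'single_slot' / 'scrapy_default'
    #   DOWNLOADER_MIDDLEWARES = {
    #       'scrapyjs.SplashMiddleware': 725,
    #   }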

    def process_request(self, request, spider):
        splash_options = self.get_splash_options(request, spider)
        if not splash_options:
            return
        elif request.meta.get("_splash_processed"):
            return

        if request.method != 'GET':
            log.msg("Currently only GET requests are supported by "
                    "SplashMiddleware; %s will be handled without Splash" %
                    request, logging.WARNING)
            return request

        meta = request.meta

        slot_policy = splash_options.get('slot_policy', self.slot_policy)
        self._set_download_slot(request, meta, slot_policy)

        args = splash_options.setdefault('args', {})
        args['url'] = request.url
        body = json.dumps(args, ensure_ascii=False)

        if 'timeout' in args:
            # User requested a Splash timeout explicitly.
            #
            # We can't catch a case when user requested `download_timeout`
            # explicitly because a default value for `download_timeout`
            # is set by DownloadTimeoutMiddleware.
            #
            # As user requested Splash timeout explicitly, we shouldn't change
            # it. Another reason not to change the requested Splash timeout is
            # because it may cause a validation error on the remote end.
            #
            # But we can change Scrapy `download_timeout`: increase
            # it when it's too small. Decreasing `download_timeout` is not
            # safe.
            timeout_current = meta.get('download_timeout', 1e6)  # no timeout means infinite timeout
            timeout_expected = float(args['timeout']) + self.splash_extra_timeout

            if timeout_expected > timeout_current:
                meta['download_timeout'] = timeout_expected
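            # For example, with args['timeout'] == 30 and the default
            # splash_extra_timeout of 5.0, download_timeout is raised to at
            # least 35.0 seconds, so Scrapy does not give up on the request
            # before Splash itself times out (illustrative numbers only).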

        endpoint = splash_options.setdefault('endpoint', self.default_endpoint)
        splash_base_url = splash_options.get('splash_url', self.splash_base_url)
        splash_url = urljoin(splash_base_url, endpoint)

        meta['_splash_processed'] = True
        req_rep = request.replace(
            url=splash_url,
            method='POST',
            body=body,
            # FIXME: original HTTP headers (including cookies)
            # are not respected.
            headers=Headers({'Content-Type': 'application/json'}),
        )

        self.crawler.stats.inc_value('splash/%s/request_count' % endpoint)
        return req_rep
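
    # With the defaults above, a GET request for http://example.com is
    # rewritten into roughly the following before it reaches the downloader
    # (a sketch; the exact JSON body depends on the spider's Splash args):
    #
    #   POST http://127.0.0.1:8050/render.json
    #   Content-Type: application/json
    #
    #   {"url": "http://example.com"}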

    def process_response(self, request, response, spider):
        splash_options = self.get_splash_options(request, spider)
        if splash_options:
            endpoint = splash_options['endpoint']
            self.crawler.stats.inc_value(
                'splash/%s/response_count/%s' % (endpoint, response.status)
            )
        return response

    def _set_download_slot(self, request, meta, slot_policy):
        if slot_policy == SlotPolicy.PER_DOMAIN:
            # Use the same download slot to (sort of) respect download
            # delays and concurrency options.
            meta['download_slot'] = self._get_slot_key(request)

        elif slot_policy == SlotPolicy.SINGLE_SLOT:
            # Use a single slot for all Splash requests
            meta['download_slot'] = '__splash__'

        elif slot_policy == SlotPolicy.SCRAPY_DEFAULT:
            # Use standard Scrapy concurrency setup
            pass

    def _get_slot_key(self, request_or_response):
        return self.crawler.engine.downloader._get_slot_key(request_or_response, None)
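

# The middleware does not convert the Splash reply back into the original
# page: the spider callback receives the raw render.json response. A sketch
# of how a callback might read it, assuming 'html' and 'png' were requested
# in the Splash args (field names follow Splash's render.json output, not
# anything defined in this file):
#
#   def parse_result(self, response):
#       data = json.loads(response.body_as_unicode())
#       html = data.get('html')   # rendered page source, when html=1
#       png = data.get('png')     # base64-encoded screenshot, when png=1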