变更
This commit is contained in:
@@ -0,0 +1,197 @@
|
||||
import contextlib
|
||||
import os
|
||||
from unittest import mock
|
||||
|
||||
import pytest
|
||||
from scrapy import signals
|
||||
from scrapy.exceptions import DontCloseSpider
|
||||
from scrapy.settings import Settings
|
||||
|
||||
from scrapy_redis.spiders import RedisCrawlSpider, RedisSpider
|
||||
|
||||
# Location of the Redis instance these tests talk to. Both values can be
# overridden through the environment; the port is normalised to an int.
REDIS_HOST = os.environ.get("REDIS_HOST", "localhost")
REDIS_PORT = int(os.environ.get("REDIS_PORT", 6379))
|
||||
|
||||
|
||||
@contextlib.contextmanager
def flushall(server):
    """Context manager that calls ``server.flushall()`` when the block exits.

    The flush runs in a ``finally`` clause, so it happens whether the body
    completes normally or raises; any exception from the body propagates.
    Nothing is yielded to the ``with`` statement.
    """
    try:
        yield
    finally:
        server.flushall()
|
||||
|
||||
|
||||
class MySpider(RedisSpider):
    """Minimal ``RedisSpider`` subclass used as a test subject."""

    name = "myspider"
|
||||
|
||||
|
||||
class MyCrawlSpider(RedisCrawlSpider):
    """Minimal ``RedisCrawlSpider`` subclass used as a test subject."""

    # Same spider name as MySpider.
    name = "myspider"
|
||||
|
||||
|
||||
def get_crawler(**kwargs):
    """Return a ``mock.Mock`` standing in for a crawler.

    The mock's ``settings`` attribute is a ``Settings`` object populated with
    the module-level ``REDIS_HOST`` and ``REDIS_PORT``. Any keyword arguments
    are forwarded to the ``mock.Mock`` constructor.
    """
    return mock.Mock(
        settings=Settings(
            {
                "REDIS_HOST": REDIS_HOST,
                "REDIS_PORT": REDIS_PORT,
            }
        ),
        **kwargs,
    )
|
||||
|
||||
|
||||
class TestRedisMixin_setup_redis:
    """Tests for ``setup_redis``: argument validation and repeat calls."""

    def setup_method(self):
        """Create a fresh ``MySpider`` before each test.

        Uses pytest's xunit-style ``setup_method`` hook: the nose-style
        ``setup`` name is no longer invoked by pytest 8, which would leave
        ``self.myspider`` undefined in every test.
        """
        self.myspider = MySpider()

    def test_crawler_required(self):
        """``setup_redis`` without a crawler raises a ValueError naming it."""
        with pytest.raises(ValueError) as excinfo:
            self.myspider.setup_redis()
        assert "crawler" in str(excinfo.value)

    def test_requires_redis_key(self):
        """An empty ``redis_key`` is rejected with a ValueError naming it."""
        self.myspider.crawler = get_crawler()
        self.myspider.redis_key = ""
        with pytest.raises(ValueError) as excinfo:
            self.myspider.setup_redis()
        assert "redis_key" in str(excinfo.value)

    def test_invalid_batch_size(self):
        """A non-numeric ``redis_batch_size`` raises a ValueError naming it."""
        self.myspider.redis_batch_size = "x"
        self.myspider.crawler = get_crawler()
        with pytest.raises(ValueError) as excinfo:
            self.myspider.setup_redis()
        assert "redis_batch_size" in str(excinfo.value)

    def test_invalid_idle_time(self):
        """A non-numeric ``max_idle_time`` raises a ValueError naming it."""
        self.myspider.max_idle_time = "x"
        self.myspider.crawler = get_crawler()
        with pytest.raises(ValueError) as excinfo:
            self.myspider.setup_redis()
        assert "max_idle_time" in str(excinfo.value)

    @mock.patch("scrapy_redis.spiders.connection")
    def test_via_from_crawler(self, connection):
        """``from_crawler`` sets up Redis once; a repeat call changes nothing.

        ``scrapy_redis.spiders.connection`` is patched, so no real Redis
        connection is opened by this test.
        """
        server = connection.from_settings.return_value = mock.Mock()
        crawler = get_crawler()
        myspider = MySpider.from_crawler(crawler)
        assert myspider.server is server
        connection.from_settings.assert_called_with(crawler.settings)
        crawler.signals.connect.assert_called_with(
            myspider.spider_idle, signal=signals.spider_idle
        )
        # Second call does nothing.
        server = myspider.server
        crawler.signals.connect.reset_mock()
        myspider.setup_redis()
        assert myspider.server is server
        assert crawler.signals.connect.call_count == 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "spider_cls",
    [
        MySpider,
        MyCrawlSpider,
    ],
)
def test_from_crawler_with_spider_arguments(spider_cls):
    """Spider arguments passed to ``from_crawler`` end up on the spider.

    Checks that the positional name replaces the class-level one, that
    ``%(name)s`` in ``redis_key`` is filled in with that name, and that the
    string values for ``redis_batch_size`` and ``max_idle_time`` come back
    as integers.
    """
    crawler = get_crawler()
    spider = spider_cls.from_crawler(
        crawler,
        "foo",
        redis_key="key:%(name)s",
        redis_batch_size="2000",
        max_idle_time="100",
    )
    assert spider.name == "foo"
    assert spider.redis_key == "key:foo"
    assert spider.redis_batch_size == 2000
    assert spider.max_idle_time == 100
|
||||
|
||||
|
||||
class MockRequest(mock.Mock):
    """Mock request whose identity (equality and hash) is its ``url``."""

    def __init__(self, url, **kwargs):
        """Store ``url``; any other keyword arguments are accepted and ignored."""
        super().__init__()
        self.url = url

    def __eq__(self, other):
        """Compare by URL with anything that exposes a ``url`` attribute.

        Returns ``NotImplemented`` for objects without ``url`` so Python can
        fall back to the other operand's comparison (e.g. ``mock.ANY``)
        instead of this method raising ``AttributeError``.
        """
        try:
            other_url = other.url
        except AttributeError:
            return NotImplemented
        return self.url == other_url

    def __hash__(self):
        """Hash by URL, consistent with ``__eq__``."""
        return hash(self.url)

    def __repr__(self):
        """Show the class name and URL to keep assertion diffs readable."""
        return f"<{self.__class__.__name__}({self.url})>"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "spider_cls",
    [
        MySpider,
        MyCrawlSpider,
    ],
)
@pytest.mark.parametrize("start_urls_as_zset", [False, True])
@pytest.mark.parametrize("start_urls_as_set", [False, True])
@mock.patch("scrapy.spiders.Request", MockRequest)
def test_consume_urls_from_redis(start_urls_as_zset, start_urls_as_set, spider_cls):
    """Start URLs pushed to Redis are consumed in batches of ``batch_size``.

    Twice ``batch_size`` URLs are written under the start-URLs key (as a set,
    a sorted set, or a list, depending on the parameters). The first batch
    must come out of ``start_requests()``; the second must be handed to
    ``crawler.engine.crawl`` by the two ``spider_idle()`` calls. The spider's
    Redis server is flushed when the scenario ends.
    """
    batch_size = 5
    redis_key = "start:urls"
    crawler = get_crawler()
    crawler.settings.setdict(
        {
            "REDIS_HOST": REDIS_HOST,
            "REDIS_PORT": REDIS_PORT,
            "REDIS_START_URLS_KEY": redis_key,
            "REDIS_START_URLS_AS_ZSET": start_urls_as_zset,
            "REDIS_START_URLS_AS_SET": start_urls_as_set,
            "CONCURRENT_REQUESTS": batch_size,
        }
    )
    spider = spider_cls.from_crawler(crawler)
    with flushall(spider.server):
        urls = [f"http://example.com/{i}" for i in range(batch_size * 2)]
        reqs = []
        # Pick the write operation that matches the container type under
        # test; the set flag is checked first when both flags are enabled.
        if start_urls_as_set:
            server_put = spider.server.sadd
        elif start_urls_as_zset:

            def server_put(key, value):
                spider.server.zadd(key, {value: 0})

        else:
            server_put = spider.server.rpush
        for url in urls:
            server_put(redis_key, url)
            reqs.append(MockRequest(url))

        # First call is to start requests.
        start_requests = list(spider.start_requests())
        if start_urls_as_zset or start_urls_as_set:
            # For set/zset storage only the batch size and membership are
            # checked, not which URLs come back or in what order.
            assert len(start_requests) == batch_size
            assert {r.url for r in start_requests}.issubset(r.url for r in reqs)
        else:
            assert start_requests == reqs[:batch_size]

        # Second call is to spider idle method.
        with pytest.raises(DontCloseSpider):
            spider.spider_idle()
        # Process remaining requests in the queue.
        with pytest.raises(DontCloseSpider):
            spider.spider_idle()

        # Last batch was passed to crawl.
        assert crawler.engine.crawl.call_count == batch_size

        if start_urls_as_zset or start_urls_as_set:
            crawler.engine.crawl.assert_has_calls(
                [mock.call(req) for req in reqs if req not in start_requests],
                any_order=True,
            )
        else:
            crawler.engine.crawl.assert_has_calls(
                [mock.call(req) for req in reqs[batch_size:]]
            )
|
||||
Reference in New Issue
Block a user