This commit is contained in:
2025-08-05 09:19:34 +08:00
commit 584548d006
1696 changed files with 53855 additions and 0 deletions
@@ -0,0 +1,18 @@
skips:
- B101
- B105
- B301
- B303
- B306
- B307
- B311
- B320
- B321
- B324
- B403
- B404
- B406
- B410
- B503
- B603
- B605
@@ -0,0 +1,35 @@
[bumpversion]
current_version = 0.9.1
commit = False
tag = False
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>\w+))?
serialize =
{major}.{minor}.{patch}-{release}
{major}.{minor}.{patch}
[bumpversion:part:release]
optional_value = placeholder
values =
a1
b1
rc1
placeholder
[bumpversion:file:VERSION]
search = {current_version}
replace = {new_version}
[bumpversion:file:src/scrapy_redis/__init__.py]
search = __version__ = "{current_version}"
replace = __version__ = "{new_version}"
[bumpversion:file:.cookiecutterrc]
search = version: {current_version}
replace = version: {new_version}
[bumpversion:file:HISTORY.rst]
search = .. bumpversion marker
replace = .. bumpversion marker
{new_version} ({now:%Y-%m-%d})
------------------
@@ -0,0 +1,19 @@
# Generated by cookiepatcher, a small shim around cookiecutter (pip install cookiepatcher)
cookiecutter:
email: rolando at rmax.io
full_name: Rolando Espinoza
github_username: rolando
project_name: Scrapy-Redis
project_package: scrapy_redis
project_short_description: Redis-based components for Scrapy.
project_slug: scrapy-redis
pypi_username: rolando
use_codecov: y
use_cython: n
use_landscape: y
use_pypi_deployment_with_travis: n
use_pytest: y
use_requiresio: y
version: 0.9.1
year: 2011-2022
@@ -0,0 +1,25 @@
[paths]
source =
src
[run]
omit = setup.py
branch = true
source =
scrapy_redis
tests
parallel = true
[report]
show_missing = true
precision = 2
omit = */__init__.py
exclude_lines =
pragma: no cover
def __repr__
if self.debug:
if settings.DEBUG
raise AssertionError
raise NotImplementedError
if 0:
if __name__ == .__main__.:
@@ -0,0 +1,46 @@
*.py[cod]
*.swp
*~
.ropeproject
# C extensions
*.so
# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg
lib
lib64
__pycache__
# Installer logs
pip-log.txt
# Unit test / coverage reports
.coverage
.tox
nosetests.xml
# Translations
*.mo
# Mr Developer
.mr.developer.cfg
.project
.pydevproject
# JetBrains PyCharm IDE
/.idea/
.venv
.tags
@@ -0,0 +1,21 @@
# http://editorconfig.org
root = true
[*]
indent_style = space
indent_size = 4
trim_trailing_whitespace = true
insert_final_newline = true
charset = utf-8
end_of_line = lf
[*.bat]
indent_style = tab
end_of_line = crlf
[LICENSE]
insert_final_newline = false
[Makefile]
indent_style = tab
+12
View File
@@ -0,0 +1,12 @@
[flake8]
max-line-length = 119
ignore =
W503
P102
P103
exclude =
tests/test_spiders.py E731
docs/conf.py E265
@@ -0,0 +1,3 @@
# GitHub syntax highlighting
pixi.lock linguist-language=YAML
@@ -0,0 +1,11 @@
# Description
Please describe your problem/feature request/bug
# Steps to Reproduce
Please provide the steps to reproduce your problem/bug
# Error log
Please provide the error message or a screenshot for better understanding.
@@ -0,0 +1,25 @@
# Description
Please include a summary of the changes and the related issue. Please also include relevant motivation and context. List any dependencies that are required for this change.
Fixes #(issue)
# How Has This Been Tested?
Please describe the tests that you ran to verify your changes. Provide instructions so we can reproduce. Please also list any relevant details for your test configuration
- [ ] pytest
- [ ] Other test (please specify)
# Test Configuration:
- OS version:
- Necessary Libraries (optional):
# Checklist:
- [ ] My code follows the style guidelines of this project
- [ ] I have performed a self-review of my code
- [ ] I have commented my code, particularly in hard-to-understand areas
- [ ] I have made corresponding changes to the documentation
- [ ] My changes generate no new warnings
- [ ] I have added tests that prove my fix is effective or that my feature works
- [ ] New and existing unit tests pass locally with my changes
- [ ] Any dependent changes have been merged and published in downstream modules
@@ -0,0 +1,31 @@
# This is GitHub Action for cross platform building
name: build
on:
push:
branches: [master]
pull_request:
branches: [master]
jobs:
builds:
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
python-version: ["3.12"]
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Run build
env:
TOXENV: build
run: |
pip install -r requirements-tests.txt
tox
@@ -0,0 +1,41 @@
# This is GitHub Action for linting and security check
name: check
on:
push:
branches: [master]
pull_request:
branches: [master]
concurrency:
group: ${{github.workflow}}-${{ github.ref }}
cancel-in-progress: true
jobs:
checks:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: ["3.12"]
env: [security, flake8]
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Run check
env:
TOXENV: ${{ matrix.env }}
run: |
pip install -r requirements-tests.txt
tox
pre-commit:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: pre-commit/action@v3.0.0
@@ -0,0 +1,30 @@
# This is GitHub Action for building the documentation
name: docs
on:
push:
branches: [master]
pull_request:
branches: [master]
jobs:
builds:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: ["3.12"]
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Build docs
env:
TOXENV: docs
run: |
pip install -r requirements-tests.txt
tox
@@ -0,0 +1,43 @@
# This is GitHub Action for tests
name: test
on:
push:
branches: [master]
pull_request:
branches: [master]
jobs:
tests:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: ["3.12"]
services:
redis:
image: redis
options: >-
--health-cmd "redis-cli ping"
--health-interval 10s
--health-timeout 5s
--health-retries 5
container: python:${{ matrix.python-version }}
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Run pytest
env:
REDIS_HOST: redis
TOXENV: pytest
TOX_TESTENV_PASSENV: REDIS_HOST
run: |
pip install -r requirements-tests.txt
tox
@@ -0,0 +1,67 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
.venv
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# rope-vim
.ropeproject
# Extra
.DS_Store
.vscode
@@ -0,0 +1,2 @@
[settings]
profile = black
@@ -0,0 +1,36 @@
repos:
- repo: https://github.com/PyCQA/bandit
rev: 1.7.7
hooks:
- id: bandit
args: [-r, -c, .bandit.yml]
- repo: https://github.com/PyCQA/flake8
rev: 7.0.0
hooks:
- id: flake8
additional_dependencies:
- flake8-bugbear
- flake8-comprehensions
- flake8-debugger
#- flake8-docstrings
- flake8-string-format
- flake8-type-checking
- repo: https://github.com/psf/black.git
rev: 24.2.0
hooks:
- id: black
- repo: https://github.com/pycqa/isort
rev: 5.13.2
hooks:
- id: isort
- repo: https://github.com/adamchainz/blacken-docs
rev: 1.16.0
hooks:
- id: blacken-docs
additional_dependencies:
- black==24.2.0
- repo: https://github.com/asottile/pyupgrade
rev: v3.15.2
hooks:
- id: pyupgrade
args: [--py38-plus, --keep-runtime-typing]
@@ -0,0 +1 @@
3.10.13
@@ -0,0 +1,17 @@
version: 2
formats: all
sphinx:
configuration: docs/conf.py
fail_on_warning: true
build:
os: ubuntu-22.04
tools:
# For available versions, see:
# https://docs.readthedocs.io/en/stable/config-file/v2.html#build-tools-python
python: "3.12"
python:
install:
- requirements: docs/requirements.txt
- path: .
@@ -0,0 +1,68 @@
language: python
python: 3.5
sudo: false
services:
- redis-server
env:
- TOXENV=py27-scrapyrel
- TOXENV=py34-scrapyrel
- TOXENV=py35-scrapyrel
matrix:
fast_finish: true
before_install:
- python --version
- uname -a
- lsb_release -a
# command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
install:
- pip install -U pip wheel
- pip install -U tox twine coverage
- virtualenv --version
- pip --version
- tox --version
# command to run tests, e.g. python setup.py test
script:
- tox -e $TOXENV --workdir $HOME/.tox
after_success:
# Codecov requires a single .coverage and will run 'coverage xml' to
# generate the report.
- coverage combine
- bash <(curl -s https://codecov.io/bash)
after_failure:
- more $HOME/.tox/log/* | cat
- more $HOME/.tox/*/log/* | cat
before_cache:
- rm -fr $HOME/.cache/pip/log
- rm -fr $HOME/.tox/log/*
- rm -fr $HOME/.tox/*/log/*
cache:
directories:
- $HOME/.cache/pip
- $HOME/.tox/
notifications:
email:
on_success: never
on_failure: always
deploy:
provider: pypi
distributions: "sdist bdist_wheel"
user: darkrho
password:
secure: "Pgcj+Otx9o2MxOuXibvz9LUd5DqlW0jaKDScVOAcFT+//U0esjRqY08bRFQlrSTXokJa6X/dVZlb2mQE8L4vr7mLFspRGO4FByK34L089/ETwsLKI2rks2zVbmPSyweL3sz88EXLKmYs7WsKtCnET67qra6hreKbO67ALAh5WWk="
on:
tags: true
all_branches: true
repo: rolando/scrapy-redis
condition: "$TOXENV == py35-scrapyrel"
@@ -0,0 +1,13 @@
=======
Credits
=======
Development Lead
----------------
* R Max Espinoza <hey at rmax.dev>
Contributors
------------
None yet. Why not be the first?
@@ -0,0 +1,138 @@
.. highlight:: shell
============
Contribution
============
Contributions are welcome, and they are greatly appreciated! Every
little bit helps, and credit will always be given.
You can contribute in many ways:
Types of Contributions
----------------------
New to here
~~~~~~~~~~~
Any issue with the "good first issue" tag on it is a great place to start! Feel free to ask any questions there.
Don't know how to start
~~~~~~~~~~~~~~~~~~~~~~~
Reviewing the codebase and PRs is a good way to learn what's going on here!
Report Bugs
~~~~~~~~~~~
Report bugs at https://github.com/rmax/scrapy-redis/issues.
If you are reporting a bug, please include:
* Your operating system name and version.
* Any details about your local setup that might be helpful in troubleshooting.
* Detailed steps to reproduce the bug.
Fix Bugs
~~~~~~~~
Look through the GitHub issues for bugs. Anything tagged with "bug"
is open to whoever wants to implement it.
Implement Features & improvements
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Look through the GitHub issues for features. Anything tagged with "feature" or "improvements"
is open to whoever wants to implement it.
Write Documentation
~~~~~~~~~~~~~~~~~~~
Scrapy-Redis could always use more documentation, whether as part of the
official Scrapy-Redis docs, in docstrings, or even on the web in blog posts,
articles, and such.
Submit Feedback
~~~~~~~~~~~~~~~
The best way to send feedback is to file an issue at https://github.com/rmax/scrapy-redis/issues.
If you are proposing a feature:
* Explain in detail how it would work.
* Keep the scope as narrow as possible, to make it easier to implement.
* Remember that this is a volunteer-driven project, and that contributions
are welcome :)
Get Started!
------------
Ready to contribute? Here's how to set up `scrapy-redis` for local development.
Setup environment
~~~~~~~~~~~~~~~~~
1. Fork the `scrapy-redis` repo on GitHub.
2. Clone your fork locally::
git clone git@github.com:your_name_here/scrapy-redis.git
3. Install your local copy into a virtualenv. This is how you set up your fork for local development::
pip install virtualenv==20.0.23
virtualenv --python=/usr/bin/python3 ~/scrapy_redis
source ~/scrapy_redis/bin/activate
cd scrapy-redis/
pip install -r requirements-install.txt
pip install .
4. Create a branch for local development::
git checkout -b name-of-your-bugfix-or-feature
Now you can make your changes locally.
Setup testing environment
~~~~~~~~~~~~~~~~~~~~~~~~~
1. When you're done making changes, check that your changes pass flake8 and the tests, including testing other Python versions with tox::
pip install -r requirements-tests.txt
flake8 src/ tests/
python -m pytest --ignore=setup.py
tox
2. If the error `No module named scrapy_redis` shows up, make sure `scrapy-redis` from your branch is installed by running::
pip install .
3. Or change the import lines::
from scrapy_redis import xxx # from this
from src.scrapy_redis import xxx # to this
4. Commit your changes and push your branch to GitHub::
git add .
git commit -m "Your detailed description of your changes."
git push origin name-of-your-bugfix-or-feature
5. Submit a pull request through the GitHub website.
Pull Request Guidelines
-----------------------
Before you submit a pull request, check that it meets these guidelines:
1. The pull request should include tests.
2. If the pull request adds functionality, the docs should be updated. Put
your new functionality into a function with a docstring, and add the
feature to the list in README.rst.
3. Make sure that the tests pass for all supported Python versions.
Tips
----
To run a subset of tests::
pytest tests/test_scrapy_redis
@@ -0,0 +1,16 @@
FROM python:3.11-slim
# Set working directory
WORKDIR /app
# Install the runtime and test dependencies (tox is expected to come from requirements-tests.txt)
COPY requirements.txt .
COPY requirements-tests.txt .
RUN pip install -r requirements.txt -r requirements-tests.txt
# Copy your project code
COPY . .
# Run Tox tests
CMD ["tox"]
+136
View File
@@ -0,0 +1,136 @@
=======
History
=======
.. bumpversion marker
0.9.1 (2024-07-06)
------------------
* Fixed docs build.
0.9.0 (2024-07-06)
------------------
* Fixed ``Scheduler`` not compatible with BaseDupeFilter (#294)
* Added precommit hooks.
* Switched to Python 3.12 as default build version.
0.8.0 (2024-07-03)
------------------
* Fixed request fingerprint method.
* Fixed support for Scrapy 2.6+.
* Fixed tox tests and github workflow.
* Deprecated ``REDIS_START_URLS_BATCH_SIZE``.
0.7.3 (2022-07-21)
------------------
* Move docs to GitHub Wiki
* Update tox and support dynamic tests
* Update support for json data
* Refactor max idle time
* Add support for python3.7~python3.10
* Deprecate python2.x support
0.7.2 (2021-12-27)
------------------
* Fix RedisStatsCollector._get_key()
* Fix redis-py dependency version
* Added maximum idle waiting time MAX_IDLE_TIME_BEFORE_CLOSE
0.7.1 (2021-03-27)
------------------
* Fixes datetime parse error for redis-py 3.x.
* Add support for stats extensions.
0.7.1-rc1 (2021-03-27)
----------------------
* Fixes datetime parse error for redis-py 3.x.
0.7.1-b1 (2021-03-22)
---------------------
* Add support for stats extensions.
0.7.0-dev (unreleased)
----------------------
* Unreleased.
0.6.8 (2017-02-14)
------------------
* Fixed automated release due to not matching registered email.
0.6.7 (2016-12-27)
------------------
* Fixes bad formatting in logging message.
0.6.6 (2016-12-20)
------------------
* Fixes wrong message on dupefilter duplicates.
0.6.5 (2016-12-19)
------------------
* Fixed typo in default settings.
0.6.4 (2016-12-18)
------------------
* Fixed data decoding in Python 3.x.
* Added ``REDIS_ENCODING`` setting (default ``utf-8``).
* Default to ``CONCURRENT_REQUESTS`` value for ``REDIS_START_URLS_BATCH_SIZE``.
* Renamed queue classes to a proper naming convention (backwards compatible).
0.6.3 (2016-07-03)
------------------
* Added ``REDIS_START_URLS_KEY`` setting.
* Fixed spider method ``from_crawler`` signature.
0.6.2 (2016-06-26)
------------------
* Support ``redis_cls`` parameter in ``REDIS_PARAMS`` setting.
* Python 3.x compatibility fixed.
* Added ``SCHEDULER_SERIALIZER`` setting.
0.6.1 (2016-06-25)
------------------
* **Backwards incompatible change:** Require explicit ``DUPEFILTER_CLASS``
setting.
* Added ``SCHEDULER_FLUSH_ON_START`` setting.
* Added ``REDIS_START_URLS_AS_SET`` setting.
* Added ``REDIS_ITEMS_KEY`` setting.
* Added ``REDIS_ITEMS_SERIALIZER`` setting.
* Added ``REDIS_PARAMS`` setting.
* Added ``REDIS_START_URLS_BATCH_SIZE`` spider attribute to read start urls
in batches.
* Added ``RedisCrawlSpider``.
0.6.0 (2015-07-05)
------------------
* Updated code to be compatible with Scrapy 1.0.
* Added `-a domain=...` option for example spiders.
0.5.0 (2013-09-02)
------------------
* Added `REDIS_URL` setting to support Redis connection string.
* Added `SCHEDULER_IDLE_BEFORE_CLOSE` setting to prevent the spider closing too
quickly when the queue is empty. Default value is zero keeping the previous
behavior.
* Schedule preemptively requests on item scraped.
* This version is the latest release compatible with Scrapy 0.24.x.
0.4.0 (2013-04-19)
------------------
* Added `RedisSpider` and `RedisMixin` classes as building blocks for spiders
to be fed through a redis queue.
* Added redis queue stats.
* Let the encoder handle the item as it comes instead converting it to a dict.
0.3.0 (2013-02-18)
------------------
* Added support for different queue classes.
* Changed requests serialization from `marshal` to `cPickle`.
0.2.0 (2013-02-17)
------------------
* Improved backward compatibility.
* Added example project.
0.1.0 (2011-09-01)
------------------
* First release on PyPI.
+19
View File
@@ -0,0 +1,19 @@
Copyright (c) 2011-2024, R Max Espinoza
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@@ -0,0 +1,16 @@
graft docs
graft src
graft tests
graft example-project
include *.in
include *.ini
include *.rst
include *.txt
include LICENSE
include VERSION
include Makefile
global-exclude __pycache__ *.py[cod]
global-exclude *.so *.dylib
+156
View File
@@ -0,0 +1,156 @@
# The targets listed here are commands, not files. Declaring them phony keeps
# a stray file or directory of the same name from making a target look up to date.
.PHONY: clean-so clean-test clean-pyc clean-build clean-docs clean
.PHONY: docs check check-manifest check-setup check-history lint
.PHONY: test test-all coverage
.PHONY: compile-reqs install-reqs
.PHONY: release dist install build-inplace
# Command-style targets that also have rules in this Makefile.
.PHONY: help develop docs-build servedocs dist-upload
# Python snippet executed through `python -c` (see BROWSER below): opens the
# file passed as its first argument in the default web browser.
# Only ImportError is caught, and the script exits with the upgrade message
# instead of continuing to a NameError on the missing `pathname2url`.
define BROWSER_PYSCRIPT
import os, webbrowser, sys

FAIL = "\033[91m"
ENDC = "\033[0m"

try:
    from urllib.request import pathname2url
except ImportError:
    sys.exit(FAIL + "Python2 is deprecated, please upgrade your python >= 3.7" + ENDC)

webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1])))
endef
export BROWSER_PYSCRIPT
BROWSER := python -c "$$BROWSER_PYSCRIPT"
SPHINX_BUILD := html
help:
@echo "check - check setup, code style, setup, etc"
@echo "check-manifest - check manifest"
@echo "check-setup - check setup"
@echo "check-history - check history"
@echo "clean - remove all build, test, coverage and Python artifacts"
@echo "clean-build - remove build artifacts"
@echo "clean-docs - remove docs artifacts"
@echo "clean-pyc - remove Python file artifacts"
@echo "clean-test - remove test and coverage artifacts"
@echo "clean-so - remove compiled extensions"
@echo "lint - check style with flake8"
@echo "test - run tests quickly with the default Python"
@echo "test-all - run tests on every Python version with tox"
@echo "coverage - check code coverage quickly with the default Python"
@echo "compile-reqs - compile requirements"
@echo "install-reqs - install requirements"
@echo "docs - generate Sphinx HTML documentation, including API docs"
@echo "dist-upload - package and upload a release"
@echo "release - bump release and push changes"
@echo "dist - package"
@echo "develop - install package in develop mode"
@echo "install - install the package to the active Python's site-packages"
check: check-setup check-manifest check-history lint
check-setup:
@echo "Checking package metadata (name, description, etc)"
python setup.py check --strict --metadata --restructuredtext
check-manifest:
@echo "Checking MANIFEST.in"
check-manifest --ignore ".*"
# Verify that HISTORY.rst has a line starting with the version stored in VERSION.
# NOTE(review): the version is interpolated into the grep pattern unescaped, so
# each "." matches any character — confirm this looseness is acceptable.
check-history:
@echo "Checking latest version in HISTORY"
VERSION=`cat VERSION`; grep "^$${VERSION}\b" HISTORY.rst
clean: clean-build clean-docs clean-pyc clean-test clean-so
clean-build:
rm -fr build/
rm -fr dist/
rm -fr .eggs/
find . -name '*.egg-info' -exec rm -fr {} +
find . -name '*.egg' -exec rm -f {} +
clean-docs:
$(MAKE) -C docs clean
clean-pyc:
find . -name '*.pyc' -exec rm -f {} +
find . -name '*.pyo' -exec rm -f {} +
find . -name '*~' -exec rm -f {} +
find . -name '__pycache__' -exec rm -fr {} +
clean-test:
rm -fr .tox/
rm -f .coverage
rm -fr htmlcov/
clean-so:
find . -name '*.so' -exec rm -f {} +
lint:
flake8 src tests
build-inplace:
python setup.py build_ext --inplace
develop: clean
pip install -e .
test: develop
pytest --ignore=setup.py
test-all:
tox -v
coverage: develop
coverage run -m pytest --ignore=setup.py
coverage combine
coverage report
coverage html
$(BROWSER) htmlcov/index.html
docs-build: develop
rm -f docs/scrapy_redis.rst
rm -f docs/modules.rst
sphinx-apidoc -o docs/ src/scrapy_redis
$(MAKE) -C docs clean
$(MAKE) -C docs $(SPHINX_BUILD)
docs: docs-build
$(BROWSER) docs/_build/$(SPHINX_BUILD)/index.html
servedocs: docs
watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D .
release:
@echo "To do a release, follow the steps:"
@echo "- bumpversion release"
@echo "- Review and commit"
@echo "- git tag -a \`cat VERSION\`"
@echo "- git push --follow-tags"
dist-upload: clean check dist
twine upload dist/*
dist: clean
python setup.py sdist
python setup.py bdist_wheel
ls -l dist
install: clean
pip install .
# All requirements*.in files found in the current directory.
REQUIREMENTS_IN := $(wildcard requirements*.in)
# NOTE(review): declaring the .in files phony presumably makes the pattern rule
# below re-run pip-compile on every invocation — confirm that is intended.
.PHONY: $(REQUIREMENTS_IN)
# Compile requirements<suffix>.in into requirements<suffix>.txt with pip-compile.
requirements%.txt: requirements%.in
pip-compile -v $< -o $@
# The .txt name expected for each .in file; when that list is empty, fall back
# to whatever requirements*.txt files already exist.
REQUIREMENTS_TXT := $(REQUIREMENTS_IN:.in=.txt)
ifndef REQUIREMENTS_TXT
REQUIREMENTS_TXT := $(wildcard requirements*.txt)
endif
# Compile every requirements*.in file into its .txt counterpart (the work is
# done by the prerequisites). The notice is printed only when no .in file exists.
# The check expands the make variable directly: make variables are not visible
# to the recipe's shell unless exported, so a shell-side "$$REQUIREMENTS_TXT"
# test would always see an empty value.
compile-reqs: $(REQUIREMENTS_TXT)
	@test -n "$(strip $(REQUIREMENTS_IN))" || echo "No 'requirements*.in' files. Nothing to do"

# Install every requirements*.txt file with pip, one `pip install -r` per file.
# The notice is printed only when the list of .txt files is empty; `||` keeps
# the check from failing the target when there are files to install.
install-reqs:
	@test -n "$(strip $(REQUIREMENTS_TXT))" || echo "No 'requirements*.txt' files. Nothing to do"
	$(foreach req,$(REQUIREMENTS_TXT),pip install -r $(req);)
+110
View File
@@ -0,0 +1,110 @@
============
Scrapy-Redis
============
.. image:: https://readthedocs.org/projects/scrapy-redis/badge/?version=latest
:alt: Documentation Status
:target: https://readthedocs.org/projects/scrapy-redis/?badge=latest
.. image:: https://img.shields.io/pypi/v/scrapy-redis.svg
:target: https://pypi.python.org/pypi/scrapy-redis
.. image:: https://img.shields.io/pypi/pyversions/scrapy-redis.svg
:target: https://pypi.python.org/pypi/scrapy-redis
.. image:: https://github.com/rmax/scrapy-redis/actions/workflows/builds.yml/badge.svg
:target: https://github.com/rmax/scrapy-redis/actions/workflows/builds.yml
.. image:: https://github.com/rmax/scrapy-redis/actions/workflows/checks.yml/badge.svg
:target: https://github.com/rmax/scrapy-redis/actions/workflows/checks.yml
.. image:: https://github.com/rmax/scrapy-redis/actions/workflows/tests.yml/badge.svg
:target: https://github.com/rmax/scrapy-redis/actions/workflows/tests.yml
.. image:: https://codecov.io/github/rmax/scrapy-redis/coverage.svg?branch=master
:alt: Coverage Status
:target: https://codecov.io/github/rmax/scrapy-redis
.. image:: https://img.shields.io/badge/security-bandit-green.svg
:alt: Security Status
:target: https://github.com/rmax/scrapy-redis
Redis-based components for Scrapy.
* Usage: https://github.com/rmax/scrapy-redis/wiki/Usage
* Documentation: https://github.com/rmax/scrapy-redis/wiki.
* Release: https://github.com/rmax/scrapy-redis/wiki/History
* Contribution: https://github.com/rmax/scrapy-redis/wiki/Getting-Started
* LICENSE: MIT license
Features
--------
* Distributed crawling/scraping
You can start multiple spider instances that share a single redis queue.
Best suitable for broad multi-domain crawls.
* Distributed post-processing
Scraped items get pushed into a redis queue, meaning that you can start as
many post-processing processes as needed, all sharing the items queue.
* Scrapy plug-and-play components
Scheduler + Duplication Filter, Item Pipeline, Base Spiders.
* In this forked version: added support for ``json`` data in Redis
The data contains ``url``, ``meta`` and other optional parameters. ``meta`` is a nested json object which contains sub-data.
This feature extracts the data and sends another FormRequest with ``url``, ``meta`` and additional ``formdata``.
For example:
.. code-block:: json
{ "url": "https://example.com", "meta": {"job-id":"123xsd", "start-date":"dd/mm/yy"}, "url_cookie_key":"fertxsas" }
This data can be accessed in a `scrapy spider` through the response,
like: `request.url`, `request.meta`, `request.cookies`
.. note:: These features cover the basic case of distributing the workload across multiple workers. If you need more features like URL expiration, advanced URL prioritization, etc., we suggest you take a look at the Frontera_ project.
Requirements
------------
* Python 3.7+
* Redis >= 5.0
* ``Scrapy`` >= 2.0
* ``redis-py`` >= 4.0
Installation
------------
From pip
.. code-block:: bash
pip install scrapy-redis
From GitHub
.. code-block:: bash
git clone https://github.com/rmax/scrapy-redis.git
cd scrapy-redis
python setup.py install
.. note:: To use this json data feature, please make sure you have not installed scrapy-redis through pip. If you already have, uninstall it first.
.. code-block:: bash
pip uninstall scrapy-redis
Alternative Choice
---------------------------
Frontera_ is a web crawling framework consisting of `crawl frontier`_, and distribution/scaling primitives, allowing to build a large scale online web crawler.
.. _Frontera: https://github.com/scrapinghub/frontera
.. _crawl frontier: http://nlp.stanford.edu/IR-book/html/htmledition/the-url-frontier-1.html
+11
View File
@@ -0,0 +1,11 @@
TODO
====
* Add SCRAPY_JOB global support (jobs sharing same SCRAPY_JOB share same queues).
* Use a spider middleware instead of spider mixin. This will avoid the spider
idle signal hack.
* Allow to use pubsub whenever appropriate.
* Move example project to its own repository. Include different crawling use
cases (i.e.: producer/consumer).
* Add pyrebloom dupefilter.
* Warn and pass unserializable requests.
+1
View File
@@ -0,0 +1 @@
0.9.1
@@ -0,0 +1,20 @@
version: '3.8'
services:
python:
build: .
command: tox -e security,flake8,pytest
environment:
REDIS_HOST: redis # Use service name for hostname within docker network
REDIS_PORT: 6379
TOX_TESTENV_PASSENV: "REDIS_HOST REDIS_PORT"
volumes:
- ./:/app # Mount your project directory into the container
depends_on:
- redis
redis:
image: redis:6.2-alpine
ports:
- "6379:6379" # Map Redis port to host port
@@ -0,0 +1,177 @@
# Makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
PAPER =
BUILDDIR = _build
# User-friendly check for sphinx-build
ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
endif
# Internal variables.
PAPEROPT_a4 = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# Every target listed here is a command rather than a file; latexpdfja, texinfo
# and info have rules in this Makefile and are declared phony like the rest.
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf latexpdfja text man texinfo info changes linkcheck doctest gettext
help:
@echo "Please use \`make <target>' where <target> is one of"
@echo " html to make standalone HTML files"
@echo " dirhtml to make HTML files named index.html in directories"
@echo " singlehtml to make a single large HTML file"
@echo " pickle to make pickle files"
@echo " json to make JSON files"
@echo " htmlhelp to make HTML files and a HTML help project"
@echo " qthelp to make HTML files and a qthelp project"
@echo " devhelp to make HTML files and a Devhelp project"
@echo " epub to make an epub"
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
@echo " latexpdf to make LaTeX files and run them through pdflatex"
@echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
@echo " text to make text files"
@echo " man to make manual pages"
@echo " texinfo to make Texinfo files"
@echo " info to make Texinfo files and run them through makeinfo"
@echo " gettext to make PO message catalogs"
@echo " changes to make an overview of all changed/added/deprecated items"
@echo " xml to make Docutils-native XML files"
@echo " pseudoxml to make pseudoxml-XML files for display purposes"
@echo " linkcheck to check all external links for integrity"
@echo " doctest to run all doctests embedded in the documentation (if enabled)"
clean:
rm -rf $(BUILDDIR)/*
html:
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
dirhtml:
$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
singlehtml:
$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
@echo
@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
pickle:
$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
@echo
@echo "Build finished; now you can process the pickle files."
json:
$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
@echo
@echo "Build finished; now you can process the JSON files."
htmlhelp:
$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
@echo
@echo "Build finished; now you can run HTML Help Workshop with the" \
".hhp project file in $(BUILDDIR)/htmlhelp."
qthelp:
$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
@echo
@echo "Build finished; now you can run "qcollectiongenerator" with the" \
".qhcp project file in $(BUILDDIR)/qthelp, like this:"
@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/scrapy-redis.qhcp"
@echo "To view the help file:"
@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/scrapy-redis.qhc"
devhelp:
$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
@echo
@echo "Build finished."
@echo "To view the help file:"
@echo "# mkdir -p $$HOME/.local/share/devhelp/scrapy-redis"
@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/scrapy-redis"
@echo "# devhelp"
epub:
$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
@echo
@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
latex:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo
@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
@echo "Run \`make' in that directory to run these through (pdf)latex" \
"(use \`make latexpdf' here to do that automatically)."
latexpdf:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo "Running LaTeX files through pdflatex..."
$(MAKE) -C $(BUILDDIR)/latex all-pdf
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
latexpdfja:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo "Running LaTeX files through platex and dvipdfmx..."
$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
text:
$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
@echo
@echo "Build finished. The text files are in $(BUILDDIR)/text."
man:
$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
@echo
@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
texinfo:
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
@echo
@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
@echo "Run \`make' in that directory to run these through makeinfo" \
"(use \`make info' here to do that automatically)."
# Build Texinfo sources, then run makeinfo on them via a sub-make.
# $(MAKE) is used (as in the latexpdf targets) so that command-line flags
# and the jobserver are propagated to the recursive invocation.
info:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo "Running Texinfo files through makeinfo..."
	$(MAKE) -C $(BUILDDIR)/texinfo info
	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
gettext:
$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
@echo
@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
changes:
$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
@echo
@echo "The overview file is in $(BUILDDIR)/changes."
linkcheck:
$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
@echo
@echo "Link check complete; look for any errors in the above output " \
"or in $(BUILDDIR)/linkcheck/output.txt."
doctest:
$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
@echo "Testing of doctests in the sources finished, look at the " \
"results in $(BUILDDIR)/doctest/output.txt."
xml:
$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
@echo
@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
pseudoxml:
$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
@echo
@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
@@ -0,0 +1 @@
.. include:: ../AUTHORS.rst
@@ -0,0 +1,273 @@
#!/usr/bin/env python
#
# scrapy-redis documentation build configuration file, created by
# sphinx-quickstart on Tue Jul 9 22:26:36 2013.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
import os
import re
# If extensions (or modules to document with autodoc) are in another
# directory, add these directories to sys.path here. If the directory is
# relative to the documentation root, use os.path.abspath to make it
# absolute, like shown here.
# sys.path.insert(0, os.path.abspath('.'))
# Get the project root dir, which is the parent dir of the directory
# containing this file.
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# -- General configuration ---------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = [
"sphinx.ext.autodoc",
"sphinx.ext.napoleon",
"sphinx.ext.viewcode",
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]
# The suffix of source filenames.
source_suffix = ".rst"
# The encoding of source files.
# source_encoding = 'utf-8-sig'
# The master toctree document.
master_doc = "index"
# General information about the project.
project = "Scrapy-Redis"
copyright = "2011-2024, R Max Espinoza"
# The version info for the project you're documenting, acts as replacement
# for |version| and |release|, also used in various other places throughout
# the built documents.
#
# The full version, including alpha/beta/rc tags.
# Read through a context manager so the file handle is closed deterministically.
with open(os.path.join(project_root, "VERSION")) as _version_file:
    release = _version_file.read().strip()
# The short X.Y.Z version: the first ``digits.digits.digits`` match found in
# ``release`` (i.e. without any alpha/beta/rc suffix).
version = re.findall(r"\d+\.\d+\.\d+", release)[0]
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
# language = None
# There are two options for replacing |today|: either, you set today to
# some non-false value, then it is used:
# today = ''
# Else, today_fmt is used as the format for a strftime call.
# today_fmt = '%B %d, %Y'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ["_build"]
# The reST default role (used for this markup: `text`) to use for all
# documents.
# default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
# add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
# add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
# show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = "sphinx"
# A list of ignored prefixes for module index sorting.
# modindex_common_prefix = []
# If true, keep warnings as "system message" paragraphs in the built
# documents.
# keep_warnings = False
# -- Options for HTML output -------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
html_theme = "default"
# Theme options are theme-specific and customize the look and feel of a
# theme further. For a list of options available for each theme, see the
# documentation.
# html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
# html_theme_path = []
# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
# html_title = None
# A shorter title for the navigation bar. Default is the same as
# html_title.
# html_short_title = None
# The name of an image file (relative to this directory) to place at the
# top of the sidebar.
# html_logo = None
# The name of an image file (within the static path) to use as favicon
# of the docs. This file should be a Windows icon file (.ico) being
# 16x16 or 32x32 pixels large.
# html_favicon = None
# Add any paths that contain custom static files (such as style sheets)
# here, relative to this directory. They are copied after the builtin
# static files, so a file named "default.css" will overwrite the builtin
# "default.css".
# html_static_path = ["_static"]
# If not '', a 'Last updated on:' timestamp is inserted at every page
# bottom, using the given strftime format.
# html_last_updated_fmt = '%b %d, %Y'
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
# html_use_smartypants = True
# Custom sidebar templates, maps document names to template names.
# html_sidebars = {}
# Additional templates that should be rendered to pages, maps page names
# to template names.
# html_additional_pages = {}
# If false, no module index is generated.
# html_domain_indices = True
# If false, no index is generated.
# html_use_index = True
# If true, the index is split into individual pages for each letter.
# html_split_index = False
# If true, links to the reST sources are added to the pages.
# html_show_sourcelink = True
# If true, "Created using Sphinx" is shown in the HTML footer.
# Default is True.
# html_show_sphinx = True
# If true, "(C) Copyright ..." is shown in the HTML footer.
# Default is True.
# html_show_copyright = True
# If true, an OpenSearch description file will be output, and all pages
# will contain a <link> tag referring to it. The value of this option
# must be the base URL from which the finished HTML is served.
# html_use_opensearch = ''
# This is the file name suffix for HTML files (e.g. ".xhtml").
# html_file_suffix = None
# Output file base name for HTML help builder.
htmlhelp_basename = "scrapy_redisdoc"
# -- Options for LaTeX output ------------------------------------------
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
# 'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
# 'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
# 'preamble': '',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, author, documentclass
# [howto/manual]).
latex_documents = [
(
"index",
"scrapy_redis.tex",
"Scrapy-Redis Documentation",
"R Max Espinoza",
"manual",
),
]
# The name of an image file (relative to this directory) to place at
# the top of the title page.
# latex_logo = None
# For "manual" documents, if this is true, then toplevel headings
# are parts, not chapters.
# latex_use_parts = False
# If true, show page references after internal links.
# latex_show_pagerefs = False
# If true, show URL addresses after external links.
# latex_show_urls = False
# Documents to append as an appendix to all manuals.
# latex_appendices = []
# If false, no module index is generated.
# latex_domain_indices = True
# -- Options for manual page output ------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
("index", "scrapy_redis", "Scrapy-Redis Documentation", ["R Max Espinoza"], 1)
]
# If true, show URL addresses after external links.
# man_show_urls = False
# -- Options for Texinfo output ----------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
# Single Texinfo document built from the "index" source file. The description
# entry replaces the quickstart placeholder with the project's own summary.
texinfo_documents = [
    (
        "index",
        "scrapy_redis",
        "Scrapy-Redis Documentation",
        "R Max Espinoza",
        "scrapy-redis",
        "Redis-based components for Scrapy.",
        "Miscellaneous",
    ),
]
# Documents to append as an appendix to all manuals.
# texinfo_appendices = []
# If false, no module index is generated.
# texinfo_domain_indices = True
# How to display URL addresses: 'footnote', 'no', or 'inline'.
# texinfo_show_urls = 'footnote'
# If true, do not generate a @detailmenu in the "Top" node's menu.
# texinfo_no_detailmenu = False
@@ -0,0 +1 @@
.. include:: ../CONTRIBUTING.rst
@@ -0,0 +1 @@
.. include:: ../HISTORY.rst
@@ -0,0 +1,27 @@
.. scrapy-redis documentation master file, created by
sphinx-quickstart on Tue Jul 9 22:26:36 2013.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
Welcome to Scrapy-Redis's documentation!
========================================
Contents:
.. toctree::
:maxdepth: 2
readme
installation
modules
contributing
history
authors
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
@@ -0,0 +1,49 @@
.. highlight:: shell
============
Installation
============
Stable release
--------------
To install Scrapy-Redis, run this command in your terminal:
.. code-block:: console
pip install scrapy-redis
If you don't have `pip`_ installed, this `Python installation guide`_ can guide
you through the process.
.. _pip: https://pip.pypa.io
.. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/
From sources
------------
The sources for Scrapy-Redis can be downloaded from the `Github repo`_.
You can either clone the public repository:
.. code-block:: console
git clone https://github.com/rolando/scrapy-redis
Or download the `tarball`_:
.. code-block:: console
curl -OL https://github.com/rolando/scrapy-redis/tarball/master
Once you have a copy of the source, you can install it with:
.. code-block:: console
pip install -e .
.. _Github repo: https://github.com/rolando/scrapy-redis
.. _tarball: https://github.com/rolando/scrapy-redis/tarball/master
@@ -0,0 +1,242 @@
@ECHO OFF
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set BUILDDIR=_build
set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
set I18NSPHINXOPTS=%SPHINXOPTS% .
if NOT "%PAPER%" == "" (
set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
)
if "%1" == "" goto help
if "%1" == "help" (
:help
echo.Please use `make ^<target^>` where ^<target^> is one of
echo. html to make standalone HTML files
echo. dirhtml to make HTML files named index.html in directories
echo. singlehtml to make a single large HTML file
echo. pickle to make pickle files
echo. json to make JSON files
echo. htmlhelp to make HTML files and a HTML help project
echo. qthelp to make HTML files and a qthelp project
echo. devhelp to make HTML files and a Devhelp project
echo. epub to make an epub
echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
echo. text to make text files
echo. man to make manual pages
echo. texinfo to make Texinfo files
echo. gettext to make PO message catalogs
echo. changes to make an overview over all changed/added/deprecated items
echo. xml to make Docutils-native XML files
echo. pseudoxml to make pseudoxml-XML files for display purposes
echo. linkcheck to check all external links for integrity
echo. doctest to run all doctests embedded in the documentation if enabled
goto end
)
if "%1" == "clean" (
for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
del /q /s %BUILDDIR%\*
goto end
)
%SPHINXBUILD% 2> nul
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
if "%1" == "html" (
%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/html.
goto end
)
if "%1" == "dirhtml" (
%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
goto end
)
if "%1" == "singlehtml" (
%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
goto end
)
if "%1" == "pickle" (
%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can process the pickle files.
goto end
)
if "%1" == "json" (
%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can process the JSON files.
goto end
)
if "%1" == "htmlhelp" (
%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can run HTML Help Workshop with the ^
.hhp project file in %BUILDDIR%/htmlhelp.
goto end
)
REM Build the Qt help project and print the follow-up commands.
REM The collection file shown for "assistant" uses the .qhc extension,
REM matching the qthelp target of the Makefile.
if "%1" == "qthelp" (
	%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished; now you can run "qcollectiongenerator" with the ^
.qhcp project file in %BUILDDIR%/qthelp, like this:
	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\scrapy-redis.qhcp
	echo.To view the help file:
	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\scrapy-redis.qhc
	goto end
)
if "%1" == "devhelp" (
%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
if errorlevel 1 exit /b 1
echo.
echo.Build finished.
goto end
)
if "%1" == "epub" (
%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The epub file is in %BUILDDIR%/epub.
goto end
)
if "%1" == "latex" (
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
if errorlevel 1 exit /b 1
echo.
echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
goto end
)
if "%1" == "latexpdf" (
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
cd %BUILDDIR%/latex
make all-pdf
cd %BUILDDIR%/..
echo.
echo.Build finished; the PDF files are in %BUILDDIR%/latex.
goto end
)
if "%1" == "latexpdfja" (
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
cd %BUILDDIR%/latex
make all-pdf-ja
cd %BUILDDIR%/..
echo.
echo.Build finished; the PDF files are in %BUILDDIR%/latex.
goto end
)
if "%1" == "text" (
%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The text files are in %BUILDDIR%/text.
goto end
)
if "%1" == "man" (
%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The manual pages are in %BUILDDIR%/man.
goto end
)
if "%1" == "texinfo" (
%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
goto end
)
if "%1" == "gettext" (
%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
goto end
)
if "%1" == "changes" (
%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
if errorlevel 1 exit /b 1
echo.
echo.The overview file is in %BUILDDIR%/changes.
goto end
)
if "%1" == "linkcheck" (
%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
if errorlevel 1 exit /b 1
echo.
echo.Link check complete; look for any errors in the above output ^
or in %BUILDDIR%/linkcheck/output.txt.
goto end
)
if "%1" == "doctest" (
%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
if errorlevel 1 exit /b 1
echo.
echo.Testing of doctests in the sources finished, look at the ^
results in %BUILDDIR%/doctest/output.txt.
goto end
)
if "%1" == "xml" (
%SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The XML files are in %BUILDDIR%/xml.
goto end
)
if "%1" == "pseudoxml" (
%SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
goto end
)
:end
@@ -0,0 +1,7 @@
API Reference
=============
.. toctree::
:maxdepth: 4
scrapy_redis
@@ -0,0 +1 @@
.. include:: ../README.rst
@@ -0,0 +1,8 @@
# These packages are required only for development and release management.
Sphinx
bumpversion
check-manifest
pip-tools
twine
watchdog
wheel
@@ -0,0 +1,62 @@
scrapy_redis package
====================
Submodules
----------
scrapy_redis.connection module
------------------------------
.. automodule:: scrapy_redis.connection
:members:
:undoc-members:
:show-inheritance:
scrapy_redis.dupefilter module
------------------------------
.. automodule:: scrapy_redis.dupefilter
:members:
:undoc-members:
:show-inheritance:
scrapy_redis.pipelines module
-----------------------------
.. automodule:: scrapy_redis.pipelines
:members:
:undoc-members:
:show-inheritance:
scrapy_redis.queue module
-------------------------
.. automodule:: scrapy_redis.queue
:members:
:undoc-members:
:show-inheritance:
scrapy_redis.scheduler module
-----------------------------
.. automodule:: scrapy_redis.scheduler
:members:
:undoc-members:
:show-inheritance:
scrapy_redis.spiders module
---------------------------
.. automodule:: scrapy_redis.spiders
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: scrapy_redis
:members:
:undoc-members:
:show-inheritance:
@@ -0,0 +1,5 @@
#@IgnoreInspection BashAddShebang
# The example spiders use Python 3-only syntax (argument-less super()), so a
# Python 3 base image is required. The steps below spell out what the
# deprecated "-onbuild" image variant did implicitly: install the
# requirements, then copy the project into the working directory.
FROM python:3.10-slim
WORKDIR /usr/src/app
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
ENTRYPOINT ["scrapy"]
CMD ["crawl", "dmoz"]
@@ -0,0 +1,154 @@
============================
Scrapy Redis Example Project
============================
This directory contains an example Scrapy project integrated with scrapy-redis.
By default, all items are sent to redis (key ``<spider>:items``). All spiders
schedule requests through redis, so you can start additional spiders to speed
up the crawling.
Spiders
-------
* **dmoz**
This spider simply scrapes dmoz.org.
* **myspider_redis**
This spider uses redis as a shared requests queue and uses
``myspider:start_urls`` as start URLs seed. For each URL, the spider outputs
one item.
* **mycrawler_redis**
This spider uses redis as a shared requests queue and uses
``mycrawler:start_urls`` as start URLs seed. For each URL, the spider follows
all links.
.. note::
All requests are persisted by default. You can clear the queue by using the
``SCHEDULER_FLUSH_ON_START`` setting. For example: ``scrapy crawl dmoz -s
SCHEDULER_FLUSH_ON_START=1``.
Running the example project
---------------------------
This example illustrates how to share a spider's requests queue
across multiple spider instances, highly suitable for broad crawls.
1. Check that the scrapy_redis package is in your ``PYTHONPATH``
2. Run the crawler for the first time, then stop it
.. code-block:: bash
cd example-project
scrapy crawl dmoz
... [dmoz] ...
^C
3. Run the crawler again to resume stopped crawling
.. code-block:: bash
scrapy crawl dmoz
... [dmoz] DEBUG: Resuming crawl (9019 requests scheduled)
4. Start one or more additional scrapy crawlers
.. code-block:: bash
scrapy crawl dmoz
... [dmoz] DEBUG: Resuming crawl (8712 requests scheduled)
5. Start one or more post-processing workers
.. code-block:: bash
python process_items.py dmoz:items -v
...
Processing: Kilani Giftware (http://www.dmoz.org/Computers/Shopping/Gifts/)
Processing: NinjaGizmos.com (http://www.dmoz.org/Computers/Shopping/Gifts/)
...
Feeding a Spider from Redis
---------------------------
The class ``scrapy_redis.spiders.RedisSpider`` enables a spider to read the
urls from redis. The urls in the redis queue will be processed one
after another, if the first request yields more requests, the spider
will process those requests before fetching another url from redis.
For example, create a file ``myspider.py`` with the code below:
.. code-block:: python
from scrapy_redis.spiders import RedisSpider
class MySpider(RedisSpider):
name = "myspider"
def parse(self, response):
# do stuff
pass
Then:
1. run the spider
.. code-block:: bash
scrapy runspider myspider.py
2. push json data to redis
.. code-block:: bash
redis-cli lpush myspider:start_urls '{"url": "https://example.com", "meta": {"job-id":"123xsd", "start-date":"dd/mm/yy"}, "url_cookie_key":"fertxsas" }'
.. note::
* These spiders rely on the spider idle signal to fetch start urls, hence it
may have a few seconds of delay between the time you push a new url and the
spider starts crawling it.
* Also please pay attention to json formatting.
Processing items
----------------
The ``process_items.py`` script provides an example of consuming the items queue:
.. code-block:: bash
python process_items.py --help
Run via Docker
--------------
You require the following applications:
* docker (https://docs.docker.com/installation/)
* docker-compose (https://docs.docker.com/compose/install/)
For implementation details see `Dockerfile` and `docker-compose.yml` and read
official docker documentation.
1. To start sample `example-project` (`-d` for daemon)::
docker-compose up
2. To scale `crawler` (4 instances for example)::
docker-compose scale crawler=4
@@ -0,0 +1,9 @@
redis:
image: redis
ports:
- "6379:6379" # added port for external db provisioning
crawler:
build: .
links:
- redis:localhost
@@ -0,0 +1,24 @@
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html
from scrapy.item import Field, Item
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Join, MapCompose, TakeFirst
class ExampleItem(Item):
    """Item declaring the fields used by the example project."""

    # Fields describing the scraped entry itself.
    name = Field()
    description = Field()
    link = Field()
    # Fields with the same names ("crawled", "spider") are the keys written
    # by ExamplePipeline.process_item in this project.
    crawled = Field()
    spider = Field()
    url = Field()
class ExampleLoader(ItemLoader):
    """Item loader that builds ``ExampleItem`` instances."""

    default_item_class = ExampleItem
    # Default input processing: ``.strip()`` is called on the values fed in
    # (presumably one call per extracted value -- see MapCompose docs).
    default_input_processor = MapCompose(lambda s: s.strip())
    # Default output processing is TakeFirst.
    default_output_processor = TakeFirst()
    # Output processor override for the "description" field: Join instead of
    # TakeFirst.
    description_out = Join()
@@ -0,0 +1,12 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/topics/item-pipeline.html
from datetime import datetime
class ExamplePipeline:
    """Item pipeline that stamps every item with crawl metadata."""

    def process_item(self, item, spider):
        """Add the ``crawled`` timestamp and ``spider`` name to *item*.

        Parameters
        ----------
        item : mapping
            The item being processed; it is modified in place.
        spider : object
            The spider that produced the item; only ``spider.name`` is read.

        Returns
        -------
        The same *item* object, with ``"crawled"`` set to the current UTC
        time (as a naive ``datetime``) and ``"spider"`` set to the spider's
        name.
        """
        # Imported locally because the module header only imports ``datetime``.
        from datetime import timezone

        # ``datetime.utcnow()`` is deprecated since Python 3.12. Dropping the
        # tzinfo keeps the stored value a naive UTC datetime, exactly as before.
        item["crawled"] = datetime.now(timezone.utc).replace(tzinfo=None)
        item["spider"] = spider.name
        return item
@@ -0,0 +1,37 @@
# Scrapy settings for example project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/topics/settings.html
#
SPIDER_MODULES = ["example.spiders"]
NEWSPIDER_MODULE = "example.spiders"
# NOTE(review): LOG_LEVEL is assigned again further down ("DEBUG"); that later
# assignment is the one that takes effect.
LOG_LEVEL = "WARNING"
USER_AGENT = "scrapy-redis (+https://github.com/rolando/scrapy-redis)"
# Duplicate-request filter class.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Scheduler class; the scrapy_redis scheduler is able to interact with the
# (redis) database.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Whether to keep the de-duplication set and the task queue in the redis
# database when the spider finishes.
SCHEDULER_PERSIST = True
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"
ITEM_PIPELINES = {
    "example.pipelines.ExamplePipeline": 300,
    # When this pipeline is enabled, it stores the item data in the redis
    # database.
    "scrapy_redis.pipelines.RedisPipeline": 400,
}
# Redis database to connect to.
REDIS_URL = "redis://127.0.0.1:6379"
# Overrides the LOG_LEVEL = "WARNING" assignment made earlier in this file.
LOG_LEVEL = "DEBUG"
# Introduce an artificial delay, so that running several crawlers in parallel
# is what speeds up the crawl.
DOWNLOAD_DELAY = 1
@@ -0,0 +1,8 @@
# This package will contain the spiders of your Scrapy project
#
# To create the first spider for your project use this command:
#
# scrapy genspider myspider myspider-domain.com
#
# For more info see:
# http://doc.scrapy.org/topics/spiders.html
@@ -0,0 +1,26 @@
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class DmozSpider(CrawlSpider):
    """Follow categories and extract links."""

    name = "dmoz"
    allowed_domains = ["dmoztools.net"]
    start_urls = ["http://www.dmoztools.net/"]

    # Follow links found inside the category containers and hand every
    # fetched page to ``parse_directory``.
    rules = [
        Rule(
            LinkExtractor(restrict_css=(".top-cat", ".sub-cat", ".cat-item")),
            callback="parse_directory",
            follow=True,
        ),
    ]

    def parse_directory(self, response):
        """Yield one dict per ``.title-and-desc`` element of *response*.

        Each dict has the keys ``name``, ``description`` and ``link``.
        ``name`` and ``link`` are ``None`` when the corresponding node is
        missing; ``description`` is the stripped text, or an empty string
        when the entry has no description node.
        """
        for div in response.css(".title-and-desc"):
            yield {
                "name": div.css(".site-title::text").extract_first(),
                # default="" keeps ``.strip()`` from failing with
                # AttributeError on entries that lack a description.
                "description": div.css(".site-descr::text")
                .extract_first(default="")
                .strip(),
                "link": div.css("a::attr(href)").extract_first(),
            }
@@ -0,0 +1,28 @@
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider
class MyCrawler(RedisCrawlSpider):
    """Spider that reads urls from redis queue (mycrawler:start_urls)."""

    name = "mycrawler_redis"
    redis_key = "mycrawler:start_urls"

    rules = (
        # follow all links
        Rule(LinkExtractor(), callback="parse_page", follow=True),
    )

    def __init__(self, *args, **kwargs):
        """Initialise the spider.

        An optional ``domain`` keyword argument (comma-separated string) is
        removed from *kwargs* and used as the allowed domains list; all other
        arguments are passed on to the parent class.
        """
        # Dynamically define the allowed domains list.
        domain = kwargs.pop("domain", "")
        # Build a real list: on Python 3 ``filter()`` returns an iterator
        # that is empty after its first traversal, so the allowed domains
        # would be lost for every later reader of the attribute.
        self.allowed_domains = [name for name in domain.split(",") if name]
        super().__init__(*args, **kwargs)

    def parse_page(self, response):
        """Return a dict with the page ``<title>`` text and the page URL."""
        return {
            "name": response.css("title::text").extract_first(),
            "url": response.url,
        }
@@ -0,0 +1,20 @@
from scrapy_redis.spiders import RedisSpider
class MySpider(RedisSpider):
    """Spider that reads urls from redis queue (myspider:start_urls)."""

    name = "myspider_redis"
    redis_key = "myspider:start_urls"

    def __init__(self, *args, **kwargs):
        """Initialise the spider.

        An optional ``domain`` keyword argument (comma-separated string) is
        removed from *kwargs* and used as the allowed domains list; all other
        arguments are passed on to the parent class.
        """
        # Dynamically define the allowed domains list.
        domain = kwargs.pop("domain", "")
        # Build a real list: on Python 3 ``filter()`` returns an iterator
        # that is empty after its first traversal, so the allowed domains
        # would be lost for every later reader of the attribute.
        self.allowed_domains = [name for name in domain.split(",") if name]
        super().__init__(*args, **kwargs)

    def parse(self, response):
        """Return a dict with the page ``<title>`` text and the page URL."""
        return {
            "name": response.css("title::text").extract_first(),
            "url": response.url,
        }
@@ -0,0 +1,105 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""A script to process items from a redis queue."""
import argparse
import json
import logging
import pprint
import sys
import time
from scrapy_redis import get_redis
logger = logging.getLogger("process_items")


def process_items(r, keys, timeout, limit=0, log_every=1000, wait=0.1):
    """Process items from a redis queue.

    Parameters
    ----------
    r : Redis
        Redis connection instance.
    keys : list
        List of keys to read the items from.
    timeout : int
        Read timeout, passed to ``r.blpop``.
    limit : int, optional
        Stop after this many items have been processed. ``0`` (the default)
        means no limit.
    log_every : int, optional
        Emit a progress message every ``log_every`` processed items.
    wait : float, optional
        Seconds to sleep after a read that returned nothing.

    Notes
    -----
    Payloads that are not valid JSON, or that decode to something other than
    a JSON object, are logged and skipped; they do not count towards
    ``limit``.
    """
    limit = limit or float("inf")
    processed = 0
    while processed < limit:
        # Change ``blpop`` to ``brpop`` to process as LIFO.
        ret = r.blpop(keys, timeout)
        # Nothing arrived before the timeout: pause briefly, then poll again.
        if ret is None:
            time.sleep(wait)
            continue

        source, data = ret
        try:
            item = json.loads(data)
        except Exception:
            logger.exception("Failed to load item:\n%r", pprint.pformat(data))
            continue

        try:
            name = item.get("name") or item.get("title")
            url = item.get("url") or item.get("link")
            logger.debug("[%s] Processing item: %s <%s>", source, name, url)
        except (AttributeError, KeyError):
            # AttributeError covers valid JSON that is not an object (a list,
            # a number, a string, ...), which has no ``.get``. Without it a
            # single odd payload would terminate the whole worker.
            logger.exception(
                "[%s] Failed to process item:\n%r", source, pprint.pformat(item)
            )
            continue

        processed += 1
        if processed % log_every == 0:
            logger.info("Processed %s items", processed)
def main():
    """Parse the command line, connect to redis and consume the items queue.

    Returns the process exit code: ``0`` when processing finished or was
    interrupted from the keyboard, ``2`` when an unexpected exception was
    raised while processing.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("key", help="Redis key where items are stored")
    parser.add_argument("--host")
    parser.add_argument("--port")
    parser.add_argument("--timeout", type=int, default=5)
    parser.add_argument("--limit", type=int, default=0)
    parser.add_argument("--progress-every", type=int, default=100)
    parser.add_argument("-v", "--verbose", action="store_true")
    args = parser.parse_args()

    # Only forward the connection options that were actually given.
    params = {
        option: value
        for option, value in (("host", args.host), ("port", args.port))
        if value
    }

    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(level=log_level)

    r = get_redis(**params)
    host = r.connection_pool.get_connection("info").host
    logger.info("Waiting for items in '%s' (server: %s)", args.key, host)

    try:
        process_items(
            r,
            keys=[args.key],
            timeout=args.timeout,
            limit=args.limit,
            log_every=args.progress_every,
        )
    except KeyboardInterrupt:
        # Stopping the worker with Ctrl-C is a normal way to exit.
        return 0
    except Exception:
        logger.exception("Unhandled exception")
        return 2
    return 0
if __name__ == "__main__":
sys.exit(main())
@@ -0,0 +1,2 @@
scrapy
scrapy-redis
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/topics/scrapyd.html
[settings]
default = example.settings
[deploy]
#url = http://localhost:6800/
project = example
+125
View File
@@ -0,0 +1,125 @@
[MASTER]
persistent=no
jobs=1 # >1 hides results
suggestion-mode=yes # guess common misconfiguration and emit user-friendly hints
py-version = 3.11.3
[MESSAGES CONTROL]
disable=abstract-method,
anomalous-backslash-in-string,
arguments-differ,
arguments-renamed,
attribute-defined-outside-init,
bad-classmethod-argument,
bad-continuation,
bad-indentation,
bad-mcs-classmethod-argument,
bad-super-call,
bad-whitespace,
bare-except,
blacklisted-name,
broad-except,
c-extension-no-member,
catching-non-exception,
cell-var-from-loop,
comparison-with-callable,
consider-iterating-dictionary,
consider-using-dict-items,
consider-using-from-import,
consider-using-in,
consider-using-set-comprehension,
consider-using-sys-exit,
consider-using-with,
cyclic-import,
dangerous-default-value,
deprecated-method,
deprecated-module,
duplicate-code, # https://github.com/PyCQA/pylint/issues/214
eval-used,
expression-not-assigned,
fixme,
function-redefined,
global-statement,
import-error,
import-outside-toplevel,
import-self,
inconsistent-return-statements,
inherit-non-class,
invalid-name,
invalid-overridden-method,
isinstance-second-argument-not-valid-type,
keyword-arg-before-vararg,
line-too-long,
logging-format-interpolation,
logging-not-lazy,
lost-exception,
method-hidden,
misplaced-comparison-constant,
missing-docstring,
missing-final-newline,
multiple-imports,
multiple-statements,
no-else-continue,
no-else-raise,
no-else-return,
no-init,
no-member,
no-method-argument,
no-name-in-module,
no-self-argument,
no-self-use,
no-value-for-parameter,
not-an-iterable,
not-callable,
pointless-statement,
pointless-string-statement,
protected-access,
raise-missing-from,
redefined-argument-from-local,
redefined-builtin,
redefined-outer-name,
reimported,
signature-differs,
singleton-comparison,
super-init-not-called,
super-with-arguments,
superfluous-parens,
too-few-public-methods,
too-many-ancestors,
too-many-arguments,
too-many-branches,
too-many-format-args,
too-many-function-args,
too-many-instance-attributes,
too-many-lines,
too-many-locals,
too-many-public-methods,
too-many-return-statements,
trailing-newlines,
trailing-whitespace,
unbalanced-tuple-unpacking,
undefined-variable,
undefined-loop-variable,
unexpected-special-method-signature,
ungrouped-imports,
unidiomatic-typecheck,
unnecessary-comprehension,
unnecessary-lambda,
unnecessary-pass,
unreachable,
unspecified-encoding,
unsupported-assignment-operation,
unsubscriptable-object,
unused-argument,
unused-import,
unused-private-member,
unused-variable,
unused-wildcard-import,
use-implicit-booleaness-not-comparison,
used-before-assignment,
useless-object-inheritance, # Required for Python 2 support
useless-return,
useless-super-delegation,
wildcard-import,
wrong-import-order,
wrong-import-position
@@ -0,0 +1,11 @@
[pytest]
norecursedirs =
.*
dist
build
python_files =
test_*.py
*_test.py
tests.py
addopts =
-rxEfsw -v
@@ -0,0 +1,6 @@
# These packages are required to run all the tests.
flake8
mock
pytest>=6.0,<7
pytest-cov
tox>=4.0,<5
@@ -0,0 +1,3 @@
scrapy>=2.6.0
redis>=4.2
six>=1.15
@@ -0,0 +1,6 @@
[wheel]
universal = 1
[flake8]
exclude = docs, tests
max-line-length = 120
+59
View File
@@ -0,0 +1,59 @@
#!/usr/bin/env python
import io
from pkgutil import walk_packages
from setuptools import setup
def find_packages(path):
    """Return the names of all packages and subpackages found under *path*.

    Entries that ``walk_packages`` does not report as packages (plain
    modules) are left out.
    """
    package_names = []
    for _finder, name, is_pkg in walk_packages([path]):
        if is_pkg:
            package_names.append(name)
    return package_names
def read_file(filename):
    """Return the content of *filename* with surrounding whitespace removed.

    The file is decoded explicitly as UTF-8 so that the result does not
    depend on the locale of the machine running ``setup.py``.
    """
    with open(filename, encoding="utf-8") as fp:
        return fp.read().strip()
def read_rst(filename):
    """Return the text of *filename* without its ``.. comment::`` lines.

    Those lines are dropped because the directive is not supported by PyPI.
    """
    kept_lines = []
    for line in io.StringIO(read_file(filename)):
        if line.startswith(".. comment::"):
            continue
        kept_lines.append(line)
    return "".join(kept_lines)
def read_requirements(filename):
    """Return the requirement specifiers listed in *filename*.

    Each line is stripped first; blank lines and comment lines (including
    indented ones) are skipped so that no empty or ``#``-prefixed entry ends
    up in ``install_requires``.
    """
    stripped_lines = (line.strip() for line in read_file(filename).splitlines())
    return [line for line in stripped_lines if line and not line.startswith("#")]
# Package metadata. The version, descriptions and dependency list are read
# from files in the repository instead of being duplicated here.
setup(
    # Identity.
    name="scrapy-redis",
    version=read_file("VERSION"),
    url="https://github.com/rmax/scrapy-redis",
    author="R Max Espinoza",
    author_email="hey@rmax.dev",
    license="MIT",
    # Descriptions and search metadata.
    description="Redis-based components for Scrapy.",
    long_description=read_rst("README.rst") + "\n\n" + read_rst("HISTORY.rst"),
    keywords="scrapy-redis",
    classifiers=[
        "Development Status :: 4 - Beta",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: MIT License",
        "Natural Language :: English",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
    ],
    # Code layout: the importable packages live under ``src``.
    package_dir={"": "src"},
    packages=find_packages("src"),
    include_package_data=True,
    # Runtime dependencies.
    install_requires=read_requirements("requirements.txt"),
)
@@ -0,0 +1,5 @@
from .connection import get_redis, get_redis_from_settings # NOQA
__author__ = "R Max Espinoza"
__email__ = "hey at rmax.dev"
__version__ = "0.9.1"
@@ -0,0 +1,97 @@
from scrapy.utils.misc import load_object
from . import defaults
# Shortcut maps 'setting name' -> 'parameter name'.
SETTINGS_PARAMS_MAP = dict(
    REDIS_URL="url",
    REDIS_HOST="host",
    REDIS_PORT="port",
    REDIS_DB="db",
    REDIS_ENCODING="encoding",
    # Forwarded as the ``decode_responses`` keyword of the client class.
    REDIS_DECODE_RESPONSES="decode_responses",
)
def get_redis_from_settings(settings):
    """Build a redis client from a Scrapy settings object.

    The client is created through ``get_redis``. Parameters start from the
    ``defaults.REDIS_PARAMS`` global, are overridden by the ``REDIS_PARAMS``
    setting and finally by the individual ``REDIS_*`` settings listed below.
    A ``REDIS_*`` setting is only applied when its value is truthy.

    Parameters
    ----------
    settings : Settings
        A scrapy settings object. See the supported settings below.

    Returns
    -------
    server
        Redis client instance.

    Other Parameters
    ----------------
    REDIS_URL : str, optional
        Server connection URL.
    REDIS_HOST : str, optional
        Server host.
    REDIS_PORT : str, optional
        Server port.
    REDIS_DB : int, optional
        Server database.
    REDIS_ENCODING : str, optional
        Data encoding.
    REDIS_PARAMS : dict, optional
        Additional client parameters.
    REDIS_DECODE_RESPONSES : bool, optional
        Sets the ``decode_responses`` keyword of the client class
        constructor (Python 3 only).

    """
    client_kwargs = dict(defaults.REDIS_PARAMS)
    client_kwargs.update(settings.getdict("REDIS_PARAMS"))

    # XXX: Deprecate REDIS_* settings.
    for setting_name, kwarg_name in SETTINGS_PARAMS_MAP.items():
        value = settings.get(setting_name)
        if not value:
            continue
        client_kwargs[kwarg_name] = value

    # ``redis_cls`` may be given as an import path instead of a class.
    redis_cls = client_kwargs.get("redis_cls")
    if isinstance(redis_cls, str):
        client_kwargs["redis_cls"] = load_object(redis_cls)

    return get_redis(**client_kwargs)
# Backwards compatible alias.
from_settings = get_redis_from_settings
def get_redis(**kwargs):
    """Instantiate and return a redis client.

    Parameters
    ----------
    redis_cls : class, optional
        Client class to instantiate. Defaults to ``defaults.REDIS_CLS``.
    url : str, optional
        When given (and non-empty), the client is built with
        ``redis_cls.from_url``.
    **kwargs
        Remaining parameters, passed through to ``redis_cls``.

    Returns
    -------
    server
        Redis client instance.

    """
    client_cls = kwargs.pop("redis_cls", defaults.REDIS_CLS)
    connection_url = kwargs.pop("url", None)
    if not connection_url:
        return client_cls(**kwargs)
    return client_cls.from_url(connection_url, **kwargs)
@@ -0,0 +1,29 @@
import redis
# For standalone use.
# Key template used by RFPDupeFilter.from_settings; filled with a timestamp.
DUPEFILTER_KEY = "dupefilter:%(timestamp)s"
# Default RedisPipeline key template; filled with the spider name.
PIPELINE_KEY = "%(spider)s:items"
# Default RedisStatsCollector hash key template; filled with the spider name.
STATS_KEY = "%(spider)s:stats"
# Client class instantiated by connection.get_redis when none is given.
REDIS_CLS = redis.StrictRedis
# Default encoding, used both for the client and for decoding start messages.
REDIS_ENCODING = "utf-8"
# Sane connection defaults.
REDIS_PARAMS = {
"socket_timeout": 30,
"socket_connect_timeout": 30,
"retry_on_timeout": True,
"encoding": REDIS_ENCODING,
}
# Fallback start-URL batch size when CONCURRENT_REQUESTS is not set.
REDIS_CONCURRENT_REQUESTS = 16
# Scheduler defaults; key templates are filled with the spider name.
SCHEDULER_QUEUE_KEY = "%(spider)s:requests"
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.PriorityQueue"
SCHEDULER_DUPEFILTER_KEY = "%(spider)s:dupefilter"
SCHEDULER_DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# When False, the scheduler flushes its queue and dupefilter on close.
SCHEDULER_PERSIST = False
# Start-URL key template for the redis spiders; filled with the spider name.
START_URLS_KEY = "%(name)s:start_urls"
# Select the set / sorted-set variants for reading start URLs (list otherwise).
START_URLS_AS_SET = False
START_URLS_AS_ZSET = False
# 0 disables the max-idle-time check in RedisMixin.spider_idle.
MAX_IDLE_TIME = 0
@@ -0,0 +1,169 @@
import hashlib
import json
import logging
import time
from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.python import to_unicode
from w3lib.url import canonicalize_url
from . import defaults
from .connection import get_redis_from_settings
logger = logging.getLogger(__name__)
# TODO: Rename class to RedisDupeFilter.
class RFPDupeFilter(BaseDupeFilter):
    """Request duplicates filter that stores fingerprints in a redis set.

    Besides the scrapy-redis scheduler, this class can also be used with
    Scrapy's default scheduler.

    """

    logger = logger

    def __init__(self, server, key, debug=False):
        """Set up the duplicates filter.

        Parameters
        ----------
        server : redis.StrictRedis
            The redis server instance.
        key : str
            Redis key under which the fingerprints are stored.
        debug : bool, optional
            Whether to log every filtered request.

        """
        self.server = server
        self.key = key
        self.debug = debug
        # Without ``debug`` only the first filtered duplicate is logged.
        self.logdupes = True

    @classmethod
    def from_settings(cls, settings):
        """Build an instance from the given settings.

        The key defaults to ``dupefilter:%(timestamp)s`` filled with the
        current time. The ``scrapy_redis.scheduler.Scheduler`` class does not
        go through this method when the dupefilter offers ``from_spider``,
        because it needs the spider name in the key.

        Parameters
        ----------
        settings : scrapy.settings.Settings

        Returns
        -------
        RFPDupeFilter
            A RFPDupeFilter instance.

        """
        redis_server = get_redis_from_settings(settings)
        # XXX: This creates a one-time key, needed to support using this
        # class as a standalone dupefilter with scrapy's default scheduler.
        # If scrapy passed the spider to open() this wouldn't be needed.
        # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
        one_time_key = defaults.DUPEFILTER_KEY % {"timestamp": int(time.time())}
        return cls(
            redis_server,
            key=one_time_key,
            debug=settings.getbool("DUPEFILTER_DEBUG"),
        )

    @classmethod
    def from_crawler(cls, crawler):
        """Build an instance from the crawler's settings.

        Parameters
        ----------
        crawler : scrapy.crawler.Crawler

        Returns
        -------
        RFPDupeFilter
            Instance of RFPDupeFilter.

        """
        return cls.from_settings(crawler.settings)

    def request_seen(self, request):
        """Tell whether the request's fingerprint was already recorded.

        The fingerprint is added to the redis set as a side effect.

        Parameters
        ----------
        request : scrapy.http.Request

        Returns
        -------
        bool

        """
        fingerprint = self.request_fingerprint(request)
        # SADD reports how many members were added: zero means "already there".
        return self.server.sadd(self.key, fingerprint) == 0

    def request_fingerprint(self, request):
        """Compute the fingerprint of a request.

        The SHA-1 hex digest covers the method, the canonicalized URL and
        the hex-encoded body.

        Parameters
        ----------
        request : scrapy.http.Request

        Returns
        -------
        str

        """
        serialized = json.dumps(
            {
                "method": to_unicode(request.method),
                "url": canonicalize_url(request.url),
                "body": (request.body or b"").hex(),
            },
            sort_keys=True,
        )
        return hashlib.sha1(serialized.encode()).hexdigest()

    @classmethod
    def from_spider(cls, spider):
        """Build an instance whose key includes the spider name.

        Parameters
        ----------
        spider : scrapy.spiders.Spider

        Returns
        -------
        RFPDupeFilter

        """
        settings = spider.settings
        redis_server = get_redis_from_settings(settings)
        key_template = settings.get(
            "SCHEDULER_DUPEFILTER_KEY", defaults.SCHEDULER_DUPEFILTER_KEY
        )
        return cls(
            redis_server,
            key=key_template % {"spider": spider.name},
            debug=settings.getbool("DUPEFILTER_DEBUG"),
        )

    def close(self, reason=""):
        """Delete data on close. Called by Scrapy's scheduler.

        Parameters
        ----------
        reason : str, optional

        """
        self.clear()

    def clear(self):
        """Remove the fingerprints key from redis."""
        self.server.delete(self.key)

    def log(self, request, spider):
        """Log a filtered request.

        With ``debug`` every duplicate is logged; otherwise only the first
        one is, together with a hint about ``DUPEFILTER_DEBUG``.

        Parameters
        ----------
        request : scrapy.http.Request
        spider : scrapy.spiders.Spider

        """
        if self.debug:
            template = "Filtered duplicate request: %(request)s"
        elif self.logdupes:
            template = (
                "Filtered duplicate request %(request)s"
                " - no more duplicates will be shown"
                " (see DUPEFILTER_DEBUG to show all duplicates)"
            )
        else:
            return

        self.logger.debug(template, {"request": request}, extra={"spider": spider})
        if not self.debug:
            self.logdupes = False
@@ -0,0 +1,14 @@
"""A pickle wrapper module with protocol=-1 by default."""
try:
import cPickle as pickle # PY2
except ImportError:
import pickle
def loads(s):
    """Deserialize the pickled bytes *s* and return the resulting object.

    NOTE(review): unpickling can execute arbitrary code, so the data handed
    to this function must come from a trusted source.
    """
    obj = pickle.loads(s)
    return obj
def dumps(obj):
    """Serialize *obj* with the highest pickle protocol available."""
    return pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)
@@ -0,0 +1,73 @@
from scrapy.utils.misc import load_object
from scrapy.utils.serialize import ScrapyJSONEncoder
from twisted.internet.threads import deferToThread
from . import connection, defaults
default_serialize = ScrapyJSONEncoder().encode
class RedisPipeline:
    """Item pipeline that appends serialized items to a redis list.

    Settings
    --------
    REDIS_ITEMS_KEY : str
        Redis key where to store items.
    REDIS_ITEMS_SERIALIZER : str
        Object path to serializer function.

    """

    def __init__(
        self, server, key=defaults.PIPELINE_KEY, serialize_func=default_serialize
    ):
        """Set up the pipeline.

        Parameters
        ----------
        server : StrictRedis
            Redis client instance.
        key : str
            Redis key template where items are stored.
        serialize_func : callable
            Function turning an item into the value pushed to redis.

        """
        self.server = server
        self.key = key
        self.serialize = serialize_func

    @classmethod
    def from_settings(cls, settings):
        """Build a pipeline from settings, keeping defaults for unset ones."""
        init_kwargs = {"server": connection.from_settings(settings)}

        items_key = settings.get("REDIS_ITEMS_KEY")
        if items_key:
            init_kwargs["key"] = items_key

        serializer_path = settings.get("REDIS_ITEMS_SERIALIZER")
        if serializer_path:
            init_kwargs["serialize_func"] = load_object(serializer_path)

        return cls(**init_kwargs)

    @classmethod
    def from_crawler(cls, crawler):
        """Build a pipeline from the crawler's settings."""
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        """Store the item from a worker thread; returns the deferred."""
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        """Serialize the item, push it to the redis list and return it."""
        target_key = self.item_key(item, spider)
        payload = self.serialize(item)
        self.server.rpush(target_key, payload)
        return item

    def item_key(self, item, spider):
        """Return the redis key for the given spider.

        Override this method to pick a different key depending on the item
        and/or spider.

        """
        return self.key % {"spider": spider.name}
@@ -0,0 +1,155 @@
try:
from scrapy.utils.request import request_from_dict
except ImportError:
from scrapy.utils.reqser import request_to_dict, request_from_dict
from . import picklecompat
class Base:
    """Common behaviour of the per-spider redis queues."""

    def __init__(self, server, spider, key, serializer=None):
        """Set up a per-spider redis queue.

        Parameters
        ----------
        server : StrictRedis
            Redis client instance.
        spider : Spider
            Scrapy spider instance.
        key : str
            Redis key template where messages are put and read; it is
            filled with the spider name.
        serializer : object
            Serializer object with ``loads`` and ``dumps`` methods.

        Raises
        ------
        TypeError
            If the serializer lacks ``loads`` or ``dumps``.

        """
        # Backward compatibility: pickle is used when no serializer is given.
        # TODO: deprecate pickle.
        chosen = picklecompat if serializer is None else serializer

        if not hasattr(chosen, "loads"):
            raise TypeError(
                f"serializer does not implement 'loads' function: {chosen}"
            )
        if not hasattr(chosen, "dumps"):
            raise TypeError(
                f"serializer does not implement 'dumps' function: {chosen}"
            )

        self.server = server
        self.spider = spider
        self.key = key % {"spider": spider.name}
        self.serializer = chosen

    def _encode_request(self, request):
        """Turn a request object into its serialized form."""
        try:
            as_dict = request.to_dict(spider=self.spider)
        except AttributeError:
            # Fall back to the helper function when ``to_dict`` is unavailable.
            as_dict = request_to_dict(request, self.spider)
        return self.serializer.dumps(as_dict)

    def _decode_request(self, encoded_request):
        """Rebuild a request from its serialized form."""
        as_dict = self.serializer.loads(encoded_request)
        return request_from_dict(as_dict, spider=self.spider)

    def __len__(self):
        """Return the length of the queue; implemented by subclasses."""
        raise NotImplementedError

    def push(self, request):
        """Push a request; implemented by subclasses."""
        raise NotImplementedError

    def pop(self, timeout=0):
        """Pop a request; implemented by subclasses."""
        raise NotImplementedError

    def clear(self):
        """Delete the queue/stack key from redis."""
        self.server.delete(self.key)
class FifoQueue(Base):
    """Per-spider first-in first-out queue backed by a redis list."""

    def __len__(self):
        """Return the number of queued requests."""
        return self.server.llen(self.key)

    def push(self, request):
        """Add a request at the head of the list."""
        encoded = self._encode_request(request)
        self.server.lpush(self.key, encoded)

    def pop(self, timeout=0):
        """Take a request from the tail of the list.

        With a positive ``timeout`` the blocking variant is used. Returns
        ``None`` when nothing was popped.
        """
        if timeout > 0:
            raw = self.server.brpop(self.key, timeout)
            # The blocking pop answers with a (key, value) tuple.
            if isinstance(raw, tuple):
                raw = raw[1]
        else:
            raw = self.server.rpop(self.key)

        if not raw:
            return None
        return self._decode_request(raw)
class PriorityQueue(Base):
    """Per-spider priority queue built on a redis sorted set."""

    def __len__(self):
        """Return the number of queued requests."""
        return self.server.zcard(self.key)

    def push(self, request):
        """Add a request, scored by its negated priority."""
        encoded = self._encode_request(request)
        # We don't use the zadd method as the order of arguments changes
        # depending on whether the class is Redis or StrictRedis, and the
        # option of using kwargs only accepts strings, not bytes.
        self.server.execute_command("ZADD", self.key, -request.priority, encoded)

    def pop(self, timeout=0):
        """Take the lowest-scored request off the sorted set.

        ``timeout`` is not supported by this queue class and is ignored.
        Returns ``None`` when the set is empty.
        """
        # Read and remove the first member atomically with MULTI/EXEC.
        pipe = self.server.pipeline()
        pipe.multi()
        pipe.zrange(self.key, 0, 0).zremrangebyrank(self.key, 0, 0)
        members, _removed = pipe.execute()
        if not members:
            return None
        return self._decode_request(members[0])
class LifoQueue(Base):
    """Per-spider last-in first-out stack backed by a redis list."""

    def __len__(self):
        """Return the number of stacked requests."""
        return self.server.llen(self.key)

    def push(self, request):
        """Add a request at the head of the list."""
        encoded = self._encode_request(request)
        self.server.lpush(self.key, encoded)

    def pop(self, timeout=0):
        """Take a request from the head of the list.

        With a positive ``timeout`` the blocking variant is used. Returns
        ``None`` when nothing was popped.
        """
        if timeout > 0:
            raw = self.server.blpop(self.key, timeout)
            # The blocking pop answers with a (key, value) tuple.
            if isinstance(raw, tuple):
                raw = raw[1]
        else:
            raw = self.server.lpop(self.key)

        if not raw:
            return None
        return self._decode_request(raw)
# TODO: Deprecate the use of these names.
SpiderQueue = FifoQueue
SpiderStack = LifoQueue
SpiderPriorityQueue = PriorityQueue
@@ -0,0 +1,182 @@
import importlib
from scrapy.utils.misc import load_object
from . import connection, defaults
# TODO: add SCRAPY_JOB support.
class Scheduler:
    """Redis-based scheduler

    Settings
    --------
    SCHEDULER_PERSIST : bool (default: False)
        Whether to persist or clear redis queue.
    SCHEDULER_FLUSH_ON_START : bool (default: False)
        Whether to flush redis queue on start.
    SCHEDULER_IDLE_BEFORE_CLOSE : int (default: 0)
        How many seconds to wait before closing if no message is received.
    SCHEDULER_QUEUE_KEY : str
        Scheduler redis key.
    SCHEDULER_QUEUE_CLASS : str
        Scheduler queue class.
    SCHEDULER_DUPEFILTER_KEY : str
        Scheduler dupefilter redis key.
    SCHEDULER_DUPEFILTER_CLASS : str
        Scheduler dupefilter class.
    SCHEDULER_SERIALIZER : str
        Scheduler serializer.

    """

    def __init__(
        self,
        server,
        persist=False,
        flush_on_start=False,
        queue_key=defaults.SCHEDULER_QUEUE_KEY,
        queue_cls=defaults.SCHEDULER_QUEUE_CLASS,
        dupefilter=None,
        dupefilter_key=defaults.SCHEDULER_DUPEFILTER_KEY,
        dupefilter_cls=defaults.SCHEDULER_DUPEFILTER_CLASS,
        idle_before_close=0,
        serializer=None,
    ):
        """Initialize scheduler.

        Parameters
        ----------
        server : Redis
            The redis server instance.
        persist : bool
            Whether to keep the queue and dupefilter data when closing
            (they are flushed when False). Default is False.
        flush_on_start : bool
            Whether to flush requests on start. Default is False.
        queue_key : str
            Requests queue key.
        queue_cls : str
            Importable path to the queue class.
        dupefilter: Dupefilter
            Custom dupefilter instance.
        dupefilter_key : str
            Duplicates filter key.
        dupefilter_cls : str
            Importable path to the dupefilter class.
        idle_before_close : int
            Timeout before giving up.
        serializer : object
            Serializer handed to the queue class; ``None`` lets the queue
            choose its own default.

        Raises
        ------
        TypeError
            If ``idle_before_close`` is negative.

        """
        if idle_before_close < 0:
            raise TypeError("idle_before_close cannot be negative")

        self.server = server
        self.persist = persist
        self.flush_on_start = flush_on_start
        self.queue_key = queue_key
        self.queue_cls = queue_cls
        self.df = dupefilter
        self.dupefilter_cls = dupefilter_cls
        self.dupefilter_key = dupefilter_key
        self.idle_before_close = idle_before_close
        self.serializer = serializer
        self.stats = None

    def __len__(self):
        """Return the number of requests in the queue (after ``open``)."""
        return len(self.queue)

    @classmethod
    def from_settings(cls, settings):
        """Build a scheduler from settings and check the redis connection."""
        kwargs = {
            "persist": settings.getbool("SCHEDULER_PERSIST"),
            "flush_on_start": settings.getbool("SCHEDULER_FLUSH_ON_START"),
            "idle_before_close": settings.getint("SCHEDULER_IDLE_BEFORE_CLOSE"),
        }

        # If these values are missing, it means we want to use the defaults.
        optional = {
            # TODO: Use custom prefixes for this settings to note that are
            # specific to scrapy-redis.
            "queue_key": "SCHEDULER_QUEUE_KEY",
            "queue_cls": "SCHEDULER_QUEUE_CLASS",
            "dupefilter_key": "SCHEDULER_DUPEFILTER_KEY",
            # We use the default setting name to keep compatibility.
            "dupefilter_cls": "DUPEFILTER_CLASS",
            "serializer": "SCHEDULER_SERIALIZER",
        }
        for name, setting_name in optional.items():
            val = settings.get(setting_name)
            if val:
                kwargs[name] = val

        # ``dupefilter_cls`` is absent from kwargs when DUPEFILTER_CLASS is
        # unset or empty; fall back to the same default ``__init__`` uses
        # instead of failing with a KeyError.
        dupefilter_cls = load_object(
            kwargs.get("dupefilter_cls", defaults.SCHEDULER_DUPEFILTER_CLASS)
        )
        # Dupefilters without ``from_spider`` cannot be built in ``open``,
        # so they are instantiated right away from the settings.
        if not hasattr(dupefilter_cls, "from_spider"):
            kwargs["dupefilter"] = dupefilter_cls.from_settings(settings)

        # Support serializer as a path to a module.
        if isinstance(kwargs.get("serializer"), str):
            kwargs["serializer"] = importlib.import_module(kwargs["serializer"])

        server = connection.from_settings(settings)
        # Ensure the connection is working.
        server.ping()

        return cls(server=server, **kwargs)

    @classmethod
    def from_crawler(cls, crawler):
        """Build a scheduler from the crawler and attach its stats."""
        instance = cls.from_settings(crawler.settings)
        # FIXME: for now, stats are only supported from this constructor
        instance.stats = crawler.stats
        return instance

    def open(self, spider):
        """Create the queue (and dupefilter if needed) for the spider.

        Raises
        ------
        ValueError
            If the queue class cannot be instantiated; the original
            ``TypeError`` is chained as the cause.

        """
        self.spider = spider

        try:
            self.queue = load_object(self.queue_cls)(
                server=self.server,
                spider=spider,
                key=self.queue_key % {"spider": spider.name},
                serializer=self.serializer,
            )
        except TypeError as e:
            raise ValueError(
                f"Failed to instantiate queue class '{self.queue_cls}': {e}"
            ) from e

        if not self.df:
            self.df = load_object(self.dupefilter_cls).from_spider(spider)

        if self.flush_on_start:
            self.flush()
        # notice if there are requests already in the queue to resume the crawl
        if len(self.queue):
            spider.log(f"Resuming crawl ({len(self.queue)} requests scheduled)")

    def close(self, reason):
        """Flush the queue and dupefilter unless persistence is enabled."""
        if not self.persist:
            self.flush()

    def flush(self):
        """Clear both the dupefilter and the queue."""
        self.df.clear()
        self.queue.clear()

    def enqueue_request(self, request):
        """Queue the request unless it is a filtered duplicate.

        Returns True when the request was pushed, False when filtered.
        """
        if not request.dont_filter and self.df.request_seen(request):
            self.df.log(request, self.spider)
            return False
        if self.stats:
            self.stats.inc_value("scheduler/enqueued/redis", spider=self.spider)
        self.queue.push(request)
        return True

    def next_request(self):
        """Pop the next request, passing ``idle_before_close`` as timeout."""
        block_pop_timeout = self.idle_before_close
        request = self.queue.pop(block_pop_timeout)
        if request and self.stats:
            self.stats.inc_value("scheduler/dequeued/redis", spider=self.spider)
        return request

    def has_pending_requests(self):
        """Tell whether the queue still holds requests."""
        return len(self) > 0
@@ -0,0 +1,297 @@
import json
import time
from collections.abc import Iterable
from scrapy import FormRequest, signals
from scrapy import version_info as scrapy_version
from scrapy.exceptions import DontCloseSpider
from scrapy.spiders import CrawlSpider, Spider
from scrapy_redis.utils import TextColor
from . import connection, defaults
from .utils import bytes_to_str, is_dict
class RedisMixin:
    """Mixin class to implement reading urls from a redis queue."""

    redis_key = None
    redis_batch_size = None
    redis_encoding = None

    # Redis client placeholder.
    server = None

    # Idle start time
    spider_idle_start_time = int(time.time())
    max_idle_time = None

    def start_requests(self):
        """Returns a batch of start requests from redis."""
        return self.next_requests()

    def setup_redis(self, crawler=None):
        """Setup redis connection and idle signal.

        This should be called after the spider has set its crawler object.
        Calling it again once ``server`` is set is a no-op.

        Raises
        ------
        ValueError
            If no crawler is available, the redis key is empty, or
            ``redis_batch_size`` / ``max_idle_time`` are not integers.

        """
        if self.server is not None:
            return

        if crawler is None:
            # We allow optional crawler argument to keep backwards
            # compatibility.
            # XXX: Raise a deprecation warning.
            crawler = getattr(self, "crawler", None)

        if crawler is None:
            raise ValueError("crawler is required")

        settings = crawler.settings

        if self.redis_key is None:
            self.redis_key = settings.get(
                "REDIS_START_URLS_KEY",
                defaults.START_URLS_KEY,
            )

        self.redis_key = self.redis_key % {"name": self.name}

        if not self.redis_key.strip():
            raise ValueError("redis_key must not be empty")

        if self.redis_batch_size is None:
            self.redis_batch_size = settings.getint(
                "CONCURRENT_REQUESTS", defaults.REDIS_CONCURRENT_REQUESTS
            )

        try:
            self.redis_batch_size = int(self.redis_batch_size)
        except (TypeError, ValueError) as err:
            raise ValueError("redis_batch_size must be an integer") from err

        if self.redis_encoding is None:
            self.redis_encoding = settings.get(
                "REDIS_ENCODING", defaults.REDIS_ENCODING
            )

        self.logger.info(
            "Reading start URLs from redis key '%(redis_key)s' "
            "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s)",
            self.__dict__,
        )

        self.server = connection.from_settings(crawler.settings)

        # Pick the fetch/count pair matching the redis data type in use.
        if settings.getbool("REDIS_START_URLS_AS_SET", defaults.START_URLS_AS_SET):
            self.fetch_data = self.server.spop
            self.count_size = self.server.scard
        elif settings.getbool("REDIS_START_URLS_AS_ZSET", defaults.START_URLS_AS_ZSET):
            self.fetch_data = self.pop_priority_queue
            self.count_size = self.server.zcard
        else:
            self.fetch_data = self.pop_list_queue
            self.count_size = self.server.llen

        if self.max_idle_time is None:
            self.max_idle_time = settings.get(
                "MAX_IDLE_TIME_BEFORE_CLOSE", defaults.MAX_IDLE_TIME
            )

        try:
            self.max_idle_time = int(self.max_idle_time)
        except (TypeError, ValueError) as err:
            raise ValueError("max_idle_time must be an integer") from err

        # The idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from redis queue
        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)

    def pop_list_queue(self, redis_key, batch_size):
        """Read up to ``batch_size`` items from the head of a list and trim them."""
        with self.server.pipeline() as pipe:
            pipe.lrange(redis_key, 0, batch_size - 1)
            pipe.ltrim(redis_key, batch_size, -1)
            datas, _ = pipe.execute()
        return datas

    def pop_priority_queue(self, redis_key, batch_size):
        """Read and remove up to ``batch_size`` highest-scored sorted-set members."""
        with self.server.pipeline() as pipe:
            pipe.zrevrange(redis_key, 0, batch_size - 1)
            pipe.zremrangebyrank(redis_key, -batch_size, -1)
            datas, _ = pipe.execute()
        return datas

    def next_requests(self):
        """Yield the requests built from the next batch of redis messages."""
        # XXX: Do we need to use a timeout here?
        found = 0
        datas = self.fetch_data(self.redis_key, self.redis_batch_size)
        for data in datas:
            reqs = self.make_request_from_data(data)
            if isinstance(reqs, Iterable):
                for req in reqs:
                    yield req
                    # XXX: should be here?
                    found += 1
                    self.logger.info(f"start req url:{req.url}")
            elif reqs:
                yield reqs
                found += 1
            else:
                self.logger.debug(f"Request not made from data: {data}")

        if found:
            self.logger.debug(f"Read {found} requests from '{self.redis_key}'")

    def make_request_from_data(self, data):
        """Returns a `Request` instance for data coming from Redis.

        Override this function to customize how the pushed `data` is turned
        into a request. The default implementation supports JSON objects
        containing `url`, `meta` and other optional parameters. `meta` is a
        nested JSON object which contains sub-data.

        After parsing the data, a FormRequest is built with `url`, `meta`,
        `method` and the remaining keys as `formdata`.

        For example:

        .. code:: json

            {
                "url": "https://example.com",
                "meta": {
                    "job-id": "123xsd",
                    "start-date": "dd/mm/yy"
                },
                "url_cookie_key": "fertxsas",
                "method": "POST"
            }

        If the data is not valid JSON, it is used as the URL of the request
        (deprecated).
        If the data is valid JSON but not an object, or `url` is missing,
        `[]` is returned. So you should verify the `url` in the data.
        If `method` is empty, the request object will set method to 'GET', optional.
        If `meta` is empty, the request object will set `meta` to an empty dictionary, optional.

        This json supported data can be accessed from 'scrapy.spider' through response.
        'request.url', 'request.meta', 'request.cookies', 'request.method'

        Parameters
        ----------
        data : bytes
            Message from redis.

        """
        formatted_data = bytes_to_str(data, self.redis_encoding)

        if not is_dict(formatted_data):
            self.logger.warning(
                f"{TextColor.WARNING}WARNING: String request is deprecated, please use JSON data format. "
                f"Detail information, please check https://github.com/rmax/scrapy-redis#features{TextColor.ENDC}"
            )
            return FormRequest(formatted_data, dont_filter=True)

        parameter = json.loads(formatted_data)

        # Valid JSON is not necessarily an object: numbers, strings and lists
        # have no ``get``/``pop`` and cannot describe a request.
        if not isinstance(parameter, dict):
            self.logger.warning(
                f"{TextColor.WARNING}The data from Redis is not a JSON object{TextColor.ENDC}"
            )
            return []

        if parameter.get("url", None) is None:
            self.logger.warning(
                f"{TextColor.WARNING}The data from Redis has no url key in push data{TextColor.ENDC}"
            )
            return []

        url = parameter.pop("url")
        method = parameter.pop("method").upper() if "method" in parameter else "GET"
        metadata = parameter.pop("meta") if "meta" in parameter else {}

        # Whatever keys are left are sent as form data.
        return FormRequest(
            url, dont_filter=True, method=method, formdata=parameter, meta=metadata
        )

    def schedule_next_requests(self):
        """Schedules a request if available"""
        # TODO: While there is capacity, schedule a batch of redis requests.
        for req in self.next_requests():
            # see https://github.com/scrapy/scrapy/issues/5994
            if scrapy_version >= (2, 6):
                self.crawler.engine.crawl(req)
            else:
                self.crawler.engine.crawl(req, spider=self)

    def spider_idle(self):
        """
        Schedules a request if available, otherwise waits.
        or close spider when waiting seconds > MAX_IDLE_TIME_BEFORE_CLOSE.
        MAX_IDLE_TIME_BEFORE_CLOSE will not affect SCHEDULER_IDLE_BEFORE_CLOSE.
        """
        # Pending messages reset the idle clock.
        if self.server is not None and self.count_size(self.redis_key) > 0:
            self.spider_idle_start_time = int(time.time())

        self.schedule_next_requests()

        idle_time = int(time.time()) - self.spider_idle_start_time
        # A max idle time of 0 disables the check, keeping the spider open.
        if self.max_idle_time != 0 and idle_time >= self.max_idle_time:
            return
        raise DontCloseSpider
class RedisSpider(RedisMixin, Spider):
    """Spider that reads urls from a redis queue when idle.

    Attributes
    ----------
    redis_key : str (default: REDIS_START_URLS_KEY)
        Redis key where to fetch start URLs from.
    redis_batch_size : int (default: CONCURRENT_REQUESTS)
        Number of messages to fetch from redis on each attempt.
    redis_encoding : str (default: REDIS_ENCODING)
        Encoding to use when decoding messages from redis queue.

    Settings
    --------
    REDIS_START_URLS_KEY : str (default: "%(name)s:start_urls")
        Default Redis key where to fetch start URLs from; the template is
        filled with the spider name.
    REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
        Default number of messages to fetch from redis on each attempt.
    REDIS_START_URLS_AS_SET : bool (default: False)
        Use SET operations to retrieve messages from the redis queue. If
        False (and the sorted-set variant is off too), the messages are
        read from a list.
    REDIS_ENCODING : str (default: "utf-8")
        Default encoding to use when decoding messages from redis queue.

    """

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider and connect it to redis."""
        spider = super().from_crawler(crawler, *args, **kwargs)
        spider.setup_redis(crawler)
        return spider
class RedisCrawlSpider(RedisMixin, CrawlSpider):
    """Crawl spider that reads urls from a redis queue when idle.

    Attributes
    ----------
    redis_key : str (default: REDIS_START_URLS_KEY)
        Redis key where to fetch start URLs from.
    redis_batch_size : int (default: CONCURRENT_REQUESTS)
        Number of messages to fetch from redis on each attempt.
    redis_encoding : str (default: REDIS_ENCODING)
        Encoding to use when decoding messages from redis queue.

    Settings
    --------
    REDIS_START_URLS_KEY : str (default: "%(name)s:start_urls")
        Default Redis key where to fetch start URLs from; the template is
        filled with the spider name.
    REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
        Default number of messages to fetch from redis on each attempt.
    REDIS_START_URLS_AS_SET : bool (default: False)
        Use SET operations to retrieve messages from the redis queue.
    REDIS_ENCODING : str (default: "utf-8")
        Default encoding to use when decoding messages from redis queue.

    """

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider and connect it to redis."""
        spider = super().from_crawler(crawler, *args, **kwargs)
        spider.setup_redis(crawler)
        return spider
@@ -0,0 +1,90 @@
from datetime import datetime
from scrapy.statscollectors import StatsCollector
from .connection import from_settings as redis_from_settings
from .defaults import SCHEDULER_PERSIST, STATS_KEY
from .utils import convert_bytes_to_str
class RedisStatsCollector(StatsCollector):
    """
    Stats Collector based on Redis

    Stats are kept in a redis hash whose key is built from ``STATS_KEY``
    and the spider name.
    """

    def __init__(self, crawler, spider=None):
        """Connect to redis using the crawler settings.

        Parameters
        ----------
        crawler : scrapy.crawler.Crawler
        spider : scrapy.spiders.Spider, optional
            Spider whose name is used in the stats key; when omitted the
            name of ``crawler.spidercls`` is used.

        """
        super().__init__(crawler)
        self.server = redis_from_settings(crawler.settings)
        self.spider = spider
        self.spider_name = spider.name if spider else crawler.spidercls.name
        self.stats_key = crawler.settings.get("STATS_KEY", STATS_KEY)
        # getbool so that string values such as "False" or "0" (e.g. given
        # on the command line) are not treated as truthy.
        self.persist = crawler.settings.getbool("SCHEDULER_PERSIST", SCHEDULER_PERSIST)

    def _get_key(self, spider=None):
        """Return the hash name of stats"""
        if spider:
            return self.stats_key % {"spider": spider.name}
        if self.spider:
            return self.stats_key % {"spider": self.spider.name}
        return self.stats_key % {"spider": self.spider_name or "scrapy"}

    @classmethod
    def from_crawler(cls, crawler):
        """Build a collector from the crawler."""
        return cls(crawler)

    @classmethod
    def from_spider(cls, spider):
        """Build a collector from the spider's crawler."""
        return cls(spider.crawler)

    def get_value(self, key, default=None, spider=None):
        """Return the value of hash stats

        The stored value is converted with ``int``; ``default`` is returned
        when the field does not exist.
        """
        if self.server.hexists(self._get_key(spider), key):
            return int(self.server.hget(self._get_key(spider), key))
        else:
            return default

    def get_stats(self, spider=None):
        """Return the all of the values of hash stats"""
        stats = self.server.hgetall(self._get_key(spider))
        if stats:
            return convert_bytes_to_str(stats)
        return {}

    def set_value(self, key, value, spider=None):
        """Set the value according to hash key of stats"""
        # Datetimes are stored as their POSIX timestamp.
        if isinstance(value, datetime):
            value = value.timestamp()
        self.server.hset(self._get_key(spider), key, value)

    def set_stats(self, stats, spider=None):
        """Set all the hash stats"""
        # HSET with ``mapping`` replaces the deprecated HMSET call.
        self.server.hset(self._get_key(spider), mapping=stats)

    def inc_value(self, key, count=1, start=0, spider=None):
        """Set increment of value according to key"""
        if not self.server.hexists(self._get_key(spider), key):
            # Initialise the field in the same hash that is incremented below.
            self.set_value(key, start, spider=spider)
        self.server.hincrby(self._get_key(spider), key, count)

    def max_value(self, key, value, spider=None):
        """Set max value between current and new value"""
        current = self.get_value(key, value, spider=spider)
        self.set_value(key, max(current, value), spider=spider)

    def min_value(self, key, value, spider=None):
        """Set min value between current and new value"""
        current = self.get_value(key, value, spider=spider)
        self.set_value(key, min(current, value), spider=spider)

    def clear_stats(self, spider=None):
        """Clear all the hash stats"""
        self.server.delete(self._get_key(spider))

    def open_spider(self, spider):
        """Set spider to self"""
        if spider:
            self.spider = spider

    def close_spider(self, spider, reason):
        """Clear spider and clear stats"""
        self.spider = None
        if not self.persist:
            self.clear_stats(spider)
@@ -0,0 +1,44 @@
import json
from json import JSONDecodeError
import six
class TextColor:
    """Escape sequences used to decorate log messages."""

    # Colors.
    HEADER = "\033[95m"
    OKBLUE = "\033[94m"
    OKCYAN = "\033[96m"
    OKGREEN = "\033[92m"
    WARNING = "\033[93m"
    FAIL = "\033[91m"
    # Ends the preceding decoration.
    ENDC = "\033[0m"
    # Text attributes.
    BOLD = "\033[1m"
    UNDERLINE = "\033[4m"
def bytes_to_str(s, encoding="utf-8"):
    """Decode *s* with *encoding* when it is a bytes object on Python 3.

    Any other value is returned unchanged.
    """
    needs_decoding = six.PY3 and isinstance(s, bytes)
    return s.decode(encoding) if needs_decoding else s
def is_dict(string_content):
    """Tell whether *string_content* can be parsed as JSON.

    Despite the name, any valid JSON document (not only an object) makes
    this return True; a ``JSONDecodeError`` makes it return False.
    """
    try:
        json.loads(string_content)
    except JSONDecodeError:
        parsed_ok = False
    else:
        parsed_ok = True
    return parsed_ok
def convert_bytes_to_str(data, encoding="utf-8"):
    """Recursively decode ``bytes`` found in *data* into ``str``.

    Parameters
    ----------
    data : object
        A bytes object, a dict (keys and values are converted), a tuple
        (every element is converted) or any other value.
    encoding : str, optional
        Encoding used for every decoded value, including nested ones.

    Returns
    -------
    object
        ``str`` for bytes, a new ``dict`` for dicts, a new ``tuple`` for
        tuples; any other value is returned unchanged.

    """
    if isinstance(data, bytes):
        return data.decode(encoding)
    if isinstance(data, dict):
        return {
            convert_bytes_to_str(key, encoding): convert_bytes_to_str(value, encoding)
            for key, value in data.items()
        }
    if isinstance(data, tuple):
        # Build a real tuple: a lazy ``map`` object could only be consumed once.
        return tuple(convert_bytes_to_str(item, encoding) for item in data)
    return data
@@ -0,0 +1,69 @@
from unittest import mock
from scrapy.settings import Settings
from scrapy_redis import defaults
from scrapy_redis.connection import from_settings, get_redis, get_redis_from_settings
class TestGetRedis:
    """Checks for ``get_redis``."""

    def test_default_instance(self):
        # Without arguments the default client class is instantiated.
        assert isinstance(get_redis(), defaults.REDIS_CLS)

    def test_custom_class(self):
        # A custom class receives the remaining keyword arguments.
        fake_cls = mock.Mock()

        client = get_redis(param="foo", redis_cls=fake_cls)

        assert client is fake_cls.return_value
        fake_cls.assert_called_with(param="foo")

    def test_from_url(self):
        # A url routes the construction through ``from_url``.
        fake_cls = mock.Mock()
        url = "redis://localhost"

        client = get_redis(redis_cls=fake_cls, url=url, param="foo")

        assert client is fake_cls.from_url.return_value
        fake_cls.from_url.assert_called_with(url, param="foo")
class TestFromSettings:
    """Checks for building a Redis client through ``from_settings``."""

    def setup_method(self):
        """Create a mock client class and settings before each test.

        Named ``setup_method`` because pytest 8 removed support for the
        nose-style ``setup`` hook; with the old name the fixture attributes
        would never be created.
        """
        self.redis_cls = mock.Mock()
        self.expected_params = {
            "timeout": 0,
            "flag": False,
        }
        self.settings = Settings(
            {
                "REDIS_PARAMS": dict(self.expected_params, redis_cls=self.redis_cls),
            }
        )

    def test_redis_cls_default(self):
        """Empty settings produce an instance of ``defaults.REDIS_CLS``."""
        server = from_settings(Settings())
        assert isinstance(server, defaults.REDIS_CLS)

    def test_redis_cls_custom_path(self):
        """``redis_cls`` may be given as a dotted import path string."""
        self.settings["REDIS_PARAMS"]["redis_cls"] = "unittest.mock.Mock"
        server = from_settings(self.settings)
        assert isinstance(server, mock.Mock)

    def test_default_params(self):
        """User params are merged on top of ``defaults.REDIS_PARAMS``."""
        server = from_settings(self.settings)
        assert server is self.redis_cls.return_value
        self.redis_cls.assert_called_with(
            **dict(defaults.REDIS_PARAMS, **self.expected_params)
        )

    def test_override_default_params(self):
        """Every default param can be overridden through ``REDIS_PARAMS``."""
        # Only the keys are needed, so iterate the mapping directly.
        for key in defaults.REDIS_PARAMS:
            self.expected_params[key] = self.settings["REDIS_PARAMS"][key] = object()

        server = from_settings(self.settings)
        assert server is self.redis_cls.return_value
        self.redis_cls.assert_called_with(**self.expected_params)
def test_get_server_from_settings_alias():
    """``from_settings`` and ``get_redis_from_settings`` are the same object."""
    alias, target = from_settings, get_redis_from_settings
    assert alias is target
@@ -0,0 +1,108 @@
from unittest import mock
from scrapy.http import Request
from scrapy.settings import Settings
from scrapy_redis.dupefilter import RFPDupeFilter
def get_redis_mock():
    """Return a ``mock.Mock`` server whose ``sadd`` mimics Redis set insertion.

    Each returned mock owns a private in-memory store, so fingerprints
    added through one mock are not visible through another.
    """
    server = mock.Mock()
    # key -> set of members. Held in the closure rather than in a mutable
    # default argument, which makes the per-mock state explicit.
    store = {}

    def sadd(key, fp):
        """Add ``fp`` to the set at ``key``; return 1 if it was new, else 0."""
        members = store.setdefault(key, set())
        if fp in members:
            return 0
        members.add(fp)
        return 1

    server.sadd = sadd
    return server
class TestRFPDupeFilter:
    """Unit tests for ``RFPDupeFilter`` backed by the in-memory Redis stand-in."""

    def setup_method(self):
        """Build a fresh fake server and dupefilter before each test.

        Named ``setup_method`` because pytest 8 removed support for the
        nose-style ``setup`` hook; with the old name ``self.df`` would never
        be created.
        """
        self.server = get_redis_mock()
        self.key = "dupefilter:1"
        self.df = RFPDupeFilter(self.server, self.key)

    def test_request_seen(self):
        """A request is unseen once, then seen; method and URL both matter."""
        req = Request("http://example.com")

        def same_request():
            assert not self.df.request_seen(req)
            assert self.df.request_seen(req)

        def diff_method():
            # Local renamed so it no longer shadows the enclosing helper.
            post_req = Request("http://example.com", method="POST")
            assert self.df.request_seen(req)
            assert not self.df.request_seen(post_req)

        def diff_url():
            other_req = Request("http://example2.com")
            assert self.df.request_seen(req)
            assert not self.df.request_seen(other_req)

        # Order matters: the later helpers rely on ``req`` already being seen.
        same_request()
        diff_method()
        diff_url()

    def test_overridable_request_fingerprinter(self):
        """``request_seen`` goes through the instance's ``request_fingerprint``."""
        req = Request("http://example.com")
        self.df.request_fingerprint = mock.Mock(wraps=self.df.request_fingerprint)
        assert not self.df.request_seen(req)
        self.df.request_fingerprint.assert_called_with(req)

    def test_clear_deletes(self):
        """``clear`` deletes the dupefilter key on the server."""
        self.df.clear()
        self.server.delete.assert_called_with(self.key)

    def test_close_calls_clear(self):
        """``close`` calls ``clear`` whether or not a reason is given."""
        self.df.clear = mock.Mock(wraps=self.df.clear)
        self.df.close()
        self.df.close(reason="foo")
        assert self.df.clear.call_count == 2
def test_log_dupes():
    """Five duplicates yield one debug log without ``debug`` and five with it."""

    def check_log_count(dupefilter, n_dupes, expected_logs):
        # Wrap the logger's debug method so calls can be counted.
        dupefilter.logger.debug = mock.Mock(wraps=dupefilter.logger.debug)
        for _ in range(n_dupes):
            request = Request("http://example")
            dupefilter.log(request, spider=mock.Mock())
        assert dupefilter.logger.debug.call_count == expected_logs

    server = get_redis_mock()

    quiet_filter = RFPDupeFilter(server, "foo")  # debug=False
    check_log_count(quiet_filter, 5, 1)

    debug_filter = RFPDupeFilter(server, "foo", debug=True)
    check_log_count(debug_filter, 5, 5)
@mock.patch("scrapy_redis.dupefilter.get_redis_from_settings")
class TestFromMethods:
    """Checks for the ``RFPDupeFilter`` alternate constructors.

    The class-level patch passes the mocked ``get_redis_from_settings`` as an
    extra argument to every ``test*`` method.
    """

    def setup_method(self):
        """Create the settings used by each test.

        Named ``setup_method`` because pytest 8 removed support for the
        nose-style ``setup`` hook.
        """
        self.settings = Settings(
            {
                "DUPEFILTER_DEBUG": True,
            }
        )

    def test_from_settings(self, get_redis_from_settings):
        df = RFPDupeFilter.from_settings(self.settings)
        self.assert_dupefilter(df, get_redis_from_settings)

    def test_from_crawler(self, get_redis_from_settings):
        crawler = mock.Mock(settings=self.settings)
        df = RFPDupeFilter.from_crawler(crawler)
        self.assert_dupefilter(df, get_redis_from_settings)

    def assert_dupefilter(self, df, get_redis_from_settings):
        """Shared assertions: server source, key prefix and debug flag."""
        assert df.server is get_redis_from_settings.return_value
        assert df.key.startswith("dupefilter:")
        assert df.debug  # true
@@ -0,0 +1,7 @@
import scrapy_redis
def test_package_metadata():
    """The package exposes truthy author, email and version metadata."""
    for attribute in ("__author__", "__email__", "__version__"):
        assert getattr(scrapy_redis, attribute)
@@ -0,0 +1,18 @@
from scrapy_redis import picklecompat
def test_picklecompat():
    """A request-like dict survives a ``dumps``/``loads`` round trip unchanged."""
    payload = {
        "_encoding": "utf-8",
        "body": "",
        "callback": "_response_downloaded",
        "cookies": {},
        "dont_filter": False,
        "errback": None,
        "headers": {"Referer": ["http://www.dmoz.org/"]},
        "meta": {"depth": 1, "link_text": "Fran\xe7ais", "rule": 0},
        "method": "GET",
        "priority": 0,
        "url": "http://www.dmoz.org/World/Fran%C3%A7ais/",
    }
    serialized = picklecompat.dumps(payload)
    restored = picklecompat.loads(serialized)
    assert payload == restored
@@ -0,0 +1,38 @@
from unittest import mock
from scrapy import Spider
from scrapy.http import Request
from scrapy_redis.queue import Base
class TestBaseQueue:
    """Checks request (de)serialization in the ``Base`` queue."""

    queue_cls = Base

    def setup_method(self):
        """Create a mock server, a spider and a queue before each test.

        Named ``setup_method`` because pytest 8 removed support for the
        nose-style ``setup`` hook; with the old name ``self.q`` would never
        be created.
        """
        self.server = mock.Mock()
        self.spider = Spider(name="foo")
        self.spider.parse_method = lambda x: x
        self.key = "key"
        self.q = self.queue_cls(self.server, self.spider, self.key)

    def test_encode_decode_requests(self, q=None):
        """Encoding then decoding a request keeps its url, meta and callback.

        ``q`` lets other tests reuse these assertions with a different queue.
        """
        if q is None:
            q = self.q
        req = Request(
            "http://example.com", callback=self.spider.parse, meta={"foo": "bar"}
        )
        out = q._decode_request(q._encode_request(req))
        assert req.url == out.url
        assert req.meta == out.meta
        assert req.callback == out.callback

    def test_custom_serializer(self):
        """A custom serializer's ``dumps`` and ``loads`` are each used once."""
        serializer = mock.Mock()
        # Identity functions, so the round trip still yields a decodable object.
        serializer.dumps = mock.Mock(side_effect=lambda x: x)
        serializer.loads = mock.Mock(side_effect=lambda x: x)
        q = Base(self.server, self.spider, self.key, serializer=serializer)
        self.test_encode_decode_requests(q)
        assert serializer.dumps.call_count == 1
        assert serializer.loads.call_count == 1
@@ -0,0 +1,296 @@
import os
from unittest import TestCase, mock
import redis
from scrapy import Request, Spider
from scrapy.settings import Settings
from scrapy.utils.test import get_crawler
from scrapy_redis import connection
from scrapy_redis.dupefilter import RFPDupeFilter
from scrapy_redis.queue import FifoQueue, LifoQueue, PriorityQueue
from scrapy_redis.scheduler import Scheduler
# Redis location for the tests; override with the REDIS_HOST / REDIS_PORT
# environment variables.
REDIS_HOST = os.getenv("REDIS_HOST", "localhost")
REDIS_PORT = int(os.getenv("REDIS_PORT", 6379))
def get_spider(*args, **kwargs):
    """Create a spider through a crawler built with ``get_crawler``.

    The ``spidercls`` and ``settings_dict`` keyword arguments are removed
    from ``kwargs`` and used to build the crawler; all remaining arguments
    are forwarded to the crawler's ``_create_spider``.
    """
    spider_class = kwargs.pop("spidercls", None)
    settings = kwargs.pop("settings_dict", None)
    crawler = get_crawler(spidercls=spider_class, settings_dict=settings)
    return crawler._create_spider(*args, **kwargs)
class RedisTestMixin:
    """Mixin providing a cached Redis client and prefix-based key cleanup."""

    @property
    def server(self):
        """Redis client for REDIS_HOST/REDIS_PORT, created on first access."""
        try:
            return self._redis
        except AttributeError:
            self._redis = redis.Redis(REDIS_HOST, REDIS_PORT)
            return self._redis

    def clear_keys(self, prefix):
        """Delete every key whose name starts with ``prefix``."""
        matching = self.server.keys(prefix + "*")
        if not matching:
            return
        self.server.delete(*matching)
class DupeFilterTest(RedisTestMixin, TestCase):
    """Exercises ``RFPDupeFilter`` against the configured Redis server."""

    def setUp(self):
        self.key = "scrapy_redis:tests:dupefilter:"
        self.df = RFPDupeFilter(self.server, self.key)

    def tearDown(self):
        self.clear_keys(self.key)

    def test_dupe_filter(self):
        """The first sighting of a request is new; the second is a duplicate."""
        request = Request("http://example.com")

        first_seen = self.df.request_seen(request)
        self.assertFalse(first_seen)
        second_seen = self.df.request_seen(request)
        self.assertTrue(second_seen)

        self.df.close("nothing")
class QueueTestMixin(RedisTestMixin):
    """Shared fixture and ``clear`` test for the queue implementations."""

    # Concrete queue class under test; set by each subclass.
    queue_cls = None

    def setUp(self):
        self.spider = get_spider(name="myspider")
        self.key = f"scrapy_redis:tests:{self.spider.name}:queue"
        self.q = self.queue_cls(self.server, Spider("myspider"), self.key)

    def tearDown(self):
        self.clear_keys(self.key)

    def test_clear(self):
        """Pushing ten requests and clearing leaves the queue empty."""
        self.assertEqual(len(self.q), 0)

        # XXX: can't use the same url for all requests, as SpiderPriorityQueue
        # uses redis' set implementation and we would end up with only one
        # request in the set, failing the test. Note that SpiderPriorityQueue
        # therefore acts as a request duplication filter whenever the
        # serialized requests are the same. This might be unwanted for
        # repetitive requests to the same page, even with the
        # dont_filter=True flag.
        for page in range(10):
            self.q.push(Request(f"http://example.com/?page={page}"))
        self.assertEqual(len(self.q), 10)

        self.q.clear()
        self.assertEqual(len(self.q), 0)
class FifoQueueTest(QueueTestMixin, TestCase):
    """``FifoQueue`` pops requests in the order they were pushed."""

    queue_cls = FifoQueue

    def test_queue(self):
        first = Request("http://example.com/page1")
        second = Request("http://example.com/page2")
        for request in (first, second):
            self.q.push(request)

        popped_first = self.q.pop()
        popped_second = self.q.pop(timeout=1)

        self.assertEqual(popped_first.url, first.url)
        self.assertEqual(popped_second.url, second.url)
class PriorityQueueTest(QueueTestMixin, TestCase):
    """``PriorityQueue`` pops requests from highest to lowest priority."""

    queue_cls = PriorityQueue

    def test_queue(self):
        req1 = Request("http://example.com/page1", priority=100)
        req2 = Request("http://example.com/page2", priority=50)
        # Each request needs its own URL. The assertions compare URLs only,
        # so when req3 shared req2's URL a fully reversed pop order
        # (req2, req1, req3) still passed.
        req3 = Request("http://example.com/page3", priority=200)
        self.q.push(req1)
        self.q.push(req2)
        self.q.push(req3)
        out1 = self.q.pop()
        out2 = self.q.pop(timeout=0)
        out3 = self.q.pop(timeout=1)
        self.assertEqual(out1.url, req3.url)
        self.assertEqual(out2.url, req1.url)
        self.assertEqual(out3.url, req2.url)
class LifoQueueTest(QueueTestMixin, TestCase):
    """``LifoQueue`` pops the most recently pushed request first."""

    queue_cls = LifoQueue

    def test_queue(self):
        older = Request("http://example.com/page1")
        newer = Request("http://example.com/page2")
        for request in (older, newer):
            self.q.push(request)

        popped_first = self.q.pop()
        popped_second = self.q.pop(timeout=1)

        self.assertEqual(popped_first.url, newer.url)
        self.assertEqual(popped_second.url, older.url)
class SchedulerTest(RedisTestMixin, TestCase):
    """End-to-end checks of ``Scheduler`` against the configured Redis server."""

    def setUp(self):
        prefix = "scrapy_redis:tests:"
        self.key_prefix = prefix
        self.queue_key = prefix + "%(spider)s:requests"
        self.dupefilter_key = prefix + "%(spider)s:dupefilter"

        scheduler_settings = {
            "REDIS_HOST": REDIS_HOST,
            "REDIS_PORT": REDIS_PORT,
            "SCHEDULER_QUEUE_KEY": self.queue_key,
            "SCHEDULER_DUPEFILTER_KEY": self.dupefilter_key,
            "SCHEDULER_FLUSH_ON_START": False,
            "SCHEDULER_PERSIST": False,
            "SCHEDULER_SERIALIZER": "pickle",
            "DUPEFILTER_CLASS": "scrapy_redis.dupefilter.RFPDupeFilter",
        }
        self.spider = get_spider(name="myspider", settings_dict=scheduler_settings)
        self.scheduler = Scheduler.from_crawler(self.spider.crawler)

    def tearDown(self):
        self.clear_keys(self.key_prefix)

    def test_scheduler(self):
        """Enqueue, de-duplicate and dequeue a single request."""
        scheduler = self.scheduler

        # default no persist
        self.assertFalse(scheduler.persist)
        scheduler.open(self.spider)
        self.assertEqual(len(scheduler), 0)

        request = Request("http://example.com")
        scheduler.enqueue_request(request)
        self.assertTrue(scheduler.has_pending_requests())
        self.assertEqual(len(scheduler), 1)

        # dupefilter in action
        scheduler.enqueue_request(request)
        self.assertEqual(len(scheduler), 1)

        popped = scheduler.next_request()
        self.assertEqual(popped.url, request.url)
        self.assertFalse(scheduler.has_pending_requests())
        self.assertEqual(len(scheduler), 0)

        scheduler.close("finish")

    def test_scheduler_persistent(self):
        """With ``persist`` on, queued requests survive a close/open cycle."""
        # TODO: Improve this test to avoid the need to check for log messages.
        self.spider.log = mock.Mock(spec=self.spider.log)
        scheduler = self.scheduler

        scheduler.persist = True
        scheduler.open(self.spider)
        self.assertEqual(self.spider.log.call_count, 0)

        scheduler.enqueue_request(Request("http://example.com/page1"))
        scheduler.enqueue_request(Request("http://example.com/page2"))
        self.assertTrue(scheduler.has_pending_requests())
        scheduler.close("finish")

        # Reopening reports the two requests left over from the first run.
        scheduler.open(self.spider)
        expected_calls = [mock.call("Resuming crawl (2 requests scheduled)")]
        self.spider.log.assert_has_calls(expected_calls)
        self.assertEqual(len(scheduler), 2)

        # Closing with persist switched off empties the queue.
        scheduler.persist = False
        scheduler.close("finish")
        self.assertEqual(len(scheduler), 0)
class ConnectionTest(TestCase):
    """Checks how ``connection.from_settings`` resolves connection parameters."""

    @staticmethod
    def _connect_args(settings):
        """Return the connection kwargs of the client built from ``settings``."""
        client = connection.from_settings(settings)
        return client.connection_pool.connection_kwargs

    def test_redis_url(self):
        """We can get a connection from just REDIS_URL."""
        url_only = Settings({"REDIS_URL": "redis://foo:bar@localhost:9001/42"})
        args = self._connect_args(url_only)

        self.assertEqual(args["host"], "localhost")
        self.assertEqual(args["port"], 9001)
        self.assertEqual(args["password"], "bar")
        self.assertEqual(args["db"], 42)

    def test_redis_host_port(self):
        """We can get a connection from REDIS_HOST/REDIS_PORT."""
        host_port = Settings({"REDIS_HOST": "localhost", "REDIS_PORT": 9001})
        args = self._connect_args(host_port)

        self.assertEqual(args["host"], "localhost")
        self.assertEqual(args["port"], 9001)

    def test_redis_url_precedence(self):
        """REDIS_URL takes precedence over REDIS_HOST/REDIS_PORT."""
        both = Settings(
            {
                "REDIS_HOST": "baz",
                "REDIS_PORT": 1337,
                "REDIS_URL": "redis://foo:bar@localhost:9001/42",
            }
        )
        args = self._connect_args(both)

        self.assertEqual(args["host"], "localhost")
        self.assertEqual(args["port"], 9001)
        self.assertEqual(args["password"], "bar")
        self.assertEqual(args["db"], 42)

    def test_redis_host_port_fallback(self):
        """We fall back to REDIS_HOST/REDIS_PORT if REDIS_URL is None."""
        null_url = Settings(
            {"REDIS_HOST": "baz", "REDIS_PORT": 1337, "REDIS_URL": None}
        )
        args = self._connect_args(null_url)

        self.assertEqual(args["host"], "baz")
        self.assertEqual(args["port"], 1337)

    def test_redis_default(self):
        """We use default values for REDIS_HOST/REDIS_PORT."""
        args = self._connect_args(Settings())

        self.assertEqual(args["host"], "localhost")
        self.assertEqual(args["port"], 6379)
@@ -0,0 +1,197 @@
import contextlib
import os
from unittest import mock
import pytest
from scrapy import signals
from scrapy.exceptions import DontCloseSpider
from scrapy.settings import Settings
from scrapy_redis.spiders import RedisCrawlSpider, RedisSpider
# Redis location for the tests; override with the REDIS_HOST / REDIS_PORT
# environment variables.
REDIS_HOST = os.getenv("REDIS_HOST", "localhost")
REDIS_PORT = int(os.getenv("REDIS_PORT", 6379))
@contextlib.contextmanager
def flushall(server):
    """Context manager that calls ``server.flushall()`` when the block exits.

    The flush runs on normal exit and when the block raises, so state left
    on ``server`` by one test is not seen by the next.
    """
    try:
        yield
    finally:
        # Runs even if the managed block raised.
        server.flushall()
class MySpider(RedisSpider):
    """Minimal ``RedisSpider`` subclass used as a test fixture."""

    name = "myspider"
class MyCrawlSpider(RedisCrawlSpider):
    """Minimal ``RedisCrawlSpider`` subclass used as a test fixture."""

    name = "myspider"
def get_crawler(**kwargs):
    """Return a ``mock.Mock`` crawler whose settings point at the test Redis.

    Any keyword arguments are forwarded to ``mock.Mock``.
    """
    redis_settings = Settings(
        {
            "REDIS_HOST": REDIS_HOST,
            "REDIS_PORT": REDIS_PORT,
        }
    )
    return mock.Mock(settings=redis_settings, **kwargs)
class TestRedisMixin_setup_redis:
    """Validation and wiring checks for ``setup_redis``."""

    def setup_method(self):
        """Create a fresh spider before each test.

        Named ``setup_method`` because pytest 8 removed support for the
        nose-style ``setup`` hook; with the old name ``self.myspider`` would
        never be created.
        """
        self.myspider = MySpider()

    def test_crawler_required(self):
        """Without a crawler, ``setup_redis`` raises a ValueError naming it."""
        with pytest.raises(ValueError) as excinfo:
            self.myspider.setup_redis()
        assert "crawler" in str(excinfo.value)

    def test_requires_redis_key(self):
        """An empty ``redis_key`` is rejected."""
        self.myspider.crawler = get_crawler()
        self.myspider.redis_key = ""
        with pytest.raises(ValueError) as excinfo:
            self.myspider.setup_redis()
        assert "redis_key" in str(excinfo.value)

    def test_invalid_batch_size(self):
        """A non-numeric ``redis_batch_size`` is rejected."""
        self.myspider.redis_batch_size = "x"
        self.myspider.crawler = get_crawler()
        with pytest.raises(ValueError) as excinfo:
            self.myspider.setup_redis()
        assert "redis_batch_size" in str(excinfo.value)

    def test_invalid_idle_time(self):
        """A non-numeric ``max_idle_time`` is rejected."""
        self.myspider.max_idle_time = "x"
        self.myspider.crawler = get_crawler()
        with pytest.raises(ValueError) as excinfo:
            self.myspider.setup_redis()
        assert "max_idle_time" in str(excinfo.value)

    @mock.patch("scrapy_redis.spiders.connection")
    def test_via_from_crawler(self, connection):
        """``from_crawler`` sets the server and connects the idle signal once."""
        server = connection.from_settings.return_value = mock.Mock()
        crawler = get_crawler()
        myspider = MySpider.from_crawler(crawler)
        assert myspider.server is server
        connection.from_settings.assert_called_with(crawler.settings)
        crawler.signals.connect.assert_called_with(
            myspider.spider_idle, signal=signals.spider_idle
        )
        # Second call does nothing.
        server = myspider.server
        crawler.signals.connect.reset_mock()
        myspider.setup_redis()
        assert myspider.server is server
        assert crawler.signals.connect.call_count == 0
@pytest.mark.parametrize("spider_cls", [MySpider, MyCrawlSpider])
def test_from_crawler_with_spider_arguments(spider_cls):
    """Spider arguments given as strings are formatted and cast as expected."""
    spider_kwargs = {
        "redis_key": "key:%(name)s",
        "redis_batch_size": "2000",
        "max_idle_time": "100",
    }
    spider = spider_cls.from_crawler(get_crawler(), "foo", **spider_kwargs)

    assert spider.name == "foo"
    # The key template is filled in with the spider name.
    assert spider.redis_key == "key:foo"
    # Numeric arguments passed as strings come back as integers.
    assert spider.redis_batch_size == 2000
    assert spider.max_idle_time == 100
class MockRequest(mock.Mock):
    """``mock.Mock`` stand-in for a request that compares and hashes by URL."""

    def __init__(self, url, **kwargs):
        # Extra keyword arguments are accepted but deliberately not
        # forwarded to ``mock.Mock``.
        super().__init__()
        self.url = url

    def __eq__(self, other):
        """Two requests are equal when their URLs are equal."""
        mine, theirs = self.url, other.url
        return mine == theirs

    def __hash__(self):
        """Hash by URL, consistent with ``__eq__``."""
        return hash(self.url)

    def __repr__(self):
        return "<{}({})>".format(self.__class__.__name__, self.url)
@pytest.mark.parametrize("spider_cls", [MySpider, MyCrawlSpider])
@pytest.mark.parametrize("start_urls_as_zset", [False, True])
@pytest.mark.parametrize("start_urls_as_set", [False, True])
@mock.patch("scrapy.spiders.Request", MockRequest)
def test_consume_urls_from_redis(start_urls_as_zset, start_urls_as_set, spider_cls):
    """Start URLs stored as a list, set or sorted set are consumed in batches."""
    batch_size = 5
    redis_key = "start:urls"
    # Set and sorted-set storage is checked without relying on order.
    unordered = start_urls_as_zset or start_urls_as_set

    crawler = get_crawler()
    crawler.settings.setdict(
        {
            "REDIS_HOST": REDIS_HOST,
            "REDIS_PORT": REDIS_PORT,
            "REDIS_START_URLS_KEY": redis_key,
            "REDIS_START_URLS_AS_ZSET": start_urls_as_zset,
            "REDIS_START_URLS_AS_SET": start_urls_as_set,
            "CONCURRENT_REQUESTS": batch_size,
        }
    )
    spider = spider_cls.from_crawler(crawler)

    with flushall(spider.server):
        # Pick the write command matching the storage type; the set flag
        # wins when both flags are enabled.
        if start_urls_as_set:
            push_url = spider.server.sadd
        elif start_urls_as_zset:

            def push_url(key, value):
                spider.server.zadd(key, {value: 0})

        else:
            push_url = spider.server.rpush

        urls = [f"http://example.com/{i}" for i in range(batch_size * 2)]
        expected = []
        for url in urls:
            push_url(redis_key, url)
            expected.append(MockRequest(url))

        # First call is to start requests.
        start_requests = list(spider.start_requests())
        if unordered:
            assert len(start_requests) == batch_size
            assert {r.url for r in start_requests}.issubset(r.url for r in expected)
        else:
            assert start_requests == expected[:batch_size]

        # Second call is to spider idle method.
        with pytest.raises(DontCloseSpider):
            spider.spider_idle()
        # Process remaining requests in the queue.
        with pytest.raises(DontCloseSpider):
            spider.spider_idle()

        # Last batch was passed to crawl.
        assert crawler.engine.crawl.call_count == batch_size
        if unordered:
            remaining_calls = [
                mock.call(req) for req in expected if req not in start_requests
            ]
            crawler.engine.crawl.assert_has_calls(remaining_calls, any_order=True)
        else:
            crawler.engine.crawl.assert_has_calls(
                [mock.call(req) for req in expected[batch_size:]]
            )
@@ -0,0 +1,7 @@
from scrapy_redis.utils import bytes_to_str
def test_bytes_to_str():
    """``bytes_to_str`` decodes with the default and with an explicit encoding."""
    decoded_default = bytes_to_str(b"foo")
    assert decoded_default == "foo"

    # This char is the same in bytes or latin1.
    decoded_latin1 = bytes_to_str(b"\xc1", "latin1")
    assert decoded_latin1 == "\xc1"
+90
View File
@@ -0,0 +1,90 @@
[tox]
requires =
tox>=4
envlist =
docs
security
flake8
py{38,39,310,311,312}-scrapy{26,27,28,29,210,211}-redis{42,43,44,45,46,50}
minversion = 3.0.0
[base]
deps =
-r requirements-tests.txt
-r requirements.txt
setuptools
[testenv]
basepython =
py38: python3.8
py39: python3.9
py310: python3.10
py311: python3.11
py312: python3.12
deps =
{[base]deps}
scrapy26: scrapy~=2.6.0
scrapy27: scrapy~=2.7.0
scrapy28: scrapy~=2.8.0
scrapy29: scrapy~=2.9.0
scrapy210: scrapy~=2.10.0
scrapy211: scrapy~=2.11.0
redis42: redis~=4.2.0
redis43: redis~=4.3.0
redis44: redis~=4.4.0
redis45: redis~=4.5.0
redis46: redis~=4.6.0
redis50: redis~=5.0.0
passenv =
REDIS_HOST
REDIS_PORT
commands =
python -m pytest # --cov-report term --cov=scrapy_redis
[testenv:flake8]
basepython =
python3.12
deps =
{[base]deps}
commands =
flake8 --ignore=W503,E265,E731 docs src tests
[testenv:security]
basepython =
python3.12
deps =
bandit~=1.7.3
commands =
bandit -r -c .bandit.yml src/ tests/
[testenv:pytest]
basepython =
python3.12
deps =
{[testenv]deps}
passenv =
REDIS_HOST
REDIS_PORT
commands =
python -m pytest --cov-report term --cov=scrapy_redis
[testenv:build]
basepython =
python3.12
deps =
{[base]deps}
build
commands =
python -m build
[testenv:docs]
basepython =
python3.12
deps =
{[base]deps}
-r docs/requirements.txt
allowlist_externals =
make
commands =
# Same command as readthedocs
make -C docs html SPHINXOPTS="-T -W --keep-going -D language=en"