变更
This commit is contained in:
@@ -0,0 +1,18 @@
|
||||
skips:
|
||||
- B101
|
||||
- B105
|
||||
- B301
|
||||
- B303
|
||||
- B306
|
||||
- B307
|
||||
- B311
|
||||
- B320
|
||||
- B321
|
||||
- B324
|
||||
- B403
|
||||
- B404
|
||||
- B406
|
||||
- B410
|
||||
- B503
|
||||
- B603
|
||||
- B605
|
||||
@@ -0,0 +1,35 @@
|
||||
[bumpversion]
|
||||
current_version = 0.9.1
|
||||
commit = False
|
||||
tag = False
|
||||
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>\w+))?
|
||||
serialize =
|
||||
{major}.{minor}.{patch}-{release}
|
||||
{major}.{minor}.{patch}
|
||||
|
||||
[bumpversion:part:release]
|
||||
optional_value = placeholder
|
||||
values =
|
||||
a1
|
||||
b1
|
||||
rc1
|
||||
placeholder
|
||||
|
||||
[bumpversion:file:VERSION]
|
||||
search = {current_version}
|
||||
replace = {new_version}
|
||||
|
||||
[bumpversion:file:src/scrapy_redis/__init__.py]
|
||||
search = __version__ = "{current_version}"
|
||||
replace = __version__ = "{new_version}"
|
||||
|
||||
[bumpversion:file:.cookiecutterrc]
|
||||
search = version: {current_version}
|
||||
replace = version: {new_version}
|
||||
|
||||
[bumpversion:file:HISTORY.rst]
|
||||
search = .. bumpversion marker
|
||||
replace = .. bumpversion marker
|
||||
|
||||
{new_version} ({now:%Y-%m-%d})
|
||||
------------------
|
||||
@@ -0,0 +1,19 @@
|
||||
# Generated by cookiepatcher, a small shim around cookiecutter (pip install cookiepatcher)
|
||||
|
||||
cookiecutter:
|
||||
email: rolando at rmax.io
|
||||
full_name: Rolando Espinoza
|
||||
github_username: rolando
|
||||
project_name: Scrapy-Redis
|
||||
project_package: scrapy_redis
|
||||
project_short_description: Redis-based components for Scrapy.
|
||||
project_slug: scrapy-redis
|
||||
pypi_username: rolando
|
||||
use_codecov: y
|
||||
use_cython: n
|
||||
use_landscape: y
|
||||
use_pypi_deployment_with_travis: n
|
||||
use_pytest: y
|
||||
use_requiresio: y
|
||||
version: 0.9.1
|
||||
year: 2011-2022
|
||||
@@ -0,0 +1,25 @@
|
||||
[paths]
|
||||
source =
|
||||
src
|
||||
|
||||
[run]
|
||||
omit = setup.py
|
||||
branch = true
|
||||
source =
|
||||
scrapy_redis
|
||||
tests
|
||||
parallel = true
|
||||
|
||||
[report]
|
||||
show_missing = true
|
||||
precision = 2
|
||||
omit = */__init__.py
|
||||
exclude_lines =
|
||||
pragma: no cover
|
||||
def __repr__
|
||||
if self.debug:
|
||||
if settings.DEBUG
|
||||
raise AssertionError
|
||||
raise NotImplementedError
|
||||
if 0:
|
||||
if __name__ == .__main__.:
|
||||
@@ -0,0 +1,46 @@
|
||||
*.py[cod]
|
||||
*.swp
|
||||
*~
|
||||
|
||||
.ropeproject
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Packages
|
||||
*.egg
|
||||
*.egg-info
|
||||
dist
|
||||
build
|
||||
eggs
|
||||
parts
|
||||
bin
|
||||
var
|
||||
sdist
|
||||
develop-eggs
|
||||
.installed.cfg
|
||||
lib
|
||||
lib64
|
||||
__pycache__
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
.coverage
|
||||
.tox
|
||||
nosetests.xml
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
|
||||
# Mr Developer
|
||||
.mr.developer.cfg
|
||||
.project
|
||||
.pydevproject
|
||||
|
||||
# JetBrains PyCharm IDE
|
||||
/.idea/
|
||||
|
||||
.venv
|
||||
.tags
|
||||
@@ -0,0 +1,21 @@
|
||||
# http://editorconfig.org
|
||||
|
||||
root = true
|
||||
|
||||
[*]
|
||||
indent_style = space
|
||||
indent_size = 4
|
||||
trim_trailing_whitespace = true
|
||||
insert_final_newline = true
|
||||
charset = utf-8
|
||||
end_of_line = lf
|
||||
|
||||
[*.bat]
|
||||
indent_style = tab
|
||||
end_of_line = crlf
|
||||
|
||||
[LICENSE]
|
||||
insert_final_newline = false
|
||||
|
||||
[Makefile]
|
||||
indent_style = tab
|
||||
@@ -0,0 +1,12 @@
|
||||
|
||||
[flake8]
|
||||
|
||||
max-line-length = 119
|
||||
ignore =
|
||||
W503
|
||||
P102
|
||||
P103
|
||||
|
||||
exclude =
|
||||
tests/test_spiders.py E731
|
||||
docs/conf.py E265
|
||||
@@ -0,0 +1,3 @@
|
||||
# GitHub syntax highlighting
|
||||
pixi.lock linguist-language=YAML
|
||||
|
||||
@@ -0,0 +1,11 @@
|
||||
# Description
|
||||
|
||||
Please describe your problem/feature request/bug
|
||||
|
||||
# Steps to Reproduce
|
||||
|
||||
Please provide the steps to reproduce your problem/bug
|
||||
|
||||
# Error log
|
||||
|
||||
Please provide the error message or a screenshot for better understanding.
|
||||
@@ -0,0 +1,25 @@
|
||||
# Description
|
||||
|
||||
Please include a summary of the changes and the related issue. Please also include relevant motivation and context. List any dependencies that are required for this change.
|
||||
|
||||
Fixes #(issue)
|
||||
|
||||
# How Has This Been Tested?
|
||||
|
||||
Please describe the tests that you ran to verify your changes. Provide instructions so we can reproduce. Please also list any relevant details for your test configuration
|
||||
- [ ] pytest
|
||||
- [ ] Other test (please specify)
|
||||
|
||||
# Test Configuration:
|
||||
- OS version:
|
||||
- Necessary Libraries (optional):
|
||||
|
||||
# Checklist:
|
||||
- [ ] My code follows the style guidelines of this project
|
||||
- [ ] I have performed a self-review of my code
|
||||
- [ ] I have commented my code, particularly in hard-to-understand areas
|
||||
- [ ] I have made corresponding changes to the documentation
|
||||
- [ ] My changes generate no new warnings
|
||||
- [ ] I have added tests that prove my fix is effective or that my feature works
|
||||
- [ ] New and existing unit tests pass locally with my changes
|
||||
- [ ] Any dependent changes have been merged and published in downstream modules
|
||||
@@ -0,0 +1,31 @@
|
||||
# This is GitHub Action for cross platform building
|
||||
name: build
|
||||
on:
|
||||
push:
|
||||
branches: [master]
|
||||
pull_request:
|
||||
branches: [master]
|
||||
|
||||
jobs:
|
||||
builds:
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
os: [ubuntu-latest, macos-latest, windows-latest]
|
||||
python-version: ["3.12"]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
|
||||
- name: Run build
|
||||
env:
|
||||
TOXENV: build
|
||||
run: |
|
||||
pip install -r requirements-tests.txt
|
||||
tox
|
||||
@@ -0,0 +1,41 @@
|
||||
# This is GitHub Action for linting and security check
|
||||
name: check
|
||||
on:
|
||||
push:
|
||||
branches: [master]
|
||||
pull_request:
|
||||
branches: [master]
|
||||
|
||||
concurrency:
|
||||
group: ${{github.workflow}}-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
checks:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
python-version: ["3.12"]
|
||||
env: [security, flake8]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
|
||||
- name: Run check
|
||||
env:
|
||||
TOXENV: ${{ matrix.env }}
|
||||
run: |
|
||||
pip install -r requirements-tests.txt
|
||||
tox
|
||||
|
||||
pre-commit:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: pre-commit/action@v3.0.0
|
||||
@@ -0,0 +1,30 @@
|
||||
# This is a GitHub Action for building the documentation
|
||||
name: docs
|
||||
on:
|
||||
push:
|
||||
branches: [master]
|
||||
pull_request:
|
||||
branches: [master]
|
||||
|
||||
jobs:
|
||||
builds:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
python-version: ["3.12"]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
|
||||
- name: Build docs
|
||||
env:
|
||||
TOXENV: docs
|
||||
run: |
|
||||
pip install -r requirements-tests.txt
|
||||
tox
|
||||
@@ -0,0 +1,43 @@
|
||||
# This is GitHub Action for tests
|
||||
name: test
|
||||
on:
|
||||
push:
|
||||
branches: [master]
|
||||
pull_request:
|
||||
branches: [master]
|
||||
|
||||
jobs:
|
||||
tests:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
python-version: ["3.12"]
|
||||
|
||||
services:
|
||||
redis:
|
||||
image: redis
|
||||
options: >-
|
||||
--health-cmd "redis-cli ping"
|
||||
--health-interval 10s
|
||||
--health-timeout 5s
|
||||
--health-retries 5
|
||||
|
||||
container: python:${{ matrix.python-version }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
|
||||
- name: Run pytest
|
||||
env:
|
||||
REDIS_HOST: redis
|
||||
TOXENV: pytest
|
||||
TOX_TESTENV_PASSENV: REDIS_HOST
|
||||
run: |
|
||||
pip install -r requirements-tests.txt
|
||||
tox
|
||||
@@ -0,0 +1,67 @@
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
.venv
|
||||
env/
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*,cover
|
||||
.hypothesis/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
target/
|
||||
|
||||
# rope-vim
|
||||
.ropeproject
|
||||
|
||||
# Extra
|
||||
.DS_Store
|
||||
.vscode
|
||||
@@ -0,0 +1,2 @@
|
||||
[settings]
|
||||
profile = black
|
||||
@@ -0,0 +1,36 @@
|
||||
repos:
|
||||
- repo: https://github.com/PyCQA/bandit
|
||||
rev: 1.7.7
|
||||
hooks:
|
||||
- id: bandit
|
||||
args: [-r, -c, .bandit.yml]
|
||||
- repo: https://github.com/PyCQA/flake8
|
||||
rev: 7.0.0
|
||||
hooks:
|
||||
- id: flake8
|
||||
additional_dependencies:
|
||||
- flake8-bugbear
|
||||
- flake8-comprehensions
|
||||
- flake8-debugger
|
||||
#- flake8-docstrings
|
||||
- flake8-string-format
|
||||
- flake8-type-checking
|
||||
- repo: https://github.com/psf/black.git
|
||||
rev: 24.2.0
|
||||
hooks:
|
||||
- id: black
|
||||
- repo: https://github.com/pycqa/isort
|
||||
rev: 5.13.2
|
||||
hooks:
|
||||
- id: isort
|
||||
- repo: https://github.com/adamchainz/blacken-docs
|
||||
rev: 1.16.0
|
||||
hooks:
|
||||
- id: blacken-docs
|
||||
additional_dependencies:
|
||||
- black==24.2.0
|
||||
- repo: https://github.com/asottile/pyupgrade
|
||||
rev: v3.15.2
|
||||
hooks:
|
||||
- id: pyupgrade
|
||||
args: [--py38-plus, --keep-runtime-typing]
|
||||
@@ -0,0 +1 @@
|
||||
3.10.13
|
||||
@@ -0,0 +1,17 @@
|
||||
version: 2
|
||||
formats: all
|
||||
sphinx:
|
||||
configuration: docs/conf.py
|
||||
fail_on_warning: true
|
||||
|
||||
build:
|
||||
os: ubuntu-22.04
|
||||
tools:
|
||||
# For available versions, see:
|
||||
# https://docs.readthedocs.io/en/stable/config-file/v2.html#build-tools-python
|
||||
python: "3.12"
|
||||
|
||||
python:
|
||||
install:
|
||||
- requirements: docs/requirements.txt
|
||||
- path: .
|
||||
@@ -0,0 +1,68 @@
|
||||
language: python
|
||||
python: 3.5
|
||||
sudo: false
|
||||
|
||||
services:
|
||||
- redis-server
|
||||
|
||||
env:
|
||||
- TOXENV=py27-scrapyrel
|
||||
- TOXENV=py34-scrapyrel
|
||||
- TOXENV=py35-scrapyrel
|
||||
|
||||
matrix:
|
||||
fast_finish: true
|
||||
|
||||
before_install:
|
||||
- python --version
|
||||
- uname -a
|
||||
- lsb_release -a
|
||||
|
||||
# command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
|
||||
install:
|
||||
- pip install -U pip wheel
|
||||
- pip install -U tox twine coverage
|
||||
- virtualenv --version
|
||||
- pip --version
|
||||
- tox --version
|
||||
|
||||
# command to run tests, e.g. python setup.py test
|
||||
script:
|
||||
- tox -e $TOXENV --workdir $HOME/.tox
|
||||
|
||||
after_success:
|
||||
# Codecov requires a single .coverage and will run 'coverage xml' to
|
||||
# generate the report.
|
||||
- coverage combine
|
||||
- bash <(curl -s https://codecov.io/bash)
|
||||
|
||||
after_failure:
|
||||
- more $HOME/.tox/log/* | cat
|
||||
- more $HOME/.tox/*/log/* | cat
|
||||
|
||||
before_cache:
|
||||
- rm -fr $HOME/.cache/pip/log
|
||||
- rm -fr $HOME/.tox/log/*
|
||||
- rm -fr $HOME/.tox/*/log/*
|
||||
|
||||
cache:
|
||||
directories:
|
||||
- $HOME/.cache/pip
|
||||
- $HOME/.tox/
|
||||
|
||||
notifications:
|
||||
email:
|
||||
on_success: never
|
||||
on_failure: always
|
||||
|
||||
deploy:
|
||||
provider: pypi
|
||||
distributions: "sdist bdist_wheel"
|
||||
user: darkrho
|
||||
password:
|
||||
secure: "Pgcj+Otx9o2MxOuXibvz9LUd5DqlW0jaKDScVOAcFT+//U0esjRqY08bRFQlrSTXokJa6X/dVZlb2mQE8L4vr7mLFspRGO4FByK34L089/ETwsLKI2rks2zVbmPSyweL3sz88EXLKmYs7WsKtCnET67qra6hreKbO67ALAh5WWk="
|
||||
on:
|
||||
tags: true
|
||||
all_branches: true
|
||||
repo: rolando/scrapy-redis
|
||||
condition: "$TOXENV == py35-scrapyrel"
|
||||
@@ -0,0 +1,13 @@
|
||||
=======
|
||||
Credits
|
||||
=======
|
||||
|
||||
Development Lead
|
||||
----------------
|
||||
|
||||
* R Max Espinoza <hey at rmax.dev>
|
||||
|
||||
Contributors
|
||||
------------
|
||||
|
||||
None yet. Why not be the first?
|
||||
@@ -0,0 +1,138 @@
|
||||
.. highlight:: shell
|
||||
|
||||
============
|
||||
Contribution
|
||||
============
|
||||
|
||||
Contributions are welcome, and they are greatly appreciated! Every
|
||||
little bit helps, and credit will always be given.
|
||||
|
||||
You can contribute in many ways:
|
||||
|
||||
Types of Contributions
|
||||
----------------------
|
||||
|
||||
New here
|
||||
~~~~~~~~~~~
|
||||
|
||||
Any issue with the "good first issue" tag on it is a great place to start! Feel free to ask any questions there.
|
||||
|
||||
Don't know how to start
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Reviewing the codebase and open PRs is a good way to learn what's going on here!
|
||||
|
||||
Report Bugs
|
||||
~~~~~~~~~~~
|
||||
|
||||
Report bugs at https://github.com/rmax/scrapy-redis/issues.
|
||||
|
||||
If you are reporting a bug, please include:
|
||||
|
||||
* Your operating system name and version.
|
||||
* Any details about your local setup that might be helpful in troubleshooting.
|
||||
* Detailed steps to reproduce the bug.
|
||||
|
||||
Fix Bugs
|
||||
~~~~~~~~
|
||||
|
||||
Look through the GitHub issues for bugs. Anything tagged with "bug"
|
||||
is open to whoever wants to implement it.
|
||||
|
||||
Implement Features & improvements
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Look through the GitHub issues for features. Anything tagged with "feature" or "improvements"
|
||||
is open to whoever wants to implement it.
|
||||
|
||||
Write Documentation
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Scrapy-Redis could always use more documentation, whether as part of the
|
||||
official Scrapy-Redis docs, in docstrings, or even on the web in blog posts,
|
||||
articles, and such.
|
||||
|
||||
Submit Feedback
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
The best way to send feedback is to file an issue at https://github.com/rmax/scrapy-redis/issues.
|
||||
|
||||
If you are proposing a feature:
|
||||
|
||||
* Explain in detail how it would work.
|
||||
* Keep the scope as narrow as possible, to make it easier to implement.
|
||||
* Remember that this is a volunteer-driven project, and that contributions
|
||||
are welcome :)
|
||||
|
||||
Get Started!
|
||||
------------
|
||||
|
||||
Ready to contribute? Here's how to set up `scrapy-redis` for local development.
|
||||
|
||||
Setup environment
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
||||
1. Fork the `scrapy-redis` repo on GitHub.
|
||||
2. Clone your fork locally::
|
||||
|
||||
git clone git@github.com:your_name_here/scrapy-redis.git
|
||||
|
||||
3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development::
|
||||
|
||||
pip install virtualenv==20.0.23
|
||||
virtualenv --python=/usr/bin/python3 ~/scrapy_redis
|
||||
source ~/scrapy_redis/bin/activate
|
||||
cd scrapy-redis/
|
||||
pip install -r requirements-install.txt
|
||||
pip install .
|
||||
|
||||
4. Create a branch for local development::
|
||||
|
||||
git checkout -b name-of-your-bugfix-or-feature
|
||||
|
||||
Now you can make your changes locally.
|
||||
|
||||
Setup testing environment
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
1. When you're done making changes, check that your changes pass flake8 and the tests, including testing other Python versions with tox::
|
||||
|
||||
pip install -r requirements-tests.txt
|
||||
flake8 src/ tests/
|
||||
python -m pytest --ignore=setup.py
|
||||
tox
|
||||
|
||||
2. If the error `No module named scrapy_redis` shows up, make sure `scrapy-redis` from your branch is installed by running::
|
||||
|
||||
pip install .
|
||||
|
||||
3. Or change the import lines::
|
||||
|
||||
from scrapy_redis import xxx # from this
|
||||
from src.scrapy_redis import xxx # to this
|
||||
|
||||
4. Commit your changes and push your branch to GitHub::
|
||||
|
||||
git add .
|
||||
git commit -m "Your detailed description of your changes."
|
||||
git push origin name-of-your-bugfix-or-feature
|
||||
|
||||
5. Submit a pull request through the GitHub website.
|
||||
|
||||
Pull Request Guidelines
|
||||
-----------------------
|
||||
|
||||
Before you submit a pull request, check that it meets these guidelines:
|
||||
|
||||
1. The pull request should include tests.
|
||||
2. If the pull request adds functionality, the docs should be updated. Put
|
||||
your new functionality into a function with a docstring, and add the
|
||||
feature to the list in README.rst.
|
||||
3. Make sure that the tests pass for all supported Python versions.
|
||||
|
||||
Tips
|
||||
----
|
||||
|
||||
To run a subset of tests::
|
||||
|
||||
pytest tests/test_scrapy_redis
|
||||
@@ -0,0 +1,16 @@
|
||||
FROM python:3.11-slim
|
||||
|
||||
# Set working directory
|
||||
WORKDIR /app
|
||||
|
||||
# Install tox and dependencies from the requirements files
|
||||
COPY requirements.txt .
|
||||
COPY requirements-tests.txt .
|
||||
RUN pip install -r requirements.txt -r requirements-tests.txt
|
||||
|
||||
# Copy your project code
|
||||
COPY . .
|
||||
|
||||
# Run Tox tests
|
||||
CMD ["tox"]
|
||||
|
||||
@@ -0,0 +1,136 @@
|
||||
=======
|
||||
History
|
||||
=======
|
||||
|
||||
.. bumpversion marker
|
||||
|
||||
0.9.1 (2024-07-06)
|
||||
------------------
|
||||
* Fixed docs build.
|
||||
|
||||
0.9.0 (2024-07-06)
|
||||
------------------
|
||||
* Fixed ``Scheduler`` not compatible with BaseDupeFilter (#294)
|
||||
* Added precommit hooks.
|
||||
* Switched to Python 3.12 as default build version.
|
||||
|
||||
0.8.0 (2024-07-03)
|
||||
------------------
|
||||
* Fixed request fingerprint method.
|
||||
* Fixed support for Scrapy 2.6+.
|
||||
* Fixed tox tests and github workflow.
|
||||
* Deprecated ``REDIS_START_URLS_BATCH_SIZE``.
|
||||
|
||||
0.7.3 (2022-07-21)
|
||||
------------------
|
||||
* Move docs to GitHub Wiki
|
||||
* Update tox and support dynamic tests
|
||||
* Update support for json data
|
||||
* Refactor max idle time
|
||||
* Add support for python3.7~python3.10
|
||||
* Deprecate python2.x support
|
||||
|
||||
0.7.2 (2021-12-27)
|
||||
------------------
|
||||
* Fix RedisStatsCollector._get_key()
|
||||
* Fix redis-py dependency version
|
||||
* Added maximum idle waiting time MAX_IDLE_TIME_BEFORE_CLOSE
|
||||
|
||||
0.7.1 (2021-03-27)
|
||||
------------------
|
||||
* Fixes datetime parse error for redis-py 3.x.
|
||||
* Add support for stats extensions.
|
||||
|
||||
0.7.1-rc1 (2021-03-27)
|
||||
----------------------
|
||||
* Fixes datetime parse error for redis-py 3.x.
|
||||
|
||||
0.7.1-b1 (2021-03-22)
|
||||
---------------------
|
||||
* Add support for stats extensions.
|
||||
|
||||
0.7.0-dev (unreleased)
|
||||
----------------------
|
||||
* Unreleased.
|
||||
|
||||
0.6.8 (2017-02-14)
|
||||
------------------
|
||||
* Fixed automated release due to not matching registered email.
|
||||
|
||||
0.6.7 (2016-12-27)
|
||||
------------------
|
||||
* Fixes bad formatting in logging message.
|
||||
|
||||
0.6.6 (2016-12-20)
|
||||
------------------
|
||||
* Fixes wrong message on dupefilter duplicates.
|
||||
|
||||
0.6.5 (2016-12-19)
|
||||
------------------
|
||||
* Fixed typo in default settings.
|
||||
|
||||
0.6.4 (2016-12-18)
|
||||
------------------
|
||||
* Fixed data decoding in Python 3.x.
|
||||
* Added ``REDIS_ENCODING`` setting (default ``utf-8``).
|
||||
* Default to ``CONCURRENT_REQUESTS`` value for ``REDIS_START_URLS_BATCH_SIZE``.
|
||||
* Renamed queue classes to a proper naming convention (backwards compatible).
|
||||
|
||||
0.6.3 (2016-07-03)
|
||||
------------------
|
||||
* Added ``REDIS_START_URLS_KEY`` setting.
|
||||
* Fixed spider method ``from_crawler`` signature.
|
||||
|
||||
0.6.2 (2016-06-26)
|
||||
------------------
|
||||
* Support ``redis_cls`` parameter in ``REDIS_PARAMS`` setting.
|
||||
* Python 3.x compatibility fixed.
|
||||
* Added ``SCHEDULER_SERIALIZER`` setting.
|
||||
|
||||
0.6.1 (2016-06-25)
|
||||
------------------
|
||||
* **Backwards incompatible change:** Require explicit ``DUPEFILTER_CLASS``
|
||||
setting.
|
||||
* Added ``SCHEDULER_FLUSH_ON_START`` setting.
|
||||
* Added ``REDIS_START_URLS_AS_SET`` setting.
|
||||
* Added ``REDIS_ITEMS_KEY`` setting.
|
||||
* Added ``REDIS_ITEMS_SERIALIZER`` setting.
|
||||
* Added ``REDIS_PARAMS`` setting.
|
||||
* Added ``REDIS_START_URLS_BATCH_SIZE`` spider attribute to read start urls
|
||||
in batches.
|
||||
* Added ``RedisCrawlSpider``.
|
||||
|
||||
0.6.0 (2015-07-05)
|
||||
------------------
|
||||
* Updated code to be compatible with Scrapy 1.0.
|
||||
* Added `-a domain=...` option for example spiders.
|
||||
|
||||
0.5.0 (2013-09-02)
|
||||
------------------
|
||||
* Added `REDIS_URL` setting to support Redis connection string.
|
||||
* Added `SCHEDULER_IDLE_BEFORE_CLOSE` setting to prevent the spider closing too
|
||||
quickly when the queue is empty. Default value is zero keeping the previous
|
||||
behavior.
|
||||
* Schedule preemptively requests on item scraped.
|
||||
* This version is the latest release compatible with Scrapy 0.24.x.
|
||||
|
||||
0.4.0 (2013-04-19)
|
||||
------------------
|
||||
* Added `RedisSpider` and `RedisMixin` classes as building blocks for spiders
|
||||
to be fed through a redis queue.
|
||||
* Added redis queue stats.
|
||||
* Let the encoder handle the item as it comes instead of converting it to a dict.
|
||||
|
||||
0.3.0 (2013-02-18)
|
||||
------------------
|
||||
* Added support for different queue classes.
|
||||
* Changed requests serialization from `marshal` to `cPickle`.
|
||||
|
||||
0.2.0 (2013-02-17)
|
||||
------------------
|
||||
* Improved backward compatibility.
|
||||
* Added example project.
|
||||
|
||||
0.1.0 (2011-09-01)
|
||||
------------------
|
||||
* First release on PyPI.
|
||||
@@ -0,0 +1,19 @@
|
||||
Copyright (c) 2011-2024, R Max Espinoza
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
this software and associated documentation files (the "Software"), to deal in
|
||||
the Software without restriction, including without limitation the rights to
|
||||
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
of the Software, and to permit persons to whom the Software is furnished to do
|
||||
so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
@@ -0,0 +1,16 @@
|
||||
graft docs
|
||||
graft src
|
||||
graft tests
|
||||
graft example-project
|
||||
|
||||
include *.in
|
||||
include *.ini
|
||||
include *.rst
|
||||
include *.txt
|
||||
|
||||
include LICENSE
|
||||
include VERSION
|
||||
include Makefile
|
||||
|
||||
global-exclude __pycache__ *.py[cod]
|
||||
global-exclude *.so *.dylib
|
||||
@@ -0,0 +1,156 @@
|
||||
.PHONY: clean-so clean-test clean-pyc clean-build clean-docs clean
|
||||
.PHONY: docs check check-manifest check-setup check-history lint
|
||||
.PHONY: test test-all coverage
|
||||
.PHONY: compile-reqs install-reqs
|
||||
.PHONY: release dist install build-inplace
|
||||
define BROWSER_PYSCRIPT
|
||||
import os, webbrowser, sys
|
||||
FAIL = "\033[91m"
|
||||
ENDC = "\033[0m"
|
||||
|
||||
try:
|
||||
from urllib.request import pathname2url
|
||||
except:
|
||||
print(FAIL + "Python2 is deprecated, please upgrade your python >= 3.7" + ENDC)
|
||||
|
||||
webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1])))
|
||||
endef
|
||||
export BROWSER_PYSCRIPT
|
||||
BROWSER := python -c "$$BROWSER_PYSCRIPT"
|
||||
|
||||
SPHINX_BUILD := html
|
||||
|
||||
help:
|
||||
@echo "check - check setup, code style, setup, etc"
|
||||
@echo "check-manifest - check manifest"
|
||||
@echo "check-setup - check setup"
|
||||
@echo "check-history - check history"
|
||||
@echo "clean - remove all build, test, coverage and Python artifacts"
|
||||
@echo "clean-build - remove build artifacts"
|
||||
@echo "clean-docs - remove docs artifacts"
|
||||
@echo "clean-pyc - remove Python file artifacts"
|
||||
@echo "clean-test - remove test and coverage artifacts"
|
||||
@echo "clean-so - remove compiled extensions"
|
||||
@echo "lint - check style with flake8"
|
||||
@echo "test - run tests quickly with the default Python"
|
||||
@echo "test-all - run tests on every Python version with tox"
|
||||
@echo "coverage - check code coverage quickly with the default Python"
|
||||
@echo "compile-reqs - compile requirements"
|
||||
@echo "install-reqs - install requirements"
|
||||
@echo "docs - generate Sphinx HTML documentation, including API docs"
|
||||
@echo "dist-upload - package and upload a release"
|
||||
@echo "release - bump release and push changes"
|
||||
@echo "dist - package"
|
||||
@echo "develop - install package in develop mode"
|
||||
@echo "install - install the package to the active Python's site-packages"
|
||||
|
||||
check: check-setup check-manifest check-history lint
|
||||
|
||||
check-setup:
|
||||
@echo "Checking package metadata (name, description, etc)"
|
||||
python setup.py check --strict --metadata --restructuredtext
|
||||
|
||||
check-manifest:
|
||||
@echo "Checking MANIFEST.in"
|
||||
check-manifest --ignore ".*"
|
||||
|
||||
check-history:
|
||||
@echo "Checking latest version in HISTORY"
|
||||
VERSION=`cat VERSION`; grep "^$${VERSION}\b" HISTORY.rst
|
||||
|
||||
clean: clean-build clean-docs clean-pyc clean-test clean-so
|
||||
|
||||
clean-build:
|
||||
rm -fr build/
|
||||
rm -fr dist/
|
||||
rm -fr .eggs/
|
||||
find . -name '*.egg-info' -exec rm -fr {} +
|
||||
find . -name '*.egg' -exec rm -f {} +
|
||||
|
||||
clean-docs:
|
||||
$(MAKE) -C docs clean
|
||||
|
||||
clean-pyc:
|
||||
find . -name '*.pyc' -exec rm -f {} +
|
||||
find . -name '*.pyo' -exec rm -f {} +
|
||||
find . -name '*~' -exec rm -f {} +
|
||||
find . -name '__pycache__' -exec rm -fr {} +
|
||||
|
||||
clean-test:
|
||||
rm -fr .tox/
|
||||
rm -f .coverage
|
||||
rm -fr htmlcov/
|
||||
|
||||
clean-so:
|
||||
find . -name '*.so' -exec rm -f {} +
|
||||
|
||||
lint:
|
||||
flake8 src tests
|
||||
|
||||
build-inplace:
|
||||
python setup.py build_ext --inplace
|
||||
|
||||
develop: clean
|
||||
pip install -e .
|
||||
|
||||
test: develop
|
||||
pytest --ignore=setup.py
|
||||
|
||||
test-all:
|
||||
tox -v
|
||||
|
||||
coverage: develop
|
||||
coverage run -m pytest --ignore=setup.py
|
||||
coverage combine
|
||||
coverage report
|
||||
coverage html
|
||||
$(BROWSER) htmlcov/index.html
|
||||
|
||||
docs-build: develop
|
||||
rm -f docs/scrapy_redis.rst
|
||||
rm -f docs/modules.rst
|
||||
sphinx-apidoc -o docs/ src/scrapy_redis
|
||||
$(MAKE) -C docs clean
|
||||
$(MAKE) -C docs $(SPHINX_BUILD)
|
||||
|
||||
docs: docs-build
|
||||
$(BROWSER) docs/_build/$(SPHINX_BUILD)/index.html
|
||||
|
||||
servedocs: docs
|
||||
watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D .
|
||||
|
||||
release:
|
||||
@echo "To do a release, follow the steps:"
|
||||
@echo "- bumpversion release"
|
||||
@echo "- Review and commit"
|
||||
@echo "- git tag -a \`cat VERSION\`"
|
||||
@echo "- git push --follow-tags"
|
||||
|
||||
dist-upload: clean check dist
|
||||
twine upload dist/*
|
||||
|
||||
dist: clean
|
||||
python setup.py sdist
|
||||
python setup.py bdist_wheel
|
||||
ls -l dist
|
||||
|
||||
install: clean
|
||||
pip install .
|
||||
|
||||
REQUIREMENTS_IN := $(wildcard requirements*.in)
|
||||
.PHONY: $(REQUIREMENTS_IN)
|
||||
|
||||
requirements%.txt: requirements%.in
|
||||
pip-compile -v $< -o $@
|
||||
|
||||
REQUIREMENTS_TXT := $(REQUIREMENTS_IN:.in=.txt)
|
||||
ifndef REQUIREMENTS_TXT
|
||||
REQUIREMENTS_TXT := $(wildcard requirements*.txt)
|
||||
endif
|
||||
|
||||
compile-reqs: $(REQUIREMENTS_TXT)
|
||||
@test -z "$$REQUIREMENTS_TXT" && echo "No 'requirements*.in' files. Nothing to do"
|
||||
|
||||
install-reqs:
|
||||
@test -z "$$REQUIREMENTS_TXT" && echo "No 'requirements*.txt' files. Nothing to do"
|
||||
$(foreach req,$(REQUIREMENTS_TXT),pip install -r $(req);)
|
||||
@@ -0,0 +1,110 @@
|
||||
============
|
||||
Scrapy-Redis
|
||||
============
|
||||
|
||||
.. image:: https://readthedocs.org/projects/scrapy-redis/badge/?version=latest
|
||||
:alt: Documentation Status
|
||||
:target: https://readthedocs.org/projects/scrapy-redis/?badge=latest
|
||||
|
||||
.. image:: https://img.shields.io/pypi/v/scrapy-redis.svg
|
||||
:target: https://pypi.python.org/pypi/scrapy-redis
|
||||
|
||||
.. image:: https://img.shields.io/pypi/pyversions/scrapy-redis.svg
|
||||
:target: https://pypi.python.org/pypi/scrapy-redis
|
||||
|
||||
.. image:: https://github.com/rmax/scrapy-redis/actions/workflows/builds.yml/badge.svg
|
||||
:target: https://github.com/rmax/scrapy-redis/actions/workflows/builds.yml
|
||||
|
||||
.. image:: https://github.com/rmax/scrapy-redis/actions/workflows/checks.yml/badge.svg
|
||||
:target: https://github.com/rmax/scrapy-redis/actions/workflows/checks.yml
|
||||
|
||||
.. image:: https://github.com/rmax/scrapy-redis/actions/workflows/tests.yml/badge.svg
|
||||
:target: https://github.com/rmax/scrapy-redis/actions/workflows/tests.yml
|
||||
|
||||
.. image:: https://codecov.io/github/rmax/scrapy-redis/coverage.svg?branch=master
|
||||
:alt: Coverage Status
|
||||
:target: https://codecov.io/github/rmax/scrapy-redis
|
||||
|
||||
.. image:: https://img.shields.io/badge/security-bandit-green.svg
|
||||
:alt: Security Status
|
||||
:target: https://github.com/rmax/scrapy-redis
|
||||
|
||||
Redis-based components for Scrapy.
|
||||
|
||||
* Usage: https://github.com/rmax/scrapy-redis/wiki/Usage
|
||||
* Documentation: https://github.com/rmax/scrapy-redis/wiki.
|
||||
* Release: https://github.com/rmax/scrapy-redis/wiki/History
|
||||
* Contribution: https://github.com/rmax/scrapy-redis/wiki/Getting-Started
|
||||
* LICENSE: MIT license
|
||||
|
||||
Features
|
||||
--------
|
||||
|
||||
* Distributed crawling/scraping
|
||||
|
||||
You can start multiple spider instances that share a single redis queue.
|
||||
Best suited for broad multi-domain crawls.
|
||||
|
||||
* Distributed post-processing
|
||||
|
||||
Scraped items get pushed into a redis queue, meaning that you can start as
|
||||
many as needed post-processing processes sharing the items queue.
|
||||
|
||||
* Scrapy plug-and-play components
|
||||
|
||||
Scheduler + Duplication Filter, Item Pipeline, Base Spiders.
|
||||
|
||||
* In this forked version: added support for ``json`` data in Redis
|
||||
|
||||
The data contains ``url``, ``meta`` and other optional parameters. ``meta`` is a nested json which contains sub-data.
|
||||
This function extracts this data and sends another FormRequest with ``url``, ``meta`` and additional ``formdata``.
|
||||
|
||||
For example:
|
||||
|
||||
.. code-block:: json
|
||||
|
||||
{ "url": "https://example.com", "meta": {"job-id":"123xsd", "start-date":"dd/mm/yy"}, "url_cookie_key":"fertxsas" }
|
||||
|
||||
This data can be accessed in a `scrapy spider` through the response.
|
||||
like: `request.url`, `request.meta`, `request.cookies`
|
||||
|
||||
.. note:: These features cover the basic case of distributing the workload across multiple workers. If you need more features like URL expiration, advanced URL prioritization, etc., we suggest you take a look at the Frontera_ project.
|
||||
|
||||
Requirements
|
||||
------------
|
||||
|
||||
* Python 3.7+
|
||||
* Redis >= 5.0
|
||||
* ``Scrapy`` >= 2.0
|
||||
* ``redis-py`` >= 4.0
|
||||
|
||||
Installation
|
||||
------------
|
||||
|
||||
From pip
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip install scrapy-redis
|
||||
|
||||
From GitHub
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
git clone https://github.com/darkrho/scrapy-redis.git
|
||||
cd scrapy-redis
|
||||
python setup.py install
|
||||
|
||||
.. note:: To use this json data feature, please make sure you have not installed scrapy-redis through pip. If you already have, uninstall it first.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip uninstall scrapy-redis
|
||||
|
||||
Alternative Choice
|
||||
---------------------------
|
||||
|
||||
Frontera_ is a web crawling framework consisting of `crawl frontier`_, and distribution/scaling primitives, allowing to build a large scale online web crawler.
|
||||
|
||||
.. _Frontera: https://github.com/scrapinghub/frontera
|
||||
.. _crawl frontier: http://nlp.stanford.edu/IR-book/html/htmledition/the-url-frontier-1.html
|
||||
@@ -0,0 +1,11 @@
|
||||
TODO
|
||||
====
|
||||
|
||||
* Add SCRAPY_JOB global support (jobs sharing same SCRAPY_JOB share same queues).
|
||||
* Use a spider middleware instead of spider mixin. This will avoid the spider
|
||||
idle signal hack.
|
||||
* Allow to use pubsub whenever appropriate.
|
||||
* Move example project to its own repository. Include different crawling use
|
||||
cases (i.e.: producer/consumer).
|
||||
* Add pyrebloom dupefilter.
|
||||
* Warn and pass unserializable requests.
|
||||
@@ -0,0 +1 @@
|
||||
0.9.1
|
||||
@@ -0,0 +1,20 @@
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
python:
|
||||
build: .
|
||||
command: tox -e security,flake8,pytest
|
||||
environment:
|
||||
REDIS_HOST: redis # Use service name for hostname within docker network
|
||||
REDIS_PORT: 6379
|
||||
TOX_TESTENV_PASSENV: "REDIS_HOST REDIS_PORT"
|
||||
volumes:
|
||||
- ./:/app # Mount your project directory into the container
|
||||
depends_on:
|
||||
- redis
|
||||
|
||||
redis:
|
||||
image: redis:6.2-alpine
|
||||
ports:
|
||||
- "6379:6379" # Map Redis port to host port
|
||||
|
||||
@@ -0,0 +1,177 @@
|
||||
# Makefile for Sphinx documentation
|
||||
#
|
||||
|
||||
# You can set these variables from the command line.
|
||||
SPHINXOPTS =
|
||||
SPHINXBUILD = sphinx-build
|
||||
PAPER =
|
||||
BUILDDIR = _build
|
||||
|
||||
# User-friendly check for sphinx-build
|
||||
ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
|
||||
$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
|
||||
endif
|
||||
|
||||
# Internal variables.
|
||||
PAPEROPT_a4 = -D latex_paper_size=a4
|
||||
PAPEROPT_letter = -D latex_paper_size=letter
|
||||
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
|
||||
# the i18n builder cannot share the environment and doctrees with the others
|
||||
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
|
||||
|
||||
# Every target in this file is a command, not a file, so all of them are phony
# (latexpdfja, texinfo, info, xml and pseudoxml were previously missing).
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf latexpdfja text man texinfo info changes linkcheck doctest gettext xml pseudoxml
|
||||
|
||||
help:
|
||||
@echo "Please use \`make <target>' where <target> is one of"
|
||||
@echo " html to make standalone HTML files"
|
||||
@echo " dirhtml to make HTML files named index.html in directories"
|
||||
@echo " singlehtml to make a single large HTML file"
|
||||
@echo " pickle to make pickle files"
|
||||
@echo " json to make JSON files"
|
||||
@echo " htmlhelp to make HTML files and a HTML help project"
|
||||
@echo " qthelp to make HTML files and a qthelp project"
|
||||
@echo " devhelp to make HTML files and a Devhelp project"
|
||||
@echo " epub to make an epub"
|
||||
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
|
||||
@echo " latexpdf to make LaTeX files and run them through pdflatex"
|
||||
@echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
|
||||
@echo " text to make text files"
|
||||
@echo " man to make manual pages"
|
||||
@echo " texinfo to make Texinfo files"
|
||||
@echo " info to make Texinfo files and run them through makeinfo"
|
||||
@echo " gettext to make PO message catalogs"
|
||||
@echo " changes to make an overview of all changed/added/deprecated items"
|
||||
@echo " xml to make Docutils-native XML files"
|
||||
@echo " pseudoxml to make pseudoxml-XML files for display purposes"
|
||||
@echo " linkcheck to check all external links for integrity"
|
||||
@echo " doctest to run all doctests embedded in the documentation (if enabled)"
|
||||
|
||||
clean:
|
||||
rm -rf $(BUILDDIR)/*
|
||||
|
||||
html:
|
||||
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
|
||||
@echo
|
||||
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
|
||||
|
||||
dirhtml:
|
||||
$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
|
||||
@echo
|
||||
@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
|
||||
|
||||
singlehtml:
|
||||
$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
|
||||
@echo
|
||||
@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
|
||||
|
||||
pickle:
|
||||
$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
|
||||
@echo
|
||||
@echo "Build finished; now you can process the pickle files."
|
||||
|
||||
json:
|
||||
$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
|
||||
@echo
|
||||
@echo "Build finished; now you can process the JSON files."
|
||||
|
||||
htmlhelp:
|
||||
$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
|
||||
@echo
|
||||
@echo "Build finished; now you can run HTML Help Workshop with the" \
|
||||
".hhp project file in $(BUILDDIR)/htmlhelp."
|
||||
|
||||
qthelp:
|
||||
$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
|
||||
@echo
|
||||
@echo "Build finished; now you can run "qcollectiongenerator" with the" \
|
||||
".qhcp project file in $(BUILDDIR)/qthelp, like this:"
|
||||
@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/scrapy-redis.qhcp"
|
||||
@echo "To view the help file:"
|
||||
@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/scrapy-redis.qhc"
|
||||
|
||||
devhelp:
|
||||
$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
|
||||
@echo
|
||||
@echo "Build finished."
|
||||
@echo "To view the help file:"
|
||||
@echo "# mkdir -p $$HOME/.local/share/devhelp/scrapy-redis"
|
||||
@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/scrapy-redis"
|
||||
@echo "# devhelp"
|
||||
|
||||
epub:
|
||||
$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
|
||||
@echo
|
||||
@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
|
||||
|
||||
latex:
|
||||
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
|
||||
@echo
|
||||
@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
|
||||
@echo "Run \`make' in that directory to run these through (pdf)latex" \
|
||||
"(use \`make latexpdf' here to do that automatically)."
|
||||
|
||||
latexpdf:
|
||||
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
|
||||
@echo "Running LaTeX files through pdflatex..."
|
||||
$(MAKE) -C $(BUILDDIR)/latex all-pdf
|
||||
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
|
||||
|
||||
latexpdfja:
|
||||
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
|
||||
@echo "Running LaTeX files through platex and dvipdfmx..."
|
||||
$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
|
||||
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
|
||||
|
||||
text:
|
||||
$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
|
||||
@echo
|
||||
@echo "Build finished. The text files are in $(BUILDDIR)/text."
|
||||
|
||||
man:
|
||||
$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
|
||||
@echo
|
||||
@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
|
||||
|
||||
texinfo:
|
||||
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
|
||||
@echo
|
||||
@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
|
||||
@echo "Run \`make' in that directory to run these through makeinfo" \
|
||||
"(use \`make info' here to do that automatically)."
|
||||
|
||||
# Build the Texinfo sources, then run them through makeinfo in the output
# directory. $(MAKE) (rather than a literal `make`) keeps the sub-make on the
# same make binary and passes command-line flags through, matching the
# latexpdf/latexpdfja targets.
info:
|
||||
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
|
||||
@echo "Running Texinfo files through makeinfo..."
|
||||
$(MAKE) -C $(BUILDDIR)/texinfo info
|
||||
@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
|
||||
|
||||
gettext:
|
||||
$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
|
||||
@echo
|
||||
@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
|
||||
|
||||
changes:
|
||||
$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
|
||||
@echo
|
||||
@echo "The overview file is in $(BUILDDIR)/changes."
|
||||
|
||||
linkcheck:
|
||||
$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
|
||||
@echo
|
||||
@echo "Link check complete; look for any errors in the above output " \
|
||||
"or in $(BUILDDIR)/linkcheck/output.txt."
|
||||
|
||||
doctest:
|
||||
$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
|
||||
@echo "Testing of doctests in the sources finished, look at the " \
|
||||
"results in $(BUILDDIR)/doctest/output.txt."
|
||||
|
||||
xml:
|
||||
$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
|
||||
@echo
|
||||
@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
|
||||
|
||||
pseudoxml:
|
||||
$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
|
||||
@echo
|
||||
@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
|
||||
@@ -0,0 +1 @@
|
||||
.. include:: ../AUTHORS.rst
|
||||
@@ -0,0 +1,273 @@
|
||||
#!/usr/bin/env python
|
||||
#
|
||||
# scrapy-redis documentation build configuration file, created by
|
||||
# sphinx-quickstart on Tue Jul 9 22:26:36 2013.
|
||||
#
|
||||
# This file is execfile()d with the current directory set to its
|
||||
# containing dir.
|
||||
#
|
||||
# Note that not all possible configuration values are present in this
|
||||
# autogenerated file.
|
||||
#
|
||||
# All configuration values have a default; values that are commented out
|
||||
# serve to show the default.
|
||||
|
||||
import os
|
||||
import re
|
||||
|
||||
# If extensions (or modules to document with autodoc) are in another
|
||||
# directory, add these directories to sys.path here. If the directory is
|
||||
# relative to the documentation root, use os.path.abspath to make it
|
||||
# absolute, like shown here.
|
||||
# sys.path.insert(0, os.path.abspath('.'))
|
||||
|
||||
# Get the project root dir, which is the parent dir of this
|
||||
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
# -- General configuration ---------------------------------------------
|
||||
|
||||
# If your documentation needs a minimal Sphinx version, state it here.
|
||||
# needs_sphinx = '1.0'
|
||||
|
||||
# Add any Sphinx extension module names here, as strings. They can be
|
||||
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
|
||||
extensions = [
|
||||
"sphinx.ext.autodoc",
|
||||
"sphinx.ext.napoleon",
|
||||
"sphinx.ext.viewcode",
|
||||
]
|
||||
|
||||
# Add any paths that contain templates here, relative to this directory.
|
||||
templates_path = ["_templates"]
|
||||
|
||||
# The suffix of source filenames.
|
||||
source_suffix = ".rst"
|
||||
|
||||
# The encoding of source files.
|
||||
# source_encoding = 'utf-8-sig'
|
||||
|
||||
# The master toctree document.
|
||||
master_doc = "index"
|
||||
|
||||
# General information about the project.
|
||||
project = "Scrapy-Redis"
|
||||
copyright = "2011-2024, R Max Espinoza"
|
||||
|
||||
# The version info for the project you're documenting, acts as replacement
|
||||
# for |version| and |release|, also used in various other places throughout
|
||||
# the built documents.
|
||||
#
|
||||
# The full version, including alpha/beta/rc tags.
|
||||
# Read the release string from the project's VERSION file; the context manager
# closes the file handle instead of leaving it to the garbage collector.
with open(os.path.join(project_root, "VERSION"), encoding="utf-8") as version_file:
    release = version_file.read().strip()
|
||||
# The short version: the X.Y.Z part of the release, without alpha/beta/rc tags.
|
||||
version = re.findall(r"\d+\.\d+\.\d+", release)[0]
|
||||
|
||||
# The language for content autogenerated by Sphinx. Refer to documentation
|
||||
# for a list of supported languages.
|
||||
# language = None
|
||||
|
||||
# There are two options for replacing |today|: either, you set today to
|
||||
# some non-false value, then it is used:
|
||||
# today = ''
|
||||
# Else, today_fmt is used as the format for a strftime call.
|
||||
# today_fmt = '%B %d, %Y'
|
||||
|
||||
# List of patterns, relative to source directory, that match files and
|
||||
# directories to ignore when looking for source files.
|
||||
exclude_patterns = ["_build"]
|
||||
|
||||
# The reST default role (used for this markup: `text`) to use for all
|
||||
# documents.
|
||||
# default_role = None
|
||||
|
||||
# If true, '()' will be appended to :func: etc. cross-reference text.
|
||||
# add_function_parentheses = True
|
||||
|
||||
# If true, the current module name will be prepended to all description
|
||||
# unit titles (such as .. function::).
|
||||
# add_module_names = True
|
||||
|
||||
# If true, sectionauthor and moduleauthor directives will be shown in the
|
||||
# output. They are ignored by default.
|
||||
# show_authors = False
|
||||
|
||||
# The name of the Pygments (syntax highlighting) style to use.
|
||||
pygments_style = "sphinx"
|
||||
|
||||
# A list of ignored prefixes for module index sorting.
|
||||
# modindex_common_prefix = []
|
||||
|
||||
# If true, keep warnings as "system message" paragraphs in the built
|
||||
# documents.
|
||||
# keep_warnings = False
|
||||
|
||||
|
||||
# -- Options for HTML output -------------------------------------------
|
||||
|
||||
# The theme to use for HTML and HTML Help pages. See the documentation for
|
||||
# a list of builtin themes.
|
||||
html_theme = "default"
|
||||
|
||||
# Theme options are theme-specific and customize the look and feel of a
|
||||
# theme further. For a list of options available for each theme, see the
|
||||
# documentation.
|
||||
# html_theme_options = {}
|
||||
|
||||
# Add any paths that contain custom themes here, relative to this directory.
|
||||
# html_theme_path = []
|
||||
|
||||
# The name for this set of Sphinx documents. If None, it defaults to
|
||||
# "<project> v<release> documentation".
|
||||
# html_title = None
|
||||
|
||||
# A shorter title for the navigation bar. Default is the same as
|
||||
# html_title.
|
||||
# html_short_title = None
|
||||
|
||||
# The name of an image file (relative to this directory) to place at the
|
||||
# top of the sidebar.
|
||||
# html_logo = None
|
||||
|
||||
# The name of an image file (within the static path) to use as favicon
|
||||
# of the docs. This file should be a Windows icon file (.ico) being
|
||||
# 16x16 or 32x32 pixels large.
|
||||
# html_favicon = None
|
||||
|
||||
# Add any paths that contain custom static files (such as style sheets)
|
||||
# here, relative to this directory. They are copied after the builtin
|
||||
# static files, so a file named "default.css" will overwrite the builtin
|
||||
# "default.css".
|
||||
# html_static_path = ["_static"]
|
||||
|
||||
# If not '', a 'Last updated on:' timestamp is inserted at every page
|
||||
# bottom, using the given strftime format.
|
||||
# html_last_updated_fmt = '%b %d, %Y'
|
||||
|
||||
# If true, SmartyPants will be used to convert quotes and dashes to
|
||||
# typographically correct entities.
|
||||
# html_use_smartypants = True
|
||||
|
||||
# Custom sidebar templates, maps document names to template names.
|
||||
# html_sidebars = {}
|
||||
|
||||
# Additional templates that should be rendered to pages, maps page names
|
||||
# to template names.
|
||||
# html_additional_pages = {}
|
||||
|
||||
# If false, no module index is generated.
|
||||
# html_domain_indices = True
|
||||
|
||||
# If false, no index is generated.
|
||||
# html_use_index = True
|
||||
|
||||
# If true, the index is split into individual pages for each letter.
|
||||
# html_split_index = False
|
||||
|
||||
# If true, links to the reST sources are added to the pages.
|
||||
# html_show_sourcelink = True
|
||||
|
||||
# If true, "Created using Sphinx" is shown in the HTML footer.
|
||||
# Default is True.
|
||||
# html_show_sphinx = True
|
||||
|
||||
# If true, "(C) Copyright ..." is shown in the HTML footer.
|
||||
# Default is True.
|
||||
# html_show_copyright = True
|
||||
|
||||
# If true, an OpenSearch description file will be output, and all pages
|
||||
# will contain a <link> tag referring to it. The value of this option
|
||||
# must be the base URL from which the finished HTML is served.
|
||||
# html_use_opensearch = ''
|
||||
|
||||
# This is the file name suffix for HTML files (e.g. ".xhtml").
|
||||
# html_file_suffix = None
|
||||
|
||||
# Output file base name for HTML help builder.
|
||||
htmlhelp_basename = "scrapy_redisdoc"
|
||||
|
||||
|
||||
# -- Options for LaTeX output ------------------------------------------
|
||||
|
||||
latex_elements = {
|
||||
# The paper size ('letterpaper' or 'a4paper').
|
||||
# 'papersize': 'letterpaper',
|
||||
# The font size ('10pt', '11pt' or '12pt').
|
||||
# 'pointsize': '10pt',
|
||||
# Additional stuff for the LaTeX preamble.
|
||||
# 'preamble': '',
|
||||
}
|
||||
|
||||
# Grouping the document tree into LaTeX files. List of tuples
|
||||
# (source start file, target name, title, author, documentclass
|
||||
# [howto/manual]).
|
||||
latex_documents = [
|
||||
(
|
||||
"index",
|
||||
"scrapy_redis.tex",
|
||||
"Scrapy-Redis Documentation",
|
||||
"R Max Espinoza",
|
||||
"manual",
|
||||
),
|
||||
]
|
||||
|
||||
# The name of an image file (relative to this directory) to place at
|
||||
# the top of the title page.
|
||||
# latex_logo = None
|
||||
|
||||
# For "manual" documents, if this is true, then toplevel headings
|
||||
# are parts, not chapters.
|
||||
# latex_use_parts = False
|
||||
|
||||
# If true, show page references after internal links.
|
||||
# latex_show_pagerefs = False
|
||||
|
||||
# If true, show URL addresses after external links.
|
||||
# latex_show_urls = False
|
||||
|
||||
# Documents to append as an appendix to all manuals.
|
||||
# latex_appendices = []
|
||||
|
||||
# If false, no module index is generated.
|
||||
# latex_domain_indices = True
|
||||
|
||||
|
||||
# -- Options for manual page output ------------------------------------
|
||||
|
||||
# One entry per manual page. List of tuples
|
||||
# (source start file, name, description, authors, manual section).
|
||||
man_pages = [
|
||||
("index", "scrapy_redis", "Scrapy-Redis Documentation", ["R Max Espinoza"], 1)
|
||||
]
|
||||
|
||||
# If true, show URL addresses after external links.
|
||||
# man_show_urls = False
|
||||
|
||||
|
||||
# -- Options for Texinfo output ----------------------------------------
|
||||
|
||||
# Grouping the document tree into Texinfo files. List of tuples
|
||||
# (source start file, target name, title, author,
|
||||
# dir menu entry, description, category)
|
||||
texinfo_documents = [
|
||||
(
|
||||
"index",
|
||||
"scrapy_redis",
|
||||
"Scrapy-Redis Documentation",
|
||||
"R Max Espinoza",
|
||||
"scrapy-redis",
|
||||
"One line description of project.",
|
||||
"Miscellaneous",
|
||||
),
|
||||
]
|
||||
|
||||
# Documents to append as an appendix to all manuals.
|
||||
# texinfo_appendices = []
|
||||
|
||||
# If false, no module index is generated.
|
||||
# texinfo_domain_indices = True
|
||||
|
||||
# How to display URL addresses: 'footnote', 'no', or 'inline'.
|
||||
# texinfo_show_urls = 'footnote'
|
||||
|
||||
# If true, do not generate a @detailmenu in the "Top" node's menu.
|
||||
# texinfo_no_detailmenu = False
|
||||
@@ -0,0 +1 @@
|
||||
.. include:: ../CONTRIBUTING.rst
|
||||
@@ -0,0 +1 @@
|
||||
.. include:: ../HISTORY.rst
|
||||
@@ -0,0 +1,27 @@
|
||||
.. scrapy-redis documentation master file, created by
|
||||
sphinx-quickstart on Tue Jul 9 22:26:36 2013.
|
||||
You can adapt this file completely to your liking, but it should at least
|
||||
contain the root `toctree` directive.
|
||||
|
||||
Welcome to Scrapy-Redis's documentation!
|
||||
========================================
|
||||
|
||||
Contents:
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
|
||||
readme
|
||||
installation
|
||||
modules
|
||||
contributing
|
||||
history
|
||||
authors
|
||||
|
||||
Indices and tables
|
||||
==================
|
||||
|
||||
* :ref:`genindex`
|
||||
* :ref:`modindex`
|
||||
* :ref:`search`
|
||||
|
||||
@@ -0,0 +1,49 @@
|
||||
.. highlight:: shell
|
||||
|
||||
============
|
||||
Installation
|
||||
============
|
||||
|
||||
|
||||
Stable release
|
||||
--------------
|
||||
|
||||
To install Scrapy-Redis, run this command in your terminal:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
pip install scrapy-redis
|
||||
|
||||
If you don't have `pip`_ installed, this `Python installation guide`_ can guide
|
||||
you through the process.
|
||||
|
||||
.. _pip: https://pip.pypa.io
|
||||
.. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/
|
||||
|
||||
|
||||
From sources
|
||||
------------
|
||||
|
||||
The sources for Scrapy-Redis can be downloaded from the `Github repo`_.
|
||||
|
||||
You can either clone the public repository:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
git clone https://github.com/rolando/scrapy-redis
|
||||
|
||||
Or download the `tarball`_:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
curl -OL https://github.com/rolando/scrapy-redis/tarball/master
|
||||
|
||||
Once you have a copy of the source, you can install it with:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
pip install -e .
|
||||
|
||||
|
||||
.. _Github repo: https://github.com/rolando/scrapy-redis
|
||||
.. _tarball: https://github.com/rolando/scrapy-redis/tarball/master
|
||||
@@ -0,0 +1,242 @@
|
||||
@ECHO OFF
|
||||
|
||||
REM Command file for Sphinx documentation
|
||||
|
||||
if "%SPHINXBUILD%" == "" (
|
||||
set SPHINXBUILD=sphinx-build
|
||||
)
|
||||
set BUILDDIR=_build
|
||||
set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
|
||||
set I18NSPHINXOPTS=%SPHINXOPTS% .
|
||||
if NOT "%PAPER%" == "" (
|
||||
set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
|
||||
set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
|
||||
)
|
||||
|
||||
if "%1" == "" goto help
|
||||
|
||||
if "%1" == "help" (
|
||||
:help
|
||||
echo.Please use `make ^<target^>` where ^<target^> is one of
|
||||
echo. html to make standalone HTML files
|
||||
echo. dirhtml to make HTML files named index.html in directories
|
||||
echo. singlehtml to make a single large HTML file
|
||||
echo. pickle to make pickle files
|
||||
echo. json to make JSON files
|
||||
echo. htmlhelp to make HTML files and a HTML help project
|
||||
echo. qthelp to make HTML files and a qthelp project
|
||||
echo. devhelp to make HTML files and a Devhelp project
|
||||
echo. epub to make an epub
|
||||
echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
|
||||
echo. text to make text files
|
||||
echo. man to make manual pages
|
||||
echo. texinfo to make Texinfo files
|
||||
echo. gettext to make PO message catalogs
|
||||
echo. changes to make an overview over all changed/added/deprecated items
|
||||
echo. xml to make Docutils-native XML files
|
||||
echo. pseudoxml to make pseudoxml-XML files for display purposes
|
||||
echo. linkcheck to check all external links for integrity
|
||||
echo. doctest to run all doctests embedded in the documentation if enabled
|
||||
goto end
|
||||
)
|
||||
|
||||
if "%1" == "clean" (
|
||||
for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
|
||||
del /q /s %BUILDDIR%\*
|
||||
goto end
|
||||
)
|
||||
|
||||
|
||||
%SPHINXBUILD% 2> nul
|
||||
if errorlevel 9009 (
|
||||
echo.
|
||||
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
|
||||
echo.installed, then set the SPHINXBUILD environment variable to point
|
||||
echo.to the full path of the 'sphinx-build' executable. Alternatively you
|
||||
echo.may add the Sphinx directory to PATH.
|
||||
echo.
|
||||
echo.If you don't have Sphinx installed, grab it from
|
||||
echo.http://sphinx-doc.org/
|
||||
exit /b 1
|
||||
)
|
||||
|
||||
if "%1" == "html" (
|
||||
%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
|
||||
if errorlevel 1 exit /b 1
|
||||
echo.
|
||||
echo.Build finished. The HTML pages are in %BUILDDIR%/html.
|
||||
goto end
|
||||
)
|
||||
|
||||
if "%1" == "dirhtml" (
|
||||
%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
|
||||
if errorlevel 1 exit /b 1
|
||||
echo.
|
||||
echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
|
||||
goto end
|
||||
)
|
||||
|
||||
if "%1" == "singlehtml" (
|
||||
%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
|
||||
if errorlevel 1 exit /b 1
|
||||
echo.
|
||||
echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
|
||||
goto end
|
||||
)
|
||||
|
||||
if "%1" == "pickle" (
|
||||
%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
|
||||
if errorlevel 1 exit /b 1
|
||||
echo.
|
||||
echo.Build finished; now you can process the pickle files.
|
||||
goto end
|
||||
)
|
||||
|
||||
if "%1" == "json" (
|
||||
%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
|
||||
if errorlevel 1 exit /b 1
|
||||
echo.
|
||||
echo.Build finished; now you can process the JSON files.
|
||||
goto end
|
||||
)
|
||||
|
||||
if "%1" == "htmlhelp" (
|
||||
%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
|
||||
if errorlevel 1 exit /b 1
|
||||
echo.
|
||||
echo.Build finished; now you can run HTML Help Workshop with the ^
|
||||
.hhp project file in %BUILDDIR%/htmlhelp.
|
||||
goto end
|
||||
)
|
||||
|
||||
REM Build the qthelp project and print the follow-up commands. The collection
REM file shown for "assistant" has the .qhc extension (previously misspelled .ghc).
if "%1" == "qthelp" (
|
||||
%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
|
||||
if errorlevel 1 exit /b 1
|
||||
echo.
|
||||
echo.Build finished; now you can run "qcollectiongenerator" with the ^
|
||||
.qhcp project file in %BUILDDIR%/qthelp, like this:
|
||||
echo.^> qcollectiongenerator %BUILDDIR%\qthelp\scrapy-redis.qhcp
|
||||
echo.To view the help file:
|
||||
echo.^> assistant -collectionFile %BUILDDIR%\qthelp\scrapy-redis.qhc
|
||||
goto end
|
||||
)
|
||||
|
||||
if "%1" == "devhelp" (
|
||||
%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
|
||||
if errorlevel 1 exit /b 1
|
||||
echo.
|
||||
echo.Build finished.
|
||||
goto end
|
||||
)
|
||||
|
||||
if "%1" == "epub" (
|
||||
%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
|
||||
if errorlevel 1 exit /b 1
|
||||
echo.
|
||||
echo.Build finished. The epub file is in %BUILDDIR%/epub.
|
||||
goto end
|
||||
)
|
||||
|
||||
if "%1" == "latex" (
|
||||
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
|
||||
if errorlevel 1 exit /b 1
|
||||
echo.
|
||||
echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
|
||||
goto end
|
||||
)
|
||||
|
||||
REM Build the LaTeX sources, then run them through pdflatex. After building,
REM return to this script's own directory (%~dp0): a relative "cd %BUILDDIR%/.."
REM would be resolved from inside %BUILDDIR%/latex and not lead back here.
if "%1" == "latexpdf" (
|
||||
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
|
||||
cd %BUILDDIR%/latex
|
||||
make all-pdf
|
||||
cd %~dp0
|
||||
echo.
|
||||
echo.Build finished; the PDF files are in %BUILDDIR%/latex.
|
||||
goto end
|
||||
)
|
||||
|
||||
REM Build the LaTeX sources, then run them through platex/dvipdfmx. After
REM building, return to this script's own directory (%~dp0): a relative
REM "cd %BUILDDIR%/.." would be resolved from inside %BUILDDIR%/latex.
if "%1" == "latexpdfja" (
|
||||
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
|
||||
cd %BUILDDIR%/latex
|
||||
make all-pdf-ja
|
||||
cd %~dp0
|
||||
echo.
|
||||
echo.Build finished; the PDF files are in %BUILDDIR%/latex.
|
||||
goto end
|
||||
)
|
||||
|
||||
if "%1" == "text" (
|
||||
%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
|
||||
if errorlevel 1 exit /b 1
|
||||
echo.
|
||||
echo.Build finished. The text files are in %BUILDDIR%/text.
|
||||
goto end
|
||||
)
|
||||
|
||||
if "%1" == "man" (
|
||||
%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
|
||||
if errorlevel 1 exit /b 1
|
||||
echo.
|
||||
echo.Build finished. The manual pages are in %BUILDDIR%/man.
|
||||
goto end
|
||||
)
|
||||
|
||||
if "%1" == "texinfo" (
|
||||
%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
|
||||
if errorlevel 1 exit /b 1
|
||||
echo.
|
||||
echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
|
||||
goto end
|
||||
)
|
||||
|
||||
if "%1" == "gettext" (
|
||||
%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
|
||||
if errorlevel 1 exit /b 1
|
||||
echo.
|
||||
echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
|
||||
goto end
|
||||
)
|
||||
|
||||
if "%1" == "changes" (
|
||||
%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
|
||||
if errorlevel 1 exit /b 1
|
||||
echo.
|
||||
echo.The overview file is in %BUILDDIR%/changes.
|
||||
goto end
|
||||
)
|
||||
|
||||
if "%1" == "linkcheck" (
|
||||
%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
|
||||
if errorlevel 1 exit /b 1
|
||||
echo.
|
||||
echo.Link check complete; look for any errors in the above output ^
|
||||
or in %BUILDDIR%/linkcheck/output.txt.
|
||||
goto end
|
||||
)
|
||||
|
||||
if "%1" == "doctest" (
|
||||
%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
|
||||
if errorlevel 1 exit /b 1
|
||||
echo.
|
||||
echo.Testing of doctests in the sources finished, look at the ^
|
||||
results in %BUILDDIR%/doctest/output.txt.
|
||||
goto end
|
||||
)
|
||||
|
||||
if "%1" == "xml" (
|
||||
%SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
|
||||
if errorlevel 1 exit /b 1
|
||||
echo.
|
||||
echo.Build finished. The XML files are in %BUILDDIR%/xml.
|
||||
goto end
|
||||
)
|
||||
|
||||
if "%1" == "pseudoxml" (
|
||||
%SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
|
||||
if errorlevel 1 exit /b 1
|
||||
echo.
|
||||
echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
|
||||
goto end
|
||||
)
|
||||
|
||||
:end
|
||||
@@ -0,0 +1,7 @@
|
||||
API Reference
|
||||
=============
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 4
|
||||
|
||||
scrapy_redis
|
||||
@@ -0,0 +1 @@
|
||||
.. include:: ../README.rst
|
||||
@@ -0,0 +1,8 @@
|
||||
# These packages are required only for development and release management.
|
||||
Sphinx
|
||||
bumpversion
|
||||
check-manifest
|
||||
pip-tools
|
||||
twine
|
||||
watchdog
|
||||
wheel
|
||||
@@ -0,0 +1,62 @@
|
||||
scrapy_redis package
|
||||
====================
|
||||
|
||||
Submodules
|
||||
----------
|
||||
|
||||
scrapy_redis.connection module
|
||||
------------------------------
|
||||
|
||||
.. automodule:: scrapy_redis.connection
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
scrapy_redis.dupefilter module
|
||||
------------------------------
|
||||
|
||||
.. automodule:: scrapy_redis.dupefilter
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
scrapy_redis.pipelines module
|
||||
-----------------------------
|
||||
|
||||
.. automodule:: scrapy_redis.pipelines
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
scrapy_redis.queue module
|
||||
-------------------------
|
||||
|
||||
.. automodule:: scrapy_redis.queue
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
scrapy_redis.scheduler module
|
||||
-----------------------------
|
||||
|
||||
.. automodule:: scrapy_redis.scheduler
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
scrapy_redis.spiders module
|
||||
---------------------------
|
||||
|
||||
.. automodule:: scrapy_redis.spiders
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
|
||||
Module contents
|
||||
---------------
|
||||
|
||||
.. automodule:: scrapy_redis
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
@@ -0,0 +1,5 @@
|
||||
#@IgnoreInspection BashAddShebang
# The ``-onbuild`` image variants are deprecated and Python 2.7 is end-of-life;
# the example spiders also rely on Python 3-only syntax (zero-argument
# ``super()``). The steps below spell out what the onbuild triggers used to do
# implicitly: install the requirements, then copy the project.
FROM python:3.10

WORKDIR /usr/src/app

COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

ENTRYPOINT ["scrapy"]
CMD ["crawl", "dmoz"]
|
||||
@@ -0,0 +1,154 @@
|
||||
============================
|
||||
Scrapy Redis Example Project
|
||||
============================
|
||||
|
||||
|
||||
This directory contains an example Scrapy project integrated with scrapy-redis.
|
||||
By default, all items are sent to redis (key ``<spider>:items``). All spiders
|
||||
schedule requests through redis, so you can start additional spiders to speed
|
||||
up the crawling.
|
||||
|
||||
Spiders
|
||||
-------
|
||||
|
||||
* **dmoz**
|
||||
|
||||
This spider simply scrapes dmoz.org.
|
||||
|
||||
* **myspider_redis**
|
||||
|
||||
This spider uses redis as a shared requests queue and uses
|
||||
``myspider:start_urls`` as start URLs seed. For each URL, the spider outputs
|
||||
one item.
|
||||
|
||||
* **mycrawler_redis**
|
||||
|
||||
This spider uses redis as a shared requests queue and uses
|
||||
``mycrawler:start_urls`` as start URLs seed. For each URL, the spider follows
|
||||
all links.
|
||||
|
||||
|
||||
.. note::
|
||||
|
||||
All requests are persisted by default. You can clear the queue by using the
|
||||
``SCHEDULER_FLUSH_ON_START`` setting. For example: ``scrapy crawl dmoz -s
|
||||
SCHEDULER_FLUSH_ON_START=1``.
|
||||
|
||||
|
||||
Running the example project
|
||||
---------------------------
|
||||
|
||||
This example illustrates how to share a spider's requests queue
|
||||
across multiple spider instances, highly suitable for broad crawls.
|
||||
|
||||
1. Make sure the scrapy_redis package is in your ``PYTHONPATH``
|
||||
|
||||
2. Run the crawler for the first time, then stop it
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
cd example-project
|
||||
scrapy crawl dmoz
|
||||
... [dmoz] ...
|
||||
^C
|
||||
|
||||
3. Run the crawler again to resume stopped crawling
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
scrapy crawl dmoz
|
||||
... [dmoz] DEBUG: Resuming crawl (9019 requests scheduled)
|
||||
|
||||
4. Start one or more additional scrapy crawlers
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
scrapy crawl dmoz
|
||||
... [dmoz] DEBUG: Resuming crawl (8712 requests scheduled)
|
||||
|
||||
5. Start one or more post-processing workers
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
python process_items.py dmoz:items -v
|
||||
...
|
||||
Processing: Kilani Giftware (http://www.dmoz.org/Computers/Shopping/Gifts/)
|
||||
Processing: NinjaGizmos.com (http://www.dmoz.org/Computers/Shopping/Gifts/)
|
||||
...
|
||||
|
||||
|
||||
Feeding a Spider from Redis
|
||||
---------------------------
|
||||
|
||||
The class ``scrapy_redis.spiders.RedisSpider`` enables a spider to read the
|
||||
urls from redis. The urls in the redis queue will be processed one
|
||||
after another, if the first request yields more requests, the spider
|
||||
will process those requests before fetching another url from redis.
|
||||
|
||||
For example, create a file ``myspider.py`` with the code below:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from scrapy_redis.spiders import RedisSpider
|
||||
|
||||
|
||||
class MySpider(RedisSpider):
|
||||
name = "myspider"
|
||||
|
||||
def parse(self, response):
|
||||
# do stuff
|
||||
pass
|
||||
|
||||
|
||||
Then:
|
||||
|
||||
1. run the spider
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
scrapy runspider myspider.py
|
||||
|
||||
2. push json data to redis
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
redis-cli lpush myspider:start_urls '{"url": "https://example.com", "meta": {"job-id":"123xsd", "start-date":"dd/mm/yy"}, "url_cookie_key":"fertxsas" }'
|
||||
|
||||
|
||||
.. note::
|
||||
|
||||
* These spiders rely on the spider idle signal to fetch start urls, hence it
|
||||
may have a few seconds of delay between the time you push a new url and the
|
||||
spider starts crawling it.
|
||||
|
||||
* Also please pay attention to json formatting.
|
||||
|
||||
|
||||
Processing items
|
||||
----------------
|
||||
|
||||
The ``process_items.py`` script provides an example of consuming the items queue:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
python process_items.py --help
|
||||
|
||||
|
||||
Run via Docker
|
||||
--------------
|
||||
|
||||
You require the following applications:
|
||||
|
||||
* docker (https://docs.docker.com/installation/)
|
||||
* docker-compose (https://docs.docker.com/compose/install/)
|
||||
|
||||
For implementation details see `Dockerfile` and `docker-compose.yml` and read
|
||||
official docker documentation.
|
||||
|
||||
1. To start sample `example-project` (`-d` for daemon)::
|
||||
|
||||
docker-compose up
|
||||
|
||||
2. To scale `crawler` (4 instances for example)::
|
||||
|
||||
docker-compose scale crawler=4
|
||||
@@ -0,0 +1,9 @@
|
||||
redis:
|
||||
image: redis
|
||||
ports:
|
||||
- "6379:6379" # added port for external db provisioning
|
||||
|
||||
crawler:
|
||||
build: .
|
||||
links:
|
||||
- redis:localhost
|
||||
@@ -0,0 +1,24 @@
|
||||
# Define here the models for your scraped items
|
||||
#
|
||||
# See documentation in:
|
||||
# http://doc.scrapy.org/topics/items.html
|
||||
|
||||
from scrapy.item import Field, Item
|
||||
from scrapy.loader import ItemLoader
|
||||
from scrapy.loader.processors import Join, MapCompose, TakeFirst
|
||||
|
||||
|
||||
class ExampleItem(Item):
    """Item definition for the example project."""

    name = Field()
    description = Field()
    link = Field()
    # ``ExamplePipeline.process_item`` stores the crawl timestamp and the
    # spider name under these two keys.
    crawled = Field()
    spider = Field()
    url = Field()
|
||||
|
||||
|
||||
class ExampleLoader(ItemLoader):
    """Item loader that populates ``ExampleItem`` objects."""

    default_item_class = ExampleItem
    # Strip surrounding whitespace from every input value.
    default_input_processor = MapCompose(lambda s: s.strip())
    # By default a field's output goes through ``TakeFirst`` ...
    default_output_processor = TakeFirst()
    # ... except ``description``, whose collected values go through ``Join``.
    description_out = Join()
|
||||
@@ -0,0 +1,12 @@
|
||||
# Define your item pipelines here
|
||||
#
|
||||
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
||||
# See: http://doc.scrapy.org/topics/item-pipeline.html
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
class ExamplePipeline:
    """Stamp every item with its crawl time and the name of its spider."""

    def process_item(self, item, spider):
        """Annotate ``item`` in place and return it.

        Parameters
        ----------
        item : mapping
            Scraped item; must support item assignment.
        spider : scrapy.Spider
            Spider that produced the item; only ``spider.name`` is read.

        Returns
        -------
        mapping
            The same ``item`` object with ``crawled`` and ``spider`` set.

        """
        # Local import: the module header only imports ``datetime``.
        from datetime import timezone

        # ``datetime.utcnow()`` is deprecated since Python 3.12. Build the
        # same naive UTC timestamp from an aware "now" so consumers keep
        # receiving a value without tzinfo.
        item["crawled"] = datetime.now(timezone.utc).replace(tzinfo=None)
        item["spider"] = spider.name
        return item
|
||||
@@ -0,0 +1,37 @@
|
||||
# Scrapy settings for example project
|
||||
#
|
||||
# For simplicity, this file contains only the most important settings by
|
||||
# default. All the other settings are documented here:
|
||||
#
|
||||
# http://doc.scrapy.org/topics/settings.html
|
||||
#
|
||||
SPIDER_MODULES = ["example.spiders"]
NEWSPIDER_MODULE = "example.spiders"

USER_AGENT = "scrapy-redis (+https://github.com/rolando/scrapy-redis)"

# Use the redis-backed duplicates filter.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the scrapy-redis scheduler, which talks to the redis database.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Keep the dedup set and the request queue in redis when the spider finishes.
SCHEDULER_PERSIST = True
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"

ITEM_PIPELINES = {
    "example.pipelines.ExamplePipeline": 300,
    # When enabled, this pipeline stores the scraped items in redis.
    "scrapy_redis.pipelines.RedisPipeline": 400,
}

# Redis server to connect to.
REDIS_URL = "redis://127.0.0.1:6379"

# Single assignment: an earlier ``LOG_LEVEL = "WARNING"`` in this file was
# always overridden by this value, so only the effective one is kept.
LOG_LEVEL = "DEBUG"

# Introduce an artificial delay to make use of parallelism to speed up the
# crawl.
DOWNLOAD_DELAY = 1
|
||||
@@ -0,0 +1,8 @@
|
||||
# This package will contain the spiders of your Scrapy project
|
||||
#
|
||||
# To create the first spider for your project use this command:
|
||||
#
|
||||
# scrapy genspider myspider myspider-domain.com
|
||||
#
|
||||
# For more info see:
|
||||
# http://doc.scrapy.org/topics/spiders.html
|
||||
@@ -0,0 +1,26 @@
|
||||
from scrapy.linkextractors import LinkExtractor
|
||||
from scrapy.spiders import CrawlSpider, Rule
|
||||
|
||||
|
||||
class DmozSpider(CrawlSpider):
    """Follow categories and extract links."""

    name = "dmoz"
    allowed_domains = ["dmoztools.net"]
    start_urls = ["http://www.dmoztools.net/"]

    rules = [
        # Follow category links and hand every visited page to
        # ``parse_directory``.
        Rule(
            LinkExtractor(restrict_css=(".top-cat", ".sub-cat", ".cat-item")),
            callback="parse_directory",
            follow=True,
        ),
    ]

    def parse_directory(self, response):
        """Yield one dict per ``.title-and-desc`` element of the page.

        Parameters
        ----------
        response : scrapy.http.Response
            Downloaded directory page.

        Yields
        ------
        dict
            Mapping with the keys ``name``, ``description`` and ``link``.

        """
        for div in response.css(".title-and-desc"):
            yield {
                "name": div.css(".site-title::text").extract_first(),
                # Without a default, an entry that has no description text
                # yields ``None`` and ``.strip()`` raises AttributeError,
                # aborting the whole page.
                "description": div.css(".site-descr::text")
                .extract_first(default="")
                .strip(),
                "link": div.css("a::attr(href)").extract_first(),
            }
|
||||
@@ -0,0 +1,28 @@
|
||||
from scrapy.linkextractors import LinkExtractor
|
||||
from scrapy.spiders import Rule
|
||||
|
||||
from scrapy_redis.spiders import RedisCrawlSpider
|
||||
|
||||
|
||||
class MyCrawler(RedisCrawlSpider):
    """Spider that reads urls from redis queue (mycrawler:start_urls)."""

    name = "mycrawler_redis"
    redis_key = "mycrawler:start_urls"

    rules = (
        # follow all links
        Rule(LinkExtractor(), callback="parse_page", follow=True),
    )

    def __init__(self, *args, **kwargs):
        """Accept an optional comma-separated ``domain`` keyword argument."""
        # Dynamically define the allowed domains list.
        domain = kwargs.pop("domain", "")
        # Build a real list: on Python 3 ``filter`` returns a one-shot
        # iterator that is empty after its first traversal, so every later
        # read of ``allowed_domains`` would see no domains at all.
        self.allowed_domains = [name for name in domain.split(",") if name]
        super().__init__(*args, **kwargs)

    def parse_page(self, response):
        """Return the page title and URL of ``response`` as a dict."""
        return {
            "name": response.css("title::text").extract_first(),
            "url": response.url,
        }
|
||||
@@ -0,0 +1,20 @@
|
||||
from scrapy_redis.spiders import RedisSpider
|
||||
|
||||
|
||||
class MySpider(RedisSpider):
    """Spider that reads urls from redis queue (myspider:start_urls)."""

    name = "myspider_redis"
    redis_key = "myspider:start_urls"

    def __init__(self, *args, **kwargs):
        """Accept an optional comma-separated ``domain`` keyword argument."""
        # Dynamically define the allowed domains list.
        domain = kwargs.pop("domain", "")
        # Build a real list: on Python 3 ``filter`` returns a one-shot
        # iterator that is empty after its first traversal, so every later
        # read of ``allowed_domains`` would see no domains at all.
        self.allowed_domains = [name for name in domain.split(",") if name]
        super().__init__(*args, **kwargs)

    def parse(self, response):
        """Return the page title and URL of ``response`` as a dict."""
        return {
            "name": response.css("title::text").extract_first(),
            "url": response.url,
        }
|
||||
@@ -0,0 +1,105 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
"""A script to process items from a redis queue."""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import pprint
|
||||
import sys
|
||||
import time
|
||||
|
||||
from scrapy_redis import get_redis
|
||||
|
||||
logger = logging.getLogger("process_items")


def process_items(r, keys, timeout, limit=0, log_every=1000, wait=0.1):
    """Process items from a redis queue.

    Parameters
    ----------
    r : Redis
        Redis connection instance.
    keys : list
        List of keys to read the items from.
    timeout : int
        Read timeout, passed to ``blpop``.
    limit : int, optional
        Stop after this many items were processed. ``0`` (the default)
        means no limit.
    log_every : int, optional
        Log a progress message each time this many items were processed.
    wait : float, optional
        Seconds to sleep after a read timed out, before polling again.

    """
    limit = limit or float("inf")
    processed = 0
    while processed < limit:
        # Change ``blpop`` to ``brpop`` to process as LIFO.
        ret = r.blpop(keys, timeout)
        # Nothing arrived before the timeout: back off briefly and poll again.
        if ret is None:
            time.sleep(wait)
            continue

        source, data = ret
        try:
            item = json.loads(data)
        except Exception:
            logger.exception("Failed to load item:\n%r", pprint.pformat(data))
            continue

        try:
            name = item.get("name") or item.get("title")
            url = item.get("url") or item.get("link")
            logger.debug("[%s] Processing item: %s <%s>", source, name, url)
        except (AttributeError, KeyError):
            # Valid JSON that is not an object (a list, a number, ...) has no
            # ``get``; log and skip it instead of aborting the whole loop.
            logger.exception(
                "[%s] Failed to process item:\n%r", source, pprint.pformat(item)
            )
            continue

        processed += 1
        if processed % log_every == 0:
            logger.info("Processed %s items", processed)
|
||||
|
||||
|
||||
def main():
    """Parse the command line and consume items until done or interrupted.

    Returns
    -------
    int
        Process exit code: ``0`` on success or Ctrl-C, ``2`` when an
        unexpected exception stopped the processing loop.

    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("key", help="Redis key where items are stored")
    parser.add_argument("--host")
    parser.add_argument("--port")
    parser.add_argument("--timeout", type=int, default=5)
    parser.add_argument("--limit", type=int, default=0)
    parser.add_argument("--progress-every", type=int, default=100)
    parser.add_argument("-v", "--verbose", action="store_true")

    args = parser.parse_args()

    # Only forward the connection options that were actually given.
    params = {}
    if args.host:
        params["host"] = args.host
    if args.port:
        params["port"] = args.port

    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    r = get_redis(**params)
    # A connection is borrowed only to report the server host; hand it back
    # to the pool so it is not leaked for the lifetime of the process.
    conn = r.connection_pool.get_connection("info")
    try:
        host = conn.host
    finally:
        r.connection_pool.release(conn)
    logger.info("Waiting for items in '%s' (server: %s)", args.key, host)
    kwargs = {
        "keys": [args.key],
        "timeout": args.timeout,
        "limit": args.limit,
        "log_every": args.progress_every,
    }
    try:
        process_items(r, **kwargs)
        retcode = 0  # ok
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop an unbounded run.
        retcode = 0  # ok
    except Exception:
        logger.exception("Unhandled exception")
        retcode = 2

    return retcode
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
@@ -0,0 +1,2 @@
|
||||
scrapy
|
||||
scrapy-redis
|
||||
@@ -0,0 +1,11 @@
|
||||
# Automatically created by: scrapy startproject
|
||||
#
|
||||
# For more information about the [deploy] section see:
|
||||
# http://doc.scrapy.org/topics/scrapyd.html
|
||||
|
||||
[settings]
|
||||
default = example.settings
|
||||
|
||||
[deploy]
|
||||
#url = http://localhost:6800/
|
||||
project = example
|
||||
@@ -0,0 +1,125 @@
|
||||
[MASTER]
|
||||
persistent=no
|
||||
jobs=1 # >1 hides results
|
||||
suggestion-mode=yes # guess common misconfiguration and emit user-friendly hints
|
||||
py-version = 3.11.3
|
||||
|
||||
[MESSAGES CONTROL]
|
||||
disable=abstract-method,
|
||||
anomalous-backslash-in-string,
|
||||
arguments-differ,
|
||||
arguments-renamed,
|
||||
attribute-defined-outside-init,
|
||||
bad-classmethod-argument,
|
||||
bad-continuation,
|
||||
bad-indentation,
|
||||
bad-mcs-classmethod-argument,
|
||||
bad-super-call,
|
||||
bad-whitespace,
|
||||
bare-except,
|
||||
blacklisted-name,
|
||||
broad-except,
|
||||
c-extension-no-member,
|
||||
catching-non-exception,
|
||||
cell-var-from-loop,
|
||||
comparison-with-callable,
|
||||
consider-iterating-dictionary,
|
||||
consider-using-dict-items,
|
||||
consider-using-from-import,
|
||||
consider-using-in,
|
||||
consider-using-set-comprehension,
|
||||
consider-using-sys-exit,
|
||||
consider-using-with,
|
||||
cyclic-import,
|
||||
dangerous-default-value,
|
||||
deprecated-method,
|
||||
deprecated-module,
|
||||
duplicate-code, # https://github.com/PyCQA/pylint/issues/214
|
||||
eval-used,
|
||||
expression-not-assigned,
|
||||
fixme,
|
||||
function-redefined,
|
||||
global-statement,
|
||||
import-error,
|
||||
import-outside-toplevel,
|
||||
import-self,
|
||||
inconsistent-return-statements,
|
||||
inherit-non-class,
|
||||
invalid-name,
|
||||
invalid-overridden-method,
|
||||
isinstance-second-argument-not-valid-type,
|
||||
keyword-arg-before-vararg,
|
||||
line-too-long,
|
||||
logging-format-interpolation,
|
||||
logging-not-lazy,
|
||||
lost-exception,
|
||||
method-hidden,
|
||||
misplaced-comparison-constant,
|
||||
missing-docstring,
|
||||
missing-final-newline,
|
||||
multiple-imports,
|
||||
multiple-statements,
|
||||
no-else-continue,
|
||||
no-else-raise,
|
||||
no-else-return,
|
||||
no-init,
|
||||
no-member,
|
||||
no-method-argument,
|
||||
no-name-in-module,
|
||||
no-self-argument,
|
||||
no-self-use,
|
||||
no-value-for-parameter,
|
||||
not-an-iterable,
|
||||
not-callable,
|
||||
pointless-statement,
|
||||
pointless-string-statement,
|
||||
protected-access,
|
||||
raise-missing-from,
|
||||
redefined-argument-from-local,
|
||||
redefined-builtin,
|
||||
redefined-outer-name,
|
||||
reimported,
|
||||
signature-differs,
|
||||
singleton-comparison,
|
||||
super-init-not-called,
|
||||
super-with-arguments,
|
||||
superfluous-parens,
|
||||
too-few-public-methods,
|
||||
too-many-ancestors,
|
||||
too-many-arguments,
|
||||
too-many-branches,
|
||||
too-many-format-args,
|
||||
too-many-function-args,
|
||||
too-many-instance-attributes,
|
||||
too-many-lines,
|
||||
too-many-locals,
|
||||
too-many-public-methods,
|
||||
too-many-return-statements,
|
||||
trailing-newlines,
|
||||
trailing-whitespace,
|
||||
unbalanced-tuple-unpacking,
|
||||
undefined-variable,
|
||||
undefined-loop-variable,
|
||||
unexpected-special-method-signature,
|
||||
ungrouped-imports,
|
||||
unidiomatic-typecheck,
|
||||
unnecessary-comprehension,
|
||||
unnecessary-lambda,
|
||||
unnecessary-pass,
|
||||
unreachable,
|
||||
unspecified-encoding,
|
||||
unsupported-assignment-operation,
|
||||
unsubscriptable-object,
|
||||
unused-argument,
|
||||
unused-import,
|
||||
unused-private-member,
|
||||
unused-variable,
|
||||
unused-wildcard-import,
|
||||
use-implicit-booleaness-not-comparison,
|
||||
used-before-assignment,
|
||||
useless-object-inheritance, # Required for Python 2 support
|
||||
useless-return,
|
||||
useless-super-delegation,
|
||||
wildcard-import,
|
||||
wrong-import-order,
|
||||
wrong-import-position
|
||||
@@ -0,0 +1,11 @@
|
||||
[pytest]
|
||||
norecursedirs =
|
||||
.*
|
||||
dist
|
||||
build
|
||||
python_files =
|
||||
test_*.py
|
||||
*_test.py
|
||||
tests.py
|
||||
addopts =
|
||||
-rxEfsw -v
|
||||
@@ -0,0 +1,6 @@
|
||||
# These packages are required to run all the tests.
|
||||
flake8
|
||||
mock
|
||||
pytest>=6.0,<7
|
||||
pytest-cov
|
||||
tox>=4.0,<5
|
||||
@@ -0,0 +1,3 @@
|
||||
scrapy>=2.6.0
|
||||
redis>=4.2
|
||||
six>=1.15
|
||||
@@ -0,0 +1,6 @@
|
||||
[wheel]
|
||||
universal = 1
|
||||
|
||||
[flake8]
|
||||
exclude = docs, tests
|
||||
max-line-length = 120
|
||||
@@ -0,0 +1,59 @@
|
||||
#!/usr/bin/env python
|
||||
import io
|
||||
from pkgutil import walk_packages
|
||||
|
||||
from setuptools import setup
|
||||
|
||||
|
||||
def find_packages(path):
    """Return the names of the packages found under ``path``.

    Subpackages reported by ``walk_packages`` are included as well; plain
    modules are left out.
    """
    package_names = []
    for _finder, module_name, is_package in walk_packages([path]):
        if is_package:
            package_names.append(module_name)
    return package_names
|
||||
|
||||
|
||||
def read_file(filename):
    """Return the content of ``filename`` with surrounding whitespace removed.

    The file is always decoded as UTF-8 so the result does not depend on the
    locale of the machine that builds the package.
    """
    with open(filename, encoding="utf-8") as fp:
        return fp.read().strip()
|
||||
|
||||
|
||||
def read_rst(filename):
    """Return the text of ``filename`` without ``.. comment::`` lines.

    Such lines are dropped because PyPI does not support the directive.
    """
    text = read_file(filename)
    kept_lines = []
    for line in io.StringIO(text):
        if line.startswith(".. comment::"):
            continue
        kept_lines.append(line)
    return "".join(kept_lines)
|
||||
|
||||
|
||||
def read_requirements(filename):
    """Return the requirement specifiers listed in ``filename``.

    Blank lines and comment lines (including indented ones) are skipped, so
    no empty string ends up in ``install_requires``.
    """
    requirements = []
    for raw_line in read_file(filename).splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        requirements.append(line)
    return requirements
|
||||
|
||||
|
||||
setup(
    # Distribution metadata; the version is read from the VERSION file.
    name="scrapy-redis",
    version=read_file("VERSION"),
    description="Redis-based components for Scrapy.",
    # The PyPI long description is the README followed by the changelog.
    long_description=read_rst("README.rst") + "\n\n" + read_rst("HISTORY.rst"),
    author="R Max Espinoza",
    author_email="hey@rmax.dev",
    url="https://github.com/rmax/scrapy-redis",
    # Packages live under ``src``; ``find_packages`` already returns a list.
    packages=find_packages("src"),
    package_dir={"": "src"},
    install_requires=read_requirements("requirements.txt"),
    include_package_data=True,
    license="MIT",
    keywords="scrapy-redis",
    classifiers=[
        "Development Status :: 4 - Beta",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: MIT License",
        "Natural Language :: English",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
    ],
)
|
||||
@@ -0,0 +1,5 @@
|
||||
from .connection import get_redis, get_redis_from_settings # NOQA
|
||||
|
||||
__author__ = "R Max Espinoza"
|
||||
__email__ = "hey at rmax.dev"
|
||||
__version__ = "0.9.1"
|
||||
@@ -0,0 +1,97 @@
|
||||
from scrapy.utils.misc import load_object
|
||||
|
||||
from . import defaults
|
||||
|
||||
# Shortcut maps 'setting name' -> 'parameter name'.
SETTINGS_PARAMS_MAP = dict(
    REDIS_URL="url",
    REDIS_HOST="host",
    REDIS_PORT="port",
    REDIS_DB="db",
    REDIS_ENCODING="encoding",
    REDIS_DECODE_RESPONSES="decode_responses",
)
|
||||
|
||||
|
||||
def get_redis_from_settings(settings):
    """Build a redis client from a Scrapy settings object.

    The client parameters start as a copy of ``defaults.REDIS_PARAMS``, are
    updated with the ``REDIS_PARAMS`` setting and then with every truthy
    ``REDIS_*`` setting listed in ``SETTINGS_PARAMS_MAP``. The client itself
    is created by ``get_redis``.

    Parameters
    ----------
    settings : Settings
        A scrapy settings object. See the supported settings below.

    Returns
    -------
    server
        Redis client instance.

    Other Parameters
    ----------------
    REDIS_URL : str, optional
        Server connection URL.
    REDIS_HOST : str, optional
        Server host.
    REDIS_PORT : str, optional
        Server port.
    REDIS_DB : int, optional
        Server database.
    REDIS_ENCODING : str, optional
        Data encoding.
    REDIS_DECODE_RESPONSES : bool, optional
        Sets the ``decode_responses`` keyword argument of the client class.
    REDIS_PARAMS : dict, optional
        Additional client parameters.

    """
    params = {**defaults.REDIS_PARAMS, **settings.getdict("REDIS_PARAMS")}

    # XXX: Deprecate REDIS_* settings.
    for setting_name, param_name in SETTINGS_PARAMS_MAP.items():
        value = settings.get(setting_name)
        if not value:
            continue
        params[param_name] = value

    # Allow ``redis_cls`` to be a path to a class.
    redis_cls = params.get("redis_cls")
    if isinstance(redis_cls, str):
        params["redis_cls"] = load_object(redis_cls)

    return get_redis(**params)
|
||||
|
||||
|
||||
# Backwards compatible alias.
|
||||
from_settings = get_redis_from_settings
|
||||
|
||||
|
||||
def get_redis(**kwargs):
    """Create and return a redis client.

    Parameters
    ----------
    redis_cls : class, optional
        Client class to instantiate. Defaults to ``defaults.REDIS_CLS``.
    url : str, optional
        If given, ``redis_cls.from_url`` is used to instantiate the class.
    **kwargs
        Extra parameters to be passed to the ``redis_cls`` class.

    Returns
    -------
    server
        Redis client instance.

    """
    client_cls = kwargs.pop("redis_cls", defaults.REDIS_CLS)
    url = kwargs.pop("url", None)
    if not url:
        return client_cls(**kwargs)
    return client_cls.from_url(url, **kwargs)
|
||||
@@ -0,0 +1,29 @@
|
||||
import redis
|
||||
|
||||
# For standalone use.
# Key template formatted by ``RFPDupeFilter.from_settings`` with the current
# Unix time as ``timestamp``.
DUPEFILTER_KEY = "dupefilter:%(timestamp)s"

# Default ``key`` of ``RedisPipeline``; ``spider`` is replaced by the spider
# name when an item is stored.
PIPELINE_KEY = "%(spider)s:items"

# NOTE(review): presumably the per-spider key for stats; the consumer is not
# visible here, confirm before relying on it.
STATS_KEY = "%(spider)s:stats"

# Client class used by ``get_redis`` when no ``redis_cls`` is given.
REDIS_CLS = redis.StrictRedis
REDIS_ENCODING = "utf-8"
# Sane connection defaults.
# ``get_redis_from_settings`` copies this dict as the base of the client
# parameters.
REDIS_PARAMS = {
    "socket_timeout": 30,
    "socket_connect_timeout": 30,
    "retry_on_timeout": True,
    "encoding": REDIS_ENCODING,
}
REDIS_CONCURRENT_REQUESTS = 16

# Scheduler defaults. ``SCHEDULER_DUPEFILTER_KEY`` is the fallback key
# template of ``RFPDupeFilter.from_spider``.
SCHEDULER_QUEUE_KEY = "%(spider)s:requests"
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.PriorityQueue"
SCHEDULER_DUPEFILTER_KEY = "%(spider)s:dupefilter"
SCHEDULER_DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER_PERSIST = False
# NOTE(review): start-URL and idle defaults; their consumers are not visible
# here.
START_URLS_KEY = "%(name)s:start_urls"
START_URLS_AS_SET = False
START_URLS_AS_ZSET = False
MAX_IDLE_TIME = 0
|
||||
@@ -0,0 +1,169 @@
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
|
||||
from scrapy.dupefilters import BaseDupeFilter
|
||||
from scrapy.utils.python import to_unicode
|
||||
from w3lib.url import canonicalize_url
|
||||
|
||||
from . import defaults
|
||||
from .connection import get_redis_from_settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# TODO: Rename class to RedisDupeFilter.
|
||||
class RFPDupeFilter(BaseDupeFilter):
    """Request duplicates filter that keeps its fingerprints in redis.

    It can also be combined with Scrapy's default scheduler.

    """

    logger = logger

    def __init__(self, server, key, debug=False):
        """Set up the duplicates filter.

        Parameters
        ----------
        server : redis.StrictRedis
            Redis client used to store the fingerprints.
        key : str
            Redis key under which the fingerprints are stored.
        debug : bool, optional
            Whether to log every filtered request.

        """
        self.server = server
        self.key = key
        self.debug = debug
        # Switched off by ``log`` after the first non-debug message.
        self.logdupes = True

    @classmethod
    def from_settings(cls, settings):
        """Create a filter from Scrapy settings.

        The key defaults to ``dupefilter:<timestamp>``. The
        ``scrapy_redis.scheduler.Scheduler`` class does not use this method
        because it needs the spider name in the key.

        Parameters
        ----------
        settings : scrapy.settings.Settings

        Returns
        -------
        RFPDupeFilter
            A RFPDupeFilter instance.

        """
        # XXX: This creates one-time key. needed to support to use this
        # class as standalone dupefilter with scrapy's default scheduler
        # if scrapy passes spider on open() method this wouldn't be needed
        # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
        return cls(
            get_redis_from_settings(settings),
            key=defaults.DUPEFILTER_KEY % {"timestamp": int(time.time())},
            debug=settings.getbool("DUPEFILTER_DEBUG"),
        )

    @classmethod
    def from_crawler(cls, crawler):
        """Create a filter from the settings of ``crawler``.

        Parameters
        ----------
        crawler : scrapy.crawler.Crawler

        Returns
        -------
        RFPDupeFilter
            Instance of RFPDupeFilter.

        """
        crawler_settings = crawler.settings
        return cls.from_settings(crawler_settings)

    def request_seen(self, request):
        """Record ``request`` and tell whether it had been seen before.

        Parameters
        ----------
        request : scrapy.http.Request

        Returns
        -------
        bool
            True if the fingerprint was already stored.

        """
        fingerprint = self.request_fingerprint(request)
        # ``sadd`` reports how many values were added: zero means the
        # fingerprint already existed.
        return self.server.sadd(self.key, fingerprint) == 0

    def request_fingerprint(self, request):
        """Compute the fingerprint of ``request``.

        The SHA-1 hex digest is taken over a key-sorted JSON document made of
        the request method, the canonicalized URL and the hex-encoded body.

        Parameters
        ----------
        request : scrapy.http.Request

        Returns
        -------
        str

        """
        payload = json.dumps(
            {
                "method": to_unicode(request.method),
                "url": canonicalize_url(request.url),
                "body": (request.body or b"").hex(),
            },
            sort_keys=True,
        )
        return hashlib.sha1(payload.encode()).hexdigest()

    @classmethod
    def from_spider(cls, spider):
        """Create a filter whose key contains the name of ``spider``.

        The key template comes from the ``SCHEDULER_DUPEFILTER_KEY`` setting
        and falls back to ``defaults.SCHEDULER_DUPEFILTER_KEY``.
        """
        spider_settings = spider.settings
        client = get_redis_from_settings(spider_settings)
        key_template = spider_settings.get(
            "SCHEDULER_DUPEFILTER_KEY", defaults.SCHEDULER_DUPEFILTER_KEY
        )
        return cls(
            client,
            key=key_template % {"spider": spider.name},
            debug=spider_settings.getbool("DUPEFILTER_DEBUG"),
        )

    def close(self, reason=""):
        """Delete data on close. Called by Scrapy's scheduler.

        Parameters
        ----------
        reason : str, optional

        """
        self.clear()

    def clear(self):
        """Delete the redis key that holds the fingerprints."""
        self.server.delete(self.key)

    def log(self, request, spider):
        """Log a filtered request.

        In debug mode every call is logged. Otherwise only the first call is
        logged, after which ``logdupes`` is switched off.

        Parameters
        ----------
        request : scrapy.http.Request
        spider : scrapy.spiders.Spider

        """
        if not self.debug and not self.logdupes:
            return

        only_first = not self.debug
        if only_first:
            msg = (
                "Filtered duplicate request %(request)s"
                " - no more duplicates will be shown"
                " (see DUPEFILTER_DEBUG to show all duplicates)"
            )
        else:
            msg = "Filtered duplicate request: %(request)s"

        self.logger.debug(msg, {"request": request}, extra={"spider": spider})
        if only_first:
            self.logdupes = False
|
||||
@@ -0,0 +1,14 @@
|
||||
"""A pickle wrapper module with protocol=-1 by default."""
|
||||
|
||||
try:
|
||||
import cPickle as pickle # PY2
|
||||
except ImportError:
|
||||
import pickle
|
||||
|
||||
|
||||
def loads(s):
    """Deserialize the pickled bytes ``s`` and return the resulting object.

    NOTE(review): unpickling can execute arbitrary code, so ``s`` must come
    from a trusted producer.
    """
    obj = pickle.loads(s)
    return obj
|
||||
|
||||
|
||||
def dumps(obj):
    """Pickle ``obj`` with the highest protocol available and return bytes."""
    # A negative protocol number selects the highest protocol as well; the
    # named constant spells that out.
    return pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)
|
||||
@@ -0,0 +1,73 @@
|
||||
from scrapy.utils.misc import load_object
|
||||
from scrapy.utils.serialize import ScrapyJSONEncoder
|
||||
from twisted.internet.threads import deferToThread
|
||||
|
||||
from . import connection, defaults
|
||||
|
||||
default_serialize = ScrapyJSONEncoder().encode
|
||||
|
||||
|
||||
class RedisPipeline:
    """Item pipeline that appends each serialized item to a redis list.

    Settings
    --------
    REDIS_ITEMS_KEY : str
        Redis key where to store items.
    REDIS_ITEMS_SERIALIZER : str
        Object path to serializer function.

    """

    def __init__(
        self, server, key=defaults.PIPELINE_KEY, serialize_func=default_serialize
    ):
        """Set up the pipeline.

        Parameters
        ----------
        server : StrictRedis
            Redis client instance.
        key : str
            Key template of the list receiving the items; ``%(spider)s`` is
            replaced by the spider name.
        serialize_func : callable
            Function that turns an item into the value pushed to redis.

        """
        self.server = server
        self.key = key
        self.serialize = serialize_func

    @classmethod
    def from_settings(cls, settings):
        """Create a pipeline from Scrapy settings.

        ``REDIS_ITEMS_KEY`` and ``REDIS_ITEMS_SERIALIZER`` override the
        constructor defaults only when they are set to a truthy value.
        """
        init_kwargs = {"server": connection.from_settings(settings)}

        items_key = settings.get("REDIS_ITEMS_KEY")
        if items_key:
            init_kwargs["key"] = items_key

        serializer_path = settings.get("REDIS_ITEMS_SERIALIZER")
        if serializer_path:
            init_kwargs["serialize_func"] = load_object(serializer_path)

        return cls(**init_kwargs)

    @classmethod
    def from_crawler(cls, crawler):
        """Create a pipeline from the settings of ``crawler``."""
        crawler_settings = crawler.settings
        return cls.from_settings(crawler_settings)

    def process_item(self, item, spider):
        """Run ``_process_item`` in a thread and return the Deferred."""
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        """Serialize ``item``, push it to the end of its list, return it."""
        target_key = self.item_key(item, spider)
        payload = self.serialize(item)
        self.server.rpush(target_key, payload)
        return item

    def item_key(self, item, spider):
        """Return the redis key for ``item``, built from the spider name.

        Override this function to use a different key depending on the item
        and/or spider.

        """
        return self.key % {"spider": spider.name}
|
||||
@@ -0,0 +1,155 @@
|
||||
try:
|
||||
from scrapy.utils.request import request_from_dict
|
||||
except ImportError:
|
||||
from scrapy.utils.reqser import request_to_dict, request_from_dict
|
||||
|
||||
from . import picklecompat
|
||||
|
||||
|
||||
class Base:
    """Per-spider base queue class"""

    def __init__(self, server, spider, key, serializer=None):
        """Initialize per-spider redis queue.

        Parameters
        ----------
        server : StrictRedis
            Redis client instance.
        spider : Spider
            Scrapy spider instance.
        key: str
            Redis key where to put and get messages. It is interpolated
            with ``{"spider": spider.name}``.
        serializer : object
            Serializer object with ``loads`` and ``dumps`` methods.

        Raises
        ------
        TypeError
            If the serializer lacks a ``loads`` or ``dumps`` attribute.

        """
        if serializer is None:
            # Backward compatibility.
            # TODO: deprecate pickle.
            serializer = picklecompat

        for required in ("loads", "dumps"):
            if not hasattr(serializer, required):
                raise TypeError(
                    f"serializer does not implement '{required}' function: {serializer}"
                )

        self.server = server
        self.spider = spider
        self.key = key % {"spider": spider.name}
        self.serializer = serializer

    def _encode_request(self, request):
        """Encode a request object"""
        # Look the method up instead of catching AttributeError around the
        # call, so an AttributeError raised *inside* ``to_dict`` is not
        # mistaken for "method missing".
        to_dict = getattr(request, "to_dict", None)
        if to_dict is not None:
            obj = to_dict(spider=self.spider)
        else:
            # Request objects without ``to_dict``: use the legacy helper.
            # Imported here because the module-level import only provides
            # it when ``scrapy.utils.request`` is unavailable.
            from scrapy.utils.reqser import request_to_dict

            obj = request_to_dict(request, self.spider)
        return self.serializer.dumps(obj)

    def _decode_request(self, encoded_request):
        """Decode a request previously encoded"""
        obj = self.serializer.loads(encoded_request)
        return request_from_dict(obj, spider=self.spider)

    def __len__(self):
        """Return the length of the queue"""
        raise NotImplementedError

    def push(self, request):
        """Push a request"""
        raise NotImplementedError

    def pop(self, timeout=0):
        """Pop a request"""
        raise NotImplementedError

    def clear(self):
        """Clear queue/stack"""
        self.server.delete(self.key)
|
||||
|
||||
|
||||
class FifoQueue(Base):
    """Per-spider FIFO queue"""

    def __len__(self):
        """Return the number of entries in the redis list."""
        return self.server.llen(self.key)

    def push(self, request):
        """Encode ``request`` and push it on the left end of the list."""
        encoded = self._encode_request(request)
        self.server.lpush(self.key, encoded)

    def pop(self, timeout=0):
        """Pop a request from the right end of the list.

        With ``timeout > 0`` the blocking ``brpop`` is used; otherwise a
        plain ``rpop``. Returns ``None`` when nothing was popped.
        """
        if timeout > 0:
            popped = self.server.brpop(self.key, timeout)
            # A tuple result carries the payload as its second element.
            payload = popped[1] if isinstance(popped, tuple) else popped
        else:
            payload = self.server.rpop(self.key)

        if not payload:
            return None
        return self._decode_request(payload)
|
||||
|
||||
|
||||
class PriorityQueue(Base):
    """Per-spider priority queue abstraction using redis' sorted set"""

    def __len__(self):
        """Return the number of members in the sorted set."""
        return self.server.zcard(self.key)

    def push(self, request):
        """Encode ``request`` and add it with score ``-request.priority``."""
        encoded = self._encode_request(request)
        # We don't use zadd method as the order of arguments change depending on
        # whether the class is Redis or StrictRedis, and the option of using
        # kwargs only accepts strings, not bytes.
        self.server.execute_command("ZADD", self.key, -request.priority, encoded)

    def pop(self, timeout=0):
        """
        Pop the member with the lowest score.

        ``timeout`` is accepted for interface compatibility but is not
        supported by this queue class.
        """
        # use atomic range/remove using multi/exec
        pipe = self.server.pipeline()
        pipe.multi()
        pipe.zrange(self.key, 0, 0).zremrangebyrank(self.key, 0, 0)
        members, _removed = pipe.execute()
        if not members:
            return None
        return self._decode_request(members[0])
|
||||
|
||||
|
||||
class LifoQueue(Base):
    """Per-spider LIFO queue."""

    def __len__(self):
        """Return the number of entries in the redis list."""
        return self.server.llen(self.key)

    def push(self, request):
        """Encode ``request`` and push it on the left end of the list."""
        encoded = self._encode_request(request)
        self.server.lpush(self.key, encoded)

    def pop(self, timeout=0):
        """Pop a request from the left end of the list.

        With ``timeout > 0`` the blocking ``blpop`` is used; otherwise a
        plain ``lpop``. Returns ``None`` when nothing was popped.
        """
        if timeout > 0:
            popped = self.server.blpop(self.key, timeout)
            # A tuple result carries the payload as its second element.
            payload = popped[1] if isinstance(popped, tuple) else popped
        else:
            payload = self.server.lpop(self.key)

        if not payload:
            return None
        return self._decode_request(payload)
|
||||
|
||||
|
||||
# TODO: Deprecate the use of these names.
|
||||
SpiderQueue = FifoQueue
|
||||
SpiderStack = LifoQueue
|
||||
SpiderPriorityQueue = PriorityQueue
|
||||
@@ -0,0 +1,182 @@
|
||||
import importlib
|
||||
|
||||
from scrapy.utils.misc import load_object
|
||||
|
||||
from . import connection, defaults
|
||||
|
||||
|
||||
# TODO: add SCRAPY_JOB support.
|
||||
class Scheduler:
    """Redis-based scheduler

    Settings
    --------
    SCHEDULER_PERSIST : bool (default: False)
        Whether to persist or clear redis queue.
    SCHEDULER_FLUSH_ON_START : bool (default: False)
        Whether to flush redis queue on start.
    SCHEDULER_IDLE_BEFORE_CLOSE : int (default: 0)
        How many seconds to wait before closing if no message is received.
    SCHEDULER_QUEUE_KEY : str
        Scheduler redis key.
    SCHEDULER_QUEUE_CLASS : str
        Scheduler queue class.
    SCHEDULER_DUPEFILTER_KEY : str
        Scheduler dupefilter redis key.
    DUPEFILTER_CLASS : str
        Scheduler dupefilter class.
    SCHEDULER_SERIALIZER : str
        Scheduler serializer.

    """

    def __init__(
        self,
        server,
        persist=False,
        flush_on_start=False,
        queue_key=defaults.SCHEDULER_QUEUE_KEY,
        queue_cls=defaults.SCHEDULER_QUEUE_CLASS,
        dupefilter=None,
        dupefilter_key=defaults.SCHEDULER_DUPEFILTER_KEY,
        dupefilter_cls=defaults.SCHEDULER_DUPEFILTER_CLASS,
        idle_before_close=0,
        serializer=None,
    ):
        """Initialize scheduler.

        Parameters
        ----------
        server : Redis
            The redis server instance.
        persist : bool
            Whether to keep the queue and dupefilter data when closing.
            When False (the default) they are flushed on close.
        flush_on_start : bool
            Whether to flush requests on start. Default is False.
        queue_key : str
            Requests queue key.
        queue_cls : str
            Importable path to the queue class.
        dupefilter: Dupefilter
            Custom dupefilter instance.
        dupefilter_key : str
            Duplicates filter key.
        dupefilter_cls : str
            Importable path to the dupefilter class.
        idle_before_close : int
            Timeout before giving up.
        serializer : object
            Serializer handed to the queue class; ``None`` lets the queue
            pick its own default.

        Raises
        ------
        TypeError
            If ``idle_before_close`` is negative.

        """
        if idle_before_close < 0:
            raise TypeError("idle_before_close cannot be negative")

        self.server = server
        self.persist = persist
        self.flush_on_start = flush_on_start
        self.queue_key = queue_key
        self.queue_cls = queue_cls
        self.df = dupefilter
        self.dupefilter_cls = dupefilter_cls
        self.dupefilter_key = dupefilter_key
        self.idle_before_close = idle_before_close
        self.serializer = serializer
        self.stats = None

    def __len__(self):
        return len(self.queue)

    @classmethod
    def from_settings(cls, settings):
        """Build a scheduler from settings and verify the redis connection."""
        kwargs = {
            "persist": settings.getbool("SCHEDULER_PERSIST"),
            "flush_on_start": settings.getbool("SCHEDULER_FLUSH_ON_START"),
            "idle_before_close": settings.getint("SCHEDULER_IDLE_BEFORE_CLOSE"),
        }

        # If these values are missing, it means we want to use the defaults.
        optional = {
            # TODO: Use custom prefixes for this settings to note that are
            # specific to scrapy-redis.
            "queue_key": "SCHEDULER_QUEUE_KEY",
            "queue_cls": "SCHEDULER_QUEUE_CLASS",
            "dupefilter_key": "SCHEDULER_DUPEFILTER_KEY",
            # We use the default setting name to keep compatibility.
            "dupefilter_cls": "DUPEFILTER_CLASS",
            "serializer": "SCHEDULER_SERIALIZER",
        }
        for name, setting_name in optional.items():
            val = settings.get(setting_name)
            if val:
                kwargs[name] = val

        # Fall back to the constructor default when DUPEFILTER_CLASS is not
        # set, instead of failing with a KeyError.
        dupefilter_cls = load_object(
            kwargs.get("dupefilter_cls", defaults.SCHEDULER_DUPEFILTER_CLASS)
        )
        # Dupefilters without ``from_spider`` are created right away;
        # the others are created per spider in ``open``.
        if not hasattr(dupefilter_cls, "from_spider"):
            kwargs["dupefilter"] = dupefilter_cls.from_settings(settings)

        # Support serializer as a path to a module.
        if isinstance(kwargs.get("serializer"), str):
            kwargs["serializer"] = importlib.import_module(kwargs["serializer"])

        server = connection.from_settings(settings)
        # Ensure the connection is working.
        server.ping()

        return cls(server=server, **kwargs)

    @classmethod
    def from_crawler(cls, crawler):
        """Build a scheduler from the crawler's settings and attach its stats."""
        instance = cls.from_settings(crawler.settings)
        # FIXME: for now, stats are only supported from this constructor
        instance.stats = crawler.stats
        return instance

    def open(self, spider):
        """Create the queue (and the dupefilter if needed) for ``spider``.

        Raises
        ------
        ValueError
            If the queue class cannot be instantiated.
        """
        self.spider = spider

        try:
            self.queue = load_object(self.queue_cls)(
                server=self.server,
                spider=spider,
                key=self.queue_key % {"spider": spider.name},
                serializer=self.serializer,
            )
        except TypeError as e:
            # Keep the original error as the cause for easier debugging.
            raise ValueError(
                f"Failed to instantiate queue class '{self.queue_cls}': {e}"
            ) from e

        if not self.df:
            self.df = load_object(self.dupefilter_cls).from_spider(spider)

        if self.flush_on_start:
            self.flush()

        # notice if there are requests already in the queue to resume the crawl
        pending = len(self.queue)
        if pending:
            spider.log(f"Resuming crawl ({pending} requests scheduled)")

    def close(self, reason):
        """Flush queue and dupefilter unless persistence is enabled."""
        if not self.persist:
            self.flush()

    def flush(self):
        """Clear the dupefilter and the requests queue."""
        self.df.clear()
        self.queue.clear()

    def enqueue_request(self, request):
        """Push ``request`` unless the dupefilter has already seen it.

        Returns False for a filtered duplicate, True otherwise.
        """
        if not request.dont_filter and self.df.request_seen(request):
            self.df.log(request, self.spider)
            return False
        if self.stats:
            self.stats.inc_value("scheduler/enqueued/redis", spider=self.spider)
        self.queue.push(request)
        return True

    def next_request(self):
        """Pop the next request, passing ``idle_before_close`` as the pop timeout."""
        block_pop_timeout = self.idle_before_close
        request = self.queue.pop(block_pop_timeout)
        if request and self.stats:
            self.stats.inc_value("scheduler/dequeued/redis", spider=self.spider)
        return request

    def has_pending_requests(self):
        """Return True when the queue is not empty."""
        return len(self) > 0
|
||||
@@ -0,0 +1,297 @@
|
||||
import json
|
||||
import time
|
||||
from collections.abc import Iterable
|
||||
|
||||
from scrapy import FormRequest, signals
|
||||
from scrapy import version_info as scrapy_version
|
||||
from scrapy.exceptions import DontCloseSpider
|
||||
from scrapy.spiders import CrawlSpider, Spider
|
||||
|
||||
from scrapy_redis.utils import TextColor
|
||||
|
||||
from . import connection, defaults
|
||||
from .utils import bytes_to_str, is_dict
|
||||
|
||||
|
||||
class RedisMixin:
    """Mixin class to implement reading urls from a redis queue."""

    redis_key = None
    redis_batch_size = None
    redis_encoding = None

    # Redis client placeholder.
    server = None

    # Idle start time. This class-level value is evaluated at import time;
    # ``setup_redis`` resets it per instance.
    spider_idle_start_time = int(time.time())
    max_idle_time = None

    def start_requests(self):
        """Returns a batch of start requests from redis."""
        return self.next_requests()

    def setup_redis(self, crawler=None):
        """Setup redis connection and idle signal.

        This should be called after the spider has set its crawler object.
        Does nothing when a server is already set.

        Raises
        ------
        ValueError
            If no crawler is available, the redis key is blank, or
            ``redis_batch_size`` / ``max_idle_time`` is not an integer.
        """
        if self.server is not None:
            return

        if crawler is None:
            # We allow optional crawler argument to keep backwards
            # compatibility.
            # XXX: Raise a deprecation warning.
            crawler = getattr(self, "crawler", None)

        if crawler is None:
            raise ValueError("crawler is required")

        settings = crawler.settings

        if self.redis_key is None:
            self.redis_key = settings.get(
                "REDIS_START_URLS_KEY",
                defaults.START_URLS_KEY,
            )

        self.redis_key = self.redis_key % {"name": self.name}

        if not self.redis_key.strip():
            raise ValueError("redis_key must not be empty")

        if self.redis_batch_size is None:
            self.redis_batch_size = settings.getint(
                "CONCURRENT_REQUESTS", defaults.REDIS_CONCURRENT_REQUESTS
            )

        try:
            self.redis_batch_size = int(self.redis_batch_size)
        except (TypeError, ValueError) as err:
            raise ValueError("redis_batch_size must be an integer") from err

        if self.redis_encoding is None:
            self.redis_encoding = settings.get(
                "REDIS_ENCODING", defaults.REDIS_ENCODING
            )

        # Pass the values explicitly: ``self.__dict__`` lacks any attribute
        # that is only defined on the class (e.g. a subclass-level
        # ``redis_encoding``), which would break the log formatting.
        self.logger.info(
            "Reading start URLs from redis key '%(redis_key)s' "
            "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s)",
            {
                "redis_key": self.redis_key,
                "redis_batch_size": self.redis_batch_size,
                "redis_encoding": self.redis_encoding,
            },
        )

        self.server = connection.from_settings(crawler.settings)

        if settings.getbool("REDIS_START_URLS_AS_SET", defaults.START_URLS_AS_SET):
            self.fetch_data = self.server.spop
            self.count_size = self.server.scard
        elif settings.getbool("REDIS_START_URLS_AS_ZSET", defaults.START_URLS_AS_ZSET):
            self.fetch_data = self.pop_priority_queue
            self.count_size = self.server.zcard
        else:
            self.fetch_data = self.pop_list_queue
            self.count_size = self.server.llen

        if self.max_idle_time is None:
            self.max_idle_time = settings.get(
                "MAX_IDLE_TIME_BEFORE_CLOSE", defaults.MAX_IDLE_TIME
            )

        try:
            self.max_idle_time = int(self.max_idle_time)
        except (TypeError, ValueError) as err:
            raise ValueError("max_idle_time must be an integer") from err

        # Start the idle clock now rather than at module import time.
        self.spider_idle_start_time = int(time.time())

        # The idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from redis queue
        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)

    def pop_list_queue(self, redis_key, batch_size):
        """Read up to ``batch_size`` entries from the head of a list and trim them off."""
        with self.server.pipeline() as pipe:
            pipe.lrange(redis_key, 0, batch_size - 1)
            pipe.ltrim(redis_key, batch_size, -1)
            datas, _ = pipe.execute()
        return datas

    def pop_priority_queue(self, redis_key, batch_size):
        """Read up to ``batch_size`` highest-scored members of a sorted set and remove them."""
        with self.server.pipeline() as pipe:
            pipe.zrevrange(redis_key, 0, batch_size - 1)
            pipe.zremrangebyrank(redis_key, -batch_size, -1)
            datas, _ = pipe.execute()
        return datas

    def next_requests(self):
        """Yield requests built from one batch of messages fetched from redis.

        Each message is turned into a request (or an iterable of requests)
        by :meth:`make_request_from_data`; messages that produce nothing
        are logged at debug level.
        """
        # XXX: Do we need to use a timeout here?
        found = 0
        datas = self.fetch_data(self.redis_key, self.redis_batch_size)
        for data in datas:
            reqs = self.make_request_from_data(data)
            if isinstance(reqs, Iterable):
                for req in reqs:
                    yield req
                    # XXX: should be here?
                    found += 1
                    self.logger.info(f"start req url:{req.url}")
            elif reqs:
                yield reqs
                found += 1
            else:
                self.logger.debug(f"Request not made from data: {data}")

        if found:
            self.logger.debug(f"Read {found} requests from '{self.redis_key}'")

    def make_request_from_data(self, data):
        """Returns a `Request` instance for data coming from Redis.

        The message is expected to be a JSON object that contains `url`
        plus optional `meta`, `method` and further keys. The remaining
        keys are sent as `formdata` of a `FormRequest`.

        For example:

        .. code:: json

            {
                "url": "https://example.com",
                "meta": {
                    "job-id":"123xsd",
                    "start-date":"dd/mm/yy",
                },
                "url_cookie_key":"fertxsas",
                "method":"POST",
            }

        If `url` is missing, return `[]`. So you should verify the `url` in the data.
        If `method` is missing, the request method is 'GET'.
        If `meta` is missing, `meta` is an empty dictionary.

        A message that is not a JSON object (a plain URL string, or any
        other JSON value) is treated as a bare URL; this legacy form is
        deprecated and logs a warning.

        Parameters
        ----------
        data : bytes
            Message from redis.

        """
        formatted_data = bytes_to_str(data, self.redis_encoding)

        # Parse once and require a JSON *object*: valid JSON such as
        # ``123`` or ``[1, 2]`` has no ``.get`` and must not reach the
        # dictionary handling below.
        try:
            parameter = json.loads(formatted_data)
        except ValueError:  # includes json.JSONDecodeError
            parameter = None

        if not isinstance(parameter, dict):
            self.logger.warning(
                f"{TextColor.WARNING}WARNING: String request is deprecated, please use JSON data format. "
                f"Detail information, please check https://github.com/rmax/scrapy-redis#features{TextColor.ENDC}"
            )
            return FormRequest(formatted_data, dont_filter=True)

        if parameter.get("url", None) is None:
            self.logger.warning(
                f"{TextColor.WARNING}The data from Redis has no url key in push data{TextColor.ENDC}"
            )
            return []

        url = parameter.pop("url")
        method = parameter.pop("method").upper() if "method" in parameter else "GET"
        metadata = parameter.pop("meta") if "meta" in parameter else {}

        return FormRequest(
            url, dont_filter=True, method=method, formdata=parameter, meta=metadata
        )

    def schedule_next_requests(self):
        """Schedules a request if available"""
        # TODO: While there is capacity, schedule a batch of redis requests.
        for req in self.next_requests():
            # see https://github.com/scrapy/scrapy/issues/5994
            if scrapy_version >= (2, 6):
                self.crawler.engine.crawl(req)
            else:
                self.crawler.engine.crawl(req, spider=self)

    def spider_idle(self):
        """
        Schedules a request if available, otherwise waits.
        or close spider when waiting seconds > MAX_IDLE_TIME_BEFORE_CLOSE.
        MAX_IDLE_TIME_BEFORE_CLOSE will not affect SCHEDULER_IDLE_BEFORE_CLOSE.
        """
        # Pending data in redis restarts the idle clock.
        if self.server is not None and self.count_size(self.redis_key) > 0:
            self.spider_idle_start_time = int(time.time())

        self.schedule_next_requests()

        idle_time = int(time.time()) - self.spider_idle_start_time
        # ``max_idle_time == 0`` disables the limit. Returning without
        # raising lets the spider close.
        if self.max_idle_time != 0 and idle_time >= self.max_idle_time:
            return
        raise DontCloseSpider
|
||||
|
||||
|
||||
class RedisSpider(RedisMixin, Spider):
    """Spider that reads urls from redis queue when idle.

    Attributes
    ----------
    redis_key : str (default: REDIS_START_URLS_KEY)
        Redis key where to fetch start URLs from.
    redis_batch_size : int (default: CONCURRENT_REQUESTS)
        Number of messages to fetch from redis on each attempt.
    redis_encoding : str (default: REDIS_ENCODING)
        Encoding to use when decoding messages from redis queue.

    Settings
    --------
    REDIS_START_URLS_KEY : str (default: "<spider.name>:start_urls")
        Default Redis key where to fetch start URLs from.
    REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
        Default number of messages to fetch from redis on each attempt.
    REDIS_START_URLS_AS_SET : bool (default: ``defaults.START_URLS_AS_SET``)
        Use SET operations to retrieve messages from the redis queue. If
        False, the messages are retrieved with list (or sorted set)
        operations instead.
    REDIS_ENCODING : str (default: "utf-8")
        Default encoding to use when decoding messages from redis queue.

    """

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider and set up its redis connection."""
        spider = super().from_crawler(crawler, *args, **kwargs)
        spider.setup_redis(crawler)
        return spider
|
||||
|
||||
|
||||
class RedisCrawlSpider(RedisMixin, CrawlSpider):
    """Crawl spider that reads urls from redis queue when idle.

    Attributes
    ----------
    redis_key : str (default: REDIS_START_URLS_KEY)
        Redis key where to fetch start URLs from.
    redis_batch_size : int (default: CONCURRENT_REQUESTS)
        Number of messages to fetch from redis on each attempt.
    redis_encoding : str (default: REDIS_ENCODING)
        Encoding to use when decoding messages from redis queue.

    Settings
    --------
    REDIS_START_URLS_KEY : str (default: "<spider.name>:start_urls")
        Default Redis key where to fetch start URLs from.
    REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
        Default number of messages to fetch from redis on each attempt.
    REDIS_START_URLS_AS_SET : bool (default: ``defaults.START_URLS_AS_SET``)
        Use SET operations to retrieve messages from the redis queue.
    REDIS_ENCODING : str (default: "utf-8")
        Default encoding to use when decoding messages from redis queue.

    """

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider and set up its redis connection."""
        spider = super().from_crawler(crawler, *args, **kwargs)
        spider.setup_redis(crawler)
        return spider
|
||||
@@ -0,0 +1,90 @@
|
||||
from datetime import datetime
|
||||
|
||||
from scrapy.statscollectors import StatsCollector
|
||||
|
||||
from .connection import from_settings as redis_from_settings
|
||||
from .defaults import SCHEDULER_PERSIST, STATS_KEY
|
||||
from .utils import convert_bytes_to_str
|
||||
|
||||
|
||||
class RedisStatsCollector(StatsCollector):
    """
    Stats Collector based on Redis

    Every stat is a field of one redis hash whose name is derived from
    ``STATS_KEY`` and the spider name.
    """

    def __init__(self, crawler, spider=None):
        super().__init__(crawler)
        self.server = redis_from_settings(crawler.settings)
        self.spider = spider
        self.spider_name = spider.name if spider else crawler.spidercls.name
        self.stats_key = crawler.settings.get("STATS_KEY", STATS_KEY)
        # ``getbool`` so that string values such as "False" or "0" are
        # not treated as truthy.
        self.persist = crawler.settings.getbool("SCHEDULER_PERSIST", SCHEDULER_PERSIST)

    def _get_key(self, spider=None):
        """Return the hash name of stats"""
        if spider:
            return self.stats_key % {"spider": spider.name}
        if self.spider:
            return self.stats_key % {"spider": self.spider.name}
        return self.stats_key % {"spider": self.spider_name or "scrapy"}

    @classmethod
    def from_crawler(cls, crawler):
        """Build a collector from a crawler."""
        return cls(crawler)

    @classmethod
    def from_spider(cls, spider):
        """Build a collector bound to ``spider`` and its crawler."""
        return cls(spider.crawler, spider=spider)

    def get_value(self, key, default=None, spider=None):
        """Return the value of hash stats

        Integers are returned as ``int``. Other numeric strings (for
        example the float timestamps written by :meth:`set_value`) are
        returned as ``float``, and anything else as decoded text.
        ``default`` is returned when the field does not exist.
        """
        raw = self.server.hget(self._get_key(spider), key)
        if raw is None:
            return default
        try:
            return int(raw)
        except ValueError:
            pass
        try:
            return float(raw)
        except ValueError:
            return convert_bytes_to_str(raw)

    def get_stats(self, spider=None):
        """Return the all of the values of hash stats"""
        stats = self.server.hgetall(self._get_key(spider))
        if stats:
            return convert_bytes_to_str(stats)
        return {}

    def set_value(self, key, value, spider=None):
        """Set the value according to hash key of stats"""
        # Datetimes are stored as POSIX timestamps.
        if isinstance(value, datetime):
            value = value.timestamp()
        self.server.hset(self._get_key(spider), key, value)

    def set_stats(self, stats, spider=None):
        """Set all the hash stats"""
        self.server.hmset(self._get_key(spider), stats)

    def inc_value(self, key, count=1, start=0, spider=None):
        """Set increment of value according to key"""
        stats_hash = self._get_key(spider)
        if not self.server.hexists(stats_hash, key):
            # Initialise the field in the same hash that is incremented.
            self.set_value(key, start, spider=spider)
        self.server.hincrby(stats_hash, key, count)

    def max_value(self, key, value, spider=None):
        """Set max value between current and new value"""
        current = self.get_value(key, value, spider=spider)
        self.set_value(key, max(current, value), spider=spider)

    def min_value(self, key, value, spider=None):
        """Set min value between current and new value"""
        current = self.get_value(key, value, spider=spider)
        self.set_value(key, min(current, value), spider=spider)

    def clear_stats(self, spider=None):
        """Clear all the hash stats"""
        self.server.delete(self._get_key(spider))

    def open_spider(self, spider):
        """Set spider to self"""
        if spider:
            self.spider = spider

    def close_spider(self, spider, reason):
        """Clear spider and clear stats"""
        self.spider = None
        if not self.persist:
            self.clear_stats(spider)
|
||||
@@ -0,0 +1,44 @@
|
||||
import json
|
||||
from json import JSONDecodeError
|
||||
|
||||
import six
|
||||
|
||||
|
||||
class TextColor:
    """ANSI escape sequences used to decorate console messages."""

    # Foreground colours.
    HEADER = "\033[95m"
    OKBLUE = "\033[94m"
    OKCYAN = "\033[96m"
    OKGREEN = "\033[92m"
    WARNING = "\033[93m"
    FAIL = "\033[91m"

    # Reset every attribute.
    ENDC = "\033[0m"

    # Text attributes.
    BOLD = "\033[1m"
    UNDERLINE = "\033[4m"
|
||||
|
||||
|
||||
def bytes_to_str(s, encoding="utf-8"):
    """Return ``s`` decoded with ``encoding`` when it is a bytes object.

    Any other value is returned unchanged.
    """
    # The package is Python 3 only, so no ``six.PY3`` check is needed.
    if isinstance(s, bytes):
        return s.decode(encoding)
    return s
|
||||
|
||||
|
||||
def is_dict(string_content):
    """Return True when ``string_content`` is JSON text encoding an object.

    Invalid JSON and valid JSON of any other type (number, string, list,
    ``null`` ...) return False.
    """
    try:
        parsed = json.loads(string_content)
    except JSONDecodeError:
        return False
    # Valid JSON is not necessarily a dictionary.
    return isinstance(parsed, dict)
|
||||
|
||||
|
||||
def convert_bytes_to_str(data, encoding="utf-8"):
    """Recursively decode ``bytes`` found in ``data`` to ``str``.

    * ``bytes`` are decoded with ``encoding``;
    * a ``dict`` gets its keys and values converted;
    * a ``tuple`` gets its items converted and is returned as a tuple;
    * anything else is returned unchanged.
    """
    if isinstance(data, bytes):
        return data.decode(encoding)
    if isinstance(data, dict):
        # Pass ``encoding`` down so nested values honour it too.
        return {
            convert_bytes_to_str(key, encoding): convert_bytes_to_str(value, encoding)
            for key, value in data.items()
        }
    if isinstance(data, tuple):
        return tuple(convert_bytes_to_str(item, encoding) for item in data)
    return data
|
||||
@@ -0,0 +1,69 @@
|
||||
from unittest import mock
|
||||
|
||||
from scrapy.settings import Settings
|
||||
|
||||
from scrapy_redis import defaults
|
||||
from scrapy_redis.connection import from_settings, get_redis, get_redis_from_settings
|
||||
|
||||
|
||||
class TestGetRedis:
    """Tests for ``get_redis``."""

    def test_default_instance(self):
        # Without arguments the default client class is instantiated.
        assert isinstance(get_redis(), defaults.REDIS_CLS)

    def test_custom_class(self):
        # A custom ``redis_cls`` receives the remaining keyword arguments.
        fake_cls = mock.Mock()
        client = get_redis(param="foo", redis_cls=fake_cls)
        assert client is fake_cls.return_value
        fake_cls.assert_called_with(param="foo")

    def test_from_url(self):
        # With ``url`` the client is built through ``from_url``.
        fake_cls = mock.Mock()
        redis_url = "redis://localhost"
        client = get_redis(redis_cls=fake_cls, url=redis_url, param="foo")
        assert client is fake_cls.from_url.return_value
        fake_cls.from_url.assert_called_with(redis_url, param="foo")
|
||||
|
||||
|
||||
class TestFromSettings:
    """Tests for ``from_settings``."""

    def setup_method(self):
        # ``setup_method`` is pytest's native per-test hook; the nose-style
        # ``setup`` name is no longer called by pytest 8.
        self.redis_cls = mock.Mock()
        self.expected_params = {
            "timeout": 0,
            "flag": False,
        }
        self.settings = Settings(
            {
                "REDIS_PARAMS": dict(self.expected_params, redis_cls=self.redis_cls),
            }
        )

    def test_redis_cls_default(self):
        server = from_settings(Settings())
        assert isinstance(server, defaults.REDIS_CLS)

    def test_redis_cls_custom_path(self):
        # ``redis_cls`` may also be given as an importable path.
        self.settings["REDIS_PARAMS"]["redis_cls"] = "unittest.mock.Mock"
        server = from_settings(self.settings)
        assert isinstance(server, mock.Mock)

    def test_default_params(self):
        # Parameters from settings are merged over the package defaults.
        server = from_settings(self.settings)
        assert server is self.redis_cls.return_value
        self.redis_cls.assert_called_with(
            **dict(defaults.REDIS_PARAMS, **self.expected_params)
        )

    def test_override_default_params(self):
        # Every default parameter can be overridden from settings.
        for key in defaults.REDIS_PARAMS:
            self.expected_params[key] = self.settings["REDIS_PARAMS"][key] = object()

        server = from_settings(self.settings)
        assert server is self.redis_cls.return_value
        self.redis_cls.assert_called_with(**self.expected_params)
|
||||
|
||||
|
||||
def test_get_server_from_settings_alias():
    """``from_settings`` and ``get_redis_from_settings`` are the same object."""
    alias, target = from_settings, get_redis_from_settings
    assert alias is target
|
||||
@@ -0,0 +1,108 @@
|
||||
from unittest import mock
|
||||
|
||||
from scrapy.http import Request
|
||||
from scrapy.settings import Settings
|
||||
|
||||
from scrapy_redis.dupefilter import RFPDupeFilter
|
||||
|
||||
|
||||
def get_redis_mock():
    """Return a mock redis client with a working in-memory ``sadd``.

    ``sadd(key, fp)`` returns 1 when ``fp`` is new for ``key`` and 0 when
    it was already added. Each mock keeps its own storage.
    """
    server = mock.Mock()
    storage = {}

    def sadd(key, fp):
        members = storage.setdefault(key, set())
        if fp in members:
            return 0
        members.add(fp)
        return 1

    server.sadd = sadd
    return server
|
||||
|
||||
|
||||
class TestRFPDupeFilter:
    """Tests for ``RFPDupeFilter`` against a mocked redis client."""

    def setup_method(self):
        # ``setup_method`` is pytest's native per-test hook; the nose-style
        # ``setup`` name is no longer called by pytest 8.
        self.server = get_redis_mock()
        self.key = "dupefilter:1"
        self.df = RFPDupeFilter(self.server, self.key)

    def test_request_seen(self):
        req = Request("http://example.com")

        def same_request():
            # First sighting is new, the second one is a duplicate.
            assert not self.df.request_seen(req)
            assert self.df.request_seen(req)

        def diff_method():
            # Same URL with another HTTP method is a different request.
            post_req = Request("http://example.com", method="POST")
            assert self.df.request_seen(req)
            assert not self.df.request_seen(post_req)

        def diff_url():
            other_req = Request("http://example2.com")
            assert self.df.request_seen(req)
            assert not self.df.request_seen(other_req)

        # The checks share state, so the order matters.
        same_request()
        diff_method()
        diff_url()

    def test_overridable_request_fingerprinter(self):
        req = Request("http://example.com")
        self.df.request_fingerprint = mock.Mock(wraps=self.df.request_fingerprint)
        assert not self.df.request_seen(req)
        self.df.request_fingerprint.assert_called_with(req)

    def test_clear_deletes(self):
        self.df.clear()
        self.server.delete.assert_called_with(self.key)

    def test_close_calls_clear(self):
        self.df.clear = mock.Mock(wraps=self.df.clear)
        self.df.close()
        self.df.close(reason="foo")
        assert self.df.clear.call_count == 2
|
||||
|
||||
|
||||
def test_log_dupes():
    """Check how many debug messages are emitted with and without ``debug``."""

    def check_debug_calls(dupefilter, n_dupes, expected_calls):
        # Wrap the logger method so that calls are counted but still executed.
        dupefilter.logger.debug = mock.Mock(wraps=dupefilter.logger.debug)
        for _ in range(n_dupes):
            dupefilter.log(Request("http://example"), spider=mock.Mock())
        assert dupefilter.logger.debug.call_count == expected_calls

    server = get_redis_mock()

    # debug=False (the default): five duplicates produce a single debug call.
    quiet_filter = RFPDupeFilter(server, "foo")
    check_debug_calls(quiet_filter, 5, 1)

    # debug=True: every duplicate produces a debug call.
    verbose_filter = RFPDupeFilter(server, "foo", debug=True)
    check_debug_calls(verbose_filter, 5, 5)
|
||||
|
||||
|
||||
@mock.patch("scrapy_redis.dupefilter.get_redis_from_settings")
class TestFromMethods:
    """Tests for the ``RFPDupeFilter`` alternate constructors.

    The class-level patch passes the ``get_redis_from_settings`` mock to every
    ``test*`` method as an extra argument.
    """

    def setup_method(self):
        # pytest's native per-test hook. The nose-style ``setup`` name is
        # deprecated since pytest 7.2 and is no longer called by pytest 8.
        self.settings = Settings(
            {
                "DUPEFILTER_DEBUG": True,
            }
        )

    def test_from_settings(self, get_redis_from_settings):
        df = RFPDupeFilter.from_settings(self.settings)
        self.assert_dupefilter(df, get_redis_from_settings)

    def test_from_crawler(self, get_redis_from_settings):
        crawler = mock.Mock(settings=self.settings)
        df = RFPDupeFilter.from_crawler(crawler)
        self.assert_dupefilter(df, get_redis_from_settings)

    def assert_dupefilter(self, df, get_redis_from_settings):
        """Shared checks: patched server, key prefix and debug flag."""
        assert df.server is get_redis_from_settings.return_value
        assert df.key.startswith("dupefilter:")
        assert df.debug  # true
|
||||
@@ -0,0 +1,7 @@
|
||||
import scrapy_redis
|
||||
|
||||
|
||||
def test_package_metadata():
    """The package must expose non-empty author, email and version metadata."""
    for attr_name in ("__author__", "__email__", "__version__"):
        assert getattr(scrapy_redis, attr_name)
|
||||
@@ -0,0 +1,18 @@
|
||||
from scrapy_redis import picklecompat
|
||||
|
||||
|
||||
def test_picklecompat():
    """A request-like dict must survive a ``dumps``/``loads`` round trip."""
    obj = {
        "_encoding": "utf-8",
        "body": "",
        "callback": "_response_downloaded",
        "cookies": {},
        "dont_filter": False,
        "errback": None,
        "headers": {"Referer": ["http://www.dmoz.org/"]},
        "meta": {"depth": 1, "link_text": "Fran\xe7ais", "rule": 0},
        "method": "GET",
        "priority": 0,
        "url": "http://www.dmoz.org/World/Fran%C3%A7ais/",
    }
    serialized = picklecompat.dumps(obj)
    assert obj == picklecompat.loads(serialized)
|
||||
@@ -0,0 +1,38 @@
|
||||
from unittest import mock
|
||||
|
||||
from scrapy import Spider
|
||||
from scrapy.http import Request
|
||||
|
||||
from scrapy_redis.queue import Base
|
||||
|
||||
|
||||
class TestBaseQueue:
    """Tests for request encoding/decoding in the ``Base`` queue class."""

    # Queue class under test.
    queue_cls = Base

    def setup_method(self):
        # pytest's native per-test hook. The nose-style ``setup`` name is
        # deprecated since pytest 7.2 and is no longer called by pytest 8.
        self.server = mock.Mock()
        self.spider = Spider(name="foo")
        self.spider.parse_method = lambda x: x
        self.key = "key"
        self.q = self.queue_cls(self.server, self.spider, self.key)

    def test_encode_decode_requests(self, q=None):
        """A request must keep url, meta and callback through encode/decode.

        ``q`` lets other tests reuse the round-trip checks with their own queue;
        it defaults to the queue built in ``setup_method``.
        """
        if q is None:
            q = self.q
        req = Request(
            "http://example.com", callback=self.spider.parse, meta={"foo": "bar"}
        )
        out = q._decode_request(q._encode_request(req))
        assert req.url == out.url
        assert req.meta == out.meta
        assert req.callback == out.callback

    def test_custom_serializer(self):
        """A custom serializer is used exactly once per encode and decode."""
        serializer = mock.Mock()
        # Identity functions: the serializer only has to record the calls.
        serializer.dumps = mock.Mock(side_effect=lambda x: x)
        serializer.loads = mock.Mock(side_effect=lambda x: x)
        # Use ``queue_cls`` (not a hard-coded class) for consistency with
        # ``setup_method``.
        q = self.queue_cls(self.server, self.spider, self.key, serializer=serializer)
        self.test_encode_decode_requests(q)
        assert serializer.dumps.call_count == 1
        assert serializer.loads.call_count == 1
|
||||
@@ -0,0 +1,296 @@
|
||||
import os
|
||||
from unittest import TestCase, mock
|
||||
|
||||
import redis
|
||||
from scrapy import Request, Spider
|
||||
from scrapy.settings import Settings
|
||||
from scrapy.utils.test import get_crawler
|
||||
|
||||
from scrapy_redis import connection
|
||||
from scrapy_redis.dupefilter import RFPDupeFilter
|
||||
from scrapy_redis.queue import FifoQueue, LifoQueue, PriorityQueue
|
||||
from scrapy_redis.scheduler import Scheduler
|
||||
|
||||
# The redis server used by the tests can be overridden from the environment.
REDIS_HOST = os.getenv("REDIS_HOST", "localhost")
REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))
|
||||
|
||||
|
||||
def get_spider(*args, **kwargs):
    """Create a spider through a freshly built crawler.

    ``spidercls`` and ``settings_dict`` are taken out of ``kwargs`` (both
    default to ``None``) and passed to ``get_crawler``; the remaining
    arguments are forwarded to the crawler's ``_create_spider``.
    """
    spidercls = kwargs.pop("spidercls", None)
    settings_dict = kwargs.pop("settings_dict", None)
    crawler = get_crawler(spidercls=spidercls, settings_dict=settings_dict)
    return crawler._create_spider(*args, **kwargs)
|
||||
|
||||
|
||||
class RedisTestMixin:
    """Mixin giving test cases a lazily created redis client and key cleanup."""

    @property
    def server(self):
        """Redis client, created on first access and cached on the instance."""
        try:
            return self._redis
        except AttributeError:
            self._redis = redis.Redis(REDIS_HOST, REDIS_PORT)
            return self._redis

    def clear_keys(self, prefix):
        """Delete every key on the server whose name starts with ``prefix``."""
        matching = self.server.keys(prefix + "*")
        if not matching:
            # Nothing matched, so ``delete`` is not called at all.
            return
        self.server.delete(*matching)
|
||||
|
||||
|
||||
class DupeFilterTest(RedisTestMixin, TestCase):
    """Checks ``RFPDupeFilter`` using the redis client from ``RedisTestMixin``."""

    def setUp(self):
        self.key = "scrapy_redis:tests:dupefilter:"
        self.df = RFPDupeFilter(self.server, self.key)

    def tearDown(self):
        # Remove everything stored under the test key prefix.
        self.clear_keys(self.key)

    def test_dupe_filter(self):
        req = Request("http://example.com")

        first_seen = self.df.request_seen(req)
        self.assertFalse(first_seen)
        second_seen = self.df.request_seen(req)
        self.assertTrue(second_seen)

        self.df.close("nothing")
|
||||
|
||||
|
||||
class QueueTestMixin(RedisTestMixin):
    """Shared fixtures and tests for the queue classes.

    Subclasses set ``queue_cls`` to the queue implementation under test.
    """

    queue_cls = None

    def setUp(self):
        self.spider = get_spider(name="myspider")
        self.key = f"scrapy_redis:tests:{self.spider.name}:queue"
        self.q = self.queue_cls(self.server, Spider("myspider"), self.key)

    def tearDown(self):
        self.clear_keys(self.key)

    def test_clear(self):
        """Pushing ten requests then clearing leaves an empty queue."""
        self.assertEqual(len(self.q), 0)

        # XXX: every request needs its own URL. SpiderPriorityQueue uses the
        # redis set implementation, so identical requests would collapse into
        # a single entry and the length check below would fail. Note that
        # SpiderPriorityQueue therefore acts as a request duplication filter
        # whenever the serialized requests are the same, which might be
        # unwanted for repeated requests to the same page, even with the
        # dont_filter=True flag.
        for page in range(10):
            self.q.push(Request(f"http://example.com/?page={page}"))
        self.assertEqual(len(self.q), 10)

        self.q.clear()
        self.assertEqual(len(self.q), 0)
|
||||
|
||||
|
||||
class FifoQueueTest(QueueTestMixin, TestCase):
    """``FifoQueue`` must pop requests in the order they were pushed."""

    queue_cls = FifoQueue

    def test_queue(self):
        pushed = [
            Request("http://example.com/page1"),
            Request("http://example.com/page2"),
        ]
        for req in pushed:
            self.q.push(req)

        first = self.q.pop()
        second = self.q.pop(timeout=1)

        # First in, first out.
        self.assertEqual(first.url, pushed[0].url)
        self.assertEqual(second.url, pushed[1].url)
|
||||
|
||||
|
||||
class PriorityQueueTest(QueueTestMixin, TestCase):
    """``PriorityQueue`` must pop requests from highest to lowest priority."""

    queue_cls = PriorityQueue

    def test_queue(self):
        req1 = Request("http://example.com/page1", priority=100)
        req2 = Request("http://example.com/page2", priority=50)
        # Each request gets its own URL so that the URL assertions below can
        # tell all three requests apart.
        req3 = Request("http://example.com/page3", priority=200)

        self.q.push(req1)
        self.q.push(req2)
        self.q.push(req3)

        out1 = self.q.pop()
        out2 = self.q.pop(timeout=0)
        out3 = self.q.pop(timeout=1)

        # Expected order: priority 200, then 100, then 50.
        self.assertEqual(out1.url, req3.url)
        self.assertEqual(out2.url, req1.url)
        self.assertEqual(out3.url, req2.url)
|
||||
|
||||
|
||||
class LifoQueueTest(QueueTestMixin, TestCase):
    """``LifoQueue`` must pop requests in reverse push order."""

    queue_cls = LifoQueue

    def test_queue(self):
        pushed = [
            Request("http://example.com/page1"),
            Request("http://example.com/page2"),
        ]
        for req in pushed:
            self.q.push(req)

        first = self.q.pop()
        second = self.q.pop(timeout=1)

        # Last in, first out.
        self.assertEqual(first.url, pushed[1].url)
        self.assertEqual(second.url, pushed[0].url)
|
||||
|
||||
|
||||
class SchedulerTest(RedisTestMixin, TestCase):
    """Tests for ``Scheduler`` built from a crawler with test-specific keys."""

    def setUp(self):
        self.key_prefix = "scrapy_redis:tests:"
        self.queue_key = self.key_prefix + "%(spider)s:requests"
        self.dupefilter_key = self.key_prefix + "%(spider)s:dupefilter"
        scheduler_settings = {
            "REDIS_HOST": REDIS_HOST,
            "REDIS_PORT": REDIS_PORT,
            "SCHEDULER_QUEUE_KEY": self.queue_key,
            "SCHEDULER_DUPEFILTER_KEY": self.dupefilter_key,
            "SCHEDULER_FLUSH_ON_START": False,
            "SCHEDULER_PERSIST": False,
            "SCHEDULER_SERIALIZER": "pickle",
            "DUPEFILTER_CLASS": "scrapy_redis.dupefilter.RFPDupeFilter",
        }
        self.spider = get_spider(name="myspider", settings_dict=scheduler_settings)
        self.scheduler = Scheduler.from_crawler(self.spider.crawler)

    def tearDown(self):
        self.clear_keys(self.key_prefix)

    def test_scheduler(self):
        """Enqueue, deduplicate and dequeue a single request."""
        scheduler = self.scheduler

        # Persistence is switched off by the settings used in setUp.
        self.assertFalse(scheduler.persist)

        scheduler.open(self.spider)
        self.assertEqual(len(scheduler), 0)

        req = Request("http://example.com")
        scheduler.enqueue_request(req)
        self.assertTrue(scheduler.has_pending_requests())
        self.assertEqual(len(scheduler), 1)

        # Enqueueing the same request again must not grow the queue.
        scheduler.enqueue_request(req)
        self.assertEqual(len(scheduler), 1)

        out = scheduler.next_request()
        self.assertEqual(out.url, req.url)

        self.assertFalse(scheduler.has_pending_requests())
        self.assertEqual(len(scheduler), 0)

        scheduler.close("finish")

    def test_scheduler_persistent(self):
        """With ``persist`` set, queued requests survive close and reopen."""
        # TODO: Improve this test to avoid the need to check for log messages.
        self.spider.log = mock.Mock(spec=self.spider.log)
        scheduler = self.scheduler

        scheduler.persist = True
        scheduler.open(self.spider)

        # Nothing is logged when the scheduler is first opened here.
        self.assertEqual(self.spider.log.call_count, 0)

        for url in ("http://example.com/page1", "http://example.com/page2"):
            scheduler.enqueue_request(Request(url))

        self.assertTrue(scheduler.has_pending_requests())
        scheduler.close("finish")

        # Reopening reports the two requests left behind by the first run.
        scheduler.open(self.spider)
        expected_log_calls = [
            mock.call("Resuming crawl (2 requests scheduled)"),
        ]
        self.spider.log.assert_has_calls(expected_log_calls)
        self.assertEqual(len(scheduler), 2)

        # Closing without persistence leaves the queue empty.
        scheduler.persist = False
        scheduler.close("finish")

        self.assertEqual(len(scheduler), 0)
|
||||
|
||||
|
||||
class ConnectionTest(TestCase):
    """Checks how ``connection.from_settings`` derives the connection arguments."""

    @staticmethod
    def _connect_args(settings):
        """Return the connection kwargs of the client built from ``settings``."""
        server = connection.from_settings(settings)
        return server.connection_pool.connection_kwargs

    # We can get a connection from just REDIS_URL.
    def test_redis_url(self):
        settings = Settings({"REDIS_URL": "redis://foo:bar@localhost:9001/42"})

        connect_args = self._connect_args(settings)

        self.assertEqual(connect_args["host"], "localhost")
        self.assertEqual(connect_args["port"], 9001)
        self.assertEqual(connect_args["password"], "bar")
        self.assertEqual(connect_args["db"], 42)

    # We can get a connection from REDIS_HOST/REDIS_PORT.
    def test_redis_host_port(self):
        settings = Settings({"REDIS_HOST": "localhost", "REDIS_PORT": 9001})

        connect_args = self._connect_args(settings)

        self.assertEqual(connect_args["host"], "localhost")
        self.assertEqual(connect_args["port"], 9001)

    # REDIS_URL takes precedence over REDIS_HOST/REDIS_PORT.
    def test_redis_url_precedence(self):
        settings = Settings(
            {
                "REDIS_HOST": "baz",
                "REDIS_PORT": 1337,
                "REDIS_URL": "redis://foo:bar@localhost:9001/42",
            }
        )

        connect_args = self._connect_args(settings)

        self.assertEqual(connect_args["host"], "localhost")
        self.assertEqual(connect_args["port"], 9001)
        self.assertEqual(connect_args["password"], "bar")
        self.assertEqual(connect_args["db"], 42)

    # We fallback to REDIS_HOST/REDIS_PORT if REDIS_URL is None.
    def test_redis_host_port_fallback(self):
        settings = Settings(
            {"REDIS_HOST": "baz", "REDIS_PORT": 1337, "REDIS_URL": None}
        )

        connect_args = self._connect_args(settings)

        self.assertEqual(connect_args["host"], "baz")
        self.assertEqual(connect_args["port"], 1337)

    # We use default values for REDIS_HOST/REDIS_PORT.
    def test_redis_default(self):
        settings = Settings()

        connect_args = self._connect_args(settings)

        self.assertEqual(connect_args["host"], "localhost")
        self.assertEqual(connect_args["port"], 6379)
|
||||
@@ -0,0 +1,197 @@
|
||||
import contextlib
|
||||
import os
|
||||
from unittest import mock
|
||||
|
||||
import pytest
|
||||
from scrapy import signals
|
||||
from scrapy.exceptions import DontCloseSpider
|
||||
from scrapy.settings import Settings
|
||||
|
||||
from scrapy_redis.spiders import RedisCrawlSpider, RedisSpider
|
||||
|
||||
# The redis server used by the tests can be overridden from the environment.
REDIS_HOST = os.getenv("REDIS_HOST", "localhost")
REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))
|
||||
|
||||
|
||||
@contextlib.contextmanager
def flushall(server):
    """Context manager that calls ``server.flushall()`` when the block exits.

    The flush runs whether the block finishes normally or raises.
    """
    with contextlib.ExitStack() as cleanup:
        # Registered callbacks run when the stack unwinds, error or not.
        cleanup.callback(lambda: server.flushall())
        yield
|
||||
|
||||
|
||||
class MySpider(RedisSpider):
    """Minimal ``RedisSpider`` subclass used by the tests in this module."""

    name = "myspider"
|
||||
|
||||
|
||||
class MyCrawlSpider(RedisCrawlSpider):
    """Minimal ``RedisCrawlSpider`` subclass used by the tests in this module."""

    name = "myspider"
|
||||
|
||||
|
||||
def get_crawler(**kwargs):
    """Return a mock crawler whose ``settings`` point at the test redis server.

    Extra keyword arguments are passed to ``mock.Mock`` unchanged.
    """
    redis_settings = Settings(
        {
            "REDIS_HOST": REDIS_HOST,
            "REDIS_PORT": REDIS_PORT,
        }
    )
    return mock.Mock(settings=redis_settings, **kwargs)
|
||||
|
||||
|
||||
class TestRedisMixin_setup_redis:
    """Tests for ``setup_redis`` argument validation and signal wiring."""

    def setup_method(self):
        # pytest's native per-test hook. The nose-style ``setup`` name is
        # deprecated since pytest 7.2 and is no longer called by pytest 8.
        self.myspider = MySpider()

    def test_crawler_required(self):
        """Without a crawler, ``setup_redis`` raises a ``ValueError``."""
        with pytest.raises(ValueError) as excinfo:
            self.myspider.setup_redis()
        assert "crawler" in str(excinfo.value)

    def test_requires_redis_key(self):
        """An empty ``redis_key`` is rejected."""
        self.myspider.crawler = get_crawler()
        self.myspider.redis_key = ""
        with pytest.raises(ValueError) as excinfo:
            self.myspider.setup_redis()
        assert "redis_key" in str(excinfo.value)

    def test_invalid_batch_size(self):
        """A non-numeric ``redis_batch_size`` is rejected."""
        self.myspider.redis_batch_size = "x"
        self.myspider.crawler = get_crawler()
        with pytest.raises(ValueError) as excinfo:
            self.myspider.setup_redis()
        assert "redis_batch_size" in str(excinfo.value)

    def test_invalid_idle_time(self):
        """A non-numeric ``max_idle_time`` is rejected."""
        self.myspider.max_idle_time = "x"
        self.myspider.crawler = get_crawler()
        with pytest.raises(ValueError) as excinfo:
            self.myspider.setup_redis()
        assert "max_idle_time" in str(excinfo.value)

    @mock.patch("scrapy_redis.spiders.connection")
    def test_via_from_crawler(self, connection):
        """``from_crawler`` sets the server and connects the idle signal once."""
        server = connection.from_settings.return_value = mock.Mock()
        crawler = get_crawler()
        myspider = MySpider.from_crawler(crawler)
        assert myspider.server is server
        connection.from_settings.assert_called_with(crawler.settings)
        crawler.signals.connect.assert_called_with(
            myspider.spider_idle, signal=signals.spider_idle
        )
        # Second call does nothing.
        server = myspider.server
        crawler.signals.connect.reset_mock()
        myspider.setup_redis()
        assert myspider.server is server
        assert crawler.signals.connect.call_count == 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize("spider_cls", [MySpider, MyCrawlSpider])
def test_from_crawler_with_spider_arguments(spider_cls):
    """Spider arguments given as strings are applied and converted.

    ``redis_key`` is interpolated with the spider name, and the numeric
    options end up as integers.
    """
    spider_kwargs = {
        "redis_key": "key:%(name)s",
        "redis_batch_size": "2000",
        "max_idle_time": "100",
    }
    crawler = get_crawler()
    spider = spider_cls.from_crawler(crawler, "foo", **spider_kwargs)
    assert spider.name == "foo"
    assert spider.redis_key == "key:foo"
    assert spider.redis_batch_size == 2000
    assert spider.max_idle_time == 100
|
||||
|
||||
|
||||
class MockRequest(mock.Mock):
    """Mock request that compares, hashes and prints by its ``url``."""

    def __init__(self, url, **kwargs):
        # Extra keyword arguments are accepted but not forwarded to ``Mock``.
        super().__init__()
        self.url = url

    def __eq__(self, other):
        # Equality is decided by URL alone.
        return self.url == other.url

    def __hash__(self):
        # Hash by URL so that it stays consistent with ``__eq__``.
        return hash(self.url)

    def __repr__(self):
        return f"<{self.__class__.__name__}({self.url})>"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("spider_cls", [MySpider, MyCrawlSpider])
@pytest.mark.parametrize("start_urls_as_zset", [False, True])
@pytest.mark.parametrize("start_urls_as_set", [False, True])
@mock.patch("scrapy.spiders.Request", MockRequest)
def test_consume_urls_from_redis(start_urls_as_zset, start_urls_as_set, spider_cls):
    """Start URLs stored in redis are consumed one batch at a time.

    Two batches of URLs are stored under the start-URLs key as a set, a
    sorted set or a list, depending on the parameters. The first batch must
    come out of ``start_requests`` and the second must be handed to the
    engine from ``spider_idle``.
    """
    batch_size = 5
    redis_key = "start:urls"
    # Sets and sorted sets are only checked for membership, not for order.
    unordered = start_urls_as_zset or start_urls_as_set

    crawler = get_crawler()
    crawler.settings.setdict(
        {
            "REDIS_HOST": REDIS_HOST,
            "REDIS_PORT": REDIS_PORT,
            "REDIS_START_URLS_KEY": redis_key,
            "REDIS_START_URLS_AS_ZSET": start_urls_as_zset,
            "REDIS_START_URLS_AS_SET": start_urls_as_set,
            "CONCURRENT_REQUESTS": batch_size,
        }
    )
    spider = spider_cls.from_crawler(crawler)

    with flushall(spider.server):
        # Pick the redis write command matching the configured container.
        if start_urls_as_set:
            server_put = spider.server.sadd
        elif start_urls_as_zset:

            def server_put(key, value):
                spider.server.zadd(key, {value: 0})

        else:
            server_put = spider.server.rpush

        urls = [f"http://example.com/{i}" for i in range(batch_size * 2)]
        reqs = []
        for url in urls:
            server_put(redis_key, url)
            reqs.append(MockRequest(url))

        # First call is to start requests.
        start_requests = list(spider.start_requests())
        if unordered:
            assert len(start_requests) == batch_size
            assert {r.url for r in start_requests}.issubset(r.url for r in reqs)
        else:
            assert start_requests == reqs[:batch_size]

        # Second call is to spider idle method.
        with pytest.raises(DontCloseSpider):
            spider.spider_idle()
        # Process remaining requests in the queue.
        with pytest.raises(DontCloseSpider):
            spider.spider_idle()

        # Last batch was passed to crawl.
        assert crawler.engine.crawl.call_count == batch_size

        if unordered:
            remaining_calls = [
                mock.call(req) for req in reqs if req not in start_requests
            ]
            crawler.engine.crawl.assert_has_calls(remaining_calls, any_order=True)
        else:
            remaining_calls = [mock.call(req) for req in reqs[batch_size:]]
            crawler.engine.crawl.assert_has_calls(remaining_calls)
|
||||
@@ -0,0 +1,7 @@
|
||||
from scrapy_redis.utils import bytes_to_str
|
||||
|
||||
|
||||
def test_bytes_to_str():
    """``bytes_to_str`` decodes bytes, optionally with an explicit encoding."""
    ascii_text = bytes_to_str(b"foo")
    assert ascii_text == "foo"
    # This char is the same in bytes or latin1.
    latin1_text = bytes_to_str(b"\xc1", "latin1")
    assert latin1_text == "\xc1"
|
||||
@@ -0,0 +1,90 @@
|
||||
[tox]
|
||||
requires =
|
||||
tox>=4
|
||||
envlist =
|
||||
docs
|
||||
security
|
||||
flake8
|
||||
py{38,39,310,311,312}-scrapy{26,27,28,29,210,211}-redis{42,43,44,45,46,50}
|
||||
minversion = 3.0.0
|
||||
|
||||
[base]
|
||||
deps =
|
||||
-r requirements-tests.txt
|
||||
-r requirements.txt
|
||||
setuptools
|
||||
|
||||
[testenv]
|
||||
basepython =
|
||||
py38: python3.8
|
||||
py39: python3.9
|
||||
py310: python3.10
|
||||
py311: python3.11
|
||||
py312: python3.12
|
||||
deps =
|
||||
{[base]deps}
|
||||
scrapy26: scrapy~=2.6.0
|
||||
scrapy27: scrapy~=2.7.0
|
||||
scrapy28: scrapy~=2.8.0
|
||||
scrapy29: scrapy~=2.9.0
|
||||
scrapy210: scrapy~=2.10.0
|
||||
scrapy211: scrapy~=2.11.0
|
||||
redis42: redis~=4.2.0
|
||||
redis43: redis~=4.3.0
|
||||
redis44: redis~=4.4.0
|
||||
redis45: redis~=4.5.0
|
||||
redis46: redis~=4.6.0
|
||||
redis50: redis~=5.0.0
|
||||
passenv =
|
||||
REDIS_HOST
|
||||
REDIS_PORT
|
||||
commands =
|
||||
python -m pytest # --cov-report term --cov=scrapy_redis
|
||||
|
||||
[testenv:flake8]
|
||||
basepython =
|
||||
python3.12
|
||||
deps =
|
||||
{[base]deps}
|
||||
commands =
|
||||
flake8 --ignore=W503,E265,E731 docs src tests
|
||||
|
||||
[testenv:security]
|
||||
basepython =
|
||||
python3.12
|
||||
deps =
|
||||
bandit~=1.7.3
|
||||
commands =
|
||||
bandit -r -c .bandit.yml src/ tests/
|
||||
|
||||
[testenv:pytest]
|
||||
basepython =
|
||||
python3.12
|
||||
deps =
|
||||
{[testenv]deps}
|
||||
passenv =
|
||||
REDIS_HOST
|
||||
REDIS_PORT
|
||||
commands =
|
||||
python -m pytest --cov-report term --cov=scrapy_redis
|
||||
|
||||
[testenv:build]
|
||||
basepython =
|
||||
python3.12
|
||||
deps =
|
||||
{[base]deps}
|
||||
build
|
||||
commands =
|
||||
python -m build
|
||||
|
||||
[testenv:docs]
|
||||
basepython =
|
||||
python3.12
|
||||
deps =
|
||||
{[base]deps}
|
||||
-r docs/requirements.txt
|
||||
allowlist_externals =
|
||||
make
|
||||
commands =
|
||||
# Same command as readthedocs
|
||||
make -C docs html SPHINXOPTS="-T -W --keep-going -D language=en"
|
||||
Reference in New Issue
Block a user