变更
This commit is contained in:
@@ -0,0 +1,15 @@
|
||||
# Define here the models for your scraped items
|
||||
#
|
||||
# See documentation in:
|
||||
# https://docs.scrapy.org/en/latest/topics/items.html
|
||||
|
||||
import scrapy
|
||||
|
||||
|
||||
class DoubanItem(scrapy.Item):
    """Scraped record for a single Douban list entry.

    Each attribute is a plain ``scrapy.Field`` without metadata; the spider
    fills them with the text it extracts for one entry.
    """

    # Text fields populated by the spider, one value per entry.
    name = scrapy.Field()
    info = scrapy.Field()
    score = scrapy.Field()
    desc = scrapy.Field()
|
||||
|
||||
@@ -0,0 +1,134 @@
|
||||
# Define here the models for your spider middleware
|
||||
#
|
||||
# See documentation in:
|
||||
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||
import base64
|
||||
|
||||
from scrapy import signals
|
||||
import random
|
||||
from douban.settings import USER_AGENT_LIST
|
||||
from douban.settings import PROXY_LIST
|
||||
from .myextend import pro
|
||||
|
||||
# useful for handling different item types with a single interface
|
||||
from itemadapter import is_item, ItemAdapter
|
||||
|
||||
|
||||
class RandomUserAgent(object):
    """Downloader middleware that gives every request a random User-Agent."""

    def process_request(self, request, spider):
        """Replace the request's User-Agent with a random pool entry.

        Args:
            request: outgoing request whose ``headers`` mapping is updated.
            spider: spider that issued the request (unused).

        Returns:
            None, so the request continues through the middleware chain.
        """
        # An empty pool would make random.choice() raise IndexError; leave
        # the request's existing User-Agent untouched in that case.
        if not USER_AGENT_LIST:
            return None

        # The header name must be the bare string. The previous key
        # "'User-Agent'" (with embedded quote characters) created a bogus
        # header and never replaced the real User-Agent.
        request.headers["User-Agent"] = random.choice(USER_AGENT_LIST)
        return None
|
||||
|
||||
|
||||
class RandomProxy(object):
    """Downloader middleware that sends each request through a random proxy.

    The proxy is picked from ``pro.proxy_list`` and written to
    ``request.meta['proxy']`` together with the account credentials.
    """

    # NOTE(review): these credentials are committed to source control;
    # consider loading them from settings or the environment and rotating them.
    username = "azonhgez"
    password = "7lvu0dnm"

    def process_request(self, request, spider):
        """Attach a randomly chosen authenticated proxy to ``request``.

        Args:
            request: outgoing request whose ``meta`` mapping is updated.
            spider: spider that issued the request; its logger is used.

        Returns:
            None, so the request continues through the middleware chain.
        """
        # Read the attribute once so one consistent list object is both
        # checked and sampled.
        candidates = pro.proxy_list
        if not candidates:
            # random.choice() raises on an empty sequence (and on None);
            # report the situation instead of failing the request.
            spider.logger.warning(
                "No proxies available; sending %s without a proxy", request
            )
            return None

        proxy = random.choice(candidates)
        spider.logger.debug("Using proxy %s", proxy)
        request.meta['proxy'] = "http://%(user)s:%(pwd)s@%(proxy)s/" % {
            "user": self.username,
            "pwd": self.password,
            "proxy": proxy,
        }
        return None
|
||||
|
||||
# if "user_passwd" in proxy:
|
||||
# # # 对账号密码进行base64编码
|
||||
# # b64_up=base64.b64encode(proxy["user_passwd"].encode())
|
||||
# # #设置认证
|
||||
# # request.headers["Proxy-Authorization"] = "basic "+b64_up.decode()
|
||||
# # #设置代理
|
||||
# # request.meta['proxy']=proxy['ip_port']
|
||||
# else:
|
||||
# request.meta['proxy']=proxy['ip_port']
|
||||
|
||||
class DoubanSpiderMiddleware:
    """Pass-through spider middleware.

    Every hook forwards what it receives unchanged; hooks that are not
    needed could be removed, in which case Scrapy behaves as if the
    middleware did not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to ``spider_opened``."""
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_opened,
            signal=signals.spider_opened,
        )
        return middleware

    def process_spider_input(self, response, spider):
        """Accept every response on its way into the spider.

        Returning None lets processing continue; raising would abort it.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Yield the spider's results (requests or items) unchanged."""
        yield from result

    def process_spider_exception(self, response, exception, spider):
        """Leave spider exceptions unhandled by returning None."""
        return None

    def process_start_requests(self, start_requests, spider):
        """Yield the spider's start requests unchanged (requests only)."""
        yield from start_requests

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        message = "Spider opened: %s" % spider.name
        spider.logger.info(message)
|
||||
|
||||
|
||||
class DoubanDownloaderMiddleware:
    """Pass-through downloader middleware.

    Every hook forwards what it receives unchanged; hooks that are not
    needed could be removed, in which case Scrapy behaves as if the
    middleware did not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to ``spider_opened``."""
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_opened,
            signal=signals.spider_opened,
        )
        return middleware

    def process_request(self, request, spider):
        """Let every request continue through the downloader chain.

        A hook may return None (continue), a Response, a Request, or raise
        IgnoreRequest; this one always returns None.
        """
        return None

    def process_response(self, request, response, spider):
        """Hand the downloaded response back unchanged.

        A hook may return a Response, a Request, or raise IgnoreRequest.
        """
        return response

    def process_exception(self, request, exception, spider):
        """Leave download exceptions to the remaining handlers.

        Returning None continues the ``process_exception()`` chain;
        returning a Response or Request would stop it.
        """
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        message = "Spider opened: %s" % spider.name
        spider.logger.info(message)
|
||||
@@ -0,0 +1,57 @@
|
||||
#!/usr/bin/env python
|
||||
# -- coding: utf-8 --
|
||||
import time
|
||||
import threading
|
||||
|
||||
import requests
|
||||
from scrapy import signals
|
||||
|
||||
# API endpoint used to extract proxy IPs
|
||||
# NOTE(review): the query string embeds what look like live API credentials
# (secret_id / signature); consider loading them from configuration and
# rotating them. The remaining parameters request 10 entries as JSON.
api_url = 'https://kps.kdlapi.com/api/getkps/?secret_id=ou5nlcm9klazz4rhi8ht&signature=1ffl6otrop2on40eyeuxe46c0cavc9k6&num=10&pt=1&format=json&sep=1'
|
||||
# Run flag polled by MyExtend.extract_proxy; MyExtend.close sets it to False
# to end the refresh loop.
foo = True
|
||||
|
||||
class Proxy:
    """Holder for the current list of proxy addresses.

    The list is fetched from ``api_url`` on construction and can be replaced
    afterwards by assigning to ``proxy_list``.
    """

    def __init__(self, ):
        """Fetch the initial proxy list from the extraction API.

        Raises:
            requests.RequestException: if the request fails, times out or
                returns an HTTP error status.
        """
        self._proxy_list = self._fetch()

    @staticmethod
    def _fetch(timeout=10):
        """Return the ``proxy_list`` from the API response, or ``[]``.

        Args:
            timeout: seconds to wait for the API; without it a stalled
                connection would block indefinitely.
        """
        response = requests.get(api_url, timeout=timeout)
        response.raise_for_status()
        # The payload may carry no "data" object (or a null one); fall back
        # to an empty list instead of failing with AttributeError on None.
        data = response.json().get('data') or {}
        return data.get('proxy_list') or []

    @property
    def proxy_list(self):
        """Current list of proxy addresses."""
        return self._proxy_list

    @proxy_list.setter
    def proxy_list(self, value):
        # Parameter renamed from ``list`` so the builtin is not shadowed;
        # property setters are only invoked through assignment.
        self._proxy_list = value
|
||||
|
||||
|
||||
# Shared module-level instance. Proxy.__init__ issues the API request, so
# this line performs network I/O when the module is imported.
pro = Proxy()
|
||||
# Debug output of the initially fetched proxy list (runs on import).
print(pro.proxy_list)
|
||||
|
||||
|
||||
class MyExtend:
    """Scrapy extension that keeps ``pro.proxy_list`` fresh.

    A background thread re-fetches the proxy list from ``api_url`` at a
    fixed interval while the crawl is running.
    """

    # Seconds between two proxy extractions.
    refresh_interval = 15
    # Seconds to wait for the proxy API before giving up on one attempt.
    request_timeout = 10

    def __init__(self, crawler):
        """Store the crawler and hook the extension into Scrapy's signals.

        Args:
            crawler: the crawler this extension is attached to.
        """
        self.crawler = crawler
        # Lets close() wake the refresh thread immediately instead of
        # waiting for the current interval to elapse.
        self._stop_event = threading.Event()
        # Bind the custom methods to Scrapy signals so the refresh thread
        # starts and stops together with the spider engine.
        # Scrapy signals docs: https://www.osgeo.cn/scrapy/topics/signals.html
        # Scrapy extensions docs: https://www.osgeo.cn/scrapy/topics/extensions.html
        crawler.signals.connect(self.start, signals.engine_started)
        crawler.signals.connect(self.close, signals.spider_closed)

    @classmethod
    def from_crawler(cls, crawler):
        """Create the extension for ``crawler``."""
        return cls(crawler)

    def start(self):
        """Launch the background refresh thread."""
        # Daemon thread: it must never keep the process alive if the
        # spider_closed signal is not delivered.
        t = threading.Thread(target=self.extract_proxy, daemon=True)
        t.start()

    def extract_proxy(self):
        """Refresh ``pro.proxy_list`` until the extension is closed.

        A failed or empty extraction keeps the previous list, so a transient
        API problem neither kills the thread nor empties the proxy pool.
        """
        import logging

        log = logging.getLogger(__name__)
        while foo and not self._stop_event.is_set():
            try:
                response = requests.get(api_url, timeout=self.request_timeout)
                response.raise_for_status()
                payload = response.json()
            except (requests.RequestException, ValueError) as exc:
                log.warning("Proxy refresh failed: %s", exc)
            else:
                data = payload.get('data') if isinstance(payload, dict) else None
                fresh = data.get('proxy_list') if isinstance(data, dict) else None
                if fresh:
                    pro.proxy_list = fresh
                else:
                    log.warning("Proxy refresh returned no proxies; keeping previous list")
            # Extract proxies once per interval; returns early when close()
            # sets the event.
            self._stop_event.wait(self.refresh_interval)

    def close(self):
        """Stop the refresh loop."""
        global foo
        foo = False
        self._stop_event.set()
|
||||
@@ -0,0 +1,13 @@
|
||||
# Define your item pipelines here
|
||||
#
|
||||
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
||||
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
||||
|
||||
|
||||
# useful for handling different item types with a single interface
|
||||
from itemadapter import ItemAdapter
|
||||
|
||||
|
||||
class DoubanPipeline:
    """Item pipeline that forwards every item untouched."""

    def process_item(self, item, spider):
        """Return ``item`` as received.

        Args:
            item: the scraped item.
            spider: the spider that produced it (unused).
        """
        unchanged = item
        return unchanged
|
||||
@@ -0,0 +1,119 @@
|
||||
# Scrapy settings for douban project
|
||||
#
|
||||
# For simplicity, this file contains only settings considered important or
|
||||
# commonly used. You can find more settings consulting the documentation:
|
||||
#
|
||||
# https://docs.scrapy.org/en/latest/topics/settings.html
|
||||
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
|
||||
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||
|
||||
# Project name.
BOT_NAME = "douban"

# Package searched for spiders; new spiders are generated in the same one.
SPIDER_MODULES = ["douban.spiders"]
NEWSPIDER_MODULE = SPIDER_MODULES[0]

# Log level for the crawl.
LOG_LEVEL = "WARNING"
|
||||
|
||||
# Crawl responsibly by identifying yourself (and your website) on the user-agent
|
||||
# Default User-Agent. Written as a single-line value: the previous
# triple-quoted literal started with a line break, so the header value began
# with "\n".
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0"
)
|
||||
|
||||
# Pool of User-Agent strings from which the RandomUserAgent middleware
# picks one per request.
USER_AGENT_LIST = [
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
    "Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36",
]
|
||||
|
||||
# Static proxy entries: "ip_port" is the proxy address, "user_passwd" the
# optional credential string for it.
# NOTE(review): the credential below is committed to source control.
PROXY_LIST = [
    {
        "ip_port": "119.96.179.83:16819",
        "user_passwd": "7lvu0dnm",
    },
    # {"ip_port":"185.195.107.254:23445"}
]
|
||||
|
||||
# Obey robots.txt rules
|
||||
# ROBOTSTXT_OBEY = True
|
||||
|
||||
# Configure maximum concurrent requests performed by Scrapy (default: 16)
|
||||
#CONCURRENT_REQUESTS = 32
|
||||
|
||||
# Configure a delay for requests for the same website (default: 0)
|
||||
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
|
||||
# See also autothrottle settings and docs
|
||||
#DOWNLOAD_DELAY = 3
|
||||
# The download delay setting will honor only one of:
|
||||
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
|
||||
#CONCURRENT_REQUESTS_PER_IP = 16
|
||||
|
||||
# Disable cookies (enabled by default)
|
||||
#COOKIES_ENABLED = False
|
||||
|
||||
# Disable Telnet Console (enabled by default)
|
||||
#TELNETCONSOLE_ENABLED = False
|
||||
|
||||
# Override the default request headers:
|
||||
#DEFAULT_REQUEST_HEADERS = {
|
||||
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||
# "Accept-Language": "en",
|
||||
#}
|
||||
|
||||
# Enable or disable spider middlewares
|
||||
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||
#SPIDER_MIDDLEWARES = {
|
||||
# "douban.middlewares.DoubanSpiderMiddleware": 543,
|
||||
#}
|
||||
|
||||
# Enable or disable downloader middlewares
|
||||
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
|
||||
# Enabled downloader middlewares and their order values. The template
# middleware stays disabled.
DOWNLOADER_MIDDLEWARES = {
    # "douban.middlewares.DoubanDownloaderMiddleware": 543,
    "douban.middlewares.RandomUserAgent": 542,
    "douban.middlewares.RandomProxy": 541,
}
|
||||
|
||||
# Enable or disable extensions
|
||||
# See https://docs.scrapy.org/en/latest/topics/extensions.html
|
||||
#EXTENSIONS = {
|
||||
# "scrapy.extensions.telnet.TelnetConsole": None,
|
||||
#}
|
||||
|
||||
# Configure item pipelines
|
||||
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
||||
#ITEM_PIPELINES = {
|
||||
# "douban.pipelines.DoubanPipeline": 300,
|
||||
#}
|
||||
|
||||
# Enable and configure the AutoThrottle extension (disabled by default)
|
||||
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
|
||||
#AUTOTHROTTLE_ENABLED = True
|
||||
# The initial download delay
|
||||
#AUTOTHROTTLE_START_DELAY = 5
|
||||
# The maximum download delay to be set in case of high latencies
|
||||
#AUTOTHROTTLE_MAX_DELAY = 60
|
||||
# The average number of requests Scrapy should be sending in parallel to
|
||||
# each remote server
|
||||
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
|
||||
# Enable showing throttling stats for every response received:
|
||||
#AUTOTHROTTLE_DEBUG = False
|
||||
|
||||
# Enable and configure HTTP caching (disabled by default)
|
||||
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
|
||||
#HTTPCACHE_ENABLED = True
|
||||
#HTTPCACHE_EXPIRATION_SECS = 0
|
||||
#HTTPCACHE_DIR = "httpcache"
|
||||
#HTTPCACHE_IGNORE_HTTP_CODES = []
|
||||
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
|
||||
|
||||
# Set settings whose default value is deprecated to a future-proof value
|
||||
# Pin settings whose default values are deprecated to future-proof values.
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

# Encoding used for exported feeds.
FEED_EXPORT_ENCODING = "utf-8"
|
||||
@@ -0,0 +1,4 @@
|
||||
# This package will contain the spiders of your Scrapy project
|
||||
#
|
||||
# Please refer to the documentation for information on how to create and manage
|
||||
# your spiders.
|
||||
@@ -0,0 +1,28 @@
|
||||
import scrapy
|
||||
from douban.items import DoubanItem
|
||||
|
||||
class Db250Spider(scrapy.Spider):
    """Spider that walks the Douban Top 250 listing page by page.

    One ``DoubanItem`` is yielded per ``div.info`` element, then the
    "next" link is followed until there is none.
    """

    name = "db250"
    allowed_domains = ["douban.com"]
    start_urls = ["https://movie.douban.com/top250"]

    def parse(self, response):
        """Extract the entries of one listing page and queue the next page.

        Args:
            response: the downloaded listing page.

        Yields:
            A ``DoubanItem`` per entry, followed by a ``scrapy.Request`` for
            the next page when the page links to one.
        """
        for el in response.xpath("//div[@class='info']"):
            item = DoubanItem()
            item['name'] = el.xpath("./div[1]/a/span[1]/text()").extract_first()
            item['info'] = el.xpath("./div[2]/p[1]/text()").extract_first()
            item['score'] = el.xpath("./div[2]/div/span[2]/text()").extract_first()
            # NOTE(review): the other fields are read from ./div[1] and
            # ./div[2]; this path looks for a direct <p> child of the info
            # div instead. Confirm against the page markup.
            item['desc'] = el.xpath("./p[2]/span/text()").extract_first()
            yield item

        # extract_first() returns None when no "next" link exists, which
        # ends the crawl. Identity comparison is the idiomatic None test.
        url = response.xpath("//span[@class='next']/a/@href").extract_first()
        if url is not None:
            # The href may be relative, so resolve it against the page URL.
            yield scrapy.Request(url=response.urljoin(url))
|
||||
@@ -0,0 +1,11 @@
|
||||
# Automatically created by: scrapy startproject
|
||||
#
|
||||
# For more information about the [deploy] section see:
|
||||
# https://scrapyd.readthedocs.io/en/latest/deploy.html
|
||||
|
||||
[settings]
|
||||
default = douban.settings
|
||||
|
||||
[deploy]
|
||||
#url = http://localhost:6800/
|
||||
project = douban
|
||||
Reference in New Issue
Block a user