diff --git a/app.py b/app.py
index c3c449c..1ea8336 100644
--- a/app.py
+++ b/app.py
@@ -99,8 +99,10 @@ app.secret_key = 'this is secret_key you know ?' # 设置 Flask 的密钥,用
# 导入蓝图
from views.page import page
from views.user import user
+from views.spider_control import spider_bp
app.register_blueprint(page.pb) # 注册页面蓝图
app.register_blueprint(user.ub) # 注册用户蓝图
+app.register_blueprint(spider_bp) # 注册爬虫控制蓝图
# 首页路由,清空 session
@app.route('/')
diff --git a/spider/spiderData.py b/spider/spiderData.py
index 6a06a8e..b2ff366 100644
--- a/spider/spiderData.py
+++ b/spider/spiderData.py
@@ -3,6 +3,13 @@ from spiderDataPackage.spiderContent import start as spiderContent
from spiderDataPackage.spiderComments import start as spiderComments
from spiderDataPackage.settings import navAddr
import os
+import requests
+import time
+import random
+import logging
+from bs4 import BeautifulSoup
+from datetime import datetime
+from utils.logger import spider_logger as logging
def spiderData():
if not os.path.exists(navAddr):
@@ -13,5 +20,131 @@ def spiderData():
print('正在爬取文章评论数据')
spiderComments()
+class SpiderData:
+ def __init__(self):
+ self.headers = {
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+ }
+ self.base_url = 'https://s.weibo.com'
+
+ def crawl_topic(self, topic, depth=3, interval=5, max_retries=3, timeout=30):
+ """
+ 爬取指定话题的微博内容
+
+ :param topic: 要爬取的话题
+ :param depth: 爬取深度(页数)
+ :param interval: 请求间隔时间(秒)
+ :param max_retries: 最大重试次数
+ :param timeout: 请求超时时间(秒)
+ """
+ logging.info(f"开始爬取话题: {topic}")
+
+ for page in range(1, depth + 1):
+ retries = 0
+ while retries < max_retries:
+ try:
+ url = f"{self.base_url}/weibo?q={topic}&page={page}"
+ response = requests.get(url, headers=self.headers, timeout=timeout)
+
+ if response.status_code == 200:
+ self._parse_page(response.text)
+ logging.info(f"成功爬取话题 {topic} 第 {page} 页")
+ break
+ else:
+ logging.warning(f"请求失败,状态码: {response.status_code}")
+ retries += 1
+
+ except requests.RequestException as e:
+ logging.error(f"请求异常: {e}")
+ retries += 1
+
+ if retries < max_retries:
+ sleep_time = interval * (1 + random.random())
+ logging.info(f"等待 {sleep_time:.2f} 秒后重试...")
+ time.sleep(sleep_time)
+
+ if retries == max_retries:
+ logging.error(f"话题 {topic} 第 {page} 页爬取失败,已达到最大重试次数")
+ continue
+
+ # 在页面之间添加随机延迟
+ if page < depth:
+ sleep_time = interval * (1 + random.random())
+ logging.info(f"等待 {sleep_time:.2f} 秒后继续...")
+ time.sleep(sleep_time)
+
+ def _parse_page(self, html_content):
+ """
+ 解析页面内容并保存数据
+
+ :param html_content: 页面HTML内容
+ """
+ try:
+ soup = BeautifulSoup(html_content, 'html.parser')
+ weibo_items = soup.find_all('div', class_='card-wrap')
+
+ for item in weibo_items:
+ try:
+ # 提取微博内容
+ content = item.find('p', class_='txt')
+ if not content:
+ continue
+
+ # 提取用户信息
+ user_info = item.find('a', class_='name')
+ if not user_info:
+ continue
+
+ # 提取发布时间
+ time_info = item.find('p', class_='from')
+
+ # 提取互动数据
+ actions = item.find_all('li', class_='action')
+
+ # 构建数据字典
+ weibo_data = {
+ 'content': content.text.strip(),
+ 'user_name': user_info.text.strip(),
+ 'publish_time': time_info.text.strip() if time_info else '',
+ 'forward_count': self._extract_number(actions[0].text) if len(actions) > 0 else 0,
+ 'comment_count': self._extract_number(actions[1].text) if len(actions) > 1 else 0,
+ 'like_count': self._extract_number(actions[2].text) if len(actions) > 2 else 0,
+ 'crawl_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+ }
+
+ # 保存到数据库
+ self._save_to_database(weibo_data)
+
+ except Exception as e:
+ logging.error(f"解析微博项时出错: {e}")
+ continue
+
+ except Exception as e:
+ logging.error(f"解析页面时出错: {e}")
+
+ def _extract_number(self, text):
+ """
+ 从文本中提取数字
+
+ :param text: 包含数字的文本
+ :return: 提取的数字,如果没有找到则返回0
+ """
+ try:
+ return int(''.join(filter(str.isdigit, text)))
+ except ValueError:
+ return 0
+
+ def _save_to_database(self, data):
+ """
+ 将数据保存到数据库
+
+ :param data: 要保存的数据字典
+ """
+ try:
+ # TODO: 实现数据库保存逻辑
+ logging.info(f"保存数据: {data}")
+ except Exception as e:
+ logging.error(f"保存数据时出错: {e}")
+
if __name__ == '__main__':
spiderData()
\ No newline at end of file
diff --git a/templates/spider_control.html b/templates/spider_control.html
new file mode 100644
index 0000000..9da3dac
--- /dev/null
+++ b/templates/spider_control.html
@@ -0,0 +1,291 @@
+
+
+
+
+
+ 爬虫控制面板
+
+
+
+
+
+
+
爬虫控制面板
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 每个话题爬取的页数(1-10)
+
+
+
+
+
+
+ 每次请求之间的间隔时间
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/views/spider_control.py b/views/spider_control.py
new file mode 100644
index 0000000..3f414ce
--- /dev/null
+++ b/views/spider_control.py
@@ -0,0 +1,213 @@
+from flask import Blueprint, jsonify, request, render_template
+import json
+import os
+from datetime import datetime
+import threading
+from queue import Queue
+import asyncio
+import websockets
+import logging
+from spider.spiderData import SpiderData
+
+# 创建蓝图
+spider_bp = Blueprint('spider', __name__)
+
+# 创建日志记录器
+logger = logging.getLogger('spider_control')
+logger.setLevel(logging.INFO)
+
+# 存储WebSocket连接的集合
+websocket_connections = set()
+
+# 创建消息队列
+message_queue = Queue()
+
+# 默认配置
+DEFAULT_CONFIG = {
+ 'crawlDepth': 3,
+ 'interval': 5,
+ 'maxRetries': 3,
+ 'timeout': 30
+}
+
+def load_config():
+ """加载爬虫配置"""
+ config_path = os.path.join(os.path.dirname(__file__), '../spider/config.json')
+ try:
+ if os.path.exists(config_path):
+ with open(config_path, 'r', encoding='utf-8') as f:
+ return json.load(f)
+ except Exception as e:
+ logger.error(f"加载配置文件失败: {e}")
+ return DEFAULT_CONFIG
+
+def save_config(config):
+ """保存爬虫配置"""
+ config_path = os.path.join(os.path.dirname(__file__), '../spider/config.json')
+ try:
+ with open(config_path, 'w', encoding='utf-8') as f:
+ json.dump(config, f, ensure_ascii=False, indent=4)
+ return True
+ except Exception as e:
+ logger.error(f"保存配置文件失败: {e}")
+ return False
+
+async def broadcast_message(message):
+ """广播消息到所有WebSocket连接"""
+ if not websocket_connections:
+ return
+
+ for websocket in websocket_connections.copy():
+ try:
+ await websocket.send(json.dumps(message))
+ except websockets.exceptions.ConnectionClosed:
+ websocket_connections.remove(websocket)
+ except Exception as e:
+ logger.error(f"发送WebSocket消息失败: {e}")
+ websocket_connections.remove(websocket)
+
+def spider_worker(topics, parameters):
+ """爬虫工作线程"""
+ total_topics = len(topics)
+ completed_topics = 0
+
+ try:
+ spider = SpiderData()
+
+ for topic in topics:
+ try:
+ # 更新进度
+ progress = int((completed_topics / total_topics) * 100)
+ asyncio.run(broadcast_message({
+ 'type': 'progress',
+ 'value': progress
+ }))
+
+ # 发送开始爬取的日志
+ asyncio.run(broadcast_message({
+ 'type': 'log',
+ 'message': f'开始爬取话题: {topic}'
+ }))
+
+ # 执行爬取
+ spider.crawl_topic(
+ topic=topic,
+ depth=parameters['crawlDepth'],
+ interval=parameters['interval'],
+ max_retries=parameters['maxRetries'],
+ timeout=parameters['timeout']
+ )
+
+ completed_topics += 1
+
+ # 发送完成爬取的日志
+ asyncio.run(broadcast_message({
+ 'type': 'log',
+ 'message': f'话题 {topic} 爬取完成'
+ }))
+
+ except Exception as e:
+ # 发送错误日志
+ asyncio.run(broadcast_message({
+ 'type': 'log',
+ 'message': f'爬取话题 {topic} 时出错: {str(e)}'
+ }))
+
+ # 更新最终进度
+ asyncio.run(broadcast_message({
+ 'type': 'progress',
+ 'value': 100
+ }))
+
+ # 发送完成消息
+ asyncio.run(broadcast_message({
+ 'type': 'log',
+ 'message': '所有话题爬取完成'
+ }))
+
+ except Exception as e:
+ # 发送错误日志
+ asyncio.run(broadcast_message({
+ 'type': 'log',
+ 'message': f'爬虫任务执行出错: {str(e)}'
+ }))
+
+@spider_bp.route('/spider/control')
+def spider_control():
+ """渲染爬虫控制页面"""
+ return render_template('spider_control.html')
+
+@spider_bp.route('/api/spider/start', methods=['POST'])
+def start_spider():
+ """启动爬虫任务"""
+ try:
+ data = request.get_json()
+ topics = data.get('topics', [])
+ parameters = data.get('parameters', DEFAULT_CONFIG)
+
+ if not topics:
+ return jsonify({
+ 'success': False,
+ 'message': '请选择至少一个话题'
+ })
+
+ # 启动爬虫线程
+ thread = threading.Thread(
+ target=spider_worker,
+ args=(topics, parameters),
+ daemon=True
+ )
+ thread.start()
+
+ return jsonify({
+ 'success': True,
+ 'message': '爬虫任务已启动'
+ })
+
+ except Exception as e:
+ logger.error(f"启动爬虫任务失败: {e}")
+ return jsonify({
+ 'success': False,
+ 'message': str(e)
+ })
+
+@spider_bp.route('/api/spider/save-config', methods=['POST'])
+def save_spider_config():
+ """保存爬虫配置"""
+ try:
+ config = request.get_json()
+ if save_config(config):
+ return jsonify({
+ 'success': True,
+ 'message': '配置保存成功'
+ })
+ else:
+ return jsonify({
+ 'success': False,
+ 'message': '配置保存失败'
+ })
+ except Exception as e:
+ logger.error(f"保存配置失败: {e}")
+ return jsonify({
+ 'success': False,
+ 'message': str(e)
+ })
+
+@spider_bp.websocket('/ws/spider-status')
+async def spider_status_socket():
+ """WebSocket连接处理"""
+ try:
+ websocket = websockets.WebSocketServerProtocol()
+ websocket_connections.add(websocket)
+
+ try:
+ while True:
+ # 保持连接活跃
+ await websocket.ping()
+ await asyncio.sleep(30)
+ except websockets.exceptions.ConnectionClosed:
+ pass
+ finally:
+ websocket_connections.remove(websocket)
+ except Exception as e:
+ logger.error(f"WebSocket连接处理失败: {e}")
\ No newline at end of file