diff --git a/app.py b/app.py index c3c449c..1ea8336 100644 --- a/app.py +++ b/app.py @@ -99,8 +99,10 @@ app.secret_key = 'this is secret_key you know ?' # 设置 Flask 的密钥,用 # 导入蓝图 from views.page import page from views.user import user +from views.spider_control import spider_bp app.register_blueprint(page.pb) # 注册页面蓝图 app.register_blueprint(user.ub) # 注册用户蓝图 +app.register_blueprint(spider_bp) # 注册爬虫控制蓝图 # 首页路由,清空 session @app.route('/') diff --git a/spider/spiderData.py b/spider/spiderData.py index 6a06a8e..b2ff366 100644 --- a/spider/spiderData.py +++ b/spider/spiderData.py @@ -3,6 +3,13 @@ from spiderDataPackage.spiderContent import start as spiderContent from spiderDataPackage.spiderComments import start as spiderComments from spiderDataPackage.settings import navAddr import os +import requests +import time +import random +import logging +from bs4 import BeautifulSoup +from datetime import datetime +from utils.logger import spider_logger as logging def spiderData(): if not os.path.exists(navAddr): @@ -13,5 +20,131 @@ def spiderData(): print('正在爬取文章评论数据') spiderComments() +class SpiderData: + def __init__(self): + self.headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' + } + self.base_url = 'https://s.weibo.com' + + def crawl_topic(self, topic, depth=3, interval=5, max_retries=3, timeout=30): + """ + 爬取指定话题的微博内容 + + :param topic: 要爬取的话题 + :param depth: 爬取深度(页数) + :param interval: 请求间隔时间(秒) + :param max_retries: 最大重试次数 + :param timeout: 请求超时时间(秒) + """ + logging.info(f"开始爬取话题: {topic}") + + for page in range(1, depth + 1): + retries = 0 + while retries < max_retries: + try: + url = f"{self.base_url}/weibo?q={topic}&page={page}" + response = requests.get(url, headers=self.headers, timeout=timeout) + + if response.status_code == 200: + self._parse_page(response.text) + logging.info(f"成功爬取话题 {topic} 第 {page} 页") + break + else: + logging.warning(f"请求失败,状态码: {response.status_code}") + retries += 1 + + except requests.RequestException as e: + logging.error(f"请求异常: {e}") + retries += 1 + + if retries < max_retries: + sleep_time = interval * (1 + random.random()) + logging.info(f"等待 {sleep_time:.2f} 秒后重试...") + time.sleep(sleep_time) + + if retries == max_retries: + logging.error(f"话题 {topic} 第 {page} 页爬取失败,已达到最大重试次数") + continue + + # 在页面之间添加随机延迟 + if page < depth: + sleep_time = interval * (1 + random.random()) + logging.info(f"等待 {sleep_time:.2f} 秒后继续...") + time.sleep(sleep_time) + + def _parse_page(self, html_content): + """ + 解析页面内容并保存数据 + + :param html_content: 页面HTML内容 + """ + try: + soup = BeautifulSoup(html_content, 'html.parser') + weibo_items = soup.find_all('div', class_='card-wrap') + + for item in weibo_items: + try: + # 提取微博内容 + content = item.find('p', class_='txt') + if not content: + continue + + # 提取用户信息 + user_info = item.find('a', class_='name') + if not user_info: + continue + + # 提取发布时间 + time_info = item.find('p', class_='from') + + # 提取互动数据 + actions = item.find_all('li', class_='action') + + # 构建数据字典 + weibo_data = { + 'content': content.text.strip(), + 'user_name': user_info.text.strip(), + 'publish_time': time_info.text.strip() if time_info else '', + 'forward_count': self._extract_number(actions[0].text) if len(actions) > 0 else 0, + 'comment_count': self._extract_number(actions[1].text) if len(actions) > 1 else 0, + 'like_count': self._extract_number(actions[2].text) if len(actions) > 2 else 0, + 'crawl_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S') + } + + # 保存到数据库 + self._save_to_database(weibo_data) + + except Exception as e: + logging.error(f"解析微博项时出错: {e}") + continue + + except Exception as e: + logging.error(f"解析页面时出错: {e}") + + def _extract_number(self, text): + """ + 从文本中提取数字 + + :param text: 包含数字的文本 + :return: 提取的数字,如果没有找到则返回0 + """ + try: + return int(''.join(filter(str.isdigit, text))) + except ValueError: + return 0 + + def _save_to_database(self, data): + """ + 将数据保存到数据库 + + :param data: 要保存的数据字典 + """ + try: + # TODO: 实现数据库保存逻辑 + logging.info(f"保存数据: {data}") + except Exception as e: + logging.error(f"保存数据时出错: {e}") + if __name__ == '__main__': spiderData() \ No newline at end of file diff --git a/templates/spider_control.html b/templates/spider_control.html new file mode 100644 index 0000000..9da3dac --- /dev/null +++ b/templates/spider_control.html @@ -0,0 +1,291 @@ + + + + + + 爬虫控制面板 + + + + + +
+

爬虫控制面板

+ + +
+
+
选择话题类型
+
+
+
+ +
+ +
+
添加自定义话题
+
+ + +
+
+ +
+
已选择的话题:
+
+ +
+
+
+
+ + +
+
+
爬虫参数配置
+
+
+
+
+
+ + + 每个话题爬取的页数(1-10) +
+
+
+
+ + + 每次请求之间的间隔时间 +
+
+
+ +
+
+
+ + +
+
+
+
+ + +
+
+
+
+
+ + +
+ + +
+ + +
+
+
爬虫状态
+
+
+
+
+
+
+

+                
+
+
+
+ + + + + + \ No newline at end of file diff --git a/views/spider_control.py b/views/spider_control.py new file mode 100644 index 0000000..3f414ce --- /dev/null +++ b/views/spider_control.py @@ -0,0 +1,213 @@ +from flask import Blueprint, jsonify, request, render_template +import json +import os +from datetime import datetime +import threading +from queue import Queue +import asyncio +import websockets +import logging +from spider.spiderData import SpiderData + +# 创建蓝图 +spider_bp = Blueprint('spider', __name__) + +# 创建日志记录器 +logger = logging.getLogger('spider_control') +logger.setLevel(logging.INFO) + +# 存储WebSocket连接的集合 +websocket_connections = set() + +# 创建消息队列 +message_queue = Queue() + +# 默认配置 +DEFAULT_CONFIG = { + 'crawlDepth': 3, + 'interval': 5, + 'maxRetries': 3, + 'timeout': 30 +} + +def load_config(): + """加载爬虫配置""" + config_path = os.path.join(os.path.dirname(__file__), '../spider/config.json') + try: + if os.path.exists(config_path): + with open(config_path, 'r', encoding='utf-8') as f: + return json.load(f) + except Exception as e: + logger.error(f"加载配置文件失败: {e}") + return DEFAULT_CONFIG + +def save_config(config): + """保存爬虫配置""" + config_path = os.path.join(os.path.dirname(__file__), '../spider/config.json') + try: + with open(config_path, 'w', encoding='utf-8') as f: + json.dump(config, f, ensure_ascii=False, indent=4) + return True + except Exception as e: + logger.error(f"保存配置文件失败: {e}") + return False + +async def broadcast_message(message): + """广播消息到所有WebSocket连接""" + if not websocket_connections: + return + + for websocket in websocket_connections.copy(): + try: + await websocket.send(json.dumps(message)) + except websockets.exceptions.ConnectionClosed: + websocket_connections.remove(websocket) + except Exception as e: + logger.error(f"发送WebSocket消息失败: {e}") + websocket_connections.remove(websocket) + +def spider_worker(topics, parameters): + """爬虫工作线程""" + total_topics = len(topics) + completed_topics = 0 + + try: + spider = SpiderData() + + for topic in topics: + try: + # 更新进度 + progress = int((completed_topics / total_topics) * 100) + asyncio.run(broadcast_message({ + 'type': 'progress', + 'value': progress + })) + + # 发送开始爬取的日志 + asyncio.run(broadcast_message({ + 'type': 'log', + 'message': f'开始爬取话题: {topic}' + })) + + # 执行爬取 + spider.crawl_topic( + topic=topic, + depth=parameters['crawlDepth'], + interval=parameters['interval'], + max_retries=parameters['maxRetries'], + timeout=parameters['timeout'] + ) + + completed_topics += 1 + + # 发送完成爬取的日志 + asyncio.run(broadcast_message({ + 'type': 'log', + 'message': f'话题 {topic} 爬取完成' + })) + + except Exception as e: + # 发送错误日志 + asyncio.run(broadcast_message({ + 'type': 'log', + 'message': f'爬取话题 {topic} 时出错: {str(e)}' + })) + + # 更新最终进度 + asyncio.run(broadcast_message({ + 'type': 'progress', + 'value': 100 + })) + + # 发送完成消息 + asyncio.run(broadcast_message({ + 'type': 'log', + 'message': '所有话题爬取完成' + })) + + except Exception as e: + # 发送错误日志 + asyncio.run(broadcast_message({ + 'type': 'log', + 'message': f'爬虫任务执行出错: {str(e)}' + })) + +@spider_bp.route('/spider/control') +def spider_control(): + """渲染爬虫控制页面""" + return render_template('spider_control.html') + +@spider_bp.route('/api/spider/start', methods=['POST']) +def start_spider(): + """启动爬虫任务""" + try: + data = request.get_json() + topics = data.get('topics', []) + parameters = data.get('parameters', DEFAULT_CONFIG) + + if not topics: + return jsonify({ + 'success': False, + 'message': '请选择至少一个话题' + }) + + # 启动爬虫线程 + thread = threading.Thread( + target=spider_worker, + args=(topics, parameters), + daemon=True + ) + thread.start() + + return jsonify({ + 'success': True, + 'message': '爬虫任务已启动' + }) + + except Exception as e: + logger.error(f"启动爬虫任务失败: {e}") + return jsonify({ + 'success': False, + 'message': str(e) + }) + +@spider_bp.route('/api/spider/save-config', methods=['POST']) +def save_spider_config(): + """保存爬虫配置""" + try: + config = request.get_json() + if save_config(config): + return jsonify({ + 'success': True, + 'message': '配置保存成功' + }) + else: + return jsonify({ + 'success': False, + 'message': '配置保存失败' + }) + except Exception as e: + logger.error(f"保存配置失败: {e}") + return jsonify({ + 'success': False, + 'message': str(e) + }) + +@spider_bp.websocket('/ws/spider-status') +async def spider_status_socket(): + """WebSocket连接处理""" + try: + websocket = websockets.WebSocketServerProtocol() + websocket_connections.add(websocket) + + try: + while True: + # 保持连接活跃 + await websocket.ping() + await asyncio.sleep(30) + except websockets.exceptions.ConnectionClosed: + pass + finally: + websocket_connections.remove(websocket) + except Exception as e: + logger.error(f"WebSocket连接处理失败: {e}") \ No newline at end of file