From 1180f285a00474313abb5a17e462e8830114b0ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=88=92=E9=85=92=E7=9A=84=E6=9D=8E=E7=99=BD?= <670939375@qq.com> Date: Sun, 23 Feb 2025 23:30:54 +0800 Subject: [PATCH] Add a visual control panel for the crawler, supporting customization of topics and parameter configuration. --- app.py | 2 + spider/spiderData.py | 133 ++++++++++++++++ templates/spider_control.html | 291 ++++++++++++++++++++++++++++++++++ views/spider_control.py | 213 +++++++++++++++++++++++++ 4 files changed, 639 insertions(+) create mode 100644 templates/spider_control.html create mode 100644 views/spider_control.py diff --git a/app.py b/app.py index c3c449c..1ea8336 100644 --- a/app.py +++ b/app.py @@ -99,8 +99,10 @@ app.secret_key = 'this is secret_key you know ?' # 设置 Flask 的密钥,用 # 导入蓝图 from views.page import page from views.user import user +from views.spider_control import spider_bp app.register_blueprint(page.pb) # 注册页面蓝图 app.register_blueprint(user.ub) # 注册用户蓝图 +app.register_blueprint(spider_bp) # 注册爬虫控制蓝图 # 首页路由,清空 session @app.route('/') diff --git a/spider/spiderData.py b/spider/spiderData.py index 6a06a8e..b2ff366 100644 --- a/spider/spiderData.py +++ b/spider/spiderData.py @@ -3,6 +3,13 @@ from spiderDataPackage.spiderContent import start as spiderContent from spiderDataPackage.spiderComments import start as spiderComments from spiderDataPackage.settings import navAddr import os +import requests +import time +import random +import logging +from bs4 import BeautifulSoup +from datetime import datetime +from utils.logger import spider_logger as logging def spiderData(): if not os.path.exists(navAddr): @@ -13,5 +20,131 @@ def spiderData(): print('正在爬取文章评论数据') spiderComments() +class SpiderData: + def __init__(self): + self.headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' + } + self.base_url = 'https://s.weibo.com' + + def crawl_topic(self, topic, depth=3, interval=5, max_retries=3, timeout=30): + """ + 爬取指定话题的微博内容 + + :param topic: 要爬取的话题 + :param depth: 爬取深度(页数) + :param interval: 请求间隔时间(秒) + :param max_retries: 最大重试次数 + :param timeout: 请求超时时间(秒) + """ + logging.info(f"开始爬取话题: {topic}") + + for page in range(1, depth + 1): + retries = 0 + while retries < max_retries: + try: + url = f"{self.base_url}/weibo?q={topic}&page={page}" + response = requests.get(url, headers=self.headers, timeout=timeout) + + if response.status_code == 200: + self._parse_page(response.text) + logging.info(f"成功爬取话题 {topic} 第 {page} 页") + break + else: + logging.warning(f"请求失败,状态码: {response.status_code}") + retries += 1 + + except requests.RequestException as e: + logging.error(f"请求异常: {e}") + retries += 1 + + if retries < max_retries: + sleep_time = interval * (1 + random.random()) + logging.info(f"等待 {sleep_time:.2f} 秒后重试...") + time.sleep(sleep_time) + + if retries == max_retries: + logging.error(f"话题 {topic} 第 {page} 页爬取失败,已达到最大重试次数") + continue + + # 在页面之间添加随机延迟 + if page < depth: + sleep_time = interval * (1 + random.random()) + logging.info(f"等待 {sleep_time:.2f} 秒后继续...") + time.sleep(sleep_time) + + def _parse_page(self, html_content): + """ + 解析页面内容并保存数据 + + :param html_content: 页面HTML内容 + """ + try: + soup = BeautifulSoup(html_content, 'html.parser') + weibo_items = soup.find_all('div', class_='card-wrap') + + for item in weibo_items: + try: + # 提取微博内容 + content = item.find('p', class_='txt') + if not content: + continue + + # 提取用户信息 + user_info = item.find('a', class_='name') + if not user_info: + continue + + # 提取发布时间 + time_info = item.find('p', class_='from') + + # 提取互动数据 + actions = item.find_all('li', class_='action') + + # 构建数据字典 + weibo_data = { + 'content': content.text.strip(), + 'user_name': user_info.text.strip(), + 'publish_time': time_info.text.strip() if time_info else '', + 'forward_count': self._extract_number(actions[0].text) if len(actions) > 0 else 0, + 'comment_count': self._extract_number(actions[1].text) if len(actions) > 1 else 0, + 'like_count': self._extract_number(actions[2].text) if len(actions) > 2 else 0, + 'crawl_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S') + } + + # 保存到数据库 + self._save_to_database(weibo_data) + + except Exception as e: + logging.error(f"解析微博项时出错: {e}") + continue + + except Exception as e: + logging.error(f"解析页面时出错: {e}") + + def _extract_number(self, text): + """ + 从文本中提取数字 + + :param text: 包含数字的文本 + :return: 提取的数字,如果没有找到则返回0 + """ + try: + return int(''.join(filter(str.isdigit, text))) + except ValueError: + return 0 + + def _save_to_database(self, data): + """ + 将数据保存到数据库 + + :param data: 要保存的数据字典 + """ + try: + # TODO: 实现数据库保存逻辑 + logging.info(f"保存数据: {data}") + except Exception as e: + logging.error(f"保存数据时出错: {e}") + if __name__ == '__main__': spiderData() \ No newline at end of file diff --git a/templates/spider_control.html b/templates/spider_control.html new file mode 100644 index 0000000..9da3dac --- /dev/null +++ b/templates/spider_control.html @@ -0,0 +1,291 @@ + + +
+ + +