From 231d533ece3381293b48df1a36703eadb390cdf5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=88=92=E9=85=92=E7=9A=84=E6=9D=8E=E7=99=BD?= <670939375@qq.com> Date: Sat, 15 Mar 2025 13:19:41 +0800 Subject: [PATCH] Optimize the crawler configuration page, add multi-account parallel functionality, adapt AI configuration features, and include database configuration options. --- spider/spiderData.py | 66 +++- templates/spider_control.html | 569 +++++++++++++++++++++++++++++++++- views/spider_control.py | 415 +++++++++++++++++++++++-- 3 files changed, 1014 insertions(+), 36 deletions(-) diff --git a/spider/spiderData.py b/spider/spiderData.py index 82987cb..1324385 100644 --- a/spider/spiderData.py +++ b/spider/spiderData.py @@ -93,8 +93,20 @@ class SpiderData: connection.rollback() def crawl_topic(self, topic: str, depth: int = 3, interval: int = 5, - max_retries: int = 3, timeout: int = 30): - """爬取指定话题的微博内容""" + max_retries: int = 3, timeout: int = 30, cookie: str = None, + filter_callback = None): + """ + 爬取指定话题的微博内容 + + Args: + topic: 话题关键词 + depth: 爬取深度(页数) + interval: 请求间隔(秒) + max_retries: 最大重试次数 + timeout: 请求超时时间(秒) + cookie: 用户Cookie + filter_callback: 筛选回调函数,返回True表示保留该条微博 + """ # 参数验证 if not isinstance(depth, int) or depth < 1 or depth > 10: raise ValueError("爬取深度必须在1-10页之间") @@ -105,6 +117,10 @@ class SpiderData: if not isinstance(timeout, int) or timeout < 10 or timeout > 60: raise ValueError("请求超时时间必须在10-60秒之间") + # 更新请求头中的Cookie + if cookie: + self.headers['Cookie'] = cookie + logging.info(f"开始爬取话题: {topic}, 参数: depth={depth}, interval={interval}, max_retries={max_retries}, timeout={timeout}") for page in range(1, depth + 1): @@ -116,7 +132,7 @@ class SpiderData: # 检查缓存 cached_content = self._get_cached_page(url) if cached_content: - self._parse_page(cached_content) + self._parse_page(cached_content, filter_callback) logging.info(f"使用缓存数据: {topic} 第 {page} 页") break @@ -125,7 +141,7 @@ class SpiderData: if response.status_code == 200: # 缓存页面内容 self._cache_page(url, response.text) - self._parse_page(response.text) + self._parse_page(response.text, filter_callback) logging.info(f"成功爬取话题 {topic} 第 {page} 页") break else: @@ -154,8 +170,14 @@ class SpiderData: # 最后刷新缓冲区 self._flush_buffer() - def _parse_page(self, html_content: str): - """解析页面内容并保存数据""" + def _parse_page(self, html_content: str, filter_callback = None): + """ + 解析页面内容并保存数据 + + Args: + html_content: HTML页面内容 + filter_callback: 筛选回调函数 + """ try: soup = BeautifulSoup(html_content, 'html.parser') weibo_items = soup.find_all('div', class_='card-wrap') @@ -178,6 +200,19 @@ class SpiderData: # 提取互动数据 actions = item.find_all('li', class_='action') + # 提取用户认证状态 + user_verified = bool(item.find('i', class_='icon-vip')) + + # 提取是否原创 + is_original = not bool(item.find('span', class_='repost')) + + # 提取是否包含媒体 + has_media = bool(item.find('div', class_='media')) + + # 提取发布位置 + location = item.find('a', class_='location') + location_text = location.text.strip() if location else '' + # 构建数据字典 weibo_data = { 'content': content.text.strip(), @@ -186,15 +221,22 @@ class SpiderData: 'forward_count': self._extract_number(actions[0].text) if len(actions) > 0 else 0, 'comment_count': self._extract_number(actions[1].text) if len(actions) > 1 else 0, 'like_count': self._extract_number(actions[2].text) if len(actions) > 2 else 0, + 'read_count': self._extract_number(actions[3].text) if len(actions) > 3 else 0, + 'user_verified': user_verified, + 'is_original': is_original, + 'has_media': has_media, + 'location': location_text, 'crawl_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S') } - # 添加到插入缓冲区 - self.insert_buffer.append(weibo_data) - - # 如果缓冲区达到阈值,执行批量插入 - if len(self.insert_buffer) >= self.buffer_size: - self._flush_buffer() + # 如果有筛选回调函数,则进行筛选 + if filter_callback is None or filter_callback(weibo_data): + # 添加到插入缓冲区 + self.insert_buffer.append(weibo_data) + + # 如果缓冲区达到阈值,执行批量插入 + if len(self.insert_buffer) >= self.buffer_size: + self._flush_buffer() except Exception as e: logging.error(f"解析微博项时出错: {e}") diff --git a/templates/spider_control.html b/templates/spider_control.html index 37aeefe..b85542f 100644 --- a/templates/spider_control.html +++ b/templates/spider_control.html @@ -103,6 +103,214 @@ + +