diff --git a/spider/spiderData.py b/spider/spiderData.py index 82987cb..1324385 100644 --- a/spider/spiderData.py +++ b/spider/spiderData.py @@ -93,8 +93,20 @@ class SpiderData: connection.rollback() def crawl_topic(self, topic: str, depth: int = 3, interval: int = 5, - max_retries: int = 3, timeout: int = 30): - """爬取指定话题的微博内容""" + max_retries: int = 3, timeout: int = 30, cookie: str = None, + filter_callback = None): + """ + 爬取指定话题的微博内容 + + Args: + topic: 话题关键词 + depth: 爬取深度(页数) + interval: 请求间隔(秒) + max_retries: 最大重试次数 + timeout: 请求超时时间(秒) + cookie: 用户Cookie + filter_callback: 筛选回调函数,返回True表示保留该条微博 + """ # 参数验证 if not isinstance(depth, int) or depth < 1 or depth > 10: raise ValueError("爬取深度必须在1-10页之间") @@ -105,6 +117,10 @@ class SpiderData: if not isinstance(timeout, int) or timeout < 10 or timeout > 60: raise ValueError("请求超时时间必须在10-60秒之间") + # 更新请求头中的Cookie + if cookie: + self.headers['Cookie'] = cookie + logging.info(f"开始爬取话题: {topic}, 参数: depth={depth}, interval={interval}, max_retries={max_retries}, timeout={timeout}") for page in range(1, depth + 1): @@ -116,7 +132,7 @@ class SpiderData: # 检查缓存 cached_content = self._get_cached_page(url) if cached_content: - self._parse_page(cached_content) + self._parse_page(cached_content, filter_callback) logging.info(f"使用缓存数据: {topic} 第 {page} 页") break @@ -125,7 +141,7 @@ class SpiderData: if response.status_code == 200: # 缓存页面内容 self._cache_page(url, response.text) - self._parse_page(response.text) + self._parse_page(response.text, filter_callback) logging.info(f"成功爬取话题 {topic} 第 {page} 页") break else: @@ -154,8 +170,14 @@ class SpiderData: # 最后刷新缓冲区 self._flush_buffer() - def _parse_page(self, html_content: str): - """解析页面内容并保存数据""" + def _parse_page(self, html_content: str, filter_callback = None): + """ + 解析页面内容并保存数据 + + Args: + html_content: HTML页面内容 + filter_callback: 筛选回调函数 + """ try: soup = BeautifulSoup(html_content, 'html.parser') weibo_items = soup.find_all('div', class_='card-wrap') @@ -178,6 +200,19 @@ class SpiderData: # 提取互动数据 actions = item.find_all('li', class_='action') + # 提取用户认证状态 + user_verified = bool(item.find('i', class_='icon-vip')) + + # 提取是否原创 + is_original = not bool(item.find('span', class_='repost')) + + # 提取是否包含媒体 + has_media = bool(item.find('div', class_='media')) + + # 提取发布位置 + location = item.find('a', class_='location') + location_text = location.text.strip() if location else '' + # 构建数据字典 weibo_data = { 'content': content.text.strip(), @@ -186,15 +221,22 @@ class SpiderData: 'forward_count': self._extract_number(actions[0].text) if len(actions) > 0 else 0, 'comment_count': self._extract_number(actions[1].text) if len(actions) > 1 else 0, 'like_count': self._extract_number(actions[2].text) if len(actions) > 2 else 0, + 'read_count': self._extract_number(actions[3].text) if len(actions) > 3 else 0, + 'user_verified': user_verified, + 'is_original': is_original, + 'has_media': has_media, + 'location': location_text, 'crawl_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S') } - # 添加到插入缓冲区 - self.insert_buffer.append(weibo_data) - - # 如果缓冲区达到阈值,执行批量插入 - if len(self.insert_buffer) >= self.buffer_size: - self._flush_buffer() + # 如果有筛选回调函数,则进行筛选 + if filter_callback is None or filter_callback(weibo_data): + # 添加到插入缓冲区 + self.insert_buffer.append(weibo_data) + + # 如果缓冲区达到阈值,执行批量插入 + if len(self.insert_buffer) >= self.buffer_size: + self._flush_buffer() except Exception as e: logging.error(f"解析微博项时出错: {e}") diff --git a/templates/spider_control.html b/templates/spider_control.html index 37aeefe..b85542f 100644 --- a/templates/spider_control.html +++ b/templates/spider_control.html @@ -103,6 +103,214 @@ + +
+
+
内容筛选配置
+ +
+
+
+
筛选条件说明:
+
    +
  • 数值条件:设置大于某个值进行筛选,如点赞数>1000
  • +
  • 正则匹配:使用正则表达式匹配内容,如包含特定关键词
  • +
  • 多个条件之间是"与"的关系,即同时满足才会保留
  • +
+
+ 提示:合理设置筛选条件可以提高数据质量 +
+
+
+
+ +
互动数据筛选
+
+
+
+ + +
+
+
+
+ + +
+
+
+
+ + +
+
+
+
+ + +
+
+
+ + +
内容正则筛选
+
+ +
+ + + +
高级选项
+
+ + +
+
+ + +
+
+ + +
+
+
+ + +
+
+
账号配置
+
+ + +
+
+
+
+
如何获取Cookie?
+
    +
  1. 登录微博网页版
  2. +
  3. 按F12打开开发者工具
  4. +
  5. 切换到Network标签页
  6. +
  7. 刷新页面,找到请求头中的Cookie值
  8. +
+
+ 注意:请勿泄露您的Cookie信息! +
+
+ 提示:添加多个账号可以提高爬取效率,系统会自动在账号间轮换。 +
+
+
+
+
+ +
+ +
+
+ + +
+
+
并行配置
+
+
+
+
+
+ + + 同时进行爬取的最大话题数(1-5) +
+
+
+
+ + + 避免请求过于频繁(30-120) +
+
+
+
+
+ + +
+
+
数据库配置
+
+
+
+
+
+ + +
+
+
+
+ + +
+
+
+
+
+
+ + +
+
+
+
+ + +
+
+
+
+
+
+ + +
+
+
+
+ + +
+
+
+
+ +
+
+
+
@@ -237,13 +445,43 @@ return; } + // 验证必要的配置 + if (!validateConfig()) { + return; + } + const config = { topics: Array.from(selectedTopics), parameters: { crawlDepth: parseInt(document.getElementById('crawlDepth').value), interval: parseInt(document.getElementById('interval').value), maxRetries: parseInt(document.getElementById('maxRetries').value), - timeout: parseInt(document.getElementById('timeout').value) + timeout: parseInt(document.getElementById('timeout').value), + maxConcurrent: parseInt(document.getElementById('maxConcurrent').value), + requestsPerMinute: parseInt(document.getElementById('requestsPerMinute').value) + }, + filters: { + interaction: { + minLikes: parseInt(document.getElementById('minLikes').value) || 0, + minComments: parseInt(document.getElementById('minComments').value) || 0, + minReposts: parseInt(document.getElementById('minReposts').value) || 0, + minReads: parseInt(document.getElementById('minReads').value) || 0 + }, + regex: getRegexFilters(), + options: { + originalOnly: document.getElementById('filterOriginal').checked, + withMediaOnly: document.getElementById('filterWithMedia').checked, + verifiedOnly: document.getElementById('filterVerified').checked + } + }, + accounts: getAccountsConfig(), + database: { + type: document.getElementById('dbType').value, + host: document.getElementById('dbHost').value, + port: parseInt(document.getElementById('dbPort').value), + name: document.getElementById('dbName').value, + user: document.getElementById('dbUser').value, + password: document.getElementById('dbPassword').value } }; @@ -268,6 +506,335 @@ }); } + // 账号管理相关函数 + let accounts = []; + let accountIdCounter = 0; + + function createAccountElement(account) { + const accountDiv = document.createElement('div'); + accountDiv.className = 'border rounded p-3 mb-3 position-relative account-item'; + accountDiv.dataset.id = account.id; + + const deleteButton = document.createElement('button'); + deleteButton.className = 'btn btn-sm btn-danger position-absolute top-0 end-0 m-2'; + deleteButton.innerHTML = ''; + deleteButton.onclick = () => removeAccount(account.id); + + const content = ` +
+
+
+ + +
+
+
+
+ + +
+
+
+
+ + +
+
+ + +
+ + `; + + accountDiv.innerHTML = content; + accountDiv.appendChild(deleteButton); + return accountDiv; + } + + function addAccount() { + const account = { + id: accountIdCounter++, + username: '', + password: '', + cookie: '', + saveCookie: false, + status: 'pending' + }; + accounts.push(account); + + const accountsList = document.getElementById('accountsList'); + accountsList.appendChild(createAccountElement(account)); + updateAccountsWarning(); + } + + function removeAccount(id) { + accounts = accounts.filter(account => account.id !== id); + const accountElement = document.querySelector(`.account-item[data-id="${id}"]`); + if (accountElement) { + accountElement.remove(); + } + updateAccountsWarning(); + } + + function updateAccountsWarning() { + const warning = document.getElementById('noAccountsWarning'); + warning.style.display = accounts.length === 0 ? 'block' : 'none'; + } + + function getAccountsConfig() { + return accounts.map(account => { + const accountElement = document.querySelector(`.account-item[data-id="${account.id}"]`); + return { + username: accountElement.querySelector('.account-username').value, + password: accountElement.querySelector('.account-password').value, + cookie: accountElement.querySelector('.account-cookie').value, + saveCookie: accountElement.querySelector('.account-save-cookie').checked + }; + }); + } + + async function validateAccount(id) { + const accountElement = document.querySelector(`.account-item[data-id="${id}"]`); + const statusElement = accountElement.querySelector('.account-status'); + const cookie = accountElement.querySelector('.account-cookie').value.trim(); + + if (!cookie) { + statusElement.className = 'account-status alert alert-danger'; + statusElement.innerHTML = '状态:验证失败 - Cookie不能为空'; + return; + } + + statusElement.className = 'account-status alert alert-warning'; + statusElement.innerHTML = '状态:验证中...'; + + try { + const response = await fetch('/api/spider/validate-account', { + method: 'POST', + headers: { + 'Content-Type': 'application/json' + }, + body: JSON.stringify({ + cookie: cookie + }) + }); + + const data = await response.json(); + if (data.success) { + statusElement.className = 'account-status alert alert-success'; + statusElement.innerHTML = '状态:验证成功'; + } else { + statusElement.className = 'account-status alert alert-danger'; + statusElement.innerHTML = `状态:验证失败 - ${data.message}`; + } + } catch (error) { + statusElement.className = 'account-status alert alert-danger'; + statusElement.innerHTML = `状态:验证失败 - ${error.message}`; + } + } + + // 正则筛选器管理 + let regexFilters = []; + let regexFilterIdCounter = 0; + + function createRegexFilterElement(filter) { + const filterDiv = document.createElement('div'); + filterDiv.className = 'border rounded p-3 mb-3 position-relative regex-filter-item'; + filterDiv.dataset.id = filter.id; + + const deleteButton = document.createElement('button'); + deleteButton.className = 'btn btn-sm btn-danger position-absolute top-0 end-0 m-2'; + deleteButton.innerHTML = ''; + deleteButton.onclick = () => removeRegexFilter(filter.id); + + const content = ` +
+
+
+ + +
+
+
+
+ + +
+
+
+
+ + +
+ `; + + filterDiv.innerHTML = content; + filterDiv.appendChild(deleteButton); + return filterDiv; + } + + function addRegexFilter() { + const filter = { + id: regexFilterIdCounter++, + pattern: '', + target: 'content', + inverse: false + }; + regexFilters.push(filter); + + const filtersList = document.getElementById('regexFilters'); + filtersList.appendChild(createRegexFilterElement(filter)); + } + + function removeRegexFilter(id) { + regexFilters = regexFilters.filter(filter => filter.id !== id); + const filterElement = document.querySelector(`.regex-filter-item[data-id="${id}"]`); + if (filterElement) { + filterElement.remove(); + } + } + + function getRegexFilters() { + return regexFilters.map(filter => { + const filterElement = document.querySelector(`.regex-filter-item[data-id="${filter.id}"]`); + return { + pattern: filterElement.querySelector('.regex-pattern').value, + target: filterElement.querySelector('.regex-target').value, + inverse: filterElement.querySelector('.regex-inverse').checked + }; + }).filter(filter => filter.pattern.trim() !== ''); + } + + // 验证配置 + function validateConfig() { + // 验证正则表达式 + const invalidRegex = regexFilters.some(filter => { + const filterElement = document.querySelector(`.regex-filter-item[data-id="${filter.id}"]`); + const pattern = filterElement.querySelector('.regex-pattern').value.trim(); + if (pattern !== '') { + try { + new RegExp(pattern); + return false; + } catch (e) { + alert(`正则表达式 "${pattern}" 格式无效!`); + return true; + } + } + return false; + }); + + if (invalidRegex) { + return false; + } + + // 验证是否有账号配置 + if (accounts.length === 0) { + alert('请至少添加一个账号!'); + return false; + } + + // 验证每个账号是否都有Cookie + const invalidAccounts = accounts.filter(account => { + const accountElement = document.querySelector(`.account-item[data-id="${account.id}"]`); + return !accountElement.querySelector('.account-cookie').value.trim(); + }); + + if (invalidAccounts.length > 0) { + alert('存在未配置Cookie的账号,请检查!'); + return false; + } + + // 验证并行配置 + const maxConcurrent = parseInt(document.getElementById('maxConcurrent').value); + const requestsPerMinute = parseInt(document.getElementById('requestsPerMinute').value); + if (maxConcurrent < 1 || maxConcurrent > 5) { + alert('最大并行数必须在1-5之间!'); + return false; + } + if (requestsPerMinute < 30 || requestsPerMinute > 120) { + alert('每分钟请求数必须在30-120之间!'); + return false; + } + + // 验证数据库配置 + const dbConfig = { + host: document.getElementById('dbHost').value.trim(), + port: document.getElementById('dbPort').value.trim(), + name: document.getElementById('dbName').value.trim(), + user: document.getElementById('dbUser').value.trim(), + password: document.getElementById('dbPassword').value.trim() + }; + + if (!dbConfig.host || !dbConfig.port || !dbConfig.name || !dbConfig.user || !dbConfig.password) { + alert('请完整填写数据库配置信息!'); + return false; + } + + return true; + } + + // 测试数据库连接 + async function testDbConnection() { + const dbConfig = { + type: document.getElementById('dbType').value, + host: document.getElementById('dbHost').value, + port: parseInt(document.getElementById('dbPort').value), + name: document.getElementById('dbName').value, + user: document.getElementById('dbUser').value, + password: document.getElementById('dbPassword').value + }; + + try { + const response = await fetch('/api/spider/test-db', { + method: 'POST', + headers: { + 'Content-Type': 'application/json' + }, + body: JSON.stringify(dbConfig) + }); + + const data = await response.json(); + if (data.success) { + alert('数据库连接测试成功!'); + } else { + alert('数据库连接测试失败:' + data.message); + } + } catch (error) { + alert('测试连接时发生错误:' + error.message); + } + } + + // 监听数据库类型变化 + document.getElementById('dbType').addEventListener('change', function() { + const dbType = this.value; + const portInput = document.getElementById('dbPort'); + + // 根据数据库类型设置默认端口 + switch(dbType) { + case 'mysql': + portInput.value = '3306'; + break; + case 'postgresql': + portInput.value = '5432'; + break; + case 'mongodb': + portInput.value = '27017'; + break; + } + }); + // 保存配置 function saveConfig() { const config = { diff --git a/views/spider_control.py b/views/spider_control.py index 67a445f..8d74d9a 100644 --- a/views/spider_control.py +++ b/views/spider_control.py @@ -14,6 +14,12 @@ import aiohttp from concurrent.futures import ThreadPoolExecutor from ratelimit import limits, sleep_and_retry from tenacity import retry, stop_after_attempt, wait_exponential +import pymysql +import psycopg2 +from pymongo import MongoClient +from cryptography.fernet import Fernet +import base64 +import re # 创建蓝图 spider_bp = Blueprint('spider', __name__) @@ -22,6 +28,10 @@ spider_bp = Blueprint('spider', __name__) logger = logging.getLogger('spider_control') logger.setLevel(logging.INFO) +# 加密密钥 +ENCRYPTION_KEY = Fernet.generate_key() +cipher_suite = Fernet(ENCRYPTION_KEY) + # 存储WebSocket连接的集合 websocket_connections = set() @@ -41,14 +51,93 @@ DEFAULT_CONFIG = { 'interval': 5, 'maxRetries': 3, 'timeout': 30, - 'maxConcurrent': 2 + 'maxConcurrent': 2, + 'requestsPerMinute': 60 } -# 限流装饰器 -@sleep_and_retry -@limits(calls=100, period=60) # 每分钟最多100个请求 -def rate_limited_request(): - pass +def encrypt_data(data): + """加密敏感数据""" + if not data: + return None + return cipher_suite.encrypt(data.encode()).decode() + +def decrypt_data(encrypted_data): + """解密敏感数据""" + if not encrypted_data: + return None + return cipher_suite.decrypt(encrypted_data.encode()).decode() + +@spider_bp.route('/api/spider/test-db', methods=['POST']) +def test_db_connection(): + """测试数据库连接""" + try: + data = request.get_json() + db_type = data.get('type') + host = data.get('host') + port = data.get('port') + db_name = data.get('name') + user = data.get('user') + password = data.get('password') + + if not all([db_type, host, port, db_name, user, password]): + return jsonify({ + 'success': False, + 'message': '请提供完整的数据库配置信息' + }) + + try: + if db_type == 'mysql': + connection = pymysql.connect( + host=host, + port=port, + user=user, + password=password, + database=db_name + ) + connection.close() + elif db_type == 'postgresql': + connection = psycopg2.connect( + host=host, + port=port, + database=db_name, + user=user, + password=password + ) + connection.close() + elif db_type == 'mongodb': + client = MongoClient( + host=host, + port=port, + username=user, + password=password, + authSource=db_name + ) + client.server_info() # 测试连接 + client.close() + else: + return jsonify({ + 'success': False, + 'message': '不支持的数据库类型' + }) + + return jsonify({ + 'success': True, + 'message': '数据库连接测试成功' + }) + + except Exception as e: + logger.error(f"数据库连接测试失败: {str(e)}") + return jsonify({ + 'success': False, + 'message': f'数据库连接失败: {str(e)}' + }) + + except Exception as e: + logger.error(f"处理数据库测试请求时出错: {str(e)}") + return jsonify({ + 'success': False, + 'message': str(e) + }) class SpiderWorker: def __init__(self, topics, parameters): @@ -60,6 +149,50 @@ class SpiderWorker: self.message_buffer = [] self.message_buffer_size = 10 self.semaphore = asyncio.Semaphore(parameters.get('maxConcurrent', DEFAULT_CONFIG['maxConcurrent'])) + self.rate_limiter = asyncio.Semaphore(parameters.get('requestsPerMinute', DEFAULT_CONFIG['requestsPerMinute'])) + self.accounts = parameters.get('accounts', []) + self.current_account_index = 0 + self.account_lock = asyncio.Lock() + + # 添加筛选条件 + self.filters = parameters.get('filters', {}) + self.interaction_filters = self.filters.get('interaction', {}) + self.regex_filters = self.filters.get('regex', []) + self.filter_options = self.filters.get('options', {}) + + # 初始化正则表达式 + self.compiled_regex = [] + for regex_filter in self.regex_filters: + try: + pattern = regex_filter['pattern'] + if pattern: + self.compiled_regex.append({ + 'regex': re.compile(pattern), + 'target': regex_filter['target'], + 'inverse': regex_filter['inverse'] + }) + except re.error as e: + logger.error(f"正则表达式编译失败: {pattern}, 错误: {e}") + + def get_next_account(self): + """获取下一个可用账号""" + with self.account_lock: + if not self.accounts: + raise ValueError("没有可用的账号") + + account = self.accounts[self.current_account_index] + self.current_account_index = (self.current_account_index + 1) % len(self.accounts) + return account + + async def acquire_rate_limit(self): + """获取速率限制令牌""" + await self.rate_limiter.acquire() + asyncio.create_task(self.release_rate_limit()) + + async def release_rate_limit(self): + """释放速率限制令牌""" + await asyncio.sleep(60) # 1分钟后释放 + self.rate_limiter.release() async def send_message(self, message): """异步发送消息,使用缓冲区优化""" @@ -82,22 +215,43 @@ class SpiderWorker: async def crawl_single_topic(self, topic): """爬取单个话题""" try: - rate_limited_request() + await self.acquire_rate_limit() + + # 获取当前要使用的账号 + account = self.get_next_account() await self.send_message({ 'type': 'log', - 'message': f'开始爬取话题: {topic}' + 'message': f'使用账号 {account["username"]} 开始爬取话题: {topic}' }) + filtered_count = 0 + total_count = 0 + async with self.semaphore: + # 创建一个回调函数来处理爬取的数据 + def process_post(post): + nonlocal filtered_count, total_count + total_count += 1 + + # 应用筛选条件 + if self.apply_filters(post): + filtered_count += 1 + return True + return False + + # 调用爬虫并传入回调函数 await asyncio.get_event_loop().run_in_executor( thread_pool, - self.spider.crawl_topic, - topic, - self.parameters['crawlDepth'], - self.parameters['interval'], - self.parameters['maxRetries'], - self.parameters['timeout'] + lambda: self.spider.crawl_topic( + topic, + self.parameters['crawlDepth'], + self.parameters['interval'], + self.parameters['maxRetries'], + self.parameters['timeout'], + account['cookie'], + process_post # 传入回调函数 + ) ) self.completed_topics += 1 @@ -108,9 +262,10 @@ class SpiderWorker: 'value': progress }) + # 发送筛选统计信息 await self.send_message({ 'type': 'log', - 'message': f'话题 {topic} 爬取完成' + 'message': f'话题 {topic} 爬取完成,共爬取 {total_count} 条微博,符合筛选条件 {filtered_count} 条' }) except Exception as e: @@ -142,6 +297,116 @@ class SpiderWorker: finally: await self.flush_messages() + def apply_filters(self, post): + """ + 应用筛选条件到单条微博 + + Args: + post: 微博数据字典 + + Returns: + bool: 是否通过筛选 + """ + try: + # 1. 检查互动数据 + if not self._check_interaction_metrics(post): + return False + + # 2. 检查正则匹配 + if not self._check_regex_filters(post): + return False + + # 3. 检查高级选项 + if not self._check_advanced_options(post): + return False + + return True + + except Exception as e: + logger.error(f"应用筛选条件时出错: {e}") + return False + + def _check_interaction_metrics(self, post): + """检查互动指标是否满足条件""" + try: + # 获取互动指标的最小值要求 + min_likes = self.interaction_filters.get('minLikes', 0) + min_comments = self.interaction_filters.get('minComments', 0) + min_reposts = self.interaction_filters.get('minReposts', 0) + min_reads = self.interaction_filters.get('minReads', 0) + + # 检查是否满足所有条件 + if post.get('like_count', 0) < min_likes: + return False + if post.get('comment_count', 0) < min_comments: + return False + if post.get('forward_count', 0) < min_reposts: + return False + if post.get('read_count', 0) < min_reads: + return False + + return True + + except Exception as e: + logger.error(f"检查互动指标时出错: {e}") + return False + + def _check_regex_filters(self, post): + """检查正则表达式匹配""" + try: + for regex_filter in self.compiled_regex: + regex = regex_filter['regex'] + target = regex_filter['target'] + inverse = regex_filter['inverse'] + + # 获取目标文本 + if target == 'content': + text = post.get('content', '') + elif target == 'author': + text = post.get('user_name', '') + elif target == 'location': + text = post.get('location', '') + else: + continue + + # 执行匹配 + match = bool(regex.search(text)) + + # 如果是反向匹配,取反结果 + if inverse: + match = not match + + # 如果不满足条件,返回False + if not match: + return False + + return True + + except Exception as e: + logger.error(f"检查正则匹配时出错: {e}") + return False + + def _check_advanced_options(self, post): + """检查高级筛选选项""" + try: + # 检查是否只要原创内容 + if self.filter_options.get('originalOnly') and not post.get('is_original', False): + return False + + # 检查是否必须包含媒体 + if self.filter_options.get('withMediaOnly') and not post.get('has_media', False): + return False + + # 检查是否只要认证用户 + if self.filter_options.get('verifiedOnly') and not post.get('user_verified', False): + return False + + return True + + except Exception as e: + logger.error(f"检查高级选项时出错: {e}") + return False + async def broadcast_message(messages): """广播消息到所有WebSocket连接""" if not websocket_connections: @@ -172,6 +437,7 @@ async def start_spider(): data = request.get_json() topics = data.get('topics', []) parameters = {**DEFAULT_CONFIG, **data.get('parameters', {})} + accounts = data.get('accounts', []) if not topics: return jsonify({ @@ -179,6 +445,20 @@ async def start_spider(): 'message': '请选择至少一个话题' }) + if not accounts: + return jsonify({ + 'success': False, + 'message': '请配置至少一个账号' + }) + + # 处理账号Cookie的加密存储 + for account in accounts: + if account.get('saveCookie'): + account['cookie'] = encrypt_data(account['cookie']) + + # 将账号信息添加到参数中 + parameters['accounts'] = accounts + # 创建爬虫工作器 worker = SpiderWorker(topics, parameters) @@ -298,17 +578,36 @@ def generate_ai_config(): # 构建AI提示 system_prompt = """你是一个专业的爬虫配置助手。请根据用户的自然语言描述,生成合适的微博爬虫配置。 + 配置应包含以下内容: 1. 要爬取的话题列表 -2. 爬虫参数(爬取深度、间隔时间、重试次数、超时时间) +2. 爬虫参数配置 + - 爬取深度(crawlDepth):1-10页 + - 间隔时间(interval):3-30秒 + - 重试次数(maxRetries):1-5次 + - 超时时间(timeout):10-60秒 + - 最大并行数(maxConcurrent):1-5 + - 每分钟请求数限制(requestsPerMinute):30-120 + +3. 内容筛选条件 + a) 互动数据筛选(设为0表示不启用) + - 最小点赞数(minLikes) + - 最小评论数(minComments) + - 最小转发数(minReposts) + - 最小阅读数(minReads) + + b) 正则表达式筛选(数组,可以有多个规则) + - pattern: 正则表达式模式 + - target: 匹配目标(content/author/location) + - inverse: 是否反向匹配(true/false) + + c) 高级筛选选项(布尔值) + - originalOnly: 是否只要原创内容 + - withMediaOnly: 是否必须包含媒体 + - verifiedOnly: 是否只要认证用户 请先用通俗易懂的语言解释你的配置建议,然后在最后提供一个JSON格式的具体配置。 -注意: -- 爬取深度(crawlDepth)范围:1-10页 -- 间隔时间(interval)范围:3-30秒 -- 重试次数(maxRetries)范围:1-5次 -- 超时时间(timeout)范围:10-60秒 -- 所有参数都必须是整数 +所有数值参数必须是整数,并且在指定范围内。 示例输出格式: 根据您的需求,我建议... @@ -319,7 +618,29 @@ def generate_ai_config(): "crawlDepth": 5, "interval": 5, "maxRetries": 3, - "timeout": 30 + "timeout": 30, + "maxConcurrent": 2, + "requestsPerMinute": 60 + }, + "filters": { + "interaction": { + "minLikes": 1000, + "minComments": 100, + "minReposts": 50, + "minReads": 10000 + }, + "regex": [ + { + "pattern": "关键词", + "target": "content", + "inverse": false + } + ], + "options": { + "originalOnly": true, + "withMediaOnly": false, + "verifiedOnly": true + } } }""" @@ -365,6 +686,54 @@ def generate_ai_config(): except Exception as e: logger.error(f"生成配置失败: {e}") + return jsonify({ + 'success': False, + 'message': str(e) + }) + +@spider_bp.route('/api/spider/validate-account', methods=['POST']) +async def validate_account(): + """验证微博账号""" + try: + data = request.get_json() + cookie = data.get('cookie') + + if not cookie: + return jsonify({ + 'success': False, + 'message': 'Cookie不能为空' + }) + + # 创建测试请求 + try: + async with aiohttp.ClientSession() as session: + headers = { + 'Cookie': cookie, + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' + } + # 尝试访问微博API + async with session.get('https://weibo.com/ajax/profile/info', headers=headers) as response: + if response.status == 200: + data = await response.json() + if data.get('data', {}).get('user', {}): + return jsonify({ + 'success': True, + 'message': '账号验证成功' + }) + + return jsonify({ + 'success': False, + 'message': 'Cookie无效或已过期' + }) + except Exception as e: + logger.error(f"验证账号时发生错误: {e}") + return jsonify({ + 'success': False, + 'message': f'验证过程出错: {str(e)}' + }) + + except Exception as e: + logger.error(f"处理账号验证请求时出错: {e}") return jsonify({ 'success': False, 'message': str(e)