From 7726be4526157acc012a990b04d05bd11c126a4b Mon Sep 17 00:00:00 2001 From: BaiFu <670939375@qq.com> Date: Sun, 2 Nov 2025 00:52:08 +0800 Subject: [PATCH 1/7] Update README with important deployment announcement. Added important announcement about online deployment experience on Monday (11.3) to the README. --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 9307f9e..598ea3c 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,9 @@ +> [!IMPORTANT] +> 周一(11.3)会上**在线一键部署体验**,欢迎持续关注! + ## ⚡ 项目概述 “**微舆**” 是一个从0实现的创新型 多智能体 舆情分析系统,帮助大家破除信息茧房,还原舆情原貌,预测未来走向,辅助决策。用户只需像聊天一样提出分析需求,智能体开始全自动分析 国内外30+主流社媒 与 数百万条大众评论。 From 0aaf81ba067715c165a5ef88fbb7606362e634dc Mon Sep 17 00:00:00 2001 From: Doiiars Date: Mon, 3 Nov 2025 11:26:51 +0800 Subject: [PATCH 2/7] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dfetch=5Fnews=E6=97=A0?= =?UTF-8?q?=E6=B3=95=E6=AD=A3=E5=B8=B8=E8=8E=B7=E5=8F=96=E7=9A=84=E9=97=AE?= =?UTF-8?q?=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../BroadTopicExtraction/get_today_news.py | 78 ++++++++++--------- requirements.txt | 3 +- 2 files changed, 45 insertions(+), 36 deletions(-) diff --git a/MindSpider/BroadTopicExtraction/get_today_news.py b/MindSpider/BroadTopicExtraction/get_today_news.py index 2745381..972d441 100644 --- a/MindSpider/BroadTopicExtraction/get_today_news.py +++ b/MindSpider/BroadTopicExtraction/get_today_news.py @@ -12,6 +12,7 @@ import json from datetime import datetime, date from pathlib import Path from typing import List, Dict, Optional +from loguru import logger # 添加项目根目录到路径 project_root = Path(__file__).parent.parent @@ -38,8 +39,7 @@ SOURCE_NAMES = { "wallstreetcn": "华尔街见闻", "thepaper": "澎湃新闻", "cls-hot": "财联社", - "xueqiu": "雪球热榜", - "kuaishou": "快手热榜" + "xueqiu": "雪球热榜" } class NewsCollector: @@ -72,15 +72,25 @@ class NewsCollector: async def fetch_news(self, source: str) -> dict: """从指定源获取最新新闻""" url = f"{BASE_URL}/api/s?id={source}&latest" - headers 
= {"Accept": "application/json"} + headers = { + "Accept": "application/json, text/plain, */*", + "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", + "User-Agent": ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/124.0.0.0 Safari/537.36" + ), + "Referer": BASE_URL, + "Connection": "keep-alive", + } try: - async with httpx.AsyncClient(timeout=30.0) as client: + async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client: response = await client.get(url, headers=headers) response.raise_for_status() # 解析JSON响应 - data = json.loads(response.text) + data = response.json() return { "source": source, "status": "success", @@ -91,21 +101,21 @@ class NewsCollector: return { "source": source, "status": "timeout", - "error": "请求超时", + "error": f"请求超时: {source}({url})", "timestamp": datetime.now().isoformat() } except httpx.HTTPStatusError as e: return { "source": source, "status": "http_error", - "error": f"HTTP错误: {e.response.status_code}", + "error": f"HTTP错误: {source}({url}) - {e.response.status_code}", "timestamp": datetime.now().isoformat() } except Exception as e: return { "source": source, "status": "error", - "error": f"未知错误: {str(e)}", + "error": f"未知错误: {source}({url}) - {str(e)}", "timestamp": datetime.now().isoformat() } @@ -114,13 +124,13 @@ class NewsCollector: if sources is None: sources = list(SOURCE_NAMES.keys()) - print(f"正在获取 {len(sources)} 个新闻源的最新内容...") - print("=" * 80) + logger.info(f"正在获取 {len(sources)} 个新闻源的最新内容...") + logger.info("=" * 80) results = [] for source in sources: source_name = SOURCE_NAMES.get(source, source) - print(f"正在获取 {source_name} 的新闻...") + logger.info(f"正在获取 {source_name} 的新闻...") result = await self.fetch_news(source) results.append(result) @@ -128,11 +138,11 @@ class NewsCollector: data = result["data"] if 'items' in data and isinstance(data['items'], list): count = len(data['items']) - print(f"✓ {source_name}: 获取成功,共 {count} 条新闻") + logger.info(f"✓ {source_name}: 
获取成功,共 {count} 条新闻") else: - print(f"✓ {source_name}: 获取成功") + logger.info(f"✓ {source_name}: 获取成功") else: - print(f"✗ {source_name}: {result.get('error', '获取失败')}") + logger.error(f"✗ {source_name}: {result.get('error', '获取失败')}") # 避免请求过快 await asyncio.sleep(0.5) @@ -151,18 +161,21 @@ class NewsCollector: Returns: 包含收集结果的字典 """ - print(f"开始收集每日热点新闻...") - print(f"时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + collection_summary_message = "" + collection_summary_message += "\n开始收集每日热点新闻...\n" + collection_summary_message += f"时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n" # 选择新闻源 if sources is None: # 使用所有支持的新闻源 sources = list(SOURCE_NAMES.keys()) - print(f"将从 {len(sources)} 个新闻源收集数据:") + collection_summary_message += f"将从 {len(sources)} 个新闻源收集数据:\n" for source in sources: source_name = SOURCE_NAMES.get(source, source) - print(f" - {source_name}") + collection_summary_message += f" - {source_name}\n" + + logger.info(collection_summary_message) try: # 获取新闻数据 @@ -185,7 +198,7 @@ class NewsCollector: return processed_data except Exception as e: - print(f"收集新闻失败: {e}") + logger.exception(f"收集新闻失败: {e}") return { 'success': False, 'error': str(e), @@ -255,35 +268,30 @@ class NewsCollector: } except Exception as e: - print(f"处理新闻项失败: {e}") + logger.exception(f"处理新闻项失败: {e}") return None def _print_collection_summary(self, data: Dict): """打印收集摘要""" - print("\n" + "=" * 50) - print("新闻收集摘要") - print("=" * 50) - - print(f"总新闻源: {data['total_sources']}") - print(f"成功源数: {data['successful_sources']}") - print(f"总新闻数: {data['total_news']}") - + collection_summary_message = "" + collection_summary_message += f"\n总新闻源: {data['total_sources']}\n" + collection_summary_message += f"成功源数: {data['successful_sources']}\n" + collection_summary_message += f"总新闻数: {data['total_news']}\n" if 'saved_count' in data: - print(f"已保存数: {data['saved_count']}") - - print("=" * 50) + collection_summary_message += f"已保存数: {data['saved_count']}\n" + 
logger.info(collection_summary_message) def get_today_news(self) -> List[Dict]: """获取今天的新闻""" try: return self.db_manager.get_daily_news(date.today()) except Exception as e: - print(f"获取今日新闻失败: {e}") + logger.exception(f"获取今日新闻失败: {e}") return [] async def main(): """测试新闻收集器""" - print("测试新闻收集器...") + logger.info("测试新闻收集器...") async with NewsCollector() as collector: # 收集新闻 @@ -292,9 +300,9 @@ async def main(): ) if result['success']: - print(f"收集成功!共获取 {result['total_news']} 条新闻") + logger.info(f"收集成功!共获取 {result['total_news']} 条新闻") else: - print(f"收集失败: {result.get('error', '未知错误')}") + logger.error(f"收集失败: {result.get('error', '未知错误')}") if __name__ == "__main__": asyncio.run(main()) diff --git a/requirements.txt b/requirements.txt index 6bee269..a1ae6c0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -71,4 +71,5 @@ flake8>=6.0.0 # ===== Web服务器 ===== fastapi==0.110.2 -uvicorn==0.29.0 \ No newline at end of file +uvicorn==0.29.0 +loguru \ No newline at end of file From bda4343c48a04983a82288c40cd118b7c3ed59c7 Mon Sep 17 00:00:00 2001 From: ghmark675 <188834327+ghmark675@users.noreply.github.com> Date: Mon, 3 Nov 2025 08:28:36 +0800 Subject: [PATCH 3/7] chore: stop track config.py --- .gitignore | 1 + README.md | 2 ++ config.py => config.py.example | 0 3 files changed, 3 insertions(+) rename config.py => config.py.example (100%) diff --git a/.gitignore b/.gitignore index 76eea29..ac6c0e9 100644 --- a/.gitignore +++ b/.gitignore @@ -295,6 +295,7 @@ secrets.json *.key *.pem *.crt +config.py # API 密钥 api_keys.txt diff --git a/README.md b/README.md index 598ea3c..790579b 100644 --- a/README.md +++ b/README.md @@ -223,6 +223,8 @@ playwright install chromium #### 4.1 配置API密钥 +复制一份 `config.py.example` 文件,命名为 `config.py` + 编辑 `config.py` 文件,填入您的API密钥(您也可以选择自己的模型、搜索代理,详情见config文件内): ```python diff --git a/config.py b/config.py.example similarity index 100% rename from config.py rename to config.py.example From 46b2f00a6eb7d92ef695a254df8c4ed3c44d3287 Mon Sep 17 
00:00:00 2001 From: ghmark675 <188834327+ghmark675@users.noreply.github.com> Date: Mon, 3 Nov 2025 08:38:09 +0800 Subject: [PATCH 4/7] docs(README-EN): Update configuration instructions Instruct users to copy config.py.example to config.py for local setup. --- README-EN.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README-EN.md b/README-EN.md index 0650bb5..d03bd09 100644 --- a/README-EN.md +++ b/README-EN.md @@ -218,6 +218,8 @@ playwright install chromium #### 4.1 Configure API Keys +Copy the `config.py.example` file to `config.py` + Edit the `config.py` file and fill in your API keys (you can also choose your own models and search proxies; see the config file for details): ```python From 6fd897d82a9564081f60e53b5718b9eb9994f75a Mon Sep 17 00:00:00 2001 From: ghmark675 Date: Mon, 3 Nov 2025 12:16:17 +0800 Subject: [PATCH 5/7] chore(config): stop track MindSpider config.py change it to config.py.example --- .gitignore | 1 + MindSpider/{config.py => config.py.example} | 0 2 files changed, 1 insertion(+) rename MindSpider/{config.py => config.py.example} (100%) diff --git a/.gitignore b/.gitignore index ac6c0e9..8582217 100644 --- a/.gitignore +++ b/.gitignore @@ -296,6 +296,7 @@ secrets.json *.pem *.crt config.py +MindSpider/config.py # API 密钥 api_keys.txt diff --git a/MindSpider/config.py b/MindSpider/config.py.example similarity index 100% rename from MindSpider/config.py rename to MindSpider/config.py.example From dba7fa9902e676dbcec2aa65bdc694c433a37a68 Mon Sep 17 00:00:00 2001 From: ghmark675 Date: Mon, 3 Nov 2025 12:20:58 +0800 Subject: [PATCH 6/7] docs(README): Update configuration instructions Instruct users to copy config.py.example to config.py for local setup. 
--- README-EN.md | 3 +++ README.md | 2 ++ 2 files changed, 5 insertions(+) diff --git a/README-EN.md b/README-EN.md index d03bd09..57c117d 100644 --- a/README-EN.md +++ b/README-EN.md @@ -245,6 +245,9 @@ INSIGHT_ENGINE_MODEL_NAME = "kimi-k2-0711-preview" #### 4.2 Database Initialization **Option 1: Use Local Database** + +You can refer to `MindSpider\config.py.example` for the configuration template, and you can copy this file and rename it to `config.py`. + ```bash # Local MySQL database initialization cd MindSpider diff --git a/README.md b/README.md index 790579b..a512efd 100644 --- a/README.md +++ b/README.md @@ -253,6 +253,8 @@ INSIGHT_ENGINE_MODEL_NAME = "kimi-k2-0711-preview" > MindSpider爬虫系统跟舆情系统是各自独立的,所以需要再去`MindSpider\config.py`配置一下 +配置模板可以参考`MindSpider\config.py.example`,可以复制该文件并命名为`config.py` + ```bash # 本地MySQL数据库初始化 cd MindSpider From 5b125ea91ab9c1f22015133840f70f1a08e04628 Mon Sep 17 00:00:00 2001 From: ghmark675 Date: Mon, 3 Nov 2025 13:49:29 +0800 Subject: [PATCH 7/7] hotfix(database): fix `source_keyword` not in table bilibili_video Fix: #51 --- .../MediaCrawler/schema/tables.sql | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/schema/tables.sql b/MindSpider/DeepSentimentCrawling/MediaCrawler/schema/tables.sql index 7310625..f5d1899 100644 --- a/MindSpider/DeepSentimentCrawling/MediaCrawler/schema/tables.sql +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/schema/tables.sql @@ -455,19 +455,12 @@ CREATE TABLE tieba_comment KEY `idx_tieba_comment_publish_time` (`publish_time`) ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='贴吧评论表'; --- 增加搜索来源关键字字段 -alter table bilibili_video - add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; -alter table douyin_aweme - add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; -alter table kuaishou_video - add column `source_keyword` 
varchar(255) default '' comment '搜索来源关键字'; -alter table weibo_note - add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; -alter table xhs_note - add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; -alter table tieba_note - add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; +alter table bilibili_video add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; +alter table douyin_aweme add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; +alter table kuaishou_video add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; +alter table weibo_note add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; +alter table xhs_note add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; +alter table tieba_note add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; DROP TABLE IF EXISTS `weibo_creator`;