This commit is contained in:
666ghj
2025-11-03 16:36:32 +08:00
8 changed files with 65 additions and 49 deletions
+2
View File
@@ -295,6 +295,8 @@ secrets.json
*.key *.key
*.pem *.pem
*.crt *.crt
config.py
MindSpider/config.py
# API 密钥 # API 密钥
api_keys.txt api_keys.txt
@@ -12,6 +12,7 @@ import json
from datetime import datetime, date from datetime import datetime, date
from pathlib import Path from pathlib import Path
from typing import List, Dict, Optional from typing import List, Dict, Optional
from loguru import logger
# 添加项目根目录到路径 # 添加项目根目录到路径
project_root = Path(__file__).parent.parent project_root = Path(__file__).parent.parent
@@ -38,8 +39,7 @@ SOURCE_NAMES = {
"wallstreetcn": "华尔街见闻", "wallstreetcn": "华尔街见闻",
"thepaper": "澎湃新闻", "thepaper": "澎湃新闻",
"cls-hot": "财联社", "cls-hot": "财联社",
"xueqiu": "雪球热榜", "xueqiu": "雪球热榜"
"kuaishou": "快手热榜"
} }
class NewsCollector: class NewsCollector:
@@ -72,15 +72,25 @@ class NewsCollector:
async def fetch_news(self, source: str) -> dict: async def fetch_news(self, source: str) -> dict:
"""从指定源获取最新新闻""" """从指定源获取最新新闻"""
url = f"{BASE_URL}/api/s?id={source}&latest" url = f"{BASE_URL}/api/s?id={source}&latest"
headers = {"Accept": "application/json"} headers = {
"Accept": "application/json, text/plain, */*",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/124.0.0.0 Safari/537.36"
),
"Referer": BASE_URL,
"Connection": "keep-alive",
}
try: try:
async with httpx.AsyncClient(timeout=30.0) as client: async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
response = await client.get(url, headers=headers) response = await client.get(url, headers=headers)
response.raise_for_status() response.raise_for_status()
# 解析JSON响应 # 解析JSON响应
data = json.loads(response.text) data = response.json()
return { return {
"source": source, "source": source,
"status": "success", "status": "success",
@@ -91,21 +101,21 @@ class NewsCollector:
return { return {
"source": source, "source": source,
"status": "timeout", "status": "timeout",
"error": "请求超时", "error": f"请求超时: {source}({url})",
"timestamp": datetime.now().isoformat() "timestamp": datetime.now().isoformat()
} }
except httpx.HTTPStatusError as e: except httpx.HTTPStatusError as e:
return { return {
"source": source, "source": source,
"status": "http_error", "status": "http_error",
"error": f"HTTP错误: {e.response.status_code}", "error": f"HTTP错误: {source}({url}) - {e.response.status_code}",
"timestamp": datetime.now().isoformat() "timestamp": datetime.now().isoformat()
} }
except Exception as e: except Exception as e:
return { return {
"source": source, "source": source,
"status": "error", "status": "error",
"error": f"未知错误: {str(e)}", "error": f"未知错误: {source}({url}) - {str(e)}",
"timestamp": datetime.now().isoformat() "timestamp": datetime.now().isoformat()
} }
@@ -114,13 +124,13 @@ class NewsCollector:
if sources is None: if sources is None:
sources = list(SOURCE_NAMES.keys()) sources = list(SOURCE_NAMES.keys())
print(f"正在获取 {len(sources)} 个新闻源的最新内容...") logger.info(f"正在获取 {len(sources)} 个新闻源的最新内容...")
print("=" * 80) logger.info("=" * 80)
results = [] results = []
for source in sources: for source in sources:
source_name = SOURCE_NAMES.get(source, source) source_name = SOURCE_NAMES.get(source, source)
print(f"正在获取 {source_name} 的新闻...") logger.info(f"正在获取 {source_name} 的新闻...")
result = await self.fetch_news(source) result = await self.fetch_news(source)
results.append(result) results.append(result)
@@ -128,11 +138,11 @@ class NewsCollector:
data = result["data"] data = result["data"]
if 'items' in data and isinstance(data['items'], list): if 'items' in data and isinstance(data['items'], list):
count = len(data['items']) count = len(data['items'])
print(f"{source_name}: 获取成功,共 {count} 条新闻") logger.info(f"{source_name}: 获取成功,共 {count} 条新闻")
else: else:
print(f"{source_name}: 获取成功") logger.info(f"{source_name}: 获取成功")
else: else:
print(f"{source_name}: {result.get('error', '获取失败')}") logger.error(f"{source_name}: {result.get('error', '获取失败')}")
# 避免请求过快 # 避免请求过快
await asyncio.sleep(0.5) await asyncio.sleep(0.5)
@@ -151,18 +161,21 @@ class NewsCollector:
Returns: Returns:
包含收集结果的字典 包含收集结果的字典
""" """
print(f"开始收集每日热点新闻...") collection_summary_message = ""
print(f"时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") collection_summary_message += "\n开始收集每日热点新闻...\n"
collection_summary_message += f"时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
# 选择新闻源 # 选择新闻源
if sources is None: if sources is None:
# 使用所有支持的新闻源 # 使用所有支持的新闻源
sources = list(SOURCE_NAMES.keys()) sources = list(SOURCE_NAMES.keys())
print(f"将从 {len(sources)} 个新闻源收集数据:") collection_summary_message += f"将从 {len(sources)} 个新闻源收集数据:\n"
for source in sources: for source in sources:
source_name = SOURCE_NAMES.get(source, source) source_name = SOURCE_NAMES.get(source, source)
print(f" - {source_name}") collection_summary_message += f" - {source_name}\n"
logger.info(collection_summary_message)
try: try:
# 获取新闻数据 # 获取新闻数据
@@ -185,7 +198,7 @@ class NewsCollector:
return processed_data return processed_data
except Exception as e: except Exception as e:
print(f"收集新闻失败: {e}") logger.exception(f"收集新闻失败: {e}")
return { return {
'success': False, 'success': False,
'error': str(e), 'error': str(e),
@@ -255,35 +268,30 @@ class NewsCollector:
} }
except Exception as e: except Exception as e:
print(f"处理新闻项失败: {e}") logger.exception(f"处理新闻项失败: {e}")
return None return None
def _print_collection_summary(self, data: Dict): def _print_collection_summary(self, data: Dict):
"""打印收集摘要""" """打印收集摘要"""
print("\n" + "=" * 50) collection_summary_message = ""
print("新闻收集摘要") collection_summary_message += f"\n总新闻源: {data['total_sources']}\n"
print("=" * 50) collection_summary_message += f"成功源数: {data['successful_sources']}\n"
collection_summary_message += f"总新闻数: {data['total_news']}\n"
print(f"总新闻源: {data['total_sources']}")
print(f"成功源数: {data['successful_sources']}")
print(f"总新闻数: {data['total_news']}")
if 'saved_count' in data: if 'saved_count' in data:
print(f"已保存数: {data['saved_count']}") collection_summary_message += f"已保存数: {data['saved_count']}\n"
logger.info(collection_summary_message)
print("=" * 50)
def get_today_news(self) -> List[Dict]: def get_today_news(self) -> List[Dict]:
"""获取今天的新闻""" """获取今天的新闻"""
try: try:
return self.db_manager.get_daily_news(date.today()) return self.db_manager.get_daily_news(date.today())
except Exception as e: except Exception as e:
print(f"获取今日新闻失败: {e}") logger.exception(f"获取今日新闻失败: {e}")
return [] return []
async def main(): async def main():
"""测试新闻收集器""" """测试新闻收集器"""
print("测试新闻收集器...") logger.info("测试新闻收集器...")
async with NewsCollector() as collector: async with NewsCollector() as collector:
# 收集新闻 # 收集新闻
@@ -292,9 +300,9 @@ async def main():
) )
if result['success']: if result['success']:
print(f"收集成功!共获取 {result['total_news']} 条新闻") logger.info(f"收集成功!共获取 {result['total_news']} 条新闻")
else: else:
print(f"收集失败: {result.get('error', '未知错误')}") logger.error(f"收集失败: {result.get('error', '未知错误')}")
if __name__ == "__main__": if __name__ == "__main__":
asyncio.run(main()) asyncio.run(main())
@@ -455,19 +455,12 @@ CREATE TABLE tieba_comment
KEY `idx_tieba_comment_publish_time` (`publish_time`) KEY `idx_tieba_comment_publish_time` (`publish_time`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='贴吧评论表'; ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='贴吧评论表';
-- 增加搜索来源关键字字段 alter table bilibili_video add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
alter table bilibili_video alter table douyin_aweme add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; alter table kuaishou_video add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
alter table douyin_aweme alter table weibo_note add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; alter table xhs_note add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
alter table kuaishou_video alter table tieba_note add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
alter table weibo_note
add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
alter table xhs_note
add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
alter table tieba_note
add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
DROP TABLE IF EXISTS `weibo_creator`; DROP TABLE IF EXISTS `weibo_creator`;
+5
View File
@@ -218,6 +218,8 @@ playwright install chromium
#### 4.1 Configure API Keys #### 4.1 Configure API Keys
Make a copy of the `config.py.example` file and name the copy `config.py`.
Edit the `config.py` file and fill in your API keys (you can also choose your own models and search proxies; see the config file for details): Edit the `config.py` file and fill in your API keys (you can also choose your own models and search proxies; see the config file for details):
```python ```python
@@ -243,6 +245,9 @@ INSIGHT_ENGINE_MODEL_NAME = "kimi-k2-0711-preview"
#### 4.2 Database Initialization #### 4.2 Database Initialization
**Option 1: Use Local Database** **Option 1: Use Local Database**
Use `MindSpider\config.py.example` as the configuration template: copy the file and rename the copy to `config.py`.
```bash ```bash
# Local MySQL database initialization # Local MySQL database initialization
cd MindSpider cd MindSpider
+7
View File
@@ -21,6 +21,9 @@
</div> </div>
> [!IMPORTANT]
> 周一(11.3)会上**在线一键部署体验**,欢迎持续关注!
## ⚡ 项目概述 ## ⚡ 项目概述
“**微舆**” 是一个从0实现的创新型 多智能体 舆情分析系统,帮助大家破除信息茧房,还原舆情原貌,预测未来走向,辅助决策。用户只需像聊天一样提出分析需求,智能体开始全自动分析 国内外30+主流社媒 与 数百万条大众评论。 “**微舆**” 是一个从0实现的创新型 多智能体 舆情分析系统,帮助大家破除信息茧房,还原舆情原貌,预测未来走向,辅助决策。用户只需像聊天一样提出分析需求,智能体开始全自动分析 国内外30+主流社媒 与 数百万条大众评论。
@@ -220,6 +223,8 @@ playwright install chromium
#### 4.1 配置API密钥 #### 4.1 配置API密钥
复制一份 `config.py.example` 文件,命名为 `config.py`
编辑 `config.py` 文件,填入您的API密钥(您也可以选择自己的模型、搜索代理,详情见config文件内): 编辑 `config.py` 文件,填入您的API密钥(您也可以选择自己的模型、搜索代理,详情见config文件内):
```python ```python
@@ -248,6 +253,8 @@ INSIGHT_ENGINE_MODEL_NAME = "kimi-k2-0711-preview"
> MindSpider爬虫系统跟舆情系统是各自独立的,所以需要再去`MindSpider\config.py`配置一下 > MindSpider爬虫系统跟舆情系统是各自独立的,所以需要再去`MindSpider\config.py`配置一下
配置模板可以参考 `MindSpider\config.py.example`,复制该文件并将副本命名为 `config.py` 即可。
```bash ```bash
# 本地MySQL数据库初始化 # 本地MySQL数据库初始化
cd MindSpider cd MindSpider
View File
+2 -1
View File
@@ -72,4 +72,5 @@ flake8>=6.0.0
# ===== Web服务器 ===== # ===== Web服务器 =====
fastapi==0.110.2 fastapi==0.110.2
uvicorn==0.29.0 uvicorn==0.29.0
loguru