Merge branch 'main' of https://github.com/666ghj/Weibo_PublicOpinion_AnalysisSystem
This commit is contained in:
@@ -295,6 +295,8 @@ secrets.json
|
|||||||
*.key
|
*.key
|
||||||
*.pem
|
*.pem
|
||||||
*.crt
|
*.crt
|
||||||
|
config.py
|
||||||
|
MindSpider/config.py
|
||||||
|
|
||||||
# API 密钥
|
# API 密钥
|
||||||
api_keys.txt
|
api_keys.txt
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ import json
|
|||||||
from datetime import datetime, date
|
from datetime import datetime, date
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Dict, Optional
|
from typing import List, Dict, Optional
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
# 添加项目根目录到路径
|
# 添加项目根目录到路径
|
||||||
project_root = Path(__file__).parent.parent
|
project_root = Path(__file__).parent.parent
|
||||||
@@ -38,8 +39,7 @@ SOURCE_NAMES = {
|
|||||||
"wallstreetcn": "华尔街见闻",
|
"wallstreetcn": "华尔街见闻",
|
||||||
"thepaper": "澎湃新闻",
|
"thepaper": "澎湃新闻",
|
||||||
"cls-hot": "财联社",
|
"cls-hot": "财联社",
|
||||||
"xueqiu": "雪球热榜",
|
"xueqiu": "雪球热榜"
|
||||||
"kuaishou": "快手热榜"
|
|
||||||
}
|
}
|
||||||
|
|
||||||
class NewsCollector:
|
class NewsCollector:
|
||||||
@@ -72,15 +72,25 @@ class NewsCollector:
|
|||||||
async def fetch_news(self, source: str) -> dict:
|
async def fetch_news(self, source: str) -> dict:
|
||||||
"""从指定源获取最新新闻"""
|
"""从指定源获取最新新闻"""
|
||||||
url = f"{BASE_URL}/api/s?id={source}&latest"
|
url = f"{BASE_URL}/api/s?id={source}&latest"
|
||||||
headers = {"Accept": "application/json"}
|
headers = {
|
||||||
|
"Accept": "application/json, text/plain, */*",
|
||||||
|
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
|
||||||
|
"User-Agent": (
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||||||
|
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||||
|
"Chrome/124.0.0.0 Safari/537.36"
|
||||||
|
),
|
||||||
|
"Referer": BASE_URL,
|
||||||
|
"Connection": "keep-alive",
|
||||||
|
}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
async with httpx.AsyncClient(timeout=30.0) as client:
|
async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
|
||||||
response = await client.get(url, headers=headers)
|
response = await client.get(url, headers=headers)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
|
|
||||||
# 解析JSON响应
|
# 解析JSON响应
|
||||||
data = json.loads(response.text)
|
data = response.json()
|
||||||
return {
|
return {
|
||||||
"source": source,
|
"source": source,
|
||||||
"status": "success",
|
"status": "success",
|
||||||
@@ -91,21 +101,21 @@ class NewsCollector:
|
|||||||
return {
|
return {
|
||||||
"source": source,
|
"source": source,
|
||||||
"status": "timeout",
|
"status": "timeout",
|
||||||
"error": "请求超时",
|
"error": f"请求超时: {source}({url})",
|
||||||
"timestamp": datetime.now().isoformat()
|
"timestamp": datetime.now().isoformat()
|
||||||
}
|
}
|
||||||
except httpx.HTTPStatusError as e:
|
except httpx.HTTPStatusError as e:
|
||||||
return {
|
return {
|
||||||
"source": source,
|
"source": source,
|
||||||
"status": "http_error",
|
"status": "http_error",
|
||||||
"error": f"HTTP错误: {e.response.status_code}",
|
"error": f"HTTP错误: {source}({url}) - {e.response.status_code}",
|
||||||
"timestamp": datetime.now().isoformat()
|
"timestamp": datetime.now().isoformat()
|
||||||
}
|
}
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return {
|
return {
|
||||||
"source": source,
|
"source": source,
|
||||||
"status": "error",
|
"status": "error",
|
||||||
"error": f"未知错误: {str(e)}",
|
"error": f"未知错误: {source}({url}) - {str(e)}",
|
||||||
"timestamp": datetime.now().isoformat()
|
"timestamp": datetime.now().isoformat()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -114,13 +124,13 @@ class NewsCollector:
|
|||||||
if sources is None:
|
if sources is None:
|
||||||
sources = list(SOURCE_NAMES.keys())
|
sources = list(SOURCE_NAMES.keys())
|
||||||
|
|
||||||
print(f"正在获取 {len(sources)} 个新闻源的最新内容...")
|
logger.info(f"正在获取 {len(sources)} 个新闻源的最新内容...")
|
||||||
print("=" * 80)
|
logger.info("=" * 80)
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
for source in sources:
|
for source in sources:
|
||||||
source_name = SOURCE_NAMES.get(source, source)
|
source_name = SOURCE_NAMES.get(source, source)
|
||||||
print(f"正在获取 {source_name} 的新闻...")
|
logger.info(f"正在获取 {source_name} 的新闻...")
|
||||||
result = await self.fetch_news(source)
|
result = await self.fetch_news(source)
|
||||||
results.append(result)
|
results.append(result)
|
||||||
|
|
||||||
@@ -128,11 +138,11 @@ class NewsCollector:
|
|||||||
data = result["data"]
|
data = result["data"]
|
||||||
if 'items' in data and isinstance(data['items'], list):
|
if 'items' in data and isinstance(data['items'], list):
|
||||||
count = len(data['items'])
|
count = len(data['items'])
|
||||||
print(f"✓ {source_name}: 获取成功,共 {count} 条新闻")
|
logger.info(f"✓ {source_name}: 获取成功,共 {count} 条新闻")
|
||||||
else:
|
else:
|
||||||
print(f"✓ {source_name}: 获取成功")
|
logger.info(f"✓ {source_name}: 获取成功")
|
||||||
else:
|
else:
|
||||||
print(f"✗ {source_name}: {result.get('error', '获取失败')}")
|
logger.error(f"✗ {source_name}: {result.get('error', '获取失败')}")
|
||||||
|
|
||||||
# 避免请求过快
|
# 避免请求过快
|
||||||
await asyncio.sleep(0.5)
|
await asyncio.sleep(0.5)
|
||||||
@@ -151,18 +161,21 @@ class NewsCollector:
|
|||||||
Returns:
|
Returns:
|
||||||
包含收集结果的字典
|
包含收集结果的字典
|
||||||
"""
|
"""
|
||||||
print(f"开始收集每日热点新闻...")
|
collection_summary_message = ""
|
||||||
print(f"时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
collection_summary_message += "\n开始收集每日热点新闻...\n"
|
||||||
|
collection_summary_message += f"时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
|
||||||
|
|
||||||
# 选择新闻源
|
# 选择新闻源
|
||||||
if sources is None:
|
if sources is None:
|
||||||
# 使用所有支持的新闻源
|
# 使用所有支持的新闻源
|
||||||
sources = list(SOURCE_NAMES.keys())
|
sources = list(SOURCE_NAMES.keys())
|
||||||
|
|
||||||
print(f"将从 {len(sources)} 个新闻源收集数据:")
|
collection_summary_message += f"将从 {len(sources)} 个新闻源收集数据:\n"
|
||||||
for source in sources:
|
for source in sources:
|
||||||
source_name = SOURCE_NAMES.get(source, source)
|
source_name = SOURCE_NAMES.get(source, source)
|
||||||
print(f" - {source_name}")
|
collection_summary_message += f" - {source_name}\n"
|
||||||
|
|
||||||
|
logger.info(collection_summary_message)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# 获取新闻数据
|
# 获取新闻数据
|
||||||
@@ -185,7 +198,7 @@ class NewsCollector:
|
|||||||
return processed_data
|
return processed_data
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"收集新闻失败: {e}")
|
logger.exception(f"收集新闻失败: {e}")
|
||||||
return {
|
return {
|
||||||
'success': False,
|
'success': False,
|
||||||
'error': str(e),
|
'error': str(e),
|
||||||
@@ -255,35 +268,30 @@ class NewsCollector:
|
|||||||
}
|
}
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"处理新闻项失败: {e}")
|
logger.exception(f"处理新闻项失败: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _print_collection_summary(self, data: Dict):
|
def _print_collection_summary(self, data: Dict):
|
||||||
"""打印收集摘要"""
|
"""打印收集摘要"""
|
||||||
print("\n" + "=" * 50)
|
collection_summary_message = ""
|
||||||
print("新闻收集摘要")
|
collection_summary_message += f"\n总新闻源: {data['total_sources']}\n"
|
||||||
print("=" * 50)
|
collection_summary_message += f"成功源数: {data['successful_sources']}\n"
|
||||||
|
collection_summary_message += f"总新闻数: {data['total_news']}\n"
|
||||||
print(f"总新闻源: {data['total_sources']}")
|
|
||||||
print(f"成功源数: {data['successful_sources']}")
|
|
||||||
print(f"总新闻数: {data['total_news']}")
|
|
||||||
|
|
||||||
if 'saved_count' in data:
|
if 'saved_count' in data:
|
||||||
print(f"已保存数: {data['saved_count']}")
|
collection_summary_message += f"已保存数: {data['saved_count']}\n"
|
||||||
|
logger.info(collection_summary_message)
|
||||||
print("=" * 50)
|
|
||||||
|
|
||||||
def get_today_news(self) -> List[Dict]:
|
def get_today_news(self) -> List[Dict]:
|
||||||
"""获取今天的新闻"""
|
"""获取今天的新闻"""
|
||||||
try:
|
try:
|
||||||
return self.db_manager.get_daily_news(date.today())
|
return self.db_manager.get_daily_news(date.today())
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"获取今日新闻失败: {e}")
|
logger.exception(f"获取今日新闻失败: {e}")
|
||||||
return []
|
return []
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
"""测试新闻收集器"""
|
"""测试新闻收集器"""
|
||||||
print("测试新闻收集器...")
|
logger.info("测试新闻收集器...")
|
||||||
|
|
||||||
async with NewsCollector() as collector:
|
async with NewsCollector() as collector:
|
||||||
# 收集新闻
|
# 收集新闻
|
||||||
@@ -292,9 +300,9 @@ async def main():
|
|||||||
)
|
)
|
||||||
|
|
||||||
if result['success']:
|
if result['success']:
|
||||||
print(f"收集成功!共获取 {result['total_news']} 条新闻")
|
logger.info(f"收集成功!共获取 {result['total_news']} 条新闻")
|
||||||
else:
|
else:
|
||||||
print(f"收集失败: {result.get('error', '未知错误')}")
|
logger.error(f"收集失败: {result.get('error', '未知错误')}")
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
asyncio.run(main())
|
asyncio.run(main())
|
||||||
|
|||||||
@@ -455,19 +455,12 @@ CREATE TABLE tieba_comment
|
|||||||
KEY `idx_tieba_comment_publish_time` (`publish_time`)
|
KEY `idx_tieba_comment_publish_time` (`publish_time`)
|
||||||
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='贴吧评论表';
|
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='贴吧评论表';
|
||||||
|
|
||||||
-- 增加搜索来源关键字字段
|
alter table bilibili_video add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
|
||||||
alter table bilibili_video
|
alter table douyin_aweme add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
|
||||||
add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
|
alter table kuaishou_video add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
|
||||||
alter table douyin_aweme
|
alter table weibo_note add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
|
||||||
add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
|
alter table xhs_note add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
|
||||||
alter table kuaishou_video
|
alter table tieba_note add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
|
||||||
add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
|
|
||||||
alter table weibo_note
|
|
||||||
add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
|
|
||||||
alter table xhs_note
|
|
||||||
add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
|
|
||||||
alter table tieba_note
|
|
||||||
add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
|
|
||||||
|
|
||||||
|
|
||||||
DROP TABLE IF EXISTS `weibo_creator`;
|
DROP TABLE IF EXISTS `weibo_creator`;
|
||||||
|
|||||||
@@ -218,6 +218,8 @@ playwright install chromium
|
|||||||
|
|
||||||
#### 4.1 Configure API Keys
|
#### 4.1 Configure API Keys
|
||||||
|
|
||||||
|
Copy the `config.py.example` file to `config.py`
|
||||||
|
|
||||||
Edit the `config.py` file and fill in your API keys (you can also choose your own models and search proxies; see the config file for details):
|
Edit the `config.py` file and fill in your API keys (you can also choose your own models and search proxies; see the config file for details):
|
||||||
|
|
||||||
```python
|
```python
|
||||||
@@ -243,6 +245,9 @@ INSIGHT_ENGINE_MODEL_NAME = "kimi-k2-0711-preview"
|
|||||||
#### 4.2 Database Initialization
|
#### 4.2 Database Initialization
|
||||||
|
|
||||||
**Option 1: Use Local Database**
|
**Option 1: Use Local Database**
|
||||||
|
|
||||||
|
You can refer to `MindSpider\config.py.example` for the configuration template: copy this file and rename it to `config.py`.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Local MySQL database initialization
|
# Local MySQL database initialization
|
||||||
cd MindSpider
|
cd MindSpider
|
||||||
|
|||||||
@@ -21,6 +21,9 @@
|
|||||||
|
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
> [!IMPORTANT]
|
||||||
|
> 周一(11.3)会上**在线一键部署体验**,欢迎持续关注!
|
||||||
|
|
||||||
## ⚡ 项目概述
|
## ⚡ 项目概述
|
||||||
|
|
||||||
“**微舆**” 是一个从0实现的创新型 多智能体 舆情分析系统,帮助大家破除信息茧房,还原舆情原貌,预测未来走向,辅助决策。用户只需像聊天一样提出分析需求,智能体开始全自动分析 国内外30+主流社媒 与 数百万条大众评论。
|
“**微舆**” 是一个从0实现的创新型 多智能体 舆情分析系统,帮助大家破除信息茧房,还原舆情原貌,预测未来走向,辅助决策。用户只需像聊天一样提出分析需求,智能体开始全自动分析 国内外30+主流社媒 与 数百万条大众评论。
|
||||||
@@ -220,6 +223,8 @@ playwright install chromium
|
|||||||
|
|
||||||
#### 4.1 配置API密钥
|
#### 4.1 配置API密钥
|
||||||
|
|
||||||
|
复制一份 `config.py.example` 文件,命名为 `config.py`
|
||||||
|
|
||||||
编辑 `config.py` 文件,填入您的API密钥(您也可以选择自己的模型、搜索代理,详情见config文件内):
|
编辑 `config.py` 文件,填入您的API密钥(您也可以选择自己的模型、搜索代理,详情见config文件内):
|
||||||
|
|
||||||
```python
|
```python
|
||||||
@@ -248,6 +253,8 @@ INSIGHT_ENGINE_MODEL_NAME = "kimi-k2-0711-preview"
|
|||||||
|
|
||||||
> MindSpider爬虫系统跟舆情系统是各自独立的,所以需要再去`MindSpider\config.py`配置一下
|
> MindSpider爬虫系统跟舆情系统是各自独立的,所以需要再去`MindSpider\config.py`配置一下
|
||||||
|
|
||||||
|
配置模板可以参考`MindSpider\config.py.example`,可以复制该文件并命名为`config.py`
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# 本地MySQL数据库初始化
|
# 本地MySQL数据库初始化
|
||||||
cd MindSpider
|
cd MindSpider
|
||||||
|
|||||||
+2
-1
@@ -72,4 +72,5 @@ flake8>=6.0.0
|
|||||||
|
|
||||||
# ===== Web服务器 =====
|
# ===== Web服务器 =====
|
||||||
fastapi==0.110.2
|
fastapi==0.110.2
|
||||||
uvicorn==0.29.0
|
uvicorn==0.29.0
|
||||||
|
loguru
|
||||||
Reference in New Issue
Block a user