diff --git a/InsightEngine/agent.py b/InsightEngine/agent.py
index d57d96f..413efaf 100644
--- a/InsightEngine/agent.py
+++ b/InsightEngine/agent.py
@@ -19,7 +19,7 @@ from .nodes import (
ReportFormattingNode
)
from .state import State
-from .tools import TavilyNewsAgency, TavilyResponse
+from .tools import MediaCrawlerDB, DBResponse
from .utils import Config, load_config, format_search_results_for_prompt
@@ -39,8 +39,16 @@ class DeepSearchAgent:
# 初始化LLM客户端
self.llm_client = self._initialize_llm()
+ # Set the database environment variables (os.environ only accepts str, so unset values become "")
+ os.environ["DB_HOST"] = self.config.db_host or ""
+ os.environ["DB_USER"] = self.config.db_user or ""
+ os.environ["DB_PASSWORD"] = self.config.db_password or ""
+ os.environ["DB_NAME"] = self.config.db_name or ""
+ os.environ["DB_PORT"] = str(self.config.db_port or "")
+ os.environ["DB_CHARSET"] = self.config.db_charset or ""
+
# 初始化搜索工具集
- self.search_agency = TavilyNewsAgency(api_key=self.config.tavily_api_key)
+ self.search_agency = MediaCrawlerDB()
# 初始化节点
self._initialize_nodes()
@@ -53,7 +61,7 @@ class DeepSearchAgent:
print(f"Deep Search Agent 已初始化")
print(f"使用LLM: {self.llm_client.get_model_info()}")
- print(f"搜索工具集: TavilyNewsAgency (支持6种搜索工具)")
+ print(f"搜索工具集: MediaCrawlerDB (支持5种本地数据库查询工具)")
def _initialize_llm(self) -> BaseLLM:
"""初始化LLM客户端"""
@@ -103,46 +111,53 @@ class DeepSearchAgent:
except ValueError:
return False
- def execute_search_tool(self, tool_name: str, query: str, **kwargs) -> TavilyResponse:
+ def execute_search_tool(self, tool_name: str, query: str, **kwargs) -> DBResponse:
"""
- 执行指定的搜索工具
+ Run the named local-database query tool and return its response.
Args:
tool_name: 工具名称,可选值:
- - "basic_search_news": 基础新闻搜索(快速、通用)
- - "deep_search_news": 深度新闻分析
- - "search_news_last_24_hours": 24小时内最新新闻
- - "search_news_last_week": 本周新闻
- - "search_images_for_news": 新闻图片搜索
- - "search_news_by_date": 按日期范围搜索新闻
- query: 搜索查询
- **kwargs: 额外参数(如start_date, end_date, max_results)
+ - "search_hot_content": find hot content (query is not used)
+ - "search_topic_globally": search a topic across all tables
+ - "search_topic_by_date": search a topic within a date range (needs start_date and end_date)
+ - "get_comments_for_topic": fetch comments for a topic
+ - "search_topic_on_platform": search a topic on one platform (needs platform)
+ query: search keyword / topic; an unknown tool_name falls back to search_topic_globally
+ **kwargs: optional start_date, end_date, platform, time_period, limit, limit_per_table; None or 0 means "use the default"
Returns:
- TavilyResponse对象
+ DBResponse object. Raises ValueError when a required start_date/end_date or platform is missing.
"""
- print(f" → 执行搜索工具: {tool_name}")
+ print(f" → 执行数据库查询工具: {tool_name}")
- if tool_name == "basic_search_news":
- max_results = kwargs.get("max_results", 7)
- return self.search_agency.basic_search_news(query, max_results)
- elif tool_name == "deep_search_news":
- return self.search_agency.deep_search_news(query)
- elif tool_name == "search_news_last_24_hours":
- return self.search_agency.search_news_last_24_hours(query)
- elif tool_name == "search_news_last_week":
- return self.search_agency.search_news_last_week(query)
- elif tool_name == "search_images_for_news":
- return self.search_agency.search_images_for_news(query)
- elif tool_name == "search_news_by_date":
+ if tool_name == "search_hot_content":
+ time_period = kwargs.get("time_period") or "week"  # callers always forward the key, so a null from the LLM must still hit the default
+ limit = kwargs.get("limit") or 10
+ return self.search_agency.search_hot_content(time_period=time_period, limit=limit)
+ elif tool_name == "search_topic_globally":
+ limit_per_table = kwargs.get("limit_per_table") or 5
+ return self.search_agency.search_topic_globally(topic=query, limit_per_table=limit_per_table)
+ elif tool_name == "search_topic_by_date":
start_date = kwargs.get("start_date")
end_date = kwargs.get("end_date")
+ limit_per_table = kwargs.get("limit_per_table") or 10
if not start_date or not end_date:
- raise ValueError("search_news_by_date工具需要start_date和end_date参数")
- return self.search_agency.search_news_by_date(query, start_date, end_date)
+ raise ValueError("search_topic_by_date工具需要start_date和end_date参数")
+ return self.search_agency.search_topic_by_date(topic=query, start_date=start_date, end_date=end_date, limit_per_table=limit_per_table)
+ elif tool_name == "get_comments_for_topic":
+ limit = kwargs.get("limit") or 50
+ return self.search_agency.get_comments_for_topic(topic=query, limit=limit)
+ elif tool_name == "search_topic_on_platform":
+ platform = kwargs.get("platform")
+ start_date = kwargs.get("start_date")
+ end_date = kwargs.get("end_date")
+ limit = kwargs.get("limit") or 20
+ if not platform:
+ raise ValueError("search_topic_on_platform工具需要platform参数")
+ return self.search_agency.search_topic_on_platform(platform=platform, topic=query, start_date=start_date, end_date=end_date, limit=limit)
else:
- print(f" ⚠️ 未知的搜索工具: {tool_name},使用默认基础搜索")
- return self.search_agency.basic_search_news(query)
+ print(f" ⚠️ 未知的搜索工具: {tool_name},使用默认全局搜索")
+ return self.search_agency.search_topic_globally(topic=query)
def research(self, query: str, save_report: bool = True) -> str:
"""
@@ -231,7 +246,7 @@ class DeepSearchAgent:
print(" - 生成搜索查询...")
search_output = self.first_search_node.run(search_input)
search_query = search_output["search_query"]
- search_tool = search_output.get("search_tool", "basic_search_news") # 默认工具
+ search_tool = search_output.get("search_tool", "search_topic_globally") # 默认工具
reasoning = search_output["reasoning"]
print(f" - 搜索查询: {search_query}")
@@ -239,11 +254,13 @@ class DeepSearchAgent:
print(f" - 推理: {reasoning}")
# 执行搜索
- print(" - 执行网络搜索...")
+ print(" - 执行数据库查询...")
- # 处理search_news_by_date的特殊参数
+ # 处理特殊参数
search_kwargs = {}
- if search_tool == "search_news_by_date":
+
+ # 处理需要日期的工具
+ if search_tool in ["search_topic_by_date", "search_topic_on_platform"]:
start_date = search_output.get("start_date")
end_date = search_output.get("end_date")
@@ -254,12 +271,35 @@ class DeepSearchAgent:
search_kwargs["end_date"] = end_date
print(f" - 时间范围: {start_date} 到 {end_date}")
else:
- print(f" ⚠️ 日期格式错误(应为YYYY-MM-DD),改用基础搜索")
+ print(f" ⚠️ 日期格式错误(应为YYYY-MM-DD),改用全局搜索")
print(f" 提供的日期: start_date={start_date}, end_date={end_date}")
- search_tool = "basic_search_news"
+ search_tool = "search_topic_globally"
+ elif search_tool == "search_topic_by_date":
+ print(f" ⚠️ search_topic_by_date工具缺少时间参数,改用全局搜索")
+ search_tool = "search_topic_globally"
+
+ # 处理需要平台参数的工具
+ if search_tool == "search_topic_on_platform":
+ platform = search_output.get("platform")
+ if platform:
+ search_kwargs["platform"] = platform
+ print(f" - 指定平台: {platform}")
else:
- print(f" ⚠️ search_news_by_date工具缺少时间参数,改用基础搜索")
- search_tool = "basic_search_news"
+ print(f" ⚠️ search_topic_on_platform工具缺少平台参数,改用全局搜索")
+ search_tool = "search_topic_globally"
+
+ # 处理限制参数
+ if search_tool == "search_hot_content":
+ time_period = search_output.get("time_period", "week")
+ limit = search_output.get("limit", 10)
+ search_kwargs["time_period"] = time_period
+ search_kwargs["limit"] = limit
+ elif search_tool in ["search_topic_globally", "search_topic_by_date"]:
+ limit_per_table = search_output.get("limit_per_table", 5)
+ search_kwargs["limit_per_table"] = limit_per_table
+ elif search_tool in ["get_comments_for_topic", "search_topic_on_platform"]:
+ limit = search_output.get("limit", 20)
+ search_kwargs["limit"] = limit
search_response = self.execute_search_tool(search_tool, search_query, **search_kwargs)
@@ -270,12 +310,16 @@ class DeepSearchAgent:
max_results = min(len(search_response.results), 10)
for result in search_response.results[:max_results]:
search_results.append({
- 'title': result.title,
- 'url': result.url,
- 'content': result.content,
- 'score': result.score,
- 'raw_content': result.raw_content,
- 'published_date': result.published_date # 新增字段
+ 'title': result.title_or_content,
+ 'url': result.url or "",
+ 'content': result.title_or_content,
+ 'score': result.hotness_score,
+ 'raw_content': result.title_or_content,
+ 'published_date': result.publish_time.isoformat() if result.publish_time else None,
+ 'platform': result.platform,
+ 'content_type': result.content_type,
+ 'author': result.author_nickname,
+ 'engagement': result.engagement
})
if search_results:
@@ -324,7 +368,7 @@ class DeepSearchAgent:
# 生成反思搜索查询
reflection_output = self.reflection_node.run(reflection_input)
search_query = reflection_output["search_query"]
- search_tool = reflection_output.get("search_tool", "basic_search_news") # 默认工具
+ search_tool = reflection_output.get("search_tool", "search_topic_globally") # 默认工具
reasoning = reflection_output["reasoning"]
print(f" 反思查询: {search_query}")
@@ -332,9 +376,11 @@ class DeepSearchAgent:
print(f" 反思推理: {reasoning}")
# 执行反思搜索
- # 处理search_news_by_date的特殊参数
+ # 处理特殊参数
search_kwargs = {}
- if search_tool == "search_news_by_date":
+
+ # 处理需要日期的工具
+ if search_tool in ["search_topic_by_date", "search_topic_on_platform"]:
start_date = reflection_output.get("start_date")
end_date = reflection_output.get("end_date")
@@ -345,12 +391,35 @@ class DeepSearchAgent:
search_kwargs["end_date"] = end_date
print(f" 时间范围: {start_date} 到 {end_date}")
else:
- print(f" ⚠️ 日期格式错误(应为YYYY-MM-DD),改用基础搜索")
+ print(f" ⚠️ 日期格式错误(应为YYYY-MM-DD),改用全局搜索")
print(f" 提供的日期: start_date={start_date}, end_date={end_date}")
- search_tool = "basic_search_news"
+ search_tool = "search_topic_globally"
+ elif search_tool == "search_topic_by_date":
+ print(f" ⚠️ search_topic_by_date工具缺少时间参数,改用全局搜索")
+ search_tool = "search_topic_globally"
+
+ # 处理需要平台参数的工具
+ if search_tool == "search_topic_on_platform":
+ platform = reflection_output.get("platform")
+ if platform:
+ search_kwargs["platform"] = platform
+ print(f" 指定平台: {platform}")
else:
- print(f" ⚠️ search_news_by_date工具缺少时间参数,改用基础搜索")
- search_tool = "basic_search_news"
+ print(f" ⚠️ search_topic_on_platform工具缺少平台参数,改用全局搜索")
+ search_tool = "search_topic_globally"
+
+ # 处理限制参数
+ if search_tool == "search_hot_content":
+ time_period = reflection_output.get("time_period", "week")
+ limit = reflection_output.get("limit", 10)
+ search_kwargs["time_period"] = time_period
+ search_kwargs["limit"] = limit
+ elif search_tool in ["search_topic_globally", "search_topic_by_date"]:
+ limit_per_table = reflection_output.get("limit_per_table", 5)
+ search_kwargs["limit_per_table"] = limit_per_table
+ elif search_tool in ["get_comments_for_topic", "search_topic_on_platform"]:
+ limit = reflection_output.get("limit", 20)
+ search_kwargs["limit"] = limit
search_response = self.execute_search_tool(search_tool, search_query, **search_kwargs)
@@ -361,12 +430,16 @@ class DeepSearchAgent:
max_results = min(len(search_response.results), 10)
for result in search_response.results[:max_results]:
search_results.append({
- 'title': result.title,
- 'url': result.url,
- 'content': result.content,
- 'score': result.score,
- 'raw_content': result.raw_content,
- 'published_date': result.published_date
+ 'title': result.title_or_content,
+ 'url': result.url or "",
+ 'content': result.title_or_content,
+ 'score': result.hotness_score,
+ 'raw_content': result.title_or_content,
+ 'published_date': result.publish_time.isoformat() if result.publish_time else None,
+ 'platform': result.platform,
+ 'content_type': result.content_type,
+ 'author': result.author_nickname,
+ 'engagement': result.engagement
})
if search_results:
diff --git a/InsightEngine/prompts/prompts.py b/InsightEngine/prompts/prompts.py
index 1bdbb55..e806b48 100644
--- a/InsightEngine/prompts/prompts.py
+++ b/InsightEngine/prompts/prompts.py
@@ -35,8 +35,12 @@ output_schema_first_search = {
"search_query": {"type": "string"},
"search_tool": {"type": "string"},
"reasoning": {"type": "string"},
- "start_date": {"type": "string", "description": "开始日期,格式YYYY-MM-DD,仅search_news_by_date工具需要"},
- "end_date": {"type": "string", "description": "结束日期,格式YYYY-MM-DD,仅search_news_by_date工具需要"}
+ "start_date": {"type": "string", "description": "开始日期,格式YYYY-MM-DD,search_topic_by_date和search_topic_on_platform工具可能需要"},
+ "end_date": {"type": "string", "description": "结束日期,格式YYYY-MM-DD,search_topic_by_date和search_topic_on_platform工具可能需要"},
+ "platform": {"type": "string", "description": "平台名称,search_topic_on_platform工具必需,可选值:bilibili, weibo, douyin, kuaishou, xhs, zhihu, tieba"},
+ "time_period": {"type": "string", "description": "时间周期,search_hot_content工具可选,可选值:24h, week, year"},
+ "limit": {"type": "integer", "description": "结果数量限制,各工具可选参数"},
+ "limit_per_table": {"type": "integer", "description": "每表结果数量限制,search_topic_globally和search_topic_by_date工具可选"}
},
"required": ["search_query", "search_tool", "reasoning"]
}
@@ -80,8 +84,12 @@ output_schema_reflection = {
"search_query": {"type": "string"},
"search_tool": {"type": "string"},
"reasoning": {"type": "string"},
- "start_date": {"type": "string", "description": "开始日期,格式YYYY-MM-DD,仅search_news_by_date工具需要"},
- "end_date": {"type": "string", "description": "结束日期,格式YYYY-MM-DD,仅search_news_by_date工具需要"}
+ "start_date": {"type": "string", "description": "开始日期,格式YYYY-MM-DD,search_topic_by_date和search_topic_on_platform工具可能需要"},
+ "end_date": {"type": "string", "description": "结束日期,格式YYYY-MM-DD,search_topic_by_date和search_topic_on_platform工具可能需要"},
+ "platform": {"type": "string", "description": "平台名称,search_topic_on_platform工具必需,可选值:bilibili, weibo, douyin, kuaishou, xhs, zhihu, tieba"},
+ "time_period": {"type": "string", "description": "时间周期,search_hot_content工具可选,可选值:24h, week, year"},
+ "limit": {"type": "integer", "description": "结果数量限制,各工具可选参数"},
+ "limit_per_table": {"type": "integer", "description": "每表结果数量限制,search_topic_globally和search_topic_by_date工具可选"}
},
"required": ["search_query", "search_tool", "reasoning"]
}
@@ -141,47 +149,83 @@ SYSTEM_PROMPT_REPORT_STRUCTURE = f"""
# 每个段落第一次搜索的系统提示词
SYSTEM_PROMPT_FIRST_SEARCH = f"""
-你是一位深度研究助手。你将获得报告中的一个段落,其标题和预期内容将按照以下JSON模式定义提供:
+你是一位专业的舆情分析师。你将获得报告中的一个段落,其标题和预期内容将按照以下JSON模式定义提供:
{json.dumps(input_schema_first_search, indent=2, ensure_ascii=False)}
-你可以使用以下6种专业的新闻搜索工具:
+你可以使用以下5种专业的本地舆情数据库查询工具来挖掘真实的民意和公众观点:
-1. **basic_search_news** - 基础新闻搜索工具
- - 适用于:一般性的新闻搜索,不确定需要何种特定搜索时
- - 特点:快速、标准的通用搜索,是最常用的基础工具
+1. **search_hot_content** - 查找热点内容工具
+ - 适用于:挖掘当前最受关注的舆情事件和话题
+ - 特点:基于真实的点赞、评论、分享数据发现热门话题
+ - 参数:time_period ('24h', 'week', 'year'),limit(数量限制)
-2. **deep_search_news** - 深度新闻分析工具
- - 适用于:需要全面深入了解某个主题时
- - 特点:提供最详细的分析结果,包含高级AI摘要
+2. **search_topic_globally** - 全局话题搜索工具
+ - 适用于:全面了解公众对特定话题的讨论和观点
+ - 特点:覆盖B站、微博、抖音、快手、小红书、知乎、贴吧等主流平台的真实用户声音
+ - 参数:limit_per_table(每个表的结果数量限制)
-3. **search_news_last_24_hours** - 24小时最新新闻工具
- - 适用于:需要了解最新动态、突发事件时
- - 特点:只搜索过去24小时的新闻
-
-4. **search_news_last_week** - 本周新闻工具
- - 适用于:需要了解近期发展趋势时
- - 特点:搜索过去一周的新闻报道
-
-5. **search_images_for_news** - 图片搜索工具
- - 适用于:需要可视化信息、图片资料时
- - 特点:提供相关图片和图片描述
-
-6. **search_news_by_date** - 按日期范围搜索工具
- - 适用于:需要研究特定历史时期时
- - 特点:可以指定开始和结束日期进行搜索
+3. **search_topic_by_date** - 按日期搜索话题工具
+ - 适用于:追踪舆情事件的时间线发展和公众情绪变化
+ - 特点:精确的时间范围控制,适合分析舆情演变过程
- 特殊要求:需要提供start_date和end_date参数,格式为'YYYY-MM-DD'
- - 注意:只有这个工具需要额外的时间参数
+ - 参数:limit_per_table(每个表的结果数量限制)
+
+4. **get_comments_for_topic** - 获取话题评论工具
+ - 适用于:深度挖掘网民的真实态度、情感和观点
+ - 特点:直接获取用户评论,了解民意走向和情感倾向
+ - 参数:limit(评论总数量限制)
+
+5. **search_topic_on_platform** - 平台定向搜索工具
+ - 适用于:分析特定社交平台用户群体的观点特征
+ - 特点:针对不同平台用户群体的观点差异进行精准分析
+ - 特殊要求:需要提供platform参数,可选start_date和end_date
+ - 参数:platform(必须),start_date, end_date(可选),limit(数量限制)
+
+**你的核心使命:挖掘真实的民意和人情味**
你的任务是:
-1. 根据段落主题选择最合适的搜索工具
-2. 制定最佳的搜索查询
-3. 如果选择search_news_by_date工具,必须同时提供start_date和end_date参数(格式:YYYY-MM-DD)
-4. 解释你的选择理由
+1. **深度理解段落需求**:根据段落主题,思考需要了解哪些具体的公众观点和情感
+2. **精准选择查询工具**:选择最能获取真实民意数据的工具
+3. **设计接地气的搜索词**:**这是最关键的环节!**
+ - **避免官方术语**:不要用"舆情传播"、"公众反应"、"情绪倾向"等书面语
+ - **使用网民真实表达**:模拟普通网友会怎么谈论这个话题
+ - **贴近生活语言**:用简单、直接、口语化的词汇
+ - **包含情感词汇**:网民常用的褒贬词、情绪词
+ - **考虑话题热词**:相关的网络流行语、缩写、昵称
+4. **参数优化配置**:
+ - search_topic_by_date: 必须提供start_date和end_date参数(格式:YYYY-MM-DD)
+ - search_topic_on_platform: 必须提供platform参数(bilibili, weibo, douyin, kuaishou, xhs, zhihu, tieba之一)
+ - 其他工具:合理配置limit参数以获取足够的样本
+5. **阐述选择理由**:说明为什么这样的查询能够获得最真实的民意反馈
-注意:除了search_news_by_date工具外,其他工具都不需要额外参数。
+**搜索词设计核心原则**:
+- **想象网友怎么说**:如果你是个普通网友,你会怎么讨论这个话题?
+- **避免学术词汇**:杜绝"舆情"、"传播"、"倾向"等专业术语
+- **使用具体词汇**:用具体的事件、人名、地名、现象描述
+- **包含情感表达**:如"支持"、"反对"、"担心"、"愤怒"、"点赞"等
+- **考虑网络文化**:网民的表达习惯、缩写、俚语、表情符号文字描述
+
+**举例说明**:
+- ❌ 错误:"武汉大学舆情 公众反应"
+- ✅ 正确:"武大" 或 "武汉大学怎么了" 或 "武大学生"
+- ❌ 错误:"校园事件 学生反应"
+- ✅ 正确:"学校出事" 或 "同学们都在说" 或 "校友群炸了"
+
+**不同平台语言特色参考**:
+- **微博**:热搜词汇、话题标签,如 "武大又上热搜"、"心疼武大学子"
+- **知乎**:问答式表达,如 "如何看待武汉大学"、"武大是什么体验"
+- **B站**:弹幕文化,如 "武大yyds"、"武大人路过"、"我武最强"
+- **贴吧**:直接称呼,如 "武大吧"、"武大的兄弟们"
+- **抖音/快手**:短视频描述,如 "武大日常"、"武大vlog"
+- **小红书**:分享式,如 "武大真的很美"、"武大攻略"
+
+**情感表达词汇库**:
+- 正面:"太棒了"、"牛逼"、"绝了"、"爱了"、"yyds"、"666"
+- 负面:"无语"、"离谱"、"绝了"、"服了"、"麻了"、"破防"
+- 中性:"围观"、"吃瓜"、"路过"、"有一说一"、"实名"
请按照以下JSON模式定义格式化输出(文字请使用中文):