ai提取rss相关数据

This commit is contained in:
z66
2025-10-28 13:43:06 +08:00
parent e1db06dd79
commit c5f6e8288d
8 changed files with 53336 additions and 40 deletions
+52589
View File
File diff suppressed because it is too large Load Diff
+261
View File
@@ -68981,3 +68981,264 @@
→ failed_records_summary: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '“维护和平,共创未来”纪念研讨会在日本福冈ä' for key 'collector_rss_subscriptions.idx_title_pubtime'"}, {'index': 1, 'type': 'du... → failed_records_summary: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '“维护和平,共创未来”纪念研讨会在日本福冈ä' for key 'collector_rss_subscriptions.idx_title_pubtime'"}, {'index': 1, 'type': 'du...
→ detailed_failed_records: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '“维护和平,共创未来”纪念研讨会在日本福冈ä' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '“维护和平... → detailed_failed_records: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '“维护和平,共创未来”纪念研讨会在日本福冈ä' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '“维护和平...
2025-10-23 17:25:10.628 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '日媒:日本新任首相高市早苗定于24日发表施政æ¼' for key 'collector_rss_subscriptions.idx_title_pubtime'"}]
→ detailed_failed_records: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '日媒:日本新任首相高市早苗定于24日发表施政æ¼' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '日媒:...
2025-10-23 17:50:03.236 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 1, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '换罐15分钟、续航4000公里 2025绿色能源发展大会å' for key 'collector_rss_subscriptions.idx_title_pubtime'"}]
→ detailed_failed_records: [{'index': 1, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '换罐15分钟、续航4000公里 2025绿色能源发展大会å' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题'...
2025-10-23 17:55:03.685 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 1, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '国家金融监管总局肖远企:AI在金融领域作用仍æ\x98' for key 'collector_rss_subscriptions.idx_title_pubtime'"}]
→ detailed_failed_records: [{'index': 1, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '国家金融监管总局肖远企:AI在金融领域作用仍æ\x98' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '...
2025-10-23 18:15:05.663 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '受权发布丨中国共产党中央军事委员会副主席张å' for key 'collector_rss_subscriptions.idx_title_pubtime'"}, {'index': 1, 'type': 'du...
→ detailed_failed_records: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '受权发布丨中国共产党中央军事委员会副主席张å' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '受权发布丨...
2025-10-23 18:20:05.986 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '十四届全国人大常委会第十八次会议相关法律案ç' for key 'collector_rss_subscriptions.idx_title_pubtime'"}]
→ detailed_failed_records: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '十四届全国人大常委会第十八次会议相关法律案ç' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '十四届全国...
2025-10-23 18:35:07.263 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '“十四五”山西晋中能源发展全景图:煤更优、ç' for key 'collector_rss_subscriptions.idx_title_pubtime'"}]
→ detailed_failed_records: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '“十四五”山西晋中能源发展全景图:煤更优、ç' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '“十四五”...
2025-10-23 19:05:10.109 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 1, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '2025豫台经贸洽谈会签约超293亿元-2025-10-23 10:58:28' for key 'collector_rss_subscriptions.idx_title_pubtime'"}]
→ detailed_failed_records: [{'index': 1, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '2025豫台经贸洽谈会签约超293亿元-2025-10-23 10:58:28' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record'...
2025-10-23 19:10:10.664 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 2, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '上海首发“进博会溢出联动政策包”-2025-10-23 11:0' for key 'collector_rss_subscriptions.idx_title_pubtime'"}]
→ detailed_failed_records: [{'index': 2, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '上海首发“进博会溢出联动政策包”-2025-10-23 11:0' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标...
2025-10-23 19:20:01.616 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 7, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '突发:以军空袭黎巴嫩东部 称打击真主党导弹设' for key 'collector_rss_subscriptions.idx_title_pubtime'"}, {'index': 8, 'type': 'du...
→ detailed_failed_records: [{'index': 7, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '突发:以军空袭黎巴嫩东部 称打击真主党导弹设' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '突发:以军...
2025-10-23 19:35:03.144 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 2, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '东西问丨意大利汉学家里卡多·波佐:如何通过æ\x96' for key 'collector_rss_subscriptions.idx_title_pubtime'"}, {'index': 3, 'type':...
→ detailed_failed_records: [{'index': 2, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '东西问丨意大利汉学家里卡多·波佐:如何通过æ\x96' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '东...
2025-10-23 19:45:03.931 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 1, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '第24届汕头·澄海国际玩具礼品博览会举行-2025-10-' for key 'collector_rss_subscriptions.idx_title_pubtime'"}, {'index': 3, 'typ...
→ detailed_failed_records: [{'index': 1, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '第24届汕头·澄海国际玩具礼品博览会举行-2025-10-' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题':...
2025-10-23 19:50:04.321 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 1, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '第八届进博会展品锡兰红茶抵沪-2025-10-23 11:43:57' for key 'collector_rss_subscriptions.idx_title_pubtime'"}]
→ detailed_failed_records: [{'index': 1, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '第八届进博会展品锡兰红茶抵沪-2025-10-23 11:43:57' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文...
2025-10-23 20:00:05.324 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 1, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '国际大都市科技创新能力研究报告在沪发布-2025-1' for key 'collector_rss_subscriptions.idx_title_pubtime'"}, {'index': 4, 'type':...
→ detailed_failed_records: [{'index': 1, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '国际大都市科技创新能力研究报告在沪发布-2025-1' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '国...
2025-10-23 20:15:06.631 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 2, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '科技企业领军 中国企业加速“出海”中东地区-20' for key 'collector_rss_subscriptions.idx_title_pubtime'"}]
→ detailed_failed_records: [{'index': 2, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '科技企业领军 中国企业加速“出海”中东地区-20' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '科技企...
2025-10-23 20:25:07.542 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '中新天津生态城发布国际市场准入方案 助力国内' for key 'collector_rss_subscriptions.idx_title_pubtime'"}]
→ detailed_failed_records: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '中新天津生态城发布国际市场准入方案 助力国内' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '中新天津生...
2025-10-23 20:35:08.462 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '“两岸关系新形势”学术研讨会暨清华两岸论坛å' for key 'collector_rss_subscriptions.idx_title_pubtime'"}]
→ detailed_failed_records: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '“两岸关系新形势”学术研讨会暨清华两岸论坛å' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '“两岸关系...
2025-10-23 20:50:09.992 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '一图速览四中全会公报-2025-10-23 12:43:45' for key 'collector_rss_subscriptions.idx_title_pubtime'"}]
→ detailed_failed_records: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '一图速览四中全会公报-2025-10-23 12:43:45' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题'...
2025-10-23 21:05:01.330 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 3, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '扎哈罗娃:欧盟制裁拓展空间已近极限,俄保留å' for key 'collector_rss_subscriptions.idx_title_pubtime'"}]
→ detailed_failed_records: [{'index': 3, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '扎哈罗娃:欧盟制裁拓展空间已近极限,俄保留å' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '扎哈罗娃:...
2025-10-23 21:10:01.739 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '风险谁来担?比利时与卢森堡首相就对乌贷款方æ' for key 'collector_rss_subscriptions.idx_title_pubtime'"}]
→ detailed_failed_records: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '风险谁来担?比利时与卢森堡首相就对乌贷款方æ' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '风险谁来担...
2025-10-23 21:15:02.214 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '党的二十届四中全会公报,这些表述值得关注-202' for key 'collector_rss_subscriptions.idx_title_pubtime'"}, {'index': 2, 'type': '...
→ detailed_failed_records: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '党的二十届四中全会公报,这些表述值得关注-202' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '党的二...
2025-10-23 21:25:03.182 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '经纬线·绿色回响-2025-10-23 13:15:25' for key 'collector_rss_subscriptions.idx_title_pubtime'"}]
→ detailed_failed_records: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '经纬线·绿色回响-2025-10-23 13:15:25' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': ...
2025-10-23 21:30:03.623 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 3, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '(经济观察)北京风能展明星产品反映“大风车â' for key 'collector_rss_subscriptions.idx_title_pubtime'"}, {'index': 5, 'type': 'du...
→ detailed_failed_records: [{'index': 3, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '(经济观察)北京风能展明星产品反映“大风车â' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '(经济观察...
2025-10-23 21:45:04.962 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 1, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '日本外相最新涉华表态:致力于推动两国间战略ä' for key 'collector_rss_subscriptions.idx_title_pubtime'"}]
→ detailed_failed_records: [{'index': 1, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '日本外相最新涉华表态:致力于推动两国间战略ä' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '日本外相最...
2025-10-23 21:50:05.510 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 4, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '《2025中国数字文创城市指数》发布 京沪深蓉杭å±' for key 'collector_rss_subscriptions.idx_title_pubtime'"}, {'index': 5, 'type':...
→ detailed_failed_records: [{'index': 4, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '《2025中国数字文创城市指数》发布 京沪深蓉杭å±' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '《...
2025-10-23 21:55:05.949 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 3, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '中国矿产资源报告:铜、铁、磷等矿产资源量大å' for key 'collector_rss_subscriptions.idx_title_pubtime'"}]
→ detailed_failed_records: [{'index': 3, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '中国矿产资源报告:铜、铁、磷等矿产资源量大å' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '中国矿产资...
2025-10-23 22:00:06.446 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 3, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '新石器无人车获6亿美元融资 无人配送车需求有æ\x9c' for key 'collector_rss_subscriptions.idx_title_pubtime'"}]
→ detailed_failed_records: [{'index': 3, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '新石器无人车获6亿美元融资 无人配送车需求有æ\x9c' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '...
2025-10-23 22:10:07.479 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '夯实基础全面发力 四中全会为中国未来五年明调' for key 'collector_rss_subscriptions.idx_title_pubtime'"}]
→ detailed_failed_records: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '夯实基础全面发力 四中全会为中国未来五年明调' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '夯实基础全...
2025-10-23 22:25:09.055 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '中国又一个五年目标-2025-10-23 14:19:35' for key 'collector_rss_subscriptions.idx_title_pubtime'"}]
→ detailed_failed_records: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '中国又一个五年目标-2025-10-23 14:19:35' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题':...
2025-10-23 23:30:05.142 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '两岸学人对谈台湾光复:以史为鉴,共推民族复å' for key 'collector_rss_subscriptions.idx_title_pubtime'"}]
→ detailed_failed_records: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '两岸学人对谈台湾光复:以史为鉴,共推民族复å' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '两岸学人对...
2025-10-24 00:30:10.762 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '中宣部组织召开学习宣传贯彻党的二十届四中全ä' for key 'collector_rss_subscriptions.idx_title_pubtime'"}]
→ detailed_failed_records: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '中宣部组织召开学习宣传贯彻党的二十届四中全ä' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '中宣部组织...
2025-10-24 05:45:10.408 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '晚播小麦如何保产量?霜降抢秋该怎么抢?专家å' for key 'collector_rss_subscriptions.idx_title_pubtime'"}]
→ detailed_failed_records: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '晚播小麦如何保产量?霜降抢秋该怎么抢?专家å' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '晚播小麦如...
2025-10-24 05:55:01.583 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '欧盟领导人会议闭幕 聚焦乌克兰局势与欧洲防务' for key 'collector_rss_subscriptions.idx_title_pubtime'"}]
→ detailed_failed_records: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '欧盟领导人会议闭幕 聚焦乌克兰局势与欧洲防务' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '欧盟领导人...
2025-10-24 06:05:02.443 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '美国务卿与以总理会面 强调重视巩固加沙停火协' for key 'collector_rss_subscriptions.idx_title_pubtime'"}]
→ detailed_failed_records: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '美国务卿与以总理会面 强调重视巩固加沙停火协' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '美国务卿与...
2025-10-24 07:35:10.910 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '中方:当前的加沙停火应当成为全面持久停火的å' for key 'collector_rss_subscriptions.idx_title_pubtime'"}, {'index': 2, 'type': 'du...
→ detailed_failed_records: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '中方:当前的加沙停火应当成为全面持久停火的å' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '中方:当前...
2025-10-24 08:15:04.360 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '美国战略轰炸机抵近委内瑞拉-2025-10-24 00:07:02' for key 'collector_rss_subscriptions.idx_title_pubtime'"}]
→ detailed_failed_records: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '美国战略轰炸机抵近委内瑞拉-2025-10-24 00:07:02' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章...
2025-10-24 08:30:05.718 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '解放军报社论:坚定捍卫人民军队政治本色-2025-1' for key 'collector_rss_subscriptions.idx_title_pubtime'"}]
→ detailed_failed_records: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '解放军报社论:坚定捍卫人民军队政治本色-2025-1' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '解...
2025-10-24 09:15:09.624 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '柬埔寨重申打击网赌电诈等跨国犯罪决心-2025-10-2' for key 'collector_rss_subscriptions.idx_title_pubtime'"}, {'index': 2, 'type...
→ detailed_failed_records: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '柬埔寨重申打击网赌电诈等跨国犯罪决心-2025-10-2' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': ...
2025-10-24 09:35:01.368 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '首艘、首颗、首飞!中国制造硬核实力再出圈 连' for key 'collector_rss_subscriptions.idx_title_pubtime'"}, {'index': 1, 'type': 'du...
→ detailed_failed_records: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '首艘、首颗、首飞!中国制造硬核实力再出圈 连' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '首艘、首颗...
2025-10-24 09:45:02.278 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 1, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '杨振宁遗体告别仪式在京举行,八宝山革命公墓å' for key 'collector_rss_subscriptions.idx_title_pubtime'"}]
→ detailed_failed_records: [{'index': 1, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '杨振宁遗体告别仪式在京举行,八宝山革命公墓å' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '杨振宁遗体...
2025-10-24 09:55:03.350 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '长城 Hi4 技术体系斩获“科学技术奖特等奖” 喜' for key 'collector_rss_subscriptions.idx_title_pubtime'"}, {'index': 1, 'type':...
→ detailed_failed_records: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '长城 Hi4 技术体系斩获“科学技术奖特等奖” 喜' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '长...
2025-10-24 10:15:05.069 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 1, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '时政新闻眼丨未来五年怎么干?党的二十届四中å' for key 'collector_rss_subscriptions.idx_title_pubtime'"}]
→ detailed_failed_records: [{'index': 1, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '时政新闻眼丨未来五年怎么干?党的二十届四中å' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '时政新闻眼...
2025-10-24 10:25:06.264 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '新华图讯|中共中央举行新闻发布会 介绍和解读' for key 'collector_rss_subscriptions.idx_title_pubtime'"}, {'index': 1, 'type': 'du...
→ detailed_failed_records: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '新华图讯|中共中央举行新闻发布会 介绍和解读' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '新华图讯|...
2025-10-24 10:40:07.596 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '山城直播盛典启幕 “直播+”绘就重庆消费新图æ\x99' for key 'collector_rss_subscriptions.idx_title_pubtime'"}, {'index': 3, 'type'...
→ detailed_failed_records: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '山城直播盛典启幕 “直播+”绘就重庆消费新图æ\x99' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '...
2025-10-24 10:45:08.108 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '“2025生态保护与绿色发展论坛·广州”在暨南大å' for key 'collector_rss_subscriptions.idx_title_pubtime'"}, {'index': 1, 'type': ...
→ detailed_failed_records: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '“2025生态保护与绿色发展论坛·广州”在暨南大å' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '“2...
2025-10-24 10:50:08.775 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 3, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '一滴水 如何汇入幸福河湖?——农工党中央主办' for key 'collector_rss_subscriptions.idx_title_pubtime'"}]
→ detailed_failed_records: [{'index': 3, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '一滴水 如何汇入幸福河湖?——农工党中央主办' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '一滴水 如...
2025-10-27 10:49:23.458 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 6, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '前三季度企业创新力度加大 新质生产力加快培育' for key 'collector_rss_subscriptions.idx_title_pubtime'"}, {'index': 7, 'type': 'du...
→ detailed_failed_records: [{'index': 6, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '前三季度企业创新力度加大 新质生产力加快培育' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标题': '前三季度企...
2025-10-27 10:55:06.148 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '韩国3500亿美元对美投资承诺陷入僵局-2025-10-27 02:' for key 'collector_rss_subscriptions.idx_title_pubtime'"}, {'index': 2,...
→ detailed_failed_records: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '韩国3500亿美元对美投资承诺陷入僵局-2025-10-27 02:' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文...
2025-10-28 13:34:51.417 | ERROR | mysql_agent:305 - 表 collector_rss_subscriptions 插入失败记录详情
→ module: 'MySQLAgent(Windows)'
→ failed_records_summary: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '安倍晋三枪击案嫌疑人认罪-2025-10-28 05:23:53' for key 'collector_rss_subscriptions.idx_title_pubtime'"}, {'index': 2, '...
→ detailed_failed_records: [{'index': 0, 'type': 'duplicate', 'error_code': 1062, 'error_message': "Duplicate entry '安倍晋三枪击案嫌疑人认罪-2025-10-28 05:23:53' for key 'collector_rss_subscriptions.idx_title_pubtime'", 'record': {'文章标...
2025-10-28 13:39:14.477 | ERROR | mysql_agent:90 - 连接失败
→ module: 'MySQLAgent(Windows)'
→ error: '(1049, "Unknown database \'intelligence_system\'")'
→ error_type: 'OperationalError'
→ host: '123.60.167.249'
→ port: 3306
→ database: 'intelligence_system'
→ exc_info: True
2025-10-28 13:39:14.477 | ERROR | mysql_agent:139 - SQL查询失败
→ module: 'MySQLAgent(Windows)'
→ sql: '\n SELECT id, 文章标题, 文章摘要, 发布时间, 来源URL, 文章链接\n FROM collector_rss_subscriptions\n WHERE 是否已处理 = 0\n ORDER BY 发布时间 DESC\n LIMIT %s\n '
→ params: (5000,)
→ error: '(1049, "Unknown database \'intelligence_system\'")'
→ error_type: 'OperationalError'
→ exc_info: True
2025-10-28 13:39:14.477 | ERROR | processor_rss_data:111 - 加载RSS数据失败: (1049, "Unknown database 'intelligence_system'")
→ module: 'RSSDataProcessor'
→ exc_info: True
+21 -4
View File
@@ -11,11 +11,17 @@ log = CrossPlatformLog.get_logger("Main")
class IntelligenceSystem: class IntelligenceSystem:
def __init__(self, db_config=None): def __init__(self, db_config=None, run_all_on_startup=False):
"""初始化系统(仅作为容器,不包含业务逻辑)""" """初始化系统(仅作为容器,不包含业务逻辑)
Args:
db_config: 数据库配置
run_all_on_startup: 启动时是否立即执行所有到期任务(默认False)
"""
self.scheduler = TaskScheduler(Config.MYSQL_CONFIG, max_workers=5) self.scheduler = TaskScheduler(Config.MYSQL_CONFIG, max_workers=5)
self._running = False self._running = False
log.info("情报系统已初始化(Cron模式)") self.run_all_on_startup = run_all_on_startup
log.info(f"情报系统已初始化(Cron模式),启动时执行任务: {run_all_on_startup}")
def start(self): def start(self):
"""启动系统主入口""" """启动系统主入口"""
@@ -23,6 +29,15 @@ class IntelligenceSystem:
self._setup_signal_handlers() self._setup_signal_handlers()
log.info("系统启动 - 运行在Cron调度模式") log.info("系统启动 - 运行在Cron调度模式")
# 启动时执行所有到期任务(如果开关开启)
if self.run_all_on_startup:
print(f"\n{'='*60}")
print("🚀 启动时执行所有到期任务...")
print(f"{'='*60}\n")
log.info("启动时执行所有到期任务")
result = self.scheduler.check_and_run_tasks(print_empty_status=True)
print(f"\n启动任务执行完成: 总数={result['总任务数']}, 成功={result['成功']}, 失败={result['失败']}\n")
# 时间追踪变量 # 时间追踪变量
last_status_print_time = time.time() # 上次打印状态的时间 last_status_print_time = time.time() # 上次打印状态的时间
last_hourly_report_time = time.time() # 上次小时统计的时间 last_hourly_report_time = time.time() # 上次小时统计的时间
@@ -110,7 +125,9 @@ class IntelligenceSystem:
if __name__ == "__main__": if __name__ == "__main__":
try: try:
# 启动系统 - 仅作为入口,不包含调度逻辑 # 启动系统 - 仅作为入口,不包含调度逻辑
system = IntelligenceSystem() # run_all_on_startup=True: 启动时立即执行所有到期任务
# run_all_on_startup=False: 启动时不执行任务,等待下次调度周期
system = IntelligenceSystem(run_all_on_startup=False)
system.start() system.start()
except Exception as e: except Exception as e:
log.critical("情报系统启动失败", exc_info=True) log.critical("情报系统启动失败", exc_info=True)
Binary file not shown.
@@ -0,0 +1,453 @@
# RSS数据AI处理模块
import os
import sys
import json
import time
import pandas as pd
from typing import List, Dict, Any, Optional
from datetime import datetime
from openai import OpenAI
# 添加项目根目录到路径
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(os.path.dirname(current_dir))
if parent_dir not in sys.path:
sys.path.insert(0, parent_dir)
from utils.mysql_agent import MySQLAgent
from utils.logger import log
from config import Config
class RSSDataAIProcessor:
"""RSS数据AI处理主类
负责:
- 从数据库加载未处理的RSS数据
- 调用AI进行分析
- 保存分析结果
- 更新处理状态
"""
def __init__(self):
    """Initialise the AI processor.

    Binds a module-scoped logger, creates the MySQL agent from
    ``Config.MYSQL_CONFIG``, reads the table names, batch size and delay
    from ``Config.AI_PROCESSOR_CONFIG`` and, when an API key is present in
    ``Config.BAIDU_AI_CONFIG``, builds an OpenAI-compatible client pointed
    at the Baidu Qianfan v2 endpoint.

    When no API key is configured, ``self.ai_client`` is set to ``None``
    and two warnings are logged; ``is_configured()`` reports that state.
    """
    self.log = log.bind(module="RSSDataAIProcessor")
    self.db_agent = MySQLAgent(Config.MYSQL_CONFIG)

    # Processing settings are read from Config.
    processor_cfg = Config.AI_PROCESSOR_CONFIG
    self.source_table = processor_cfg['source_table']
    self.ai_table = processor_cfg['result_table']
    self.default_batch_size = processor_cfg['batch_size']
    self.default_delay = processor_cfg['delay']

    # Baidu Qianfan credentials / model selection.
    self.api_key = Config.BAIDU_AI_CONFIG.get('api_key')
    # Set the model name unconditionally so the attribute always exists,
    # even when the client itself cannot be created (previously it was only
    # assigned in the configured branch, leaving unconfigured instances
    # without a ``model`` attribute).
    self.model = Config.BAIDU_AI_CONFIG.get('model', 'ernie-x1-turbo-32k')

    if self.api_key:
        # Qianfan exposes an OpenAI-compatible API, so the OpenAI client is
        # reused with a custom base URL.
        self.ai_client = OpenAI(
            base_url='https://qianfan.baidubce.com/v2',
            api_key=self.api_key
        )
        self.log.info("RSS数据AI处理器初始化完成")
    else:
        self.ai_client = None
        self.log.warning("百度AI未配置,AI处理功能将不可用")
        self.log.warning("请在config.py中配置 BAIDU_AI_CONFIG['api_key']")
def is_configured(self) -> bool:
"""检查是否已配置API"""
return self.ai_client is not None
def main(self, batch_size: Optional[int] = 200, delay: Optional[float] = None) -> Dict[str, Any]:
"""主程序:批量处理RSS数据的完整流程
Args:
batch_size: 批量处理的记录数,None则使用配置的默认值
delay: 每条记录之间的延迟(秒),None则使用配置的默认值
Returns:
dict: 处理结果统计信息
"""
# 使用传入参数或默认配置
batch_size = batch_size or self.default_batch_size
delay = delay or self.default_delay
try:
# 1. 检查配置
if not self.is_configured():
error_msg = "百度AI未配置,请在config.py中配置 BAIDU_AI_CONFIG['api_key']"
self.log.error(error_msg)
return {
'success': False,
'message': error_msg,
'processed_count': 0,
'failed_count': 0
}
self.log.info(f"开始批量处理数据,批次大小: {batch_size}, 延迟: {delay}")
# 2. 准备数据库表结构
self.ensure_ai_processed_column()
if not self.db_agent.table_exists(self.ai_table):
self.create_ai_result_table()
# 3. 加载未处理的数据
df = self.load_unprocessed_data(batch_size)
if df.empty:
self.log.info("没有需要处理的数据")
return {
'success': True,
'message': '没有需要处理的数据',
'processed_count': 0,
'failed_count': 0
}
# 4. 处理每条记录
results = []
processed_ids = []
failed_count = 0
for idx, record in df.iterrows():
try:
self.log.debug(f"处理记录 {record['id']} ({idx + 1}/{len(df)})")
result = self.process_single_record(record.to_dict())
if result:
results.append(result)
processed_ids.append(record['id'])
else:
failed_count += 1
# 延迟,避免API限流
if delay > 0 and idx < len(df) - 1:
time.sleep(delay)
except Exception as e:
self.log.error(f"处理记录 {record['id']} 异常: {str(e)}", exc_info=True)
failed_count += 1
# 5. 保存结果
saved_count = 0
if results:
saved_count = self.save_ai_results(results)
# 6. 标记为已处理
if processed_ids:
self.mark_as_processed(processed_ids)
# 7. 返回统计信息
stats = {
'success': True,
'message': 'AI处理完成',
'total_count': len(df),
'processed_count': len(processed_ids),
'saved_count': saved_count,
'failed_count': failed_count,
'relevant_count': sum(1 for r in results if r.get('是否相关')),
'processing_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
}
self.log.info("批量处理完成", **stats)
return stats
except Exception as e:
error_msg = f"批量处理失败: {str(e)}"
self.log.error(error_msg, exc_info=True)
return {
'success': False,
'message': error_msg,
'processed_count': 0,
'failed_count': 0
}
def ensure_ai_processed_column(self):
"""确保processed_rss_data表有"是否ai处理"字段"""
try:
# 检查字段是否存在
check_sql = """
SELECT COUNT(*) as count
FROM information_schema.COLUMNS
WHERE TABLE_SCHEMA = %s
AND TABLE_NAME = %s
AND COLUMN_NAME = '是否ai处理'
"""
result = self.db_agent.execute_sql(
check_sql,
params=(Config.MYSQL_CONFIG['database'], self.source_table),
fetch=True
)
if result[0][0] == 0:
# 字段不存在,添加字段
alter_sql = f"""
ALTER TABLE {self.source_table}
ADD COLUMN 是否ai处理 TINYINT(1) DEFAULT 0 COMMENT 'AI处理标记:0-未处理,1-已处理'
"""
self.db_agent.execute_sql(alter_sql)
self.log.info(f"成功为表 {self.source_table} 添加 '是否ai处理' 字段")
else:
self.log.debug(f"{self.source_table} 已存在 '是否ai处理' 字段")
except Exception as e:
self.log.error(f"检查/添加字段失败: {str(e)}", exc_info=True)
raise
def create_ai_result_table(self):
"""创建AI处理结果表"""
create_sql = f"""
CREATE TABLE IF NOT EXISTS {self.ai_table} (
id INT AUTO_INCREMENT PRIMARY KEY COMMENT '主键ID',
source_id INT NOT NULL COMMENT '来源数据IDprocessed_rss_data.id',
文章标题 TEXT COMMENT '文章标题',
文章摘要 TEXT COMMENT '文章摘要',
发布时间 DATETIME COMMENT '发布时间',
来源URL VARCHAR(1024) COMMENT '来源URL',
文章链接 VARCHAR(1024) COMMENT '文章链接',
是否相关 BOOLEAN COMMENT 'AI判断是否与汽车后市场相关',
相关度评分 INT COMMENT '相关度评分(0-100',
标签 TEXT COMMENT 'AI生成的标签(JSON数组)',
分类 VARCHAR(100) COMMENT 'AI判断的主要分类',
分析说明 TEXT COMMENT 'AI分析说明',
处理时间 DATETIME COMMENT 'AI处理时间',
创建时间 TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '记录创建时间',
更新时间 TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '记录更新时间',
INDEX idx_source_id (source_id),
INDEX idx_是否相关 (是否相关),
INDEX idx_分类 (分类),
INDEX idx_处理时间 (处理时间)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='RSS数据AI分析结果表'
"""
try:
self.db_agent.execute_sql(create_sql)
self.log.info(f"成功创建AI结果表: {self.ai_table}")
except Exception as e:
self.log.error(f"创建AI结果表失败: {str(e)}", exc_info=True)
raise
def load_unprocessed_data(self, limit: int = 100) -> pd.DataFrame:
"""加载未经AI处理的数据
Args:
limit: 每次处理的记录数量
Returns:
未处理的数据DataFrame
"""
try:
sql = f"""
SELECT id, 文章标题, 文章摘要, 发布时间, 来源URL, 文章链接
FROM {self.source_table}
WHERE 是否ai处理 = 0 OR 是否ai处理 IS NULL
ORDER BY 创建时间 DESC
LIMIT %s
"""
df = self.db_agent.query_to_df(sql, params=(limit,), is_print=False)
self.log.info(f"成功加载 {len(df)} 条未处理的数据")
return df
except Exception as e:
self.log.error(f"加载未处理数据失败: {str(e)}", exc_info=True)
return pd.DataFrame()
def analyze_news(self, title: str, summary: str) -> Dict[str, Any]:
"""调用AI分析新闻(保留原有提示词)"""
# 构建提示词(保留原有格式)
prompt = f"""分析以下新闻是否与汽车后市场相关,返回JSON格式:
标题:{title}
摘要:{summary}
返回格式:
{{
"is_relevant": true/false,
"relevance_score": 0-100,
"tags": ["标签1", "标签2"],
"category": "分类(配件/维修/保养/改装/美容/装饰/二手车/金融/保险/其他)",
"analysis": "简要说明"
}}
注意:只返回JSON格式的结果,不要包含其他说明文字。"""
try:
# 调用百度千帆API
response = self.ai_client.chat.completions.create(
model=self.model,
messages=[{
"role": "user",
"content": prompt
}]
)
# 获取响应内容
raw_content = response.choices[0].message.content
# 解析JSON(处理markdown包裹)
if '```json' in raw_content:
json_str = raw_content.split('```json')[1].split('```')[0].strip()
elif '```' in raw_content:
json_str = raw_content.split('```')[1].split('```')[0].strip()
else:
json_str = raw_content.strip()
result = json.loads(json_str)
# 补充缺失字段
return {
'is_relevant': result.get('is_relevant', False),
'relevance_score': result.get('relevance_score', 0),
'tags': result.get('tags', []),
'category': result.get('category', '其他'),
'analysis': result.get('analysis', '')
}
except json.JSONDecodeError as e:
self.log.warning(f"JSON解析失败: {str(e)}, 原始响应: {raw_content[:200]}")
return {
'is_relevant': False,
'relevance_score': 0,
'tags': [],
'category': '其他',
'analysis': f"解析失败: {raw_content[:100]}"
}
except Exception as e:
self.log.error(f"AI调用异常: {str(e)}", exc_info=True)
return {
'is_relevant': False,
'relevance_score': 0,
'tags': [],
'category': '其他',
'analysis': f"处理异常: {str(e)}"
}
def process_single_record(self, record: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""处理单条记录
Args:
record: 记录字典
Returns:
处理结果字典
"""
if not self.is_configured():
self.log.error("AI客户端未配置,无法处理数据")
return None
try:
title = str(record.get('文章标题', '')).strip()
summary = str(record.get('文章摘要', '')).strip()
if not title and not summary:
self.log.warning(f"记录 {record.get('id')} 标题和摘要均为空,跳过处理")
return None
# 调用AI分析
analysis_result = self.analyze_news(title, summary)
# 构建结果记录
result = {
'source_id': record['id'],
'文章标题': title,
'文章摘要': summary,
'发布时间': record.get('发布时间'),
'来源URL': record.get('来源URL'),
'文章链接': record.get('文章链接'),
'是否相关': analysis_result.get('is_relevant', False),
'相关度评分': analysis_result.get('relevance_score', 0),
'标签': json.dumps(analysis_result.get('tags', []), ensure_ascii=False),
'分类': analysis_result.get('category', '其他'),
'分析说明': analysis_result.get('analysis', ''),
'处理时间': datetime.now()
}
return result
except Exception as e:
self.log.error(f"处理记录 {record.get('id')} 失败: {str(e)}", exc_info=True)
return None
def save_ai_results(self, results: List[Dict[str, Any]]) -> int:
"""保存AI处理结果
Args:
results: 处理结果列表
Returns:
成功保存的记录数
"""
if not results:
return 0
try:
df = pd.DataFrame(results)
inserted = self.db_agent.insert_from_df(
table_name=self.ai_table,
df=df,
ignore_duplicates=True
)
self.log.info(f"成功保存 {inserted} 条AI处理结果")
return inserted
except Exception as e:
self.log.error(f"保存AI处理结果失败: {str(e)}", exc_info=True)
return 0
def mark_as_processed(self, ids: List[int]) -> bool:
"""标记记录为已处理
Args:
ids: 记录ID列表
Returns:
是否成功
"""
if not ids:
return True
try:
id_placeholders = ','.join(['%s'] * len(ids))
sql = f"""
UPDATE {self.source_table}
SET 是否ai处理 = 1
WHERE id IN ({id_placeholders})
"""
self.db_agent.execute_sql(sql, params=ids)
self.log.info(f"成功标记 {len(ids)} 条记录为已处理")
return True
except Exception as e:
self.log.error(f"标记记录为已处理失败: {str(e)}", exc_info=True)
return False
if __name__ == "__main__":
    # Command-line entry point: run one batch with the default arguments and
    # print a human-readable summary of the returned statistics.
    processor = RSSDataAIProcessor()
    result = processor.main()

    divider = "=" * 60
    if result['success']:
        print("\n" + divider)
        print("✓ AI处理完成")
        print(divider)
        print(f"总记录数: {result.get('total_count', 0)}")
        print(f"成功处理: {result.get('processed_count', 0)}")
        print(f"保存记录: {result.get('saved_count', 0)}")
        print(f"失败记录: {result.get('failed_count', 0)}")
        print(f"相关记录: {result.get('relevant_count', 0)}")
        print(f"处理时间: {result.get('processing_time', '')}")
        print(divider + "\n")
    else:
        print("\n" + divider)
        print("✗ 处理失败")
        print(divider)
        print(f"错误信息: {result['message']}")
        # The API-key hint only applies when the client is unconfigured;
        # showing it for database or other failures would be misleading.
        if not processor.is_configured():
            print("\n提示: 请设置环境变量")
            print(" Windows: $env:BAIDU_API_KEY = 'your_key'")
            print(" Linux/Mac: export BAIDU_API_KEY='your_key'")
        print(divider + "\n")
View File
View File
+12 -36
View File
@@ -791,7 +791,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 3,
"id": "94892f4134316f8e", "id": "94892f4134316f8e",
"metadata": { "metadata": {
"ExecuteTime": { "ExecuteTime": {
@@ -828,35 +828,12 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"\u001b[32m2025-10-23 16:57:20\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mmysql_agent\u001b[0m - \u001b[1m查询执行成功\u001b[0m\n", "\u001b[32m2025-10-23 16:59:03\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mmysql_agent\u001b[0m - \u001b[1m查询执行成功\u001b[0m\n",
"\u001b[32m2025-10-23 16:57:20\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1mRSS数据处理器初始化完成\u001b[0m\n", "\u001b[32m2025-10-23 16:59:03\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1mRSS数据处理器初始化完成\u001b[0m\n",
"\u001b[32m2025-10-23 16:57:20\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1m开始处理RSS数据...\u001b[0m\n", "\u001b[32m2025-10-23 16:59:03\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1m开始处理RSS数据...\u001b[0m\n",
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1m成功加载 8 条未处理的RSS数据\u001b[0m\n", "\u001b[32m2025-10-23 16:59:03\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1m成功加载 0 条未处理的RSS数据\u001b[0m\n",
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[33m\u001b[1m停用词文件不存在: processors/stopwords.txt,使用默认停用词\u001b[0m\n", "\u001b[32m2025-10-23 16:59:03\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[33m\u001b[1m没有加载到RSS数据\u001b[0m\n",
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[33m\u001b[1m关键词文件不存在: processors/keywords.txt\u001b[0m\n" "\u001b[32m2025-10-23 16:59:03\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtask_scheduler\u001b[0m - \u001b[1m任务执行完成,耗时: 0.01秒\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Building prefix dict from the default dictionary ...\n",
"Loading model from cache C:\\Users\\zy187\\AppData\\Local\\Temp\\jieba.cache\n",
"Loading model cost 0.609 seconds.\n",
"Prefix dict has been built successfully.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1m数据处理完成,共处理 8 条记录\u001b[0m\n",
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1m过滤出 1 条汽车后市场相关新闻\u001b[0m\n",
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mmysql_agent\u001b[0m - \u001b[1m表 processed_rss_data 插入结果汇总\u001b[0m\n",
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1m成功保存 1 条处理结果到数据库\u001b[0m\n",
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1m成功标记 8 条数据为已处理\u001b[0m\n",
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1mRSS数据处理完成\u001b[0m\n",
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtask_scheduler\u001b[0m - \u001b[1m任务执行完成,耗时: 1.19秒\u001b[0m\n"
] ]
}, },
{ {
@@ -886,7 +863,7 @@
{ {
"data": { "data": {
"text/markdown": [ "text/markdown": [
"**执行时长**: 1.26 秒" "**执行时长**: 0.02 秒"
], ],
"text/plain": [ "text/plain": [
"<IPython.core.display.Markdown object>" "<IPython.core.display.Markdown object>"
@@ -923,8 +900,7 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"RSS数据处理完成!\n", "处理失败: 没有数据处理\n",
"处理统计: {'total_articles': 8, 'filtered_articles': 1, 'filter_rate': 0.125, 'processing_time': '2025-10-23 16:57:21', 'save_success': True, 'mark_success': True}\n",
"\n" "\n"
] ]
}, },
@@ -958,12 +934,12 @@
"{'success': True,\n", "{'success': True,\n",
" 'task_name': 'RSS基于规则数据处理',\n", " 'task_name': 'RSS基于规则数据处理',\n",
" 'task_id': 2,\n", " 'task_id': 2,\n",
" 'execution_time': 1.2610254287719727,\n", " 'execution_time': 0.023162126541137695,\n",
" 'output': \"RSS数据处理完成!\\n处理统计: {'total_articles': 8, 'filtered_articles': 1, 'filter_rate': 0.125, 'processing_time': '2025-10-23 16:57:21', 'save_success': True, 'mark_success': True}\\n\",\n", " 'output': '处理失败: 没有数据处理\\n',\n",
" 'error': None}" " 'error': None}"
] ]
}, },
"execution_count": 2, "execution_count": 3,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }