185 lines
6.3 KiB
Python
185 lines
6.3 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
情报收集系统主程序
|
||
功能:
|
||
1. 调度数据采集、处理、存储流程
|
||
2. 生成日报/月报
|
||
3. 异常监控和报警
|
||
"""
|
||
|
||
import sys
|
||
import logging
|
||
from datetime import datetime, timedelta
|
||
from typing import List, Dict, Any
|
||
|
||
# 自定义模块
|
||
from config.settings import API_KEYS, DATA_SOURCES
|
||
from collectors.news_api import NewsAPICollector
|
||
from collectors.complaint_spider import ComplaintSpider
|
||
from processors.data_processor import DataProcessor
|
||
from storage.database import IntelligenceDB
|
||
from applications.reporter import ReportGenerator
|
||
from applications.alert import AlertService
|
||
from utils.logger import setup_logging
|
||
from utils.mail import send_email
|
||
|
||
class IntelligenceSystem:
|
||
def __init__(self):
|
||
# 初始化核心组件
|
||
setup_logging()
|
||
self.logger = logging.getLogger(__name__)
|
||
|
||
self.db = IntelligenceDB()
|
||
self.processor = DataProcessor()
|
||
self.alert = AlertService()
|
||
|
||
# 数据采集器注册
|
||
self.collectors = {
|
||
"news": NewsAPICollector(API_KEYS['newsapi']),
|
||
"complaint": ComplaintSpider(
|
||
base_url=DATA_SOURCES['blackcat'],
|
||
rate_limit=30 # 30秒爬取间隔
|
||
)
|
||
}
|
||
|
||
def run_daily_pipeline(self):
|
||
"""每日数据采集处理流程"""
|
||
try:
|
||
# 阶段1:数据采集
|
||
raw_data = self._collect_data()
|
||
|
||
# 阶段2:数据处理
|
||
processed_data = self._process_data(raw_data)
|
||
|
||
# 阶段3:数据存储
|
||
self._store_data(processed_data)
|
||
|
||
# 阶段4:生成日报
|
||
self._generate_reports()
|
||
|
||
# 阶段5:异常检测
|
||
self._check_alerts()
|
||
|
||
except Exception as e:
|
||
self.logger.error(f"主流程执行失败: {str(e)}", exc_info=True)
|
||
self.alert.send_critical(f"系统异常: {str(e)}")
|
||
|
||
def _collect_data(self) -> Dict[str, List[Dict]]:
|
||
"""执行所有数据采集任务"""
|
||
collected = {}
|
||
for name, collector in self.collectors.items():
|
||
try:
|
||
self.logger.info(f"开始采集 {name} 数据...")
|
||
data = collector.fetch_data({
|
||
'keywords': '汽车后市场',
|
||
'max_results': 100
|
||
})
|
||
collected[name] = data
|
||
self.logger.info(f"{name} 采集完成,共 {len(data)} 条数据")
|
||
except Exception as e:
|
||
self.logger.error(f"{name} 采集器异常: {str(e)}")
|
||
continue
|
||
return collected
|
||
|
||
def _process_data(self, raw_data: Dict) -> Dict:
|
||
"""处理原始数据"""
|
||
processed = {}
|
||
for data_type, items in raw_data.items():
|
||
processed[data_type] = []
|
||
for item in items:
|
||
try:
|
||
# 文本数据标准处理
|
||
if data_type in ['news', 'complaint']:
|
||
result = self.processor.process_text(item['content'])
|
||
processed_item = {
|
||
**item,
|
||
'keywords': result['keywords'],
|
||
'category': result['category']
|
||
}
|
||
processed[data_type].append(processed_item)
|
||
|
||
# 图像处理(预留接口)
|
||
elif data_type == 'images':
|
||
processed[data_type].append(
|
||
self.processor.image_to_text(item)
|
||
)
|
||
except Exception as e:
|
||
self.logger.warning(f"数据处理失败: {item.get('id', '')} - {str(e)}")
|
||
continue
|
||
return processed
|
||
|
||
def _store_data(self, processed_data: Dict):
|
||
"""存储到数据库"""
|
||
for data_type, items in processed_data.items():
|
||
success_count = 0
|
||
for item in items:
|
||
try:
|
||
if self.db.insert_data(data_type, item):
|
||
success_count += 1
|
||
except Exception as e:
|
||
self.logger.error(f"数据存储失败: {str(e)}")
|
||
|
||
self.logger.info(
|
||
f"{data_type} 数据存储完成,成功 {success_count}/{len(items)} 条"
|
||
)
|
||
|
||
def _generate_reports(self):
|
||
"""生成报告并发送"""
|
||
try:
|
||
# 日报生成
|
||
report_html = ReportGenerator(self.db).generate_daily()
|
||
with open(f"reports/daily_{datetime.now().date()}.html", 'w') as f:
|
||
f.write(report_html)
|
||
|
||
# 每月1号生成月报
|
||
if datetime.now().day == 1:
|
||
monthly_report = ReportGenerator(self.db).generate_monthly()
|
||
send_email(
|
||
to="team@example.com",
|
||
subject=f"{datetime.now().strftime('%Y-%m')} 情报月报",
|
||
content=monthly_report
|
||
)
|
||
|
||
except Exception as e:
|
||
self.logger.error(f"报告生成失败: {str(e)}")
|
||
|
||
def _check_alerts(self):
|
||
"""检查预警信息"""
|
||
# 负面舆情监测
|
||
negative_keywords = ['投诉', '造假', '违规']
|
||
alerts = self.alert.check_negative(negative_keywords)
|
||
|
||
if alerts:
|
||
self.alert.send_urgent(
|
||
"负面舆情警报",
|
||
"\n".join([f"[{a['source']}] {a['content']}" for a in alerts])
|
||
)
|
||
|
||
def cleanup(self):
|
||
"""资源清理"""
|
||
self.db.close()
|
||
self.logger.info("系统资源已释放")
|
||
|
||
if __name__ == "__main__":
|
||
system = IntelligenceSystem()
|
||
|
||
try:
|
||
# 执行每日任务
|
||
if len(sys.argv) > 1 and sys.argv[1] == "--manual":
|
||
system.logger.info("手动执行模式启动")
|
||
system.run_daily_pipeline()
|
||
else:
|
||
# 定时任务模式(实际部署时改用crontab或APScheduler)
|
||
system.logger.info("定时任务模式启动")
|
||
while True:
|
||
now = datetime.now()
|
||
if now.hour == 9 and now.minute == 0: # 每天9点执行
|
||
system.run_daily_pipeline()
|
||
time.sleep(60) # 避免重复执行
|
||
time.sleep(30)
|
||
|
||
except KeyboardInterrupt:
|
||
system.logger.info("用户中断执行")
|
||
finally:
|
||
system.cleanup() |