本地化&2.0
This commit is contained in:
@@ -251,16 +251,30 @@ postgresql_db_config = {{
|
||||
|
||||
logger.info(f"执行命令: {' '.join(cmd)}")
|
||||
|
||||
# 切换到MediaCrawler目录并执行
|
||||
# 切换到MediaCrawler目录并执行,捕获输出
|
||||
result = subprocess.run(
|
||||
cmd,
|
||||
cwd=self.mediacrawler_path,
|
||||
timeout=3600 # 60分钟超时
|
||||
timeout=3600, # 60分钟超时
|
||||
capture_output=True,
|
||||
text=True,
|
||||
encoding='utf-8',
|
||||
errors='replace'
|
||||
)
|
||||
|
||||
end_time = datetime.now()
|
||||
duration = (end_time - start_time).total_seconds()
|
||||
|
||||
# 解析输出,提取实际保存的数据量
|
||||
output_lines = result.stdout.split('\n') if result.stdout else []
|
||||
error_lines = result.stderr.split('\n') if result.stderr else []
|
||||
|
||||
# 合并所有输出行用于解析
|
||||
all_lines = output_lines + error_lines
|
||||
|
||||
# 解析统计信息
|
||||
parsed_stats = self._parse_crawl_output(all_lines, error_lines)
|
||||
|
||||
# 创建统计信息
|
||||
crawl_stats = {
|
||||
"platform": platform,
|
||||
@@ -270,9 +284,11 @@ postgresql_db_config = {{
|
||||
"end_time": end_time.isoformat(),
|
||||
"return_code": result.returncode,
|
||||
"success": result.returncode == 0,
|
||||
"notes_count": 0,
|
||||
"comments_count": 0,
|
||||
"errors_count": 0
|
||||
"notes_count": parsed_stats.get("notes_count", 0),
|
||||
"comments_count": parsed_stats.get("comments_count", 0),
|
||||
"errors_count": parsed_stats.get("errors_count", 0),
|
||||
"output_preview": '\n'.join(output_lines[-20:]) if output_lines else "", # 最后20行输出
|
||||
"error_preview": '\n'.join(error_lines[-20:]) if error_lines else "" # 最后20行错误
|
||||
}
|
||||
|
||||
# 保存统计信息
|
||||
@@ -280,8 +296,16 @@ postgresql_db_config = {{
|
||||
|
||||
if result.returncode == 0:
|
||||
logger.info(f"✅ {platform} 爬取完成,耗时: {duration:.1f}秒")
|
||||
logger.info(f" 保存内容: {crawl_stats['notes_count']} 条,评论: {crawl_stats['comments_count']} 条")
|
||||
if crawl_stats['notes_count'] == 0 and crawl_stats['comments_count'] == 0:
|
||||
logger.warning(f"⚠️ {platform} 爬取成功但未保存任何数据,请检查数据库连接和保存逻辑")
|
||||
# 输出部分日志用于调试
|
||||
if crawl_stats['error_preview']:
|
||||
logger.warning(f" 错误信息: {crawl_stats['error_preview'][:500]}")
|
||||
else:
|
||||
logger.error(f"❌ {platform} 爬取失败,返回码: {result.returncode}")
|
||||
if error_lines:
|
||||
logger.error(f" 错误信息: {crawl_stats['error_preview'][:500]}")
|
||||
|
||||
return crawl_stats
|
||||
|
||||
@@ -294,6 +318,7 @@ postgresql_db_config = {{
|
||||
|
||||
def _parse_crawl_output(self, output_lines: List[str], error_lines: List[str]) -> Dict:
|
||||
"""解析爬取输出,提取统计信息"""
|
||||
import re
|
||||
stats = {
|
||||
"notes_count": 0,
|
||||
"comments_count": 0,
|
||||
@@ -301,32 +326,60 @@ postgresql_db_config = {{
|
||||
"login_required": False
|
||||
}
|
||||
|
||||
# 解析输出行
|
||||
for line in output_lines:
|
||||
if "条笔记" in line or "条内容" in line:
|
||||
# 合并所有行用于解析
|
||||
all_lines = output_lines + error_lines
|
||||
|
||||
# 解析输出行,查找各种可能的数据保存信息
|
||||
for line in all_lines:
|
||||
line_lower = line.lower()
|
||||
|
||||
# 查找保存的内容数量(多种可能的格式)
|
||||
# 例如:"保存了 10 条笔记"、"成功保存 5 条内容"、"inserted 3 records"等
|
||||
if any(keyword in line_lower for keyword in ["条笔记", "条内容", "条视频", "条帖子", "条回答"]):
|
||||
try:
|
||||
# 提取数字
|
||||
import re
|
||||
# 提取数字,优先取第一个数字
|
||||
numbers = re.findall(r'\d+', line)
|
||||
if numbers:
|
||||
stats["notes_count"] = int(numbers[0])
|
||||
# 如果找到多个数字,取最大的(通常是总数)
|
||||
potential_count = max([int(n) for n in numbers])
|
||||
if potential_count > stats["notes_count"]:
|
||||
stats["notes_count"] = potential_count
|
||||
except:
|
||||
pass
|
||||
elif "条评论" in line:
|
||||
|
||||
# 查找保存的评论数量
|
||||
if "条评论" in line_lower:
|
||||
try:
|
||||
import re
|
||||
numbers = re.findall(r'\d+', line)
|
||||
if numbers:
|
||||
stats["comments_count"] = int(numbers[0])
|
||||
potential_count = max([int(n) for n in numbers])
|
||||
if potential_count > stats["comments_count"]:
|
||||
stats["comments_count"] = potential_count
|
||||
except:
|
||||
pass
|
||||
elif "登录" in line or "扫码" in line:
|
||||
|
||||
# 查找数据库相关错误
|
||||
if any(keyword in line_lower for keyword in ["数据库", "database", "connection", "连接失败", "保存失败"]):
|
||||
if "error" in line_lower or "失败" in line_lower or "异常" in line_lower:
|
||||
stats["errors_count"] += 1
|
||||
|
||||
# 查找登录相关
|
||||
if any(keyword in line_lower for keyword in ["登录", "扫码", "login", "需要登录"]):
|
||||
stats["login_required"] = True
|
||||
|
||||
# 解析错误行
|
||||
for line in error_lines:
|
||||
if "error" in line.lower() or "异常" in line:
|
||||
stats["errors_count"] += 1
|
||||
# 如果没有找到明确的保存数量,尝试从数据库操作日志中提取
|
||||
if stats["notes_count"] == 0 and stats["comments_count"] == 0:
|
||||
# 查找可能的数据库插入信息
|
||||
for line in all_lines:
|
||||
# 查找类似 "insert into" 或 "保存到" 的信息
|
||||
if "insert" in line_lower or "保存到" in line_lower:
|
||||
try:
|
||||
numbers = re.findall(r'\d+', line)
|
||||
if numbers:
|
||||
# 尝试提取可能的记录数
|
||||
pass # 这里可以进一步解析
|
||||
except:
|
||||
pass
|
||||
|
||||
return stats
|
||||
|
||||
|
||||
Reference in New Issue
Block a user