本地化&2.0

This commit is contained in:
z66
2025-12-02 14:01:39 +08:00
parent ec1baf539c
commit a9eda60493
15 changed files with 409 additions and 140 deletions
@@ -251,16 +251,30 @@ postgresql_db_config = {{
logger.info(f"执行命令: {' '.join(cmd)}")
# 切换到MediaCrawler目录并执行
# 切换到MediaCrawler目录并执行,捕获输出
result = subprocess.run(
cmd,
cwd=self.mediacrawler_path,
timeout=3600 # 60分钟超时
timeout=3600, # 60分钟超时
capture_output=True,
text=True,
encoding='utf-8',
errors='replace'
)
end_time = datetime.now()
duration = (end_time - start_time).total_seconds()
# 解析输出,提取实际保存的数据量
output_lines = result.stdout.split('\n') if result.stdout else []
error_lines = result.stderr.split('\n') if result.stderr else []
# 合并所有输出行用于解析
all_lines = output_lines + error_lines
# 解析统计信息
parsed_stats = self._parse_crawl_output(all_lines, error_lines)
# 创建统计信息
crawl_stats = {
"platform": platform,
@@ -270,9 +284,11 @@ postgresql_db_config = {{
"end_time": end_time.isoformat(),
"return_code": result.returncode,
"success": result.returncode == 0,
"notes_count": 0,
"comments_count": 0,
"errors_count": 0
"notes_count": parsed_stats.get("notes_count", 0),
"comments_count": parsed_stats.get("comments_count", 0),
"errors_count": parsed_stats.get("errors_count", 0),
"output_preview": '\n'.join(output_lines[-20:]) if output_lines else "", # 最后20行输出
"error_preview": '\n'.join(error_lines[-20:]) if error_lines else "" # 最后20行错误
}
# 保存统计信息
@@ -280,8 +296,16 @@ postgresql_db_config = {{
if result.returncode == 0:
logger.info(f"{platform} 爬取完成,耗时: {duration:.1f}")
logger.info(f" 保存内容: {crawl_stats['notes_count']} 条,评论: {crawl_stats['comments_count']}")
if crawl_stats['notes_count'] == 0 and crawl_stats['comments_count'] == 0:
logger.warning(f"⚠️ {platform} 爬取成功但未保存任何数据,请检查数据库连接和保存逻辑")
# 输出部分日志用于调试
if crawl_stats['error_preview']:
logger.warning(f" 错误信息: {crawl_stats['error_preview'][:500]}")
else:
logger.error(f"{platform} 爬取失败,返回码: {result.returncode}")
if error_lines:
logger.error(f" 错误信息: {crawl_stats['error_preview'][:500]}")
return crawl_stats
@@ -294,6 +318,7 @@ postgresql_db_config = {{
def _parse_crawl_output(self, output_lines: List[str], error_lines: List[str]) -> Dict:
"""解析爬取输出,提取统计信息"""
import re
stats = {
"notes_count": 0,
"comments_count": 0,
@@ -301,32 +326,60 @@ postgresql_db_config = {{
"login_required": False
}
# 解析输出行
for line in output_lines:
if "条笔记" in line or "条内容" in line:
# 合并所有行用于解析
all_lines = output_lines + error_lines
# 解析输出行,查找各种可能的数据保存信息
for line in all_lines:
line_lower = line.lower()
# 查找保存的内容数量(多种可能的格式)
# 例如:"保存了 10 条笔记"、"成功保存 5 条内容"、"inserted 3 records"等
if any(keyword in line_lower for keyword in ["条笔记", "条内容", "条视频", "条帖子", "条回答"]):
try:
# 提取数字
import re
# 提取数字,优先取第一个数字
numbers = re.findall(r'\d+', line)
if numbers:
stats["notes_count"] = int(numbers[0])
# 如果找到多个数字,取最大的(通常是总数)
potential_count = max([int(n) for n in numbers])
if potential_count > stats["notes_count"]:
stats["notes_count"] = potential_count
except:
pass
elif "条评论" in line:
# 查找保存的评论数量
if "条评论" in line_lower:
try:
import re
numbers = re.findall(r'\d+', line)
if numbers:
stats["comments_count"] = int(numbers[0])
potential_count = max([int(n) for n in numbers])
if potential_count > stats["comments_count"]:
stats["comments_count"] = potential_count
except:
pass
elif "登录" in line or "扫码" in line:
# 查找数据库相关错误
if any(keyword in line_lower for keyword in ["数据库", "database", "connection", "连接失败", "保存失败"]):
if "error" in line_lower or "失败" in line_lower or "异常" in line_lower:
stats["errors_count"] += 1
# 查找登录相关
if any(keyword in line_lower for keyword in ["登录", "扫码", "login", "需要登录"]):
stats["login_required"] = True
# 解析错误行
for line in error_lines:
if "error" in line.lower() or "异常" in line:
stats["errors_count"] += 1
# 如果没有找到明确的保存数量,尝试从数据库操作日志中提取
if stats["notes_count"] == 0 and stats["comments_count"] == 0:
# 查找可能的数据库插入信息
for line in all_lines:
# 查找类似 "insert into" 或 "保存到" 的信息
if "insert" in line_lower or "保存到" in line_lower:
try:
numbers = re.findall(r'\d+', line)
if numbers:
# 尝试提取可能的记录数
pass # 这里可以进一步解析
except:
pass
return stats