更新部分爬虫以兼容本地运行及数据库存储

This commit is contained in:
z66
2025-12-16 10:56:56 +08:00
parent a9eda60493
commit ff1ce2a3ba
28 changed files with 1394 additions and 126 deletions
@@ -252,14 +252,63 @@ postgresql_db_config = {{
# Launch the crawler command as a child process, stream its output line by line,
# and package the outcome into a `result` object with returncode/stdout/stderr.
# NOTE(review): this span reads like a rendered diff whose +/- markers and
# indentation were stripped. The `result = subprocess.run(` line and the
# `timeout=3600`, `capture_output=True` and `errors='replace'` arguments look
# like the removed (pre-change) side -- confirm against the real file.
logger.info(f"执行命令: {' '.join(cmd)}")
# Switch to the MediaCrawler directory, run the command and capture its output
result = subprocess.run(
# Decode as UTF-8; errors='surrogateescape' is intended to cope better with encoding problems
# NOTE(review): surrogateescape maps undecodable bytes to lone surrogates, which
# may fail to encode when the line is later printed or logged -- confirm this is wanted.
# Set environment variables so the child process uses UTF-8
env = os.environ.copy()
env['PYTHONIOENCODING'] = 'utf-8'
env['PYTHONUTF8'] = '1'
# Use Popen so log lines are shown as they arrive instead of only after the process ends
# NOTE(review): `subprocess` is already referenced by name above, so this local
# alias import is presumably redundant -- confirm.
import subprocess as sp
process = sp.Popen(
cmd,
cwd=self.mediacrawler_path,
timeout=3600, # 60-minute timeout
capture_output=True,
stdout=sp.PIPE,
stderr=sp.STDOUT, # merge stderr into stdout
text=True,
encoding='utf-8',
errors='replace'
errors='surrogateescape',
env=env,
bufsize=1, # line-buffered
# NOTE(review): universal_newlines is the older alias of text=True, which is already passed above.
universal_newlines=True
)
# Read the output as it is produced and echo it
output_lines = []
# NOTE(review): nothing in the visible code appends to error_lines (stderr is merged
# into stdout above), so the stderr string built below is always empty.
error_lines = []
try:
for line in process.stdout:
line = line.rstrip()
if line:
output_lines.append(line)
# Echo to the console immediately
print(f"[{platform}] {line}", flush=True)
logger.info(f"[{platform}] {line}")
# Wait for the process to finish
# NOTE(review): as ordered here, this wait is reached only after the loop above has read
# stdout to EOF, so the 3600 s limit does not bound the read loop itself -- a child that
# keeps stdout open without exiting would not be timed out by this call.
return_code = process.wait(timeout=3600)
except sp.TimeoutExpired:
# Timed out: kill the child, reap it, and report failure with return code -1
process.kill()
process.wait()
return_code = -1
logger.error(f"[{platform}] 爬取超时")
except Exception as e:
# Any other failure: kill the child, reap it, and report failure with return code -1
process.kill()
process.wait()
return_code = -1
logger.error(f"[{platform}] 执行异常: {e}", exc_info=True)
# Build a result object exposing the same attributes read from subprocess.run's result
class Result:
def __init__(self, returncode, stdout, stderr):
self.returncode = returncode
self.stdout = stdout
self.stderr = stderr
result = Result(
returncode=return_code,
stdout='\n'.join(output_lines),
stderr='\n'.join(error_lines)
)
end_time = datetime.now()
@@ -269,6 +318,19 @@ postgresql_db_config = {{
# Split the captured stdout/stderr text back into lines (empty list when the text is empty)
output_lines = result.stdout.split('\n') if result.stdout else []
error_lines = result.stderr.split('\n') if result.stderr else []
# Write the captured output to the logger
# NOTE(review): if `result` is the object built from the streaming loop, every stdout line
# was already logged once while streaming, so this block would log it a second time, and
# result.stderr would be empty so the warning branch below would never run -- confirm.
if output_lines:
logger.info(f"[{platform}] 爬虫标准输出:")
for line in output_lines:
if line.strip(): # skip blank lines
logger.info(f"[{platform}] {line}")
if error_lines:
logger.warning(f"[{platform}] 爬虫错误输出:")
for line in error_lines:
if line.strip(): # skip blank lines
logger.warning(f"[{platform}] {line}")
# Merge all output lines for parsing
all_lines = output_lines + error_lines
@@ -329,10 +391,64 @@ postgresql_db_config = {{
# Merge all lines for parsing
all_lines = output_lines + error_lines
# Count save operations per platform by matching log keywords
# Keywords that mark a video/content save
# NOTE(review): each bare name below is a substring of its bracketed form, so the bracketed
# entries never change the outcome. The bilibili, douyin, xhs, weibo, tieba and zhihu bare
# names are also substrings of entries in comment_save_keywords (e.g. "update_bilibili_video"
# inside "update_bilibili_video_comment"), so a comment-save log line matches this list too
# and is counted as a content save as well.
content_save_keywords = [
"[store.bilibili.update_bilibili_video]",
"update_bilibili_video",
"[store.douyin.update_dy_aweme]",
"update_dy_aweme",
"[store.kuaishou.update_kuaishou_video]",
"update_kuaishou_video",
"[store.xhs.update_xhs_note]",
"update_xhs_note",
"[store.weibo.update_weibo_note]",
"update_weibo_note",
"[store.tieba.update_tieba_note]",
"update_tieba_note",
"[store.zhihu.update_zhihu_content]",
"update_zhihu_content",
]
# Keywords that mark a comment save
comment_save_keywords = [
"[store.bilibili.update_bilibili_video_comment]",
"update_bilibili_video_comment",
"[store.douyin.update_dy_aweme_comment]",
"update_dy_aweme_comment",
"[store.kuaishou.update_ks_video_comment]",
"update_ks_video_comment",
"[store.xhs.update_xhs_note_comment]",
"update_xhs_note_comment",
"[store.weibo.update_weibo_note_comment]",
"update_weibo_note_comment",
"[store.tieba.update_tieba_note_comment]",
"update_tieba_note_comment",
"[store.zhihu.update_zhihu_content_comment]",
"update_zhihu_note_comment",
"update_zhihu_content_comment",
]
# First count keyword occurrences in the log (for platforms such as bilibili that print no summary)
log_keyword_content_count = 0
log_keyword_comment_count = 0
# Walk the output lines looking for any of the known "data saved" messages
for line in all_lines:
line_lower = line.lower()
# Count video/content saves (by log keyword)
# NOTE(review): every keyword is already a lowercase literal, so the `keyword in line`
# test is subsumed by the `keyword.lower() in line_lower` test.
for keyword in content_save_keywords:
if keyword in line or keyword.lower() in line_lower:
log_keyword_content_count += 1
break # stop at the first match so a line is counted at most once
# Count comment saves (by log keyword)
for keyword in comment_save_keywords:
if keyword in line or keyword.lower() in line_lower:
log_keyword_comment_count += 1
break # stop at the first match so a line is counted at most once
# Look for a saved-item count in the line (several possible formats)
# e.g. "保存了 10 条笔记", "成功保存 5 条内容", "inserted 3 records", etc.
if any(keyword in line_lower for keyword in ["条笔记", "条内容", "条视频", "条帖子", "条回答"]):
@@ -367,10 +483,18 @@ postgresql_db_config = {{
# Flag the run as needing login when the line mentions login or QR-code scanning
# NOTE(review): "需要登录" contains "登录", so the last list entry never changes the result.
if any(keyword in line_lower for keyword in ["登录", "扫码", "login", "需要登录"]):
stats["login_required"] = True
# If no saved count was found from summary lines, fall back to the keyword-based counts
# This supports platforms such as bilibili that print no summary
# NOTE(review): the content fallback inherits any over-count from the keyword matching,
# because the bare content keywords also match comment-save log lines -- confirm.
if stats["notes_count"] == 0 and log_keyword_content_count > 0:
stats["notes_count"] = log_keyword_content_count
if stats["comments_count"] == 0 and log_keyword_comment_count > 0:
stats["comments_count"] = log_keyword_comment_count
# If there is still no explicit saved count, try to extract one from database-operation log lines
if stats["notes_count"] == 0 and stats["comments_count"] == 0:
# Look for possible database insert messages
for line in all_lines:
line_lower = line.lower()
# Look for text such as "insert into" or "保存到"
if "insert" in line_lower or "保存到" in line_lower:
# (the body of this try continues beyond the visible excerpt)
try: