更新部分爬虫以兼容本地运行及数据库存储
This commit is contained in:
@@ -252,14 +252,63 @@ postgresql_db_config = {{
|
||||
logger.info(f"执行命令: {' '.join(cmd)}")
|
||||
|
||||
# 切换到MediaCrawler目录并执行,捕获输出
|
||||
result = subprocess.run(
|
||||
# 使用utf-8编码,errors='surrogateescape'可以更好地处理编码问题
|
||||
# 设置环境变量确保子进程使用UTF-8编码
|
||||
env = os.environ.copy()
|
||||
env['PYTHONIOENCODING'] = 'utf-8'
|
||||
env['PYTHONUTF8'] = '1'
|
||||
|
||||
# 使用 Popen 实时输出日志,而不是等到结束才显示
|
||||
import subprocess as sp
|
||||
process = sp.Popen(
|
||||
cmd,
|
||||
cwd=self.mediacrawler_path,
|
||||
timeout=3600, # 60分钟超时
|
||||
capture_output=True,
|
||||
stdout=sp.PIPE,
|
||||
stderr=sp.STDOUT, # 将stderr合并到stdout
|
||||
text=True,
|
||||
encoding='utf-8',
|
||||
errors='replace'
|
||||
errors='surrogateescape',
|
||||
env=env,
|
||||
bufsize=1, # 行缓冲
|
||||
universal_newlines=True
|
||||
)
|
||||
|
||||
# 实时读取并输出日志
|
||||
output_lines = []
|
||||
error_lines = []
|
||||
try:
|
||||
for line in process.stdout:
|
||||
line = line.rstrip()
|
||||
if line:
|
||||
output_lines.append(line)
|
||||
# 实时输出到控制台
|
||||
print(f"[{platform}] {line}", flush=True)
|
||||
logger.info(f"[{platform}] {line}")
|
||||
|
||||
# 等待进程完成
|
||||
return_code = process.wait(timeout=3600)
|
||||
except sp.TimeoutExpired:
|
||||
process.kill()
|
||||
process.wait()
|
||||
return_code = -1
|
||||
logger.error(f"[{platform}] 爬取超时")
|
||||
except Exception as e:
|
||||
process.kill()
|
||||
process.wait()
|
||||
return_code = -1
|
||||
logger.error(f"[{platform}] 执行异常: {e}", exc_info=True)
|
||||
|
||||
# 创建类似 subprocess.run 的 result 对象
|
||||
class Result:
    """Minimal stand-in for the object returned by ``subprocess.run``.

    Carries the process exit code together with the captured stdout and
    stderr text, exposed under the attribute names ``returncode``,
    ``stdout`` and ``stderr``.
    """

    def __init__(self, returncode, stdout, stderr):
        # Bind all three fields in a single step; the attribute names match
        # the constructor arguments one-to-one.
        self.returncode, self.stdout, self.stderr = returncode, stdout, stderr
|
||||
|
||||
result = Result(
|
||||
returncode=return_code,
|
||||
stdout='\n'.join(output_lines),
|
||||
stderr='\n'.join(error_lines)
|
||||
)
|
||||
|
||||
end_time = datetime.now()
|
||||
@@ -269,6 +318,19 @@ postgresql_db_config = {{
|
||||
output_lines = result.stdout.split('\n') if result.stdout else []
|
||||
error_lines = result.stderr.split('\n') if result.stderr else []
|
||||
|
||||
# 输出日志到控制台和日志文件
|
||||
if output_lines:
|
||||
logger.info(f"[{platform}] 爬虫标准输出:")
|
||||
for line in output_lines:
|
||||
if line.strip(): # 忽略空行
|
||||
logger.info(f"[{platform}] {line}")
|
||||
|
||||
if error_lines:
|
||||
logger.warning(f"[{platform}] 爬虫错误输出:")
|
||||
for line in error_lines:
|
||||
if line.strip(): # 忽略空行
|
||||
logger.warning(f"[{platform}] {line}")
|
||||
|
||||
# 合并所有输出行用于解析
|
||||
all_lines = output_lines + error_lines
|
||||
|
||||
@@ -329,10 +391,64 @@ postgresql_db_config = {{
|
||||
# 合并所有行用于解析
|
||||
all_lines = output_lines + error_lines
|
||||
|
||||
# 用于统计各平台的保存操作次数(通过日志关键字统计)
|
||||
# 视频/内容保存操作的关键字
|
||||
content_save_keywords = [
|
||||
"[store.bilibili.update_bilibili_video]",
|
||||
"update_bilibili_video",
|
||||
"[store.douyin.update_dy_aweme]",
|
||||
"update_dy_aweme",
|
||||
"[store.kuaishou.update_kuaishou_video]",
|
||||
"update_kuaishou_video",
|
||||
"[store.xhs.update_xhs_note]",
|
||||
"update_xhs_note",
|
||||
"[store.weibo.update_weibo_note]",
|
||||
"update_weibo_note",
|
||||
"[store.tieba.update_tieba_note]",
|
||||
"update_tieba_note",
|
||||
"[store.zhihu.update_zhihu_content]",
|
||||
"update_zhihu_content",
|
||||
]
|
||||
|
||||
# 评论保存操作的关键字
|
||||
comment_save_keywords = [
|
||||
"[store.bilibili.update_bilibili_video_comment]",
|
||||
"update_bilibili_video_comment",
|
||||
"[store.douyin.update_dy_aweme_comment]",
|
||||
"update_dy_aweme_comment",
|
||||
"[store.kuaishou.update_ks_video_comment]",
|
||||
"update_ks_video_comment",
|
||||
"[store.xhs.update_xhs_note_comment]",
|
||||
"update_xhs_note_comment",
|
||||
"[store.weibo.update_weibo_note_comment]",
|
||||
"update_weibo_note_comment",
|
||||
"[store.tieba.update_tieba_note_comment]",
|
||||
"update_tieba_note_comment",
|
||||
"[store.zhihu.update_zhihu_content_comment]",
|
||||
"update_zhihu_note_comment",
|
||||
"update_zhihu_content_comment",
|
||||
]
|
||||
|
||||
# 先统计日志关键字出现的次数(用于bilibili等没有汇总信息的平台)
|
||||
log_keyword_content_count = 0
|
||||
log_keyword_comment_count = 0
|
||||
|
||||
# 解析输出行,查找各种可能的数据保存信息
|
||||
for line in all_lines:
|
||||
line_lower = line.lower()
|
||||
|
||||
# 统计视频/内容保存操作(通过日志关键字)
|
||||
for keyword in content_save_keywords:
|
||||
if keyword in line or keyword.lower() in line_lower:
|
||||
log_keyword_content_count += 1
|
||||
break # 避免重复计数
|
||||
|
||||
# 统计评论保存操作(通过日志关键字)
|
||||
for keyword in comment_save_keywords:
|
||||
if keyword in line or keyword.lower() in line_lower:
|
||||
log_keyword_comment_count += 1
|
||||
break # 避免重复计数
|
||||
|
||||
# 查找保存的内容数量(多种可能的格式)
|
||||
# 例如:"保存了 10 条笔记"、"成功保存 5 条内容"、"inserted 3 records"等
|
||||
if any(keyword in line_lower for keyword in ["条笔记", "条内容", "条视频", "条帖子", "条回答"]):
|
||||
@@ -367,10 +483,18 @@ postgresql_db_config = {{
|
||||
if any(keyword in line_lower for keyword in ["登录", "扫码", "login", "需要登录"]):
|
||||
stats["login_required"] = True
|
||||
|
||||
# 如果通过汇总信息没有找到保存数量,使用日志关键字统计的结果
|
||||
# 这样可以支持bilibili等没有输出汇总信息的平台
|
||||
if stats["notes_count"] == 0 and log_keyword_content_count > 0:
|
||||
stats["notes_count"] = log_keyword_content_count
|
||||
if stats["comments_count"] == 0 and log_keyword_comment_count > 0:
|
||||
stats["comments_count"] = log_keyword_comment_count
|
||||
|
||||
# 如果没有找到明确的保存数量,尝试从数据库操作日志中提取
|
||||
if stats["notes_count"] == 0 and stats["comments_count"] == 0:
|
||||
# 查找可能的数据库插入信息
|
||||
for line in all_lines:
|
||||
line_lower = line.lower()
|
||||
# 查找类似 "insert into" 或 "保存到" 的信息
|
||||
if "insert" in line_lower or "保存到" in line_lower:
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user