更新部分爬虫以兼容本地运行及数据库存储

2025-12-16 10:56:56 +08:00
parent a9eda60493
commit ff1ce2a3ba
28 changed files with 1394 additions and 126 deletions
@@ -252,14 +252,63 @@ postgresql_db_config = {{
            logger.info(f"执行命令: {' '.join(cmd)}")
            
            # 切换到MediaCrawler目录并执行，捕获输出
-            result = subprocess.run(
+            # 使用utf-8编码，errors='surrogateescape'可以更好地处理编码问题
+            # 设置环境变量确保子进程使用UTF-8编码
+            env = os.environ.copy()
+            env['PYTHONIOENCODING'] = 'utf-8'
+            env['PYTHONUTF8'] = '1'
+            
+            # 使用 Popen 实时输出日志，而不是等到结束才显示
+            import subprocess as sp
+            process = sp.Popen(
                cmd,
                cwd=self.mediacrawler_path,
-                timeout=3600,  # 60分钟超时
-                capture_output=True,
+                stdout=sp.PIPE,
+                stderr=sp.STDOUT,  # 将stderr合并到stdout
                text=True,
                encoding='utf-8',
-                errors='replace'
+                errors='surrogateescape',
+                env=env,
+                bufsize=1,  # 行缓冲
+                universal_newlines=True
+            )
+            
+            # 实时读取并输出日志
+            output_lines = []
+            error_lines = []
+            try:
+                for line in process.stdout:
+                    line = line.rstrip()
+                    if line:
+                        output_lines.append(line)
+                        # 实时输出到控制台
+                        print(f"[{platform}] {line}", flush=True)
+                        logger.info(f"[{platform}] {line}")
+                
+                # 等待进程完成
+                return_code = process.wait(timeout=3600)
+            except sp.TimeoutExpired:
+                process.kill()
+                process.wait()
+                return_code = -1
+                logger.error(f"[{platform}] 爬取超时")
+            except Exception as e:
+                process.kill()
+                process.wait()
+                return_code = -1
+                logger.error(f"[{platform}] 执行异常: {e}", exc_info=True)
+            
+            # 创建类似 subprocess.run 的 result 对象
+            class Result:
+                def __init__(self, returncode, stdout, stderr):
+                    self.returncode = returncode
+                    self.stdout = stdout
+                    self.stderr = stderr
+            
+            result = Result(
+                returncode=return_code,
+                stdout='\n'.join(output_lines),
+                stderr='\n'.join(error_lines)
            )
            
            end_time = datetime.now()
@@ -269,6 +318,19 @@ postgresql_db_config = {{
            output_lines = result.stdout.split('\n') if result.stdout else []
            error_lines = result.stderr.split('\n') if result.stderr else []
            
+            # 输出日志到控制台和日志文件
+            if output_lines:
+                logger.info(f"[{platform}] 爬虫标准输出:")
+                for line in output_lines:
+                    if line.strip():  # 忽略空行
+                        logger.info(f"[{platform}] {line}")
+            
+            if error_lines:
+                logger.warning(f"[{platform}] 爬虫错误输出:")
+                for line in error_lines:
+                    if line.strip():  # 忽略空行
+                        logger.warning(f"[{platform}] {line}")
+            
            # 合并所有输出行用于解析
            all_lines = output_lines + error_lines
            
@@ -329,10 +391,64 @@ postgresql_db_config = {{
        # 合并所有行用于解析
        all_lines = output_lines + error_lines
        
+        # 用于统计各平台的保存操作次数（通过日志关键字统计）
+        # 视频/内容保存操作的关键字
+        content_save_keywords = [
+            "[store.bilibili.update_bilibili_video]",
+            "update_bilibili_video",
+            "[store.douyin.update_dy_aweme]",
+            "update_dy_aweme",
+            "[store.kuaishou.update_kuaishou_video]",
+            "update_kuaishou_video",
+            "[store.xhs.update_xhs_note]",
+            "update_xhs_note",
+            "[store.weibo.update_weibo_note]",
+            "update_weibo_note",
+            "[store.tieba.update_tieba_note]",
+            "update_tieba_note",
+            "[store.zhihu.update_zhihu_content]",
+            "update_zhihu_content",
+        ]
+        
+        # 评论保存操作的关键字
+        comment_save_keywords = [
+            "[store.bilibili.update_bilibili_video_comment]",
+            "update_bilibili_video_comment",
+            "[store.douyin.update_dy_aweme_comment]",
+            "update_dy_aweme_comment",
+            "[store.kuaishou.update_ks_video_comment]",
+            "update_ks_video_comment",
+            "[store.xhs.update_xhs_note_comment]",
+            "update_xhs_note_comment",
+            "[store.weibo.update_weibo_note_comment]",
+            "update_weibo_note_comment",
+            "[store.tieba.update_tieba_note_comment]",
+            "update_tieba_note_comment",
+            "[store.zhihu.update_zhihu_content_comment]",
+            "update_zhihu_note_comment",
+            "update_zhihu_content_comment",
+        ]
+        
+        # 先统计日志关键字出现的次数（用于bilibili等没有汇总信息的平台）
+        log_keyword_content_count = 0
+        log_keyword_comment_count = 0
+        
        # 解析输出行，查找各种可能的数据保存信息
        for line in all_lines:
            line_lower = line.lower()
            
+            # 统计视频/内容保存操作（通过日志关键字）
+            for keyword in content_save_keywords:
+                if keyword in line or keyword.lower() in line_lower:
+                    log_keyword_content_count += 1
+                    break  # 避免重复计数
+            
+            # 统计评论保存操作（通过日志关键字）
+            for keyword in comment_save_keywords:
+                if keyword in line or keyword.lower() in line_lower:
+                    log_keyword_comment_count += 1
+                    break  # 避免重复计数
+            
            # 查找保存的内容数量（多种可能的格式）
            # 例如："保存了 10 条笔记"、"成功保存 5 条内容"、"inserted 3 records"等
            if any(keyword in line_lower for keyword in ["条笔记", "条内容", "条视频", "条帖子", "条回答"]):
@@ -367,10 +483,18 @@ postgresql_db_config = {{
            if any(keyword in line_lower for keyword in ["登录", "扫码", "login", "需要登录"]):
                stats["login_required"] = True
        
+        # 如果通过汇总信息没有找到保存数量，使用日志关键字统计的结果
+        # 这样可以支持bilibili等没有输出汇总信息的平台
+        if stats["notes_count"] == 0 and log_keyword_content_count > 0:
+            stats["notes_count"] = log_keyword_content_count
+        if stats["comments_count"] == 0 and log_keyword_comment_count > 0:
+            stats["comments_count"] = log_keyword_comment_count
+        
        # 如果没有找到明确的保存数量，尝试从数据库操作日志中提取
        if stats["notes_count"] == 0 and stats["comments_count"] == 0:
            # 查找可能的数据库插入信息
            for line in all_lines:
+                line_lower = line.lower()
                # 查找类似 "insert into" 或 "保存到" 的信息
                if "insert" in line_lower or "保存到" in line_lower:
                    try: