本地化&2.0

2025-12-02 14:01:39 +08:00
parent ec1baf539c
commit a9eda60493
15 changed files with 409 additions and 140 deletions
@@ -251,16 +251,30 @@ postgresql_db_config = {{
            
            logger.info(f"执行命令: {' '.join(cmd)}")
            
-            # 切换到MediaCrawler目录并执行
+            # 切换到MediaCrawler目录并执行，捕获输出
            result = subprocess.run(
                cmd,
                cwd=self.mediacrawler_path,
-                timeout=3600  # 60分钟超时
+                timeout=3600,  # 60分钟超时
+                capture_output=True,
+                text=True,
+                encoding='utf-8',
+                errors='replace'
            )
            
            end_time = datetime.now()
            duration = (end_time - start_time).total_seconds()
            
+            # 解析输出，提取实际保存的数据量
+            output_lines = result.stdout.split('\n') if result.stdout else []
+            error_lines = result.stderr.split('\n') if result.stderr else []
+            
+            # 合并所有输出行用于解析
+            all_lines = output_lines + error_lines
+            
+            # 解析统计信息
+            parsed_stats = self._parse_crawl_output(all_lines, error_lines)
+            
            # 创建统计信息
            crawl_stats = {
                "platform": platform,
@@ -270,9 +284,11 @@ postgresql_db_config = {{
                "end_time": end_time.isoformat(),
                "return_code": result.returncode,
                "success": result.returncode == 0,
-                "notes_count": 0,
-                "comments_count": 0,
-                "errors_count": 0
+                "notes_count": parsed_stats.get("notes_count", 0),
+                "comments_count": parsed_stats.get("comments_count", 0),
+                "errors_count": parsed_stats.get("errors_count", 0),
+                "output_preview": '\n'.join(output_lines[-20:]) if output_lines else "",  # 最后20行输出
+                "error_preview": '\n'.join(error_lines[-20:]) if error_lines else ""  # 最后20行错误
            }
            
            # 保存统计信息
@@ -280,8 +296,16 @@ postgresql_db_config = {{
            
            if result.returncode == 0:
                logger.info(f"✅ {platform} 爬取完成，耗时: {duration:.1f}秒")
+                logger.info(f"   保存内容: {crawl_stats['notes_count']} 条，评论: {crawl_stats['comments_count']} 条")
+                if crawl_stats['notes_count'] == 0 and crawl_stats['comments_count'] == 0:
+                    logger.warning(f"⚠️ {platform} 爬取成功但未保存任何数据，请检查数据库连接和保存逻辑")
+                    # 输出部分日志用于调试
+                    if crawl_stats['error_preview']:
+                        logger.warning(f"   错误信息: {crawl_stats['error_preview'][:500]}")
            else:
                logger.error(f"❌ {platform} 爬取失败，返回码: {result.returncode}")
+                if error_lines:
+                    logger.error(f"   错误信息: {crawl_stats['error_preview'][:500]}")
            
            return crawl_stats
            
@@ -294,6 +318,7 @@ postgresql_db_config = {{
    
    def _parse_crawl_output(self, output_lines: List[str], error_lines: List[str]) -> Dict:
        """解析爬取输出，提取统计信息"""
+        import re
        stats = {
            "notes_count": 0,
            "comments_count": 0,
@@ -301,32 +326,60 @@ postgresql_db_config = {{
            "login_required": False
        }
        
-        # 解析输出行
-        for line in output_lines:
-            if "条笔记" in line or "条内容" in line:
+        # 合并所有行用于解析
+        all_lines = output_lines + error_lines
+        
+        # 解析输出行，查找各种可能的数据保存信息
+        for line in all_lines:
+            line_lower = line.lower()
+            
+            # 查找保存的内容数量（多种可能的格式）
+            # 例如："保存了 10 条笔记"、"成功保存 5 条内容"、"inserted 3 records"等
+            if any(keyword in line_lower for keyword in ["条笔记", "条内容", "条视频", "条帖子", "条回答"]):
                try:
-                    # 提取数字
-                    import re
+                    # 提取数字，优先取第一个数字
                    numbers = re.findall(r'\d+', line)
                    if numbers:
-                        stats["notes_count"] = int(numbers[0])
+                        # 如果找到多个数字，取最大的（通常是总数）
+                        potential_count = max([int(n) for n in numbers])
+                        if potential_count > stats["notes_count"]:
+                            stats["notes_count"] = potential_count
                except:
                    pass
-            elif "条评论" in line:
+            
+            # 查找保存的评论数量
+            if "条评论" in line_lower:
                try:
-                    import re
                    numbers = re.findall(r'\d+', line)
                    if numbers:
-                        stats["comments_count"] = int(numbers[0])
+                        potential_count = max([int(n) for n in numbers])
+                        if potential_count > stats["comments_count"]:
+                            stats["comments_count"] = potential_count
                except:
                    pass
-            elif "登录" in line or "扫码" in line:
+            
+            # 查找数据库相关错误
+            if any(keyword in line_lower for keyword in ["数据库", "database", "connection", "连接失败", "保存失败"]):
+                if "error" in line_lower or "失败" in line_lower or "异常" in line_lower:
+                    stats["errors_count"] += 1
+            
+            # 查找登录相关
+            if any(keyword in line_lower for keyword in ["登录", "扫码", "login", "需要登录"]):
                stats["login_required"] = True
        
-        # 解析错误行
-        for line in error_lines:
-            if "error" in line.lower() or "异常" in line:
-                stats["errors_count"] += 1
+        # 如果没有找到明确的保存数量，尝试从数据库操作日志中提取
+        if stats["notes_count"] == 0 and stats["comments_count"] == 0:
+            # 查找可能的数据库插入信息
+            for line in all_lines:
+                # 查找类似 "insert into" 或 "保存到" 的信息
+                if "insert" in line_lower or "保存到" in line_lower:
+                    try:
+                        numbers = re.findall(r'\d+', line)
+                        if numbers:
+                            # 尝试提取可能的记录数
+                            pass  # 这里可以进一步解析
+                    except:
+                        pass
        
        return stats