1. 同步MediaCrawler为最新版本
2. 修复数据库not null错误 3. 支持PG数据库 4. 规范环境变量及配置使用 5. 规范为uv安装 6. 使用loguru
This commit is contained in:
@@ -0,0 +1,107 @@
|
||||
import asyncio
|
||||
import csv
|
||||
import json
|
||||
import os
|
||||
import pathlib
|
||||
from typing import Dict, List
|
||||
import aiofiles
|
||||
import config
|
||||
from tools.utils import utils
|
||||
from tools.words import AsyncWordCloudGenerator
|
||||
|
||||
class AsyncFileWriter:
    """Persist crawled items to per-day CSV / JSON files and build word clouds.

    Files are written below ``data/<platform>/<file_type>/`` and are named
    ``<crawler_type>_<item_type>_<date>.<file_type>``. Every write issued
    through one instance is serialized with ``self.lock`` so that concurrent
    coroutines sharing the instance do not interleave their file updates.
    """

    # Keys probed, in this order, to find the text of a comment record.
    _COMMENT_TEXT_KEYS = ('content', 'comment_text', 'text')

    def __init__(self, platform: str, crawler_type: str):
        """
        :param platform: platform name, used as the directory under ``data/``
        :param crawler_type: crawler type, used as the file-name prefix
        """
        self.lock = asyncio.Lock()
        self.platform = platform
        self.crawler_type = crawler_type
        # The generator is only built when word clouds are enabled in config.
        self.wordcloud_generator = AsyncWordCloudGenerator() if config.ENABLE_GET_WORDCLOUD else None

    def _get_file_path(self, file_type: str, item_type: str) -> str:
        """Return the output path for ``item_type``, creating its directory.

        :param file_type: file extension, also used as the sub-directory name
        :param item_type: kind of record stored in the file (e.g. 'comments')
        :return: ``data/<platform>/<file_type>/<crawler_type>_<item_type>_<date>.<file_type>``
        """
        base_path = f"data/{self.platform}/{file_type}"
        pathlib.Path(base_path).mkdir(parents=True, exist_ok=True)
        file_name = f"{self.crawler_type}_{item_type}_{utils.get_current_date()}.{file_type}"
        return f"{base_path}/{file_name}"

    async def write_to_csv(self, item: Dict, item_type: str):
        """Append ``item`` as one row to today's CSV file for ``item_type``.

        The header row (the keys of ``item``) is written first when the file
        is new or still empty.

        :param item: mapping of column name to value
        :param item_type: kind of record, used in the file name
        """
        file_path = self._get_file_path('csv', item_type)
        async with self.lock:
            file_exists = os.path.exists(file_path)
            async with aiofiles.open(file_path, 'a', newline='', encoding='utf-8-sig') as f:
                writer = csv.DictWriter(f, fieldnames=item.keys())
                if not file_exists or await f.tell() == 0:
                    # csv writers return whatever the underlying write() returns;
                    # with an aiofiles handle that is an awaitable, hence the await.
                    await writer.writeheader()
                await writer.writerow(item)

    async def write_single_item_to_json(self, item: Dict, item_type: str):
        """Append ``item`` to the JSON array stored in today's file.

        The file is read in full, ``item`` is appended to the decoded list
        and the whole list is written back. A top-level value that is not a
        list is wrapped into a one-element list first.

        If the existing file cannot be decoded as JSON, a warning is logged
        and the file is restarted with ``item`` as its only element.

        :param item: JSON-serializable record to store
        :param item_type: kind of record, used in the file name
        """
        file_path = self._get_file_path('json', item_type)
        async with self.lock:
            existing_data = []
            if os.path.exists(file_path) and os.path.getsize(file_path) > 0:
                async with aiofiles.open(file_path, 'r', encoding='utf-8') as f:
                    content = await f.read()
                if content:
                    try:
                        existing_data = json.loads(content)
                    except json.JSONDecodeError as e:
                        # Previously swallowed silently; surface it because the
                        # rewrite below replaces the unreadable contents.
                        utils.logger.warning(
                            f"[AsyncFileWriter.write_single_item_to_json] Existing file {file_path} "
                            f"is not valid JSON ({e}); its previous contents will be overwritten"
                        )
                        existing_data = []
                    else:
                        if not isinstance(existing_data, list):
                            existing_data = [existing_data]

            existing_data.append(item)

            async with aiofiles.open(file_path, 'w', encoding='utf-8') as f:
                await f.write(json.dumps(existing_data, ensure_ascii=False, indent=4))

    @staticmethod
    def _extract_comment_texts(comments_data: List) -> List[Dict]:
        """Return ``[{'content': text}, ...]`` for every record that has text.

        Non-dict entries are skipped. For each dict the first truthy value
        among the 'content', 'comment_text' and 'text' keys is used; records
        without any truthy value are dropped.
        """
        extracted = []
        for comment in comments_data:
            if not isinstance(comment, dict):
                continue
            content_text = ''
            for key in AsyncFileWriter._COMMENT_TEXT_KEYS:
                content_text = comment.get(key) or ''
                if content_text:
                    break
            if content_text:
                extracted.append({'content': content_text})
        return extracted

    async def generate_wordcloud_from_comments(self):
        """
        Generate a word cloud from today's comments JSON file.

        Does nothing unless both ENABLE_GET_WORDCLOUD and ENABLE_GET_COMMENTS
        are enabled and a word-cloud generator was created. Any error raised
        while reading the comments or generating the cloud is logged and not
        propagated.
        """
        if not config.ENABLE_GET_WORDCLOUD or not config.ENABLE_GET_COMMENTS:
            return

        if not self.wordcloud_generator:
            return

        try:
            # Read comments from JSON file
            comments_file_path = self._get_file_path('json', 'comments')
            if not os.path.exists(comments_file_path) or os.path.getsize(comments_file_path) == 0:
                utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] No comments file found at {comments_file_path}")
                return

            async with aiofiles.open(comments_file_path, 'r', encoding='utf-8') as f:
                content = await f.read()
                if not content:
                    utils.logger.info("[AsyncFileWriter.generate_wordcloud_from_comments] Comments file is empty")
                    return

                comments_data = json.loads(content)
                if not isinstance(comments_data, list):
                    comments_data = [comments_data]

            # Keep only the comment text; field names differ across platforms.
            filtered_data = self._extract_comment_texts(comments_data)
            if not filtered_data:
                utils.logger.info("[AsyncFileWriter.generate_wordcloud_from_comments] No valid comment content found")
                return

            # Generate wordcloud
            words_base_path = f"data/{self.platform}/words"
            pathlib.Path(words_base_path).mkdir(parents=True, exist_ok=True)
            words_file_prefix = f"{words_base_path}/{self.crawler_type}_comments_{utils.get_current_date()}"

            utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] Generating wordcloud from {len(filtered_data)} comments")
            await self.wordcloud_generator.generate_word_frequency_and_cloud(filtered_data, words_file_prefix)
            utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] Wordcloud generated successfully at {words_file_prefix}")

        except Exception as e:
            # Word-cloud generation is best effort: report the failure and carry on.
            utils.logger.error(f"[AsyncFileWriter.generate_wordcloud_from_comments] Error generating wordcloud: {e}")
|
||||
@@ -14,6 +14,7 @@ import platform
|
||||
import subprocess
|
||||
import time
|
||||
import socket
|
||||
import signal
|
||||
from typing import Optional, List, Tuple
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
@@ -106,7 +107,7 @@ class BrowserLauncher:
|
||||
|
||||
raise RuntimeError(f"无法找到可用的端口,已尝试 {start_port} 到 {port-1}")
|
||||
|
||||
def launch_browser(self, browser_path: str, debug_port: int, headless: bool = False,
|
||||
def launch_browser(self, browser_path: str, debug_port: int, headless: bool = False,
|
||||
user_data_dir: Optional[str] = None) -> subprocess.Popen:
|
||||
"""
|
||||
启动浏览器进程
|
||||
@@ -126,23 +127,24 @@ class BrowserLauncher:
|
||||
"--disable-hang-monitor",
|
||||
"--disable-prompt-on-repost",
|
||||
"--disable-sync",
|
||||
"--disable-web-security", # 可能有助于某些网站的访问
|
||||
"--disable-features=VizDisplayCompositor",
|
||||
"--disable-dev-shm-usage", # 避免共享内存问题
|
||||
"--no-sandbox", # 在CDP模式下关闭沙箱
|
||||
# 🔥 关键反检测参数
|
||||
"--disable-blink-features=AutomationControlled", # 禁用自动化控制标记
|
||||
"--exclude-switches=enable-automation", # 排除自动化开关
|
||||
"--disable-infobars", # 禁用信息栏
|
||||
]
|
||||
|
||||
|
||||
# 无头模式
|
||||
if headless:
|
||||
args.extend([
|
||||
"--headless",
|
||||
"--headless=new", # 使用新的headless模式
|
||||
"--disable-gpu",
|
||||
])
|
||||
else:
|
||||
# 非无头模式下也保持一些稳定性参数
|
||||
# 非无头模式的额外参数
|
||||
args.extend([
|
||||
"--disable-blink-features=AutomationControlled",
|
||||
"--disable-infobars",
|
||||
"--start-maximized", # 最大化窗口,更像真实用户
|
||||
])
|
||||
|
||||
# 用户数据目录
|
||||
@@ -169,7 +171,8 @@ class BrowserLauncher:
|
||||
stderr=subprocess.DEVNULL,
|
||||
preexec_fn=os.setsid # 创建新的进程组
|
||||
)
|
||||
|
||||
|
||||
self.browser_process = process
|
||||
return process
|
||||
|
||||
except Exception as e:
|
||||
@@ -230,20 +233,48 @@ class BrowserLauncher:
|
||||
"""
|
||||
清理资源,关闭浏览器进程
|
||||
"""
|
||||
if self.browser_process:
|
||||
try:
|
||||
utils.logger.info("[BrowserLauncher] 正在关闭浏览器进程...")
|
||||
|
||||
if self.system == "Windows":
|
||||
# Windows下使用taskkill强制终止进程树
|
||||
subprocess.run(["taskkill", "/F", "/T", "/PID", str(self.browser_process.pid)],
|
||||
capture_output=True)
|
||||
if not self.browser_process:
|
||||
return
|
||||
|
||||
process = self.browser_process
|
||||
|
||||
if process.poll() is not None:
|
||||
utils.logger.info("[BrowserLauncher] 浏览器进程已退出,无需清理")
|
||||
self.browser_process = None
|
||||
return
|
||||
|
||||
utils.logger.info("[BrowserLauncher] 正在关闭浏览器进程...")
|
||||
|
||||
try:
|
||||
if self.system == "Windows":
|
||||
# 先尝试正常终止
|
||||
process.terminate()
|
||||
try:
|
||||
process.wait(timeout=5)
|
||||
except subprocess.TimeoutExpired:
|
||||
utils.logger.warning("[BrowserLauncher] 正常终止超时,使用taskkill强制结束")
|
||||
subprocess.run(
|
||||
["taskkill", "/F", "/T", "/PID", str(process.pid)],
|
||||
capture_output=True,
|
||||
check=False,
|
||||
)
|
||||
process.wait(timeout=5)
|
||||
else:
|
||||
pgid = os.getpgid(process.pid)
|
||||
try:
|
||||
os.killpg(pgid, signal.SIGTERM)
|
||||
except ProcessLookupError:
|
||||
utils.logger.info("[BrowserLauncher] 浏览器进程组不存在,可能已退出")
|
||||
else:
|
||||
# Unix系统下终止进程组
|
||||
os.killpg(os.getpgid(self.browser_process.pid), 9)
|
||||
|
||||
self.browser_process = None
|
||||
utils.logger.info("[BrowserLauncher] 浏览器进程已关闭")
|
||||
|
||||
except Exception as e:
|
||||
utils.logger.warning(f"[BrowserLauncher] 关闭浏览器进程时出错: {e}")
|
||||
try:
|
||||
process.wait(timeout=5)
|
||||
except subprocess.TimeoutExpired:
|
||||
utils.logger.warning("[BrowserLauncher] 优雅关闭超时,发送SIGKILL")
|
||||
os.killpg(pgid, signal.SIGKILL)
|
||||
process.wait(timeout=5)
|
||||
|
||||
utils.logger.info("[BrowserLauncher] 浏览器进程已关闭")
|
||||
except Exception as e:
|
||||
utils.logger.warning(f"[BrowserLauncher] 关闭浏览器进程时出错: {e}")
|
||||
finally:
|
||||
self.browser_process = None
|
||||
|
||||
@@ -291,16 +291,28 @@ class CDPBrowserManager:
|
||||
"""
|
||||
try:
|
||||
# 关闭浏览器上下文
|
||||
# if self.browser_context:
|
||||
# await self.browser_context.close()
|
||||
# self.browser_context = None
|
||||
# utils.logger.info("[CDPBrowserManager] 浏览器上下文已关闭")
|
||||
if self.browser_context:
|
||||
try:
|
||||
await self.browser_context.close()
|
||||
utils.logger.info("[CDPBrowserManager] 浏览器上下文已关闭")
|
||||
except Exception as context_error:
|
||||
utils.logger.warning(
|
||||
f"[CDPBrowserManager] 关闭浏览器上下文失败: {context_error}"
|
||||
)
|
||||
finally:
|
||||
self.browser_context = None
|
||||
|
||||
# # 断开浏览器连接
|
||||
# if self.browser:
|
||||
# await self.browser.close()
|
||||
# self.browser = None
|
||||
# utils.logger.info("[CDPBrowserManager] 浏览器连接已断开")
|
||||
# 断开浏览器连接
|
||||
if self.browser:
|
||||
try:
|
||||
await self.browser.close()
|
||||
utils.logger.info("[CDPBrowserManager] 浏览器连接已断开")
|
||||
except Exception as browser_error:
|
||||
utils.logger.warning(
|
||||
f"[CDPBrowserManager] 关闭浏览器连接失败: {browser_error}"
|
||||
)
|
||||
finally:
|
||||
self.browser = None
|
||||
|
||||
# 关闭浏览器进程(如果配置为自动关闭)
|
||||
if config.AUTO_CLOSE_BROWSER:
|
||||
|
||||
@@ -33,6 +33,12 @@ def get_current_time() -> str:
|
||||
"""
|
||||
return time.strftime('%Y-%m-%d %X', time.localtime())
|
||||
|
||||
def get_current_time_hour() -> str:
    """
    Return the current local time truncated to the hour, e.g. '2023-12-02-13'.

    :return: local date and hour formatted as 'YYYY-MM-DD-HH'
    """
    local_now = time.localtime()
    return time.strftime('%Y-%m-%d-%H', local_now)
|
||||
|
||||
def get_current_date() -> str:
|
||||
"""
|
||||
|
||||
@@ -26,6 +26,10 @@ def init_loging_config():
|
||||
)
|
||||
_logger = logging.getLogger("MediaCrawler")
|
||||
_logger.setLevel(level)
|
||||
|
||||
# 关闭 httpx 的 INFO 日志
|
||||
logging.getLogger("httpx").setLevel(logging.WARNING)
|
||||
|
||||
return _logger
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user