更新部分爬虫以兼容本地运行及数据库存储
This commit is contained in:
@@ -11,6 +11,11 @@
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from logging.handlers import RotatingFileHandler
|
||||
from pathlib import Path
|
||||
|
||||
from .crawler_util import *
|
||||
from .slider_util import *
|
||||
@@ -19,17 +24,80 @@ from .time_util import *
|
||||
|
||||
def _find_project_root(start: Path) -> Path:
    """Return the directory under which the ``logs`` folder should live.

    Walks the ancestors of *start* and returns the first one that already
    contains a ``logs`` entry or is named ``BettaFish-1.2.0``.  If no ancestor
    qualifies, falls back to the current working directory; when that
    directory is named ``MediaCrawler`` its grandparent is used instead.
    """
    for parent in start.parents:
        if (parent / "logs").exists() or parent.name == "BettaFish-1.2.0":
            return parent

    root = Path.cwd()
    if root.name == "MediaCrawler":
        root = root.parent.parent
    return root


def init_loging_config():
    """Configure root logging and return the ``MediaCrawler`` logger.

    The root logger is reset to INFO with exactly one console handler
    (writing to ``sys.stdout``) and, when possible, one rotating file handler
    writing to ``<project_root>/logs/mediacrawler.log`` (10 MB per file,
    5 backups, UTF-8).  The ``httpx`` logger is raised to WARNING.

    File logging is best-effort: if the log directory or file cannot be
    created, a warning is printed and only console logging is installed.

    Returns:
        logging.Logger: the logger named ``"MediaCrawler"``, set to INFO.
    """
    level = logging.INFO
    log_format = "%(asctime)s %(name)s %(levelname)s (%(filename)s:%(lineno)d) - %(message)s"
    date_format = '%Y-%m-%d %H:%M:%S'
    formatter = logging.Formatter(log_format, datefmt=date_format)

    project_root = _find_project_root(Path(__file__).resolve())
    log_file = project_root / "logs" / "mediacrawler.log"

    root_logger = logging.getLogger()
    root_logger.setLevel(level)

    # Remove AND close previously installed handlers: clearing the list alone
    # would leave earlier log files open every time this function is re-run.
    for stale_handler in list(root_logger.handlers):
        root_logger.removeHandler(stale_handler)
        stale_handler.close()

    # Console handler bound explicitly to stdout (the logging default is stderr).
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(level)
    console_handler.setFormatter(formatter)
    root_logger.addHandler(console_handler)

    # Push out anything already buffered before log records start appearing.
    sys.stdout.flush()
    sys.stderr.flush()

    # Directory creation lives inside the try block so that an unwritable
    # location degrades to console-only logging instead of raising.
    file_logging_enabled = False
    try:
        log_file.parent.mkdir(parents=True, exist_ok=True)
        file_handler = RotatingFileHandler(
            log_file,
            maxBytes=10 * 1024 * 1024,  # 10MB
            backupCount=5,
            encoding='utf-8',
        )
    except OSError as e:
        print(f"警告: 无法初始化文件日志: {e}")
    else:
        file_handler.setLevel(level)
        file_handler.setFormatter(formatter)
        root_logger.addHandler(file_handler)
        file_logging_enabled = True

    _logger = logging.getLogger("MediaCrawler")
    _logger.setLevel(level)

    # Silence httpx's per-request INFO records.
    logging.getLogger("httpx").setLevel(logging.WARNING)

    # Only announce the log file when it is actually being written to.
    if file_logging_enabled:
        _logger.info(f"日志文件: {log_file}")

    return _logger
|
||||
|
||||
|
||||
@@ -44,3 +112,101 @@ def str2bool(v):
|
||||
return False
|
||||
else:
|
||||
raise argparse.ArgumentTypeError('Boolean value expected.')
|
||||
|
||||
|
||||
def check_keyword_match_strict(content: str, keyword: str) -> bool:
    """Report whether *content* contains any of the given keywords.

    HTML-like tags (``<...>``) are stripped from *content* first, and the
    comparison is a case-insensitive substring test.

    Args:
        content: Text to search.
        keyword: One keyword, or several separated by commas.  Surrounding
            whitespace on each keyword is ignored and empty entries are
            skipped.

    Returns:
        bool: True if at least one keyword occurs in the cleaned content,
        False otherwise (including when either argument is empty).
    """
    if not (content and keyword):
        return False

    # Drop markup, then normalise case once for all comparisons.
    haystack = re.sub(r"<.*?>", "", content).lower()

    terms = (part.strip().lower() for part in keyword.split(","))
    return any(term in haystack for term in terms if term)
|
||||
|
||||
|
||||
def check_keyword_match_fuzzy(content: str, keyword: str) -> bool:
    """Report whether *content* loosely matches any of the given keywords.

    HTML-like tags (``<...>``) are stripped from *content* and everything is
    compared case-insensitively.  A keyword matches when any of these hold:

    * it occurs verbatim in the cleaned content;
    * it has at least 3 characters and occurs once spaces are removed from
      both the keyword and the content;
    * it has at least 4 characters and its first half (``len // 2``
      characters) occurs in the cleaned content.

    Args:
        content: Text to search.
        keyword: One keyword, or several separated by commas.  Surrounding
            whitespace on each keyword is ignored and empty entries are
            skipped.

    Returns:
        bool: True if at least one keyword matches under the rules above,
        False otherwise (including when either argument is empty).
    """
    if not (content and keyword):
        return False

    haystack = re.sub(r"<.*?>", "", content).lower()
    # Space-free variant of the content, shared by every keyword.
    haystack_compact = haystack.replace(" ", "")

    def _matches(term: str) -> bool:
        # Plain substring hit.
        if term in haystack:
            return True
        # Very short keywords get no fuzzy treatment.
        if len(term) < 3:
            return False
        # Ignore spaces on both sides.
        if term.replace(" ", "") in haystack_compact:
            return True
        # Prefix heuristic: the first half must be at least 2 characters.
        return len(term) >= 4 and term[:len(term) // 2] in haystack

    terms = (part.strip().lower() for part in keyword.split(","))
    return any(_matches(term) for term in terms if term)
|
||||
|
||||
|
||||
def check_keyword_match_with_modes(content: str, strict_keywords: str = None, fuzzy_keywords: str = None) -> bool:
    """Check *content* against a strict and a fuzzy keyword list.

    The strict list is tried first via ``check_keyword_match_strict``; only
    if it does not match is the fuzzy list tried via
    ``check_keyword_match_fuzzy``.  A list that is ``None`` or empty is
    skipped without calling its matcher.

    Args:
        content: Text to search.
        strict_keywords: Comma-separated keywords for strict matching.
        fuzzy_keywords: Comma-separated keywords for fuzzy matching.

    Returns:
        bool: True if either matcher reports a match, False otherwise.
    """
    # Strict keywords take precedence and short-circuit the fuzzy check.
    if strict_keywords and check_keyword_match_strict(content, strict_keywords):
        return True

    return bool(fuzzy_keywords and check_keyword_match_fuzzy(content, fuzzy_keywords))
|
||||
|
||||
Reference in New Issue
Block a user