更新部分爬虫以兼容本地运行及数据库存储

This commit is contained in:
z66
2025-12-16 10:56:56 +08:00
parent a9eda60493
commit ff1ce2a3ba
28 changed files with 1394 additions and 126 deletions
@@ -89,6 +89,34 @@ class XhsDbStoreImplement(AbstractStore):
note_id = content_item.get("note_id")
if not note_id:
return
# 关键词过滤:仅在落库时进行,仅对主贴/视频过滤,不过滤评论
# 支持精确匹配和模糊匹配两种模式
try:
import sys
from pathlib import Path
# 添加项目根目录到路径,以便导入 MindSpider 的 config
project_root = Path(__file__).resolve().parents[4]
if str(project_root) not in sys.path:
sys.path.insert(0, str(project_root))
from config import settings
title = content_item.get("title", "")
desc = content_item.get("desc", "")
content_text = title + " " + desc
strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
# 如果配置了关键词,进行匹配检查
if strict_keywords or fuzzy_keywords:
if not utils.check_keyword_match_with_modes(content_text, strict_keywords, fuzzy_keywords):
utils.logger.warning(f"[XhsDbStoreImplement.store_content] ❌ Filtered note {note_id} - content does not match any keyword")
return
except Exception as e:
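# If loading the keyword config or running the keyword match raises, only a debug-level message is logged (not a warning) and the note is still saved (backward compatible)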
utils.logger.debug(f"[XhsDbStoreImplement.store_content] Failed to load keyword config: {e}")
async with get_session() as session:
if await self.content_is_exist(session, note_id):
await self.update_content(session, content_item)