更新部分爬虫以兼容本地运行及数据库存储

This commit is contained in:
z66
2025-12-16 10:56:56 +08:00
parent a9eda60493
commit ff1ce2a3ba
28 changed files with 1394 additions and 126 deletions
@@ -93,7 +93,12 @@ async def update_weibo_note(note_item: Dict):
"source_keyword": source_keyword_var.get(),
}
utils.logger.info(f"[store.weibo.update_weibo_note] weibo note id:{note_id}, title:{save_content_item.get('content')[:24]} ...")
await WeibostoreFactory.create_store().store_content(content_item=save_content_item)
try:
await WeibostoreFactory.create_store().store_content(content_item=save_content_item)
utils.logger.debug(f"[store.weibo.update_weibo_note] Successfully saved note {note_id}")
except Exception as e:
utils.logger.error(f"[store.weibo.update_weibo_note] Failed to save note {note_id}: {e}", exc_info=True)
raise
async def batch_update_weibo_note_comments(note_id: str, comments: List[Dict]):
@@ -148,7 +153,12 @@ async def update_weibo_note_comment(note_id: str, comment_item: Dict):
"avatar": user_info.get("profile_image_url", ""),
}
utils.logger.info(f"[store.weibo.update_weibo_note_comment] Weibo note comment: {comment_id}, content: {save_comment_item.get('content', '')[:24]} ...")
await WeibostoreFactory.create_store().store_comment(comment_item=save_comment_item)
try:
await WeibostoreFactory.create_store().store_comment(comment_item=save_comment_item)
utils.logger.debug(f"[store.weibo.update_weibo_note_comment] Successfully saved comment {comment_id}")
except Exception as e:
utils.logger.error(f"[store.weibo.update_weibo_note_comment] Failed to save comment {comment_id}: {e}", exc_info=True)
raise
async def update_weibo_note_image(picid: str, pic_content, extension_file_name):
@@ -21,7 +21,7 @@ import pathlib
from typing import Dict
import aiofiles
from sqlalchemy import select
from sqlalchemy import select, text
from sqlalchemy.ext.asyncio import AsyncSession
import config
@@ -29,7 +29,7 @@ from base.base_crawler import AbstractStore
from database.models import WeiboCreator, WeiboNote, WeiboNoteComment
from tools import utils, words
from tools.async_file_writer import AsyncFileWriter
from database.db_session import get_session
from database.db_session import get_session, get_async_engine
from var import crawler_type_var
@@ -88,6 +88,33 @@ class WeiboCsvStoreImplement(AbstractStore):
class WeiboDbStoreImplement(AbstractStore):
def __init__(self, **kwargs):
super().__init__(**kwargs)
async def _check_connection(self):
    """Check that the configured database can be reached.

    The outcome of a successful probe is stored on the class, so once any
    call has succeeded every later call returns ``True`` without opening a
    connection. Failures are not stored; the probe runs again on the next
    call.

    Returns:
        ``True`` if a probe has already succeeded or ``SELECT 1`` executes
        now; ``False`` if no engine is returned or the probe raises.
    """
    store_cls = WeiboDbStoreImplement

    # Create the class-level flag on first use so the lookup below is safe.
    if not hasattr(store_cls, '_global_connection_checked'):
        store_cls._global_connection_checked = False

    # Fast path: a previous probe already succeeded.
    if store_cls._global_connection_checked:
        return True

    try:
        db_engine = get_async_engine(config.SAVE_DATA_OPTION)
        if db_engine is None:
            utils.logger.error(f"[WeiboDbStoreImplement._check_connection] Engine is None for SAVE_DATA_OPTION={config.SAVE_DATA_OPTION}")
            return False

        # A trivial statement is enough to prove the connection works.
        async with db_engine.connect() as connection:
            await connection.execute(text("SELECT 1"))

        store_cls._global_connection_checked = True
        utils.logger.info(f"[WeiboDbStoreImplement._check_connection] Database connection verified")
        return True
    except Exception as e:
        # Any failure is reported to the caller as "not connected" rather than raised.
        utils.logger.error(f"[WeiboDbStoreImplement._check_connection] Database connection failed: {e}", exc_info=True)
        return False
async def store_content(self, content_item: Dict):
"""
@@ -99,21 +126,62 @@ class WeiboDbStoreImplement(AbstractStore):
"""
note_id = content_item.get("note_id")
async with get_session() as session:
stmt = select(WeiboNote).where(WeiboNote.note_id == note_id)
res = await session.execute(stmt)
db_note = res.scalar_one_or_none()
if db_note:
db_note.last_modify_ts = utils.get_current_timestamp()
for key, value in content_item.items():
if hasattr(db_note, key):
setattr(db_note, key, value)
else:
content_item["add_ts"] = utils.get_current_timestamp()
content_item["last_modify_ts"] = utils.get_current_timestamp()
db_note = WeiboNote(**content_item)
session.add(db_note)
await session.commit()
if not note_id:
utils.logger.error(f"[WeiboDbStoreImplement.store_content] note_id is missing in content_item: {content_item}")
return
# 关键词过滤:仅在落库时进行,仅对主贴/视频过滤,不过滤评论
# 支持精确匹配和模糊匹配两种模式
try:
import sys
from pathlib import Path
project_root = Path(__file__).resolve().parents[4]
if str(project_root) not in sys.path:
sys.path.insert(0, str(project_root))
from config import settings
content_text = content_item.get("content", "")
strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
if strict_keywords or fuzzy_keywords:
if not utils.check_keyword_match_with_modes(content_text, strict_keywords, fuzzy_keywords):
utils.logger.warning(f"[WeiboDbStoreImplement.store_content] ❌ Filtered note {note_id} - content does not match any keyword")
return
except Exception as e:
utils.logger.debug(f"[WeiboDbStoreImplement.store_content] Failed to load keyword config: {e}")
# 检查数据库连接
if not await self._check_connection():
utils.logger.error(f"[WeiboDbStoreImplement.store_content] Database connection check failed, skipping save for note {note_id}")
return
try:
async with get_session() as session:
if session is None:
utils.logger.error(f"[WeiboDbStoreImplement.store_content] Database session is None, check SAVE_DATA_OPTION config")
return
stmt = select(WeiboNote).where(WeiboNote.note_id == note_id)
res = await session.execute(stmt)
db_note = res.scalar_one_or_none()
if db_note:
utils.logger.debug(f"[WeiboDbStoreImplement.store_content] Updating existing note {note_id}")
db_note.last_modify_ts = utils.get_current_timestamp()
for key, value in content_item.items():
if hasattr(db_note, key):
setattr(db_note, key, value)
else:
utils.logger.debug(f"[WeiboDbStoreImplement.store_content] Creating new note {note_id}")
content_item["add_ts"] = utils.get_current_timestamp()
content_item["last_modify_ts"] = utils.get_current_timestamp()
db_note = WeiboNote(**content_item)
session.add(db_note)
await session.commit()
utils.logger.debug(f"[WeiboDbStoreImplement.store_content] Successfully committed note {note_id} to database")
except Exception as e:
utils.logger.error(f"[WeiboDbStoreImplement.store_content] Database error saving note {note_id}: {e}", exc_info=True)
raise
async def store_comment(self, comment_item: Dict):
"""
@@ -125,21 +193,36 @@ class WeiboDbStoreImplement(AbstractStore):
"""
comment_id = comment_item.get("comment_id")
async with get_session() as session:
stmt = select(WeiboNoteComment).where(WeiboNoteComment.comment_id == comment_id)
res = await session.execute(stmt)
db_comment = res.scalar_one_or_none()
if db_comment:
db_comment.last_modify_ts = utils.get_current_timestamp()
for key, value in comment_item.items():
if hasattr(db_comment, key):
setattr(db_comment, key, value)
else:
comment_item["add_ts"] = utils.get_current_timestamp()
comment_item["last_modify_ts"] = utils.get_current_timestamp()
db_comment = WeiboNoteComment(**comment_item)
session.add(db_comment)
await session.commit()
if not comment_id:
utils.logger.error(f"[WeiboDbStoreImplement.store_comment] comment_id is missing in comment_item: {comment_item}")
return
try:
async with get_session() as session:
if session is None:
utils.logger.error(f"[WeiboDbStoreImplement.store_comment] Database session is None, check SAVE_DATA_OPTION config")
return
stmt = select(WeiboNoteComment).where(WeiboNoteComment.comment_id == comment_id)
res = await session.execute(stmt)
db_comment = res.scalar_one_or_none()
if db_comment:
utils.logger.debug(f"[WeiboDbStoreImplement.store_comment] Updating existing comment {comment_id}")
db_comment.last_modify_ts = utils.get_current_timestamp()
for key, value in comment_item.items():
if hasattr(db_comment, key):
setattr(db_comment, key, value)
else:
utils.logger.debug(f"[WeiboDbStoreImplement.store_comment] Creating new comment {comment_id}")
comment_item["add_ts"] = utils.get_current_timestamp()
comment_item["last_modify_ts"] = utils.get_current_timestamp()
db_comment = WeiboNoteComment(**comment_item)
session.add(db_comment)
await session.commit()
utils.logger.debug(f"[WeiboDbStoreImplement.store_comment] Successfully committed comment {comment_id} to database")
except Exception as e:
utils.logger.error(f"[WeiboDbStoreImplement.store_comment] Database error saving comment {comment_id}: {e}", exc_info=True)
raise
async def store_creator(self, creator: Dict):
"""