更新部分爬虫以兼容本地运行及数据库存储
This commit is contained in:
@@ -93,7 +93,12 @@ async def update_weibo_note(note_item: Dict):
|
||||
"source_keyword": source_keyword_var.get(),
|
||||
}
|
||||
utils.logger.info(f"[store.weibo.update_weibo_note] weibo note id:{note_id}, title:{save_content_item.get('content')[:24]} ...")
|
||||
await WeibostoreFactory.create_store().store_content(content_item=save_content_item)
|
||||
try:
|
||||
await WeibostoreFactory.create_store().store_content(content_item=save_content_item)
|
||||
utils.logger.debug(f"[store.weibo.update_weibo_note] Successfully saved note {note_id}")
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[store.weibo.update_weibo_note] Failed to save note {note_id}: {e}", exc_info=True)
|
||||
raise
|
||||
|
||||
|
||||
async def batch_update_weibo_note_comments(note_id: str, comments: List[Dict]):
|
||||
@@ -148,7 +153,12 @@ async def update_weibo_note_comment(note_id: str, comment_item: Dict):
|
||||
"avatar": user_info.get("profile_image_url", ""),
|
||||
}
|
||||
utils.logger.info(f"[store.weibo.update_weibo_note_comment] Weibo note comment: {comment_id}, content: {save_comment_item.get('content', '')[:24]} ...")
|
||||
await WeibostoreFactory.create_store().store_comment(comment_item=save_comment_item)
|
||||
try:
|
||||
await WeibostoreFactory.create_store().store_comment(comment_item=save_comment_item)
|
||||
utils.logger.debug(f"[store.weibo.update_weibo_note_comment] Successfully saved comment {comment_id}")
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[store.weibo.update_weibo_note_comment] Failed to save comment {comment_id}: {e}", exc_info=True)
|
||||
raise
|
||||
|
||||
|
||||
async def update_weibo_note_image(picid: str, pic_content, extension_file_name):
|
||||
|
||||
@@ -21,7 +21,7 @@ import pathlib
|
||||
from typing import Dict
|
||||
|
||||
import aiofiles
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy import select, text
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
import config
|
||||
@@ -29,7 +29,7 @@ from base.base_crawler import AbstractStore
|
||||
from database.models import WeiboCreator, WeiboNote, WeiboNoteComment
|
||||
from tools import utils, words
|
||||
from tools.async_file_writer import AsyncFileWriter
|
||||
from database.db_session import get_session
|
||||
from database.db_session import get_session, get_async_engine
|
||||
from var import crawler_type_var
|
||||
|
||||
|
||||
@@ -88,6 +88,33 @@ class WeiboCsvStoreImplement(AbstractStore):
|
||||
|
||||
|
||||
class WeiboDbStoreImplement(AbstractStore):
|
||||
|
||||
def __init__(self, **kwargs):
    """Create the database-backed Weibo store.

    All keyword arguments are passed through unchanged to the parent
    class initializer; this class adds no state of its own here.
    """
    super().__init__(**kwargs)
|
||||
|
||||
async def _check_connection(self):
    """Verify that the configured database is reachable.

    A successful probe is remembered in the class attribute
    ``WeiboDbStoreImplement._global_connection_checked``, so later calls
    return ``True`` without querying the database again. Failed probes are
    not remembered and are attempted again on the next call.

    Returns:
        ``True`` if an earlier probe succeeded or ``SELECT 1`` executes now;
        ``False`` if ``get_async_engine`` returns ``None`` or any exception
        is raised while probing (the error is logged, not propagated).
    """
    store_cls = WeiboDbStoreImplement

    # Create the class-level flag lazily the first time any instance asks.
    if not hasattr(store_cls, '_global_connection_checked'):
        store_cls._global_connection_checked = False

    # A previous successful probe short-circuits all further checks.
    if store_cls._global_connection_checked:
        return True

    try:
        engine = get_async_engine(config.SAVE_DATA_OPTION)
        if engine is not None:
            # Cheapest possible round trip to prove the connection works.
            async with engine.connect() as connection:
                await connection.execute(text("SELECT 1"))
            store_cls._global_connection_checked = True
            utils.logger.info("[WeiboDbStoreImplement._check_connection] Database connection verified")
            return True
        utils.logger.error(f"[WeiboDbStoreImplement._check_connection] Engine is None for SAVE_DATA_OPTION={config.SAVE_DATA_OPTION}")
    except Exception as exc:
        utils.logger.error(f"[WeiboDbStoreImplement._check_connection] Database connection failed: {exc}", exc_info=True)
    return False
|
||||
|
||||
async def store_content(self, content_item: Dict):
|
||||
"""
|
||||
@@ -99,21 +126,62 @@ class WeiboDbStoreImplement(AbstractStore):
|
||||
|
||||
"""
|
||||
note_id = content_item.get("note_id")
|
||||
async with get_session() as session:
|
||||
stmt = select(WeiboNote).where(WeiboNote.note_id == note_id)
|
||||
res = await session.execute(stmt)
|
||||
db_note = res.scalar_one_or_none()
|
||||
if db_note:
|
||||
db_note.last_modify_ts = utils.get_current_timestamp()
|
||||
for key, value in content_item.items():
|
||||
if hasattr(db_note, key):
|
||||
setattr(db_note, key, value)
|
||||
else:
|
||||
content_item["add_ts"] = utils.get_current_timestamp()
|
||||
content_item["last_modify_ts"] = utils.get_current_timestamp()
|
||||
db_note = WeiboNote(**content_item)
|
||||
session.add(db_note)
|
||||
await session.commit()
|
||||
if not note_id:
|
||||
utils.logger.error(f"[WeiboDbStoreImplement.store_content] note_id is missing in content_item: {content_item}")
|
||||
return
|
||||
|
||||
# 关键词过滤:仅在落库时进行,仅对主贴/视频过滤,不过滤评论
|
||||
# 支持精确匹配和模糊匹配两种模式
|
||||
try:
|
||||
import sys
|
||||
from pathlib import Path
|
||||
project_root = Path(__file__).resolve().parents[4]
|
||||
if str(project_root) not in sys.path:
|
||||
sys.path.insert(0, str(project_root))
|
||||
from config import settings
|
||||
|
||||
content_text = content_item.get("content", "")
|
||||
strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
|
||||
fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
|
||||
|
||||
if strict_keywords or fuzzy_keywords:
|
||||
if not utils.check_keyword_match_with_modes(content_text, strict_keywords, fuzzy_keywords):
|
||||
utils.logger.warning(f"[WeiboDbStoreImplement.store_content] ❌ Filtered note {note_id} - content does not match any keyword")
|
||||
return
|
||||
except Exception as e:
|
||||
utils.logger.debug(f"[WeiboDbStoreImplement.store_content] Failed to load keyword config: {e}")
|
||||
|
||||
# 检查数据库连接
|
||||
if not await self._check_connection():
|
||||
utils.logger.error(f"[WeiboDbStoreImplement.store_content] Database connection check failed, skipping save for note {note_id}")
|
||||
return
|
||||
|
||||
try:
|
||||
async with get_session() as session:
|
||||
if session is None:
|
||||
utils.logger.error(f"[WeiboDbStoreImplement.store_content] Database session is None, check SAVE_DATA_OPTION config")
|
||||
return
|
||||
|
||||
stmt = select(WeiboNote).where(WeiboNote.note_id == note_id)
|
||||
res = await session.execute(stmt)
|
||||
db_note = res.scalar_one_or_none()
|
||||
if db_note:
|
||||
utils.logger.debug(f"[WeiboDbStoreImplement.store_content] Updating existing note {note_id}")
|
||||
db_note.last_modify_ts = utils.get_current_timestamp()
|
||||
for key, value in content_item.items():
|
||||
if hasattr(db_note, key):
|
||||
setattr(db_note, key, value)
|
||||
else:
|
||||
utils.logger.debug(f"[WeiboDbStoreImplement.store_content] Creating new note {note_id}")
|
||||
content_item["add_ts"] = utils.get_current_timestamp()
|
||||
content_item["last_modify_ts"] = utils.get_current_timestamp()
|
||||
db_note = WeiboNote(**content_item)
|
||||
session.add(db_note)
|
||||
await session.commit()
|
||||
utils.logger.debug(f"[WeiboDbStoreImplement.store_content] Successfully committed note {note_id} to database")
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[WeiboDbStoreImplement.store_content] Database error saving note {note_id}: {e}", exc_info=True)
|
||||
raise
|
||||
|
||||
async def store_comment(self, comment_item: Dict):
|
||||
"""
|
||||
@@ -125,21 +193,36 @@ class WeiboDbStoreImplement(AbstractStore):
|
||||
|
||||
"""
|
||||
comment_id = comment_item.get("comment_id")
|
||||
async with get_session() as session:
|
||||
stmt = select(WeiboNoteComment).where(WeiboNoteComment.comment_id == comment_id)
|
||||
res = await session.execute(stmt)
|
||||
db_comment = res.scalar_one_or_none()
|
||||
if db_comment:
|
||||
db_comment.last_modify_ts = utils.get_current_timestamp()
|
||||
for key, value in comment_item.items():
|
||||
if hasattr(db_comment, key):
|
||||
setattr(db_comment, key, value)
|
||||
else:
|
||||
comment_item["add_ts"] = utils.get_current_timestamp()
|
||||
comment_item["last_modify_ts"] = utils.get_current_timestamp()
|
||||
db_comment = WeiboNoteComment(**comment_item)
|
||||
session.add(db_comment)
|
||||
await session.commit()
|
||||
if not comment_id:
|
||||
utils.logger.error(f"[WeiboDbStoreImplement.store_comment] comment_id is missing in comment_item: {comment_item}")
|
||||
return
|
||||
|
||||
try:
|
||||
async with get_session() as session:
|
||||
if session is None:
|
||||
utils.logger.error(f"[WeiboDbStoreImplement.store_comment] Database session is None, check SAVE_DATA_OPTION config")
|
||||
return
|
||||
|
||||
stmt = select(WeiboNoteComment).where(WeiboNoteComment.comment_id == comment_id)
|
||||
res = await session.execute(stmt)
|
||||
db_comment = res.scalar_one_or_none()
|
||||
if db_comment:
|
||||
utils.logger.debug(f"[WeiboDbStoreImplement.store_comment] Updating existing comment {comment_id}")
|
||||
db_comment.last_modify_ts = utils.get_current_timestamp()
|
||||
for key, value in comment_item.items():
|
||||
if hasattr(db_comment, key):
|
||||
setattr(db_comment, key, value)
|
||||
else:
|
||||
utils.logger.debug(f"[WeiboDbStoreImplement.store_comment] Creating new comment {comment_id}")
|
||||
comment_item["add_ts"] = utils.get_current_timestamp()
|
||||
comment_item["last_modify_ts"] = utils.get_current_timestamp()
|
||||
db_comment = WeiboNoteComment(**comment_item)
|
||||
session.add(db_comment)
|
||||
await session.commit()
|
||||
utils.logger.debug(f"[WeiboDbStoreImplement.store_comment] Successfully committed comment {comment_id} to database")
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[WeiboDbStoreImplement.store_comment] Database error saving comment {comment_id}: {e}", exc_info=True)
|
||||
raise
|
||||
|
||||
async def store_creator(self, creator: Dict):
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user