更新部分爬虫以兼容本地运行及数据库存储

This commit is contained in:
z66
2025-12-16 10:56:56 +08:00
parent a9eda60493
commit ff1ce2a3ba
28 changed files with 1394 additions and 126 deletions
@@ -119,6 +119,32 @@ class BiliDbStoreImplement(AbstractStore):
content_item: content item dict
"""
video_id = content_item.get("video_id")
if not video_id:
return
# 关键词过滤:仅在落库时进行,仅对主贴/视频过滤,不过滤评论
# 支持精确匹配和模糊匹配两种模式
try:
import sys
from pathlib import Path
project_root = Path(__file__).resolve().parents[4]
if str(project_root) not in sys.path:
sys.path.insert(0, str(project_root))
from config import settings
title = content_item.get("title", "")
desc = content_item.get("desc", "")
content_text = title + " " + desc
strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
if strict_keywords or fuzzy_keywords:
if not utils.check_keyword_match_with_modes(content_text, strict_keywords, fuzzy_keywords):
utils.logger.warning(f"[BilibiliDbStoreImplement.store_content] ❌ Filtered video {video_id} - content does not match any keyword")
return
except Exception as e:
utils.logger.debug(f"[BilibiliDbStoreImplement.store_content] Failed to load keyword config: {e}")
# 确保 video_id 为整数类型,匹配数据库 BigInteger 字段
if video_id is not None:
video_id = int(video_id) if not isinstance(video_id, int) else video_id
@@ -88,6 +88,30 @@ class DouyinDbStoreImplement(AbstractStore):
content_item: content item dict
"""
aweme_id = content_item.get("aweme_id")
if not aweme_id:
return
# 关键词过滤:仅在落库时进行,仅对主贴/视频过滤,不过滤评论
# 支持精确匹配和模糊匹配两种模式
try:
import sys
from pathlib import Path
project_root = Path(__file__).resolve().parents[4]
if str(project_root) not in sys.path:
sys.path.insert(0, str(project_root))
from config import settings
desc = content_item.get("desc", "")
strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
if strict_keywords or fuzzy_keywords:
if not utils.check_keyword_match_with_modes(desc, strict_keywords, fuzzy_keywords):
utils.logger.warning(f"[DouyinDbStoreImplement.store_content] ❌ Filtered aweme {aweme_id} - content does not match any keyword")
return
except Exception as e:
utils.logger.debug(f"[DouyinDbStoreImplement.store_content] Failed to load keyword config: {e}")
async with get_session() as session:
result = await session.execute(select(DouyinAweme).where(DouyinAweme.aweme_id == aweme_id))
aweme_detail = result.scalar_one_or_none()
@@ -89,6 +89,30 @@ class KuaishouDbStoreImplement(AbstractStore):
content_item: content item dict
"""
video_id = content_item.get("video_id")
if not video_id:
return
# 关键词过滤:仅在落库时进行,仅对主贴/视频过滤,不过滤评论
# 支持精确匹配和模糊匹配两种模式
try:
import sys
from pathlib import Path
project_root = Path(__file__).resolve().parents[4]
if str(project_root) not in sys.path:
sys.path.insert(0, str(project_root))
from config import settings
caption = content_item.get("caption", "")
strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
if strict_keywords or fuzzy_keywords:
if not utils.check_keyword_match_with_modes(caption, strict_keywords, fuzzy_keywords):
utils.logger.warning(f"[KuaishouDbStoreImplement.store_content] ❌ Filtered video {video_id} - content does not match any keyword")
return
except Exception as e:
utils.logger.debug(f"[KuaishouDbStoreImplement.store_content] Failed to load keyword config: {e}")
async with get_session() as session:
result = await session.execute(select(KuaishouVideo).where(KuaishouVideo.video_id == video_id))
video_detail = result.scalar_one_or_none()
@@ -95,6 +95,32 @@ class TieBaDbStoreImplement(AbstractStore):
content_item: content item dict
"""
note_id = content_item.get("note_id")
if not note_id:
return
# 关键词过滤:仅在落库时进行,仅对主贴/视频过滤,不过滤评论
# 支持精确匹配和模糊匹配两种模式
try:
import sys
from pathlib import Path
project_root = Path(__file__).resolve().parents[4]
if str(project_root) not in sys.path:
sys.path.insert(0, str(project_root))
from config import settings
title = content_item.get("title", "")
text = content_item.get("text", "")
content_text = title + " " + text
strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
if strict_keywords or fuzzy_keywords:
if not utils.check_keyword_match_with_modes(content_text, strict_keywords, fuzzy_keywords):
utils.logger.warning(f"[TiebaDbStoreImplement.store_content] ❌ Filtered note {note_id} - content does not match any keyword")
return
except Exception as e:
utils.logger.debug(f"[TiebaDbStoreImplement.store_content] Failed to load keyword config: {e}")
async with get_session() as session:
stmt = select(TiebaNote).where(TiebaNote.note_id == note_id)
res = await session.execute(stmt)
@@ -93,7 +93,12 @@ async def update_weibo_note(note_item: Dict):
"source_keyword": source_keyword_var.get(),
}
utils.logger.info(f"[store.weibo.update_weibo_note] weibo note id:{note_id}, title:{save_content_item.get('content')[:24]} ...")
await WeibostoreFactory.create_store().store_content(content_item=save_content_item)
try:
await WeibostoreFactory.create_store().store_content(content_item=save_content_item)
utils.logger.debug(f"[store.weibo.update_weibo_note] Successfully saved note {note_id}")
except Exception as e:
utils.logger.error(f"[store.weibo.update_weibo_note] Failed to save note {note_id}: {e}", exc_info=True)
raise
async def batch_update_weibo_note_comments(note_id: str, comments: List[Dict]):
@@ -148,7 +153,12 @@ async def update_weibo_note_comment(note_id: str, comment_item: Dict):
"avatar": user_info.get("profile_image_url", ""),
}
utils.logger.info(f"[store.weibo.update_weibo_note_comment] Weibo note comment: {comment_id}, content: {save_comment_item.get('content', '')[:24]} ...")
await WeibostoreFactory.create_store().store_comment(comment_item=save_comment_item)
try:
await WeibostoreFactory.create_store().store_comment(comment_item=save_comment_item)
utils.logger.debug(f"[store.weibo.update_weibo_note_comment] Successfully saved comment {comment_id}")
except Exception as e:
utils.logger.error(f"[store.weibo.update_weibo_note_comment] Failed to save comment {comment_id}: {e}", exc_info=True)
raise
async def update_weibo_note_image(picid: str, pic_content, extension_file_name):
@@ -21,7 +21,7 @@ import pathlib
from typing import Dict
import aiofiles
from sqlalchemy import select
from sqlalchemy import select, text
from sqlalchemy.ext.asyncio import AsyncSession
import config
@@ -29,7 +29,7 @@ from base.base_crawler import AbstractStore
from database.models import WeiboCreator, WeiboNote, WeiboNoteComment
from tools import utils, words
from tools.async_file_writer import AsyncFileWriter
from database.db_session import get_session
from database.db_session import get_session, get_async_engine
from var import crawler_type_var
@@ -88,6 +88,33 @@ class WeiboCsvStoreImplement(AbstractStore):
class WeiboDbStoreImplement(AbstractStore):
def __init__(self, **kwargs):
    # Forward every keyword argument unchanged to the base-class initializer.
    super().__init__(**kwargs)
async def _check_connection(self):
    """Probe the database once and remember a successful result on the class.

    A successful probe sets the class attribute
    ``WeiboDbStoreImplement._global_connection_checked`` to True, so later
    calls return immediately without running the probe again. A failed probe
    leaves the flag False and is therefore retried on the next call.

    Returns:
        bool: True when an earlier probe already succeeded or ``SELECT 1``
        executed successfully now; False when no engine is available or any
        exception is raised while probing (the exception is logged, not
        propagated).
    """
    store_cls = WeiboDbStoreImplement

    # Create the class-level cache flag lazily the first time it is needed.
    if not hasattr(store_cls, '_global_connection_checked'):
        store_cls._global_connection_checked = False

    already_verified = store_cls._global_connection_checked
    if already_verified:
        return True

    try:
        engine = get_async_engine(config.SAVE_DATA_OPTION)
        if engine is not None:
            # Minimal round trip to confirm the connection actually works.
            async with engine.connect() as connection:
                await connection.execute(text("SELECT 1"))
            store_cls._global_connection_checked = True
            utils.logger.info(f"[WeiboDbStoreImplement._check_connection] Database connection verified")
            return True

        utils.logger.error(f"[WeiboDbStoreImplement._check_connection] Engine is None for SAVE_DATA_OPTION={config.SAVE_DATA_OPTION}")
        return False
    except Exception as exc:
        utils.logger.error(f"[WeiboDbStoreImplement._check_connection] Database connection failed: {exc}", exc_info=True)
        return False
async def store_content(self, content_item: Dict):
"""
@@ -99,21 +126,62 @@ class WeiboDbStoreImplement(AbstractStore):
"""
note_id = content_item.get("note_id")
async with get_session() as session:
stmt = select(WeiboNote).where(WeiboNote.note_id == note_id)
res = await session.execute(stmt)
db_note = res.scalar_one_or_none()
if db_note:
db_note.last_modify_ts = utils.get_current_timestamp()
for key, value in content_item.items():
if hasattr(db_note, key):
setattr(db_note, key, value)
else:
content_item["add_ts"] = utils.get_current_timestamp()
content_item["last_modify_ts"] = utils.get_current_timestamp()
db_note = WeiboNote(**content_item)
session.add(db_note)
await session.commit()
if not note_id:
utils.logger.error(f"[WeiboDbStoreImplement.store_content] note_id is missing in content_item: {content_item}")
return
# 关键词过滤:仅在落库时进行,仅对主贴/视频过滤,不过滤评论
# 支持精确匹配和模糊匹配两种模式
try:
import sys
from pathlib import Path
project_root = Path(__file__).resolve().parents[4]
if str(project_root) not in sys.path:
sys.path.insert(0, str(project_root))
from config import settings
content_text = content_item.get("content", "")
strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
if strict_keywords or fuzzy_keywords:
if not utils.check_keyword_match_with_modes(content_text, strict_keywords, fuzzy_keywords):
utils.logger.warning(f"[WeiboDbStoreImplement.store_content] ❌ Filtered note {note_id} - content does not match any keyword")
return
except Exception as e:
utils.logger.debug(f"[WeiboDbStoreImplement.store_content] Failed to load keyword config: {e}")
# 检查数据库连接
if not await self._check_connection():
utils.logger.error(f"[WeiboDbStoreImplement.store_content] Database connection check failed, skipping save for note {note_id}")
return
try:
async with get_session() as session:
if session is None:
utils.logger.error(f"[WeiboDbStoreImplement.store_content] Database session is None, check SAVE_DATA_OPTION config")
return
stmt = select(WeiboNote).where(WeiboNote.note_id == note_id)
res = await session.execute(stmt)
db_note = res.scalar_one_or_none()
if db_note:
utils.logger.debug(f"[WeiboDbStoreImplement.store_content] Updating existing note {note_id}")
db_note.last_modify_ts = utils.get_current_timestamp()
for key, value in content_item.items():
if hasattr(db_note, key):
setattr(db_note, key, value)
else:
utils.logger.debug(f"[WeiboDbStoreImplement.store_content] Creating new note {note_id}")
content_item["add_ts"] = utils.get_current_timestamp()
content_item["last_modify_ts"] = utils.get_current_timestamp()
db_note = WeiboNote(**content_item)
session.add(db_note)
await session.commit()
utils.logger.debug(f"[WeiboDbStoreImplement.store_content] Successfully committed note {note_id} to database")
except Exception as e:
utils.logger.error(f"[WeiboDbStoreImplement.store_content] Database error saving note {note_id}: {e}", exc_info=True)
raise
async def store_comment(self, comment_item: Dict):
"""
@@ -125,21 +193,36 @@ class WeiboDbStoreImplement(AbstractStore):
"""
comment_id = comment_item.get("comment_id")
async with get_session() as session:
stmt = select(WeiboNoteComment).where(WeiboNoteComment.comment_id == comment_id)
res = await session.execute(stmt)
db_comment = res.scalar_one_or_none()
if db_comment:
db_comment.last_modify_ts = utils.get_current_timestamp()
for key, value in comment_item.items():
if hasattr(db_comment, key):
setattr(db_comment, key, value)
else:
comment_item["add_ts"] = utils.get_current_timestamp()
comment_item["last_modify_ts"] = utils.get_current_timestamp()
db_comment = WeiboNoteComment(**comment_item)
session.add(db_comment)
await session.commit()
if not comment_id:
utils.logger.error(f"[WeiboDbStoreImplement.store_comment] comment_id is missing in comment_item: {comment_item}")
return
try:
async with get_session() as session:
if session is None:
utils.logger.error(f"[WeiboDbStoreImplement.store_comment] Database session is None, check SAVE_DATA_OPTION config")
return
stmt = select(WeiboNoteComment).where(WeiboNoteComment.comment_id == comment_id)
res = await session.execute(stmt)
db_comment = res.scalar_one_or_none()
if db_comment:
utils.logger.debug(f"[WeiboDbStoreImplement.store_comment] Updating existing comment {comment_id}")
db_comment.last_modify_ts = utils.get_current_timestamp()
for key, value in comment_item.items():
if hasattr(db_comment, key):
setattr(db_comment, key, value)
else:
utils.logger.debug(f"[WeiboDbStoreImplement.store_comment] Creating new comment {comment_id}")
comment_item["add_ts"] = utils.get_current_timestamp()
comment_item["last_modify_ts"] = utils.get_current_timestamp()
db_comment = WeiboNoteComment(**comment_item)
session.add(db_comment)
await session.commit()
utils.logger.debug(f"[WeiboDbStoreImplement.store_comment] Successfully committed comment {comment_id} to database")
except Exception as e:
utils.logger.error(f"[WeiboDbStoreImplement.store_comment] Database error saving comment {comment_id}: {e}", exc_info=True)
raise
async def store_creator(self, creator: Dict):
"""
@@ -89,6 +89,34 @@ class XhsDbStoreImplement(AbstractStore):
note_id = content_item.get("note_id")
if not note_id:
return
# 关键词过滤:仅在落库时进行,仅对主贴/视频过滤,不过滤评论
# 支持精确匹配和模糊匹配两种模式
try:
import sys
from pathlib import Path
# 添加项目根目录到路径,以便导入 MindSpider 的 config
project_root = Path(__file__).resolve().parents[4]
if str(project_root) not in sys.path:
sys.path.insert(0, str(project_root))
from config import settings
title = content_item.get("title", "")
desc = content_item.get("desc", "")
content_text = title + " " + desc
strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
# 如果配置了关键词,进行匹配检查
if strict_keywords or fuzzy_keywords:
if not utils.check_keyword_match_with_modes(content_text, strict_keywords, fuzzy_keywords):
utils.logger.warning(f"[XhsDbStoreImplement.store_content] ❌ Filtered note {note_id} - content does not match any keyword")
return
except Exception as e:
# 如果配置读取失败,记录警告但不阻止保存(向后兼容)
utils.logger.debug(f"[XhsDbStoreImplement.store_content] Failed to load keyword config: {e}")
async with get_session() as session:
if await self.content_is_exist(session, note_id):
await self.update_content(session, content_item)
@@ -85,8 +85,21 @@ async def batch_update_zhihu_note_comments(comments: List[ZhihuComment]):
if not comments:
return
success_count = 0
error_count = 0
for comment_item in comments:
await update_zhihu_content_comment(comment_item)
try:
await update_zhihu_content_comment(comment_item)
success_count += 1
except Exception as e:
error_count += 1
comment_id = getattr(comment_item, 'comment_id', 'unknown')
utils.logger.error(f"[store.zhihu.batch_update_zhihu_note_comments] 保存评论失败 (comment_id={comment_id}): {e}", exc_info=True)
if error_count > 0:
utils.logger.warning(f"[store.zhihu.batch_update_zhihu_note_comments] 批量保存完成: 成功 {success_count} 条, 失败 {error_count}")
else:
utils.logger.info(f"[store.zhihu.batch_update_zhihu_note_comments] 批量保存完成: 成功 {success_count}")
async def update_zhihu_content_comment(comment_item: ZhihuComment):
@@ -98,10 +111,17 @@ async def update_zhihu_content_comment(comment_item: ZhihuComment):
Returns:
"""
local_db_item = comment_item.model_dump()
local_db_item.update({"last_modify_ts": utils.get_current_timestamp()})
utils.logger.info(f"[store.zhihu.update_zhihu_note_comment] zhihu content comment:{local_db_item}")
await ZhihuStoreFactory.create_store().store_comment(local_db_item)
try:
local_db_item = comment_item.model_dump()
local_db_item.update({"last_modify_ts": utils.get_current_timestamp()})
# 使用更安全的日志记录方式,避免编码问题导致日志输出异常
comment_id = local_db_item.get('comment_id', 'unknown')
utils.logger.debug(f"[store.zhihu.update_zhihu_note_comment] 准备保存评论: comment_id={comment_id}")
await ZhihuStoreFactory.create_store().store_comment(local_db_item)
except Exception as e:
comment_id = getattr(comment_item, 'comment_id', 'unknown')
utils.logger.error(f"[store.zhihu.update_zhihu_note_comment] 保存评论异常 (comment_id={comment_id}): {e}", exc_info=True)
raise
async def save_creator(creator: ZhihuCreator):
@@ -94,23 +94,71 @@ class ZhihuDbStoreImplement(AbstractStore):
content_item: content item dict
"""
content_id = content_item.get("content_id")
if not content_id:
return
# 关键词过滤:仅在落库时进行,仅对主贴/视频过滤,不过滤评论
# 支持精确匹配和模糊匹配两种模式
try:
import sys
from pathlib import Path
project_root = Path(__file__).resolve().parents[4]
if str(project_root) not in sys.path:
sys.path.insert(0, str(project_root))
from config import settings
title = content_item.get("title", "")
content = content_item.get("content", "")
content_text = title + " " + content
strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
if strict_keywords or fuzzy_keywords:
if not utils.check_keyword_match_with_modes(content_text, strict_keywords, fuzzy_keywords):
utils.logger.warning(f"[ZhihuDbStoreImplement.store_content] ❌ Filtered content {content_id} - content does not match any keyword")
return
except Exception as e:
utils.logger.debug(f"[ZhihuDbStoreImplement.store_content] Failed to load keyword config: {e}")
try:
# 确保所有字符串值都是正确的UTF-8编码
cleaned_item = {}
for key, value in content_item.items():
if isinstance(value, bytes):
# 如果是bytes类型,尝试解码为UTF-8
try:
value = value.decode('utf-8')
except UnicodeDecodeError:
# 如果UTF-8解码失败,尝试其他编码
try:
value = value.decode('gbk', errors='replace')
except:
value = value.decode('utf-8', errors='replace')
elif isinstance(value, str):
# 确保字符串是有效的UTF-8
try:
value.encode('utf-8')
except UnicodeEncodeError:
# 如果编码失败,尝试修复
value = value.encode('utf-8', errors='replace').decode('utf-8', errors='replace')
cleaned_item[key] = value
async with get_session() as session:
stmt = select(ZhihuContent).where(ZhihuContent.content_id == content_id)
result = await session.execute(stmt)
existing_content = result.scalars().first()
if existing_content:
for key, value in content_item.items():
for key, value in cleaned_item.items():
setattr(existing_content, key, value)
utils.logger.debug(f"[ZhihuDbStore] 更新内容: {content_id}")
else:
new_content = ZhihuContent(**content_item)
new_content = ZhihuContent(**cleaned_item)
session.add(new_content)
utils.logger.debug(f"[ZhihuDbStore] 新增内容: {content_id}")
await session.commit()
utils.logger.info(f"[ZhihuDbStore] 成功保存内容到数据库: {content_id}")
except Exception as e:
utils.logger.error(f"[ZhihuDbStore] 保存内容失败 (content_id={content_id}): {e}")
utils.logger.error(f"[ZhihuDbStore] 保存内容失败 (content_id={content_id}): {e}", exc_info=True)
raise
async def store_comment(self, comment_item: Dict):
@@ -121,22 +169,44 @@ class ZhihuDbStoreImplement(AbstractStore):
"""
comment_id = comment_item.get("comment_id")
try:
# 确保所有字符串值都是正确的UTF-8编码
cleaned_item = {}
for key, value in comment_item.items():
if isinstance(value, bytes):
# 如果是bytes类型,尝试解码为UTF-8
try:
value = value.decode('utf-8')
except UnicodeDecodeError:
# 如果UTF-8解码失败,尝试其他编码
try:
value = value.decode('gbk', errors='replace')
except:
value = value.decode('utf-8', errors='replace')
elif isinstance(value, str):
# 确保字符串是有效的UTF-8
try:
value.encode('utf-8')
except UnicodeEncodeError:
# 如果编码失败,尝试修复
value = value.encode('utf-8', errors='replace').decode('utf-8', errors='replace')
cleaned_item[key] = value
async with get_session() as session:
stmt = select(ZhihuComment).where(ZhihuComment.comment_id == comment_id)
result = await session.execute(stmt)
existing_comment = result.scalars().first()
if existing_comment:
for key, value in comment_item.items():
for key, value in cleaned_item.items():
setattr(existing_comment, key, value)
utils.logger.debug(f"[ZhihuDbStore] 更新评论: {comment_id}")
else:
new_comment = ZhihuComment(**comment_item)
new_comment = ZhihuComment(**cleaned_item)
session.add(new_comment)
utils.logger.debug(f"[ZhihuDbStore] 新增评论: {comment_id}")
await session.commit()
utils.logger.info(f"[ZhihuDbStore] 成功保存评论到数据库: {comment_id}")
except Exception as e:
utils.logger.error(f"[ZhihuDbStore] 保存评论失败 (comment_id={comment_id}): {e}")
utils.logger.error(f"[ZhihuDbStore] 保存评论失败 (comment_id={comment_id}): {e}", exc_info=True)
raise
async def store_creator(self, creator: Dict):