更新部分爬虫以兼容本地运行及数据库存储
This commit is contained in:
@@ -119,6 +119,32 @@ class BiliDbStoreImplement(AbstractStore):
|
||||
content_item: content item dict
|
||||
"""
|
||||
video_id = content_item.get("video_id")
|
||||
if not video_id:
|
||||
return
|
||||
|
||||
# 关键词过滤:仅在落库时进行,仅对主贴/视频过滤,不过滤评论
|
||||
# 支持精确匹配和模糊匹配两种模式
|
||||
try:
|
||||
import sys
|
||||
from pathlib import Path
|
||||
project_root = Path(__file__).resolve().parents[4]
|
||||
if str(project_root) not in sys.path:
|
||||
sys.path.insert(0, str(project_root))
|
||||
from config import settings
|
||||
|
||||
title = content_item.get("title", "")
|
||||
desc = content_item.get("desc", "")
|
||||
content_text = title + " " + desc
|
||||
strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
|
||||
fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
|
||||
|
||||
if strict_keywords or fuzzy_keywords:
|
||||
if not utils.check_keyword_match_with_modes(content_text, strict_keywords, fuzzy_keywords):
|
||||
utils.logger.warning(f"[BilibiliDbStoreImplement.store_content] ❌ Filtered video {video_id} - content does not match any keyword")
|
||||
return
|
||||
except Exception as e:
|
||||
utils.logger.debug(f"[BilibiliDbStoreImplement.store_content] Failed to load keyword config: {e}")
|
||||
|
||||
# 确保 video_id 为整数类型,匹配数据库 BigInteger 字段
|
||||
if video_id is not None:
|
||||
video_id = int(video_id) if not isinstance(video_id, int) else video_id
|
||||
|
||||
@@ -88,6 +88,30 @@ class DouyinDbStoreImplement(AbstractStore):
|
||||
content_item: content item dict
|
||||
"""
|
||||
aweme_id = content_item.get("aweme_id")
|
||||
if not aweme_id:
|
||||
return
|
||||
|
||||
# 关键词过滤:仅在落库时进行,仅对主贴/视频过滤,不过滤评论
|
||||
# 支持精确匹配和模糊匹配两种模式
|
||||
try:
|
||||
import sys
|
||||
from pathlib import Path
|
||||
project_root = Path(__file__).resolve().parents[4]
|
||||
if str(project_root) not in sys.path:
|
||||
sys.path.insert(0, str(project_root))
|
||||
from config import settings
|
||||
|
||||
desc = content_item.get("desc", "")
|
||||
strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
|
||||
fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
|
||||
|
||||
if strict_keywords or fuzzy_keywords:
|
||||
if not utils.check_keyword_match_with_modes(desc, strict_keywords, fuzzy_keywords):
|
||||
utils.logger.warning(f"[DouyinDbStoreImplement.store_content] ❌ Filtered aweme {aweme_id} - content does not match any keyword")
|
||||
return
|
||||
except Exception as e:
|
||||
utils.logger.debug(f"[DouyinDbStoreImplement.store_content] Failed to load keyword config: {e}")
|
||||
|
||||
async with get_session() as session:
|
||||
result = await session.execute(select(DouyinAweme).where(DouyinAweme.aweme_id == aweme_id))
|
||||
aweme_detail = result.scalar_one_or_none()
|
||||
|
||||
@@ -89,6 +89,30 @@ class KuaishouDbStoreImplement(AbstractStore):
|
||||
content_item: content item dict
|
||||
"""
|
||||
video_id = content_item.get("video_id")
|
||||
if not video_id:
|
||||
return
|
||||
|
||||
# 关键词过滤:仅在落库时进行,仅对主贴/视频过滤,不过滤评论
|
||||
# 支持精确匹配和模糊匹配两种模式
|
||||
try:
|
||||
import sys
|
||||
from pathlib import Path
|
||||
project_root = Path(__file__).resolve().parents[4]
|
||||
if str(project_root) not in sys.path:
|
||||
sys.path.insert(0, str(project_root))
|
||||
from config import settings
|
||||
|
||||
caption = content_item.get("caption", "")
|
||||
strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
|
||||
fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
|
||||
|
||||
if strict_keywords or fuzzy_keywords:
|
||||
if not utils.check_keyword_match_with_modes(caption, strict_keywords, fuzzy_keywords):
|
||||
utils.logger.warning(f"[KuaishouDbStoreImplement.store_content] ❌ Filtered video {video_id} - content does not match any keyword")
|
||||
return
|
||||
except Exception as e:
|
||||
utils.logger.debug(f"[KuaishouDbStoreImplement.store_content] Failed to load keyword config: {e}")
|
||||
|
||||
async with get_session() as session:
|
||||
result = await session.execute(select(KuaishouVideo).where(KuaishouVideo.video_id == video_id))
|
||||
video_detail = result.scalar_one_or_none()
|
||||
|
||||
@@ -95,6 +95,32 @@ class TieBaDbStoreImplement(AbstractStore):
|
||||
content_item: content item dict
|
||||
"""
|
||||
note_id = content_item.get("note_id")
|
||||
if not note_id:
|
||||
return
|
||||
|
||||
# 关键词过滤:仅在落库时进行,仅对主贴/视频过滤,不过滤评论
|
||||
# 支持精确匹配和模糊匹配两种模式
|
||||
try:
|
||||
import sys
|
||||
from pathlib import Path
|
||||
project_root = Path(__file__).resolve().parents[4]
|
||||
if str(project_root) not in sys.path:
|
||||
sys.path.insert(0, str(project_root))
|
||||
from config import settings
|
||||
|
||||
title = content_item.get("title", "")
|
||||
text = content_item.get("text", "")
|
||||
content_text = title + " " + text
|
||||
strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
|
||||
fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
|
||||
|
||||
if strict_keywords or fuzzy_keywords:
|
||||
if not utils.check_keyword_match_with_modes(content_text, strict_keywords, fuzzy_keywords):
|
||||
utils.logger.warning(f"[TiebaDbStoreImplement.store_content] ❌ Filtered note {note_id} - content does not match any keyword")
|
||||
return
|
||||
except Exception as e:
|
||||
utils.logger.debug(f"[TiebaDbStoreImplement.store_content] Failed to load keyword config: {e}")
|
||||
|
||||
async with get_session() as session:
|
||||
stmt = select(TiebaNote).where(TiebaNote.note_id == note_id)
|
||||
res = await session.execute(stmt)
|
||||
|
||||
@@ -93,7 +93,12 @@ async def update_weibo_note(note_item: Dict):
|
||||
"source_keyword": source_keyword_var.get(),
|
||||
}
|
||||
utils.logger.info(f"[store.weibo.update_weibo_note] weibo note id:{note_id}, title:{save_content_item.get('content')[:24]} ...")
|
||||
await WeibostoreFactory.create_store().store_content(content_item=save_content_item)
|
||||
try:
|
||||
await WeibostoreFactory.create_store().store_content(content_item=save_content_item)
|
||||
utils.logger.debug(f"[store.weibo.update_weibo_note] Successfully saved note {note_id}")
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[store.weibo.update_weibo_note] Failed to save note {note_id}: {e}", exc_info=True)
|
||||
raise
|
||||
|
||||
|
||||
async def batch_update_weibo_note_comments(note_id: str, comments: List[Dict]):
|
||||
@@ -148,7 +153,12 @@ async def update_weibo_note_comment(note_id: str, comment_item: Dict):
|
||||
"avatar": user_info.get("profile_image_url", ""),
|
||||
}
|
||||
utils.logger.info(f"[store.weibo.update_weibo_note_comment] Weibo note comment: {comment_id}, content: {save_comment_item.get('content', '')[:24]} ...")
|
||||
await WeibostoreFactory.create_store().store_comment(comment_item=save_comment_item)
|
||||
try:
|
||||
await WeibostoreFactory.create_store().store_comment(comment_item=save_comment_item)
|
||||
utils.logger.debug(f"[store.weibo.update_weibo_note_comment] Successfully saved comment {comment_id}")
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[store.weibo.update_weibo_note_comment] Failed to save comment {comment_id}: {e}", exc_info=True)
|
||||
raise
|
||||
|
||||
|
||||
async def update_weibo_note_image(picid: str, pic_content, extension_file_name):
|
||||
|
||||
@@ -21,7 +21,7 @@ import pathlib
|
||||
from typing import Dict
|
||||
|
||||
import aiofiles
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy import select, text
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
import config
|
||||
@@ -29,7 +29,7 @@ from base.base_crawler import AbstractStore
|
||||
from database.models import WeiboCreator, WeiboNote, WeiboNoteComment
|
||||
from tools import utils, words
|
||||
from tools.async_file_writer import AsyncFileWriter
|
||||
from database.db_session import get_session
|
||||
from database.db_session import get_session, get_async_engine
|
||||
from var import crawler_type_var
|
||||
|
||||
|
||||
@@ -88,6 +88,33 @@ class WeiboCsvStoreImplement(AbstractStore):
|
||||
|
||||
|
||||
class WeiboDbStoreImplement(AbstractStore):
|
||||
|
||||
def __init__(self, **kwargs):
    """Initialise the store, forwarding every keyword argument to the parent initialiser."""
    super().__init__(**kwargs)
|
||||
|
||||
async def _check_connection(self):
    """Check whether the database connection is usable.

    A successful check is remembered in the class attribute
    ``WeiboDbStoreImplement._global_connection_checked`` so later calls
    return immediately. Failures are not remembered, so the next call
    probes the database again.

    Returns:
        True when a previous check already succeeded or ``SELECT 1`` runs
        without raising; False when no engine is available or any exception
        is raised while probing (the exception is logged, not propagated).
    """
    store_cls = WeiboDbStoreImplement

    # Create the class-level flag lazily the first time the check runs.
    if not hasattr(store_cls, '_global_connection_checked'):
        store_cls._global_connection_checked = False

    # A previous successful probe short-circuits everything below.
    if store_cls._global_connection_checked:
        return True

    try:
        db_engine = get_async_engine(config.SAVE_DATA_OPTION)
        if db_engine is None:
            utils.logger.error(
                f"[WeiboDbStoreImplement._check_connection] Engine is None for SAVE_DATA_OPTION={config.SAVE_DATA_OPTION}"
            )
            return False

        async with db_engine.connect() as connection:
            # Cheapest possible round trip to prove the connection works.
            await connection.execute(text("SELECT 1"))
            store_cls._global_connection_checked = True
            utils.logger.info(
                "[WeiboDbStoreImplement._check_connection] Database connection verified"
            )
            return True
    except Exception as exc:
        utils.logger.error(
            f"[WeiboDbStoreImplement._check_connection] Database connection failed: {exc}",
            exc_info=True,
        )
        return False
|
||||
|
||||
async def store_content(self, content_item: Dict):
|
||||
"""
|
||||
@@ -99,21 +126,62 @@ class WeiboDbStoreImplement(AbstractStore):
|
||||
|
||||
"""
|
||||
note_id = content_item.get("note_id")
|
||||
async with get_session() as session:
|
||||
stmt = select(WeiboNote).where(WeiboNote.note_id == note_id)
|
||||
res = await session.execute(stmt)
|
||||
db_note = res.scalar_one_or_none()
|
||||
if db_note:
|
||||
db_note.last_modify_ts = utils.get_current_timestamp()
|
||||
for key, value in content_item.items():
|
||||
if hasattr(db_note, key):
|
||||
setattr(db_note, key, value)
|
||||
else:
|
||||
content_item["add_ts"] = utils.get_current_timestamp()
|
||||
content_item["last_modify_ts"] = utils.get_current_timestamp()
|
||||
db_note = WeiboNote(**content_item)
|
||||
session.add(db_note)
|
||||
await session.commit()
|
||||
if not note_id:
|
||||
utils.logger.error(f"[WeiboDbStoreImplement.store_content] note_id is missing in content_item: {content_item}")
|
||||
return
|
||||
|
||||
# 关键词过滤:仅在落库时进行,仅对主贴/视频过滤,不过滤评论
|
||||
# 支持精确匹配和模糊匹配两种模式
|
||||
try:
|
||||
import sys
|
||||
from pathlib import Path
|
||||
project_root = Path(__file__).resolve().parents[4]
|
||||
if str(project_root) not in sys.path:
|
||||
sys.path.insert(0, str(project_root))
|
||||
from config import settings
|
||||
|
||||
content_text = content_item.get("content", "")
|
||||
strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
|
||||
fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
|
||||
|
||||
if strict_keywords or fuzzy_keywords:
|
||||
if not utils.check_keyword_match_with_modes(content_text, strict_keywords, fuzzy_keywords):
|
||||
utils.logger.warning(f"[WeiboDbStoreImplement.store_content] ❌ Filtered note {note_id} - content does not match any keyword")
|
||||
return
|
||||
except Exception as e:
|
||||
utils.logger.debug(f"[WeiboDbStoreImplement.store_content] Failed to load keyword config: {e}")
|
||||
|
||||
# 检查数据库连接
|
||||
if not await self._check_connection():
|
||||
utils.logger.error(f"[WeiboDbStoreImplement.store_content] Database connection check failed, skipping save for note {note_id}")
|
||||
return
|
||||
|
||||
try:
|
||||
async with get_session() as session:
|
||||
if session is None:
|
||||
utils.logger.error(f"[WeiboDbStoreImplement.store_content] Database session is None, check SAVE_DATA_OPTION config")
|
||||
return
|
||||
|
||||
stmt = select(WeiboNote).where(WeiboNote.note_id == note_id)
|
||||
res = await session.execute(stmt)
|
||||
db_note = res.scalar_one_or_none()
|
||||
if db_note:
|
||||
utils.logger.debug(f"[WeiboDbStoreImplement.store_content] Updating existing note {note_id}")
|
||||
db_note.last_modify_ts = utils.get_current_timestamp()
|
||||
for key, value in content_item.items():
|
||||
if hasattr(db_note, key):
|
||||
setattr(db_note, key, value)
|
||||
else:
|
||||
utils.logger.debug(f"[WeiboDbStoreImplement.store_content] Creating new note {note_id}")
|
||||
content_item["add_ts"] = utils.get_current_timestamp()
|
||||
content_item["last_modify_ts"] = utils.get_current_timestamp()
|
||||
db_note = WeiboNote(**content_item)
|
||||
session.add(db_note)
|
||||
await session.commit()
|
||||
utils.logger.debug(f"[WeiboDbStoreImplement.store_content] Successfully committed note {note_id} to database")
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[WeiboDbStoreImplement.store_content] Database error saving note {note_id}: {e}", exc_info=True)
|
||||
raise
|
||||
|
||||
async def store_comment(self, comment_item: Dict):
|
||||
"""
|
||||
@@ -125,21 +193,36 @@ class WeiboDbStoreImplement(AbstractStore):
|
||||
|
||||
"""
|
||||
comment_id = comment_item.get("comment_id")
|
||||
async with get_session() as session:
|
||||
stmt = select(WeiboNoteComment).where(WeiboNoteComment.comment_id == comment_id)
|
||||
res = await session.execute(stmt)
|
||||
db_comment = res.scalar_one_or_none()
|
||||
if db_comment:
|
||||
db_comment.last_modify_ts = utils.get_current_timestamp()
|
||||
for key, value in comment_item.items():
|
||||
if hasattr(db_comment, key):
|
||||
setattr(db_comment, key, value)
|
||||
else:
|
||||
comment_item["add_ts"] = utils.get_current_timestamp()
|
||||
comment_item["last_modify_ts"] = utils.get_current_timestamp()
|
||||
db_comment = WeiboNoteComment(**comment_item)
|
||||
session.add(db_comment)
|
||||
await session.commit()
|
||||
if not comment_id:
|
||||
utils.logger.error(f"[WeiboDbStoreImplement.store_comment] comment_id is missing in comment_item: {comment_item}")
|
||||
return
|
||||
|
||||
try:
|
||||
async with get_session() as session:
|
||||
if session is None:
|
||||
utils.logger.error(f"[WeiboDbStoreImplement.store_comment] Database session is None, check SAVE_DATA_OPTION config")
|
||||
return
|
||||
|
||||
stmt = select(WeiboNoteComment).where(WeiboNoteComment.comment_id == comment_id)
|
||||
res = await session.execute(stmt)
|
||||
db_comment = res.scalar_one_or_none()
|
||||
if db_comment:
|
||||
utils.logger.debug(f"[WeiboDbStoreImplement.store_comment] Updating existing comment {comment_id}")
|
||||
db_comment.last_modify_ts = utils.get_current_timestamp()
|
||||
for key, value in comment_item.items():
|
||||
if hasattr(db_comment, key):
|
||||
setattr(db_comment, key, value)
|
||||
else:
|
||||
utils.logger.debug(f"[WeiboDbStoreImplement.store_comment] Creating new comment {comment_id}")
|
||||
comment_item["add_ts"] = utils.get_current_timestamp()
|
||||
comment_item["last_modify_ts"] = utils.get_current_timestamp()
|
||||
db_comment = WeiboNoteComment(**comment_item)
|
||||
session.add(db_comment)
|
||||
await session.commit()
|
||||
utils.logger.debug(f"[WeiboDbStoreImplement.store_comment] Successfully committed comment {comment_id} to database")
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[WeiboDbStoreImplement.store_comment] Database error saving comment {comment_id}: {e}", exc_info=True)
|
||||
raise
|
||||
|
||||
async def store_creator(self, creator: Dict):
|
||||
"""
|
||||
|
||||
@@ -89,6 +89,34 @@ class XhsDbStoreImplement(AbstractStore):
|
||||
note_id = content_item.get("note_id")
|
||||
if not note_id:
|
||||
return
|
||||
|
||||
# 关键词过滤:仅在落库时进行,仅对主贴/视频过滤,不过滤评论
|
||||
# 支持精确匹配和模糊匹配两种模式
|
||||
try:
|
||||
import sys
|
||||
from pathlib import Path
|
||||
# 添加项目根目录到路径,以便导入 MindSpider 的 config
|
||||
project_root = Path(__file__).resolve().parents[4]
|
||||
if str(project_root) not in sys.path:
|
||||
sys.path.insert(0, str(project_root))
|
||||
from config import settings
|
||||
|
||||
title = content_item.get("title", "")
|
||||
desc = content_item.get("desc", "")
|
||||
content_text = title + " " + desc
|
||||
|
||||
strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
|
||||
fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
|
||||
|
||||
# 如果配置了关键词,进行匹配检查
|
||||
if strict_keywords or fuzzy_keywords:
|
||||
if not utils.check_keyword_match_with_modes(content_text, strict_keywords, fuzzy_keywords):
|
||||
utils.logger.warning(f"[XhsDbStoreImplement.store_content] ❌ Filtered note {note_id} - content does not match any keyword")
|
||||
return
|
||||
except Exception as e:
|
||||
# 如果配置读取失败,记录警告但不阻止保存(向后兼容)
|
||||
utils.logger.debug(f"[XhsDbStoreImplement.store_content] Failed to load keyword config: {e}")
|
||||
|
||||
async with get_session() as session:
|
||||
if await self.content_is_exist(session, note_id):
|
||||
await self.update_content(session, content_item)
|
||||
|
||||
@@ -85,8 +85,21 @@ async def batch_update_zhihu_note_comments(comments: List[ZhihuComment]):
|
||||
if not comments:
|
||||
return
|
||||
|
||||
success_count = 0
|
||||
error_count = 0
|
||||
for comment_item in comments:
|
||||
await update_zhihu_content_comment(comment_item)
|
||||
try:
|
||||
await update_zhihu_content_comment(comment_item)
|
||||
success_count += 1
|
||||
except Exception as e:
|
||||
error_count += 1
|
||||
comment_id = getattr(comment_item, 'comment_id', 'unknown')
|
||||
utils.logger.error(f"[store.zhihu.batch_update_zhihu_note_comments] 保存评论失败 (comment_id={comment_id}): {e}", exc_info=True)
|
||||
|
||||
if error_count > 0:
|
||||
utils.logger.warning(f"[store.zhihu.batch_update_zhihu_note_comments] 批量保存完成: 成功 {success_count} 条, 失败 {error_count} 条")
|
||||
else:
|
||||
utils.logger.info(f"[store.zhihu.batch_update_zhihu_note_comments] 批量保存完成: 成功 {success_count} 条")
|
||||
|
||||
|
||||
async def update_zhihu_content_comment(comment_item: ZhihuComment):
|
||||
@@ -98,10 +111,17 @@ async def update_zhihu_content_comment(comment_item: ZhihuComment):
|
||||
Returns:
|
||||
|
||||
"""
|
||||
local_db_item = comment_item.model_dump()
|
||||
local_db_item.update({"last_modify_ts": utils.get_current_timestamp()})
|
||||
utils.logger.info(f"[store.zhihu.update_zhihu_note_comment] zhihu content comment:{local_db_item}")
|
||||
await ZhihuStoreFactory.create_store().store_comment(local_db_item)
|
||||
try:
|
||||
local_db_item = comment_item.model_dump()
|
||||
local_db_item.update({"last_modify_ts": utils.get_current_timestamp()})
|
||||
# 使用更安全的日志记录方式,避免编码问题导致日志输出异常
|
||||
comment_id = local_db_item.get('comment_id', 'unknown')
|
||||
utils.logger.debug(f"[store.zhihu.update_zhihu_note_comment] 准备保存评论: comment_id={comment_id}")
|
||||
await ZhihuStoreFactory.create_store().store_comment(local_db_item)
|
||||
except Exception as e:
|
||||
comment_id = getattr(comment_item, 'comment_id', 'unknown')
|
||||
utils.logger.error(f"[store.zhihu.update_zhihu_note_comment] 保存评论异常 (comment_id={comment_id}): {e}", exc_info=True)
|
||||
raise
|
||||
|
||||
|
||||
async def save_creator(creator: ZhihuCreator):
|
||||
|
||||
@@ -94,23 +94,71 @@ class ZhihuDbStoreImplement(AbstractStore):
|
||||
content_item: content item dict
|
||||
"""
|
||||
content_id = content_item.get("content_id")
|
||||
if not content_id:
|
||||
return
|
||||
|
||||
# 关键词过滤:仅在落库时进行,仅对主贴/视频过滤,不过滤评论
|
||||
# 支持精确匹配和模糊匹配两种模式
|
||||
try:
|
||||
import sys
|
||||
from pathlib import Path
|
||||
project_root = Path(__file__).resolve().parents[4]
|
||||
if str(project_root) not in sys.path:
|
||||
sys.path.insert(0, str(project_root))
|
||||
from config import settings
|
||||
|
||||
title = content_item.get("title", "")
|
||||
content = content_item.get("content", "")
|
||||
content_text = title + " " + content
|
||||
strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
|
||||
fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
|
||||
|
||||
if strict_keywords or fuzzy_keywords:
|
||||
if not utils.check_keyword_match_with_modes(content_text, strict_keywords, fuzzy_keywords):
|
||||
utils.logger.warning(f"[ZhihuDbStoreImplement.store_content] ❌ Filtered content {content_id} - content does not match any keyword")
|
||||
return
|
||||
except Exception as e:
|
||||
utils.logger.debug(f"[ZhihuDbStoreImplement.store_content] Failed to load keyword config: {e}")
|
||||
|
||||
try:
|
||||
# 确保所有字符串值都是正确的UTF-8编码
|
||||
cleaned_item = {}
|
||||
for key, value in content_item.items():
|
||||
if isinstance(value, bytes):
|
||||
# 如果是bytes类型,尝试解码为UTF-8
|
||||
try:
|
||||
value = value.decode('utf-8')
|
||||
except UnicodeDecodeError:
|
||||
# 如果UTF-8解码失败,尝试其他编码
|
||||
try:
|
||||
value = value.decode('gbk', errors='replace')
|
||||
except:
|
||||
value = value.decode('utf-8', errors='replace')
|
||||
elif isinstance(value, str):
|
||||
# 确保字符串是有效的UTF-8
|
||||
try:
|
||||
value.encode('utf-8')
|
||||
except UnicodeEncodeError:
|
||||
# 如果编码失败,尝试修复
|
||||
value = value.encode('utf-8', errors='replace').decode('utf-8', errors='replace')
|
||||
cleaned_item[key] = value
|
||||
|
||||
async with get_session() as session:
|
||||
stmt = select(ZhihuContent).where(ZhihuContent.content_id == content_id)
|
||||
result = await session.execute(stmt)
|
||||
existing_content = result.scalars().first()
|
||||
if existing_content:
|
||||
for key, value in content_item.items():
|
||||
for key, value in cleaned_item.items():
|
||||
setattr(existing_content, key, value)
|
||||
utils.logger.debug(f"[ZhihuDbStore] 更新内容: {content_id}")
|
||||
else:
|
||||
new_content = ZhihuContent(**content_item)
|
||||
new_content = ZhihuContent(**cleaned_item)
|
||||
session.add(new_content)
|
||||
utils.logger.debug(f"[ZhihuDbStore] 新增内容: {content_id}")
|
||||
await session.commit()
|
||||
utils.logger.info(f"[ZhihuDbStore] 成功保存内容到数据库: {content_id}")
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[ZhihuDbStore] 保存内容失败 (content_id={content_id}): {e}")
|
||||
utils.logger.error(f"[ZhihuDbStore] 保存内容失败 (content_id={content_id}): {e}", exc_info=True)
|
||||
raise
|
||||
|
||||
async def store_comment(self, comment_item: Dict):
|
||||
@@ -121,22 +169,44 @@ class ZhihuDbStoreImplement(AbstractStore):
|
||||
"""
|
||||
comment_id = comment_item.get("comment_id")
|
||||
try:
|
||||
# 确保所有字符串值都是正确的UTF-8编码
|
||||
cleaned_item = {}
|
||||
for key, value in comment_item.items():
|
||||
if isinstance(value, bytes):
|
||||
# 如果是bytes类型,尝试解码为UTF-8
|
||||
try:
|
||||
value = value.decode('utf-8')
|
||||
except UnicodeDecodeError:
|
||||
# 如果UTF-8解码失败,尝试其他编码
|
||||
try:
|
||||
value = value.decode('gbk', errors='replace')
|
||||
except:
|
||||
value = value.decode('utf-8', errors='replace')
|
||||
elif isinstance(value, str):
|
||||
# 确保字符串是有效的UTF-8
|
||||
try:
|
||||
value.encode('utf-8')
|
||||
except UnicodeEncodeError:
|
||||
# 如果编码失败,尝试修复
|
||||
value = value.encode('utf-8', errors='replace').decode('utf-8', errors='replace')
|
||||
cleaned_item[key] = value
|
||||
|
||||
async with get_session() as session:
|
||||
stmt = select(ZhihuComment).where(ZhihuComment.comment_id == comment_id)
|
||||
result = await session.execute(stmt)
|
||||
existing_comment = result.scalars().first()
|
||||
if existing_comment:
|
||||
for key, value in comment_item.items():
|
||||
for key, value in cleaned_item.items():
|
||||
setattr(existing_comment, key, value)
|
||||
utils.logger.debug(f"[ZhihuDbStore] 更新评论: {comment_id}")
|
||||
else:
|
||||
new_comment = ZhihuComment(**comment_item)
|
||||
new_comment = ZhihuComment(**cleaned_item)
|
||||
session.add(new_comment)
|
||||
utils.logger.debug(f"[ZhihuDbStore] 新增评论: {comment_id}")
|
||||
await session.commit()
|
||||
utils.logger.info(f"[ZhihuDbStore] 成功保存评论到数据库: {comment_id}")
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[ZhihuDbStore] 保存评论失败 (comment_id={comment_id}): {e}")
|
||||
utils.logger.error(f"[ZhihuDbStore] 保存评论失败 (comment_id={comment_id}): {e}", exc_info=True)
|
||||
raise
|
||||
|
||||
async def store_creator(self, creator: Dict):
|
||||
|
||||
Reference in New Issue
Block a user