1. 同步MediaCrawler为最新版本

2. 修复数据库not null错误 3. 支持PG数据库 4. 规范环境变量及配置使用 5. 规范为uv安装 6. 使用loguru
2025-11-03 22:38:34 +08:00
parent 62fac9ee2e
commit f4fe4141d4
155 changed files with 9414 additions and 6247 deletions
@@ -15,7 +15,7 @@

 import asyncio
 import os
-import random
+# import random  # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
 from asyncio import Task
 from typing import Dict, List, Optional, Tuple

@@ -77,8 +77,11 @@ class WeiboCrawler(AbstractCrawler):
                # Launch a browser context.
                chromium = playwright.chromium
                self.browser_context = await self.launch_browser(chromium, None, self.mobile_user_agent, headless=config.HEADLESS)
-            # stealth.min.js is a js script to prevent the website from detecting the crawler.
-            await self.browser_context.add_init_script(path="libs/stealth.min.js")
+
+                # stealth.min.js is a js script to prevent the website from detecting the crawler.
+                await self.browser_context.add_init_script(path="libs/stealth.min.js")
+
+
            self.context_page = await self.browser_context.new_page()
            await self.context_page.goto(self.mobile_index_url)

@@ -160,6 +163,11 @@ class WeiboCrawler(AbstractCrawler):
                            await self.get_note_images(mblog)

                page += 1
+                
+                # Sleep after page navigation
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[WeiboCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
+                
                await self.batch_get_notes_comments(note_id_list)

    async def get_specified_notes(self):
@@ -185,6 +193,11 @@ class WeiboCrawler(AbstractCrawler):
        async with semaphore:
            try:
                result = await self.wb_client.get_note_info_by_id(note_id)
+                
+                # Sleep after fetching note details
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[WeiboCrawler.get_note_info_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching note details {note_id}")
+                
                return result
            except DataFetchError as ex:
                utils.logger.error(f"[WeiboCrawler.get_note_info_task] Get note detail error: {ex}")
@@ -221,9 +234,14 @@ class WeiboCrawler(AbstractCrawler):
        async with semaphore:
            try:
                utils.logger.info(f"[WeiboCrawler.get_note_comments] begin get note_id: {note_id} comments ...")
+                
+                # Sleep before fetching comments
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[WeiboCrawler.get_note_comments] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds before fetching comments for note {note_id}")
+                
                await self.wb_client.get_note_all_comments(
                    note_id=note_id,
-                    crawl_interval=random.randint(1, 3),  # 微博对API的限流比较严重，所以延时提高一些
+                    crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,  # Use fixed interval instead of random
                    callback=weibo_store.batch_update_weibo_note_comments,
                    max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
                )
@@ -250,7 +268,8 @@ class WeiboCrawler(AbstractCrawler):
            if not url:
                continue
            content = await self.wb_client.get_note_image(url)
-            await asyncio.sleep(random.random())
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+            utils.logger.info(f"[WeiboCrawler.get_note_images] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching image")
            if content != None:
                extension_file_name = url.split(".")[-1]
                await weibo_store.update_weibo_note_image(pic["pid"], content, extension_file_name)