Files
saas/test/天猫爬虫.ipynb
T
2025-08-12 13:43:10 +08:00

169 lines
9.8 KiB
Plaintext

{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": "## 门店审批",
"id": "8c0d287f696c5a35"
},
{
"cell_type": "code",
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2025-04-24T01:45:24.050728Z",
"start_time": "2025-04-24T01:45:21.916224Z"
}
},
"source": [
"#!/Users/xuyeqiang/opt/miniconda3/envs/f6/bin/python3.9\n",
"from playwright.sync_api import Playwright, sync_playwright\n",
"import re\n",
"import pandas as pd\n",
"\n",
"js = \"\"\"\n",
"Object.defineProperties(navigator, {webdriver:{get:()=>undefined}});\n",
"\"\"\"\n",
"\n",
"\n",
"def run(playwright: Playwright) -> None:\n",
" browser = playwright.chromium.launch(headless=False)\n",
" context = browser.new_context(viewport={'width': 1700, 'height': 1080})\n",
"\n",
" # Open new page\n",
" page = context.new_page()\n",
" page.add_init_script(js) # 隐藏 webdriver属性,不然拖动滑块会失败。\n",
"\n",
" # Go to https://fws.carzone365.com/#/store/quitAudit\n",
" page.goto(\"https://fws.carzone365.com/#/store/quitAudit\")\n",
"\n",
" # Click [placeholder=\"请输入用户名\"]\n",
" page.click(\"[placeholder=\\\"请输入用户名\\\"]\")\n",
"\n",
" # Fill [placeholder=\"请输入用户名\"]\n",
" page.fill(\"[placeholder=\\\"请输入用户名\\\"]\", \"17710217084\")\n",
"\n",
" # Click [placeholder=\"请输入密码\"]\n",
" page.click(\"[placeholder=\\\"请输入密码\\\"]\")\n",
"\n",
" # Fill [placeholder=\"请输入密码\"]\n",
" page.fill(\"[placeholder=\\\"请输入密码\\\"]\", \"123456F6!\")\n",
"\n",
" \"\"\" 拖拽滑块验证 \"\"\"\n",
" deltaX = 50000\n",
" steps = 100\n",
" element = page.wait_for_selector(\"text=请按住滑块,拖动到最右边\")\n",
" boundingBox = element.bounding_box()\n",
" if boundingBox:\n",
" x = boundingBox.get('x') + boundingBox.get('width') / 2\n",
" y = boundingBox.get('y') + boundingBox.get('height') / 2\n",
" page.mouse.move(x, y)\n",
" page.mouse.down()\n",
" x1 = x + deltaX\n",
" page.mouse.move(x1, y, steps=steps)\n",
" page.mouse.up()\n",
" page.wait_for_timeout(1000)\n",
" page.click('xpath=//*[@id=\"app\"]//button[contains(@class,\"login-btn\")]') # 登录\n",
" \"\"\" 开始自动化点击操作 \"\"\"\n",
" page.click('xpath=//*[@id=\"app\"]/section/section/aside/ul/li[2]/ul/li[2]/div/div') # 门店审批\n",
" # 将每一页显示的数量设置为100\n",
" page.click('xpath=//*[@id=\"app\"]//input[@placeholder=\"请选择\"]')\n",
" page.click('xpath=//span[text()=\"100条/页\"]')\n",
" page.wait_for_timeout(2000)\n",
" page.click('xpath=//*[@id=\"app\"]/section/section/main/div/div[3]/div[2]/div[2]/button[2]/span') # 查询\n",
" page.wait_for_timeout(1000)\n",
" # 查询出一共有多少条数据\n",
" input_string = page.text_content('xpath=//*[@id=\"app\"]/section/section/main/div/div[4]/div[3]/div/span[1]')\n",
" # 使用正则表达式提取数字部分\n",
" numbers = re.findall(r'\\d+', input_string)\n",
" # 将提取到的数字部分转换为整数列表\n",
" numbers = [int(num) for num in numbers][0]\n",
" print(f'numbers:{numbers}')\n",
" # 计算总页数\n",
" total_pages = (numbers + 100 - 1) // 100\n",
" # 计算最后一页条数\n",
" def calculate_last_page_data(total_numbers):\n",
" data_per_page = 100\n",
" last_page_data = total_numbers % data_per_page\n",
" return last_page_data if last_page_data != 0 else data_per_page\n",
" last_page_data = calculate_last_page_data(numbers)\n",
" print(\"最后一页显示的数据条数:\", last_page_data)\n",
"\n",
" # 如果需要翻页,可以在这里添加翻页的逻辑\n",
" # 创建一个空列表来存储每行的数据\n",
" data = []\n",
" last_page_data_len = 100\n",
" for page_new in range(1, total_pages + 1):\n",
" print(f\"处理第 {page_new} 页的数据\")\n",
" if page_new == total_pages:last_page_data_len = last_page_data\n",
" for i in range(1,last_page_data_len + 1):\n",
" # 逐条获取明细\n",
" string_1 = page.text_content('xpath=//*[@id=\"app\"]/section/section/main/div/div[4]/div[2]/div[3]/table/tbody/tr['+str(i)+']/td[1]/div')\n",
" string_2 = page.text_content('xpath=//*[@id=\"app\"]/section/section/main/div/div[4]/div[2]/div[3]/table/tbody/tr['+str(i)+']/td[2]/div')\n",
" string_3 = page.text_content('xpath=//*[@id=\"app\"]/section/section/main/div/div[4]/div[2]/div[3]/table/tbody/tr['+str(i)+']/td[3]/div')\n",
" string_4 = page.text_content('xpath=//*[@id=\"app\"]/section/section/main/div/div[4]/div[2]/div[3]/table/tbody/tr['+str(i)+']/td[4]/div')\n",
" string_5 = page.text_content('xpath=//*[@id=\"app\"]/section/section/main/div/div[4]/div[2]/div[3]/table/tbody/tr['+str(i)+']/td[5]/div')\n",
" string_6 = page.text_content('xpath=//*[@id=\"app\"]/section/section/main/div/div[4]/div[2]/div[3]/table/tbody/tr['+str(i)+']/td[6]/div')\n",
" string_7 = page.text_content('xpath=//*[@id=\"app\"]/section/section/main/div/div[4]/div[2]/div[3]/table/tbody/tr['+str(i)+']/td[7]/div')\n",
" string_8 = page.text_content('xpath=//*[@id=\"app\"]/section/section/main/div/div[4]/div[2]/div[3]/table/tbody/tr['+str(i)+']/td[8]/div')\n",
" # 将数据添加到列表中\n",
" data.append([string_1, string_2, string_3, string_4, string_5, string_6, string_7, string_8])\n",
" if page_new != total_pages:\n",
" try:\n",
" page.click('xpath=//*[@id=\"app\"]/section/section/main/div/div[4]/div[3]/div/button[2]/i') # 下一页\n",
" page.wait_for_timeout(1000)\n",
" except:\n",
" pass\n",
" # 创建DataFrame\n",
" df = pd.DataFrame(data, columns=[\"类型\", \"门店名称\", \"门店id\", \"门店地址\", \"分类\", \"申请人\", \"状态\", \"申请时间\"])\n",
" df.to_excel(r\"C:\\Users\\admin\\Desktop\\门店审批明细.xlsx\")\n",
" page.wait_for_timeout(1000)\n",
" # ---------------------\n",
" context.close()\n",
" browser.close()\n",
"\n",
"\n",
"with sync_playwright() as playwright:\n",
" run(playwright)\n"
],
"outputs": [
{
"ename": "Error",
"evalue": "It looks like you are using Playwright Sync API inside the asyncio loop.\nPlease use the Async API instead.",
"output_type": "error",
"traceback": [
"\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[1;31mError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[1;32mIn[1], line 108\u001B[0m\n\u001B[0;32m 104\u001B[0m context\u001B[38;5;241m.\u001B[39mclose()\n\u001B[0;32m 105\u001B[0m browser\u001B[38;5;241m.\u001B[39mclose()\n\u001B[1;32m--> 108\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m sync_playwright() \u001B[38;5;28;01mas\u001B[39;00m playwright:\n\u001B[0;32m 109\u001B[0m run(playwright)\n",
"File \u001B[1;32mD:\\ProgramTools\\Anaconda\\Lib\\site-packages\\playwright\\sync_api\\_context_manager.py:47\u001B[0m, in \u001B[0;36mPlaywrightContextManager.__enter__\u001B[1;34m(self)\u001B[0m\n\u001B[0;32m 45\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_own_loop \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mTrue\u001B[39;00m\n\u001B[0;32m 46\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_loop\u001B[38;5;241m.\u001B[39mis_running():\n\u001B[1;32m---> 47\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m Error(\n\u001B[0;32m 48\u001B[0m \u001B[38;5;250m \u001B[39m\u001B[38;5;124;03m\"\"\"It looks like you are using Playwright Sync API inside the asyncio loop.\u001B[39;00m\n\u001B[0;32m 49\u001B[0m \u001B[38;5;124;03mPlease use the Async API instead.\"\"\"\u001B[39;00m\n\u001B[0;32m 50\u001B[0m )\n\u001B[0;32m 52\u001B[0m \u001B[38;5;66;03m# Create a new fiber for the protocol dispatcher. It will be pumping events\u001B[39;00m\n\u001B[0;32m 53\u001B[0m \u001B[38;5;66;03m# until the end of times. We will pass control to that fiber every time we\u001B[39;00m\n\u001B[0;32m 54\u001B[0m \u001B[38;5;66;03m# block while waiting for a response.\u001B[39;00m\n\u001B[0;32m 55\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mgreenlet_main\u001B[39m() \u001B[38;5;241m-\u001B[39m\u001B[38;5;241m>\u001B[39m \u001B[38;5;28;01mNone\u001B[39;00m:\n",
"\u001B[1;31mError\u001B[0m: It looks like you are using Playwright Sync API inside the asyncio loop.\nPlease use the Async API instead."
]
}
],
"execution_count": 1
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}