Files
python/python爬虫/爬虫权威指南.ipynb
2025-08-05 09:19:34 +08:00

273 lines
12 KiB
Plaintext

{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": "### 子标签与父标签练习",
"id": "f8ac5dcb35b0ea7b"
},
{
"cell_type": "code",
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2025-04-19T01:29:54.860227Z",
"start_time": "2025-04-19T01:29:53.792880Z"
}
},
"source": [
"import requests\n",
"from bs4 import BeautifulSoup\n",
"\n",
"url = 'https://www.pythonscraping.com/pages/page3.html'\n",
"# A timeout keeps the cell from hanging on a stalled connection, and\n",
"# raise_for_status() stops an HTTP error page from being parsed as if it were data.\n",
"res = requests.get(url, timeout=10)\n",
"res.raise_for_status()\n",
"\n",
"soup = BeautifulSoup(res.text, 'html.parser')\n",
"\n",
"# Alternative traversals of the gift table, kept for reference:\n",
"#   soup.find('table', {'id': 'giftList'}).tr                 iterates the children of the first tr\n",
"#   soup.find('table', {'id': 'giftList'}).tr.next_siblings   iterates every sibling after the first tr\n",
"\n",
"# find_all() accepts a predicate: keep the tags that carry exactly two attributes,\n",
"# then show only the first two matches.\n",
"two_attr_tags = soup.find_all(lambda tag: len(tag.attrs) == 2)\n",
"for child in two_attr_tags[:2]:\n",
"    print(child)\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<img src=\"../img/gifts/logo.jpg\" style=\"float:left;\"/>\n",
"<tr class=\"gift\" id=\"gift1\"><td>\n",
"Vegetable Basket\n",
"</td><td>\n",
"This vegetable basket is the perfect gift for your health conscious (or overweight) friends!\n",
"<span class=\"excitingNote\">Now with super-colorful bell peppers!</span>\n",
"</td><td>\n",
"$15.00\n",
"</td><td>\n",
"<img src=\"../img/gifts/img1.jpg\"/>\n",
"</td></tr>\n"
]
}
],
"execution_count": 11
},
{
"metadata": {},
"cell_type": "markdown",
"source": "### 内链外链爬取",
"id": "823840fbf3bb10e6"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-19T02:36:49.111144Z",
"start_time": "2025-04-19T02:36:47.998458Z"
}
},
"cell_type": "code",
"source": [
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import random\n",
"import re\n",
"from urllib.parse import urljoin, urlparse\n",
"\n",
"url = \"https://www.oreilly.com/\"\n",
"\n",
"# NOTE(review): `pages` is not referenced by any other visible cell - confirm it is still needed.\n",
"pages = set()\n",
"# Call random.seed() instead of assigning to it: `random.seed = 42` replaces the\n",
"# function with an int, leaves the generator unseeded and breaks later seed() calls.\n",
"random.seed(42)\n",
"\n",
"\n",
"# Shared fetch-and-extract step used by get_internal_links and get_external_links.\n",
"def _collect_page_links(url, timeout=10):\n",
"    \"\"\"Download `url` and gather the absolute http(s) targets of its anchor tags.\n",
"\n",
"    Returns a `(site_netloc, links)` tuple: the netloc of `url` and a set of\n",
"    absolute URLs. Raises requests.exceptions.RequestException when the request\n",
"    fails or the server answers with an error status.\n",
"    \"\"\"\n",
"    response = requests.get(url, timeout=timeout)\n",
"    response.raise_for_status()\n",
"    soup = BeautifulSoup(response.text, 'html.parser')\n",
"\n",
"    links = set()\n",
"    for a_tag in soup.find_all('a', href=True):\n",
"        # Resolve against the page URL (not just scheme://host) so that\n",
"        # path-relative hrefs such as \"next.html\" land in the right directory.\n",
"        full_url = urljoin(url, a_tag['href'])\n",
"        # mailto:, tel:, javascript: and similar targets have an empty netloc and\n",
"        # would otherwise be misreported as external links.\n",
"        if urlparse(full_url).scheme in ('http', 'https'):\n",
"            links.add(full_url)\n",
"    return urlparse(url).netloc, links\n",
"\n",
"\n",
"def get_internal_links(url, timeout=10):\n",
"    \"\"\"Return a sorted list of the links on `url` that stay on the same host.\n",
"\n",
"    `timeout` is the number of seconds passed to requests.get().\n",
"    On a request error the error is printed and an empty list is returned.\n",
"    \"\"\"\n",
"    try:\n",
"        site_netloc, links = _collect_page_links(url, timeout)\n",
"    except requests.exceptions.RequestException as e:\n",
"        print(f\"请求错误: {e}\")\n",
"        return []\n",
"    # Sorting makes the printed output reproducible between runs.\n",
"    return sorted(link for link in links if urlparse(link).netloc == site_netloc)\n",
"\n",
"\n",
"def get_external_links(url, timeout=10):\n",
"    \"\"\"Return a sorted list of the links on `url` that point to a different host.\n",
"\n",
"    `timeout` is the number of seconds passed to requests.get().\n",
"    On a request error the error is printed and an empty list is returned.\n",
"    \"\"\"\n",
"    try:\n",
"        site_netloc, links = _collect_page_links(url, timeout)\n",
"    except requests.exceptions.RequestException as e:\n",
"        print(f\"请求错误: {e}\")\n",
"        return []\n",
"    # Sorting makes the printed output reproducible between runs.\n",
"    return sorted(link for link in links if urlparse(link).netloc != site_netloc)\n",
"\n",
"\n",
"# To list same-site links instead, call get_internal_links(url) here.\n",
"external_return_list = get_external_links(url)\n",
"print(external_return_list)\n"
],
"id": "8fb78f7b36cf38af",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['https://learning.oreilly.com/search/?query=author%3A%22Kelsey%20Hightower%22&extended_publisher_data=true&highlight=true&include_assessments=false&include_case_studies=true&include_courses=true&include_playlists=true&include_collections=true&include_notebooks=true&include_sandboxes=true&include_scenarios=true&is_academic_institution_account=false&source=user&sort=date_added&facet_json=true&json_facets=true&page=0&include_facets=false', 'https://www.amazon.com/OReilly-Media-Inc/dp/B087YYHL5C/ref=sr_1_2?dchild=1&keywords=oreilly&qid=1604964116&s=mobile-apps&sr=1-2', 'https://play.google.com/store/apps/details?id=com.safariflow.queue', 'https://learning.oreilly.com/search/?query=author%3A%22Bruno%20Gon%C3%A7alves%22&extended_publisher_data=true&highlight=true&include_assessments=false&include_case_studies=true&include_courses=true&include_playlists=true&include_collections=true&include_notebooks=true&include_sandboxes=true&include_scenarios=true&is_academic_institution_account=false&source=user&sort=date_added&facet_json=true&json_facets=true&page=0&include_facets=false', 'https://www.linkedin.com/company/oreilly-media', 'https://www.oreilly.co.jp/index.shtml', 'https://channelstore.roku.com/details/c9d25fa651f0ad84e484b0dfd4b20172:856a240ad268961983e91ae52c1e1e5c/oreilly', 'https://learning.oreilly.com/search/?query=author%3A%22Ken%20Kousen%22&extended_publisher_data=true&highlight=true&include_assessments=false&include_case_studies=true&include_courses=true&include_playlists=true&include_collections=true&include_notebooks=true&include_sandboxes=true&include_scenarios=true&is_academic_institution_account=false&source=user&sort=date_added&facet_json=true&json_facets=true&page=0&include_facets=false', 
'https://learning.oreilly.com/search/?query=author%3A%22Neal%20Ford%22&extended_publisher_data=true&highlight=true&include_assessments=false&include_case_studies=true&include_courses=true&include_playlists=true&include_collections=true&include_notebooks=true&include_sandboxes=true&include_scenarios=true&is_academic_institution_account=false&source=user&sort=date_added&facet_json=true&json_facets=true&page=0&include_facets=false', 'https://learning.oreilly.com/search/?query=author%3A%22Arianne%20Dee%22&extended_publisher_data=true&highlight=true&include_assessments=false&include_case_studies=true&include_courses=true&include_playlists=true&include_collections=true&include_notebooks=true&include_sandboxes=true&include_scenarios=true&is_academic_institution_account=false&source=suggestion&sort=date_added&facet_json=true&json_facets=true&page=0&include_facets=false', 'https://learning.oreilly.com/search/?query=author%3A%22Sari%20Greene%22&extended_publisher_data=true&highlight=true&include_assessments=false&include_case_studies=true&include_courses=true&include_playlists=true&include_collections=true&include_notebooks=true&include_sandboxes=true&include_scenarios=true&is_academic_institution_account=false&source=user&sort=date_added&facet_json=true&json_facets=true&page=0&include_facets=false', 'https://learning.oreilly.com/start-trial/', 'https://itunes.apple.com/us/app/safari-to-go/id881697395', 'https://www.youtube.com/user/OreillyMedia']\n"
]
}
],
"execution_count": 13
},
{
"metadata": {},
"cell_type": "markdown",
"source": "### 结构化爬虫模板",
"id": "9a7b773c865a8a43"
},
{
"metadata": {
"jupyter": {
"is_executing": true
}
},
"cell_type": "code",
"source": [
"import requests\n",
"import re\n",
"from bs4 import BeautifulSoup\n",
"\n",
"\n",
"class Website:\n",
"    \"\"\"Describes how one site is laid out so a Crawler can scrape it.\"\"\"\n",
"\n",
"    def __init__(self, name, url, targetPattern, absoluteUrl, titleTag, bodyTag):\n",
"        \"\"\"Store the site description.\n",
"\n",
"        name          -- label for the site\n",
"        url           -- start page; also prefixed to hrefs when absoluteUrl is false\n",
"        targetPattern -- regular expression matched against link hrefs\n",
"        absoluteUrl   -- true when the site's hrefs are already absolute\n",
"        titleTag      -- CSS selector for the title text\n",
"        bodyTag       -- CSS selector for the body text\n",
"        \"\"\"\n",
"        self.url = url\n",
"        self.name = name\n",
"        self.targetPattern = targetPattern\n",
"        self.absoluteUrl = absoluteUrl\n",
"        self.titleTag = titleTag\n",
"        self.bodyTag = bodyTag\n",
"\n",
"class Content:\n",
"    \"\"\"Scraped result for one page: its URL, title text and body text.\"\"\"\n",
"\n",
"    def __init__(self, url, title, body):\n",
"        \"\"\"Keep the page URL together with the extracted title and body.\"\"\"\n",
"        self.url = url\n",
"        self.title = title\n",
"        self.body = body\n",
"\n",
"    def print(self):\n",
"        \"\"\"Write the URL, title and body to stdout, followed by a '-----' separator line.\"\"\"\n",
"        print(\"URL: {}\".format(self.url))\n",
"        print(\"Title: {}\".format(self.title))\n",
"        print(\"Body: {}\".format(self.body))\n",
"        print(\"-----\")\n",
" \n",
"class Crawler:\n",
"    \"\"\"Crawls one Website: collects matching links from its start page and prints each linked page.\"\"\"\n",
"\n",
"    def __init__(self, site):\n",
"        \"\"\"Bind the crawler to `site` and start with an empty visited list.\"\"\"\n",
"        self.site = site\n",
"        # hrefs already handled, stored exactly as they appear in the page\n",
"        self.visited = []\n",
"\n",
"    def getPages(self, url, timeout=10):\n",
"        \"\"\"Download `url` and return it parsed as BeautifulSoup, or None if the request fails.\n",
"\n",
"        `timeout` is the number of seconds passed to requests.get().\n",
"        \"\"\"\n",
"        try:\n",
"            req = requests.get(url, timeout=timeout)\n",
"        except requests.exceptions.RequestException:\n",
"            return None\n",
"        # Hand the parsed document back so parse() and crawl() have something to query.\n",
"        return BeautifulSoup(req.text, 'html.parser')\n",
"\n",
"    def safeGet(self, pageObj, selector):\n",
"        \"\"\"Return the text of every element matching CSS `selector`, joined by newlines.\n",
"\n",
"        Returns an empty string when nothing matches.\n",
"        \"\"\"\n",
"        selectedElems = pageObj.select(selector)\n",
"        if not selectedElems:\n",
"            return ''\n",
"        return '\\n'.join(elem.get_text() for elem in selectedElems)\n",
"\n",
"    def parse(self, url):\n",
"        \"\"\"Fetch `url`, extract title and body with the site's selectors, and print them.\n",
"\n",
"        Nothing is printed when the page cannot be fetched or either field is empty.\n",
"        \"\"\"\n",
"        soup = self.getPages(url)\n",
"        if soup is None:\n",
"            return\n",
"        title = self.safeGet(soup, self.site.titleTag)\n",
"        body = self.safeGet(soup, self.site.bodyTag)\n",
"        if title != '' and body != '':\n",
"            content = Content(url, title, body)\n",
"            content.print()\n",
"\n",
"    def crawl(self):\n",
"        \"\"\"Visit every not-yet-seen link on the start page whose href matches the site's targetPattern.\"\"\"\n",
"        soup = self.getPages(self.site.url)\n",
"        if soup is None:\n",
"            # Without the start page there is nothing to follow; say so instead of crashing.\n",
"            print('Could not fetch start page: {}'.format(self.site.url))\n",
"            return\n",
"        # find_all is the current method name; findAll is a deprecated alias.\n",
"        targetPages = soup.find_all('a', href=re.compile(self.site.targetPattern))\n",
"        for link in targetPages:\n",
"            targetPage = link.attrs['href']\n",
"            if targetPage in self.visited:\n",
"                continue\n",
"            self.visited.append(targetPage)\n",
"            if not self.site.absoluteUrl:\n",
"                # Site-relative href: prefix it with the start URL.\n",
"                targetPage = '{}{}'.format(self.site.url, targetPage)\n",
"            self.parse(targetPage)\n",
" \n",
"# NOTE(review): these selectors depend on Reuters' page markup - confirm they still match the live site.\n",
"reuters = Website('Reuters', 'https://www.reuters.com', '^(/article/)', False, 'h1', 'div.StandardArticleBody_body_1gnLA')\n",
"crawler = Crawler(reuters)\n",
"crawler.crawl()"
],
"id": "845de096617b4e36",
"outputs": [],
"execution_count": null
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}