python/python爬虫/爬虫权威指南.ipynb

{
 "cells": [
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "### 子标签与父标签练习",
   "id": "f8ac5dcb35b0ea7b"
  },
  {
   "cell_type": "code",
   "id": "initial_id",
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2025-04-19T01:29:54.860227Z",
     "start_time": "2025-04-19T01:29:53.792880Z"
    }
   },
   "source": [
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "\n",
    "url = 'https://www.pythonscraping.com/pages/page3.html'\n",
    "res = requests.get(url)\n",
    "\n",
    "soup = BeautifulSoup(res.text, 'html.parser')\n",
    "\n",
    "# for child in soup.find('table', {'id': 'giftList'}).tr: # 这种写法获取第一个tr\n",
    "#     print(child)\n",
    "# for child in soup.find('table', {'id': 'giftList'}).tr.next_siblings: # 这种写法获取除第一个tr外的所有tr\n",
    "#     print(child)\n",
    "for child in soup.find_all(lambda tag: len(tag.attrs) == 2)[:2]:  # lambda 表达式，获取tag有两个的标签\n",
    "    print(child)\n"
   ],
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<img src=\"../img/gifts/logo.jpg\" style=\"float:left;\"/>\n",
      "<tr class=\"gift\" id=\"gift1\"><td>\n",
      "Vegetable Basket\n",
      "</td><td>\n",
      "This vegetable basket is the perfect gift for your health conscious (or overweight) friends!\n",
      "<span class=\"excitingNote\">Now with super-colorful bell peppers!</span>\n",
      "</td><td>\n",
      "$15.00\n",
      "</td><td>\n",
      "<img src=\"../img/gifts/img1.jpg\"/>\n",
      "</td></tr>\n"
     ]
    }
   ],
   "execution_count": 11
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "### 内链外链爬取",
   "id": "823840fbf3bb10e6"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-04-19T02:36:49.111144Z",
     "start_time": "2025-04-19T02:36:47.998458Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "import random\n",
    "import re\n",
    "from urllib.parse import urljoin, urlparse\n",
    "\n",
    "url = \"https://www.oreilly.com/\"\n",
    "\n",
    "pages = set()\n",
    "random.seed = 42\n",
    "\n",
    "\n",
    "# 获取内链列表\n",
    "def get_internal_links(url):\n",
    "    try:\n",
    "        # 发送 HTTP 请求获取网页内容\n",
    "        response = requests.get(url)\n",
    "        response.raise_for_status()  # 检查请求是否成功\n",
    "        html_content = response.text\n",
    "\n",
    "        # 使用 BeautifulSoup 解析 HTML\n",
    "        soup = BeautifulSoup(html_content, 'html.parser')\n",
    "\n",
    "        # 获取网页的域名\n",
    "        base_url = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(url))\n",
    "        internal_links = set()\n",
    "\n",
    "        # 查找所有 <a> 标签\n",
    "        for a_tag in soup.find_all('a', href=True):\n",
    "            href = a_tag['href']\n",
    "            # 将相对 URL 转换为绝对 URL\n",
    "            full_url = urljoin(base_url, href)\n",
    "\n",
    "            # 判断链接是否是内链\n",
    "            if urlparse(full_url).netloc == urlparse(base_url).netloc:\n",
    "                internal_links.add(full_url)\n",
    "\n",
    "        return list(internal_links)\n",
    "\n",
    "    except requests.exceptions.RequestException as e:\n",
    "        print(f\"请求错误: {e}\")\n",
    "        return []\n",
    "\n",
    "\n",
    "def get_external_links(url):\n",
    "    try:\n",
    "        # 发送 HTTP 请求获取网页内容\n",
    "        response = requests.get(url)\n",
    "        response.raise_for_status()  # 检查请求是否成功\n",
    "        html_content = response.text\n",
    "\n",
    "        # 使用 BeautifulSoup 解析 HTML\n",
    "        soup = BeautifulSoup(html_content, 'html.parser')\n",
    "\n",
    "        # 获取网页的域名\n",
    "        base_url = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(url))\n",
    "        external_links = set()\n",
    "\n",
    "        # 查找所有 <a> 标签\n",
    "        for a_tag in soup.find_all('a', href=True):\n",
    "            href = a_tag['href']\n",
    "            # 将相对 URL 转换为绝对 URL\n",
    "            full_url = urljoin(base_url, href)\n",
    "\n",
    "            # 判断链接是否是外链\n",
    "            if urlparse(full_url).netloc != urlparse(base_url).netloc:\n",
    "                external_links.add(full_url)\n",
    "\n",
    "        return list(external_links)\n",
    "\n",
    "    except requests.exceptions.RequestException as e:\n",
    "        print(f\"请求错误: {e}\")\n",
    "        return []\n",
    "\n",
    "\n",
    "# internal_return_list = get_internal_links(url)\n",
    "# print(internal_return_list)\n",
    "external_return_list = get_external_links(url)\n",
    "print(external_return_list)\n"
   ],
   "id": "8fb78f7b36cf38af",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['https://learning.oreilly.com/search/?query=author%3A%22Kelsey%20Hightower%22&extended_publisher_data=true&highlight=true&include_assessments=false&include_case_studies=true&include_courses=true&include_playlists=true&include_collections=true&include_notebooks=true&include_sandboxes=true&include_scenarios=true&is_academic_institution_account=false&source=user&sort=date_added&facet_json=true&json_facets=true&page=0&include_facets=false', 'https://www.amazon.com/OReilly-Media-Inc/dp/B087YYHL5C/ref=sr_1_2?dchild=1&keywords=oreilly&qid=1604964116&s=mobile-apps&sr=1-2', 'https://play.google.com/store/apps/details?id=com.safariflow.queue', 'https://learning.oreilly.com/search/?query=author%3A%22Bruno%20Gon%C3%A7alves%22&extended_publisher_data=true&highlight=true&include_assessments=false&include_case_studies=true&include_courses=true&include_playlists=true&include_collections=true&include_notebooks=true&include_sandboxes=true&include_scenarios=true&is_academic_institution_account=false&source=user&sort=date_added&facet_json=true&json_facets=true&page=0&include_facets=false', 'https://www.linkedin.com/company/oreilly-media', 'https://www.oreilly.co.jp/index.shtml', 'https://channelstore.roku.com/details/c9d25fa651f0ad84e484b0dfd4b20172:856a240ad268961983e91ae52c1e1e5c/oreilly', 'https://learning.oreilly.com/search/?query=author%3A%22Ken%20Kousen%22&extended_publisher_data=true&highlight=true&include_assessments=false&include_case_studies=true&include_courses=true&include_playlists=true&include_collections=true&include_notebooks=true&include_sandboxes=true&include_scenarios=true&is_academic_institution_account=false&source=user&sort=date_added&facet_json=true&json_facets=true&page=0&include_facets=false', 'https://learning.oreilly.com/search/?query=author%3A%22Neal%20Ford%22&extended_publisher_data=true&highlight=true&include_assessments=false&include_case_studies=true&include_courses=true&include_playlists=true&include_collections=true&include_notebooks=true&include_sandboxes=true&include_scenarios=true&is_academic_institution_account=false&source=user&sort=date_added&facet_json=true&json_facets=true&page=0&include_facets=false', 'https://learning.oreilly.com/search/?query=author%3A%22Arianne%20Dee%22&extended_publisher_data=true&highlight=true&include_assessments=false&include_case_studies=true&include_courses=true&include_playlists=true&include_collections=true&include_notebooks=true&include_sandboxes=true&include_scenarios=true&is_academic_institution_account=false&source=suggestion&sort=date_added&facet_json=true&json_facets=true&page=0&include_facets=false', 'https://learning.oreilly.com/search/?query=author%3A%22Sari%20Greene%22&extended_publisher_data=true&highlight=true&include_assessments=false&include_case_studies=true&include_courses=true&include_playlists=true&include_collections=true&include_notebooks=true&include_sandboxes=true&include_scenarios=true&is_academic_institution_account=false&source=user&sort=date_added&facet_json=true&json_facets=true&page=0&include_facets=false', 'https://learning.oreilly.com/start-trial/', 'https://itunes.apple.com/us/app/safari-to-go/id881697395', 'https://www.youtube.com/user/OreillyMedia']\n"
     ]
    }
   ],
   "execution_count": 13
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "### 结构化爬虫模板",
   "id": "9a7b773c865a8a43"
  },
  {
   "metadata": {
    "jupyter": {
     "is_executing": true
    }
   },
   "cell_type": "code",
   "source": [
    "import requests\n",
    "import re\n",
    "from bs4 import BeautifulSoup\n",
    "\n",
    "\n",
    "class Website:\n",
    "    def __init__(self, name, url, targetPattern, absoluteUrl, titleTag, bodyTag):\n",
    "        self.url = url\n",
    "        self.name = name\n",
    "        self.targetPattern = targetPattern\n",
    "        self.absoluteUrl = absoluteUrl\n",
    "        self.titleTag = titleTag\n",
    "        self.bodyTag = bodyTag\n",
    "\n",
    "class Content:\n",
    "    def __init__(self, url, title, body):\n",
    "        self.url = url\n",
    "        self.title = title\n",
    "        self.body = body\n",
    "\n",
    "    def print(self):\n",
    "        print(\"URL: {}\".format(self.url))\n",
    "        print(\"Title: {}\".format(self.title))\n",
    "        print(\"Body: {}\".format(self.body))\n",
    "        print(\"-----\")\n",
    "        \n",
    "class Crawler:\n",
    "    def __init__(self, site):\n",
    "        self.site = site\n",
    "        self.visited = []\n",
    "    \n",
    "    def getPages(self, url):\n",
    "        try:\n",
    "            req = requests.get(url)\n",
    "        except requests.exceptions.RequestException as e:\n",
    "            return None\n",
    "        \n",
    "    def safeGet(self,pageObj,selector):\n",
    "        selectedElems = pageObj.select(selector)\n",
    "        if selectedElems is not None and len(selectedElems) > 0:\n",
    "            return '\\n'.join([elem.get_text() for elem in selectedElems])\n",
    "        else:\n",
    "            return ''\n",
    "    \n",
    "    def parse(self, url):\n",
    "        soup = self.getPages(url)\n",
    "        if soup is not None:\n",
    "            title = self.safeGet(soup, self.site.titleTag)\n",
    "            body = self.safeGet(soup, self.site.bodyTag)\n",
    "            if title != '' and body != '':\n",
    "                content = Content(url, title, body)\n",
    "                content.print()\n",
    "        \n",
    "\n",
    "    def crawl(self):\n",
    "        soup = self.getPages(self.site.url)\n",
    "        targetPages = soup.findAll('a', href=re.compile(self.site.targetPattern))\n",
    "        for targetPage in targetPages:\n",
    "            targetPage = targetPage.attrs['href']\n",
    "            if targetPage not in self.visited:\n",
    "                self.visited.append(targetPage)\n",
    "                if not self.site.absoluteUrl:\n",
    "                    targetPage = '{}{}'.format(self.site.url, targetPage)\n",
    "                self.parse(targetPage)\n",
    "                \n",
    "reuters = Website('Reuters','https://www.reuters.com', '^(/article/)', False, 'h1', 'div.StandarArticleBody_body_1gnLA')\n",
    "crawler = Crawler(reuters)\n",
    "crawler.crawl()"
   ],
   "id": "845de096617b4e36",
   "outputs": [],
   "execution_count": null
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}