# python/python爬虫/基于bs4库对html遍历.py

import requests
from bs4 import BeautifulSoup

# Download the demo page and parse it into a BeautifulSoup tree.
url = "https://python123.io/ws/demo.html"
try:
    # A timeout keeps the script from hanging forever on a stalled server.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
except requests.RequestException as exc:
    # Stop here: on a connection failure `r` is never bound, and on an HTTP
    # error it holds an error page, so nothing below could run meaningfully.
    print("出现异常")
    raise SystemExit(1) from exc
else:
    # Decode with the encoding guessed from the body rather than the
    # one declared in the response headers.
    r.encoding = r.apparent_encoding

demo = r.text
soup = BeautifulSoup(demo, "html.parser")

# Downward traversal
head = soup.head
print(head)  # <head><title>This is a python demo page</title></head>
print(head.contents)  # [<title>This is a python demo page</title>]

body = soup.body
print(body)
body_children = body.contents
print(body_children)
print(len(body_children))  # <body> has 5 direct children on the demo page
print(body_children[1])  # the entry at index 1 of <body>'s child list

# Walk the direct children only
for node in body.children:
    print(node)

# Walk every descendant (children, grandchildren, ...)
for descendant in body.descendants:
    print(descendant)

# Upward traversal
print(soup.a.parent)  # the direct parent of the first <a> tag
# `.parents` stops after the outermost ancestor and never yields None,
# so every item has a `.name` and no None check is needed.
for parent in soup.a.parents:
    print(parent.name)

# Sibling traversal
first_link = soup.a
# Prints " and " -- sibling traversal also reaches NavigableString nodes.
print(first_link.next_sibling)
print(first_link.previous_sibling)

# Everything after the first <a> under the same parent.
for following in first_link.next_siblings:
    print(following)
# Output:
#    and
# <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>
# .

# Everything before the first <a> under the same parent.
for preceding in first_link.previous_siblings:
    print(preceding)
# Output: Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses: