# Code-viewer listing metadata: Python, 51 lines, 1.7 KiB
import sys

import requests
from bs4 import BeautifulSoup

# Download the demo page and parse it into a BeautifulSoup tree.
url = "https://python123.io/ws/demo.html"

try:
    r = requests.get(url, timeout=20)
    # Turn 4xx/5xx responses into an exception instead of parsing an error page.
    r.raise_for_status()
    # Use the encoding detected from the body rather than the header default.
    r.encoding = r.apparent_encoding
except requests.RequestException:
    # Only network/HTTP failures are handled here; a bare ``except`` would also
    # swallow KeyboardInterrupt/SystemExit.  Stop after reporting: continuing
    # would either hit a NameError (``r`` never bound) or parse a failed
    # response.
    print("出现异常")
    sys.exit(1)

demo = r.text
soup = BeautifulSoup(demo, "html.parser")

# Tags: attribute access on the soup returns the first tag with that name.
print(soup.title)  # the <title> tag

# Tag names: print the name of the first <a> tag, then of its parent and
# grandparent, by walking up the tree one level at a time.
tag = soup.a
print(tag)  # the first <a> tag
node = tag
for _ in range(3):
    print(node.name)
    node = node.parent

# Tag attributes: ``tag.attrs`` holds the attributes of the tag.
attributes = tag.attrs
print(attributes)
# Result noted by the author:
# {'href': 'http://www.icourse163.org/course/BIT-268001', 'class': ['py1'], 'id': 'link1'}
print(attributes["class"])  # value of the class attribute
# Result noted by the author: ['py1']
print(type(attributes))  # <class 'dict'>
print(type(tag))  # <class 'bs4.element.Tag'>

# Non-attribute strings inside a tag: print each tag, then its ``.string``.
# Results noted by the author:
#   <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>
#   Basic Python
#   <b>The demo python introduces several python courses.</b>
#   The demo python introduces several python courses.
for element in (soup.a, soup.b):
    print(element)
    print(element.string)
# Author's note: b.string is printed without the surrounding <b></b> markup,
# so a NavigableString can apparently span several tag levels.
# NOTE(review): confirm against the Beautiful Soup documentation.
print(type(soup.a.string))  # <class 'bs4.element.NavigableString'>

# Comments inside tags: parse a snippet where <b> holds an HTML comment and
# <p> holds ordinary text.
markup = "<b><!-- This is a comment--></b><p>This is not a comment</p>"
newsoup = BeautifulSoup(markup, "html.parser")
b_text = newsoup.b.string
p_text = newsoup.p.string
# The printed text does not show that the first value came from a comment,
# so type() is needed to tell the two apart.
for text in (b_text, p_text):
    print(text)
for text in (b_text, p_text):
    print(type(text))