# Strip HTML tags from a downloaded web page
import requests
import re
from bs4 import BeautifulSoup
# Fetch the web page and keep its raw, undecoded source.
# The timeout keeps the script from hanging indefinitely on an unresponsive
# server, and raise_for_status() stops an HTTP error page from being treated
# as the requested document.
data = requests.get(
    'http://www.gutenberg.org/cache/epub/8001/pg8001.html',
    timeout=30,
)
data.raise_for_status()
content = data.content
# Preview a fixed window of the raw response bytes before any cleaning.
print(content[1163:2200])
def strip_html_tags(text):
    """Return the text content of an HTML document with line breaks normalized.

    Args:
        text: HTML markup to parse with BeautifulSoup's ``html.parser``.

    Returns:
        The document text with ``iframe`` and ``script`` elements removed
        and every CRLF or lone CR line break rewritten as a single LF.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Remove iframe and script elements so their contents do not end up in
    # the extracted text.
    for tag in soup(['iframe', 'script']):
        tag.extract()
    stripped_text = soup.get_text()
    # The earlier pattern r'[\r|\n|\r\n]' was a character class, so it also
    # replaced literal '|' characters with newlines and expanded each CRLF
    # into two newlines. Matching CR with an optional LF maps every CRLF or
    # CR to exactly one LF and leaves all other characters untouched.
    return re.sub(r'\r\n?', '\n', stripped_text)
# Strip the markup from the downloaded page, then show the same index window
# of the cleaned text that was previewed for the raw source.
clean_content = strip_html_tags(content)
print(clean_content[slice(1163, 2200)])
# The response is no longer needed; release it.
data.close()