# Remove HTML tags
import re

import requests
from bs4 import BeautifulSoup

# Matches one line break in any convention. "\r\n" is listed first so that a
# Windows line ending collapses to a single "\n" rather than two. The previous
# pattern was a character class that also matched the literal "|" character.
_LINE_BREAK = re.compile(r'\r\n|\r|\n')


def strip_html_tags(text):
    """Return the text content of an HTML document with normalized newlines.

    Args:
        text: HTML markup (``str`` or ``bytes``) to be parsed with
            BeautifulSoup's ``html.parser`` backend.

    Returns:
        The document text with ``<iframe>`` and ``<script>`` elements removed
        and every ``\\r\\n`` / ``\\r`` / ``\\n`` line break rewritten as a
        single ``\\n``.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Drop iframe and script elements so their contents do not end up in the
    # extracted text.
    for tag in soup(['iframe', 'script']):
        tag.extract()
    stripped_text = soup.get_text()
    return _LINE_BREAK.sub('\n', stripped_text)


# Retrieve the web page to get the source text.
# A timeout keeps the script from hanging forever on an unresponsive server.
data = requests.get(
    'http://www.gutenberg.org/cache/epub/8001/pg8001.html',
    timeout=30,
)
try:
    content = data.content
    print(content[1163:2200])

    clean_content = strip_html_tags(content)
    print(clean_content[1163:2200])
finally:
    # Release the connection even if parsing or printing fails.
    data.close()