23 lines
618 B
Python
23 lines
618 B
Python
# Strip HTML tags
|
||
|
||
import requests
|
||
import re
|
||
from bs4 import BeautifulSoup
|
||
|
||
# Fetch the web page to obtain the raw source text.
# A timeout keeps the script from hanging forever on an unresponsive server.
data = requests.get(
    'http://www.gutenberg.org/cache/epub/8001/pg8001.html',
    timeout=30,
)
# Fail fast on an HTTP error status instead of processing an error page.
data.raise_for_status()
content = data.content
# Preview a slice of the raw response body (markup still present).
print(content[1163:2200])
|
||
|
||
def strip_html_tags(text):
    r"""Return the text content of an HTML document with the markup removed.

    Args:
        text: HTML markup; it is passed unchanged to ``BeautifulSoup`` using
            the ``"html.parser"`` backend.

    Returns:
        The document text as produced by ``soup.get_text()``, with
        ``<iframe>`` and ``<script>`` elements removed beforehand and every
        line terminator (``\r\n``, ``\r`` or ``\n``) normalized to a single
        ``\n``.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Remove iframe and script elements so their contents never reach get_text().
    for element in soup(["iframe", "script"]):
        element.extract()
    stripped_text = soup.get_text()
    # Alternation, not a character class: the previous "[\r|\n|\r\n]" pattern
    # also matched a literal "|" and turned each "\r\n" into two newlines.
    return re.sub(r"\r\n|\r|\n", "\n", stripped_text)
|
||
|
||
# Strip the markup from the downloaded page, then preview the same index
# range of the resulting text that was printed for the raw response.
clean_content = strip_html_tags(content)
print(clean_content[1163:2200])

# Close the response now that it is no longer needed.
data.close()