Files
python/python文本分析/文本预处理/去除HTML标签.py
T
2025-08-05 09:19:34 +08:00

23 lines
618 B
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# 删除HTML标签
import requests
import re
from bs4 import BeautifulSoup
# Download the web page whose HTML source will be cleaned.
# The timeout keeps the script from blocking forever on a stalled server, and
# raise_for_status() stops an HTTP error page from being processed as if it
# were the requested document.
data = requests.get(
    'http://www.gutenberg.org/cache/epub/8001/pg8001.html',
    timeout=30,
)
data.raise_for_status()
content = data.content
# Preview a window of the raw response bytes, markup included.
print(content[1163:2200])
def strip_html_tags(text):
    """Return the text of an HTML document with tags removed.

    Args:
        text: HTML markup to parse. It is handed unchanged to
            ``BeautifulSoup`` with the ``html.parser`` backend.

    Returns:
        The text extracted by ``soup.get_text()`` after all ``<iframe>`` and
        ``<script>`` elements have been removed from the tree, with every
        line break (CRLF, lone CR or lone LF) written as a single LF.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Remove <iframe> and <script> elements from the tree before extracting
    # text. A plain loop is used because extract() is called only for its
    # side effect.
    for tag in soup(['iframe', 'script']):
        tag.extract()
    stripped_text = soup.get_text()
    # Alternation, not a character class: the earlier pattern
    # r'[\r|\n|\r\n]' also matched a literal '|' and turned every pipe in
    # the text into a newline. '\r\n' is listed first so a CRLF pair
    # becomes one '\n' instead of two.
    return re.sub(r'\r\n|\r|\n', '\n', stripped_text)
# Strip the markup from the downloaded page and preview a window of the
# resulting plain text. The response is closed in a finally clause so it is
# released even if parsing or printing raises.
try:
    clean_content = strip_html_tags(content)
    print(clean_content[1163:2200])
finally:
    data.close()