Files
python/python文本分析/文本预处理/文本规范器.py
T
2025-08-05 09:19:34 +08:00

118 lines
4.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import re
import nltk
from bs4 import BeautifulSoup
import unicodedata
from contractions import CONTRACTION_MAP
# import contractions
import spacy
from nltk.tokenize.toktok import ToktokTokenizer
# html 处理
def strip_html_tags(text):
soup = BeautifulSoup(text,"html.parser")
[s.extract() for s in soup(['iframe','script'])]# 去除iframescript标签
stripped_text = soup.get_text()
stripped_text = re.sub(r'[\r|\n|\r\n]','\n',stripped_text)# re去除\r|\n|\r\n
return stripped_text
# 删除重音字
def remove_accented_chars(text):
text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
return text
remove_accented_chars('Sómě Áccěntěd těxt')
# 扩展缩写词
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
flags=re.IGNORECASE|re.DOTALL)
def expand_match(contraction):
match = contraction.group(0)
first_char = match[0]
expanded_contraction = contraction_mapping.get(match) \
if contraction_mapping.get(match) \
else contraction_mapping.get(match.lower())
expanded_contraction = first_char+expanded_contraction[1:]
return expanded_contraction
expanded_text = contractions_pattern.sub(expand_match, text)
expanded_text = re.sub("'", "", expanded_text)
return expanded_text
# 词形还原
# use spacy.load('en') if you have downloaded the language model en directly after install spacy
nlp = spacy.load('en_core', parse=True, tag=True, entity=True)
text = 'My system keeps crashing his crashed yesterday, ours crashes daily'
def lemmatize_text(text):
text = nlp(text)
text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
return text
lemmatize_text("My system keeps crashing! his crashed yesterday, ours crashes daily")
# 去除特殊字符
def remove_special_characters(text, remove_digits=False):
pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
text = re.sub(pattern, '', text)
return text
remove_special_characters("Well this was fun! What do you think? 123#@!",
remove_digits=True)
# 去除停用词
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
def remove_stopwords(text, is_lower_case=False, stopwords=stopword_list):
tokens = tokenizer.tokenize(text)
tokens = [token.strip() for token in tokens]
if is_lower_case:
filtered_tokens = [token for token in tokens if token not in stopwords]
else:
filtered_tokens = [token for token in tokens if token.lower() not in stopwords]
filtered_text = ' '.join(filtered_tokens)
return filtered_text
remove_stopwords("The, and, if are stopwords, computer is not")
def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
accented_char_removal=True, text_lower_case=True,
text_lemmatization=True, special_char_removal=True,
stopword_removal=True, remove_digits=True):
normalized_corpus = []
# normalize each document in the corpus
for doc in corpus:
# strip HTML
if html_stripping:
doc = strip_html_tags(doc)
# remove accented characters
if accented_char_removal:
doc = remove_accented_chars(doc)
# expand contractions
if contraction_expansion:
doc = expand_contractions(doc)
# lowercase the text
if text_lower_case:
doc = doc.lower()
# remove extra newlines
doc = re.sub(r'[\r|\n|\r\n]+', ' ',doc)
# lemmatize text
if text_lemmatization:
doc = lemmatize_text(doc)
# remove special characters and\or digits
if special_char_removal:
# insert spaces between special characters to isolate them
special_char_pattern = re.compile(r'([{.(-)!}])')
doc = special_char_pattern.sub(" \\1 ", doc)
doc = remove_special_characters(doc, remove_digits=remove_digits)
# remove extra whitespace
doc = re.sub(' +', ' ', doc)
# remove stopwords
if stopword_removal:
doc = remove_stopwords(doc, is_lower_case=text_lower_case)
normalized_corpus.append(doc)
return normalized_corpus