118 lines
4.4 KiB
Python
118 lines
4.4 KiB
Python
import re
|
||
import nltk
|
||
from bs4 import BeautifulSoup
|
||
import unicodedata
|
||
from contractions import CONTRACTION_MAP
|
||
# import contractions
|
||
import spacy
|
||
from nltk.tokenize.toktok import ToktokTokenizer
|
||
|
||
# HTML processing
|
||
def strip_html_tags(text):
    r"""Return the text content of an HTML fragment with normalized newlines.

    ``<iframe>`` and ``<script>`` elements are removed from the parsed tree
    before the text is extracted.  Each line break in the extracted text
    (``\r\n``, ``\r`` or ``\n``) is then rewritten as a single ``\n``.

    Args:
        text: HTML markup to clean.

    Returns:
        The extracted plain text.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Remove iframe and script elements so their contents never reach the output.
    for element in soup(['iframe', 'script']):
        element.extract()
    stripped_text = soup.get_text()
    # Alternation instead of the former character class [\r|\n|\r\n]: inside
    # [...] the '|' is a literal, so pipe characters were being turned into
    # newlines and a Windows line ending was expanded into two newlines.
    stripped_text = re.sub(r'\r\n|\r|\n', '\n', stripped_text)
    return stripped_text
|
||
|
||
# Remove accented characters
|
||
def remove_accented_chars(text):
    """Return *text* reduced to its ASCII characters, with accents stripped.

    The string is NFKD-normalized, which splits accented letters into a base
    letter plus combining marks; anything that cannot be encoded as ASCII is
    then dropped.

    Args:
        text: The string to clean.

    Returns:
        The ASCII-only version of *text*.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    # Non-ASCII code points (including the combining marks) are discarded here.
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')
|
||
|
||
# Example call; the returned string is discarded because nothing captures it.
remove_accented_chars('Sómě Áccěntěd těxt')
|
||
|
||
# Expand contractions
|
||
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Replace contractions in *text* with their expanded forms.

    Matching is case-insensitive.  The first character of each matched
    contraction is kept exactly as written and the rest is taken from the
    expansion.  After expansion, every apostrophe still present in the text
    is deleted.

    Args:
        text: The string to process.
        contraction_mapping: Mapping from contraction to its expansion.

    Returns:
        The text with known contractions expanded and apostrophes removed.
    """
    # Longest keys first: regex alternation takes the first alternative that
    # matches, so a short key must not shadow a longer key that starts with
    # it.  Empty keys are skipped because they would match at every position.
    ordered_keys = sorted((key for key in contraction_mapping if key),
                          key=len, reverse=True)
    expanded_text = text
    if ordered_keys:
        # Keys are escaped so that they are always matched literally.
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
            flags=re.IGNORECASE | re.DOTALL)
        # Last-resort lookup for case-insensitive matches whose exact and
        # lowercase spellings are both absent from the mapping.
        lowered_mapping = {key.lower(): value
                           for key, value in contraction_mapping.items()}

        def expand_match(contraction):
            match = contraction.group(0)
            expanded_contraction = (contraction_mapping.get(match)
                                    or contraction_mapping.get(match.lower()))
            if expanded_contraction is None:
                expanded_contraction = lowered_mapping.get(match.lower())
            if expanded_contraction is None:
                # Nothing to expand to; leave the matched text untouched.
                return match
            # Keep the first character as typed to preserve capitalization.
            return match[0] + expanded_contraction[1:]

        expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text
|
||
|
||
# Lemmatization
|
||
# Use spacy.load('en') instead if you downloaded the 'en' language model directly after installing spaCy.
|
||
# Load the spaCy pipeline once at import time; lemmatize_text() calls it.
# NOTE(review): 'en_core' does not look like a published spaCy model package
# name (e.g. 'en_core_web_sm') — presumably a local link; confirm it resolves.
# NOTE(review): the parse/tag/entity keyword arguments look like overrides
# from older spaCy releases — verify the installed version accepts them.
nlp = spacy.load('en_core', parse=True, tag=True, entity=True)
# Sample sentence; no other statement in the visible code reads this variable.
text = 'My system keeps crashing his crashed yesterday, ours crashes daily'
|
||
|
||
def lemmatize_text(text):
    """Return *text* with every token replaced by its lemma, space-joined.

    The module-level ``nlp`` pipeline supplies the tokens and their lemmas.
    A token whose lemma is the placeholder ``'-PRON-'`` keeps its original
    surface form instead.

    Args:
        text: The string to lemmatize.

    Returns:
        The lemmas (or original token texts) joined by single spaces.
    """
    lemmas = []
    for token in nlp(text):
        lemma = token.lemma_
        lemmas.append(token.text if lemma == '-PRON-' else lemma)
    return ' '.join(lemmas)
|
||
|
||
# Example call; it runs the spaCy pipeline at import time and discards the result.
lemmatize_text("My system keeps crashing! his crashed yesterday, ours crashes daily")
|
||
|
||
# Remove special characters
|
||
def remove_special_characters(text, remove_digits=False):
    r"""Delete every character that is not an ASCII letter, digit or whitespace.

    Args:
        text: The string to clean.
        remove_digits: When true, digits are deleted as well.

    Returns:
        *text* with the unwanted characters removed (nothing is inserted in
        their place).
    """
    # The range must be A-Z: the former A-z also spans the ASCII characters
    # between 'Z' and 'a' ([ \ ] ^ _ `), which therefore were never removed.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)
|
||
|
||
# Example call that also strips digits; the returned string is discarded.
remove_special_characters("Well this was fun! What do you think? 123#@!",
                          remove_digits=True)
|
||
|
||
# Remove stop words
|
||
# Shared tokenizer instance used by remove_stopwords().
tokenizer = ToktokTokenizer()
# English stop-word list from NLTK's stopwords corpus; it is the default
# `stopwords` argument of remove_stopwords().
# NOTE(review): presumably requires the stopwords corpus to be downloaded
# beforehand — confirm in the deployment environment.
stopword_list = nltk.corpus.stopwords.words('english')
|
||
def remove_stopwords(text, is_lower_case=False, stopwords=stopword_list):
    """Return *text* with stop words removed.

    The text is split with the module-level ``tokenizer``, each token is
    stripped of surrounding whitespace, and the surviving tokens are joined
    with single spaces.

    Args:
        text: The string to filter.
        is_lower_case: When true, tokens are compared against the stop words
            exactly as written; otherwise their lowercase form is compared.
        stopwords: Iterable of (hashable) stop words to drop.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Build a set once per call so each membership test is O(1) instead of a
    # linear scan of the stop-word list for every token.
    stopword_set = frozenset(stopwords)
    tokens = [token.strip() for token in tokenizer.tokenize(text)]
    if is_lower_case:
        filtered_tokens = [token for token in tokens
                           if token not in stopword_set]
    else:
        filtered_tokens = [token for token in tokens
                           if token.lower() not in stopword_set]
    return ' '.join(filtered_tokens)
|
||
|
||
# Example call using the default stop-word list; the result is discarded.
remove_stopwords("The, and, if are stopwords, computer is not")
|
||
|
||
def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True,
                     text_lemmatization=True, special_char_removal=True,
                     stopword_removal=True, remove_digits=True):
    """Normalize every document in *corpus* and return the results as a list.

    Each document goes through the following steps in order; the optional
    ones are controlled by the flag of the same purpose:

    1. strip HTML (``html_stripping``)
    2. remove accented characters (``accented_char_removal``)
    3. expand contractions (``contraction_expansion``)
    4. lowercase (``text_lower_case``)
    5. collapse runs of line breaks into one space (always)
    6. lemmatize (``text_lemmatization``)
    7. isolate and remove special characters (``special_char_removal``);
       digits are removed too when ``remove_digits`` is true
    8. collapse runs of spaces into one space (always)
    9. remove stop words (``stopword_removal``)

    Args:
        corpus: Iterable of document strings.
        html_stripping: Enable step 1.
        contraction_expansion: Enable step 3.
        accented_char_removal: Enable step 2.
        text_lower_case: Enable step 4; also passed to the stop-word filter
            as its ``is_lower_case`` argument.
        text_lemmatization: Enable step 6.
        special_char_removal: Enable step 7.
        stopword_removal: Enable step 9.
        remove_digits: Delete digits during step 7.

    Returns:
        A list with one normalized string per input document.
    """
    # Compiled once here instead of on every loop iteration.
    # Only CR/LF belong in this class: the former [\r|\n|\r\n]+ also matched
    # literal '|' characters and silently replaced them with spaces.
    newline_pattern = re.compile(r'[\r\n]+')
    # The hyphen is escaped so it is matched literally; unescaped, "(-)" is a
    # range from '(' to ')' and hyphens were never isolated, so hyphenated
    # words were glued together once the hyphen was removed.
    special_char_pattern = re.compile(r'([{.(\-)!}])')
    extra_space_pattern = re.compile(' +')

    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # expand contractions
        if contraction_expansion:
            doc = expand_contractions(doc)
        # lowercase the text
        if text_lower_case:
            doc = doc.lower()
        # remove extra newlines
        doc = newline_pattern.sub(' ', doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and/or digits
        if special_char_removal:
            # insert spaces between special characters to isolate them
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)
        # remove extra whitespace
        doc = extra_space_pattern.sub(' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)

        normalized_corpus.append(doc)

    return normalized_corpus