"""Text normalization helpers for English corpora.

Provides small, composable cleaning steps (HTML stripping, accent removal,
contraction expansion, lemmatization, special-character removal, stopword
removal) and ``normalize_corpus``, which chains them over a list of documents.
"""
import re
import unicodedata
from typing import Iterable, List, Mapping

import nltk
import spacy
from bs4 import BeautifulSoup
from nltk.tokenize.toktok import ToktokTokenizer

# NOTE(review): presumably a project-local ``contractions`` module that maps
# contraction -> expansion (e.g. "can't" -> "cannot"); confirm against the repo.
from contractions import CONTRACTION_MAP
# import contractions


# HTML handling
def strip_html_tags(text: str) -> str:
    """Return the visible text of an HTML string.

    ``<iframe>`` and ``<script>`` elements are removed together with their
    contents before the text is extracted. Line endings are normalized so
    that ``\\r\\n`` and a lone ``\\r`` each become a single ``\\n``.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Drop iframe and script tags (and everything inside them).
    for tag in soup(['iframe', 'script']):
        tag.extract()
    stripped_text = soup.get_text()
    # The previous pattern, the character class [\r|\n|\r\n], also matched a
    # literal '|' and turned '\r\n' into two newlines; only rewrite real
    # carriage returns here.
    return re.sub(r'\r\n|\r', '\n', stripped_text)


# Remove accented characters
def remove_accented_chars(text: str) -> str:
    """Return ``text`` with accents stripped and non-ASCII characters dropped.

    The text is NFKD-decomposed so that base letters are separated from their
    combining marks; encoding to ASCII with ``errors='ignore'`` then discards
    the marks and any other character that has no ASCII form.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')


# Expand contractions
def expand_contractions(text: str,
                        contraction_mapping: Mapping[str, str] = CONTRACTION_MAP) -> str:
    """Expand contractions found in ``text`` and strip remaining apostrophes.

    Args:
        text: Input string.
        contraction_mapping: Mapping from contraction to its expansion.
            Matching is case-insensitive.

    Returns:
        The text with every known contraction replaced by its expansion. The
        first character of each match is preserved so the original
        capitalisation survives. All apostrophes left in the text afterwards
        are removed.
    """
    # Longest keys first: regex alternation takes the first alternative that
    # matches, so a short key must not pre-empt a longer key it is a prefix of.
    keys = sorted((key for key in contraction_mapping if key), key=len, reverse=True)

    if keys:
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in keys)),
            flags=re.IGNORECASE | re.DOTALL)
        # Fallback lookup for keys stored with capital letters that are
        # matched in a different case by the case-insensitive pattern.
        lowercase_lookup = {key.lower(): value
                            for key, value in contraction_mapping.items()}

        def expand_match(contraction):
            match = contraction.group(0)
            expanded_contraction = (contraction_mapping.get(match)
                                    or contraction_mapping.get(match.lower())
                                    or lowercase_lookup.get(match.lower()))
            if not expanded_contraction:
                # No usable expansion: leave the matched text untouched
                # instead of failing on a missing entry.
                return match
            # Keep the first character as typed to preserve capitalisation.
            return match[0] + expanded_contraction[1:]

        text = contractions_pattern.sub(expand_match, text)

    return text.replace("'", "")


# Lemmatization
# use spacy.load('en') if you have downloaded the language model en directly after install spacy
# NOTE(review): the model name and the parse/tag/entity keyword arguments are
# kept exactly as they were; confirm they are valid for the installed spaCy
# version.
nlp = spacy.load('en_core', parse=True, tag=True, entity=True)
# Sample sentence; kept as a module-level name for backward compatibility.
text = 'My system keeps crashing his crashed yesterday, ours crashes daily'


def lemmatize_text(text: str) -> str:
    """Return ``text`` with each token replaced by its lemma, space-joined.

    Tokens whose lemma is the placeholder ``'-PRON-'`` keep their original
    text instead.
    """
    doc = nlp(text)
    return ' '.join(token.text if token.lemma_ == '-PRON-' else token.lemma_
                    for token in doc)


# Remove special characters
def remove_special_characters(text: str, remove_digits: bool = False) -> str:
    """Delete every character that is not an ASCII letter, digit or whitespace.

    Args:
        text: Input string.
        remove_digits: When true, digits are deleted as well.
    """
    # The range must be A-Z: 'A-z' also spans the punctuation between 'Z' and
    # 'a' ([ \ ] ^ _ `), which would then never be removed.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)


# Remove stopwords
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')


def remove_stopwords(text: str, is_lower_case: bool = False,
                     stopwords: Iterable[str] = stopword_list) -> str:
    """Return ``text`` without stopwords, tokens re-joined by single spaces.

    Args:
        text: Input string; it is tokenized with the module-level tokenizer.
        is_lower_case: Set to true when ``text`` is already lower-cased, so
            tokens are compared as they are; otherwise each token is
            lower-cased for the comparison only.
        stopwords: Words to drop. Defaults to the module-level
            ``stopword_list``.
    """
    # Build the lookup once per call so each membership test is O(1).
    stopword_set = set(stopwords)
    tokens = [token.strip() for token in tokenizer.tokenize(text)]
    if is_lower_case:
        filtered_tokens = [token for token in tokens
                           if token not in stopword_set]
    else:
        filtered_tokens = [token for token in tokens
                           if token.lower() not in stopword_set]
    return ' '.join(filtered_tokens)


def normalize_corpus(corpus: Iterable[str], html_stripping: bool = True,
                     contraction_expansion: bool = True,
                     accented_char_removal: bool = True,
                     text_lower_case: bool = True,
                     text_lemmatization: bool = True,
                     special_char_removal: bool = True,
                     stopword_removal: bool = True,
                     remove_digits: bool = True) -> List[str]:
    """Normalize every document in ``corpus`` and return the results as a list.

    The enabled steps run in this fixed order for each document: HTML
    stripping, accent removal, contraction expansion, lower-casing, newline
    collapsing (always), lemmatization, special-character removal, whitespace
    collapsing (always), stopword removal.

    Args:
        corpus: Iterable of document strings.
        html_stripping: Strip HTML tags.
        contraction_expansion: Expand contractions.
        accented_char_removal: Strip accents / non-ASCII characters.
        text_lower_case: Lower-case the text.
        text_lemmatization: Replace tokens by their lemmas.
        special_char_removal: Remove non-alphanumeric characters.
        stopword_removal: Remove stopwords.
        remove_digits: Also remove digits; only used when
            ``special_char_removal`` is true.

    Returns:
        A new list with one normalized string per input document.
    """
    # Compile loop-invariant patterns once instead of once per document.
    # Only real line breaks are collapsed; the earlier character class
    # [\r|\n|\r\n]+ also swallowed literal '|' characters.
    newline_pattern = re.compile(r'[\r\n]+')
    # NOTE(review): the earlier class was '[{.(-)!}]', where '(-)' is a range
    # from '(' to ')', so a hyphen was never matched. The equivalent explicit
    # set is used here; confirm whether '-' was meant to be isolated too.
    special_char_pattern = re.compile(r'([{.()!}])')
    multi_space_pattern = re.compile(' +')

    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # expand contractions
        if contraction_expansion:
            doc = expand_contractions(doc)
        # lowercase the text
        if text_lower_case:
            doc = doc.lower()
        # remove extra newlines
        doc = newline_pattern.sub(' ', doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and/or digits
        if special_char_removal:
            # insert spaces between special characters to isolate them
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)
        # remove extra whitespace
        doc = multi_space_pattern.sub(' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)

        normalized_corpus.append(doc)

    return normalized_corpus


if __name__ == "__main__":
    # Quick manual checks, run only when the file is executed directly so
    # that importing the module does not run the pipeline on sample text.
    print(remove_accented_chars('Sómě Áccěntěd těxt'))
    print(lemmatize_text("My system keeps crashing! his crashed yesterday, ours crashes daily"))
    print(remove_special_characters("Well this was fun! What do you think? 123#@!",
                                    remove_digits=True))
    print(remove_stopwords("The, and, if are stopwords, computer is not"))