变更
This commit is contained in:
@@ -0,0 +1,23 @@
|
||||
# 删除HTML标签
|
||||
|
||||
import requests
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Download the page and keep its raw (undecoded) body.
# The timeout stops the script from hanging forever on a stalled connection,
# and raise_for_status() aborts early instead of processing an HTTP error page.
data = requests.get('http://www.gutenberg.org/cache/epub/8001/pg8001.html',
                    timeout=30)
data.raise_for_status()
content = data.content
print(content[1163:2200])
|
||||
|
||||
def strip_html_tags(text):
    """Return the visible text of an HTML document.

    ``<iframe>`` and ``<script>`` elements are removed together with their
    contents, the remaining markup is dropped, and every line ending
    (CRLF, CR or LF) is normalised to a single LF.

    Args:
        text: HTML markup to clean.

    Returns:
        The extracted plain text.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Drop iframe/script elements so their contents do not leak into the text.
    for tag in soup(['iframe', 'script']):
        tag.extract()
    stripped_text = soup.get_text()
    # Alternation, not a character class: the old class "[\r|\n|\r\n]" also
    # matched a literal "|" and turned CRLF into two newlines.
    return re.sub(r'\r\n|\r|\n', '\n', stripped_text)
|
||||
|
||||
# Strip the markup from the downloaded body and print the same slice that was
# printed for the raw content.
clean_content = strip_html_tags(content)
print(clean_content[1163:2200])

# Release the connection held by the response object.
data.close()
|
||||
@@ -0,0 +1,58 @@
|
||||
import nltk
|
||||
from nltk.corpus import gutenberg
|
||||
from pprint import pprint
|
||||
import numpy as np
|
||||
|
||||
# Load the text corpora: the raw Alice in Wonderland text from NLTK's
# Gutenberg corpus, plus a short hand-written news paragraph.
alice = gutenberg.raw('carroll-alice.txt')
sample_text = (
    "US unveils world's most powerful supercomputer, beats China. "
    "The US has unveiled the world's most powerful supercomputer called 'Summit', "
    "beating the previous record-holder China's Sunway TaihuLight. With a peak performance "
    "of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, "
    "which is capable of 93,000 trillion calculations per second. Summit has 4,608 servers, "
    "which reportedly take up the size of two tennis courts."
)
|
||||
|
||||
# Alice in Wonderland语料库
|
||||
# # Total characters in Alice in Wonderland
|
||||
# len(alice)
|
||||
#
|
||||
# # First 100 characters in the corpus
|
||||
# alice[0:100]
|
||||
|
||||
# Default sentence tokenizer (nltk.sent_tokenize), applied to both corpora.
default_st = nltk.sent_tokenize
alice_sentences = default_st(text=alice)
sample_sentences = default_st(text=sample_text)

# Report on the short sample paragraph.
print('Total sentences in sample_text:', len(sample_sentences))
print('Sample text sentences :-')
print(np.array(sample_sentences))

# Report on the Alice corpus, showing only its first five sentences.
print('\nTotal sentences in alice:', len(alice_sentences))
print('First 5 sentences in alice:-')
print(np.array(alice_sentences[:5]))
|
||||
|
||||
# Default word tokenizer (nltk.word_tokenize).
default_wt = nltk.word_tokenize
words = default_wt(sample_text)
# Print the tokens: a bare expression is only echoed in a notebook/REPL and is
# silently discarded when this file runs as a script.
print(np.array(words))
|
||||
|
||||
# TreebankWordTokenizer: a regular-expression based word tokenizer.
treebank_wt = nltk.TreebankWordTokenizer()
words = treebank_wt.tokenize(sample_text)
# Print the tokens; a bare expression is discarded when run as a script.
print(np.array(words))
|
||||
|
||||
# ToktokTokenizer: per the original note, only the final period of the input
# is split off as its own token (NOTE(review): confirm against the NLTK docs).
from nltk.tokenize.toktok import ToktokTokenizer
tokenizer = ToktokTokenizer()
words = tokenizer.tokenize(sample_text)
# Print the tokens; a bare expression is discarded when run as a script.
print(np.array(words))
|
||||
|
||||
# RegexpTokenizer takes two main parameters:
#   pattern - the regular expression used to build the tokenizer
#   gaps    - when True the pattern matches the gaps between tokens;
#             otherwise it matches the tokens themselves
TOKEN_PATTERN = r'\w+'
regex_wt = nltk.RegexpTokenizer(pattern=TOKEN_PATTERN, gaps=False)
words = regex_wt.tokenize(sample_text)
# Print the tokens; a bare expression is discarded when run as a script.
print(np.array(words))
|
||||
@@ -0,0 +1,118 @@
|
||||
import re
|
||||
import nltk
|
||||
from bs4 import BeautifulSoup
|
||||
import unicodedata
|
||||
from contractions import CONTRACTION_MAP
|
||||
# import contractions
|
||||
import spacy
|
||||
from nltk.tokenize.toktok import ToktokTokenizer
|
||||
|
||||
# html 处理
|
||||
def strip_html_tags(text):
    """Return the visible text of an HTML document.

    ``<iframe>`` and ``<script>`` elements are removed together with their
    contents, the remaining markup is dropped, and every line ending
    (CRLF, CR or LF) is normalised to a single LF.

    Args:
        text: HTML markup to clean.

    Returns:
        The extracted plain text.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Drop iframe/script elements so their contents do not leak into the text.
    for tag in soup(['iframe', 'script']):
        tag.extract()
    stripped_text = soup.get_text()
    # Alternation, not a character class: the old class "[\r|\n|\r\n]" also
    # matched a literal "|" and turned CRLF into two newlines.
    return re.sub(r'\r\n|\r|\n', '\n', stripped_text)
|
||||
|
||||
# 删除重音字
|
||||
def remove_accented_chars(text):
    """Strip accents from *text* and return what is left as plain ASCII.

    The text is NFKD-normalised, which decomposes accented letters into a
    base letter plus combining marks. Encoding to ASCII with errors ignored
    then drops those marks, along with every other non-ASCII character.

    Args:
        text: Input string.

    Returns:
        The ASCII-only string.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')
|
||||
|
||||
# Demo call; the returned string is neither printed nor stored.
remove_accented_chars('Sómě Áccěntěd těxt')
|
||||
|
||||
# 扩展缩写词
|
||||
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Expand contractions in *text* using *contraction_mapping*.

    Matching is case-insensitive. The first character of each replacement is
    taken from the matched text so that its original case is kept. After the
    expansion every remaining apostrophe is removed.

    Args:
        text: Input string.
        contraction_mapping: Dict mapping a contraction to its expansion.
            Defaults to ``CONTRACTION_MAP``.

    Returns:
        The expanded string without apostrophes.
    """
    # Longest keys first so that a contraction is never shadowed by a shorter
    # key that is its prefix; empty keys are skipped because an empty
    # alternative would match at every position.
    keys = sorted((key for key in contraction_mapping if key),
                  key=len, reverse=True)
    if keys:
        # Fallback table for matches whose exact and lower-cased spellings are
        # both missing from the mapping (e.g. the key is stored as "I'm" but
        # the text contains "i'M").
        lowered = {key.lower(): value
                   for key, value in contraction_mapping.items()}
        # re.escape keeps any regex metacharacters in the keys literal.
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in keys)),
            flags=re.IGNORECASE | re.DOTALL)

        def expand_match(contraction):
            match = contraction.group(0)
            expanded = (contraction_mapping.get(match)
                        or contraction_mapping.get(match.lower())
                        or lowered.get(match.lower()))
            if not expanded:
                # No usable expansion: leave the matched text untouched
                # instead of failing on None.
                return match
            # Keep the case of the first character as written in the text.
            return match[0] + expanded[1:]

        text = contractions_pattern.sub(expand_match, text)
    return re.sub("'", "", text)
|
||||
|
||||
# 词形还原
|
||||
# use spacy.load('en') if you have downloaded the language model en directly after install spacy
|
||||
# Load the spaCy pipeline that lemmatize_text calls.
# NOTE(review): the model name 'en_core' and the parse/tag/entity keywords
# depend on the installed spaCy version and model link - confirm they load here.
nlp = spacy.load('en_core', parse=True, tag=True, entity=True)
|
||||
# Sample sentence. NOTE(review): not referenced by any visible code; the demo
# call to lemmatize_text passes its own literal.
text = 'My system keeps crashing his crashed yesterday, ours crashes daily'
|
||||
|
||||
def lemmatize_text(text):
    """Return *text* with every token replaced by its lemma.

    The text is run through the module-level ``nlp`` pipeline. A token whose
    lemma is the ``'-PRON-'`` placeholder keeps its original surface form.
    The resulting tokens are joined with single spaces.

    Args:
        text: Input string.

    Returns:
        The lemmatised string.
    """
    lemmas = []
    for token in nlp(text):
        lemma = token.lemma_
        lemmas.append(token.text if lemma == '-PRON-' else lemma)
    return ' '.join(lemmas)
|
||||
|
||||
# Demo call; the returned string is neither printed nor stored.
lemmatize_text("My system keeps crashing! his crashed yesterday, ours crashes daily")
|
||||
|
||||
# 去除特殊字符
|
||||
def remove_special_characters(text, remove_digits=False):
    """Remove every character that is not an ASCII letter, digit or whitespace.

    Args:
        text: Input string.
        remove_digits: When True, digits are removed as well.

    Returns:
        The filtered string.
    """
    # The range must be A-Z: the former "A-z" also spans the ASCII characters
    # between "Z" and "a" (brackets, backslash, caret, underscore, backtick),
    # so those were wrongly kept.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)
|
||||
|
||||
# Demo call with digit removal enabled; the returned string is neither printed
# nor stored.
remove_special_characters(
    "Well this was fun! What do you think? 123#@!", remove_digits=True
)
|
||||
|
||||
# 去除停用词
|
||||
# Module-level tokenizer instance used by remove_stopwords.
tokenizer = ToktokTokenizer()
|
||||
# English stop words from the NLTK corpus; default `stopwords` argument of
# remove_stopwords.
stopword_list = nltk.corpus.stopwords.words('english')
|
||||
def remove_stopwords(text, is_lower_case=False, stopwords=stopword_list):
    """Remove stop words from *text*.

    The text is split with the module-level ``tokenizer``, each token is
    stripped of surrounding whitespace, stop words are dropped and the
    remaining tokens are joined with single spaces.

    Args:
        text: Input string.
        is_lower_case: When True, tokens are compared to the stop words as
            they are; otherwise each token is lower-cased for the comparison
            only (kept tokens retain their original case).
        stopwords: Iterable of stop words. Defaults to ``stopword_list``.

    Returns:
        The text without stop words.
    """
    # Build a set once so each membership test is O(1) instead of scanning the
    # whole stop-word list for every token.
    stopword_set = set(stopwords)
    tokens = [token.strip() for token in tokenizer.tokenize(text)]
    if is_lower_case:
        filtered_tokens = [token for token in tokens
                           if token not in stopword_set]
    else:
        filtered_tokens = [token for token in tokens
                           if token.lower() not in stopword_set]
    return ' '.join(filtered_tokens)
|
||||
|
||||
# Demo call; the returned string is neither printed nor stored.
remove_stopwords("The, and, if are stopwords, computer is not")
|
||||
|
||||
def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True,
                     text_lemmatization=True, special_char_removal=True,
                     stopword_removal=True, remove_digits=True):
    """Run the text-normalisation pipeline over every document in *corpus*.

    Steps are applied to each document in this order; each flagged step is
    skipped when its flag is False:

    1. strip HTML (``html_stripping``)
    2. remove accented characters (``accented_char_removal``)
    3. expand contractions (``contraction_expansion``)
    4. lower-case (``text_lower_case``)
    5. replace runs of line breaks with one space (always)
    6. lemmatize (``text_lemmatization``)
    7. isolate, then remove special characters (``special_char_removal``);
       digits are removed too when ``remove_digits`` is True
    8. collapse runs of spaces (always)
    9. remove stop words (``stopword_removal``)

    Args:
        corpus: Iterable of document strings.

    Returns:
        A list with one normalised string per input document.
    """
    # Compiled once here rather than once per document.
    # Only CR and LF: the former class "[\r|\n|\r\n]+" also matched a literal
    # "|" and replaced it with a space.
    newline_pattern = re.compile(r'[\r\n]+')
    # NOTE(review): inside the class, "(-)" is the range "(" to ")", so a
    # hyphen is NOT isolated by this pattern - confirm whether that is wanted.
    special_char_pattern = re.compile(r'([{.(-)!}])')
    multi_space_pattern = re.compile(' +')

    normalized_corpus = []
    for doc in corpus:
        if html_stripping:
            doc = strip_html_tags(doc)
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        if contraction_expansion:
            doc = expand_contractions(doc)
        if text_lower_case:
            doc = doc.lower()
        # Replace line breaks so each document ends up on a single line.
        doc = newline_pattern.sub(' ', doc)
        if text_lemmatization:
            doc = lemmatize_text(doc)
        if special_char_removal:
            # Pad the listed characters with spaces first so the words around
            # them stay separate once the characters are removed.
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)
        # Collapse the extra spaces left behind by the steps above.
        doc = multi_space_pattern.sub(' ', doc)
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)

        normalized_corpus.append(doc)

    return normalized_corpus
|
||||
Reference in New Issue
Block a user