变更

2025-08-05 09:19:34 +08:00
commit 584548d006
1696 changed files with 53855 additions and 0 deletions
@@ -0,0 +1,23 @@
+# 删除HTML标签
+
+import requests
+import re
+from bs4 import BeautifulSoup
+
+# Fetch the web page to obtain its raw source.
+# An explicit timeout keeps the script from blocking forever when the
+# server never answers (requests applies no timeout by default).
+data = requests.get('http://www.gutenberg.org/cache/epub/8001/pg8001.html',
+                    timeout=30)
+content = data.content
+# Preview a slice of the raw response bytes, markup included.
+print(content[1163:2200])
+
+def strip_html_tags(text):
+    """Return the visible text of an HTML document.
+
+    ``iframe`` and ``script`` elements are removed together with their
+    contents before the text is extracted. Every carriage return or line
+    feed in the extracted text is then rewritten as a line feed.
+
+    Args:
+        text: HTML markup, as accepted by ``BeautifulSoup``.
+
+    Returns:
+        The extracted text as a string.
+    """
+    soup = BeautifulSoup(text, "html.parser")
+    # Drop embedded frames and scripts so their contents do not end up
+    # in the extracted text.
+    for element in soup(['iframe', 'script']):
+        element.extract()
+    stripped_text = soup.get_text()
+    # Only CR and LF belong in the class: a '|' inside [...] is a literal
+    # pipe, not alternation, and would turn every '|' into a newline.
+    stripped_text = re.sub(r'[\r\n]', '\n', stripped_text)
+    return stripped_text
+
+# Strip the markup, then preview the same slice of the cleaned text.
+clean_content = strip_html_tags(content)
+preview = clean_content[1163:2200]
+print(preview)
+
+
+# Close the response object now that its body has been read.
+data.close()
@@ -0,0 +1,58 @@
+import nltk
+from nltk.corpus import gutenberg
+from pprint import pprint
+import numpy as np
+
+# Load the text corpora
+alice = gutenberg.raw(fileids='carroll-alice.txt')
+sample_text = ("US unveils world's most powerful supercomputer, beats China. "
+               "The US has unveiled the world's most powerful supercomputer called 'Summit', "
+               "beating the previous record-holder China's Sunway TaihuLight. With a peak performance "
+               "of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, "
+               "which is capable of 93,000 trillion calculations per second. Summit has 4,608 servers, "
+               "which reportedly take up the size of two tennis courts.")
+
+# Alice in Wonderland corpus (interactive checks, kept for reference)
+# # Total characters in Alice in Wonderland
+# len(alice)
+#
+# # First 100 characters in the corpus
+# alice[0:100]
+
+# Default sentence tokenizer
+default_st = nltk.sent_tokenize
+sample_sentences = default_st(text=sample_text)
+alice_sentences = default_st(text=alice)
+
+print('Total sentences in sample_text:', len(sample_sentences))
+print('Sample text sentences :-')
+print(np.array(sample_sentences))
+
+print('\nTotal sentences in alice:', len(alice_sentences))
+print('First 5 sentences in alice:-')
+print(np.array(alice_sentences[:5]))
+
+# Default word tokenizer
+default_wt = nltk.word_tokenize
+words = default_wt(sample_text)
+# Bare expression: its value is only echoed in an interactive session.
+np.array(words)
+
+# TreebankWordTokenizer: regular-expression based
+treebank_wt = nltk.TreebankWordTokenizer()
+words = treebank_wt.tokenize(sample_text)
+np.array(words)
+
+# ToktokTokenizer: per the NLTK docs it expects one sentence per line, so
+# only the final period of the input is split off as its own token.
+from nltk.tokenize.toktok import ToktokTokenizer
+tokenizer = ToktokTokenizer()
+words = tokenizer.tokenize(sample_text)
+np.array(words)
+
+# RegexpTokenizer takes two main parameters:
+# pattern - the regular expression the tokenizer is built from
+# gaps    - when True the pattern locates the gaps between tokens
+#           rather than the tokens themselves
+TOKEN_PATTERN = r'\w+'
+regex_wt = nltk.RegexpTokenizer(pattern=TOKEN_PATTERN, gaps=False)
+words = regex_wt.tokenize(sample_text)
+np.array(words)
@@ -0,0 +1,118 @@
+import re
+import nltk
+from bs4 import BeautifulSoup
+import unicodedata
+from contractions import CONTRACTION_MAP
+# import contractions
+import spacy
+from nltk.tokenize.toktok import ToktokTokenizer
+
+# HTML handling
+def strip_html_tags(text):
+    """Return the visible text of an HTML document.
+
+    ``iframe`` and ``script`` elements are removed together with their
+    contents before the text is extracted. Every carriage return or line
+    feed in the extracted text is then rewritten as a line feed.
+
+    Args:
+        text: HTML markup, as accepted by ``BeautifulSoup``.
+
+    Returns:
+        The extracted text as a string.
+    """
+    soup = BeautifulSoup(text, "html.parser")
+    # Drop embedded frames and scripts so their contents do not end up
+    # in the extracted text.
+    for element in soup(['iframe', 'script']):
+        element.extract()
+    stripped_text = soup.get_text()
+    # Only CR and LF belong in the class: a '|' inside [...] is a literal
+    # pipe, not alternation, and would turn every '|' into a newline.
+    stripped_text = re.sub(r'[\r\n]', '\n', stripped_text)
+    return stripped_text
+
+# Remove accented characters
+def remove_accented_chars(text):
+    """Return *text* reduced to plain ASCII characters.
+
+    The text is NFKD-normalized and then encoded to ASCII with errors
+    ignored, so any character that is still non-ASCII after
+    decomposition is dropped.
+    """
+    decomposed = unicodedata.normalize('NFKD', text)
+    ascii_bytes = decomposed.encode('ascii', 'ignore')
+    return ascii_bytes.decode('utf-8', 'ignore')
+
+remove_accented_chars('Sómě Áccěntěd těxt')
+
+# Expand contractions
+def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
+    """Replace contractions in *text* with their expanded forms.
+
+    Matching is case-insensitive. The first character of each matched
+    contraction is kept as written, so the capitalization of the original
+    word survives the expansion. Any apostrophes still present after the
+    expansion are removed.
+
+    Args:
+        text: The string to process.
+        contraction_mapping: Mapping from contraction to expansion.
+
+    Returns:
+        The text with contractions expanded and apostrophes stripped.
+    """
+    if not contraction_mapping:
+        # An empty alternation would match the empty string at every
+        # position; there is nothing to expand, so only strip apostrophes.
+        return re.sub("'", "", text)
+
+    # Lower-cased view of the mapping so a case-insensitive match always
+    # finds its expansion, whatever case the key was stored in.
+    lowered_mapping = {key.lower(): value
+                       for key, value in contraction_mapping.items()}
+    # Longest keys first so a short contraction cannot pre-empt a longer
+    # one that starts with it; keys are escaped to be matched literally.
+    ordered_keys = sorted(contraction_mapping, key=len, reverse=True)
+    contractions_pattern = re.compile(
+        '({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
+        flags=re.IGNORECASE | re.DOTALL)
+
+    def expand_match(contraction):
+        match = contraction.group(0)
+        # Prefer the exact-case entry, then fall back to the lower-cased one.
+        expansion = (contraction_mapping.get(match)
+                     or lowered_mapping.get(match.lower()))
+        if not expansion:
+            return match
+        # Keep the first character exactly as it appeared in the text.
+        return match[0] + expansion[1:]
+
+    expanded_text = contractions_pattern.sub(expand_match, text)
+    expanded_text = re.sub("'", "", expanded_text)
+    return expanded_text
+
+# Lemmatization
+# use spacy.load('en') if you have downloaded the language model en directly after install spacy
+# NOTE(review): the model name 'en_core' and the parse/tag/entity keywords
+# depend on the installed spaCy version and local model links - confirm.
+nlp = spacy.load('en_core', parse=True, tag=True, entity=True)
+text = 'My system keeps crashing his crashed yesterday, ours crashes daily'
+
+def lemmatize_text(text):
+    """Return *text* with each token replaced by its lemma.
+
+    Tokens whose lemma is the marker '-PRON-' keep their original text.
+    The resulting tokens are joined with single spaces.
+    """
+    pieces = []
+    for token in nlp(text):
+        # '-PRON-' is presumably spaCy's pronoun placeholder lemma; the
+        # surface form is kept for those tokens instead.
+        if token.lemma_ == '-PRON-':
+            pieces.append(token.text)
+        else:
+            pieces.append(token.lemma_)
+    return ' '.join(pieces)
+
+lemmatize_text("My system keeps crashing! his crashed yesterday, ours crashes daily")
+
+# Remove special characters
+def remove_special_characters(text, remove_digits=False):
+    """Delete every character that is not an ASCII letter, digit or whitespace.
+
+    Args:
+        text: The string to clean.
+        remove_digits: When true, digits are deleted as well.
+
+    Returns:
+        The cleaned string; whitespace is left untouched.
+    """
+    # The upper-case range must be 'A-Z': 'A-z' would also admit the
+    # punctuation that sits between 'Z' and 'a' in ASCII ([ \ ] ^ _ `).
+    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
+    return re.sub(pattern, '', text)
+
+remove_special_characters("Well this was fun! What do you think? 123#@!",
+                          remove_digits=True)
+
+# Remove stopwords
+tokenizer = ToktokTokenizer()
+stopword_list = nltk.corpus.stopwords.words('english')
+def remove_stopwords(text, is_lower_case=False, stopwords=stopword_list):
+    """Return *text* with every token found in *stopwords* removed.
+
+    The text is split with the module-level tokenizer and each token is
+    stripped of surrounding whitespace. When ``is_lower_case`` is true the
+    tokens are compared as they are; otherwise their lower-cased form is
+    compared. Surviving tokens are joined with single spaces.
+    """
+    kept_tokens = []
+    for raw_token in tokenizer.tokenize(text):
+        token = raw_token.strip()
+        # Compare in lower case unless the caller says the text already is.
+        candidate = token if is_lower_case else token.lower()
+        if candidate not in stopwords:
+            kept_tokens.append(token)
+    return ' '.join(kept_tokens)
+
+remove_stopwords("The, and, if are stopwords, computer is not")
+
+def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
+                     accented_char_removal=True, text_lower_case=True,
+                     text_lemmatization=True, special_char_removal=True,
+                     stopword_removal=True, remove_digits=True):
+    """Normalize every document in *corpus* and return the results as a list.
+
+    The enabled steps run in this order for each document: HTML stripping,
+    accented-character removal, contraction expansion, lower-casing,
+    line-break collapsing (always), lemmatization, special-character
+    removal, whitespace collapsing (always) and stopword removal.
+
+    Args:
+        corpus: Iterable of document strings.
+        html_stripping: Strip HTML markup first.
+        contraction_expansion: Expand contractions.
+        accented_char_removal: Reduce accented characters to ASCII.
+        text_lower_case: Lower-case the text.
+        text_lemmatization: Replace tokens by their lemmas.
+        special_char_removal: Remove non-alphanumeric characters.
+        stopword_removal: Remove stopwords.
+        remove_digits: Also remove digits during special-character removal.
+
+    Returns:
+        A list with one normalized string per input document.
+    """
+    # Compiled once: neither pattern depends on the document.
+    # Only CR and LF belong in the class; a '|' inside [...] is a literal
+    # pipe and would be replaced as well.
+    line_break_pattern = re.compile(r'[\r\n]+')
+    # The hyphen is escaped so it is matched literally; unescaped between
+    # '(' and ')' it would form a range that contains no hyphen at all.
+    special_char_pattern = re.compile(r'([{.()\-!}])')
+
+    normalized_corpus = []
+    # normalize each document in the corpus
+    for doc in corpus:
+        # strip HTML
+        if html_stripping:
+            doc = strip_html_tags(doc)
+        # remove accented characters
+        if accented_char_removal:
+            doc = remove_accented_chars(doc)
+        # expand contractions
+        if contraction_expansion:
+            doc = expand_contractions(doc)
+        # lowercase the text
+        if text_lower_case:
+            doc = doc.lower()
+        # collapse runs of line breaks into a single space
+        doc = line_break_pattern.sub(' ', doc)
+        # lemmatize text
+        if text_lemmatization:
+            doc = lemmatize_text(doc)
+        # remove special characters and/or digits
+        if special_char_removal:
+            # insert spaces around special characters so that the words they
+            # separate are not glued together once they are removed
+            doc = special_char_pattern.sub(" \\1 ", doc)
+            doc = remove_special_characters(doc, remove_digits=remove_digits)
+        # remove extra whitespace
+        doc = re.sub(' +', ' ', doc)
+        # remove stopwords
+        if stopword_removal:
+            doc = remove_stopwords(doc, is_lower_case=text_lower_case)
+
+        normalized_corpus.append(doc)
+
+    return normalized_corpus