变更

2025-08-05 09:19:34 +08:00
commit 584548d006
1696 changed files with 53855 additions and 0 deletions
@@ -0,0 +1,58 @@
+import nltk
+from nltk.corpus import gutenberg
+from pprint import pprint
+import numpy as np
+
+# Load the text corpora: the raw Gutenberg text 'carroll-alice.txt' and a short inline news sample.
+alice = gutenberg.raw(fileids='carroll-alice.txt')
+sample_text = ("US unveils world's most powerful supercomputer, beats China. "
+               "The US has unveiled the world's most powerful supercomputer called 'Summit', "
+               "beating the previous record-holder China's Sunway TaihuLight. With a peak performance "
+               "of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, "
+               "which is capable of 93,000 trillion calculations per second. Summit has 4,608 servers, "
+               "which reportedly take up the size of two tennis courts.")
+
+# Alice in Wonderland corpus (exploration snippet, kept commented out)
+# # Total characters in Alice in Wonderland
+# len(alice)
+#
+# # First 100 characters in the corpus
+# alice[0:100]
+
+# Default sentence tokenizer (nltk.sent_tokenize), applied to both texts; counts and samples are printed.
+default_st = nltk.sent_tokenize
+alice_sentences = default_st(text=alice)
+sample_sentences = default_st(text=sample_text)
+
+print('Total sentences in sample_text:', len(sample_sentences))
+print('Sample text sentences :-')
+print(np.array(sample_sentences))
+
+print('\nTotal sentences in alice:', len(alice_sentences))
+print('First 5 sentences in alice:-')
+print(np.array(alice_sentences[0:5]))
+
+# Default word tokenizer (nltk.word_tokenize), applied to the sample text.
+default_wt = nltk.word_tokenize
+words = default_wt(sample_text)
+np.array(words)  # bare expression: echoed only in an interactive session, discarded when run as a script
+
+# TreebankWordTokenizer: a regular-expression-based word tokenizer.
+treebank_wt = nltk.TreebankWordTokenizer()
+words = treebank_wt.tokenize(sample_text)
+np.array(words)  # bare expression: echoed only in an interactive session, discarded when run as a script
+
+# ToktokTokenizer: per NLTK's docs it expects one sentence per line, so only the final period is split off (TODO confirm).
+from nltk.tokenize.toktok import ToktokTokenizer
+tokenizer = ToktokTokenizer()
+words = tokenizer.tokenize(sample_text)
+np.array(words)  # bare expression: echoed only in an interactive session, discarded when run as a script
+
+# RegexpTokenizer has two main parameters:
+# pattern - the regular expression used to build the tokenizer
+# gaps - when True, the pattern matches the gaps between tokens; otherwise it matches the tokens themselves
+TOKEN_PATTERN = r'\w+'
+regex_wt = nltk.RegexpTokenizer(pattern=TOKEN_PATTERN,
+                                gaps=False)
+words = regex_wt.tokenize(sample_text)
+np.array(words)  # bare expression: echoed only in an interactive session, discarded when run as a script