变更
This commit is contained in:
@@ -0,0 +1,58 @@
|
||||
import nltk
|
||||
from nltk.corpus import gutenberg
|
||||
from pprint import pprint
|
||||
import numpy as np
|
||||
|
||||
# Load the text corpora used throughout this script.
# `alice` holds the raw text of the Gutenberg corpus file
# 'carroll-alice.txt' as returned by the NLTK corpus reader.
alice = gutenberg.raw(fileids='carroll-alice.txt')
|
||||
# Short multi-sentence news snippet used as the small tokenization sample.
# The adjacent literals are joined by implicit string concatenation, so every
# fragment except the last carries its own trailing space.
sample_text = (
    "US unveils world's most powerful supercomputer, beats China. "
    "The US has unveiled the world's most powerful supercomputer called 'Summit', "
    "beating the previous record-holder China's Sunway TaihuLight. With a peak performance "
    "of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, "
    "which is capable of 93,000 trillion calculations per second. Summit has 4,608 servers, "
    "which reportedly take up the size of two tennis courts."
)
|
||||
|
||||
# Alice in Wonderland语料库
|
||||
# # Total characters in Alice in Wonderland
|
||||
# len(alice)
|
||||
#
|
||||
# # First 100 characters in the corpus
|
||||
# alice[0:100]
|
||||
|
||||
# Default sentence tokenizer: nltk.sent_tokenize splits raw text into a list
# of sentence strings.
default_st = nltk.sent_tokenize
alice_sentences = default_st(text=alice)
sample_sentences = default_st(text=sample_text)

# Report on the short sample: sentence count, then every sentence.
# Wrapping the list in np.array only changes how the list is displayed.
print('Total sentences in sample_text:', len(sample_sentences))
print('Sample text sentences :-')
print(np.array(sample_sentences))

# Report on the full corpus: sentence count, then only the first five
# sentences to keep the output short.
print('\nTotal sentences in alice:', len(alice_sentences))
print('First 5 sentences in alice:-')
print(np.array(alice_sentences[0:5]))
|
||||
|
||||
# Default word tokenizer: nltk.word_tokenize splits the sample into word and
# punctuation tokens.
default_wt = nltk.word_tokenize
words = default_wt(sample_text)
# Print the tokens; a bare expression would be discarded when this file runs
# as a script, unlike the sentence results, which are printed.
print(np.array(words))
|
||||
|
||||
# TreebankWordTokenizer: a regular-expression-based word tokenizer.
treebank_wt = nltk.TreebankWordTokenizer()
words = treebank_wt.tokenize(sample_text)
# Print the tokens; a bare expression would be discarded when this file runs
# as a script.
print(np.array(words))
|
||||
|
||||
# ToktokTokenizer: per the NLTK documentation it assumes one sentence per
# line of input, so only the final period of the text is split off as its
# own token.
from nltk.tokenize.toktok import ToktokTokenizer
tokenizer = ToktokTokenizer()
words = tokenizer.tokenize(sample_text)
# Print the tokens; a bare expression would be discarded when this file runs
# as a script.
print(np.array(words))
|
||||
|
||||
# RegexpTokenizer takes two main parameters:
#   pattern - the regular expression used to build the tokenizer
#   gaps    - when True, the pattern matches the separators (gaps) between
#             tokens; when False, it matches the tokens themselves
TOKEN_PATTERN = r'\w+'
# gaps=False: every run of word characters matched by TOKEN_PATTERN is a token.
regex_wt = nltk.RegexpTokenizer(pattern=TOKEN_PATTERN,
                                gaps=False)
words = regex_wt.tokenize(sample_text)
# Print the tokens; a bare expression would be discarded when this file runs
# as a script.
print(np.array(words))
|
||||
Reference in New Issue
Block a user