"""Demonstrate NLTK sentence and word tokenizers on two texts.

The script loads the raw text of "Alice in Wonderland" from the NLTK
Gutenberg corpus and a short news snippet, splits both into sentences, and
then tokenizes the snippet into words with four different tokenizers,
printing each result.
"""
from pprint import pprint

import nltk
import numpy as np
from nltk.corpus import gutenberg
from nltk.tokenize.toktok import ToktokTokenizer

# Load the text corpora.
# NOTE(review): the Gutenberg corpus and the sentence-tokenizer models
# presumably have to be installed beforehand (e.g. via nltk.download) --
# confirm for the target environment.
alice = gutenberg.raw(fileids='carroll-alice.txt')

sample_text = ("US unveils world's most powerful supercomputer, beats China. "
               "The US has unveiled the world's most powerful supercomputer called 'Summit', "
               "beating the previous record-holder China's Sunway TaihuLight. With a peak performance "
               "of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, "
               "which is capable of 93,000 trillion calculations per second. Summit has 4,608 servers, "
               "which reportedly take up the size of two tennis courts.")

# Alice in Wonderland corpus -- quick checks, left disabled:
#   len(alice)     -> total number of characters in the corpus
#   alice[0:100]   -> first 100 characters of the corpus

# Default sentence tokenizer.
default_st = nltk.sent_tokenize
alice_sentences = default_st(text=alice)
sample_sentences = default_st(text=sample_text)

print('Total sentences in sample_text:', len(sample_sentences))
print('Sample text sentences :-')
print(np.array(sample_sentences))

print('\nTotal sentences in alice:', len(alice_sentences))
print('First 5 sentences in alice:-')
print(np.array(alice_sentences[0:5]))

# The word-level results below used to be bare ``np.array(words)``
# expressions, which only display anything inside a notebook. They are
# printed explicitly so the script shows them when run directly.

# Default word tokenizer.
default_wt = nltk.word_tokenize
words = default_wt(sample_text)
print('\nword_tokenize tokens :-')
print(np.array(words))

# TreebankWordTokenizer: a regular-expression based tokenizer.
treebank_wt = nltk.TreebankWordTokenizer()
words = treebank_wt.tokenize(sample_text)
print('\nTreebankWordTokenizer tokens :-')
print(np.array(words))

# ToktokTokenizer: expects one sentence per line, so (per the NLTK
# documentation) only the final period of the input is split off as a token.
tokenizer = ToktokTokenizer()
words = tokenizer.tokenize(sample_text)
print('\nToktokTokenizer tokens :-')
print(np.array(words))

# RegexpTokenizer takes two main parameters:
#   pattern -- the regular expression used to build the tokenizer
#   gaps    -- when True, the pattern matches the gaps *between* tokens
#              rather than the tokens themselves
TOKEN_PATTERN = r'\w+'
regex_wt = nltk.RegexpTokenizer(pattern=TOKEN_PATTERN, gaps=False)
words = regex_wt.tokenize(sample_text)
print('\nRegexpTokenizer tokens :-')
print(np.array(words))