Files
bettafish-company/WeiboSentiment_MachineLearning/4.lstm.ipynb
T

718 lines
23 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"cell_id": 40
},
"source": [
"### 加载数据集"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"cell_id": 1
},
"outputs": [],
"source": [
"from utils import load_corpus, stopwords\n",
"\n",
"TRAIN_PATH = \"./data/weibo2018/train.txt\"\n",
"TEST_PATH = \"./data/weibo2018/test.txt\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"cell_id": 2
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Building prefix dict from the default dictionary ...\n",
"Loading model from cache /tmp/jieba.cache\n",
"Loading model cost 0.826 seconds.\n",
"Prefix dict has been built successfully.\n"
]
}
],
"source": [
"# 分别加载训练集和测试集\n",
"train_data = load_corpus(TRAIN_PATH)\n",
"test_data = load_corpus(TEST_PATH)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"cell_id": 3
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>书中 自有 黄金屋 书中 自有 颜如玉 沿着 岁月 的 长河 跋涉 或是 风光旖旎 或是 姹...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>这是 英超 被 黑 的 最惨 的 一次 二哈 二哈 十几年来 中国 只有 孙继海 董方卓 郑...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>中国 远洋 海运 集团 副总经理 俞曾 港 月 日 在 上 表示 中央 企业 走 出去 是 ...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>看 流星花园 其实 也 还好 啦 现在 的 观念 以及 时尚 眼光 都 不一样 了 或许 十...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>汉武帝 的 罪己 诏 的 真实性 尽管 存在 着 争议 然而 轮台 罪己 诏 作为 中国 历...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text label\n",
"0 书中 自有 黄金屋 书中 自有 颜如玉 沿着 岁月 的 长河 跋涉 或是 风光旖旎 或是 姹... 1\n",
"1 这是 英超 被 黑 的 最惨 的 一次 二哈 二哈 十几年来 中国 只有 孙继海 董方卓 郑... 0\n",
"2 中国 远洋 海运 集团 副总经理 俞曾 港 月 日 在 上 表示 中央 企业 走 出去 是 ... 1\n",
"3 看 流星花园 其实 也 还好 啦 现在 的 观念 以及 时尚 眼光 都 不一样 了 或许 十... 1\n",
"4 汉武帝 的 罪己 诏 的 真实性 尽管 存在 着 争议 然而 轮台 罪己 诏 作为 中国 历... 1"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"df_train = pd.DataFrame(train_data, columns=[\"text\", \"label\"])\n",
"df_test = pd.DataFrame(test_data, columns=[\"text\", \"label\"])\n",
"df_train.head()"
]
},
{
"cell_type": "markdown",
"metadata": {
"cell_id": 42
},
"source": [
"### 训练词向量"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"cell_id": 44
},
"outputs": [
{
"data": {
"text/plain": [
"0 [书中, 自有, 黄金屋, 书中, 自有, 颜如玉, 沿着, 岁月, 的, 长河, 跋涉, ...\n",
"1 [这是, 英超, 被, 黑, 的, 最惨, 的, 一次, 二哈, 二哈, 十几年来, 中国,...\n",
"2 [中国, 远洋, 海运, 集团, 副总经理, 俞曾, 港, 月, 日, 在, 上, 表示, ...\n",
"3 [看, 流星花园, 其实, 也, 还好, 啦, 现在, 的, 观念, 以及, 时尚, 眼光,...\n",
"4 [汉武帝, 的, 罪己, 诏, 的, 真实性, 尽管, 存在, 着, 争议, 然而, 轮台,...\n",
"Name: text, dtype: object"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# word2vec要求的输入格式: list(word)\n",
"wv_input = df_train['text'].map(lambda s: s.split(\" \")) # [for w in s.split(\" \") if w not in stopwords]\n",
"wv_input.head() "
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"cell_id": 4
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/tiger/.local/lib/python3.7/site-packages/gensim/similarities/__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n",
" warnings.warn(msg)\n"
]
}
],
"source": [
"from gensim import models\n",
"\n",
"# Word2Vec\n",
"word2vec = models.Word2Vec(wv_input, \n",
" vector_size=64, # 词向量维度\n",
" min_count=1, # 最小词频, 因为数据量较小, 这里卡1\n",
" epochs=1000) # 迭代轮次"
]
},
{
"cell_type": "markdown",
"metadata": {
"cell_id": 46
},
"source": [
"查找近义词, 直观感受训练得到的word2vec效果"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"cell_id": 5,
"scrolled": false
},
"outputs": [
{
"data": {
"text/plain": [
"[('我', 0.9441561102867126),\n",
" ('自己', 0.8928312659263611),\n",
" ('他', 0.8796129822731018),\n",
" ('的', 0.8601957559585571),\n",
" ('她', 0.855070948600769),\n",
" ('人', 0.8349815607070923),\n",
" ('都', 0.8168802261352539),\n",
" ('了', 0.8017680644989014),\n",
" ('就', 0.7990766763687134),\n",
" ('也', 0.7883183360099792)]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"word2vec.wv.most_similar(\"你\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"cell_id": 38
},
"outputs": [
{
"data": {
"text/plain": [
"[('哈哈哈', 0.6309624910354614),\n",
" ('啦', 0.5457888841629028),\n",
" ('可爱', 0.5375339984893799),\n",
" ('了', 0.4885959327220917),\n",
" ('本柔', 0.46517741680145264),\n",
" ('笑', 0.4639575779438019),\n",
" ('哈哈哈哈', 0.45851588249206543),\n",
" ('心虚', 0.4576280415058136),\n",
" ('又', 0.45520466566085815),\n",
" ('呀', 0.4494859576225281)]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"word2vec.wv.most_similar(\"哈哈\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"cell_id": 39
},
"outputs": [
{
"data": {
"text/plain": [
"[('难过', 0.724579393863678),\n",
" ('哭', 0.6421604752540588),\n",
" ('想', 0.6415957808494568),\n",
" ('也', 0.6394745707511902),\n",
" ('真的', 0.6263709664344788),\n",
" ('我', 0.6136066317558289),\n",
" ('都', 0.608888566493988),\n",
" ('的', 0.6078368425369263),\n",
" ('就', 0.5916700959205627),\n",
" ('开心', 0.5899774432182312)]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"word2vec.wv.most_similar(\"伤心\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"cell_id": 48
},
"source": [
"### 神经网络"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"cell_id": 14
},
"outputs": [],
"source": [
"import torch\n",
"from torch import nn\n",
"from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence,pad_packed_sequence\n",
"from torch.utils.data import Dataset, DataLoader\n",
"\n",
"device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\""
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"cell_id": 19
},
"outputs": [],
"source": [
"# 超参数\n",
"learning_rate = 5e-4\n",
"input_size = 768\n",
"num_epoches = 5\n",
"batch_size = 100\n",
"embed_size = 64\n",
"hidden_size = 64\n",
"num_layers = 2"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"cell_id": 7
},
"outputs": [],
"source": [
"# 数据集\n",
"class MyDataset(Dataset):\n",
" def __init__(self, df):\n",
" self.data = []\n",
" self.label = df[\"label\"].tolist()\n",
" for s in df[\"text\"].tolist():\n",
" vectors = []\n",
" for w in s.split(\" \"):\n",
" if w in word2vec.wv.key_to_index:\n",
" vectors.append(word2vec.wv[w]) # 将每个词替换为对应的词向量\n",
" vectors = torch.Tensor(vectors)\n",
" self.data.append(vectors)\n",
" \n",
" def __getitem__(self, index):\n",
" data = self.data[index]\n",
" label = self.label[index]\n",
" return data, label\n",
"\n",
" def __len__(self):\n",
" return len(self.label)\n",
"\n",
"def collate_fn(data):\n",
" \"\"\"\n",
" :param data: 第0维:data,第1维:label\n",
" :return: 序列化的data、记录实际长度的序列、以及label列表\n",
" \"\"\"\n",
" data.sort(key=lambda x: len(x[0]), reverse=True) # pack_padded_sequence要求要按照序列的长度倒序排列\n",
" data_length = [len(sq[0]) for sq in data]\n",
" x = [i[0] for i in data]\n",
" y = [i[1] for i in data]\n",
" data = pad_sequence(x, batch_first=True, padding_value=0) # 用RNN处理变长序列的必要操作\n",
" return data, torch.tensor(y, dtype=torch.float32), data_length\n",
"\n",
"\n",
"# 训练集\n",
"train_data = MyDataset(df_train)\n",
"train_loader = DataLoader(train_data, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)\n",
"\n",
"# 测试集\n",
"test_data = MyDataset(df_test)\n",
"test_loader = DataLoader(test_data, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"cell_id": 11
},
"outputs": [],
"source": [
"# 网络结构\n",
"class LSTM(nn.Module):\n",
" def __init__(self, input_size, hidden_size, num_layers):\n",
" super(LSTM, self).__init__()\n",
" self.hidden_size = hidden_size\n",
" self.num_layers = num_layers\n",
" self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)\n",
" self.fc = nn.Linear(hidden_size * 2, 1) # 双向, 输出维度要*2\n",
" self.sigmoid = nn.Sigmoid()\n",
"\n",
" def forward(self, x, lengths):\n",
" h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device) # 双向, 第一个维度要*2\n",
" c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)\n",
" \n",
" packed_input = torch.nn.utils.rnn.pack_padded_sequence(input=x, lengths=lengths, batch_first=True)\n",
" packed_out, (h_n, h_c) = self.lstm(packed_input, (h0, c0))\n",
"\n",
" lstm_out = torch.cat([h_n[-2], h_n[-1]], 1) # 双向, 所以要将最后两维拼接, 得到的就是最后一个time step的输出\n",
" out = self.fc(lstm_out)\n",
" out = self.sigmoid(out)\n",
" return out\n",
"\n",
"lstm = LSTM(embed_size, hidden_size, num_layers)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"cell_id": 26
},
"outputs": [],
"source": [
"from sklearn import metrics\n",
"\n",
"# 在测试集效果检验\n",
"def test():\n",
" y_pred, y_true = [], []\n",
"\n",
" with torch.no_grad():\n",
" for x, labels, lengths in test_loader:\n",
" x = x.to(device)\n",
" outputs = lstm(x, lengths) # 前向传播\n",
" outputs = outputs.view(-1) # 将输出展平\n",
" y_pred.append(outputs)\n",
" y_true.append(labels)\n",
"\n",
" y_prob = torch.cat(y_pred)\n",
" y_true = torch.cat(y_true)\n",
" y_pred = y_prob.clone()\n",
" y_pred[y_pred > 0.5] = 1\n",
" y_pred[y_pred <= 0.5] = 0\n",
" \n",
" print(metrics.classification_report(y_true, y_pred))\n",
" print(\"准确率:\", metrics.accuracy_score(y_true, y_pred))\n",
" print(\"AUC:\", metrics.roc_auc_score(y_true, y_prob) )"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"cell_id": 32
},
"outputs": [],
"source": [
"# 定义损失函数和优化器\n",
"criterion = nn.BCELoss()\n",
"optimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"cell_id": 33,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch:1, step:10, loss:0.689099133014679\n",
"epoch:1, step:20, loss:0.6717442870140076\n",
"epoch:1, step:30, loss:0.650161862373352\n",
"epoch:1, step:40, loss:0.5935518741607666\n",
"epoch:1, step:50, loss:0.4994719922542572\n",
"epoch:1, step:60, loss:0.4774974286556244\n",
"epoch:1, step:70, loss:0.482360303401947\n",
"epoch:1, step:80, loss:0.44858306646347046\n",
"epoch:1, step:90, loss:0.4513603746891022\n",
"epoch:1, step:100, loss:0.4386572241783142\n",
" precision recall f1-score support\n",
"\n",
" 0.0 0.75 0.80 0.78 155\n",
" 1.0 0.91 0.88 0.89 345\n",
"\n",
" accuracy 0.86 500\n",
" macro avg 0.83 0.84 0.83 500\n",
"weighted avg 0.86 0.86 0.86 500\n",
"\n",
"准确率: 0.856\n",
"AUC: 0.9141841982234689\n",
"saved model: ./model/lstm_1.model\n",
"epoch:2, step:10, loss:0.4317778944969177\n",
"epoch:2, step:20, loss:0.41387200355529785\n",
"epoch:2, step:30, loss:0.4237545430660248\n",
"epoch:2, step:40, loss:0.364933043718338\n",
"epoch:2, step:50, loss:0.37595903873443604\n",
"epoch:2, step:60, loss:0.4067295491695404\n",
"epoch:2, step:70, loss:0.41071224212646484\n",
"epoch:2, step:80, loss:0.39134103059768677\n",
"epoch:2, step:90, loss:0.37907883524894714\n",
"epoch:2, step:100, loss:0.4322803020477295\n",
" precision recall f1-score support\n",
"\n",
" 0.0 0.80 0.63 0.71 155\n",
" 1.0 0.85 0.93 0.89 345\n",
"\n",
" accuracy 0.84 500\n",
" macro avg 0.83 0.78 0.80 500\n",
"weighted avg 0.83 0.84 0.83 500\n",
"\n",
"准确率: 0.838\n",
"AUC: 0.9174193548387096\n",
"saved model: ./model/lstm_2.model\n",
"epoch:3, step:10, loss:0.37696003913879395\n",
"epoch:3, step:20, loss:0.36385685205459595\n",
"epoch:3, step:30, loss:0.3907310664653778\n",
"epoch:3, step:40, loss:0.35576874017715454\n",
"epoch:3, step:50, loss:0.36152324080467224\n",
"epoch:3, step:60, loss:0.3620041608810425\n",
"epoch:3, step:70, loss:0.32647013664245605\n",
"epoch:3, step:80, loss:0.38903307914733887\n",
"epoch:3, step:90, loss:0.34238141775131226\n",
"epoch:3, step:100, loss:0.3952549397945404\n",
" precision recall f1-score support\n",
"\n",
" 0.0 0.75 0.79 0.77 155\n",
" 1.0 0.90 0.88 0.89 345\n",
"\n",
" accuracy 0.85 500\n",
" macro avg 0.83 0.84 0.83 500\n",
"weighted avg 0.86 0.85 0.85 500\n",
"\n",
"准确率: 0.854\n",
"AUC: 0.9280411407199626\n",
"saved model: ./model/lstm_3.model\n",
"epoch:4, step:10, loss:0.34902292490005493\n",
"epoch:4, step:20, loss:0.3277026116847992\n",
"epoch:4, step:30, loss:0.32119297981262207\n",
"epoch:4, step:40, loss:0.34501412510871887\n",
"epoch:4, step:50, loss:0.3202686905860901\n",
"epoch:4, step:60, loss:0.3599391579627991\n",
"epoch:4, step:70, loss:0.2958642542362213\n",
"epoch:4, step:80, loss:0.3152882158756256\n",
"epoch:4, step:90, loss:0.3151417374610901\n",
"epoch:4, step:100, loss:0.3314781188964844\n",
" precision recall f1-score support\n",
"\n",
" 0.0 0.78 0.81 0.79 155\n",
" 1.0 0.91 0.90 0.90 345\n",
"\n",
" accuracy 0.87 500\n",
" macro avg 0.84 0.85 0.85 500\n",
"weighted avg 0.87 0.87 0.87 500\n",
"\n",
"准确率: 0.868\n",
"AUC: 0.9314258999532491\n",
"saved model: ./model/lstm_4.model\n",
"epoch:5, step:10, loss:0.2638005316257477\n",
"epoch:5, step:20, loss:0.3028942048549652\n",
"epoch:5, step:30, loss:0.2819410562515259\n",
"epoch:5, step:40, loss:0.2857419550418854\n",
"epoch:5, step:50, loss:0.3177730441093445\n",
"epoch:5, step:60, loss:0.3140687346458435\n",
"epoch:5, step:70, loss:0.32480892539024353\n",
"epoch:5, step:80, loss:0.2964351177215576\n",
"epoch:5, step:90, loss:0.27567631006240845\n",
"epoch:5, step:100, loss:0.2848973870277405\n",
" precision recall f1-score support\n",
"\n",
" 0.0 0.83 0.74 0.78 155\n",
" 1.0 0.89 0.93 0.91 345\n",
"\n",
" accuracy 0.87 500\n",
" macro avg 0.86 0.83 0.84 500\n",
"weighted avg 0.87 0.87 0.87 500\n",
"\n",
"准确率: 0.87\n",
"AUC: 0.9310892940626461\n",
"saved model: ./model/lstm_5.model\n"
]
}
],
"source": [
"# 迭代训练\n",
"for epoch in range(num_epoches):\n",
" total_loss = 0\n",
" for i, (x, labels, lengths) in enumerate(train_loader):\n",
" x = x.to(device)\n",
" labels = labels.to(device)\n",
" outputs = lstm(x, lengths) # 前向传播\n",
" logits = outputs.view(-1) # 将输出展平\n",
" loss = criterion(logits, labels) # loss计算\n",
" total_loss += loss\n",
" optimizer.zero_grad() # 梯度清零\n",
" loss.backward(retain_graph=True) # 反向传播,计算梯度\n",
" optimizer.step() # 梯度更新\n",
" if (i+1) % 10 == 0:\n",
" print(\"epoch:{}, step:{}, loss:{}\".format(epoch+1, i+1, total_loss/10))\n",
" total_loss = 0\n",
" \n",
" # test\n",
" test()\n",
" \n",
" # save model\n",
" model_path = \"./model/lstm_{}.model\".format(epoch+1)\n",
" torch.save(lstm, model_path)\n",
" print(\"saved model: \", model_path)"
]
},
{
"cell_type": "markdown",
"metadata": {
"cell_id": 36
},
"source": [
"### 手动输入句子,判断情感倾向(1正/0负)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"cell_id": 51
},
"outputs": [],
"source": [
"net = torch.load(\"./model/lstm_5.model\") # 训练过程中的巅峰时刻"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"cell_id": 52
},
"outputs": [
{
"data": {
"text/plain": [
"tensor([0.9657, 0.3921])"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from utils import processing\n",
"\n",
"strs = [\"我想说我会爱你多一点点\", \"日有所思梦感伤\"]\n",
"\n",
"data = []\n",
"for s in strs:\n",
" vectors = []\n",
" for w in processing(s).split(\" \"):\n",
" if w in word2vec.wv.key_to_index:\n",
" vectors.append(word2vec.wv[w]) # 将每个词替换为对应的词向量\n",
" vectors = torch.Tensor(vectors)\n",
" data.append(vectors)\n",
"x, _, lengths = collate_fn(list(zip(data, [-1] * len(strs))))\n",
"with torch.no_grad():\n",
" x = x.to(device)\n",
" outputs = lstm(x, lengths) # 前向传播\n",
" outputs = outputs.view(-1) # 将输出展平\n",
"outputs"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cell_id": 54
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
},
"max_cell_id": 55
},
"nbformat": 4,
"nbformat_minor": 5
}